diff design_primers.py @ 5:b321e0517be3 draft

Uploaded
author ben-warren
date Thu, 22 May 2014 20:30:19 -0400
parents 21053f7f9ed1
children f201e8c6e004
--- a/design_primers.py	Thu Oct 18 19:47:07 2012 -0400
+++ b/design_primers.py	Thu May 22 20:30:19 2014 -0400
@@ -1,9 +1,13 @@
-#!/usr/local/bin/python2.6
-##design primers to features in multiple sequences
-##usage: python  design_primers.py <fasta-file> <gff file> <file of target IDs> <prod_min_size> <prod_max_size>
 
+#!/usr/bin/env python
+##design primers to features in multiple sequences, with option to predict melting
+#usage: design_primers.py [-h] -i IN_FILE -g GFF_FILE -T TARGET_FILE [-u]
+#                         [-n MAX_PRIMERS] [-p PROD_MIN_SIZE]
+#                         [-P PROD_MAX_SIZE] [-l OPT_PRIMER_LENGTH]
+#                         [-m MAX_TM_DIFF] [-t OPTIMUM_TM]
+#                         [-G OPT_GC_PERCENT] [-x MAXPOLYX] [-c GC_CLAMP]
+#                         [-e MAXSELFEND] [-a MAXSELFANY] [-maxgc MAXGC]
+#                         [-mingc MINGC] [-d TARGET_DELIM]
 
-#Copyright 2012 John McCallum & Leshi Chen
+#Copyright 2013 John McCallum & Susan Thomson
 #New Zealand Institute for Plant and Food Research
 #This program is free software: you can redistribute it and/or modify
 #     it under the terms of the GNU General Public License as published by
@@ -18,44 +22,86 @@
 #    You should have received a copy of the GNU General Public License
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-
 import os
 import StringIO
 import re
-import tempfile
-import subprocess
 import copy
 import sys
 from BCBio import GFF
 from BCBio.GFF import GFFExaminer
 from Bio import SeqIO
-from Bio.Emboss.Applications import Primer3Commandline
-from Bio.Emboss import Primer3
+import run_p3 as P3
+#import umelt_service as umelts
+import argparse
+
+##Primer3 defaults and additional options, defined as a dictionary
+def_dict={
+'PRIMER_MIN_SIZE':'18',
+'PRIMER_MAX_SIZE':'25',
+'PRIMER_MAX_NS_ACCEPTED':'1'}
+
+#parse arguments
+parser = argparse.ArgumentParser(description='Primer set design and melt prediction parameters')
+parser.add_argument('-i', type=argparse.FileType('r'), help="input sequence file, required", dest='in_file', required=True)
+parser.add_argument('-g', type=argparse.FileType('r'), help="input gff file with SNP and indels, required", dest='gff_file', required=True)
+parser.add_argument('-T', type=argparse.FileType('r'), help="input target SNP file, required", dest='target_file', required=True)
+parser.add_argument('-u', help="do uMelt prediction, optional", dest='run_uMelt', action='store_true', default=False)
+parser.add_argument('-n', type=int, help="maximum number of primer pairs to return, default=5", dest='max_primers', default=5) ## PRIMER_NUM_RETURN
+parser.add_argument('-p', type=int, help="minimum product size", dest='prod_min_size', default=100)                             ## PRIMER_PRODUCT_SIZE_RANGE min
+parser.add_argument('-P', type=int, help="maximum product size", dest='prod_max_size', default=300)                            ## PRIMER_PRODUCT_SIZE_RANGE max
+parser.add_argument('-l', type=int, help="optimum primer length", dest='opt_primer_length', default=20)                        ## PRIMER_OPT_SIZE
+parser.add_argument('-m', type=int, help="maximum tm difference between primers", dest='max_tm_diff', default=1)               ## PRIMER_PAIR_MAX_DIFF_TM
+parser.add_argument('-t', type=int, help="optimum Tm for primers, recommend range 59 to 61", dest='optimum_tm', default=59)    ## PRIMER_OPT_TM
+parser.add_argument('-G', type=int, help="optimum GC percentage of primers", dest='opt_GC_percent', default=50)                ## PRIMER_OPT_GC_PERCENT
+parser.add_argument('-x', type=int, help="maximum polyx, recommend less than 4", dest='maxpolyx', default=3)                   ## PRIMER_MAX_POLY_X
+parser.add_argument('-c', type=int, help="number of C/Gs at end, recommend 2", dest='gc_clamp', default=1)                     ## PRIMER_GC_CLAMP
+
+parser.add_argument('-e', type=int, help="maximum allowable 3'-anchored complementarity", dest='maxselfend', default=3)                     ## PRIMER_MAX_SELF_END
+parser.add_argument('-a', type=int, help="maximum complementarity between left and right or self", dest='maxselfany', default=8)                     ## PRIMER_MAX_SELF_ANY
+parser.add_argument('-maxgc', type=float, help="Maximum allowable percentage of Gs and Cs in any primer.", dest='maxgc', default=80.0)                     ## PRIMER_MAX_GC
+parser.add_argument('-mingc', type=float, help="Minimum allowable percentage of Gs and Cs in any primer.", dest='mingc', default=20.0)                     ## PRIMER_MIN_GC
 
 
-in_file = sys.argv[1]
-gff_file = sys.argv[2] 
-target_file =  sys.argv[3]
-prod_min_size = int(sys.argv[4])
-prod_max_size = int(sys.argv[5])
+parser.add_argument('-d', type=str, help="variant identifier delimiter, used to separate the sequence ID from the rest", dest='target_delim', default=':')
+try:
+    my_args = parser.parse_args()
+except SystemExit:
+    print("\nOops, an argument is missing/invalid, exiting...\n")
+    sys.exit(0)
+
+##update Primer3 settings from parsed arguments. NEEDS TO BE FINISHED
+productsizerange = str(my_args.prod_min_size) + "-" + str(my_args.prod_max_size)
+def_dict['PRIMER_PRODUCT_SIZE_RANGE']=productsizerange
+def_dict['PRIMER_NUM_RETURN']=str(my_args.max_primers +1)
+def_dict['PRIMER_OPT_SIZE']=str(my_args.opt_primer_length)
+def_dict['PRIMER_PAIR_MAX_DIFF_TM']=str(my_args.max_tm_diff)
+def_dict['PRIMER_OPT_TM']=str(my_args.optimum_tm)
+def_dict['PRIMER_OPT_GC_PERCENT']=str(my_args.opt_GC_percent)
+def_dict['PRIMER_MAX_POLY_X']=str(my_args.maxpolyx)
+def_dict['PRIMER_GC_CLAMP']=str(my_args.gc_clamp)
 
-max_tm_diff=1                                        ##
-opt_GC_percent=50                                    ##
-maxpolyx=4                                           ##
-optimum_length=20
-##target is specified in start, end format 
-productsizerange = str(prod_min_size) + "," + str(prod_max_size)
+def_dict['PRIMER_MAX_SELF_END']=str(my_args.maxselfend)
+def_dict['PRIMER_MAX_SELF_ANY']=str(my_args.maxselfany)
+def_dict['PRIMER_MAX_GC']=str(my_args.maxgc)
+def_dict['PRIMER_MIN_GC']=str(my_args.mingc)
+
+
+
+##conditional import of umelt
+if my_args.run_uMelt:
+    import umelt_service as umelts
+
 #open input files
-in_seq_handle = open(in_file)
-in_gff_handle = open(gff_file)
-in_target_handle=open(target_file)
-#read  target feature IDs into list
-targets=in_target_handle.readlines()
-in_target_handle.close()
+
+targets=my_args.target_file.readlines()
+my_args.target_file.close()
 ##and create a hit list of sequences from this
-target_seq_id_list = list(set([line.split(":")[0] for line in targets]))
+target_seq_id_list = [re.split(my_args.target_delim,X)[0] for X in targets] ## target_delimiter defaults to ':'  e.g. ABC:SNP:SAMTOOL:1234 
+
+##print header
+print "SNP_Target_ID", "Position", "Ref_base", "Variant_base", "Amplicon_bp", "PRIMER_LEFT_SEQUENCE", "PRIMER_RIGHT_SEQUENCE", "ref_melt_Tm", "var_melt_Tm", "Tm_difference"
 ##create iterator returning sequence records
-for myrec in SeqIO.parse(in_seq_handle, "fasta"):
+for myrec in SeqIO.parse(my_args.in_file, "fasta"):
     #check if this sequence is included in the target list
     if myrec.id in target_seq_id_list:
         ##create sequence dictionary so we can add in gff annotations
@@ -63,9 +109,9 @@
         ##just limit to gff annotations for this sequence
         limit_info = dict(gff_id = [ myrec.id ])
         ##rewind gff filehandle
-        in_gff_handle.seek(0)
+        my_args.gff_file.seek(0)
         ##read annotations into sequence dictionary for this sequence record only 
-        annotations = [r for r in GFF.parse(in_gff_handle, base_dict=seq_dict,limit_info=limit_info)]
+        annotations = [r for r in GFF.parse(my_args.gff_file, base_dict=seq_dict,limit_info=limit_info)]
         ##if there are any annotations, then  proceed. 
         if annotations:
             rec=annotations[0]
@@ -75,79 +121,58 @@
                 target_annotations = [f for f in rec.features if f.id == target_ID ]
                 if target_annotations:
                     mytarget = target_annotations[0]
-                    #create temporary files
-                    tempfastaFile = tempfile.mktemp()
-                    tempproutfile = tempfile.mktemp()
                     #just consider slice of sequence in a window of +/- prod_max_size  bp
                     ##from feature UNLESS feature is close to end
                     ##Note that slice is zero-based
                     featLocation = mytarget.location.start.position 
-                    if featLocation > prod_max_size:
-                        slice_start = featLocation - prod_max_size
-                        featPosition = prod_max_size  
+                    if featLocation > my_args.prod_max_size:
+                        slice_start = featLocation -  my_args.prod_max_size
+                        featPosition =  my_args.prod_max_size  
                     else:
                         slice_start = 0
                         featPosition = featLocation
-                    if (len(rec) - featLocation) < prod_max_size:
+                    if (len(rec) - featLocation) <  my_args.prod_max_size:
                         slice_end = len(rec)
                     else:
-                        slice_end = featLocation + prod_max_size
+                        slice_end = featLocation +  my_args.prod_max_size
                     ###grab slice of sequence from this window.
                     targetRec = rec[slice_start:slice_end]
                     matching_feature = [f for f in targetRec.features if f.id == mytarget.id]
                     if matching_feature:
                         target_feat = matching_feature[0]
+                        my_target_dict={} # re-initialize target dictionary
                         if target_feat.location.start.position == 0:
                             target_feat.location.start.position = 1
-                        #we get the mask features by removing the target...all features are masked as just using snp and indels
-                        ##a smarter filter could be added 
-                        ##note use of list copy to avoid possible side-effects
-                        exclude_feat = list(targetRec.features)
+                        #get the mask features by removing the target...all features are masked since we are just using SNPs and indels; a smarter filter could be added
+                        exclude_feat = list(targetRec.features) ##list copy to avoid possible side-effects
                         exclude_feat.remove(target_feat)
-                        ##print'targetRec.features',  targetRec.features ##for debug
-                        mask_str=map(lambda f: str(f.location.start.position+1) + "," + str(f.location.end.position + 1) ,exclude_feat)
-                        #mask_str=map(lambda f: str(f.location).strip('[]'),exclude_feat)
-                        p3_exclude_str = str(mask_str).replace('\', \'',':')
-                        p3_target = str(target_feat.location.start.position +1)  + "," + str(target_feat.location.end.position +1)
-                        #write sequence record into template file as  fasta
-                        t_output_handle = open(tempfastaFile, "w")
-                        SeqIO.write([targetRec], t_output_handle, "fasta")
-                        t_output_handle.close()
-                        #create Primer3Commandline() for emboss tool
-                        primer_cl = Primer3Commandline()
-                        #set the emboss tool to suppress  output as this will make Galaxy  think it is error message although it is a message to state run success
-                        primer_cl.set_parameter("-auto",'1')
-                        #pass  sequence file to emboss
-                        primer_cl.set_parameter("-sequence",tempfastaFile)
-                        #add target location
-                        primer_cl.set_parameter("-target", p3_target)
-                        ##mask off other features...dumb masking of everything at present, beware
-                        if (p3_exclude_str != ""):
-                            primer_cl.set_parameter("-excludedregion", p3_exclude_str)
-                        #add temporary output file to get the result
-                        primer_cl.set_parameter("-outfile", tempproutfile)
-                        #specify maximum different of tm
-                        primer_cl.set_parameter("-maxdifftm",max_tm_diff )
-                        #other useful parameters
-                        primer_cl.set_parameter("-ogcpercent", opt_GC_percent)
-                        primer_cl.set_parameter("-opolyxmax", maxpolyx)  
-                        primer_cl.set_parameter("-osize", optimum_length)
-                        #set product size range
-                        primer_cl.set_parameter("-prange", productsizerange)
-                        #using python subprocess method to run emboss command line programe with the parameters given
-                        fnull = open(os.devnull, 'w')
-                        result=subprocess.check_call(str(primer_cl),shell=True ,stdout = fnull, stderr = fnull)
-                        #read temporary outputfile
-                        handle = open(tempproutfile)
-                        record = Primer3.read(handle)
-                        ##just return first set, if there is one
-                        if len(record.primers) > 0:
-                            primer= record.primers[0]
-                            outputstr=[mytarget.id, primer.forward_seq,primer.reverse_seq,primer.size]
-                        else:
-                            outputstr=[mytarget.id,"NONE","NONE","NONE"]
-                        print('\t'.join(map(str,outputstr)))
+                        excludes_str=' '.join([str(x.location.start.position) + ',' + str(x.location.end.position - x.location.start.position) for x in exclude_feat])
+                        my_target_dict={'SEQUENCE_ID': rec.name, 'SEQUENCE_TEMPLATE': targetRec.seq.tostring(), 'SEQUENCE_TARGET': str(target_feat.location.start.position) + ',1', 'SEQUENCE_INTERNAL_EXCLUDED_REGION': excludes_str}
+                        my_target_dict.update(def_dict) ##add in defaults
+                        result=P3.run_P3(target_dict=my_target_dict)
+                        if my_args.run_uMelt:
+                            amp_seq=targetRec.seq ##need to make this conditional on getting a result >0 and melt=True
+                            mutamp_seq=targetRec.seq.tomutable()
+                            mutamp_seq[target_feat.location.start:target_feat.location.end]=target_feat.qualifiers['Variant_seq'][0] #mutate to variant
+                        for primerset in result:
+                                amp_start=int(primerset['PRIMER_LEFT'].split(',')[0])
+                                amp_end=int(primerset['PRIMER_RIGHT'].split(',')[0])
+                                ref_melt_Tm=0
+                                var_melt_Tm=0
+                                if my_args.run_uMelt:
+                                    try:
+                                        ref_melt_Tm=umelts.getTm(umelts.getmelt(amp_seq.tostring()[amp_start:amp_end+1]))
+                                        var_melt_Tm=umelts.getTm(umelts.getmelt(mutamp_seq.tostring()[amp_start:amp_end+1]))
+                                    except:
+                                        ref_melt_Tm=0 ##preferably something more informative?
+                                        var_melt_Tm=0 ##exception handling to be added
+                                reference_seq=target_feat.qualifiers['Reference_seq'][0]
+                                if target_feat.qualifiers.has_key('Variant_seq'):
+                                    variant_seq=target_feat.qualifiers['Variant_seq'][0]
+                                else:
+                                    variant_seq="NA"
+                                print mytarget.id, featLocation + 1, reference_seq, variant_seq, amp_end - amp_start, primerset['PRIMER_LEFT_SEQUENCE'], primerset['PRIMER_RIGHT_SEQUENCE'], ref_melt_Tm, var_melt_Tm, abs(ref_melt_Tm - var_melt_Tm) #, amp_seq.tostring()[amp_start:amp_end+1], mutamp_seq.tostring()[amp_start:amp_end+1]
 
-                        
-in_gff_handle.close()
-in_seq_handle.close()
+my_args.gff_file.close()
+my_args.in_file.close()
+
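
Note on the new P3.run_P3() dependency: run_p3.py is not part of this changeset, so its interface is only visible here through how the result is consumed (a list of per-pair dictionaries keyed by PRIMER_LEFT, PRIMER_RIGHT, PRIMER_LEFT_SEQUENCE and PRIMER_RIGHT_SEQUENCE). The sketch below is a minimal, hypothetical stand-in showing how such a wrapper could drive the primer3_core executable over Boulder-IO; it illustrates the assumed data flow and is not the project's actual run_p3.py.

# Hypothetical sketch only; NOT the run_p3.py imported above. Assumes a
# primer3_core binary (v2.x tag names) on PATH that reads and writes
# Boulder-IO records on stdin/stdout.
import subprocess

def run_P3(target_dict):
    # Serialise the sequence and settings tags as one Boulder-IO record,
    # terminated by a line containing only '='.
    record = ''.join('%s=%s\n' % (k, v) for k, v in target_dict.items()) + '=\n'
    proc = subprocess.Popen(['primer3_core'],
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    out, _ = proc.communicate(record)
    # Flatten the KEY=VALUE output lines into a dictionary.
    tags = dict(line.split('=', 1) for line in out.splitlines() if '=' in line)
    # Regroup the indexed tags (PRIMER_LEFT_0, PRIMER_LEFT_0_SEQUENCE, ...) into
    # one dictionary per returned primer pair, dropping the pair index so the
    # caller can use plain key names.
    pairs = []
    for i in range(int(tags.get('PRIMER_PAIR_NUM_RETURNED', 0))):
        pairs.append({
            'PRIMER_LEFT': tags['PRIMER_LEFT_%d' % i],        # "start,length"
            'PRIMER_RIGHT': tags['PRIMER_RIGHT_%d' % i],      # "start,length" (5' end of the right primer)
            'PRIMER_LEFT_SEQUENCE': tags['PRIMER_LEFT_%d_SEQUENCE' % i],
            'PRIMER_RIGHT_SEQUENCE': tags['PRIMER_RIGHT_%d_SEQUENCE' % i],
        })
    return pairs

An illustrative invocation of the updated script (file names are placeholders):
python design_primers.py -i scaffolds.fasta -g variants.gff3 -T targets.txt -p 100 -P 300 -u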