pcr_markers: design_primers.py comparison

comparison design_primers.py @ 5:b321e0517be3 draft

Uploaded

author	ben-warren
date	Thu, 22 May 2014 20:30:19 -0400
parents	21053f7f9ed1
children	f201e8c6e004

comparison

equal deleted inserted replaced

-:be070a68521e
+:b321e0517be3
-#!/usr/local/bin/python2.6
-##design primers to features in multiple sequences
-##usage: python  design_primers.py <fasta-file> <gff file> <file of target IDs> <prod_min_size> <prod_max_size>
+#!/usr/bin/env python
+##design primers to features in multiple sequences, with option to predict melting
+#usage: design_HRM_primers.py [-h] -i IN_FILE -g GFF_FILE -T TARGET_FILE [-u]
+#                             [-n MAX_PRIMERS] [-p PROD_MIN_SIZE]
+#                             [-P PROD_MAX_SIZE] [-l OPT_PRIMER_LENGTH]
+#                             [-m MAX_TM_DIFF] [-t OPTIMUM_TM]
+#                             [-G OPT_GC_PERCENT] [-x MAXPOLYX] [-c GC_CLAMP]
-#Copyright 2012 John McCallum & Leshi Chen
+#Copyright 2013 John McCallum & Susan Thomson
 #New Zealand Institute for Plant and Food Research
 #This program is free software: you can redistribute it and/or modify
 #     it under the terms of the GNU General Public License as published by
 #    the Free Software Foundation, either version 3 of the License, or
 #    (at your option) any later version.
 #    GNU General Public License for more details.
 #
 #    You should have received a copy of the GNU General Public License
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 import os
 import StringIO
 import re
-import tempfile
-import subprocess
 import copy
 import sys
 from BCBio import GFF
 from BCBio.GFF import GFFExaminer
 from Bio import SeqIO
-from Bio.Emboss.Applications import Primer3Commandline
+import run_p3 as P3
-from Bio.Emboss import Primer3
+#import umelt_service as umelts
+import argparse
+##Primer3 defaults or additional options defined as dictionary
+def_dict={
+'PRIMER_MIN_SIZE':'18',
+'PRIMER_MAX_SIZE':'25',
+'PRIMER_MAX_NS_ACCEPTED':'1'}
+#parse arguments
+parser = argparse.ArgumentParser(description='Primer set design and melt prediction parameters')
+parser.add_argument('-i', type=argparse.FileType('r'), help="input sequence file, required", dest='in_file', required=True)
+parser.add_argument('-g', type=argparse.FileType('r'), help="input gff file with SNP and indels, required", dest='gff_file', required=True)
+parser.add_argument('-T', type=argparse.FileType('r'), help="input target SNP file, required", dest='target_file', required=True)
+parser.add_argument('-u',help="do uMelt prediction, optional", dest='run_uMelt',action='store_true', default=False )
+parser.add_argument('-n', type=int, help="maximum number of primer pairs to return, default=5", dest='max_primers', default=5) ## PRIMER_NUM_RETURN
+parser.add_argument('-p', type=int, help="minimum product size", dest='prod_min_size', default=100)                             ## PRIMER_PRODUCT_SIZE_RANGE min
+parser.add_argument('-P', type=int, help="maximum product size", dest='prod_max_size', default=300)                            ## PRIMER_PRODUCT_SIZE_RANGE max
+parser.add_argument('-l', type=int, help="optimum primer length", dest='opt_primer_length', default=20)                        ## PRIMER_OPT_SIZE
+parser.add_argument('-m', type=int, help="maximum tm difference between primers", dest='max_tm_diff', default=1)               ## PRIMER_PAIR_MAX_DIFF_TM
+parser.add_argument('-t', type=int, help="optimum Tm for primers, recommend range 59 to 61", dest='optimum_tm', default=59)    ## PRIMER_OPT_TM
+parser.add_argument('-G', type=int, help="optimum GC percentage of primers", dest='opt_GC_percent', default=50)                ## PRIMER_OPT_GC_PERCENT
+parser.add_argument('-x', type=int, help="maximum polyx, recommend less than 4", dest='maxpolyx', default=3)                   ## PRIMER_MAX_POLY_X
+parser.add_argument('-c', type=int, help="number of C/Gs at end, recommend 2", dest='gc_clamp', default=1)                     ## PRIMER_GC_CLAMP
+parser.add_argument('-e', type=int, help="maximum allowable 3'-anchored complementarity", dest='maxselfend', default=3)                     ## PRIMER_MAX_SELF_END
+parser.add_argument('-a', type=int, help="maximum complementarity between left and right or self", dest='maxselfany', default=8)                     ## PRIMER_MAX_SELF_ANY
+parser.add_argument('-maxgc', type=float, help="Maximum allowable percentage of Gs and Cs in any primer.", dest='maxgc', default=80.0)                     ## PRIMER_MAX_GC
+parser.add_argument('-mingc', type=float, help="Minimum allowable percentage of Gs and Cs in any primer.", dest='mingc', default=20.0)                     ## PRIMER_MIN_GC
-in_file = sys.argv[1]
+parser.add_argument('-d', type=str, help="variant indentifier delimiter, used to separate sequence ID from rest ", dest='target_delim', default=':')
-gff_file = sys.argv[2]
+try:
-target_file =  sys.argv[3]
+my_args = parser.parse_args()
-prod_min_size = int(sys.argv[4])
+except SystemExit:
-prod_max_size = int(sys.argv[5])
+print("\nOops, an argument is missing/invalid, exiting...\n")
+sys.exit(0)
-max_tm_diff=1                                        ##
+##update from args. NEEDS TO BE FINISHED
-opt_GC_percent=50                                    ##
+productsizerange = str(my_args.prod_min_size) + "-" + str(my_args.prod_max_size)
-maxpolyx=4                                           ##
+def_dict['PRIMER_PRODUCT_SIZE_RANGE']=productsizerange
-optimum_length=20
+def_dict['PRIMER_NUM_RETURN']=str(my_args.max_primers +1)
-##target is specified in start, end format
+def_dict['PRIMER_OPT_SIZE']=str(my_args.opt_primer_length)
-productsizerange = str(prod_min_size) + "," + str(prod_max_size)
+def_dict['PRIMER_PAIR_MAX_DIFF_TM']=str(my_args.max_tm_diff)
+def_dict['PRIMER_OPT_TM']=str(my_args.optimum_tm)
+def_dict['PRIMER_OPT_GC_PERCENT']=str(my_args.opt_GC_percent)
+def_dict['PRIMER_MAX_POLY_X']=str(my_args.maxpolyx)
+def_dict['PRIMER_GC_CLAMP']=str(my_args.gc_clamp)
+def_dict['PRIMER_MAX_SELF_END']=str(my_args.maxselfend)
+def_dict['PRIMER_MAX_SELF_ANY']=str(my_args.maxselfany)
+def_dict['PRIMER_MAX_GC']=str(my_args.maxgc)
+def_dict['PRIMER_MIN_GC']=str(my_args.mingc)
+##conditional import of umelt
+if my_args.run_uMelt:
+import umelt_service as umelts
 #open input files
-in_seq_handle = open(in_file)
-in_gff_handle = open(gff_file)
+targets=my_args.target_file.readlines()
-in_target_handle=open(target_file)
+my_args.target_file.close()
-#read  target feature IDs into list
-targets=in_target_handle.readlines()
-in_target_handle.close()
 ##and create a hit list of sequences from this
-target_seq_id_list = list(set([line.split(":")[0] for line in targets]))
+target_seq_id_list = [re.split(my_args.target_delim,X)[0] for X in targets] ## target_delimiter defaults to ':'  e.g. ABC:SNP:SAMTOOL:1234
+##print header
+print "SNP_Target_ID", "Position","Ref_base","Variant_base" ,"Amplicon_bp","PRIMER_LEFT_SEQUENCE",'PRIMER_RIGHT_SEQUENCE', "ref_melt_Tm","var_melt_Tm","Tm_difference"
 ##create iterator returning sequence records
-for myrec in SeqIO.parse(in_seq_handle, "fasta"):
+for myrec in SeqIO.parse(my_args.in_file, "fasta"):
 #check if this sequence is included in the target list
 if myrec.id in target_seq_id_list:
 ##create sequence dictionary so we can add in gff annotations
 seq_dict = {myrec.id : myrec}
 ##just limit to gff annotations for this sequence
 limit_info = dict(gff_id = [ myrec.id ])
 ##rewind gff filehandle
-in_gff_handle.seek(0)
+my_args.gff_file.seek(0)
 ##read annotations into sequence dictionary for this sequence record only
-annotations = [r for r in GFF.parse(in_gff_handle, base_dict=seq_dict,limit_info=limit_info)]
+annotations = [r for r in GFF.parse(my_args.gff_file, base_dict=seq_dict,limit_info=limit_info)]
 ##if there are any annotations, then  proceed.
 if annotations:
 rec=annotations[0]
 ##iterate over list of target IDs
 for t in targets:
 target_ID = t.strip('\n')
 target_annotations = [f for f in rec.features if f.id == target_ID ]
 if target_annotations:
 mytarget = target_annotations[0]
-#create temporary files
-tempfastaFile = tempfile.mktemp()
-tempproutfile = tempfile.mktemp()
 #just consider slice of sequence in a window of +/- prod_max_size  bp
 ##from feature UNLESS feature is close to end
 ##Note that slice is zero-based
 featLocation = mytarget.location.start.position
-if featLocation > prod_max_size:
+if featLocation > my_args.prod_max_size:
-slice_start = featLocation - prod_max_size
+slice_start = featLocation -  my_args.prod_max_size
-featPosition = prod_max_size
+featPosition =  my_args.prod_max_size
 else:
 slice_start = 0
 featPosition = featLocation
-if (len(rec) - featLocation) < prod_max_size:
+if (len(rec) - featLocation) <  my_args.prod_max_size:
 slice_end = len(rec)
 else:
-slice_end = featLocation + prod_max_size
+slice_end = featLocation +  my_args.prod_max_size
 ###grab slice of sequence fom this window.
 targetRec = rec[slice_start:slice_end]
 matching_feature = [f for f in targetRec.features if f.id == mytarget.id]
 if matching_feature:
 target_feat = matching_feature[0]
+my_target_dict={} # re-initialize target dictionary
 if target_feat.location.start.position == 0:
 target_feat.location.start.position = 1
-#we get the mask features by removing the target...all features are masked as just using snp and indels
+#get the mask features by removing  target...all features are masked as just using snp and indels, a smarter filter could be added
-##a smarter filter could be added
+			exclude_feat = list(targetRec.features) ##list copy to avoid possible side-effects
-##note use of list copy to avoid possible side-effects
-exclude_feat = list(targetRec.features)
 exclude_feat.remove(target_feat)
-##print'targetRec.features',  targetRec.features ##for debug
+			excludes_str=' '.join([str(x.location.start.position)+','+str(x.location.end.position -x.location.start.position) for x in exclude_feat])
-mask_str=map(lambda f: str(f.location.start.position+1) + "," + str(f.location.end.position + 1) ,exclude_feat)
+my_target_dict={'SEQUENCE_ID' : rec.name, 'SEQUENCE_TEMPLATE': targetRec.seq.tostring(),'SEQUENCE_TARGET': str(target_feat.location.start.position) + ',1','SEQUENCE_INTERNAL_EXCLUDED_REGION': excludes_str}
-#mask_str=map(lambda f: str(f.location).strip('[]'),exclude_feat)
+my_target_dict.update(def_dict) ##add in defaults
-p3_exclude_str = str(mask_str).replace('\', \'',':')
+			result=P3.run_P3(target_dict=my_target_dict)
-p3_target = str(target_feat.location.start.position +1)  + "," + str(target_feat.location.end.position +1)
+if my_args.run_uMelt:
-#write sequence record into template file as  fasta
+amp_seq=targetRec.seq ##need to make this conditional on getting a result >0 and melt=True
-t_output_handle = open(tempfastaFile, "w")
+mutamp_seq=targetRec.seq.tomutable()
-SeqIO.write([targetRec], t_output_handle, "fasta")
+mutamp_seq[target_feat.location.start:target_feat.location.end]=target_feat.qualifiers['Variant_seq'][0] #mutate to variant
-t_output_handle.close()
+			for primerset in result:
-#create Primer3Commandline() for emboss tool
+				amp_start=int(primerset['PRIMER_LEFT'].split(',')[0])
-primer_cl = Primer3Commandline()
+				amp_end=int(primerset['PRIMER_RIGHT'].split(',')[0])
-#set the emboss tool to suppress  output as this will make Galaxy  think it is error message although it is a message to state run success
+ref_melt_Tm=0
-primer_cl.set_parameter("-auto",'1')
+var_melt_Tm=0
-#pass  sequence file to emboss
+if my_args.run_uMelt:
-primer_cl.set_parameter("-sequence",tempfastaFile)
+try:
-#add target location
+ref_melt_Tm=umelts.getTm(umelts.getmelt(amp_seq.tostring()[amp_start:amp_end+1]))
-primer_cl.set_parameter("-target", p3_target)
+					var_melt_Tm=umelts.getTm(umelts.getmelt(mutamp_seq.tostring()[amp_start:amp_end+1]))
-##mask off other features...dumb masking of everything at present, beware
+except:
-if (p3_exclude_str != ""):
+					ref_melt_Tm=0 ##preferably something more informative?
-primer_cl.set_parameter("-excludedregion", p3_exclude_str)
+					var_melt_Tm=0 ##exception handling to be added
-#add temporary output file to get the result
+reference_seq=target_feat.qualifiers['Reference_seq'][0]
-primer_cl.set_parameter("-outfile", tempproutfile)
+if target_feat.qualifiers.has_key('Variant_seq'):
-#specify maximum different of tm
+variant_seq=target_feat.qualifiers['Variant_seq'][0]
-primer_cl.set_parameter("-maxdifftm",max_tm_diff )
+else:
-#other useful parameters
+variant_seq="NA"
-primer_cl.set_parameter("-ogcpercent", opt_GC_percent)
+print mytarget.id, featLocation + 1 ,reference_seq, variant_seq,amp_end-amp_start,primerset['PRIMER_LEFT_SEQUENCE'],primerset['PRIMER_RIGHT_SEQUENCE'], ref_melt_Tm,var_melt_Tm,abs(ref_melt_Tm-var_melt_Tm)#, amp_seq.tostring()[amp_start:amp_end+1], mutamp_seq.tostring()[amp_start:amp_end+1]
-primer_cl.set_parameter("-opolyxmax", maxpolyx)
-primer_cl.set_parameter("-osize", optimum_length)
-#set product size range
-primer_cl.set_parameter("-prange", productsizerange)
-#using python subprocess method to run emboss command line programe with the parameters given
-fnull = open(os.devnull, 'w')
-result=subprocess.check_call(str(primer_cl),shell=True ,stdout = fnull, stderr = fnull)
-#read temporary outputfile
-handle = open(tempproutfile)
-record = Primer3.read(handle)
-##just return first set, if there is one
-if len(record.primers) > 0:
-primer= record.primers[0]
-outputstr=[mytarget.id, primer.forward_seq,primer.reverse_seq,primer.size]
-else:
-outputstr=[mytarget.id,"NONE","NONE","NONE"]
-print('\t'.join(map(str,outputstr)))
+my_args.gff_file.close()
-in_gff_handle.close()
+my_args.in_file.close()
-in_seq_handle.close()

Mercurial > repos > john-mccallum > pcr_markers

comparison design_primers.py @ 5:b321e0517be3 draft