comparison design_primers.py @ 5:b321e0517be3 draft

Uploaded
author ben-warren
date Thu, 22 May 2014 20:30:19 -0400
parents 21053f7f9ed1
children f201e8c6e004
comparison
equal deleted inserted replaced
4:be070a68521e 5:b321e0517be3
1 #!/usr/local/bin/python2.6
2 ##design primers to features in multiple sequences
3 ##usage: python design_primers.py <fasta-file> <gff file> <file of target IDs> <prod_min_size> <prod_max_size>
4 1
2 #!/usr/bin/env python
3 ##design primers to features in multiple sequences, with option to predict melting
4 #usage: design_HRM_primers.py [-h] -i IN_FILE -g GFF_FILE -T TARGET_FILE [-u]
5 # [-n MAX_PRIMERS] [-p PROD_MIN_SIZE]
6 # [-P PROD_MAX_SIZE] [-l OPT_PRIMER_LENGTH]
7 # [-m MAX_TM_DIFF] [-t OPTIMUM_TM]
8 # [-G OPT_GC_PERCENT] [-x MAXPOLYX] [-c GC_CLAMP]
5 9
6 #Copyright 2012 John McCallum & Leshi Chen 10 #Copyright 2013 John McCallum & Susan Thomson
7 #New Zealand Institute for Plant and Food Research 11 #New Zealand Institute for Plant and Food Research
8 #This program is free software: you can redistribute it and/or modify 12 #This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by 13 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or 14 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version. 15 # (at your option) any later version.
16 # GNU General Public License for more details. 20 # GNU General Public License for more details.
17 # 21 #
18 # You should have received a copy of the GNU General Public License 22 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. 23 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 24
21
22 import os 25 import os
23 import StringIO 26 import StringIO
24 import re 27 import re
25 import tempfile
26 import subprocess
27 import copy 28 import copy
28 import sys 29 import sys
29 from BCBio import GFF 30 from BCBio import GFF
30 from BCBio.GFF import GFFExaminer 31 from BCBio.GFF import GFFExaminer
31 from Bio import SeqIO 32 from Bio import SeqIO
32 from Bio.Emboss.Applications import Primer3Commandline 33 import run_p3 as P3
33 from Bio.Emboss import Primer3 34 #import umelt_service as umelts
35 import argparse
36
37 ##Primer3 defaults or additional options defined as dictionary
38 def_dict={
39 'PRIMER_MIN_SIZE':'18',
40 'PRIMER_MAX_SIZE':'25',
41 'PRIMER_MAX_NS_ACCEPTED':'1'}
42
43 #parse arguments
44 parser = argparse.ArgumentParser(description='Primer set design and melt prediction parameters')
45 parser.add_argument('-i', type=argparse.FileType('r'), help="input sequence file, required", dest='in_file', required=True)
46 parser.add_argument('-g', type=argparse.FileType('r'), help="input gff file with SNP and indels, required", dest='gff_file', required=True)
47 parser.add_argument('-T', type=argparse.FileType('r'), help="input target SNP file, required", dest='target_file', required=True)
48 parser.add_argument('-u',help="do uMelt prediction, optional", dest='run_uMelt',action='store_true', default=False )
49 parser.add_argument('-n', type=int, help="maximum number of primer pairs to return, default=5", dest='max_primers', default=5) ## PRIMER_NUM_RETURN
50 parser.add_argument('-p', type=int, help="minimum product size", dest='prod_min_size', default=100) ## PRIMER_PRODUCT_SIZE_RANGE min
51 parser.add_argument('-P', type=int, help="maximum product size", dest='prod_max_size', default=300) ## PRIMER_PRODUCT_SIZE_RANGE max
52 parser.add_argument('-l', type=int, help="optimum primer length", dest='opt_primer_length', default=20) ## PRIMER_OPT_SIZE
53 parser.add_argument('-m', type=int, help="maximum tm difference between primers", dest='max_tm_diff', default=1) ## PRIMER_PAIR_MAX_DIFF_TM
54 parser.add_argument('-t', type=int, help="optimum Tm for primers, recommend range 59 to 61", dest='optimum_tm', default=59) ## PRIMER_OPT_TM
55 parser.add_argument('-G', type=int, help="optimum GC percentage of primers", dest='opt_GC_percent', default=50) ## PRIMER_OPT_GC_PERCENT
56 parser.add_argument('-x', type=int, help="maximum polyx, recommend less than 4", dest='maxpolyx', default=3) ## PRIMER_MAX_POLY_X
57 parser.add_argument('-c', type=int, help="number of C/Gs at end, recommend 2", dest='gc_clamp', default=1) ## PRIMER_GC_CLAMP
58
59 parser.add_argument('-e', type=int, help="maximum allowable 3'-anchored complementarity", dest='maxselfend', default=3) ## PRIMER_MAX_SELF_END
60 parser.add_argument('-a', type=int, help="maximum complementarity between left and right or self", dest='maxselfany', default=8) ## PRIMER_MAX_SELF_ANY
61 parser.add_argument('-maxgc', type=float, help="Maximum allowable percentage of Gs and Cs in any primer.", dest='maxgc', default=80.0) ## PRIMER_MAX_GC
62 parser.add_argument('-mingc', type=float, help="Minimum allowable percentage of Gs and Cs in any primer.", dest='mingc', default=20.0) ## PRIMER_MIN_GC
34 63
35 64
36 in_file = sys.argv[1] 65 parser.add_argument('-d', type=str, help="variant indentifier delimiter, used to separate sequence ID from rest ", dest='target_delim', default=':')
37 gff_file = sys.argv[2] 66 try:
38 target_file = sys.argv[3] 67 my_args = parser.parse_args()
39 prod_min_size = int(sys.argv[4]) 68 except SystemExit:
40 prod_max_size = int(sys.argv[5]) 69 print("\nOops, an argument is missing/invalid, exiting...\n")
70 sys.exit(0)
41 71
42 max_tm_diff=1 ## 72 ##update from args. NEEDS TO BE FINISHED
43 opt_GC_percent=50 ## 73 productsizerange = str(my_args.prod_min_size) + "-" + str(my_args.prod_max_size)
44 maxpolyx=4 ## 74 def_dict['PRIMER_PRODUCT_SIZE_RANGE']=productsizerange
45 optimum_length=20 75 def_dict['PRIMER_NUM_RETURN']=str(my_args.max_primers +1)
46 ##target is specified in start, end format 76 def_dict['PRIMER_OPT_SIZE']=str(my_args.opt_primer_length)
47 productsizerange = str(prod_min_size) + "," + str(prod_max_size) 77 def_dict['PRIMER_PAIR_MAX_DIFF_TM']=str(my_args.max_tm_diff)
78 def_dict['PRIMER_OPT_TM']=str(my_args.optimum_tm)
79 def_dict['PRIMER_OPT_GC_PERCENT']=str(my_args.opt_GC_percent)
80 def_dict['PRIMER_MAX_POLY_X']=str(my_args.maxpolyx)
81 def_dict['PRIMER_GC_CLAMP']=str(my_args.gc_clamp)
82
83 def_dict['PRIMER_MAX_SELF_END']=str(my_args.maxselfend)
84 def_dict['PRIMER_MAX_SELF_ANY']=str(my_args.maxselfany)
85 def_dict['PRIMER_MAX_GC']=str(my_args.maxgc)
86 def_dict['PRIMER_MIN_GC']=str(my_args.mingc)
87
88
89
90 ##conditional import of umelt
91 if my_args.run_uMelt:
92 import umelt_service as umelts
93
48 #open input files 94 #open input files
49 in_seq_handle = open(in_file) 95
50 in_gff_handle = open(gff_file) 96 targets=my_args.target_file.readlines()
51 in_target_handle=open(target_file) 97 my_args.target_file.close()
52 #read target feature IDs into list
53 targets=in_target_handle.readlines()
54 in_target_handle.close()
55 ##and create a hit list of sequences from this 98 ##and create a hit list of sequences from this
56 target_seq_id_list = list(set([line.split(":")[0] for line in targets])) 99 target_seq_id_list = [re.split(my_args.target_delim,X)[0] for X in targets] ## target_delimiter defaults to ':' e.g. ABC:SNP:SAMTOOL:1234
100
101 ##print header
102 print "SNP_Target_ID", "Position","Ref_base","Variant_base" ,"Amplicon_bp","PRIMER_LEFT_SEQUENCE",'PRIMER_RIGHT_SEQUENCE', "ref_melt_Tm","var_melt_Tm","Tm_difference"
57 ##create iterator returning sequence records 103 ##create iterator returning sequence records
58 for myrec in SeqIO.parse(in_seq_handle, "fasta"): 104 for myrec in SeqIO.parse(my_args.in_file, "fasta"):
59 #check if this sequence is included in the target list 105 #check if this sequence is included in the target list
60 if myrec.id in target_seq_id_list: 106 if myrec.id in target_seq_id_list:
61 ##create sequence dictionary so we can add in gff annotations 107 ##create sequence dictionary so we can add in gff annotations
62 seq_dict = {myrec.id : myrec} 108 seq_dict = {myrec.id : myrec}
63 ##just limit to gff annotations for this sequence 109 ##just limit to gff annotations for this sequence
64 limit_info = dict(gff_id = [ myrec.id ]) 110 limit_info = dict(gff_id = [ myrec.id ])
65 ##rewind gff filehandle 111 ##rewind gff filehandle
66 in_gff_handle.seek(0) 112 my_args.gff_file.seek(0)
67 ##read annotations into sequence dictionary for this sequence record only 113 ##read annotations into sequence dictionary for this sequence record only
68 annotations = [r for r in GFF.parse(in_gff_handle, base_dict=seq_dict,limit_info=limit_info)] 114 annotations = [r for r in GFF.parse(my_args.gff_file, base_dict=seq_dict,limit_info=limit_info)]
69 ##if there are any annotations, then proceed. 115 ##if there are any annotations, then proceed.
70 if annotations: 116 if annotations:
71 rec=annotations[0] 117 rec=annotations[0]
72 ##iterate over list of target IDs 118 ##iterate over list of target IDs
73 for t in targets: 119 for t in targets:
74 target_ID = t.strip('\n') 120 target_ID = t.strip('\n')
75 target_annotations = [f for f in rec.features if f.id == target_ID ] 121 target_annotations = [f for f in rec.features if f.id == target_ID ]
76 if target_annotations: 122 if target_annotations:
77 mytarget = target_annotations[0] 123 mytarget = target_annotations[0]
78 #create temporary files
79 tempfastaFile = tempfile.mktemp()
80 tempproutfile = tempfile.mktemp()
81 #just consider slice of sequence in a window of +/- prod_max_size bp 124 #just consider slice of sequence in a window of +/- prod_max_size bp
82 ##from feature UNLESS feature is close to end 125 ##from feature UNLESS feature is close to end
83 ##Note that slice is zero-based 126 ##Note that slice is zero-based
84 featLocation = mytarget.location.start.position 127 featLocation = mytarget.location.start.position
85 if featLocation > prod_max_size: 128 if featLocation > my_args.prod_max_size:
86 slice_start = featLocation - prod_max_size 129 slice_start = featLocation - my_args.prod_max_size
87 featPosition = prod_max_size 130 featPosition = my_args.prod_max_size
88 else: 131 else:
89 slice_start = 0 132 slice_start = 0
90 featPosition = featLocation 133 featPosition = featLocation
91 if (len(rec) - featLocation) < prod_max_size: 134 if (len(rec) - featLocation) < my_args.prod_max_size:
92 slice_end = len(rec) 135 slice_end = len(rec)
93 else: 136 else:
94 slice_end = featLocation + prod_max_size 137 slice_end = featLocation + my_args.prod_max_size
95 ###grab slice of sequence fom this window. 138 ###grab slice of sequence fom this window.
96 targetRec = rec[slice_start:slice_end] 139 targetRec = rec[slice_start:slice_end]
97 matching_feature = [f for f in targetRec.features if f.id == mytarget.id] 140 matching_feature = [f for f in targetRec.features if f.id == mytarget.id]
98 if matching_feature: 141 if matching_feature:
99 target_feat = matching_feature[0] 142 target_feat = matching_feature[0]
143 my_target_dict={} # re-initialize target dictionary
100 if target_feat.location.start.position == 0: 144 if target_feat.location.start.position == 0:
101 target_feat.location.start.position = 1 145 target_feat.location.start.position = 1
102 #we get the mask features by removing the target...all features are masked as just using snp and indels 146 #get the mask features by removing target...all features are masked as just using snp and indels, a smarter filter could be added
103 ##a smarter filter could be added 147 exclude_feat = list(targetRec.features) ##list copy to avoid possible side-effects
104 ##note use of list copy to avoid possible side-effects
105 exclude_feat = list(targetRec.features)
106 exclude_feat.remove(target_feat) 148 exclude_feat.remove(target_feat)
107 ##print'targetRec.features', targetRec.features ##for debug 149 excludes_str=' '.join([str(x.location.start.position)+','+str(x.location.end.position -x.location.start.position) for x in exclude_feat])
108 mask_str=map(lambda f: str(f.location.start.position+1) + "," + str(f.location.end.position + 1) ,exclude_feat) 150 my_target_dict={'SEQUENCE_ID' : rec.name, 'SEQUENCE_TEMPLATE': targetRec.seq.tostring(),'SEQUENCE_TARGET': str(target_feat.location.start.position) + ',1','SEQUENCE_INTERNAL_EXCLUDED_REGION': excludes_str}
109 #mask_str=map(lambda f: str(f.location).strip('[]'),exclude_feat) 151 my_target_dict.update(def_dict) ##add in defaults
110 p3_exclude_str = str(mask_str).replace('\', \'',':') 152 result=P3.run_P3(target_dict=my_target_dict)
111 p3_target = str(target_feat.location.start.position +1) + "," + str(target_feat.location.end.position +1) 153 if my_args.run_uMelt:
112 #write sequence record into template file as fasta 154 amp_seq=targetRec.seq ##need to make this conditional on getting a result >0 and melt=True
113 t_output_handle = open(tempfastaFile, "w") 155 mutamp_seq=targetRec.seq.tomutable()
114 SeqIO.write([targetRec], t_output_handle, "fasta") 156 mutamp_seq[target_feat.location.start:target_feat.location.end]=target_feat.qualifiers['Variant_seq'][0] #mutate to variant
115 t_output_handle.close() 157 for primerset in result:
116 #create Primer3Commandline() for emboss tool 158 amp_start=int(primerset['PRIMER_LEFT'].split(',')[0])
117 primer_cl = Primer3Commandline() 159 amp_end=int(primerset['PRIMER_RIGHT'].split(',')[0])
118 #set the emboss tool to suppress output as this will make Galaxy think it is error message although it is a message to state run success 160 ref_melt_Tm=0
119 primer_cl.set_parameter("-auto",'1') 161 var_melt_Tm=0
120 #pass sequence file to emboss 162 if my_args.run_uMelt:
121 primer_cl.set_parameter("-sequence",tempfastaFile) 163 try:
122 #add target location 164 ref_melt_Tm=umelts.getTm(umelts.getmelt(amp_seq.tostring()[amp_start:amp_end+1]))
123 primer_cl.set_parameter("-target", p3_target) 165 var_melt_Tm=umelts.getTm(umelts.getmelt(mutamp_seq.tostring()[amp_start:amp_end+1]))
124 ##mask off other features...dumb masking of everything at present, beware 166 except:
125 if (p3_exclude_str != ""): 167 ref_melt_Tm=0 ##preferably something more informative?
126 primer_cl.set_parameter("-excludedregion", p3_exclude_str) 168 var_melt_Tm=0 ##exception handling to be added
127 #add temporary output file to get the result 169 reference_seq=target_feat.qualifiers['Reference_seq'][0]
128 primer_cl.set_parameter("-outfile", tempproutfile) 170 if target_feat.qualifiers.has_key('Variant_seq'):
129 #specify maximum different of tm 171 variant_seq=target_feat.qualifiers['Variant_seq'][0]
130 primer_cl.set_parameter("-maxdifftm",max_tm_diff ) 172 else:
131 #other useful parameters 173 variant_seq="NA"
132 primer_cl.set_parameter("-ogcpercent", opt_GC_percent) 174 print mytarget.id, featLocation + 1 ,reference_seq, variant_seq,amp_end-amp_start,primerset['PRIMER_LEFT_SEQUENCE'],primerset['PRIMER_RIGHT_SEQUENCE'], ref_melt_Tm,var_melt_Tm,abs(ref_melt_Tm-var_melt_Tm)#, amp_seq.tostring()[amp_start:amp_end+1], mutamp_seq.tostring()[amp_start:amp_end+1]
133 primer_cl.set_parameter("-opolyxmax", maxpolyx)
134 primer_cl.set_parameter("-osize", optimum_length)
135 #set product size range
136 primer_cl.set_parameter("-prange", productsizerange)
137 #using python subprocess method to run emboss command line programe with the parameters given
138 fnull = open(os.devnull, 'w')
139 result=subprocess.check_call(str(primer_cl),shell=True ,stdout = fnull, stderr = fnull)
140 #read temporary outputfile
141 handle = open(tempproutfile)
142 record = Primer3.read(handle)
143 ##just return first set, if there is one
144 if len(record.primers) > 0:
145 primer= record.primers[0]
146 outputstr=[mytarget.id, primer.forward_seq,primer.reverse_seq,primer.size]
147 else:
148 outputstr=[mytarget.id,"NONE","NONE","NONE"]
149 print('\t'.join(map(str,outputstr)))
150 175
151 176 my_args.gff_file.close()
152 in_gff_handle.close() 177 my_args.in_file.close()
153 in_seq_handle.close() 178