Mercurial > repos > john-mccallum > pcr_markers
comparison design_primers.py @ 5:b321e0517be3 draft
Uploaded
author | ben-warren |
---|---|
date | Thu, 22 May 2014 20:30:19 -0400 |
parents | 21053f7f9ed1 |
children | f201e8c6e004 |
comparison
equal
deleted
inserted
replaced
4:be070a68521e | 5:b321e0517be3 |
---|---|
1 #!/usr/local/bin/python2.6 | |
2 ##design primers to features in multiple sequences | |
3 ##usage: python design_primers.py <fasta-file> <gff file> <file of target IDs> <prod_min_size> <prod_max_size> | |
4 | 1 |
2 #!/usr/bin/env python | |
3 ##design primers to features in multiple sequences, with option to predict melting | |
4 #usage: design_HRM_primers.py [-h] -i IN_FILE -g GFF_FILE -T TARGET_FILE [-u] | |
5 # [-n MAX_PRIMERS] [-p PROD_MIN_SIZE] | |
6 # [-P PROD_MAX_SIZE] [-l OPT_PRIMER_LENGTH] | |
7 # [-m MAX_TM_DIFF] [-t OPTIMUM_TM] | |
8 # [-G OPT_GC_PERCENT] [-x MAXPOLYX] [-c GC_CLAMP] | |
5 | 9 |
6 #Copyright 2012 John McCallum & Leshi Chen | 10 #Copyright 2013 John McCallum & Susan Thomson |
7 #New Zealand Institute for Plant and Food Research | 11 #New Zealand Institute for Plant and Food Research |
8 #This program is free software: you can redistribute it and/or modify | 12 #This program is free software: you can redistribute it and/or modify |
9 # it under the terms of the GNU General Public License as published by | 13 # it under the terms of the GNU General Public License as published by |
10 # the Free Software Foundation, either version 3 of the License, or | 14 # the Free Software Foundation, either version 3 of the License, or |
11 # (at your option) any later version. | 15 # (at your option) any later version. |
16 # GNU General Public License for more details. | 20 # GNU General Public License for more details. |
17 # | 21 # |
18 # You should have received a copy of the GNU General Public License | 22 # You should have received a copy of the GNU General Public License |
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. | 23 # along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | 24 |
21 | |
22 import os | 25 import os |
23 import StringIO | 26 import StringIO |
24 import re | 27 import re |
25 import tempfile | |
26 import subprocess | |
27 import copy | 28 import copy |
28 import sys | 29 import sys |
29 from BCBio import GFF | 30 from BCBio import GFF |
30 from BCBio.GFF import GFFExaminer | 31 from BCBio.GFF import GFFExaminer |
31 from Bio import SeqIO | 32 from Bio import SeqIO |
32 from Bio.Emboss.Applications import Primer3Commandline | 33 import run_p3 as P3 |
33 from Bio.Emboss import Primer3 | 34 #import umelt_service as umelts |
35 import argparse | |
36 | |
37 ##Primer3 defaults or additional options defined as dictionary | |
38 def_dict={ | |
39 'PRIMER_MIN_SIZE':'18', | |
40 'PRIMER_MAX_SIZE':'25', | |
41 'PRIMER_MAX_NS_ACCEPTED':'1'} | |
42 | |
43 #parse arguments | |
44 parser = argparse.ArgumentParser(description='Primer set design and melt prediction parameters') | |
45 parser.add_argument('-i', type=argparse.FileType('r'), help="input sequence file, required", dest='in_file', required=True) | |
46 parser.add_argument('-g', type=argparse.FileType('r'), help="input gff file with SNP and indels, required", dest='gff_file', required=True) | |
47 parser.add_argument('-T', type=argparse.FileType('r'), help="input target SNP file, required", dest='target_file', required=True) | |
48 parser.add_argument('-u',help="do uMelt prediction, optional", dest='run_uMelt',action='store_true', default=False ) | |
49 parser.add_argument('-n', type=int, help="maximum number of primer pairs to return, default=5", dest='max_primers', default=5) ## PRIMER_NUM_RETURN | |
50 parser.add_argument('-p', type=int, help="minimum product size", dest='prod_min_size', default=100) ## PRIMER_PRODUCT_SIZE_RANGE min | |
51 parser.add_argument('-P', type=int, help="maximum product size", dest='prod_max_size', default=300) ## PRIMER_PRODUCT_SIZE_RANGE max | |
52 parser.add_argument('-l', type=int, help="optimum primer length", dest='opt_primer_length', default=20) ## PRIMER_OPT_SIZE | |
53 parser.add_argument('-m', type=int, help="maximum tm difference between primers", dest='max_tm_diff', default=1) ## PRIMER_PAIR_MAX_DIFF_TM | |
54 parser.add_argument('-t', type=int, help="optimum Tm for primers, recommend range 59 to 61", dest='optimum_tm', default=59) ## PRIMER_OPT_TM | |
55 parser.add_argument('-G', type=int, help="optimum GC percentage of primers", dest='opt_GC_percent', default=50) ## PRIMER_OPT_GC_PERCENT | |
56 parser.add_argument('-x', type=int, help="maximum polyx, recommend less than 4", dest='maxpolyx', default=3) ## PRIMER_MAX_POLY_X | |
57 parser.add_argument('-c', type=int, help="number of C/Gs at end, recommend 2", dest='gc_clamp', default=1) ## PRIMER_GC_CLAMP | |
58 | |
59 parser.add_argument('-e', type=int, help="maximum allowable 3'-anchored complementarity", dest='maxselfend', default=3) ## PRIMER_MAX_SELF_END | |
60 parser.add_argument('-a', type=int, help="maximum complementarity between left and right or self", dest='maxselfany', default=8) ## PRIMER_MAX_SELF_ANY | |
61 parser.add_argument('-maxgc', type=float, help="Maximum allowable percentage of Gs and Cs in any primer.", dest='maxgc', default=80.0) ## PRIMER_MAX_GC | |
62 parser.add_argument('-mingc', type=float, help="Minimum allowable percentage of Gs and Cs in any primer.", dest='mingc', default=20.0) ## PRIMER_MIN_GC | |
34 | 63 |
35 | 64 |
36 in_file = sys.argv[1] | 65 parser.add_argument('-d', type=str, help="variant indentifier delimiter, used to separate sequence ID from rest ", dest='target_delim', default=':') |
37 gff_file = sys.argv[2] | 66 try: |
38 target_file = sys.argv[3] | 67 my_args = parser.parse_args() |
39 prod_min_size = int(sys.argv[4]) | 68 except SystemExit: |
40 prod_max_size = int(sys.argv[5]) | 69 print("\nOops, an argument is missing/invalid, exiting...\n") |
70 sys.exit(0) | |
41 | 71 |
42 max_tm_diff=1 ## | 72 ##update from args. NEEDS TO BE FINISHED |
43 opt_GC_percent=50 ## | 73 productsizerange = str(my_args.prod_min_size) + "-" + str(my_args.prod_max_size) |
44 maxpolyx=4 ## | 74 def_dict['PRIMER_PRODUCT_SIZE_RANGE']=productsizerange |
45 optimum_length=20 | 75 def_dict['PRIMER_NUM_RETURN']=str(my_args.max_primers +1) |
46 ##target is specified in start, end format | 76 def_dict['PRIMER_OPT_SIZE']=str(my_args.opt_primer_length) |
47 productsizerange = str(prod_min_size) + "," + str(prod_max_size) | 77 def_dict['PRIMER_PAIR_MAX_DIFF_TM']=str(my_args.max_tm_diff) |
78 def_dict['PRIMER_OPT_TM']=str(my_args.optimum_tm) | |
79 def_dict['PRIMER_OPT_GC_PERCENT']=str(my_args.opt_GC_percent) | |
80 def_dict['PRIMER_MAX_POLY_X']=str(my_args.maxpolyx) | |
81 def_dict['PRIMER_GC_CLAMP']=str(my_args.gc_clamp) | |
82 | |
83 def_dict['PRIMER_MAX_SELF_END']=str(my_args.maxselfend) | |
84 def_dict['PRIMER_MAX_SELF_ANY']=str(my_args.maxselfany) | |
85 def_dict['PRIMER_MAX_GC']=str(my_args.maxgc) | |
86 def_dict['PRIMER_MIN_GC']=str(my_args.mingc) | |
87 | |
88 | |
89 | |
90 ##conditional import of umelt | |
91 if my_args.run_uMelt: | |
92 import umelt_service as umelts | |
93 | |
48 #open input files | 94 #open input files |
49 in_seq_handle = open(in_file) | 95 |
50 in_gff_handle = open(gff_file) | 96 targets=my_args.target_file.readlines() |
51 in_target_handle=open(target_file) | 97 my_args.target_file.close() |
52 #read target feature IDs into list | |
53 targets=in_target_handle.readlines() | |
54 in_target_handle.close() | |
55 ##and create a hit list of sequences from this | 98 ##and create a hit list of sequences from this |
56 target_seq_id_list = list(set([line.split(":")[0] for line in targets])) | 99 target_seq_id_list = [re.split(my_args.target_delim,X)[0] for X in targets] ## target_delimiter defaults to ':' e.g. ABC:SNP:SAMTOOL:1234 |
100 | |
101 ##print header | |
102 print "SNP_Target_ID", "Position","Ref_base","Variant_base" ,"Amplicon_bp","PRIMER_LEFT_SEQUENCE",'PRIMER_RIGHT_SEQUENCE', "ref_melt_Tm","var_melt_Tm","Tm_difference" | |
57 ##create iterator returning sequence records | 103 ##create iterator returning sequence records |
58 for myrec in SeqIO.parse(in_seq_handle, "fasta"): | 104 for myrec in SeqIO.parse(my_args.in_file, "fasta"): |
59 #check if this sequence is included in the target list | 105 #check if this sequence is included in the target list |
60 if myrec.id in target_seq_id_list: | 106 if myrec.id in target_seq_id_list: |
61 ##create sequence dictionary so we can add in gff annotations | 107 ##create sequence dictionary so we can add in gff annotations |
62 seq_dict = {myrec.id : myrec} | 108 seq_dict = {myrec.id : myrec} |
63 ##just limit to gff annotations for this sequence | 109 ##just limit to gff annotations for this sequence |
64 limit_info = dict(gff_id = [ myrec.id ]) | 110 limit_info = dict(gff_id = [ myrec.id ]) |
65 ##rewind gff filehandle | 111 ##rewind gff filehandle |
66 in_gff_handle.seek(0) | 112 my_args.gff_file.seek(0) |
67 ##read annotations into sequence dictionary for this sequence record only | 113 ##read annotations into sequence dictionary for this sequence record only |
68 annotations = [r for r in GFF.parse(in_gff_handle, base_dict=seq_dict,limit_info=limit_info)] | 114 annotations = [r for r in GFF.parse(my_args.gff_file, base_dict=seq_dict,limit_info=limit_info)] |
69 ##if there are any annotations, then proceed. | 115 ##if there are any annotations, then proceed. |
70 if annotations: | 116 if annotations: |
71 rec=annotations[0] | 117 rec=annotations[0] |
72 ##iterate over list of target IDs | 118 ##iterate over list of target IDs |
73 for t in targets: | 119 for t in targets: |
74 target_ID = t.strip('\n') | 120 target_ID = t.strip('\n') |
75 target_annotations = [f for f in rec.features if f.id == target_ID ] | 121 target_annotations = [f for f in rec.features if f.id == target_ID ] |
76 if target_annotations: | 122 if target_annotations: |
77 mytarget = target_annotations[0] | 123 mytarget = target_annotations[0] |
78 #create temporary files | |
79 tempfastaFile = tempfile.mktemp() | |
80 tempproutfile = tempfile.mktemp() | |
81 #just consider slice of sequence in a window of +/- prod_max_size bp | 124 #just consider slice of sequence in a window of +/- prod_max_size bp |
82 ##from feature UNLESS feature is close to end | 125 ##from feature UNLESS feature is close to end |
83 ##Note that slice is zero-based | 126 ##Note that slice is zero-based |
84 featLocation = mytarget.location.start.position | 127 featLocation = mytarget.location.start.position |
85 if featLocation > prod_max_size: | 128 if featLocation > my_args.prod_max_size: |
86 slice_start = featLocation - prod_max_size | 129 slice_start = featLocation - my_args.prod_max_size |
87 featPosition = prod_max_size | 130 featPosition = my_args.prod_max_size |
88 else: | 131 else: |
89 slice_start = 0 | 132 slice_start = 0 |
90 featPosition = featLocation | 133 featPosition = featLocation |
91 if (len(rec) - featLocation) < prod_max_size: | 134 if (len(rec) - featLocation) < my_args.prod_max_size: |
92 slice_end = len(rec) | 135 slice_end = len(rec) |
93 else: | 136 else: |
94 slice_end = featLocation + prod_max_size | 137 slice_end = featLocation + my_args.prod_max_size |
95 ###grab slice of sequence fom this window. | 138 ###grab slice of sequence fom this window. |
96 targetRec = rec[slice_start:slice_end] | 139 targetRec = rec[slice_start:slice_end] |
97 matching_feature = [f for f in targetRec.features if f.id == mytarget.id] | 140 matching_feature = [f for f in targetRec.features if f.id == mytarget.id] |
98 if matching_feature: | 141 if matching_feature: |
99 target_feat = matching_feature[0] | 142 target_feat = matching_feature[0] |
143 my_target_dict={} # re-initialize target dictionary | |
100 if target_feat.location.start.position == 0: | 144 if target_feat.location.start.position == 0: |
101 target_feat.location.start.position = 1 | 145 target_feat.location.start.position = 1 |
102 #we get the mask features by removing the target...all features are masked as just using snp and indels | 146 #get the mask features by removing target...all features are masked as just using snp and indels, a smarter filter could be added |
103 ##a smarter filter could be added | 147 exclude_feat = list(targetRec.features) ##list copy to avoid possible side-effects |
104 ##note use of list copy to avoid possible side-effects | |
105 exclude_feat = list(targetRec.features) | |
106 exclude_feat.remove(target_feat) | 148 exclude_feat.remove(target_feat) |
107 ##print'targetRec.features', targetRec.features ##for debug | 149 excludes_str=' '.join([str(x.location.start.position)+','+str(x.location.end.position -x.location.start.position) for x in exclude_feat]) |
108 mask_str=map(lambda f: str(f.location.start.position+1) + "," + str(f.location.end.position + 1) ,exclude_feat) | 150 my_target_dict={'SEQUENCE_ID' : rec.name, 'SEQUENCE_TEMPLATE': targetRec.seq.tostring(),'SEQUENCE_TARGET': str(target_feat.location.start.position) + ',1','SEQUENCE_INTERNAL_EXCLUDED_REGION': excludes_str} |
109 #mask_str=map(lambda f: str(f.location).strip('[]'),exclude_feat) | 151 my_target_dict.update(def_dict) ##add in defaults |
110 p3_exclude_str = str(mask_str).replace('\', \'',':') | 152 result=P3.run_P3(target_dict=my_target_dict) |
111 p3_target = str(target_feat.location.start.position +1) + "," + str(target_feat.location.end.position +1) | 153 if my_args.run_uMelt: |
112 #write sequence record into template file as fasta | 154 amp_seq=targetRec.seq ##need to make this conditional on getting a result >0 and melt=True |
113 t_output_handle = open(tempfastaFile, "w") | 155 mutamp_seq=targetRec.seq.tomutable() |
114 SeqIO.write([targetRec], t_output_handle, "fasta") | 156 mutamp_seq[target_feat.location.start:target_feat.location.end]=target_feat.qualifiers['Variant_seq'][0] #mutate to variant |
115 t_output_handle.close() | 157 for primerset in result: |
116 #create Primer3Commandline() for emboss tool | 158 amp_start=int(primerset['PRIMER_LEFT'].split(',')[0]) |
117 primer_cl = Primer3Commandline() | 159 amp_end=int(primerset['PRIMER_RIGHT'].split(',')[0]) |
118 #set the emboss tool to suppress output as this will make Galaxy think it is error message although it is a message to state run success | 160 ref_melt_Tm=0 |
119 primer_cl.set_parameter("-auto",'1') | 161 var_melt_Tm=0 |
120 #pass sequence file to emboss | 162 if my_args.run_uMelt: |
121 primer_cl.set_parameter("-sequence",tempfastaFile) | 163 try: |
122 #add target location | 164 ref_melt_Tm=umelts.getTm(umelts.getmelt(amp_seq.tostring()[amp_start:amp_end+1])) |
123 primer_cl.set_parameter("-target", p3_target) | 165 var_melt_Tm=umelts.getTm(umelts.getmelt(mutamp_seq.tostring()[amp_start:amp_end+1])) |
124 ##mask off other features...dumb masking of everything at present, beware | 166 except: |
125 if (p3_exclude_str != ""): | 167 ref_melt_Tm=0 ##preferably something more informative? |
126 primer_cl.set_parameter("-excludedregion", p3_exclude_str) | 168 var_melt_Tm=0 ##exception handling to be added |
127 #add temporary output file to get the result | 169 reference_seq=target_feat.qualifiers['Reference_seq'][0] |
128 primer_cl.set_parameter("-outfile", tempproutfile) | 170 if target_feat.qualifiers.has_key('Variant_seq'): |
129 #specify maximum different of tm | 171 variant_seq=target_feat.qualifiers['Variant_seq'][0] |
130 primer_cl.set_parameter("-maxdifftm",max_tm_diff ) | 172 else: |
131 #other useful parameters | 173 variant_seq="NA" |
132 primer_cl.set_parameter("-ogcpercent", opt_GC_percent) | 174 print mytarget.id, featLocation + 1 ,reference_seq, variant_seq,amp_end-amp_start,primerset['PRIMER_LEFT_SEQUENCE'],primerset['PRIMER_RIGHT_SEQUENCE'], ref_melt_Tm,var_melt_Tm,abs(ref_melt_Tm-var_melt_Tm)#, amp_seq.tostring()[amp_start:amp_end+1], mutamp_seq.tostring()[amp_start:amp_end+1] |
133 primer_cl.set_parameter("-opolyxmax", maxpolyx) | |
134 primer_cl.set_parameter("-osize", optimum_length) | |
135 #set product size range | |
136 primer_cl.set_parameter("-prange", productsizerange) | |
137 #using python subprocess method to run emboss command line programe with the parameters given | |
138 fnull = open(os.devnull, 'w') | |
139 result=subprocess.check_call(str(primer_cl),shell=True ,stdout = fnull, stderr = fnull) | |
140 #read temporary outputfile | |
141 handle = open(tempproutfile) | |
142 record = Primer3.read(handle) | |
143 ##just return first set, if there is one | |
144 if len(record.primers) > 0: | |
145 primer= record.primers[0] | |
146 outputstr=[mytarget.id, primer.forward_seq,primer.reverse_seq,primer.size] | |
147 else: | |
148 outputstr=[mytarget.id,"NONE","NONE","NONE"] | |
149 print('\t'.join(map(str,outputstr))) | |
150 | 175 |
151 | 176 my_args.gff_file.close() |
152 in_gff_handle.close() | 177 my_args.in_file.close() |
153 in_seq_handle.close() | 178 |