Mercurial > repos > rreumerman > snptools
comparison trams.py @ 4:bd5692103d5b draft
Uploaded
author | rreumerman |
---|---|
date | Fri, 05 Apr 2013 05:00:40 -0400 |
parents | |
children | 8de0ffc2166f |
comparison
equal
deleted
inserted
replaced
3:a808647d7312 | 4:bd5692103d5b |
---|---|
1 ################# | |
2 transl_table = 11 | |
3 intro_message = ''' +------------------------------------------------------------------+ | |
4 | Tool for Rapid Annotation of Microbial SNPs (TRAMS): a simple | | |
5 | program for rapid annotation of genomic variation in prokaryotes | | |
6 | | | |
7 | Developed by: Richard A. Reumerman, Paul R. Herron, | | |
8 | Paul A. Hoskisson and Vartul Sangal | | |
9 +------------------------------------------------------------------+\n''' | |
10 ################# | |
11 | |
12 import sys | |
13 import time | |
14 start = time.clock() | |
15 | |
16 # Command line files: SNP REF REF-TYPE ANNOT OVERL SUM; | |
17 if len(sys.argv) < 7: | |
18 exit("\nNot enough arguments given.\nUsage: TRAMS_Galaxy.py [SNP.] [REF.] [ANNOT.] [OVERL.] [SUM.]") | |
19 try: | |
20 file_snps = open(sys.argv[1], "rU") | |
21 except IOError as e: | |
22 exit("Error trying to open '"+sys.argv[1]+"': {1}".format(e.errno, e.strerror)) | |
23 try: | |
24 file_ref = open(sys.argv[2], "rU") | |
25 except IOError as e: | |
26 exit("Error trying to open '"+sys.argv[2]+"': {1}".format(e.errno, e.strerror)) | |
27 | |
28 filetype_reference = sys.argv[3] | |
29 | |
30 try: | |
31 file_out = open(sys.argv[4], "w") | |
32 except IOError as e: | |
33 exit("Error trying to open '"+sys.argv[4]+"': {1}".format(e.errno, e.strerror)) | |
34 try: | |
35 file_overlap = open(sys.argv[5], "w") | |
36 except IOError as e: | |
37 exit("Error trying to open '"+sys.argv[5]+"': {1}".format(e.errno, e.strerror)) | |
38 try: | |
39 file_summary = open(sys.argv[6], "w") | |
40 except IOError as e: | |
41 exit("Error trying to open '"+sys.argv[6]+"': {1}".format(e.errno, e.strerror)) | |
42 | |
43 import Bio | |
44 from Bio import SeqIO, SeqFeature | |
45 from Bio.SeqRecord import SeqRecord | |
46 from Bio.Seq import Seq | |
47 from Bio.Alphabet import generic_dna, IUPAC | |
48 from Bio.Data import CodonTable | |
49 | |
50 modules_loaded = time.clock() | |
51 | |
52 def non_coding_calc(gene, pos = 0): | |
53 '''This function takes a pseudogene and returns the number of bases | |
54 located in between the sub-features before 'pos'. Returns 0 if 'pseudo' = False. | |
55 Input: {start, subfeats, pseudo}, pos (default = 0)''' | |
56 if not gene['pseudo']: return 0 | |
57 | |
58 non_coding_bases = 0 | |
59 prev_subfeat_end = gene['start'] | |
60 if gene['strand'] == -1: | |
61 for subfeature in gene['subfeats']: | |
62 if subfeature.location._start.position < pos: | |
63 prev_subfeat_end = subfeature.location._end.position | |
64 continue | |
65 non_coding_bases += (subfeature.location._start.position - prev_subfeat_end) | |
66 prev_subfeat_end = subfeature.location._end.position | |
67 else: | |
68 for subfeature in gene['subfeats']: | |
69 non_coding_bases += (subfeature.location._start.position - prev_subfeat_end) | |
70 prev_subfeat_end = subfeature.location._end.position | |
71 if prev_subfeat_end >= pos and pos != 0: break | |
72 | |
73 return non_coding_bases | |
74 | |
75 | |
76 def region_calc(bounds,length): | |
77 regions = [] | |
78 lastend=i=0 | |
79 while i < len(bounds): | |
80 if bounds[i]['start'] > lastend:# Intergenic region present; | |
81 regions.append([lastend,bounds[i]['start'],-1]) | |
82 lastend = bounds[i]['start'] | |
83 else: | |
84 regions.append([bounds[i]['start'],bounds[i]['end'],i]) | |
85 if bounds[i]['end'] > lastend: | |
86 lastend = bounds[i]['end'] | |
87 i += 1 | |
88 | |
89 if regions[-1][1] < length:# Final tail of genome; | |
90 regions.append([lastend,length,-1]) | |
91 | |
92 return regions | |
93 | |
94 | |
95 def overlap_calc(bounds): | |
96 '''This function takes an array of feature starts and ends and | |
97 returns an array of starts and ends of all overlapping regions. | |
98 Input: [{start,end}]''' | |
99 i = 0 | |
100 overlaps = [] | |
101 while i < len(bounds) - 1: | |
102 for downstr in bounds[i+1:]: | |
103 if downstr[0] < bounds[i][1]:# Features overlap; | |
104 if downstr[1] < bounds[i][1]:# Complete overlap; | |
105 overlaps.append([downstr[0],downstr[1],bounds[i][2],downstr[2],[0,0]]) | |
106 else:# Partial overlap; | |
107 overlaps.append([downstr[0],bounds[i][1],bounds[i][2],downstr[2],[0,0]]) | |
108 else:# No use looking further; | |
109 break | |
110 | |
111 i += 1 | |
112 | |
113 return overlaps | |
114 | |
115 | |
116 def match_feature(bounds,pos,prev=0): | |
117 '''This function checks if a position is located inside a feature and | |
118 returns the feature's number if found or -1 if none is found. | |
119 Input: {start,end},pos,prev_feat (default = 0)''' | |
120 for i in range(prev, len(bounds)): | |
121 if (pos >= bounds[i]['start']) and (pos < bounds[i]['end']): | |
122 return i | |
123 elif pos < bounds[i]['start']:# No use looking further | |
124 return -1 | |
125 | |
126 return -1 | |
127 | |
128 | |
129 def write_output(line,target=file_out): | |
130 '''This function takes the 2 dimensional array containing all the SNP | |
131 data. It contains an array of information on the feature and an array | |
132 for each strain for which SNPs are given. | |
133 Input: [[pos],[ref],[cells],[cells],etc]''' | |
134 target.write('\n'+str(line[0][0])) | |
135 for cell in line[1]: | |
136 target.write('\t'+str(cell)) | |
137 for strain in line[2:]: | |
138 target.write('\t') | |
139 for cell in strain: | |
140 target.write('\t'+str(cell)) | |
141 | |
142 target.flush() | |
143 | |
144 | |
145 def new_codon_calc(ref_codon, new_base, pos_in_cod): | |
146 return str(ref_codon[0:pos_in_cod-1]+new_base+ref_codon[pos_in_cod:len(ref_codon)]) | |
147 | |
148 | |
149 def mut_type_check(ref_res, ref_codon, pos_in_gene, new_base, new_codon): | |
150 if str(new_codon).lower() == str(ref_codon).lower(): | |
151 return ['','','',''] | |
152 new_residue = Seq(new_codon).translate(table=transl_table) | |
153 if str(new_residue) == str(ref_res): | |
154 mut_type = 'synonymous' | |
155 elif (pos_in_gene / 3) < 1 and str(ref_codon).upper() in CodonTable.unambiguous_dna_by_id[transl_table].start_codons:# position 0,1 or 2 and SNP is in start codon; | |
156 if str(new_codon).upper() in CodonTable.unambiguous_dna_by_id[transl_table].start_codons: mut_type = 'nonsynonymous' | |
157 else: mut_type = 'nonstart' | |
158 elif str(new_residue) == '*': mut_type = 'nonsense' | |
159 elif str(ref_res) == '*': mut_type = 'nonstop' | |
160 else: mut_type = 'nonsynonymous' | |
161 | |
162 return [mut_type,new_base,new_codon,new_residue] | |
163 | |
164 | |
165 def codon_process(codon): | |
166 '''This function processes a codon. It loops through it 3 times, | |
167 once to determine which is the highest position mutated, once to | |
168 fill in the cells for the output file and once to output all lines. | |
169 Input: [empty,start_pos,[line1],[line2],[line3],strand] | |
170 It also uses global variable strain_nr''' | |
171 lastposition = [-1]*(strain_nr-1) | |
172 new_codons = ['']*(strain_nr-1) | |
173 if codon[-1] == -1:# Change codon position order for -1 features; | |
174 temp = codon [1:-1] | |
175 temp.reverse() | |
176 codon[1:-1] = temp | |
177 for i,line in enumerate(codon[1:-1],1): | |
178 if line == '': continue | |
179 for j,strain in enumerate(line[2:]): | |
180 if strain[1] in ['a','g','c','t']: | |
181 lastposition[j] = i | |
182 new_codons[j] = codon[i][1][8] | |
183 | |
184 for i,line in enumerate(codon[1:-1],1): | |
185 if codon[-1] == -1: pos_in_cod = 4-i | |
186 else: pos_in_cod = i | |
187 | |
188 if line == '': continue | |
189 for j,strain in enumerate(line[2:]): | |
190 if i == lastposition[j]: # Check for synonymous etc.; | |
191 new_codons[j] = new_codon_calc(new_codons[j],strain[1],pos_in_cod) | |
192 codon[i][j+2] = mut_type_check(line[1][9],line[1][8],codon[0],strain[1],new_codons[j]) | |
193 straininfo[j][codon[i][j+2][0]] += 1# Counting; | |
194 elif strain[1] in ['a','g','c','t']: | |
195 codon[i][j+2] = ['MNP',strain[1],'',''] | |
196 straininfo[j]['mnps'] += 1 | |
197 new_codons[j] = new_codon_calc(new_codons[j],strain[1],pos_in_cod) | |
198 elif strain[0] == 'Allele missing': codon[i][j+2] = strain | |
199 else: codon[i][j+2] = ['']*4 | |
200 | |
201 for line in codon[1:-1]: | |
202 if line != '': write_output(line) | |
203 | |
204 def feature_props(feature): | |
205 properties = {'type':feature.type,'strand':feature.location._strand, | |
206 'sequence':feature.extract(seq_record.seq),'pseudo': False, | |
207 'locus_tag':'','gene_name':'','product':'', | |
208 'start':int(feature.location._start.position), | |
209 'end':int(feature.location._end.position)} | |
210 if 'pseudo' in feature.qualifiers: | |
211 properties['pseudo'] = True | |
212 properties['type'] = 'pseudogene' | |
213 properties['pure_seq'] = properties['sequence'] | |
214 if properties['strand'] == -1: | |
215 properties['sequence'] = seq_record.seq[feature.location._start.position:feature.location._end.position].reverse_complement() | |
216 else: | |
217 properties['sequence'] = seq_record.seq[feature.location._start.position:feature.location._end.position] | |
218 if feature.sub_features: properties['subfeats'] = feature.sub_features | |
219 if 'locus_tag' in feature.qualifiers: properties['locus_tag'] = feature.qualifiers['locus_tag'][0] | |
220 if 'gene' in feature.qualifiers: properties['gene_name']= feature.qualifiers['gene'][0] | |
221 if feature.type in ['tRNA','rRNA','CDS']: properties['product'] = feature.qualifiers['product'][0] | |
222 | |
223 return properties | |
224 | |
225 # Read embl/genbank file for information on sequence features; | |
226 try: | |
227 seq_record = SeqIO.parse(file_ref, filetype_reference).next() | |
228 except: | |
229 file_ref.close() | |
230 quit("Error reading "+sys.argv[2]+", please check file for errors.") | |
231 file_ref.close() | |
232 | |
233 # Loop through genome features and save relevant properties; | |
234 feats = []# Dictionary of properties; | |
235 | |
236 feature_types = {'intergenic':0,'gene':0,'pseudogene':0} | |
237 feat_temp_store = '' | |
238 for feature in seq_record.features: | |
239 # Check if gene is defined as other feature (e.g. CDS). Else, save info from 'gene'; | |
240 if feat_temp_store != '': | |
241 if (feature.location._start.position == feat_temp_store.location._start.position and | |
242 feature.location._end.position == feat_temp_store.location._end.position):# Gene also defined as other feature; | |
243 feat_temp_store = '' | |
244 else:# Gene not also defined as CDS; | |
245 feats.append(feature_props(feat_temp_store)) | |
246 feat_temp_store = '' | |
247 elif feature.type == 'gene': | |
248 feat_temp_store = feature | |
249 | |
250 if not feature.type in ['source','gene','misc_feature']: | |
251 if not feature.type in feature_types and feature.type != 'CDS': feature_types[feature.type] = 0 | |
252 feats.append(feature_props(feature)) | |
253 | |
254 | |
255 feat_props = sorted(feats, key=lambda cells:int(cells['start'])) | |
256 feat_boundaries = [{'start':item['start'],'end':item['end']} for item in feat_props] | |
257 regions = region_calc(feat_boundaries,len(seq_record.seq)) | |
258 feat_overlap = overlap_calc(regions) | |
259 | |
260 reference_loaded = time.clock() | |
261 | |
262 # Create array of SNPs from input file for processing; | |
263 lines = [line.split('\t') for line in file_snps if line.strip()] | |
264 file_snps.close() | |
265 # First line contains headers, extract number of strains etc; | |
266 headers = lines[0] | |
267 snp_table = sorted(lines[1:], key=lambda cells:int(cells[0])) | |
268 | |
269 snps_loaded = time.clock() | |
270 | |
271 # Print output file headers; | |
272 headers[-1] = headers[-1].rstrip()# Remove newline character; | |
273 strain_nr = len(headers)-1 | |
274 strains_found = 'Found '+str(strain_nr)+' strains: '+headers[1]+' (reference)' | |
275 first_line = '\t'+headers[1]+'\t'*9 | |
276 second_line = 'Position\tFeature\tLocus tag\tGene\tProduct\tStart\tEnd\tStrand\tRef. base\tRef. codon\tRef. res.' | |
277 straininfo = [0]*(len(headers[2:])) | |
278 for i,strain in enumerate(headers[2:]): | |
279 straininfo[i] = {'snps':0,'mnps':0,'synonymous':0,'nonsynonymous':0,'nonstart':0,'nonstop':0,'nonsense':0} | |
280 straininfo[i].update(feature_types) | |
281 strains_found += ', '+strain | |
282 first_line += '\t\t'+strain+'\t'*3 | |
283 second_line += '\t\tSNP type\tNew base\tNew codon\tNew res.' | |
284 | |
285 file_out.write(first_line+'\n'+second_line) | |
286 file_out.flush() | |
287 | |
288 # Loop through SNPs from array and process them; | |
289 props = {}# Properties of a feature; | |
290 prev_snp = ''# Position of previous SNP; | |
291 to_write = []# Information of current SNP; | |
292 compl_bases = {'a':'t','t':'a','g':'c','c':'g'} | |
293 firstsnp = True# First snp of region, or of codon in cases of 3 positions in codon mutated; | |
294 prev_start=j=k=0 | |
295 overlap_snps = [] | |
296 codon = ['']*5# Array of codon positions. First item is position of first base of codon in the gene; | |
297 | |
298 for region in regions: | |
299 firstsnp = True | |
300 i = prev_start | |
301 while i < len(snp_table):# Loop through SNPs | |
302 snp_entry = snp_table[i] | |
303 if not str(snp_entry[0]).isdigit():# Not a valid line, skip; | |
304 i += 1 | |
305 continue | |
306 | |
307 pos = int(snp_entry[0])-1 | |
308 if pos < region[0]:# Not inside region yet; | |
309 i += 1 | |
310 continue | |
311 elif firstsnp and pos < region[1]: | |
312 prev_start = i | |
313 elif pos >= region[1]:# End of region, process and next; | |
314 if not firstsnp and codon != ['','','','','']: | |
315 codon_process(codon) | |
316 break | |
317 | |
318 # Documentation of SNPs in feature overlaps; | |
319 while j < len(feat_overlap)-1 and pos > feat_overlap[j][1]: j += 1 | |
320 k = j | |
321 while k < len(feat_overlap) and pos >= feat_overlap[k][0]: | |
322 if pos < feat_overlap[k][1]: | |
323 if feat_overlap[k][4][0] == 0: | |
324 feat_overlap[k][4][0] = pos | |
325 feat_overlap[k][4][1] = pos | |
326 k += 1 | |
327 | |
328 | |
329 snp_entry[-1] = snp_entry[-1].rstrip()# Remove newline character at end of line; | |
330 | |
331 # Prevent crash upon encounter nonstandard base character; | |
332 if snp_entry[1] not in ['a','g','c','t']:# Ref base | |
333 i += 1 | |
334 continue | |
335 for m,base in enumerate(snp_entry[2:],2): | |
336 if base.lower() not in ['a','g','c','t']: | |
337 snp_entry[m] = '' | |
338 # Crash prevented | |
339 | |
340 mnp=in_feat=False | |
341 snp_feat = region[2] | |
342 ref_base = snp_entry[1] | |
343 | |
344 to_write = [[pos+1]] | |
345 | |
346 # Output feature properties and reference situation; | |
347 if snp_feat == -1: | |
348 codon = ['']*5 | |
349 to_write.append(['intergenic','','','','','','',ref_base.upper(),'','']) | |
350 elif feat_props[snp_feat]['type'] not in ['CDS','gene','pseudogene']:# In feature, but non-coding; | |
351 codon = ['']*5 | |
352 props = feat_props[snp_feat] | |
353 if props['strand'] == -1: ref_base = (compl_bases[snp_entry[1].lower()]) | |
354 else: ref_base = snp_entry[1] | |
355 to_write.append([props['type'],props['locus_tag'],props['gene_name'], | |
356 props['product'],props['start']+1,props['end'], | |
357 '',ref_base.upper(),'','']) | |
358 else:# in CDS/gene feature, check codon etc; | |
359 props = feat_props[snp_feat] | |
360 sequence = props['sequence'] | |
361 if props['strand'] == -1: | |
362 pos_in_gene = props['end'] - pos - 1# Python counting | |
363 ref_base = (compl_bases[snp_entry[1].lower()]) | |
364 else: | |
365 pos_in_gene = pos - props['start']# Python counting | |
366 ref_base = snp_entry[1] | |
367 | |
368 in_feat = True | |
369 if props['pseudo'] and 'subfeats' in props:# Pseudogene that needs special attention; | |
370 in_feat = False | |
371 subfeat_boundaries = [{'start':item.location._start.position,'end':item.location._end.position} | |
372 for item in props['subfeats']] | |
373 snp_subfeat = match_feature(subfeat_boundaries,pos) | |
374 if snp_subfeat != -1: | |
375 in_feat = True | |
376 pos_in_gene -= non_coding_calc({'start':props['start'],'subfeats':props['subfeats'], | |
377 'pseudo':True,'strand':props['strand']},pos) | |
378 sequence = props['pure_seq'] | |
379 | |
380 if not in_feat:# In pseudogene non-coding region; | |
381 codon = ['']*5 | |
382 to_write.append(['non coding',props['locus_tag'],props['gene_name'],props['product'], | |
383 props['start']+1,props['end'],props['strand'],ref_base.upper(), | |
384 '','']) | |
385 else:# In coding region; | |
386 pos_in_cod = (pos_in_gene+1)%3 | |
387 if pos_in_cod == 0: pos_in_cod = 3# Remainder of division 0 means 3rd place in codon; | |
388 | |
389 old_codon = sequence[pos_in_gene-pos_in_cod+1:pos_in_gene-pos_in_cod+4].upper() | |
390 old_residue = old_codon.translate(table=transl_table) | |
391 to_write.append([props['type'],props['locus_tag'],props['gene_name'],props['product'], | |
392 props['start']+1,props['end'],props['strand'],ref_base.upper(), | |
393 old_codon,old_residue]) | |
394 | |
395 if in_feat and not firstsnp and (pos >= prev_snp):# Check if snp is in same codon as previous snp. Position check for overlapping features; | |
396 if props['strand'] == 1 and (pos - prev_snp + 1) < pos_in_cod:# Same codon (Positive strand); | |
397 mnp = True | |
398 elif props['strand'] == -1 and (pos - prev_snp + 1) <= (3 - pos_in_cod):# Same codon (negative strand); | |
399 mnp = True | |
400 | |
401 # Process previous codon if not MNP; | |
402 if in_feat and not mnp: | |
403 if not firstsnp: | |
404 codon_process(codon) | |
405 codon = [pos_in_gene-pos_in_cod+1,'','','',props['strand']] | |
406 | |
407 | |
408 for l, snp in enumerate(snp_entry[2:]):# Loop through SNPs/strains; | |
409 | |
410 snp = snp.lower() | |
411 if snp == '':# Empty cell; | |
412 to_write.append(['','','','']) | |
413 continue | |
414 | |
415 if snp == '-': # Feature not present in this strain; | |
416 to_write.append(['Allele missing','','','']) | |
417 continue | |
418 | |
419 if snp_feat == -1:# Intergenic; | |
420 if snp == ref_base.lower(): | |
421 to_write.append(['']*4) | |
422 else: | |
423 to_write.append(['',snp,'','']) | |
424 straininfo[l]['intergenic'] += 1 | |
425 straininfo[l]['snps'] += 1 | |
426 continue | |
427 | |
428 if props['strand'] == -1: | |
429 snp = compl_bases[snp] | |
430 | |
431 if snp == ref_base.lower(): | |
432 to_write.append(['']*4) | |
433 else: | |
434 to_write.append(['',snp,'','']) | |
435 straininfo[l]['snps'] += 1 | |
436 if props['type'] != 'CDS': | |
437 straininfo[l][props['type']] += 1 | |
438 | |
439 | |
440 | |
441 if props['type'] in ['CDS','gene','pseudogene'] and in_feat: | |
442 codon[pos_in_cod] = to_write | |
443 else: | |
444 write_output(to_write) | |
445 | |
446 if firstsnp: firstsnp = False | |
447 prev_snp = pos+1 | |
448 i += 1 | |
449 | |
450 | |
451 if codon != ['','','','','']: codon_process(codon) | |
452 | |
453 file_out.close() | |
454 | |
455 end = time.clock() | |
456 | |
457 file_summary.write("\n") | |
458 file_summary.write(intro_message) | |
459 file_summary.write('\n'+strains_found+'.\n') | |
460 | |
461 file_summary.write("\nFinished annotation. Total time: %s s\n\n" % round(end-start,1)) | |
462 | |
463 | |
464 file_overlap.write('SNP start\tSNP end\tFeature 1\tLocus tag\tProduct\t\tFeature 2\tLocus tag\tProduct') | |
465 for overlap in feat_overlap: | |
466 if overlap[4] != [0,0]: | |
467 overlap[4][0]+=1 | |
468 overlap[4][1]+=1 | |
469 if overlap[4][0] == overlap[4][1]: overlap[4][1] = '' | |
470 write_output([[str(overlap[4][0])],[str(overlap[4][1]),feat_props[overlap[2]]['type'],feat_props[overlap[2]]['locus_tag'],feat_props[overlap[2]]['product']], | |
471 [feat_props[overlap[3]]['type'],feat_props[overlap[3]]['locus_tag'],feat_props[overlap[3]]['product']]], | |
472 file_overlap) | |
473 | |
474 | |
475 for i,strain in enumerate(headers[2:]): | |
476 file_summary.write("\n") | |
477 info = straininfo[i] | |
478 file_summary.write("+ Strain %s:\n" % strain) | |
479 file_summary.write(" %s SNPs found\n" % info['snps']) | |
480 file_summary.write(" Number of SNPs found CDS features: %s\n" % (info['mnps']+info['nonstart']+info['nonstop']+info['nonsense']+ | |
481 info['synonymous']+info['nonsynonymous'])) | |
482 file_summary.write(" (of which in pseudogenes: %s)\n" % info['pseudogene']) | |
483 file_summary.write(" - MNPs: %s\n" % info['mnps']) | |
484 file_summary.write(" - Synonymous: %s\n" % info['synonymous']) | |
485 file_summary.write(" - Nonsynonymous: %s\n" % info['nonsynonymous']) | |
486 file_summary.write(" - Nonsense: %s\n" % info['nonsense']) | |
487 file_summary.write(" - Nonstart: %s\n" % info['nonstart']) | |
488 file_summary.write(" - Nonstop: %s\n" % info['nonstop']) | |
489 file_summary.write(" Intergenic: %s\n" % info['intergenic']) | |
490 | |
491 for typ in feature_types: | |
492 if typ not in ['intergenic','pseudogene'] and info[typ] != 0: | |
493 file_summary.write(" %s: %s\n" % (typ,info[typ])) | |
494 file_summary.flush() | |
495 | |
496 file_overlap.close() | |
497 file_summary.close() |