comparison trams.py @ 4:bd5692103d5b draft

Uploaded
author rreumerman
date Fri, 05 Apr 2013 05:00:40 -0400
parents
children 8de0ffc2166f
comparison
equal deleted inserted replaced
3:a808647d7312 4:bd5692103d5b
1 #################
2 transl_table = 11
3 intro_message = ''' +------------------------------------------------------------------+
4 | Tool for Rapid Annotation of Microbial SNPs (TRAMS): a simple |
5 | program for rapid annotation of genomic variation in prokaryotes |
6 | |
7 | Developed by: Richard A. Reumerman, Paul R. Herron, |
8 | Paul A. Hoskisson and Vartul Sangal |
9 +------------------------------------------------------------------+\n'''
10 #################
11
12 import sys
13 import time
14 start = time.clock()
15
16 # Command line files: SNP REF REF-TYPE ANNOT OVERL SUM;
17 if len(sys.argv) < 7:
18 exit("\nNot enough arguments given.\nUsage: TRAMS_Galaxy.py [SNP.] [REF.] [ANNOT.] [OVERL.] [SUM.]")
19 try:
20 file_snps = open(sys.argv[1], "rU")
21 except IOError as e:
22 exit("Error trying to open '"+sys.argv[1]+"': {1}".format(e.errno, e.strerror))
23 try:
24 file_ref = open(sys.argv[2], "rU")
25 except IOError as e:
26 exit("Error trying to open '"+sys.argv[2]+"': {1}".format(e.errno, e.strerror))
27
28 filetype_reference = sys.argv[3]
29
30 try:
31 file_out = open(sys.argv[4], "w")
32 except IOError as e:
33 exit("Error trying to open '"+sys.argv[4]+"': {1}".format(e.errno, e.strerror))
34 try:
35 file_overlap = open(sys.argv[5], "w")
36 except IOError as e:
37 exit("Error trying to open '"+sys.argv[5]+"': {1}".format(e.errno, e.strerror))
38 try:
39 file_summary = open(sys.argv[6], "w")
40 except IOError as e:
41 exit("Error trying to open '"+sys.argv[6]+"': {1}".format(e.errno, e.strerror))
42
43 import Bio
44 from Bio import SeqIO, SeqFeature
45 from Bio.SeqRecord import SeqRecord
46 from Bio.Seq import Seq
47 from Bio.Alphabet import generic_dna, IUPAC
48 from Bio.Data import CodonTable
49
50 modules_loaded = time.clock()
51
52 def non_coding_calc(gene, pos = 0):
53 '''This function takes a pseudogene and returns the number of bases
54 located in between the sub-features before 'pos'. Returns 0 if 'pseudo' = False.
55 Input: {start, subfeats, pseudo}, pos (default = 0)'''
56 if not gene['pseudo']: return 0
57
58 non_coding_bases = 0
59 prev_subfeat_end = gene['start']
60 if gene['strand'] == -1:
61 for subfeature in gene['subfeats']:
62 if subfeature.location._start.position < pos:
63 prev_subfeat_end = subfeature.location._end.position
64 continue
65 non_coding_bases += (subfeature.location._start.position - prev_subfeat_end)
66 prev_subfeat_end = subfeature.location._end.position
67 else:
68 for subfeature in gene['subfeats']:
69 non_coding_bases += (subfeature.location._start.position - prev_subfeat_end)
70 prev_subfeat_end = subfeature.location._end.position
71 if prev_subfeat_end >= pos and pos != 0: break
72
73 return non_coding_bases
74
75
76 def region_calc(bounds,length):
77 regions = []
78 lastend=i=0
79 while i < len(bounds):
80 if bounds[i]['start'] > lastend:# Intergenic region present;
81 regions.append([lastend,bounds[i]['start'],-1])
82 lastend = bounds[i]['start']
83 else:
84 regions.append([bounds[i]['start'],bounds[i]['end'],i])
85 if bounds[i]['end'] > lastend:
86 lastend = bounds[i]['end']
87 i += 1
88
89 if regions[-1][1] < length:# Final tail of genome;
90 regions.append([lastend,length,-1])
91
92 return regions
93
94
95 def overlap_calc(bounds):
96 '''This function takes an array of feature starts and ends and
97 returns an array of starts and ends of all overlapping regions.
98 Input: [{start,end}]'''
99 i = 0
100 overlaps = []
101 while i < len(bounds) - 1:
102 for downstr in bounds[i+1:]:
103 if downstr[0] < bounds[i][1]:# Features overlap;
104 if downstr[1] < bounds[i][1]:# Complete overlap;
105 overlaps.append([downstr[0],downstr[1],bounds[i][2],downstr[2],[0,0]])
106 else:# Partial overlap;
107 overlaps.append([downstr[0],bounds[i][1],bounds[i][2],downstr[2],[0,0]])
108 else:# No use looking further;
109 break
110
111 i += 1
112
113 return overlaps
114
115
116 def match_feature(bounds,pos,prev=0):
117 '''This function checks if a position is located inside a feature and
118 returns the feature's number if found or -1 if none is found.
119 Input: {start,end},pos,prev_feat (default = 0)'''
120 for i in range(prev, len(bounds)):
121 if (pos >= bounds[i]['start']) and (pos < bounds[i]['end']):
122 return i
123 elif pos < bounds[i]['start']:# No use looking further
124 return -1
125
126 return -1
127
128
129 def write_output(line,target=file_out):
130 '''This function takes the 2 dimensional array containing all the SNP
131 data. It contains an array of information on the feature and an array
132 for each strain for which SNPs are given.
133 Input: [[pos],[ref],[cells],[cells],etc]'''
134 target.write('\n'+str(line[0][0]))
135 for cell in line[1]:
136 target.write('\t'+str(cell))
137 for strain in line[2:]:
138 target.write('\t')
139 for cell in strain:
140 target.write('\t'+str(cell))
141
142 target.flush()
143
144
145 def new_codon_calc(ref_codon, new_base, pos_in_cod):
146 return str(ref_codon[0:pos_in_cod-1]+new_base+ref_codon[pos_in_cod:len(ref_codon)])
147
148
149 def mut_type_check(ref_res, ref_codon, pos_in_gene, new_base, new_codon):
150 if str(new_codon).lower() == str(ref_codon).lower():
151 return ['','','','']
152 new_residue = Seq(new_codon).translate(table=transl_table)
153 if str(new_residue) == str(ref_res):
154 mut_type = 'synonymous'
155 elif (pos_in_gene / 3) < 1 and str(ref_codon).upper() in CodonTable.unambiguous_dna_by_id[transl_table].start_codons:# position 0,1 or 2 and SNP is in start codon;
156 if str(new_codon).upper() in CodonTable.unambiguous_dna_by_id[transl_table].start_codons: mut_type = 'nonsynonymous'
157 else: mut_type = 'nonstart'
158 elif str(new_residue) == '*': mut_type = 'nonsense'
159 elif str(ref_res) == '*': mut_type = 'nonstop'
160 else: mut_type = 'nonsynonymous'
161
162 return [mut_type,new_base,new_codon,new_residue]
163
164
165 def codon_process(codon):
166 '''This function processes a codon. It loops through it 3 times,
167 once to determine which is the highest position mutated, once to
168 fill in the cells for the output file and once to output all lines.
169 Input: [empty,start_pos,[line1],[line2],[line3],strand]
170 It also uses global variable strain_nr'''
171 lastposition = [-1]*(strain_nr-1)
172 new_codons = ['']*(strain_nr-1)
173 if codon[-1] == -1:# Change codon position order for -1 features;
174 temp = codon [1:-1]
175 temp.reverse()
176 codon[1:-1] = temp
177 for i,line in enumerate(codon[1:-1],1):
178 if line == '': continue
179 for j,strain in enumerate(line[2:]):
180 if strain[1] in ['a','g','c','t']:
181 lastposition[j] = i
182 new_codons[j] = codon[i][1][8]
183
184 for i,line in enumerate(codon[1:-1],1):
185 if codon[-1] == -1: pos_in_cod = 4-i
186 else: pos_in_cod = i
187
188 if line == '': continue
189 for j,strain in enumerate(line[2:]):
190 if i == lastposition[j]: # Check for synonymous etc.;
191 new_codons[j] = new_codon_calc(new_codons[j],strain[1],pos_in_cod)
192 codon[i][j+2] = mut_type_check(line[1][9],line[1][8],codon[0],strain[1],new_codons[j])
193 straininfo[j][codon[i][j+2][0]] += 1# Counting;
194 elif strain[1] in ['a','g','c','t']:
195 codon[i][j+2] = ['MNP',strain[1],'','']
196 straininfo[j]['mnps'] += 1
197 new_codons[j] = new_codon_calc(new_codons[j],strain[1],pos_in_cod)
198 elif strain[0] == 'Allele missing': codon[i][j+2] = strain
199 else: codon[i][j+2] = ['']*4
200
201 for line in codon[1:-1]:
202 if line != '': write_output(line)
203
204 def feature_props(feature):
205 properties = {'type':feature.type,'strand':feature.location._strand,
206 'sequence':feature.extract(seq_record.seq),'pseudo': False,
207 'locus_tag':'','gene_name':'','product':'',
208 'start':int(feature.location._start.position),
209 'end':int(feature.location._end.position)}
210 if 'pseudo' in feature.qualifiers:
211 properties['pseudo'] = True
212 properties['type'] = 'pseudogene'
213 properties['pure_seq'] = properties['sequence']
214 if properties['strand'] == -1:
215 properties['sequence'] = seq_record.seq[feature.location._start.position:feature.location._end.position].reverse_complement()
216 else:
217 properties['sequence'] = seq_record.seq[feature.location._start.position:feature.location._end.position]
218 if feature.sub_features: properties['subfeats'] = feature.sub_features
219 if 'locus_tag' in feature.qualifiers: properties['locus_tag'] = feature.qualifiers['locus_tag'][0]
220 if 'gene' in feature.qualifiers: properties['gene_name']= feature.qualifiers['gene'][0]
221 if feature.type in ['tRNA','rRNA','CDS']: properties['product'] = feature.qualifiers['product'][0]
222
223 return properties
224
225 # Read embl/genbank file for information on sequence features;
226 try:
227 seq_record = SeqIO.parse(file_ref, filetype_reference).next()
228 except:
229 file_ref.close()
230 quit("Error reading "+sys.argv[2]+", please check file for errors.")
231 file_ref.close()
232
233 # Loop through genome features and save relevant properties;
234 feats = []# Dictionary of properties;
235
236 feature_types = {'intergenic':0,'gene':0,'pseudogene':0}
237 feat_temp_store = ''
238 for feature in seq_record.features:
239 # Check if gene is defined as other feature (e.g. CDS). Else, save info from 'gene';
240 if feat_temp_store != '':
241 if (feature.location._start.position == feat_temp_store.location._start.position and
242 feature.location._end.position == feat_temp_store.location._end.position):# Gene also defined as other feature;
243 feat_temp_store = ''
244 else:# Gene not also defined as CDS;
245 feats.append(feature_props(feat_temp_store))
246 feat_temp_store = ''
247 elif feature.type == 'gene':
248 feat_temp_store = feature
249
250 if not feature.type in ['source','gene','misc_feature']:
251 if not feature.type in feature_types and feature.type != 'CDS': feature_types[feature.type] = 0
252 feats.append(feature_props(feature))
253
254
255 feat_props = sorted(feats, key=lambda cells:int(cells['start']))
256 feat_boundaries = [{'start':item['start'],'end':item['end']} for item in feat_props]
257 regions = region_calc(feat_boundaries,len(seq_record.seq))
258 feat_overlap = overlap_calc(regions)
259
260 reference_loaded = time.clock()
261
262 # Create array of SNPs from input file for processing;
263 lines = [line.split('\t') for line in file_snps if line.strip()]
264 file_snps.close()
265 # First line contains headers, extract number of strains etc;
266 headers = lines[0]
267 snp_table = sorted(lines[1:], key=lambda cells:int(cells[0]))
268
269 snps_loaded = time.clock()
270
271 # Print output file headers;
272 headers[-1] = headers[-1].rstrip()# Remove newline character;
273 strain_nr = len(headers)-1
274 strains_found = 'Found '+str(strain_nr)+' strains: '+headers[1]+' (reference)'
275 first_line = '\t'+headers[1]+'\t'*9
276 second_line = 'Position\tFeature\tLocus tag\tGene\tProduct\tStart\tEnd\tStrand\tRef. base\tRef. codon\tRef. res.'
277 straininfo = [0]*(len(headers[2:]))
278 for i,strain in enumerate(headers[2:]):
279 straininfo[i] = {'snps':0,'mnps':0,'synonymous':0,'nonsynonymous':0,'nonstart':0,'nonstop':0,'nonsense':0}
280 straininfo[i].update(feature_types)
281 strains_found += ', '+strain
282 first_line += '\t\t'+strain+'\t'*3
283 second_line += '\t\tSNP type\tNew base\tNew codon\tNew res.'
284
285 file_out.write(first_line+'\n'+second_line)
286 file_out.flush()
287
288 # Loop through SNPs from array and process them;
289 props = {}# Properties of a feature;
290 prev_snp = ''# Position of previous SNP;
291 to_write = []# Information of current SNP;
292 compl_bases = {'a':'t','t':'a','g':'c','c':'g'}
293 firstsnp = True# First snp of region, or of codon in cases of 3 positions in codon mutated;
294 prev_start=j=k=0
295 overlap_snps = []
296 codon = ['']*5# Array of codon positions. First item is position of first base of codon in the gene;
297
298 for region in regions:
299 firstsnp = True
300 i = prev_start
301 while i < len(snp_table):# Loop through SNPs
302 snp_entry = snp_table[i]
303 if not str(snp_entry[0]).isdigit():# Not a valid line, skip;
304 i += 1
305 continue
306
307 pos = int(snp_entry[0])-1
308 if pos < region[0]:# Not inside region yet;
309 i += 1
310 continue
311 elif firstsnp and pos < region[1]:
312 prev_start = i
313 elif pos >= region[1]:# End of region, process and next;
314 if not firstsnp and codon != ['','','','','']:
315 codon_process(codon)
316 break
317
318 # Documentation of SNPs in feature overlaps;
319 while j < len(feat_overlap)-1 and pos > feat_overlap[j][1]: j += 1
320 k = j
321 while k < len(feat_overlap) and pos >= feat_overlap[k][0]:
322 if pos < feat_overlap[k][1]:
323 if feat_overlap[k][4][0] == 0:
324 feat_overlap[k][4][0] = pos
325 feat_overlap[k][4][1] = pos
326 k += 1
327
328
329 snp_entry[-1] = snp_entry[-1].rstrip()# Remove newline character at end of line;
330
331 # Prevent crash upon encounter nonstandard base character;
332 if snp_entry[1] not in ['a','g','c','t']:# Ref base
333 i += 1
334 continue
335 for m,base in enumerate(snp_entry[2:],2):
336 if base.lower() not in ['a','g','c','t']:
337 snp_entry[m] = ''
338 # Crash prevented
339
340 mnp=in_feat=False
341 snp_feat = region[2]
342 ref_base = snp_entry[1]
343
344 to_write = [[pos+1]]
345
346 # Output feature properties and reference situation;
347 if snp_feat == -1:
348 codon = ['']*5
349 to_write.append(['intergenic','','','','','','',ref_base.upper(),'',''])
350 elif feat_props[snp_feat]['type'] not in ['CDS','gene','pseudogene']:# In feature, but non-coding;
351 codon = ['']*5
352 props = feat_props[snp_feat]
353 if props['strand'] == -1: ref_base = (compl_bases[snp_entry[1].lower()])
354 else: ref_base = snp_entry[1]
355 to_write.append([props['type'],props['locus_tag'],props['gene_name'],
356 props['product'],props['start']+1,props['end'],
357 '',ref_base.upper(),'',''])
358 else:# in CDS/gene feature, check codon etc;
359 props = feat_props[snp_feat]
360 sequence = props['sequence']
361 if props['strand'] == -1:
362 pos_in_gene = props['end'] - pos - 1# Python counting
363 ref_base = (compl_bases[snp_entry[1].lower()])
364 else:
365 pos_in_gene = pos - props['start']# Python counting
366 ref_base = snp_entry[1]
367
368 in_feat = True
369 if props['pseudo'] and 'subfeats' in props:# Pseudogene that needs special attention;
370 in_feat = False
371 subfeat_boundaries = [{'start':item.location._start.position,'end':item.location._end.position}
372 for item in props['subfeats']]
373 snp_subfeat = match_feature(subfeat_boundaries,pos)
374 if snp_subfeat != -1:
375 in_feat = True
376 pos_in_gene -= non_coding_calc({'start':props['start'],'subfeats':props['subfeats'],
377 'pseudo':True,'strand':props['strand']},pos)
378 sequence = props['pure_seq']
379
380 if not in_feat:# In pseudogene non-coding region;
381 codon = ['']*5
382 to_write.append(['non coding',props['locus_tag'],props['gene_name'],props['product'],
383 props['start']+1,props['end'],props['strand'],ref_base.upper(),
384 '',''])
385 else:# In coding region;
386 pos_in_cod = (pos_in_gene+1)%3
387 if pos_in_cod == 0: pos_in_cod = 3# Remainder of division 0 means 3rd place in codon;
388
389 old_codon = sequence[pos_in_gene-pos_in_cod+1:pos_in_gene-pos_in_cod+4].upper()
390 old_residue = old_codon.translate(table=transl_table)
391 to_write.append([props['type'],props['locus_tag'],props['gene_name'],props['product'],
392 props['start']+1,props['end'],props['strand'],ref_base.upper(),
393 old_codon,old_residue])
394
395 if in_feat and not firstsnp and (pos >= prev_snp):# Check if snp is in same codon as previous snp. Position check for overlapping features;
396 if props['strand'] == 1 and (pos - prev_snp + 1) < pos_in_cod:# Same codon (Positive strand);
397 mnp = True
398 elif props['strand'] == -1 and (pos - prev_snp + 1) <= (3 - pos_in_cod):# Same codon (negative strand);
399 mnp = True
400
401 # Process previous codon if not MNP;
402 if in_feat and not mnp:
403 if not firstsnp:
404 codon_process(codon)
405 codon = [pos_in_gene-pos_in_cod+1,'','','',props['strand']]
406
407
408 for l, snp in enumerate(snp_entry[2:]):# Loop through SNPs/strains;
409
410 snp = snp.lower()
411 if snp == '':# Empty cell;
412 to_write.append(['','','',''])
413 continue
414
415 if snp == '-': # Feature not present in this strain;
416 to_write.append(['Allele missing','','',''])
417 continue
418
419 if snp_feat == -1:# Intergenic;
420 if snp == ref_base.lower():
421 to_write.append(['']*4)
422 else:
423 to_write.append(['',snp,'',''])
424 straininfo[l]['intergenic'] += 1
425 straininfo[l]['snps'] += 1
426 continue
427
428 if props['strand'] == -1:
429 snp = compl_bases[snp]
430
431 if snp == ref_base.lower():
432 to_write.append(['']*4)
433 else:
434 to_write.append(['',snp,'',''])
435 straininfo[l]['snps'] += 1
436 if props['type'] != 'CDS':
437 straininfo[l][props['type']] += 1
438
439
440
441 if props['type'] in ['CDS','gene','pseudogene'] and in_feat:
442 codon[pos_in_cod] = to_write
443 else:
444 write_output(to_write)
445
446 if firstsnp: firstsnp = False
447 prev_snp = pos+1
448 i += 1
449
450
451 if codon != ['','','','','']: codon_process(codon)
452
453 file_out.close()
454
455 end = time.clock()
456
457 file_summary.write("\n")
458 file_summary.write(intro_message)
459 file_summary.write('\n'+strains_found+'.\n')
460
461 file_summary.write("\nFinished annotation. Total time: %s s\n\n" % round(end-start,1))
462
463
464 file_overlap.write('SNP start\tSNP end\tFeature 1\tLocus tag\tProduct\t\tFeature 2\tLocus tag\tProduct')
465 for overlap in feat_overlap:
466 if overlap[4] != [0,0]:
467 overlap[4][0]+=1
468 overlap[4][1]+=1
469 if overlap[4][0] == overlap[4][1]: overlap[4][1] = ''
470 write_output([[str(overlap[4][0])],[str(overlap[4][1]),feat_props[overlap[2]]['type'],feat_props[overlap[2]]['locus_tag'],feat_props[overlap[2]]['product']],
471 [feat_props[overlap[3]]['type'],feat_props[overlap[3]]['locus_tag'],feat_props[overlap[3]]['product']]],
472 file_overlap)
473
474
475 for i,strain in enumerate(headers[2:]):
476 file_summary.write("\n")
477 info = straininfo[i]
478 file_summary.write("+ Strain %s:\n" % strain)
479 file_summary.write(" %s SNPs found\n" % info['snps'])
480 file_summary.write(" Number of SNPs found CDS features: %s\n" % (info['mnps']+info['nonstart']+info['nonstop']+info['nonsense']+
481 info['synonymous']+info['nonsynonymous']))
482 file_summary.write(" (of which in pseudogenes: %s)\n" % info['pseudogene'])
483 file_summary.write(" - MNPs: %s\n" % info['mnps'])
484 file_summary.write(" - Synonymous: %s\n" % info['synonymous'])
485 file_summary.write(" - Nonsynonymous: %s\n" % info['nonsynonymous'])
486 file_summary.write(" - Nonsense: %s\n" % info['nonsense'])
487 file_summary.write(" - Nonstart: %s\n" % info['nonstart'])
488 file_summary.write(" - Nonstop: %s\n" % info['nonstop'])
489 file_summary.write(" Intergenic: %s\n" % info['intergenic'])
490
491 for typ in feature_types:
492 if typ not in ['intergenic','pseudogene'] and info[typ] != 0:
493 file_summary.write(" %s: %s\n" % (typ,info[typ]))
494 file_summary.flush()
495
496 file_overlap.close()
497 file_summary.close()