0
|
1 #!/bin/env python3
|
|
2 # Written by Devon Gregory with assistance from Christopher Bottoms
|
|
3 # University of Missouri
|
|
4 # Distributed under GNU GENERAL PUBLIC LICENSE v3
|
|
5 import os
|
|
6 import sys
|
|
7 import argparse
|
|
8 import itertools
|
|
9 import time
|
|
10 from multiprocessing import Process
|
|
11 from pathlib import Path
|
|
12
|
|
13 # process for collecting command line arguments
|
|
def arg_parser():
    """Collect and validate command line arguments.

    Returns:
        argparse.Namespace: the parsed arguments.  When WGS mode is
        enabled, the chimera removal methods (covar deconvolution and
        chim_rm) are forced off because they are incompatible with
        whole-genome processing.
    """
    parser = argparse.ArgumentParser(
        description='process Sam files for variant information'
    )

    parser.add_argument(
        '--samp',
        type=str,
        default='Sample',
        help='Sample Name'
    )
    # NOTE(review): '-reference' is a single-dash long option; kept as-is
    # for backward compatibility with existing command lines.
    parser.add_argument(
        '-r', '-reference',
        type=argparse.FileType('r'),
        dest='ref',
        help='reference fasta'
    )
    parser.add_argument(
        '-S', '--Sam_file',
        type=argparse.FileType('r'),
        dest='Sam_file',
        help='.sam file'
    )
    parser.add_argument(
        '-o1', '--out_file1',
        dest='outfile1',
        help='output file'
    )
    parser.add_argument(
        '-o2', '--out_file2',
        dest='outfile2',
        help='output file'
    )
    parser.add_argument(
        '-o3', '--out_file3',
        dest='outfile3',
        help='output file'
    )
    parser.add_argument(
        '-o4', '--out_file4',
        dest='outfile4',
        help='output file'
    )
    parser.add_argument(
        '-o5', '--out_file5',
        dest='outfile5',
        help='output file'
    )
    parser.add_argument(
        '-o6', '--out_file6',
        dest='outfile6',
        help='output file'
    )
    parser.add_argument(
        '-o7', '--out_file7',
        dest='outfile7',
        help='output file'
    )
    parser.add_argument(
        '--use_count',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) use of counts in sequence IDs, default enabled (--use_count 1)'
    )
    parser.add_argument(
        '--min_count',
        type=int,
        default=10,
        help='Minimum observation count required to be included in sample reports (default: 10)'
    )
    parser.add_argument(
        '--min_samp_abund',
        type=float,
        default=0.001,
        help='Minimum abundance required for inclusion in sample reports; %% observed (.1 = 10%%), (default: .001)'
    )
    parser.add_argument(
        '--ntabund',
        type=float,
        default=.001,
        help='Minimum abundance relative to total reads required for a position to be reported in the nt call output; must be non-negative and < 1, %% observed (.1 = 10%%), (default: .001)'
    )
    parser.add_argument(
        '--max_dist',
        type=int,
        default=40,
        help='Maximum number of variances from the reference a sequence can have to be consider in covars processing (default: 40)'
    )
    parser.add_argument(
        '--max_covar',
        type=int,
        default=8,
        help='Maximum number of variances from the reference to be reported in covars (default: 8)'
    )
    parser.add_argument(
        '--AAreport',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) amino acid reporting, default enabled (--AAreport 1)'
    )
    parser.add_argument(
        '--AAcodonasMNP',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) reporting multiple nt changes in a single codon as one polymorphism, default enabled (--AAcodonasMNP 1), requires AAreport enabled'
    )
    parser.add_argument(
        '--alpha',
        type=float,
        default=1.2,
        help='Modifier for chim_rm chimera checking, default 1.2. Higher = more sensitive, more false chimeras removed; lower = less sensitive, fewer chimeras removed'
    )
    parser.add_argument(
        '--foldab',
        type=float,
        default=1.8,
        help='Threshold for potential parent / chimera abundance ratio for chim_rm; default is 1.8'
    )
    parser.add_argument(
        '--redist',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) redistribution of chimera counts for chim_rm, default enabled (--redist 1)'
    )
    parser.add_argument(
        '--beta',
        type=float,
        default=1,
        help='Modifier for covar pass checking, default 1. Higher = more sensitive, more failed checks; lower = less sensitive, fewer failed checks'
    )
    parser.add_argument(
        '--autopass',
        type=float,
        default=.3,
        help='threshold for a sequence to automatically pass the covar pass checking'
    )
    parser.add_argument(
        '--nt_call',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) nt_call output, default enabled (--nt_call 1)'
    )
    parser.add_argument(
        '--ntvar',
        type=int,
        default=0,
        choices=[0, 1],
        help='Enable/Disable (1/0) nt_var output, default disabled (--ntvar 0)'
    )
    parser.add_argument(
        '--indel',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) indel output, default enabled (--indel 1)'
    )
    parser.add_argument(
        '--seq',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) unique seq output, default enabled (--seq 1)'
    )
    parser.add_argument(
        '--covar', type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) covar output, default enabled (--covar 1)'
    )
    parser.add_argument(
        '--chim_rm',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) chim_rm output, default enabled (--chim_rm 1)'
    )
    parser.add_argument(
        '--deconv',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) covar deconv, default enabled (--deconv 1)'
    )
    parser.add_argument(
        '--wgs',
        type=int,
        default=0,
        choices=[0, 1],
        help='Enable/Disable (1/0) whole genome sequencing mode, default disabled (--wgs 0)'
    )

    args = parser.parse_args()

    # checking for proper range of some parameters and consistency/compatibility
    # WGS mode is incompatible with the chimera removal methods, so they
    # are forced off with a warning.
    if args.wgs == 1:
        if args.deconv == 1 or args.chim_rm == 1:
            args.deconv = 0
            args.chim_rm = 0
            print('WGS mode enabled, disabling chimera removal methods')

    return args
|
|
221
|
|
def get_ref(ref):
    """Return (ID, sequence) for the first record of an open FASTA handle.

    Only the first record is read; iteration stops as soon as a second
    header line ('>') is encountered.  The sequence is upper-cased.  If
    *ref* is falsy (e.g. no reference supplied), ('', '') is returned.
    """
    header_count = 0
    ref_id = ''
    seq_parts = []
    if ref:
        for raw_line in ref:
            if raw_line.startswith('>'):
                header_count += 1
                if header_count > 1:
                    break  # second record reached; ignore the rest
                ref_id = raw_line[1:].strip("\n\r")
            elif header_count == 1:
                seq_parts.append(raw_line.strip("\n\r"))

    return (ref_id, ''.join(seq_parts).upper())
|
|
240
|
|
def AAcall(codon):
    """Translate a codon string to its one-letter amino acid code.

    Stop codons map to '*'.  Any string not in the standard codon table
    (wrong length, lowercase, ambiguous bases, the 'XXX' sentinel) yields
    '?'.
    """
    codon_table = {
        'TTT' : 'F',
        'TTC' : 'F',
        'TTA' : 'L',
        'TTG' : 'L',
        'TCT' : 'S',
        'TCC' : 'S',
        'TCA' : 'S',
        'TCG' : 'S',
        'TAT' : 'Y',
        'TAC' : 'Y',
        'TAA' : '*',
        'TAG' : '*',
        'TGT' : 'C',
        'TGC' : 'C',
        'TGA' : '*',
        'TGG' : 'W',

        'CTT' : 'L',
        'CTC' : 'L',
        'CTA' : 'L',
        'CTG' : 'L',
        'CCT' : 'P',
        'CCC' : 'P',
        'CCA' : 'P',
        'CCG' : 'P',
        'CAT' : 'H',
        'CAC' : 'H',
        'CAA' : 'Q',
        'CAG' : 'Q',
        'CGT' : 'R',
        'CGC' : 'R',
        'CGA' : 'R',
        'CGG' : 'R',

        'ATT' : 'I',
        'ATC' : 'I',
        'ATA' : 'I',
        'ATG' : 'M',
        'ACT' : 'T',
        'ACC' : 'T',
        'ACA' : 'T',
        'ACG' : 'T',
        'AAT' : 'N',
        'AAC' : 'N',
        'AAA' : 'K',
        'AAG' : 'K',
        'AGT' : 'S',
        'AGC' : 'S',
        'AGA' : 'R',
        'AGG' : 'R',

        'GTT' : 'V',
        'GTC' : 'V',
        'GTA' : 'V',
        'GTG' : 'V',
        'GCT' : 'A',
        'GCC' : 'A',
        'GCA' : 'A',
        'GCG' : 'A',
        'GAT' : 'D',
        'GAC' : 'D',
        'GAA' : 'E',
        'GAG' : 'E',
        'GGT' : 'G',
        'GGC' : 'G',
        'GGA' : 'G',
        'GGG' : 'G'
    }
    return codon_table.get(codon, '?')
|
|
316
|
|
def singletCodon(ntPOS, nt, ref):
    """Return the (1-based AA position, amino acid) for a single nt change.

    Substitutes nucleotide *nt* at 1-based reference position *ntPOS* and
    fills the other two codon positions from the reference sequence.
    *ref* is a (refID, refseq) tuple as returned by get_ref.

    If the codon would extend past the end of the reference sequence, the
    sentinel codon "XXX" is used, which AAcall reports as '?'.
    """
    AAPOS = (ntPOS-1)//3   # 0-based codon index in the protein
    AAmod = (ntPOS-1)%3    # offset of the changed nt within its codon
    codon = ""
    try:
        if AAmod == 0:
            codon = nt+ref[1][ntPOS]+ref[1][ntPOS+1]
        elif AAmod == 1:
            codon = ref[1][ntPOS-2]+nt+ref[1][ntPOS]
        elif AAmod == 2:
            codon = ref[1][ntPOS-3]+ref[1][ntPOS-2]+nt
    except IndexError:
        # codon runs off the end of the reference sequence; was a bare
        # except, narrowed so unrelated errors are no longer swallowed
        codon = "XXX"

    return (AAPOS+1, AAcall(codon))
|
|
332
|
|
def getCombos(qlist, clen):
    """Return space-joined combinations of the polymorphisms in *qlist*.

    Produces every combination of size 1 up to *clen*, preserving input
    order.  A *clen* of 0 (or larger than the list) means "up to all of
    them".
    """
    limit = clen
    if limit == 0 or limit > len(qlist):
        limit = len(qlist)
    combos = []
    for size in range(1, limit + 1):
        combos.extend(' '.join(combo) for combo in itertools.combinations(qlist, size))
    return combos
|
|
341
|
|
342 def SAMparse(args, ref, refprot, file): # process SAM files
|
|
343 covar_array = []
|
|
344 seq_array = []
|
|
345
|
|
346 nt_call_dict_dict = {}
|
|
347 indel_dict = {}
|
|
348 seq_species = {}
|
|
349 sam_read_count = 0
|
|
350 sam_line_count = 0
|
|
351 firstPOS = 0
|
|
352 lastPOS = 0
|
|
353 coverage = {}
|
|
354 for line in args.Sam_file:
|
|
355 if not line.startswith('@'): # ignore header lines
|
|
356 splitline = line.split("\t")
|
|
357 if ref[0].upper().startswith(splitline[2].upper()): # check map ID matches referecne ID
|
|
358 if int(splitline[4]) > 0: # Check mapping score is positive
|
|
359
|
|
360 abund_count=1
|
|
361 if args.use_count == 1: # get the unique sequence counts
|
|
362 if '-' in splitline[0] and '=' in splitline[0]:
|
|
363 eq_split = splitline[0].split('=')
|
|
364 dash_split = splitline[0].split('-')
|
|
365 if len(eq_split[-1]) > len(dash_split[-1]):
|
|
366 abund_count=int(dash_split[-1])
|
|
367 else:
|
|
368 abund_count=int(eq_split[-1])
|
|
369
|
|
370 elif '-' in splitline[0]:
|
|
371 try:
|
|
372 abund_count=int(splitline[0].split('-')[-1])
|
|
373 except:
|
|
374 pass
|
|
375
|
|
376 elif '=' in splitline[0]:
|
|
377 try:
|
|
378 abund_count=int(splitline[0].split('=')[-1])
|
|
379 except:
|
|
380 pass
|
|
381
|
|
382
|
|
383 sam_read_count += abund_count
|
|
384
|
|
385 CIGAR = splitline[5]
|
|
386 POS = int(splitline[3])
|
|
387 if firstPOS == 0:
|
|
388 firstPOS = POS
|
|
389 elif POS < firstPOS:
|
|
390 firstPOS = POS
|
|
391
|
|
392 readID = splitline[0]
|
|
393 query_seq = splitline[9].upper()
|
|
394 run_length = 0
|
|
395 query_seq_parsed = ''
|
|
396 query_pos = 0
|
|
397 q_pars_pos = 0
|
|
398 mutations = []
|
|
399
|
|
400 for C in CIGAR: # process sequence based on standard CIGAR line
|
|
401 if C == 'M' or C == 'I' or C == 'D' or C == 'S' or C == 'H':
|
|
402 if C == 'S':
|
|
403 query_pos = query_pos + run_length
|
|
404
|
|
405 if C == 'I':
|
|
406 if query_pos > 0:
|
|
407 # add insertion to dict
|
|
408 iPOS = q_pars_pos+POS
|
|
409
|
|
410 iSeq = query_seq[query_pos: query_pos+run_length]
|
|
411 istring = str(iPOS)+'-insert'+iSeq
|
|
412
|
|
413 try:
|
|
414 indel_dict[istring]
|
|
415 except:
|
|
416 indel_dict[istring] = abund_count
|
|
417 else:
|
|
418 indel_dict[istring] += abund_count
|
|
419
|
|
420 if args.AAreport == 1 and (run_length % 3 == 0):
|
|
421
|
|
422 iProt = ''
|
|
423 if iPOS % 3 == 1:
|
|
424 for x in range(0, (run_length//3)):
|
|
425 AA = AAcall(iSeq[x*3]+iSeq[x*3+1]+iSeq[x*3+2])
|
|
426 iProt = iProt + AA
|
|
427 mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
|
|
428 elif iPOS % 3 == 2:
|
|
429 if query_pos > 0:
|
|
430 ipSeq = query_seq[query_pos-1:query_pos+run_length+2]
|
|
431 else:
|
|
432 ipSeq = "XXX"+query_seq[query_pos+2:query_pos+run_length+2]
|
|
433 for x in range(0, (run_length//3)+1):
|
|
434 AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2])
|
|
435 iProt = iProt + AA
|
|
436 mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
|
|
437 else:
|
|
438 if query_pos > 1:
|
|
439 ipSeq = query_seq[query_pos-2:query_pos+run_length+1]
|
|
440 else:
|
|
441 ipSeq = "XXX"+query_seq[query_pos+1:query_pos+run_length+1]
|
|
442
|
|
443 for x in range(0, (run_length//3)+1):
|
|
444 AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2])
|
|
445 iProt = iProt + AA
|
|
446 mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
|
|
447 else:
|
|
448 mutations.append(istring)
|
|
449
|
|
450 query_pos = query_pos + run_length
|
|
451
|
|
452 elif C == 'D':
|
|
453 for X in range(0, run_length):
|
|
454 query_seq_parsed += '-'
|
|
455
|
|
456 delstring = str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'
|
|
457
|
|
458 if args.AAreport == 1 and (run_length % 3 == 0) and not ((q_pars_pos+POS) % 3 == 1 ):
|
|
459 if (q_pars_pos+POS) % 3 == 2:
|
|
460 newcodon = query_seq[query_pos-1:query_pos+2]
|
|
461 newAArefpos = (q_pars_pos+POS) // 3
|
|
462 mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')')
|
|
463 else:
|
|
464 newcodon = query_seq[query_pos-2:query_pos+1]
|
|
465 newAArefpos = (q_pars_pos+POS) // 3
|
|
466 mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')')
|
|
467 else:
|
|
468 mutations.append(delstring)
|
|
469
|
|
470 if args.nt_call == 1:
|
|
471 for N in range(q_pars_pos+POS, q_pars_pos+POS+int(run_length)):
|
|
472 try:
|
|
473 nt_call_dict_dict[N]
|
|
474 except:
|
|
475 nt_call_dict_dict[N] = {'A' : 0,
|
|
476 'T' : 0,
|
|
477 'C' : 0,
|
|
478 'G' : 0,
|
|
479 '-' : 0}
|
|
480 nt_call_dict_dict[N]['-'] = abund_count
|
|
481 else:
|
|
482 nt_call_dict_dict[N]['-'] += abund_count
|
|
483
|
|
484 try:
|
|
485 indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del']
|
|
486 except:
|
|
487 indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] = int(abund_count)
|
|
488 else:
|
|
489 indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] += int(abund_count)
|
|
490
|
|
491 q_pars_pos = q_pars_pos + run_length
|
|
492
|
|
493 elif C == 'M':
|
|
494 offset = q_pars_pos-query_pos
|
|
495 refPOS = POS+offset
|
|
496
|
|
497 for ntPOS in range(query_pos, query_pos+run_length):
|
|
498 if query_seq[ntPOS] == 'A' or query_seq[ntPOS] == 'T' or query_seq[ntPOS] == 'C' or query_seq[ntPOS] == 'G':
|
|
499 if query_seq[ntPOS] != ref[1][refPOS+ntPOS-1]:
|
|
500 if args.AAreport == 1 and args.AAcodonasMNP == 0:
|
|
501 AAinfo = singletCodon(refPOS+ntPOS, query_seq[ntPOS], ref)
|
|
502 mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
|
|
503 else:
|
|
504 mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS])
|
|
505 if args.nt_call == 1:
|
|
506 try:
|
|
507 nt_call_dict_dict[refPOS+ntPOS]
|
|
508 except:
|
|
509 nt_call_dict_dict[refPOS+ntPOS] = {'A' : 0,
|
|
510 'T' : 0,
|
|
511 'C' : 0,
|
|
512 'G' : 0,
|
|
513 '-' : 0}
|
|
514 nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] = abund_count
|
|
515 else:
|
|
516 nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] += abund_count
|
|
517
|
|
518
|
|
519 q_pars_pos = q_pars_pos + run_length
|
|
520 query_pos = query_pos + run_length
|
|
521
|
|
522 run_length = 0
|
|
523
|
|
524
|
|
525 else:
|
|
526 run_length = (10 * run_length) + int(C)
|
|
527 # END CIGAR PARSE
|
|
528
|
|
529
|
|
530
|
|
531 if len(mutations) == 0: # record reference counts
|
|
532 if args.wgs == 0:
|
|
533 try:
|
|
534 seq_species['Reference'] += abund_count
|
|
535 except:
|
|
536 seq_species['Reference'] = abund_count
|
|
537 else:
|
|
538 try:
|
|
539 seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] += abund_count
|
|
540 except:
|
|
541 seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] = abund_count
|
|
542
|
|
543 else: # record variants and counts
|
|
544 if args.AAreport == 1 and args.AAcodonasMNP == 1:
|
|
545 codonchecked = []
|
|
546 codon = ''
|
|
547 skip = 0
|
|
548 MNP = ''
|
|
549 for i in range(0, len(mutations)): # checking for MNP
|
|
550 if 'Del' in mutations[i] or 'insert' in mutations[i]:
|
|
551 codonchecked.append(mutations[i])
|
|
552 elif skip > 0:
|
|
553 skip -= 1
|
|
554 else:
|
|
555 mut1POS = int(''.join([c for c in mutations[i] if c.isdigit()]))
|
|
556 try:
|
|
557 mutations[i+1]
|
|
558 except:
|
|
559 AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
|
|
560 codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
|
|
561 else:
|
|
562
|
|
563 if mut1POS % 3 == 0:
|
|
564 AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
|
|
565 codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
|
|
566 else:
|
|
567 mut2POS = int(''.join([c for c in mutations[i+1] if c.isdigit()]))
|
|
568 if mut2POS - mut1POS < 3:
|
|
569 AAPOS = mut1POS // 3
|
|
570 if mut1POS % 3 == 1:
|
|
571 if mut2POS % 3 == 0:
|
|
572 codon = mutations[i][-1]+ref[1][mut1POS]+mutations[i+1][-1]
|
|
573 MNP = mutations[i][-1]+'r'+mutations[i+1][-1]
|
|
574 skip = 1
|
|
575 else:
|
|
576 try:
|
|
577 mutations[i+2]
|
|
578 except:
|
|
579 codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS]
|
|
580 MNP = mutations[i][-1]+mutations[i+1][-1]+'r'
|
|
581 skip = 1
|
|
582 else:
|
|
583 mut3POS = int(''.join([c for c in mutations[i+2] if c.isdigit()]))
|
|
584 if mut2POS == mut3POS -1:
|
|
585 codon = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1]
|
|
586 MNP = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1]
|
|
587 skip = 2
|
|
588 else:
|
|
589 codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS]
|
|
590 MNP = mutations[i][-1]+mutations[i+1][-1]+'r'
|
|
591 skip = 1
|
|
592 codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')')
|
|
593 elif mut2POS - mut1POS == 1:
|
|
594 codon = ref[1][mut1POS-2]+mutations[i][-1]+mutations[i+1][-1]
|
|
595 MNP = mutations[i][-1]+mutations[i+1][-1]
|
|
596 skip = 1
|
|
597 codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')')
|
|
598 else:
|
|
599 AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
|
|
600 codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
|
|
601
|
|
602
|
|
603 else:
|
|
604 AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
|
|
605 codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
|
|
606 mutations = " ".join(codonchecked)
|
|
607 else:
|
|
608 mutations = " ".join(mutations)
|
|
609
|
|
610 if args.wgs == 0:
|
|
611 try:
|
|
612 seq_species[mutations] += abund_count
|
|
613 except:
|
|
614 seq_species[mutations] = abund_count
|
|
615 else:
|
|
616 try:
|
|
617 seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] += abund_count
|
|
618 except:
|
|
619 seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] = abund_count
|
|
620
|
|
621 if lastPOS < POS+q_pars_pos:
|
|
622 lastPOS = POS+q_pars_pos
|
|
623 for i in range(POS, POS+q_pars_pos): # update coverage
|
|
624 try:
|
|
625 coverage[i] += abund_count
|
|
626 except:
|
|
627 coverage[i] = abund_count
|
|
628
|
|
629 args.Sam_file.close()
|
|
630 # END SAM LINES
|
|
631
|
|
632 total_mapped_reads = sam_read_count
|
|
633
|
|
634 if args.seq == 1: # output the sequence
|
|
635 seq_fh = open(args.outfile1, "w")
|
|
636 seq_fh.write(args.samp+"("+str(sam_read_count)+")\n")
|
|
637 seq_fh.write("Unique Sequence\tCount\tAbundance\n")
|
|
638
|
|
639 sorted_seq = sorted(seq_species, key=seq_species.__getitem__, reverse=True)
|
|
640 for key in sorted_seq:
|
|
641 if seq_species[key] >= args.min_count:
|
|
642 if (seq_species[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0:
|
|
643 seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}\n")
|
|
644 seq_array.append(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}")
|
|
645 elif args.wgs == 1:
|
|
646 splitseqs = key.split()
|
|
647 cov = []
|
|
648 for x in range(int(splitseqs[0]), int(splitseqs[-1])):
|
|
649 cov.append(coverage[x])
|
|
650 min_cov = min(cov)
|
|
651 if (seq_species[key]/min_cov >= args.min_samp_abund):
|
|
652 seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/min_cov):.3f}\n")
|
|
653 else:
|
|
654 break
|
|
655
|
|
656 seq_fh.close()
|
|
657 # END SEQ OUT
|
|
658
|
|
659 if args.indel == 1: # and len(indel_dict) > 0: # output indels, if there are any
|
|
660 sorted_indels = sorted(indel_dict, key=indel_dict.__getitem__, reverse=True)
|
|
661 indels_to_write = []
|
|
662 for key in sorted_indels:
|
|
663 if indel_dict[key] >= args.min_count:
|
|
664 if indel_dict[key] / sam_read_count >= args.min_samp_abund and args.wgs == 0:
|
|
665 indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key]/sam_read_count):.3f}\n")
|
|
666 elif args.wgs == 1:
|
|
667 indelPOS = ''
|
|
668 for c in key:
|
|
669 if c.isdigit():
|
|
670 indelPOS += c
|
|
671 else:
|
|
672 break
|
|
673 indelPOS = int(indelPOS)
|
|
674 if indel_dict[key] / coverage[indelPOS] >= args.min_samp_abund:
|
|
675 indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key] / coverage[indelPOS]):.3f}\n")
|
|
676 else:
|
|
677 break
|
|
678 # if len(indels_to_write) > 0:
|
|
679 indel_fh = open(args.outfile2, "w")
|
|
680 indel_fh.write(args.samp+"("+str(sam_read_count)+")\n")
|
|
681 indel_fh.write("Indel\tCount\tAbundance\n")
|
|
682 for indel_entry in indels_to_write:
|
|
683 indel_fh.write(indel_entry)
|
|
684 indel_fh.close()
|
|
685 # END INDEL OUT
|
|
686
|
|
687 if args.nt_call == 1: # out put nt calls
|
|
688 ntcall_fh = open(args.outfile3, "w")
|
|
689 ntcall_fh.write(args.samp+"("+str(sam_read_count)+")\n")
|
|
690 if args.ntvar == 1:
|
|
691 ntcallv_fh = open(args.outfile7, "w")
|
|
692 ntcallv_fh.write(args.samp+"("+str(sam_read_count)+")\n")
|
|
693 sorted_POS = sorted(nt_call_dict_dict)
|
|
694 if args.AAreport == 1:
|
|
695 ntcall_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n")
|
|
696 if args.ntvar == 1:
|
|
697 ntcallv_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n")
|
|
698 for POS in sorted_POS:
|
|
699 try:
|
|
700 total = coverage[POS]
|
|
701 except:
|
|
702 total = 0
|
|
703 if total >= (sam_read_count * args.ntabund):
|
|
704 AAinfo = singletCodon(POS, ref[1][POS-1], ref)
|
|
705 POS_calls = {}
|
|
706 for key in nt_call_dict_dict[POS]:
|
|
707 POS_calls[key] = nt_call_dict_dict[POS][key]
|
|
708 sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True)
|
|
709
|
|
710 ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
|
|
711 ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
|
|
712 ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
|
|
713 ntcall_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
|
|
714 if sorted_calls[0] != ref[1][POS-1]:
|
|
715 if args.ntvar == 1:
|
|
716 ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
|
|
717 ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
|
|
718 ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
|
|
719 ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
|
|
720
|
|
721 mod = (POS)%3
|
|
722
|
|
723 if mod == 0:
|
|
724 try:
|
|
725 codon = ref[1][POS-3]+ref[1][POS-2]+sorted_calls[0]
|
|
726 except:
|
|
727 codon = 'NNN'
|
|
728 elif mod == 2:
|
|
729 try:
|
|
730 codon = ref[1][POS-2]+sorted_calls[0]+ref[1][POS]
|
|
731 except:
|
|
732 codon = 'NNN'
|
|
733 elif mod == 1:
|
|
734 try:
|
|
735 codon = sorted_calls[0]+ref[1][POS]+ref[1][POS+1]
|
|
736 except:
|
|
737 codon = 'NNN'
|
|
738 ntcall_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1])
|
|
739 if args.ntvar == 1:
|
|
740 ntcallv_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1])
|
|
741 if (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total >= args.min_samp_abund):
|
|
742 if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1:
|
|
743 ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
|
|
744 ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
|
|
745 if nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count:
|
|
746 if nt_call_dict_dict[POS][sorted_calls[2]] / total >= args.min_samp_abund:
|
|
747 if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
|
|
748 ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
|
|
749 ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
|
|
750
|
|
751 if args.ntvar == 1:
|
|
752 ntcallv_fh.write("\n")
|
|
753 elif (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] / total >= args.min_samp_abund):
|
|
754 if args.ntvar == 1:
|
|
755 ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
|
|
756 ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
|
|
757 ntcallv_fh.write("\t"+str(total)+"\t\t")
|
|
758 ntcallv_fh.write(f"\t")
|
|
759 ntcallv_fh.write("\t\t")
|
|
760 ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
|
|
761 ntcall_fh.write("\t\t")
|
|
762 ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
|
|
763
|
|
764 if (nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total >= args.min_samp_abund):
|
|
765 ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
|
|
766 if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
|
|
767 ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
|
|
768 if args.ntvar == 1:
|
|
769 ntcallv_fh.write("\n")
|
|
770
|
|
771 ntcall_fh.write("\n")
|
|
772 else:
|
|
773 ntcall_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n")
|
|
774 if args.ntvar == 1:
|
|
775 ntcallv_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n")
|
|
776
|
|
777 for POS in sorted_POS:
|
|
778 try:
|
|
779 total = coverage[POS] # sum(nt_call_dict_dict[POS].values())
|
|
780 except:
|
|
781 total = 0
|
|
782 if total >= (sam_read_count * args.ntabund):
|
|
783 POS_calls = {}
|
|
784 for key in nt_call_dict_dict[POS]:
|
|
785 POS_calls[key] = nt_call_dict_dict[POS][key]
|
|
786 sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True)
|
|
787
|
|
788 ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1])
|
|
789 ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
|
|
790 ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
|
|
791
|
|
792 if sorted_calls[0] != ref[1][POS-1]:
|
|
793 if args.ntvar == 1:
|
|
794 ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1])
|
|
795 ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
|
|
796 ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
|
|
797 ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
|
|
798 if (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund):
|
|
799 ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
|
|
800 if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1:
|
|
801 ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
|
|
802 if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund):
|
|
803 ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
|
|
804 if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
|
|
805 ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
|
|
806 if args.ntvar == 1:
|
|
807 ntcallv_fh.write("\n")
|
|
808
|
|
809 elif (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund):
|
|
810 if args.ntvar == 1:
|
|
811 ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1])
|
|
812 ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
|
|
813 ntcallv_fh.write("\t"+str(total)+"\t\t")
|
|
814 ntcallv_fh.write(f"\t")
|
|
815 ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
|
|
816 ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
|
|
817 if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and(nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund):
|
|
818 ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
|
|
819 if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
|
|
820 ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
|
|
821 if args.ntvar == 1:
|
|
822 ntcallv_fh.write("\n")
|
|
823
|
|
824 ntcall_fh.write("\n")
|
|
825
|
|
826 ntcall_fh.close()
|
|
827 if args.ntvar == 1:
|
|
828 ntcallv_fh.close()
|
|
829 # END NT CALL OUT
|
|
830 if args.covar == 1: # output covariants
|
|
831 testtrack = 0
|
|
832 combinations = {}
|
|
833 for sequence in seq_species:
|
|
834 if args.wgs == 0:
|
|
835 singles = sequence.split()
|
|
836 else:
|
|
837 singles = (sequence.split())[1:-1]
|
|
838 if len(singles) <= args.max_dist and singles[0] != 'Ref':
|
|
839 for combo in getCombos(singles, args.max_covar):
|
|
840 if not combo in combinations:
|
|
841 combinations[combo] = seq_species[sequence]
|
|
842 else:
|
|
843 combinations[combo] += seq_species[sequence]
|
|
844
|
|
845 covar_fh = open(args.outfile4, "w")
|
|
846 covar_fh.write(args.samp+"("+str(sam_read_count)+")\n")
|
|
847 covar_fh.write("Co-Variants\tCount\tAbundance\n")
|
|
848 sortedcombos = sorted(combinations, key=combinations.__getitem__, reverse=True)
|
|
849 for key in sortedcombos:
|
|
850 if (combinations[key] >= args.min_count):
|
|
851 if (combinations[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0:
|
|
852 covar_fh.write(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}\n")
|
|
853 covar_array.append(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}")
|
|
854 elif args.wgs == 1:
|
|
855 coveragepercent = 0
|
|
856 splitcombos = key.split()
|
|
857 if len(splitcombos) == 1:
|
|
858 coveragePOS = ''
|
|
859 for c in key:
|
|
860 if c.isdigit():
|
|
861 coveragePOS += c
|
|
862 else:
|
|
863 break
|
|
864 coveragepercent = combinations[key] / coverage[int(coveragePOS)]
|
|
865 else:
|
|
866 startcovPOS = ''
|
|
867 for c in splitcombos[0]:
|
|
868 if c.isdigit():
|
|
869 startcovPOS += c
|
|
870 else:
|
|
871 break
|
|
872 endcovPOS = ''
|
|
873 for c in splitcombos[-1]:
|
|
874 if c.isdigit():
|
|
875 endcovPOS += c
|
|
876 else:
|
|
877 break
|
|
878 coveragevals = []
|
|
879 for i in range(int(startcovPOS), int(endcovPOS)+1):
|
|
880 coveragevals.append(coverage[i])
|
|
881 mincov = min(coverval for coverval in coveragevals)
|
|
882 coveragepercent = combinations[key] / mincov
|
|
883 if coveragepercent >= args.min_samp_abund:
|
|
884 covar_fh.write(f"{key}\t{combinations[key]}\t{coveragepercent:.3f}\n")
|
|
885
|
|
886 covar_fh.close()
|
|
887
|
|
888
|
|
889 # END COVAR OUT
|
|
890
|
|
891 return(covar_array, seq_array, sam_read_count)
|
|
892
|
|
def _reassign_counts(singles_counts, ordered_seqs, deconved):
    # Greedily hand single-variant counts over to each sequence in ordered_seqs:
    # the scarcest (first in ascending sort) component single sets the count the
    # sequence receives; every other component is debited by that same amount.
    # If a required component is already exhausted (count 0), the sequence gets
    # nothing further. Mutates singles_counts and deconved in place.
    sorted_singles = sorted(singles_counts, key=singles_counts.__getitem__)
    for seq in ordered_seqs:
        components = seq.split(' ')
        first = 0
        rem_count = 0
        for sing in sorted_singles:
            if sing in components:
                if singles_counts[sing] > 0:
                    if first == 0:
                        # least-abundant component caps the reassigned count
                        first = 1
                        rem_count = singles_counts[sing]
                        singles_counts[sing] = 0
                        deconved[seq] = rem_count
                    else:
                        singles_counts[sing] = singles_counts[sing] - rem_count
                else:
                    # a needed single is used up; stop crediting this sequence
                    break
        # re-rank the remaining singles after each sequence takes its share
        sorted_singles = sorted(singles_counts, key=singles_counts.__getitem__)


def cvdeconv(args, covardict, seqdict): # covar deconvolution process
    """Deconvolute covariant counts into unique-sequence counts and write
    the result to args.outfile6.

    Parameters:
        args      - parsed command-line namespace; uses samp, autopass, beta,
                    min_samp_abund and outfile6
        covardict - {covariant: count} plus 'total' (mapped read count) and
                    'singles' ({single variant: count}); mutated in place
        seqdict   - {sequence: count} plus 'total'; read only

    A sequence passes when its observed abundance meets or exceeds
    args.beta times its expected abundance (product of its singles'
    abundances); abundance >= args.autopass skips that test entirely.
    Returns an empty tuple (callers ignore the return value).
    """
    passedseqs = {}
    preservedseqs = {}
    for seq in seqdict: # pass check for actual : expected abundance
        if seq != 'total' and seq != 'singles':

            splitseq = seq.split(' ')
            abund = 1
            for sing in splitseq:
                try:
                    abund = abund * (covardict[sing] / covardict['total'])
                except (KeyError, ZeroDivisionError):
                    # single absent from the covariant report (or empty total):
                    # fall back to the sequence's own abundance for this factor
                    abund = abund * (seqdict[seq] / seqdict['total'])

            try:
                covarabund = covardict[seq]/covardict['total']
            except (KeyError, ZeroDivisionError):
                covarabund = seqdict[seq]/seqdict['total']
                covardict[seq] = seqdict[seq]

            if covarabund >= args.autopass:
                # abundant enough to bypass the beta ratio test
                preservedseqs[seq] = max(1, args.beta, (covarabund / abund))
            elif covarabund >= abund * args.beta:
                passedseqs[seq] = covarabund / abund
            elif len(seq.split(' ')) == 1:
                # lone single variants always pass
                passedseqs[seq] = max(1, args.beta)

    # min_samp_abund < 1 is a fraction of total reads, otherwise a raw count
    if args.min_samp_abund < 1:
        min_count = args.min_samp_abund * covardict['total']
    else:
        min_count = args.min_samp_abund

    # sort passed covars: longest first, then by actual/expected ratio
    lensortedpassed = sorted(passedseqs, key = lambda key : len(key.split(' ')), reverse=True)
    ratiolensortedpassed = sorted(lensortedpassed, key = lambda key : passedseqs[key], reverse = True)

    deconved = {}
    # reassign counts: ratio-passed sequences first, then autopassed ones
    # (autopassed are processed in ascending covariant-count order)
    _reassign_counts(covardict['singles'], ratiolensortedpassed, deconved)
    _reassign_counts(covardict['singles'],
                     sorted(preservedseqs, key = lambda key : covardict[key]),
                     deconved)

    newtotal = sum(deconved.values())
    with open(args.outfile6, "w") as fh_deconv:
        fh_deconv.write(f"{args.samp}({covardict['total']}) | ({newtotal})\tCount\tAbundance\n")
        sorted_deconved = sorted(deconved, key = deconved.__getitem__, reverse = True)
        for seq in sorted_deconved: # write deconv
            if deconved[seq] >= min_count:
                fh_deconv.write(f"{seq}\t{deconved[seq]}\t{(deconved[seq]/newtotal):.3f}\n")
    # END COVAR DECONV OUT

    return()
|
|
981
|
|
def dechim(args, seqs): # processing sequence dictionary to remove chimeras
    """Remove putative chimeric sequences from seqs, one pass.

    Parameters:
        args - parsed command-line namespace; uses foldab, alpha and redist
        seqs - {sequence: count} plus 'total'; mutated in place

    For each sequence (least abundant first), every split point is tried:
    if a pair of more-abundant sequences ("parents", each at least
    args.foldab times this sequence's count) can reproduce its left and
    right halves, the sequence's expected recombinant count is computed
    from the parents' abundances. A sequence whose observed count falls
    below expected * args.alpha is called a chimera and deleted; with
    args.redist == 1 its count is split among the parent pairs.
    Returns seqs (the same, mutated, dict) with 'total' recomputed.
    """
    total = seqs['total']
    del seqs['total']
    sorted_seqs = sorted(seqs, key=seqs.__getitem__) # sort sequences by abundance, least to greatest
    chimeras = []
    for seq in sorted_seqs:
        # pad with sentinels so a parent may extend past either end
        pot_chim = ['Reference']+seq.split()+['Reference']
        chim_halves = []
        for i in range(0, len(pot_chim)-1): # create dimera halves
            chim_halves.append([pot_chim[:i+1], pot_chim[i+1:]])
        parent_pairs = []

        for dimera in chim_halves: # check for potential parents

            pot_left = []
            pot_rt = []
            lft_len = len(dimera[0])
            rt_len = len(dimera[1])
            for pseq in sorted_seqs:
                if not seq == pseq:
                    # only sequences sufficiently more abundant may be parents
                    if (seqs[pseq] >= (seqs[seq] * args.foldab)):
                        pot_par = ['Reference']+pseq.split()+['Reference']
                        if dimera[0] == pot_par[:lft_len]:
                            pot_left.append(pot_par)
                        if ((len(pot_par) >= rt_len) and (dimera[1] == pot_par[(len(pot_par)-rt_len):])):
                            pot_rt.append(pot_par)

            if (len(pot_left) > 0 and len(pot_rt) > 0 ):
                for left_par in pot_left: # check potential parents' pairing
                    for rt_par in pot_rt:
                        if left_par != rt_par:
                            # the variants flanking the split point on each parent
                            left_break = left_par[lft_len]
                            rt_break = rt_par[(len(rt_par)-rt_len)-1]
                            if left_break == 'Reference' or rt_break == 'Reference':
                                parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])])
                            else:
                                # extract the leading position digits of each flanking variant
                                left_break_POS = ''
                                for c in left_break:
                                    if c.isdigit():
                                        left_break_POS += c
                                    else:
                                        break

                                rt_break_POS = ''
                                for c in rt_break:
                                    if c.isdigit():
                                        rt_break_POS += c
                                    else:
                                        break

                                # parents must overlap across the split point
                                if int(left_break_POS) > int(rt_break_POS):
                                    parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])])

        par_tot_abund = 0
        pair_probs = []
        for parents in parent_pairs: # calc 'expected' abundance
            pair_prob = (seqs[parents[0]] / total) * (seqs[parents[1]] / total)
            par_tot_abund += pair_prob
            pair_probs.append(pair_prob)

        recomb_count = par_tot_abund * total

        if not seqs[seq] >= recomb_count * args.alpha: # chimera check
            redist_count = float(seqs[seq])
            chimeras.append(seq)
            seqs[seq] = 0
            if args.redist == 1: # redist counts of chimera
                # each pair gets a share proportional to its probability,
                # halved because the count is split between the two parents
                for i in range(0, len(parent_pairs)):
                    counts_to_redist = (redist_count * (pair_probs[i]/par_tot_abund))/2
                    seqs[parent_pairs[i][0]] += counts_to_redist
                    seqs[parent_pairs[i][1]] += counts_to_redist

    for chim in chimeras: # remove chimeras
        del seqs[chim]

    total = sum(seqs.values())

    seqs['total'] = total

    return(seqs)
|
|
1067
|
|
def chimrm(args, seqs): # chimera removed process
    """Iteratively run dechim until no more chimeras are removed, then
    write the surviving sequences to args.outfile5.

    Parameters:
        args - parsed command-line namespace; uses min_samp_abund, samp,
               outfile5 and (via dechim) foldab, alpha, redist
        seqs - {sequence: count} plus 'total'; mutated in place by dechim

    Returns an empty tuple (callers ignore the return value).
    """
    pre_len = len(seqs)
    inf_loop_shield = 0
    while True: # send sequences for chimera removal while chimeras are still found
        dechim(args, seqs) # mutates seqs in place
        post_len = len(seqs)
        inf_loop_shield += 1
        if post_len >= pre_len:
            break # nothing was removed this pass; converged
        if inf_loop_shield > 100:
            break # safety valve against a non-converging input
        pre_len = len(seqs)

    total = seqs['total']
    del seqs['total']
    # min_samp_abund < 1 is a fraction of total reads, otherwise a raw count
    if args.min_samp_abund < 1:
        min_count = args.min_samp_abund * total
    else:
        min_count = args.min_samp_abund
    # FIX: sorted_seqs was previously computed but never used; iterate it so
    # the output is written in descending-count order as clearly intended
    sorted_seqs = sorted(seqs, key=seqs.__getitem__, reverse=True)
    with open(args.outfile5, 'w') as fh_dechime:
        fh_dechime.write(f"{args.samp}({int(total)})\n")
        fh_dechime.write("Sequences\tCount\tAbundance\n")
        for seq in sorted_seqs: # write chim_rm seqs
            abund = seqs[seq]/total
            if seqs[seq] >= min_count:
                fh_dechime.write(f"{seq}\t{round(seqs[seq])}\t{abund:.3f}\n")

    return()
|
|
1099
|
|
def main():
    """Entry point: parse arguments, parse the SAM file against the
    reference, then run the optional deconvolution / chimera removal
    passes on the collected covariant and sequence tables."""

    args = arg_parser() # getting command line arguments

    covar_array = []
    seq_array = []
    total_mapped_reads = 0

    if args.ref:
        ref = get_ref(args.ref) # get the reference ID and sequence from the FASTA file
        args.ref.close()
        if ref[1] == '':
            print('Reference not recognized as a Fasta format, skipping SAM parsing')
        else:
            refprot = ''
            if args.AAreport == 1: # make an Amino Acid sequence based on the reference sequence
                # translate each complete codon; join avoids quadratic str +=
                refprot = ''.join(
                    AAcall(ref[1][x*3:x*3+3])
                    for x in range(0, (len(ref[1])-1)//3)
                )
                if (len(ref[1])-1)%3 != 0:
                    # mark the trailing partial codon as untranslatable
                    refprot = refprot + '?'

            covar_array, seq_array, total_mapped_reads = SAMparse(args, ref, refprot, args.Sam_file)

    else:
        print('No reference provided, skipping SAM parsing')

    in_covars = {'total' : total_mapped_reads,
                 'singles' : {}
                 }
    in_seqs = {'total' : total_mapped_reads}
    # Begin chimera removal / deconvolution if enabled
    if args.chim_rm == 1 or args.deconv == 1:
        if args.deconv == 1:
            # covariant lines have the form "<covar>\t<count>\t<abundance>"
            for line in covar_array:
                lineparts = line.split("\t")
                in_covars[lineparts[0]] = int(lineparts[1])
                if len(lineparts[0].split(' ')) == 1:
                    in_covars['singles'][lineparts[0]] = int(lineparts[1])

        for line in seq_array:
            lineparts = line.split("\t")
            in_seqs[lineparts[0]] = float(lineparts[1])

        if args.deconv == 1:
            cvdeconv(args, in_covars, in_seqs)

        if args.chim_rm == 1:
            chimrm(args, in_seqs)
|
|
1149
|
|
# Script entry point: run the pipeline only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()