comparison SAM_Refiner_Gal.py @ 0:22e3f843f1db draft

Uploaded main file
author degregory
date Mon, 13 Sep 2021 14:12:21 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:22e3f843f1db
1 #!/bin/env python3
2 # Writen by Devon Gregory with assistance from Christopher Bottoms
3 # University of Missouri
4 # Distributed under GNU GENERAL PUBLIC LICENSE v3
5 import os
6 import sys
7 import argparse
8 import itertools
9 import time
10 from multiprocessing import Process
11 from pathlib import Path
12
# process for collecting command line arguments
def arg_parser():
    """Parse and sanity-check the command line arguments.

    Returns the populated ``argparse.Namespace``.  When WGS mode is enabled
    (--wgs 1), the chimera-removal outputs (covar deconvolution and chim_rm)
    are forced off because they are incompatible with whole genome data.
    """

    parser = argparse.ArgumentParser(
        description='process Sam files for variant information'
    )

    parser.add_argument(
        '--samp',
        type=str,
        default='Sample',
        help='Sample Name'
    )
    parser.add_argument(
        '-r', '-reference',
        type=argparse.FileType('r'),
        dest='ref',
        help='reference fasta'
    )
    parser.add_argument(
        '-S', '--Sam_file',
        type=argparse.FileType('r'),
        dest='Sam_file',
        help='.sam file'
    )
    parser.add_argument(
        '-o1', '--out_file1',
        dest='outfile1',
        help='output file'
    )
    parser.add_argument(
        '-o2', '--out_file2',
        dest='outfile2',
        help='output file'
    )
    parser.add_argument(
        '-o3', '--out_file3',
        dest='outfile3',
        help='output file'
    )
    parser.add_argument(
        '-o4', '--out_file4',
        dest='outfile4',
        help='output file'
    )
    parser.add_argument(
        '-o5', '--out_file5',
        dest='outfile5',
        help='output file'
    )
    parser.add_argument(
        '-o6', '--out_file6',
        dest='outfile6',
        help='output file'
    )
    parser.add_argument(
        '-o7', '--out_file7',
        dest='outfile7',
        help='output file'
    )
    parser.add_argument(
        '--use_count',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) use of counts in sequence IDs, default enabled (--use_count 1)'
    )
    parser.add_argument(
        '--min_count',
        type=int,
        default=10,
        # help text corrected: default really is 10 (integer count), not .001
        help='Minimum observations required to be included in sample reports; >= 1 occurrence count (default: 10)'
    )
    parser.add_argument(
        '--min_samp_abund',
        type=float,
        default=0.001,
        help='Minimum abundance required for inclusion in sample reports; %% observed (.1 = 10%%), (default: .001)'
    )
    parser.add_argument(
        '--ntabund',
        type=float,
        default=.001,
        help='Minimum abundance relative to total reads required for a position to be reported in the nt call output; must be non-negative and < 1, %% observed (.1 = 10%%), (default: .001)'
    )
    parser.add_argument(
        '--max_dist',
        type=int,
        default=40,
        help='Maximum number of variances from the reference a sequence can have to be consider in covars processing (default: 40)'
    )
    parser.add_argument(
        '--max_covar',
        type=int,
        default=8,
        help='Maximum number of variances from the reference to be reported in covars (default: 8)'
    )
    parser.add_argument(
        '--AAreport',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) amino acid reporting, default enabled (--AAreport 1)'
    )
    parser.add_argument(
        '--AAcodonasMNP',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) reporting multiple nt changes in a single codon as one polymorphism, default enabled (--AAcodonasMNP 1), requires AAreport enabled'
    )
    parser.add_argument(
        '--alpha',
        type=float,
        default=1.2,
        help='Modifier for chim_rm chimera checking, default 1.2. Higher = more sensitive, more false chimeras removed; lower = less sensitive, fewer chimeras removed'
    )
    parser.add_argument(
        '--foldab',
        type=float,
        default=1.8,
        help='Threshold for potential parent / chimera abundance ratio for chim_rm; default is 1.8'
    )
    parser.add_argument(
        '--redist',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) redistribution of chimera counts for chim_rm, default enabled (--redist 1)'
    )
    parser.add_argument(
        '--beta',
        type=float,
        default=1,
        help='Modifier for covar pass checking, default 1. Higher = more sensitive, more failed checks; lower = less sensitive, fewer failed checks'
    )
    parser.add_argument(
        '--autopass',
        type=float,
        default=.3,
        help='threshold for a sequence to automatically pass the covar pass checking'
    )
    parser.add_argument(
        '--nt_call',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) nt_call output, default enabled (--nt_call 1)'
    )
    parser.add_argument(
        '--ntvar',
        type=int,
        default=0,
        choices=[0, 1],
        # help text corrected: this flag controls nt_var output and defaults to disabled
        help='Enable/Disable (1/0) nt_var output, default disabled (--ntvar 0)'
    )
    parser.add_argument(
        '--indel',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) indel output, default enabled (--indel 1)'
    )
    parser.add_argument(
        '--seq',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) unique seq output, default enabled (--seq 1)'
    )
    parser.add_argument(
        '--covar', type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) covar output, default enabled (--covar 1)'
    )
    parser.add_argument(
        '--chim_rm',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) chim_rm output, default enabled (--chim_rm 1)'
    )
    parser.add_argument(
        '--deconv',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) covar deconv, default enabled (--deconv 1)'
    )
    parser.add_argument(
        '--wgs',
        type=int,
        default=0,
        choices=[0, 1],
        # help text corrected: was a copy-paste of the --deconv help
        help='Enable/Disable (1/0) whole genome sequencing mode, default disabled (--wgs 0)'
    )

    args = parser.parse_args()

    # checking for proper range of some parameters and consistency/compatibility
    if args.wgs == 1:
        if args.deconv == 1 or args.chim_rm == 1:
            args.deconv = 0
            args.chim_rm = 0
            print('WGS mode enabled, disabling chimera removal methods')

    return(args)
221
def get_ref(ref):
    """Return (ID, sequence) for the first record of an open FASTA file.

    Only the first record is read; iteration stops as soon as a second
    header line ('>') is seen.  The sequence is upper-cased.  If ``ref`` is
    falsy (e.g. no file supplied), ('', '') is returned.
    """
    ref_id = ''
    seq_chunks = []
    headers_seen = 0
    if ref:
        for raw_line in ref:
            if raw_line.startswith('>'):
                headers_seen += 1
                if headers_seen > 1:
                    break  # second record reached; keep only the first
                ref_id = raw_line[1:].strip("\n\r")
            elif headers_seen == 1:
                seq_chunks.append(raw_line.strip("\n\r"))
        return (ref_id, ''.join(seq_chunks).upper())

    return (ref_id, '')
240
def AAcall(codon): # amino acid / codon dictionary to return encoded AAs
    """Translate a 3-letter codon to its one-letter amino acid code.

    Stop codons map to '*'.  Any unrecognized codon (wrong length,
    ambiguity codes, gaps, etc.) returns '?'.
    """
    # standard genetic code, grouped by amino acid
    codons_by_aa = {
        'F': ('TTT', 'TTC'),
        'L': ('TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'),
        'S': ('TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'),
        'Y': ('TAT', 'TAC'),
        '*': ('TAA', 'TAG', 'TGA'),
        'C': ('TGT', 'TGC'),
        'W': ('TGG',),
        'P': ('CCT', 'CCC', 'CCA', 'CCG'),
        'H': ('CAT', 'CAC'),
        'Q': ('CAA', 'CAG'),
        'R': ('CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'),
        'I': ('ATT', 'ATC', 'ATA'),
        'M': ('ATG',),
        'T': ('ACT', 'ACC', 'ACA', 'ACG'),
        'N': ('AAT', 'AAC'),
        'K': ('AAA', 'AAG'),
        'V': ('GTT', 'GTC', 'GTA', 'GTG'),
        'A': ('GCT', 'GCC', 'GCA', 'GCG'),
        'D': ('GAT', 'GAC'),
        'E': ('GAA', 'GAG'),
        'G': ('GGT', 'GGC', 'GGA', 'GGG'),
    }
    for amino_acid, codon_group in codons_by_aa.items():
        if codon in codon_group:
            return amino_acid
    return '?'
316
def singletCodon(ntPOS, nt, ref): # process to return the AA and protein seq. position based on the reference and provided nt seq position and nt
    """Return the amino acid affected by a single nucleotide substitution.

    Parameters:
        ntPOS -- 1-based nucleotide position of the substitution in the
                 reference sequence.
        nt    -- the substituted nucleotide (single character).
        ref   -- (refID, refseq) tuple; ref[1] is the reference sequence.

    Returns a tuple (AA position [1-based], amino acid letter).  The codon is
    rebuilt from the reference with ``nt`` swapped in at its phase position;
    if the codon runs off the end of the reference it is treated as 'XXX'
    (which AAcall reports as '?').
    """
    AAPOS = (ntPOS - 1) // 3   # 0-based codon index
    AAmod = (ntPOS - 1) % 3    # phase of nt within its codon (0, 1, 2)
    codon = ""
    try:
        if AAmod == 0:
            # nt is the first base; take the next two from the reference
            codon = nt + ref[1][ntPOS] + ref[1][ntPOS + 1]
        elif AAmod == 1:
            # nt is the middle base
            codon = ref[1][ntPOS - 2] + nt + ref[1][ntPOS]
        elif AAmod == 2:
            # nt is the last base
            codon = ref[1][ntPOS - 3] + ref[1][ntPOS - 2] + nt
    except IndexError:
        # codon extends past the end of the reference; incomplete codon
        # (was a bare "except:", which would also have hidden real errors)
        codon = "XXX"

    return (AAPOS + 1, AAcall(codon))
332
def getCombos(qlist, clen): # returns combinations of single polymorphisms in a sequence
    """Return all space-joined combinations of 1..clen items from qlist.

    A ``clen`` of 0 (or greater than len(qlist)) means "use all items".
    Combinations are emitted smallest-size first, in itertools order.
    """
    max_size = clen
    if max_size == 0 or max_size > len(qlist):
        max_size = len(qlist)
    return [
        ' '.join(combo)
        for size in range(1, max_size + 1)
        for combo in itertools.combinations(qlist, size)
    ]
341
def SAMparse(args, ref, refprot, file): # process SAM files
    """Parse the open SAM file (args.Sam_file) against the reference and write
    the requested per-sample reports.

    Parameters:
        args    -- parsed command-line namespace; supplies the SAM handle,
                   output paths (outfile1..outfile7) and all filter switches.
        ref     -- (refID, refseq) tuple as returned by get_ref().
        refprot -- reference protein sequence, indexed by 0-based AA position.
        file    -- unused in this function (kept for interface compatibility).

    Returns (covar_array, seq_array, sam_read_count): the covariant and
    unique-sequence report lines that passed the count/abundance filters,
    and the total tally of mapped reads (including collapsed duplicate
    counts encoded in read IDs).
    """
    covar_array = []
    seq_array = []

    nt_call_dict_dict = {}  # ref position -> {'A','T','C','G','-'} -> count
    indel_dict = {}         # indel description string -> count
    seq_species = {}        # variant string -> count
    sam_read_count = 0
    sam_line_count = 0      # NOTE(review): assigned but never updated in this block
    firstPOS = 0
    lastPOS = 0
    coverage = {}           # ref position -> read depth
    for line in args.Sam_file:
        if not line.startswith('@'): # ignore header lines
            splitline = line.split("\t")
            if ref[0].upper().startswith(splitline[2].upper()): # check map ID matches reference ID
                if int(splitline[4]) > 0: # Check mapping score is positive

                    abund_count=1
                    if args.use_count == 1: # get the unique sequence counts
                        # read IDs may carry a collapsed-duplicate count after a
                        # '-' or '=' suffix; when both appear, use the shorter suffix
                        if '-' in splitline[0] and '=' in splitline[0]:
                            eq_split = splitline[0].split('=')
                            dash_split = splitline[0].split('-')
                            if len(eq_split[-1]) > len(dash_split[-1]):
                                abund_count=int(dash_split[-1])
                            else:
                                abund_count=int(eq_split[-1])

                        elif '-' in splitline[0]:
                            try:
                                abund_count=int(splitline[0].split('-')[-1])
                            except:
                                # suffix was not a number; keep count of 1
                                pass

                        elif '=' in splitline[0]:
                            try:
                                abund_count=int(splitline[0].split('=')[-1])
                            except:
                                pass


                    sam_read_count += abund_count

                    CIGAR = splitline[5]
                    POS = int(splitline[3])   # 1-based leftmost mapping position
                    if firstPOS == 0:
                        firstPOS = POS
                    elif POS < firstPOS:
                        firstPOS = POS

                    readID = splitline[0]
                    query_seq = splitline[9].upper()
                    run_length = 0            # CIGAR op length accumulated digit by digit
                    query_seq_parsed = ''
                    query_pos = 0             # cursor within the read sequence
                    q_pars_pos = 0            # cursor within the reference-consuming part
                    mutations = []
                    # walk the CIGAR one character at a time: digits build
                    # run_length, op letters consume it
                    for C in CIGAR: # process sequence based on standard CIGAR line
                        if C == 'M' or C == 'I' or C == 'D' or C == 'S' or C == 'H':
                            if C == 'S':
                                # soft clip: skip the clipped read bases
                                query_pos = query_pos + run_length

                            if C == 'I':
                                if query_pos > 0:
                                    # add insertion to dict
                                    iPOS = q_pars_pos+POS

                                    iSeq = query_seq[query_pos: query_pos+run_length]
                                    istring = str(iPOS)+'-insert'+iSeq

                                    try:
                                        indel_dict[istring]
                                    except:
                                        indel_dict[istring] = abund_count
                                    else:
                                        indel_dict[istring] += abund_count

                                    if args.AAreport == 1 and (run_length % 3 == 0):
                                        # in-frame-length insertion: translate it,
                                        # branching on the codon phase of iPOS
                                        iProt = ''
                                        if iPOS % 3 == 1:
                                            for x in range(0, (run_length//3)):
                                                AA = AAcall(iSeq[x*3]+iSeq[x*3+1]+iSeq[x*3+2])
                                                iProt = iProt + AA
                                            mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
                                        elif iPOS % 3 == 2:
                                            if query_pos > 0:
                                                ipSeq = query_seq[query_pos-1:query_pos+run_length+2]
                                            else:
                                                ipSeq = "XXX"+query_seq[query_pos+2:query_pos+run_length+2]
                                            for x in range(0, (run_length//3)+1):
                                                AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2])
                                                iProt = iProt + AA
                                            mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
                                        else:
                                            if query_pos > 1:
                                                ipSeq = query_seq[query_pos-2:query_pos+run_length+1]
                                            else:
                                                ipSeq = "XXX"+query_seq[query_pos+1:query_pos+run_length+1]

                                            for x in range(0, (run_length//3)+1):
                                                AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2])
                                                iProt = iProt + AA
                                            mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
                                    else:
                                        mutations.append(istring)

                                query_pos = query_pos + run_length

                            elif C == 'D':
                                for X in range(0, run_length):
                                    query_seq_parsed += '-'

                                delstring = str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'

                                # in-frame deletion not aligned to a codon start:
                                # report the fused codon that results
                                if args.AAreport == 1 and (run_length % 3 == 0) and not ((q_pars_pos+POS) % 3 == 1 ):
                                    if (q_pars_pos+POS) % 3 == 2:
                                        newcodon = query_seq[query_pos-1:query_pos+2]
                                        newAArefpos = (q_pars_pos+POS) // 3
                                        mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')')
                                    else:
                                        newcodon = query_seq[query_pos-2:query_pos+1]
                                        newAArefpos = (q_pars_pos+POS) // 3
                                        mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')')
                                else:
                                    mutations.append(delstring)

                                if args.nt_call == 1:
                                    # tally each deleted position as a '-' call
                                    for N in range(q_pars_pos+POS, q_pars_pos+POS+int(run_length)):
                                        try:
                                            nt_call_dict_dict[N]
                                        except:
                                            nt_call_dict_dict[N] = {'A' : 0,
                                                                    'T' : 0,
                                                                    'C' : 0,
                                                                    'G' : 0,
                                                                    '-' : 0}
                                            nt_call_dict_dict[N]['-'] = abund_count
                                        else:
                                            nt_call_dict_dict[N]['-'] += abund_count

                                try:
                                    indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del']
                                except:
                                    indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] = int(abund_count)
                                else:
                                    indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] += int(abund_count)

                                q_pars_pos = q_pars_pos + run_length

                            elif C == 'M':
                                offset = q_pars_pos-query_pos
                                refPOS = POS+offset

                                for ntPOS in range(query_pos, query_pos+run_length):
                                    # only count unambiguous bases
                                    if query_seq[ntPOS] == 'A' or query_seq[ntPOS] == 'T' or query_seq[ntPOS] == 'C' or query_seq[ntPOS] == 'G':
                                        if query_seq[ntPOS] != ref[1][refPOS+ntPOS-1]:
                                            if args.AAreport == 1 and args.AAcodonasMNP == 0:
                                                AAinfo = singletCodon(refPOS+ntPOS, query_seq[ntPOS], ref)
                                                mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
                                            else:
                                                # AA annotation deferred to MNP collapsing below
                                                mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS])
                                        if args.nt_call == 1:
                                            try:
                                                nt_call_dict_dict[refPOS+ntPOS]
                                            except:
                                                nt_call_dict_dict[refPOS+ntPOS] = {'A' : 0,
                                                                                   'T' : 0,
                                                                                   'C' : 0,
                                                                                   'G' : 0,
                                                                                   '-' : 0}
                                                nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] = abund_count
                                            else:
                                                nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] += abund_count


                                q_pars_pos = q_pars_pos + run_length
                                query_pos = query_pos + run_length

                            # op consumed; reset for the next length run
                            # (note: 'H' hard clips consume nothing here)
                            run_length = 0


                        else:
                            # digit: extend the pending op length
                            run_length = (10 * run_length) + int(C)
                    # END CIGAR PARSE


                    if len(mutations) == 0: # record reference counts
                        if args.wgs == 0:
                            try:
                                seq_species['Reference'] += abund_count
                            except:
                                seq_species['Reference'] = abund_count
                        else:
                            # WGS mode keys include the covered span
                            try:
                                seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] += abund_count
                            except:
                                seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] = abund_count

                    else: # record variants and counts
                        if args.AAreport == 1 and args.AAcodonasMNP == 1:
                            # collapse neighboring substitutions within one codon
                            # into a single multi-nt polymorphism (MNP) entry
                            codonchecked = []
                            codon = ''
                            skip = 0   # how many following mutations were consumed by an MNP
                            MNP = ''
                            for i in range(0, len(mutations)): # checking for MNP
                                if 'Del' in mutations[i] or 'insert' in mutations[i]:
                                    codonchecked.append(mutations[i])
                                elif skip > 0:
                                    skip -= 1
                                else:
                                    mut1POS = int(''.join([c for c in mutations[i] if c.isdigit()]))
                                    try:
                                        mutations[i+1]
                                    except:
                                        # last mutation: annotate as a singlet
                                        AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
                                        codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
                                    else:

                                        if mut1POS % 3 == 0:
                                            # last base of a codon; next mutation starts a new codon
                                            AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
                                            codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
                                        else:
                                            mut2POS = int(''.join([c for c in mutations[i+1] if c.isdigit()]))
                                            if mut2POS - mut1POS < 3:
                                                AAPOS = mut1POS // 3
                                                if mut1POS % 3 == 1:
                                                    if mut2POS % 3 == 0:
                                                        # positions 1 and 3 of the codon changed ('r' marks the
                                                        # middle base as reference)
                                                        codon = mutations[i][-1]+ref[1][mut1POS]+mutations[i+1][-1]
                                                        MNP = mutations[i][-1]+'r'+mutations[i+1][-1]
                                                        skip = 1
                                                    else:
                                                        try:
                                                            mutations[i+2]
                                                        except:
                                                            codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS]
                                                            MNP = mutations[i][-1]+mutations[i+1][-1]+'r'
                                                            skip = 1
                                                        else:
                                                            mut3POS = int(''.join([c for c in mutations[i+2] if c.isdigit()]))
                                                            if mut2POS == mut3POS -1:
                                                                # all three codon bases changed
                                                                codon = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1]
                                                                MNP = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1]
                                                                skip = 2
                                                            else:
                                                                codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS]
                                                                MNP = mutations[i][-1]+mutations[i+1][-1]+'r'
                                                                skip = 1
                                                    codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')')
                                                elif mut2POS - mut1POS == 1:
                                                    # codon positions 2 and 3 changed
                                                    codon = ref[1][mut1POS-2]+mutations[i][-1]+mutations[i+1][-1]
                                                    MNP = mutations[i][-1]+mutations[i+1][-1]
                                                    skip = 1
                                                    codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')')
                                                else:
                                                    AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
                                                    codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')


                                            else:
                                                AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
                                                codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
                            mutations = " ".join(codonchecked)
                        else:
                            mutations = " ".join(mutations)

                        if args.wgs == 0:
                            try:
                                seq_species[mutations] += abund_count
                            except:
                                seq_species[mutations] = abund_count
                        else:
                            try:
                                seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] += abund_count
                            except:
                                seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] = abund_count

                    if lastPOS < POS+q_pars_pos:
                        lastPOS = POS+q_pars_pos
                    for i in range(POS, POS+q_pars_pos): # update coverage
                        try:
                            coverage[i] += abund_count
                        except:
                            coverage[i] = abund_count

    args.Sam_file.close()
    # END SAM LINES

    total_mapped_reads = sam_read_count

    if args.seq == 1: # output the sequence
        seq_fh = open(args.outfile1, "w")
        seq_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        seq_fh.write("Unique Sequence\tCount\tAbundance\n")

        # sort by count descending so the break below stops at the
        # first entry under min_count
        sorted_seq = sorted(seq_species, key=seq_species.__getitem__, reverse=True)
        for key in sorted_seq:
            if seq_species[key] >= args.min_count:
                if (seq_species[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0:
                    seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}\n")
                    seq_array.append(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}")
                elif args.wgs == 1:
                    # WGS abundance is relative to the minimum coverage
                    # across the sequence's span
                    splitseqs = key.split()
                    cov = []
                    for x in range(int(splitseqs[0]), int(splitseqs[-1])):
                        cov.append(coverage[x])
                    min_cov = min(cov)
                    if (seq_species[key]/min_cov >= args.min_samp_abund):
                        seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/min_cov):.3f}\n")
            else:
                break

        seq_fh.close()
    # END SEQ OUT

    if args.indel == 1: # and len(indel_dict) > 0: # output indels, if there are any
        sorted_indels = sorted(indel_dict, key=indel_dict.__getitem__, reverse=True)
        indels_to_write = []
        for key in sorted_indels:
            if indel_dict[key] >= args.min_count:
                if indel_dict[key] / sam_read_count >= args.min_samp_abund and args.wgs == 0:
                    indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key]/sam_read_count):.3f}\n")
                elif args.wgs == 1:
                    # leading digits of the key are the indel start position
                    indelPOS = ''
                    for c in key:
                        if c.isdigit():
                            indelPOS += c
                        else:
                            break
                    indelPOS = int(indelPOS)
                    if indel_dict[key] / coverage[indelPOS] >= args.min_samp_abund:
                        indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key] / coverage[indelPOS]):.3f}\n")
            else:
                break
        # if len(indels_to_write) > 0:
        indel_fh = open(args.outfile2, "w")
        indel_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        indel_fh.write("Indel\tCount\tAbundance\n")
        for indel_entry in indels_to_write:
            indel_fh.write(indel_entry)
        indel_fh.close()
    # END INDEL OUT

    if args.nt_call == 1: # output nt calls
        ntcall_fh = open(args.outfile3, "w")
        ntcall_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        if args.ntvar == 1:
            # variant-only companion file
            ntcallv_fh = open(args.outfile7, "w")
            ntcallv_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        sorted_POS = sorted(nt_call_dict_dict)
        if args.AAreport == 1:
            ntcall_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n")
            if args.ntvar == 1:
                ntcallv_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n")
            for POS in sorted_POS:
                try:
                    total = coverage[POS]
                except:
                    total = 0
                if total >= (sam_read_count * args.ntabund):
                    AAinfo = singletCodon(POS, ref[1][POS-1], ref)
                    POS_calls = {}
                    for key in nt_call_dict_dict[POS]:
                        POS_calls[key] = nt_call_dict_dict[POS][key]
                    # nucleotides ranked by observed count at this position
                    sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True)

                    ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
                    ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                    ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
                    ntcall_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
                    if sorted_calls[0] != ref[1][POS-1]:
                        # primary call differs from the reference
                        if args.ntvar == 1:
                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                            ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
                            ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")

                        # rebuild the codon with the primary call substituted,
                        # based on the codon phase of POS
                        mod = (POS)%3

                        if mod == 0:
                            try:
                                codon = ref[1][POS-3]+ref[1][POS-2]+sorted_calls[0]
                            except:
                                codon = 'NNN'
                        elif mod == 2:
                            try:
                                codon = ref[1][POS-2]+sorted_calls[0]+ref[1][POS]
                            except:
                                codon = 'NNN'
                        elif mod == 1:
                            try:
                                codon = sorted_calls[0]+ref[1][POS]+ref[1][POS+1]
                            except:
                                codon = 'NNN'
                        ntcall_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1])
                        if args.ntvar == 1:
                            ntcallv_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1])
                        # secondary / tertiary calls reported only if they pass
                        # the count and abundance filters
                        if (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total >= args.min_samp_abund):
                            if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1:
                                ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
                            ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
                            if nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count:
                                if nt_call_dict_dict[POS][sorted_calls[2]] / total >= args.min_samp_abund:
                                    if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
                                        ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
                                    ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")

                        if args.ntvar == 1:
                            ntcallv_fh.write("\n")
                    elif (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] / total >= args.min_samp_abund):
                        # primary call matches the reference, but a minor
                        # variant passes the filters
                        if args.ntvar == 1:
                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                            ntcallv_fh.write("\t"+str(total)+"\t\t")
                            ntcallv_fh.write(f"\t")
                            ntcallv_fh.write("\t\t")
                            ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
                        ntcall_fh.write("\t\t")
                        ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])

                        if (nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total >= args.min_samp_abund):
                            ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
                            if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
                                ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
                        if args.ntvar == 1:
                            ntcallv_fh.write("\n")

                    ntcall_fh.write("\n")
        else:
            # AAreport disabled: nucleotide-only columns
            ntcall_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n")
            if args.ntvar == 1:
                ntcallv_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n")

            for POS in sorted_POS:
                try:
                    total = coverage[POS] # sum(nt_call_dict_dict[POS].values())
                except:
                    total = 0
                if total >= (sam_read_count * args.ntabund):
                    POS_calls = {}
                    for key in nt_call_dict_dict[POS]:
                        POS_calls[key] = nt_call_dict_dict[POS][key]
                    sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True)

                    ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1])
                    ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                    ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")

                    if sorted_calls[0] != ref[1][POS-1]:
                        if args.ntvar == 1:
                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1])
                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                            ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
                            ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
                        # NOTE(review): this branch uses strict '>' for the filters,
                        # unlike the '>=' used in the AAreport branch — confirm intended
                        if (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund):
                            ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
                            if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1:
                                ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
                            if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund):
                                ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
                                if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
                                    ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
                        if args.ntvar == 1:
                            ntcallv_fh.write("\n")

                    elif (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund):
                        if args.ntvar == 1:
                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1])
                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                            ntcallv_fh.write("\t"+str(total)+"\t\t")
                            ntcallv_fh.write(f"\t")
                            ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
                        ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
                        if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and(nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund):
                            ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
                            if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
                                ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
                        if args.ntvar == 1:
                            ntcallv_fh.write("\n")

                    ntcall_fh.write("\n")

        ntcall_fh.close()
        if args.ntvar == 1:
            ntcallv_fh.close()
    # END NT CALL OUT
    if args.covar == 1: # output covariants
        testtrack = 0   # NOTE(review): assigned but never used
        combinations = {}
        for sequence in seq_species:
            if args.wgs == 0:
                singles = sequence.split()
            else:
                # strip the leading/trailing span positions in WGS keys
                singles = (sequence.split())[1:-1]
            # NOTE(review): the non-WGS 'Reference' key is not excluded here
            # ('Reference' != 'Ref') — confirm whether that is intended
            if len(singles) <= args.max_dist and singles[0] != 'Ref':
                for combo in getCombos(singles, args.max_covar):
                    if not combo in combinations:
                        combinations[combo] = seq_species[sequence]
                    else:
                        combinations[combo] += seq_species[sequence]

        covar_fh = open(args.outfile4, "w")
        covar_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        covar_fh.write("Co-Variants\tCount\tAbundance\n")
        sortedcombos = sorted(combinations, key=combinations.__getitem__, reverse=True)
        for key in sortedcombos:
            if (combinations[key] >= args.min_count):
                if (combinations[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0:
                    covar_fh.write(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}\n")
                    covar_array.append(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}")
                elif args.wgs == 1:
                    # WGS abundance is relative to the minimum coverage over
                    # the positions spanned by the combination
                    coveragepercent = 0
                    splitcombos = key.split()
                    if len(splitcombos) == 1:
                        coveragePOS = ''
                        for c in key:
                            if c.isdigit():
                                coveragePOS += c
                            else:
                                break
                        coveragepercent = combinations[key] / coverage[int(coveragePOS)]
                    else:
                        startcovPOS = ''
                        for c in splitcombos[0]:
                            if c.isdigit():
                                startcovPOS += c
                            else:
                                break
                        endcovPOS = ''
                        for c in splitcombos[-1]:
                            if c.isdigit():
                                endcovPOS += c
                            else:
                                break
                        coveragevals = []
                        for i in range(int(startcovPOS), int(endcovPOS)+1):
                            coveragevals.append(coverage[i])
                        mincov = min(coverval for coverval in coveragevals)
                        coveragepercent = combinations[key] / mincov
                    if coveragepercent >= args.min_samp_abund:
                        covar_fh.write(f"{key}\t{combinations[key]}\t{coveragepercent:.3f}\n")

        covar_fh.close()


    # END COVAR OUT

    return(covar_array, seq_array, sam_read_count)
892
def _reassign_counts(ordered_seqs, singles_counts, deconved):
    """Greedily reassign single-variant counts to covariant sequences.

    For each sequence in ordered_seqs, the count of its rarest remaining
    constituent single becomes the sequence's deconvoluted count; that
    amount is then debited from its other constituent singles.  A
    sequence whose constituent single is already exhausted gets nothing.
    singles_counts is depleted in place; results accumulate in deconved.
    """
    sortedsingles = sorted(singles_counts, key=singles_counts.__getitem__)
    for seq in ordered_seqs:
        singles = seq.split(' ')
        first = 0
        rem_count = 0
        for sing in sortedsingles:
            if sing in singles:
                if singles_counts[sing] > 0:
                    if first == 0:
                        # rarest remaining single sets this seq's count
                        first = 1
                        rem_count = singles_counts[sing]
                        singles_counts[sing] = 0
                        deconved[seq] = rem_count
                    else:
                        singles_counts[sing] = singles_counts[sing] - rem_count
                else:
                    # a constituent single is exhausted; seq gets no count
                    break
        # re-rank singles after each reassignment pass
        sortedsingles = sorted(singles_counts, key=singles_counts.__getitem__)

def cvdeconv(args, covardict, seqdict): # covar deconvolution process
    """Deconvolute covariant counts into unique-sequence counts and write
    the report to args.outfile6.

    Parameters:
      args      - parsed command-line arguments; uses samp, autopass,
                  beta, min_samp_abund and outfile6
      covardict - covariant counts keyed by covariant string, plus the
                  special keys 'total' (mapped read count) and 'singles'
                  (counts of single-variant covariants)
      seqdict   - unique sequence counts keyed by sequence string, plus
                  a 'total' entry
    """

    passedseqs = {}
    preservedseqs = {}
    for seq in seqdict: # pass check for actual : expected abundance
        if seq != 'total' and seq != 'singles':

            splitseq = seq.split(' ')
            abund = 1
            for sing in splitseq:
                # expected abundance = product of the constituent singles'
                # abundances; fall back to the sequence's own abundance
                # when a single has no covariant entry
                try:
                    abund = abund * (covardict[sing] / covardict['total'])
                except (KeyError, ZeroDivisionError):
                    abund = abund * (seqdict[seq] / seqdict['total'])

            try:
                covarabund = covardict[seq]/covardict['total']
            except (KeyError, ZeroDivisionError):
                covarabund = seqdict[seq]/seqdict['total']
                covardict[seq] = seqdict[seq]

            if covarabund >= args.autopass:
                # abundant enough to pass automatically
                preservedseqs[seq] = max(1, args.beta, (covarabund / abund))
            elif covarabund >= abund * args.beta:
                # actual abundance sufficiently above expected
                passedseqs[seq] = covarabund / abund
            elif len(seq.split(' ')) == 1:
                # single variants always pass
                passedseqs[seq] = max(1, args.beta)

    if args.min_samp_abund < 1:
        min_count = args.min_samp_abund * covardict['total']
    else:
        min_count = args.min_samp_abund

    # sort passed covars: longest first, then by actual/expected ratio
    lensortedpassed = sorted(passedseqs, key = lambda key : len(key.split(' ')), reverse=True)
    ratiolensortedpassed = sorted(lensortedpassed, key = lambda key : passedseqs[key], reverse = True)
    deconved = {}
    _reassign_counts(ratiolensortedpassed, covardict['singles'], deconved)

    # autopassed covars then compete for the remaining counts, least
    # abundant first
    sortedpreserved = sorted(preservedseqs, key = lambda key : covardict[key])
    _reassign_counts(sortedpreserved, covardict['singles'], deconved)

    newtotal = sum(deconved.values())
    with open(args.outfile6, "w") as fh_deconv:
        fh_deconv.write(f"{args.samp}({covardict['total']}) | ({newtotal})\tCount\tAbundance\n")
        sorted_deconved = sorted(deconved, key = deconved.__getitem__, reverse = True)
        for seq in sorted_deconved: # write deconv
            if deconved[seq] >= min_count:
                fh_deconv.write(f"{seq}\t{deconved[seq]}\t{(deconved[seq]/newtotal):.3f}\n")
    # END COVAR DECONV OUT

    return()
981
def _leading_pos(variant):
    """Return the leading run of digits (the position) of a variant string."""
    digits = ''
    for c in variant:
        if c.isdigit():
            digits += c
        else:
            break
    return digits

def dechim(args, seqs): # processing sequence dictionary to remove chimeras
    """Identify and remove chimeric sequences from seqs (one pass).

    seqs maps space-delimited variant strings to counts and carries a
    'total' entry.  For each sequence, every left/right split is checked
    for a pair of more abundant potential parents (a parent must be at
    least args.foldab times as abundant).  If the sequence's count is
    below the combined 'expected' recombinant count scaled by args.alpha
    it is treated as a chimera and removed; when args.redist == 1 its
    count is redistributed to the parent pairs proportionally to their
    pairing probability.  Returns the mutated seqs dict with 'total'
    recomputed.
    """
    total = seqs['total']
    del seqs['total']
    sorted_seqs = sorted(seqs, key=seqs.__getitem__) # sort sequences by abundance, least to greatest
    chimeras = []
    for seq in sorted_seqs:
        pot_chim = ['Reference']+seq.split()+['Reference']
        chim_halves = []
        for i in range(0, len(pot_chim)-1): # create dimera halves
            chim_halves.append([pot_chim[:i+1], pot_chim[i+1:]])
        parent_pairs = []

        for dimera in chim_halves: # check for potential parents

            pot_left = []
            pot_rt = []
            lft_len = len(dimera[0])
            rt_len = len(dimera[1])
            for pseq in sorted_seqs:
                if not seq == pseq:
                    if (seqs[pseq] >= (seqs[seq] * args.foldab)):
                        pot_par = ['Reference']+pseq.split()+['Reference']
                        if dimera[0] == pot_par[:lft_len]:
                            pot_left.append(pot_par)
                        if ((len(pot_par) >= rt_len) and (dimera[1] == pot_par[(len(pot_par)-rt_len):])):
                            pot_rt.append(pot_par)

            if (len(pot_left) > 0 and len(pot_rt) > 0):
                for left_par in pot_left: # check potential parents' pairing
                    for rt_par in pot_rt:
                        if left_par != rt_par:
                            left_break = left_par[lft_len]
                            rt_break = rt_par[(len(rt_par)-rt_len)-1]
                            if left_break == 'Reference' or rt_break == 'Reference':
                                # a parent matches all the way to an end
                                parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])])
                            elif int(_leading_pos(left_break)) > int(_leading_pos(rt_break)):
                                # parents' extra variants overlap the break point
                                parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])])

        par_tot_abund = 0
        pair_probs = []
        for parents in parent_pairs: # calc 'expected' abundance
            pair_prob = (seqs[parents[0]] / total) * (seqs[parents[1]] / total)
            par_tot_abund += pair_prob
            pair_probs.append(pair_prob)

        recomb_count = par_tot_abund * total

        if not seqs[seq] >= recomb_count * args.alpha: # chimera check
            redist_count = float(seqs[seq])
            chimeras.append(seq)
            seqs[seq] = 0
            if args.redist == 1: # redistribute counts of chimera to parents
                for parents, pair_prob in zip(parent_pairs, pair_probs):
                    counts_to_redist = (redist_count * (pair_prob/par_tot_abund))/2
                    seqs[parents[0]] += counts_to_redist
                    seqs[parents[1]] += counts_to_redist

    for chim in chimeras: # remove chimeras
        del seqs[chim]

    seqs['total'] = sum(seqs.values())

    return(seqs)
1067
def chimrm(args, seqs): # chimera removed process
    """Run chimera removal to convergence and write the report.

    Repeatedly passes seqs through dechim until a pass removes no
    sequences (or a 100-iteration safety limit is hit), then writes the
    surviving sequences that meet the min_samp_abund threshold to
    args.outfile5, most abundant first.
    """
    pre_len = len(seqs)
    inf_loop_shield = 0
    while True: # send sequences for chimera removal while chimeras are still found
        dechim(args, seqs)
        post_len = len(seqs)
        inf_loop_shield += 1
        if post_len >= pre_len:
            # nothing was removed this pass; converged
            break
        if inf_loop_shield > 100:
            # safety net against a pathological non-converging input
            break
        pre_len = len(seqs)

    total = seqs['total']
    del seqs['total']
    # min_samp_abund < 1 is a fractional abundance; otherwise a raw count
    if args.min_samp_abund < 1:
        min_count = args.min_samp_abund * total
    else:
        min_count = args.min_samp_abund
    sorted_seqs = sorted(seqs, key=seqs.__getitem__, reverse=True)
    with open(args.outfile5, 'w') as fh_dechime:
        fh_dechime.write(f"{args.samp}({int(total)})\n")
        fh_dechime.write("Sequences\tCount\tAbundance\n")
        # fix: iterate the sorted order (previously computed but unused,
        # so rows came out in arbitrary dict order)
        for seq in sorted_seqs: # write chim_rm seqs
            abund = seqs[seq]/total
            if seqs[seq] >= min_count:
                fh_dechime.write(f"{seq}\t{round(seqs[seq])}\t{abund:.3f}\n")

    return()
1099
def main():
    """Entry point: parse arguments, process the SAM file against the
    reference, then optionally run covariant deconvolution and/or
    chimera removal on the parsed results.
    """

    args = arg_parser() # getting command line arguments

    covar_array = []
    seq_array = []
    total_mapped_reads = 0

    if args.ref:
        ref = get_ref(args.ref) # get the reference ID and sequence from the FASTA file
        args.ref.close()
        if ref[1] == '':
            print('Reference not recognized as a Fasta format, skipping SAM parsing')
        else:
            refprot = ''
            if args.AAreport == 1: # make an Amino Acid sequence based on the reference sequence
                # translate whole codons; join avoids quadratic string concat
                codon_count = (len(ref[1])-1)//3
                refprot = ''.join(AAcall(ref[1][x*3:x*3+3]) for x in range(codon_count))
                if (len(ref[1])-1)%3 != 0:
                    refprot = refprot + '?' # trailing partial codon

            covar_array, seq_array, total_mapped_reads = SAMparse(args, ref, refprot, args.Sam_file)

    else:
        print('No reference provided, skipping SAM parsing')

    in_covars = {'total' : total_mapped_reads,
                 'singles' : {}
                 }
    in_seqs = {'total' : total_mapped_reads}
    # Begin chimera removal if enabled
    if args.chim_rm == 1 or args.deconv == 1:
        if args.deconv == 1:
            # deconvolution needs covariant counts, singles tracked separately
            for line in covar_array:
                lineparts = line.split("\t")
                in_covars[lineparts[0]] = int(lineparts[1])
                if len(lineparts[0].split(' ')) == 1:
                    in_covars['singles'][lineparts[0]] = int(lineparts[1])

        # both processes need the unique-sequence counts
        for line in seq_array:
            lineparts = line.split("\t")
            in_seqs[lineparts[0]] = float(lineparts[1])

        if args.deconv == 1:
            cvdeconv(args, in_covars, in_seqs)

        if args.chim_rm == 1:
            chimrm(args, in_seqs)
1149
if __name__ == '__main__': # script entry point when run directly
    main()