sam_refiner (Mercurial repository, degregory)
SAM_Refiner_Gal.py @ changeset 0:22e3f843f1db (draft)

Uploaded main file

author: degregory
date:   Mon, 13 Sep 2021 14:12:21 +0000
#!/bin/env python3
# Written by Devon Gregory with assistance from Christopher Bottoms
# University of Missouri
# Distributed under GNU GENERAL PUBLIC LICENSE v3
import os
import sys
import argparse
import itertools
import time
from multiprocessing import Process
from pathlib import Path

# process for collecting command line arguments
def arg_parser():

    parser = argparse.ArgumentParser(
        description='process Sam files for variant information'
    )

    parser.add_argument(
        '--samp',
        type=str,
        default='Sample',
        help='Sample Name'
    )
    parser.add_argument(
        '-r', '-reference',
        type=argparse.FileType('r'),
        dest='ref',
        help='reference fasta'
    )
    parser.add_argument(
        '-S', '--Sam_file',
        type=argparse.FileType('r'),
        dest='Sam_file',
        help='.sam file'
    )
    parser.add_argument(
        '-o1', '--out_file1',
        dest='outfile1',
        help='output file for unique sequences'
    )
    parser.add_argument(
        '-o2', '--out_file2',
        dest='outfile2',
        help='output file for indels'
    )
    parser.add_argument(
        '-o3', '--out_file3',
        dest='outfile3',
        help='output file for nt calls'
    )
    parser.add_argument(
        '-o4', '--out_file4',
        dest='outfile4',
        help='output file for covariants'
    )
    parser.add_argument(
        '-o5', '--out_file5',
        dest='outfile5',
        help='output file for chimera removed sequences'
    )
    parser.add_argument(
        '-o6', '--out_file6',
        dest='outfile6',
        help='output file for deconvoluted covariants'
    )
    parser.add_argument(
        '-o7', '--out_file7',
        dest='outfile7',
        help='output file for nt variants'
    )
    parser.add_argument(
        '--use_count',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) use of counts in sequence IDs, default enabled (--use_count 1)'
    )
    parser.add_argument(
        '--min_count',
        type=int,
        default=10,
        help='Minimum observation count required to be included in sample reports (default: 10)'
    )
    parser.add_argument(
        '--min_samp_abund',
        type=float,
        default=0.001,
        help='Minimum abundance required for inclusion in sample reports; %% observed (.1 = 10%%), (default: .001)'
    )
    parser.add_argument(
        '--ntabund',
        type=float,
        default=.001,
        help='Minimum abundance relative to total reads required for a position to be reported in the nt call output; must be non-negative and < 1, %% observed (.1 = 10%%), (default: .001)'
    )
    parser.add_argument(
        '--max_dist',
        type=int,
        default=40,
        help='Maximum number of variances from the reference a sequence can have to be considered in covars processing (default: 40)'
    )
    parser.add_argument(
        '--max_covar',
        type=int,
        default=8,
        help='Maximum number of variances from the reference to be reported in covars (default: 8)'
    )
    parser.add_argument(
        '--AAreport',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) amino acid reporting, default enabled (--AAreport 1)'
    )
    parser.add_argument(
        '--AAcodonasMNP',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) reporting multiple nt changes in a single codon as one polymorphism, default enabled (--AAcodonasMNP 1), requires AAreport enabled'
    )
    parser.add_argument(
        '--alpha',
        type=float,
        default=1.2,
        help='Modifier for chim_rm chimera checking, default 1.2. Higher = more sensitive, more false chimeras removed; lower = less sensitive, fewer chimeras removed'
    )
    parser.add_argument(
        '--foldab',
        type=float,
        default=1.8,
        help='Threshold for potential parent / chimera abundance ratio for chim_rm; default is 1.8'
    )
    parser.add_argument(
        '--redist',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) redistribution of chimera counts for chim_rm, default enabled (--redist 1)'
    )
    parser.add_argument(
        '--beta',
        type=float,
        default=1,
        help='Modifier for covar pass checking, default 1. Higher = more sensitive, more failed checks; lower = less sensitive, fewer failed checks'
    )
    parser.add_argument(
        '--autopass',
        type=float,
        default=.3,
        help='threshold for a sequence to automatically pass the covar pass checking'
    )
    parser.add_argument(
        '--nt_call',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) nt_call output, default enabled (--nt_call 1)'
    )
    parser.add_argument(
        '--ntvar',
        type=int,
        default=0,
        choices=[0, 1],
        help='Enable/Disable (1/0) nt_var output, default disabled (--ntvar 0)'
    )
    parser.add_argument(
        '--indel',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) indel output, default enabled (--indel 1)'
    )
    parser.add_argument(
        '--seq',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) unique seq output, default enabled (--seq 1)'
    )
    parser.add_argument(
        '--covar', type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) covar output, default enabled (--covar 1)'
    )
    parser.add_argument(
        '--chim_rm',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) chim_rm output, default enabled (--chim_rm 1)'
    )
    parser.add_argument(
        '--deconv',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) covar deconv, default enabled (--deconv 1)'
    )
    parser.add_argument(
        '--wgs',
        type=int,
        default=0,
        choices=[0, 1],
        help='Enable/Disable (1/0) whole genome sequencing mode, default disabled (--wgs 0)'
    )

    args = parser.parse_args()

    # checking for proper range of some parameters and consistency/compatibility
    if args.wgs == 1:
        if args.deconv == 1 or args.chim_rm == 1:
            args.deconv = 0
            args.chim_rm = 0
            print('WGS mode enabled, disabling chimera removal methods')

    return(args)

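# Illustrative invocation (hypothetical file names; flags as defined above).
# The seven output paths map to the reports written below: unique sequences,
# indels, nt calls, covariants, chim_rm, covar deconvolution, and nt variants:
#   python SAM_Refiner_Gal.py --samp Sample1 -r reference.fasta -S aligned.sam \
#       -o1 seqs.tsv -o2 indels.tsv -o3 nt_calls.tsv -o4 covars.tsv \
#       -o5 chim_rm.tsv -o6 deconv.tsv -o7 nt_vars.tsv
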
def get_ref(ref):  # get the reference ID and sequence from the FASTA file. Will only get the first.

    n = 0
    refID = ''
    refseq = ''
    if ref:
        for line in ref:
            if line.startswith('>'):
                n += 1
                if n > 1:
                    break
                refID = line[1:].strip("\n\r")
            elif n == 1:
                refseq = refseq + line.strip("\n\r")
        refseq = refseq.upper()

    return(refID, refseq)

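# Example (illustrative): given a FASTA stream
#   >ref_id description
#   acgt
#   ACGT
# get_ref returns ('ref_id description', 'ACGTACGT'); any records after the
# first are ignored.
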
def AAcall(codon):  # amino acid / codon dictionary to return encoded AAs
    AAdict = {
        'TTT' : 'F',
        'TTC' : 'F',
        'TTA' : 'L',
        'TTG' : 'L',
        'TCT' : 'S',
        'TCC' : 'S',
        'TCA' : 'S',
        'TCG' : 'S',
        'TAT' : 'Y',
        'TAC' : 'Y',
        'TAA' : '*',
        'TAG' : '*',
        'TGT' : 'C',
        'TGC' : 'C',
        'TGA' : '*',
        'TGG' : 'W',

        'CTT' : 'L',
        'CTC' : 'L',
        'CTA' : 'L',
        'CTG' : 'L',
        'CCT' : 'P',
        'CCC' : 'P',
        'CCA' : 'P',
        'CCG' : 'P',
        'CAT' : 'H',
        'CAC' : 'H',
        'CAA' : 'Q',
        'CAG' : 'Q',
        'CGT' : 'R',
        'CGC' : 'R',
        'CGA' : 'R',
        'CGG' : 'R',

        'ATT' : 'I',
        'ATC' : 'I',
        'ATA' : 'I',
        'ATG' : 'M',
        'ACT' : 'T',
        'ACC' : 'T',
        'ACA' : 'T',
        'ACG' : 'T',
        'AAT' : 'N',
        'AAC' : 'N',
        'AAA' : 'K',
        'AAG' : 'K',
        'AGT' : 'S',
        'AGC' : 'S',
        'AGA' : 'R',
        'AGG' : 'R',

        'GTT' : 'V',
        'GTC' : 'V',
        'GTA' : 'V',
        'GTG' : 'V',
        'GCT' : 'A',
        'GCC' : 'A',
        'GCA' : 'A',
        'GCG' : 'A',
        'GAT' : 'D',
        'GAC' : 'D',
        'GAA' : 'E',
        'GAG' : 'E',
        'GGT' : 'G',
        'GGC' : 'G',
        'GGA' : 'G',
        'GGG' : 'G'
    }
    AA = '?'
    if codon in AAdict:
        AA = AAdict[codon]

    return(AA)

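# Examples (illustrative): AAcall('ATG') returns 'M', AAcall('TGA') returns
# '*' (stop), and any codon not in the table, such as one containing 'N' or
# '-', returns '?'.
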
def singletCodon(ntPOS, nt, ref):  # process to return the AA and protein seq. position based on the reference and provided nt seq position and nt
    AAPOS = (ntPOS-1)//3
    AAmod = (ntPOS-1)%3
    codon = ""
    try:
        if AAmod == 0:
            codon = nt+ref[1][ntPOS]+ref[1][ntPOS+1]
        elif AAmod == 1:
            codon = ref[1][ntPOS-2]+nt+ref[1][ntPOS]
        elif AAmod == 2:
            codon = ref[1][ntPOS-3]+ref[1][ntPOS-2]+nt
    except:
        codon = "XXX"

    return(AAPOS+1, AAcall(codon))

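# Worked example (illustrative): with ref = ('id', 'ATGAAA'), a T at reference
# position 4 starts the second codon, so singletCodon(4, 'T', ref) builds
# 'T' + ref[1][4] + ref[1][5] = 'TAA' and returns (2, '*'). Positions falling
# past the end of the reference produce the placeholder codon "XXX" -> '?'.
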
def getCombos(qlist, clen):  # returns combinations of single polymorphisms in a sequence
    combos = []
    if (clen == 0 or clen > len(qlist)):
        clen = len(qlist)
    for N in range(1, clen+1):
        for comb in itertools.combinations(qlist, N):
            combos.append(' '.join(comb))
    return(combos)

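# Example (illustrative): getCombos(['5A', '10T'], 8) caps clen at 2 and
# returns ['5A', '10T', '5A 10T'], i.e. every combination of 1 up to clen
# polymorphisms, joined with spaces.
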
def SAMparse(args, ref, refprot, file):  # process SAM files
    covar_array = []
    seq_array = []

    nt_call_dict_dict = {}
    indel_dict = {}
    seq_species = {}
    sam_read_count = 0
    sam_line_count = 0
    firstPOS = 0
    lastPOS = 0
    coverage = {}
    for line in args.Sam_file:
        if not line.startswith('@'):  # ignore header lines
            splitline = line.split("\t")
            if ref[0].upper().startswith(splitline[2].upper()):  # check map ID matches reference ID
                if int(splitline[4]) > 0:  # Check mapping score is positive

                    abund_count = 1
                    if args.use_count == 1:  # get the unique sequence counts
                        if '-' in splitline[0] and '=' in splitline[0]:
                            eq_split = splitline[0].split('=')
                            dash_split = splitline[0].split('-')
                            if len(eq_split[-1]) > len(dash_split[-1]):
                                abund_count = int(dash_split[-1])
                            else:
                                abund_count = int(eq_split[-1])

                        elif '-' in splitline[0]:
                            try:
                                abund_count = int(splitline[0].split('-')[-1])
                            except ValueError:
                                pass

                        elif '=' in splitline[0]:
                            try:
                                abund_count = int(splitline[0].split('=')[-1])
                            except ValueError:
                                pass

                    sam_read_count += abund_count

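                    # Illustrative read IDs (assuming upstream read-collapsing
                    # tools append counts): with --use_count 1, 'seq105-1173'
                    # and 'seq105=1173' both yield abund_count 1173; when both
                    # separators appear, the shorter trailing field is taken;
                    # IDs with no parsable trailing count fall back to 1.
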
                    CIGAR = splitline[5]
                    POS = int(splitline[3])
                    if firstPOS == 0:
                        firstPOS = POS
                    elif POS < firstPOS:
                        firstPOS = POS

                    readID = splitline[0]
                    query_seq = splitline[9].upper()
                    run_length = 0
                    query_seq_parsed = ''
                    query_pos = 0
                    q_pars_pos = 0
                    mutations = []

                    for C in CIGAR:  # process sequence based on standard CIGAR line
                        if C == 'M' or C == 'I' or C == 'D' or C == 'S' or C == 'H':
                            if C == 'S':
                                query_pos = query_pos + run_length

                            if C == 'I':
                                if query_pos > 0:
                                    # add insertion to dict
                                    iPOS = q_pars_pos+POS

                                    iSeq = query_seq[query_pos: query_pos+run_length]
                                    istring = str(iPOS)+'-insert'+iSeq

                                    try:
                                        indel_dict[istring]
                                    except KeyError:
                                        indel_dict[istring] = abund_count
                                    else:
                                        indel_dict[istring] += abund_count

                                    if args.AAreport == 1 and (run_length % 3 == 0):

                                        iProt = ''
                                        if iPOS % 3 == 1:
                                            for x in range(0, (run_length//3)):
                                                AA = AAcall(iSeq[x*3]+iSeq[x*3+1]+iSeq[x*3+2])
                                                iProt = iProt + AA
                                            mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
                                        elif iPOS % 3 == 2:
                                            if query_pos > 0:
                                                ipSeq = query_seq[query_pos-1:query_pos+run_length+2]
                                            else:
                                                ipSeq = "XXX"+query_seq[query_pos+2:query_pos+run_length+2]
                                            for x in range(0, (run_length//3)+1):
                                                AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2])
                                                iProt = iProt + AA
                                            mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
                                        else:
                                            if query_pos > 1:
                                                ipSeq = query_seq[query_pos-2:query_pos+run_length+1]
                                            else:
                                                ipSeq = "XXX"+query_seq[query_pos+1:query_pos+run_length+1]

                                            for x in range(0, (run_length//3)+1):
                                                AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2])
                                                iProt = iProt + AA
                                            mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
                                    else:
                                        mutations.append(istring)

                                query_pos = query_pos + run_length

                            elif C == 'D':
                                for X in range(0, run_length):
                                    query_seq_parsed += '-'

                                delstring = str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'

                                if args.AAreport == 1 and (run_length % 3 == 0) and not ((q_pars_pos+POS) % 3 == 1):
                                    if (q_pars_pos+POS) % 3 == 2:
                                        newcodon = query_seq[query_pos-1:query_pos+2]
                                        newAArefpos = (q_pars_pos+POS) // 3
                                        mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')')
                                    else:
                                        newcodon = query_seq[query_pos-2:query_pos+1]
                                        newAArefpos = (q_pars_pos+POS) // 3
                                        mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')')
                                else:
                                    mutations.append(delstring)

                                if args.nt_call == 1:
                                    for N in range(q_pars_pos+POS, q_pars_pos+POS+int(run_length)):
                                        try:
                                            nt_call_dict_dict[N]
                                        except KeyError:
                                            nt_call_dict_dict[N] = {'A' : 0,
                                                                    'T' : 0,
                                                                    'C' : 0,
                                                                    'G' : 0,
                                                                    '-' : 0}
                                            nt_call_dict_dict[N]['-'] = abund_count
                                        else:
                                            nt_call_dict_dict[N]['-'] += abund_count

                                try:
                                    indel_dict[delstring]
                                except KeyError:
                                    indel_dict[delstring] = int(abund_count)
                                else:
                                    indel_dict[delstring] += int(abund_count)

                                q_pars_pos = q_pars_pos + run_length

                            elif C == 'M':
                                offset = q_pars_pos-query_pos
                                refPOS = POS+offset

                                for ntPOS in range(query_pos, query_pos+run_length):
                                    if query_seq[ntPOS] == 'A' or query_seq[ntPOS] == 'T' or query_seq[ntPOS] == 'C' or query_seq[ntPOS] == 'G':
                                        if query_seq[ntPOS] != ref[1][refPOS+ntPOS-1]:
                                            if args.AAreport == 1 and args.AAcodonasMNP == 0:
                                                AAinfo = singletCodon(refPOS+ntPOS, query_seq[ntPOS], ref)
                                                mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
                                            else:
                                                mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS])
                                        if args.nt_call == 1:
                                            try:
                                                nt_call_dict_dict[refPOS+ntPOS]
                                            except KeyError:
                                                nt_call_dict_dict[refPOS+ntPOS] = {'A' : 0,
                                                                                   'T' : 0,
                                                                                   'C' : 0,
                                                                                   'G' : 0,
                                                                                   '-' : 0}
                                                nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] = abund_count
                                            else:
                                                nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] += abund_count

                                q_pars_pos = q_pars_pos + run_length
                                query_pos = query_pos + run_length

                            run_length = 0

                        else:
                            run_length = (10 * run_length) + int(C)
                    # END CIGAR PARSE

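                    # Worked example (illustrative): for CIGAR '4S96M', the
                    # digit characters accumulate run_length (4, then 96);
                    # 'S' skips the 4 soft-clipped bases by advancing
                    # query_pos, and 'M' compares the next 96 query bases to
                    # the reference, recording mismatches in mutations and
                    # per-position counts in nt_call_dict_dict.
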
                    if len(mutations) == 0:  # record reference counts
                        if args.wgs == 0:
                            try:
                                seq_species['Reference'] += abund_count
                            except KeyError:
                                seq_species['Reference'] = abund_count
                        else:
                            try:
                                seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] += abund_count
                            except KeyError:
                                seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] = abund_count

                    else:  # record variants and counts
                        if args.AAreport == 1 and args.AAcodonasMNP == 1:
                            codonchecked = []
                            codon = ''
                            skip = 0
                            MNP = ''
                            for i in range(0, len(mutations)):  # checking for MNP
                                if 'Del' in mutations[i] or 'insert' in mutations[i]:
                                    codonchecked.append(mutations[i])
                                elif skip > 0:
                                    skip -= 1
                                else:
                                    mut1POS = int(''.join([c for c in mutations[i] if c.isdigit()]))
                                    try:
                                        mutations[i+1]
                                    except IndexError:
                                        AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
                                        codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
                                    else:
                                        if mut1POS % 3 == 0:
                                            AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
                                            codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
                                        else:
                                            mut2POS = int(''.join([c for c in mutations[i+1] if c.isdigit()]))
                                            if mut2POS - mut1POS < 3:
                                                AAPOS = mut1POS // 3
                                                if mut1POS % 3 == 1:
                                                    if mut2POS % 3 == 0:
                                                        codon = mutations[i][-1]+ref[1][mut1POS]+mutations[i+1][-1]
                                                        MNP = mutations[i][-1]+'r'+mutations[i+1][-1]
                                                        skip = 1
                                                    else:
                                                        try:
                                                            mutations[i+2]
                                                        except IndexError:
                                                            codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS]
                                                            MNP = mutations[i][-1]+mutations[i+1][-1]+'r'
                                                            skip = 1
                                                        else:
                                                            mut3POS = int(''.join([c for c in mutations[i+2] if c.isdigit()]))
                                                            if mut2POS == mut3POS - 1:
                                                                codon = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1]
                                                                MNP = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1]
                                                                skip = 2
                                                            else:
                                                                codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS]
                                                                MNP = mutations[i][-1]+mutations[i+1][-1]+'r'
                                                                skip = 1
                                                    codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')')
                                                elif mut2POS - mut1POS == 1:
                                                    codon = ref[1][mut1POS-2]+mutations[i][-1]+mutations[i+1][-1]
                                                    MNP = mutations[i][-1]+mutations[i+1][-1]
                                                    skip = 1
                                                    codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')')
                                                else:
                                                    AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
                                                    codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')

                                            else:
                                                AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
                                                codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
                            mutations = " ".join(codonchecked)
                        else:
                            mutations = " ".join(mutations)

                        if args.wgs == 0:
                            try:
                                seq_species[mutations] += abund_count
                            except KeyError:
                                seq_species[mutations] = abund_count
                        else:
                            try:
                                seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] += abund_count
                            except KeyError:
                                seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] = abund_count

                    if lastPOS < POS+q_pars_pos:
                        lastPOS = POS+q_pars_pos
                    for i in range(POS, POS+q_pars_pos):  # update coverage
                        try:
                            coverage[i] += abund_count
                        except KeyError:
                            coverage[i] = abund_count

    args.Sam_file.close()
    # END SAM LINES

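    # Worked example (illustrative): with ref[1] beginning 'ATGAAAGGG'
    # (refprot 'MKG'), the single nt variants '4T' and '5C' fall in one
    # codon, so with --AAcodonasMNP 1 they merge into '4TCr(K2S)': codon 2
    # becomes 'TCA', the trailing 'r' marks the third base left at the
    # reference, and K2S is the resulting amino acid change.
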
    total_mapped_reads = sam_read_count

    if args.seq == 1:  # output the unique sequences
        seq_fh = open(args.outfile1, "w")
        seq_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        seq_fh.write("Unique Sequence\tCount\tAbundance\n")

        sorted_seq = sorted(seq_species, key=seq_species.__getitem__, reverse=True)
        for key in sorted_seq:
            if seq_species[key] >= args.min_count:
                if (seq_species[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0:
                    seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}\n")
                    seq_array.append(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}")
                elif args.wgs == 1:
                    splitseqs = key.split()
                    cov = []
                    for x in range(int(splitseqs[0]), int(splitseqs[-1])):
                        cov.append(coverage[x])
                    min_cov = min(cov)
                    if (seq_species[key]/min_cov >= args.min_samp_abund):
                        seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/min_cov):.3f}\n")
            else:
                break

        seq_fh.close()
    # END SEQ OUT

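    # Illustrative output (outfile1), assuming a sample named 'Sample1' with
    # 1000 mapped reads and one variant seen 250 times (tab-separated):
    #   Sample1(1000)
    #   Unique Sequence\tCount\tAbundance
    #   4TCr(K2S)\t250\t0.250
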
    if args.indel == 1:  # and len(indel_dict) > 0: # output indels, if there are any
        sorted_indels = sorted(indel_dict, key=indel_dict.__getitem__, reverse=True)
        indels_to_write = []
        for key in sorted_indels:
            if indel_dict[key] >= args.min_count:
                if indel_dict[key] / sam_read_count >= args.min_samp_abund and args.wgs == 0:
                    indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key]/sam_read_count):.3f}\n")
                elif args.wgs == 1:
                    indelPOS = ''
                    for c in key:
                        if c.isdigit():
                            indelPOS += c
                        else:
                            break
                    indelPOS = int(indelPOS)
                    if indel_dict[key] / coverage[indelPOS] >= args.min_samp_abund:
                        indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key] / coverage[indelPOS]):.3f}\n")
            else:
                break
        # if len(indels_to_write) > 0:
        indel_fh = open(args.outfile2, "w")
        indel_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        indel_fh.write("Indel\tCount\tAbundance\n")
        for indel_entry in indels_to_write:
            indel_fh.write(indel_entry)
        indel_fh.close()
    # END INDEL OUT

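    # Key formats (from the CIGAR parsing above, values illustrative):
    # insertions are recorded as '<POS>-insert<SEQ>' (e.g. '100-insertTTT')
    # and deletions as '<start>-<end>Del' (e.g. '100-102Del'); outfile2
    # reports each key with its count and abundance.
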
    if args.nt_call == 1:  # output nt calls
        ntcall_fh = open(args.outfile3, "w")
        ntcall_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        if args.ntvar == 1:
            ntcallv_fh = open(args.outfile7, "w")
            ntcallv_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        sorted_POS = sorted(nt_call_dict_dict)
        if args.AAreport == 1:
            ntcall_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n")
            if args.ntvar == 1:
                ntcallv_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n")
            for POS in sorted_POS:
                try:
                    total = coverage[POS]
                except KeyError:
                    total = 0
                if total >= (sam_read_count * args.ntabund):
                    AAinfo = singletCodon(POS, ref[1][POS-1], ref)
                    POS_calls = {}
                    for key in nt_call_dict_dict[POS]:
                        POS_calls[key] = nt_call_dict_dict[POS][key]
                    sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True)

                    ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
                    ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                    ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
                    ntcall_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
                    if sorted_calls[0] != ref[1][POS-1]:
                        if args.ntvar == 1:
                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                            ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
                            ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")

                        mod = (POS)%3

                        if mod == 0:
                            try:
                                codon = ref[1][POS-3]+ref[1][POS-2]+sorted_calls[0]
                            except IndexError:
                                codon = 'NNN'
                        elif mod == 2:
                            try:
                                codon = ref[1][POS-2]+sorted_calls[0]+ref[1][POS]
                            except IndexError:
                                codon = 'NNN'
                        elif mod == 1:
                            try:
                                codon = sorted_calls[0]+ref[1][POS]+ref[1][POS+1]
                            except IndexError:
                                codon = 'NNN'
                        ntcall_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1])
                        if args.ntvar == 1:
                            ntcallv_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1])
                        if (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total >= args.min_samp_abund):
                            if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1:
                                ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
                            ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
                            if nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count:
                                if nt_call_dict_dict[POS][sorted_calls[2]] / total >= args.min_samp_abund:
                                    if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
                                        ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
                                    ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")

                        if args.ntvar == 1:
                            ntcallv_fh.write("\n")
                    elif (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] / total >= args.min_samp_abund):
                        if args.ntvar == 1:
                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                            ntcallv_fh.write("\t"+str(total)+"\t\t")
                            ntcallv_fh.write(f"\t")
                            ntcallv_fh.write("\t\t")
                            ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
                        ntcall_fh.write("\t\t")
                        ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])

                        if (nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total >= args.min_samp_abund):
                            ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
                            if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
                                ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
                        if args.ntvar == 1:
                            ntcallv_fh.write("\n")

                    ntcall_fh.write("\n")
        else:
            ntcall_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n")
            if args.ntvar == 1:
                ntcallv_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n")

            for POS in sorted_POS:
                try:
                    total = coverage[POS]  # sum(nt_call_dict_dict[POS].values())
                except KeyError:
                    total = 0
                if total >= (sam_read_count * args.ntabund):
                    POS_calls = {}
                    for key in nt_call_dict_dict[POS]:
                        POS_calls[key] = nt_call_dict_dict[POS][key]
                    sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True)

                    ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1])
                    ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                    ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")

                    if sorted_calls[0] != ref[1][POS-1]:
                        if args.ntvar == 1:
                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1])
                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                            ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
                            ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
                        if (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund):
                            ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
                            if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1:
                                ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
                            if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund):
                                ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
                                if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
                                    ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
                        if args.ntvar == 1:
                            ntcallv_fh.write("\n")

                    elif (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund):
                        if args.ntvar == 1:
                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1])
                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                            ntcallv_fh.write("\t"+str(total)+"\t\t")
                            ntcallv_fh.write(f"\t")
                            ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
                        ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
                        if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund):
                            ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
                            if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
                                ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
                        if args.ntvar == 1:
                            ntcallv_fh.write("\n")

                    ntcall_fh.write("\n")

        ntcall_fh.close()
        if args.ntvar == 1:
            ntcallv_fh.close()
    # END NT CALL OUT
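    # Note (illustrative arithmetic): a position is reported only when its
    # coverage reaches sam_read_count * ntabund; with 10,000 mapped reads and
    # the default --ntabund .001, that means a coverage of at least 10 reads.
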
    if args.covar == 1:  # output covariants
        testtrack = 0
        combinations = {}
        for sequence in seq_species:
            if args.wgs == 0:
                singles = sequence.split()
            else:
                singles = (sequence.split())[1:-1]
            if len(singles) <= args.max_dist and singles[0] != 'Ref':
                for combo in getCombos(singles, args.max_covar):
                    if not combo in combinations:
                        combinations[combo] = seq_species[sequence]
                    else:
                        combinations[combo] += seq_species[sequence]

        covar_fh = open(args.outfile4, "w")
        covar_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        covar_fh.write("Co-Variants\tCount\tAbundance\n")
        sortedcombos = sorted(combinations, key=combinations.__getitem__, reverse=True)
        for key in sortedcombos:
            if (combinations[key] >= args.min_count):
                if (combinations[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0:
                    covar_fh.write(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}\n")
                    covar_array.append(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}")
                elif args.wgs == 1:
                    coveragepercent = 0
                    splitcombos = key.split()
                    if len(splitcombos) == 1:
                        coveragePOS = ''
                        for c in key:
                            if c.isdigit():
                                coveragePOS += c
                            else:
                                break
                        coveragepercent = combinations[key] / coverage[int(coveragePOS)]
                    else:
                        startcovPOS = ''
                        for c in splitcombos[0]:
                            if c.isdigit():
                                startcovPOS += c
                            else:
                                break
                        endcovPOS = ''
                        for c in splitcombos[-1]:
                            if c.isdigit():
                                endcovPOS += c
                            else:
                                break
                        coveragevals = []
                        for i in range(int(startcovPOS), int(endcovPOS)+1):
                            coveragevals.append(coverage[i])
                        mincov = min(coveragevals)
                        coveragepercent = combinations[key] / mincov
                    if coveragepercent >= args.min_samp_abund:
                        covar_fh.write(f"{key}\t{combinations[key]}\t{coveragepercent:.3f}\n")

        covar_fh.close()

    # END COVAR OUT

    return(covar_array, seq_array, sam_read_count)

def cvdeconv(args, covardict, seqdict):  # covar deconvolution process

    passedseqs = {}
    preservedseqs = {}
    for seq in seqdict:  # pass check for actual : expected abundance
        if seq != 'total' and seq != 'singles':

            splitseq = seq.split(' ')
            abund = 1
            for sing in splitseq:
                try:
                    abund = abund * (covardict[sing] / covardict['total'])
                except KeyError:
                    abund = abund * (seqdict[seq] / seqdict['total'])

            try:
                covarabund = covardict[seq]/covardict['total']
            except KeyError:
                covarabund = seqdict[seq]/seqdict['total']
                covardict[seq] = seqdict[seq]

            if covarabund >= args.autopass:
                preservedseqs[seq] = max(1, args.beta, (covarabund / abund))
            elif covarabund >= abund * args.beta:
                passedseqs[seq] = covarabund / abund
            elif len(seq.split(' ')) == 1:
                passedseqs[seq] = max(1, args.beta)

    if args.min_samp_abund < 1:
        min_count = args.min_samp_abund * covardict['total']
    else:
        min_count = args.min_samp_abund

    # sort passed covars
    lensortedpassed = sorted(passedseqs, key = lambda key : len(key.split(' ')), reverse=True)
    ratiolensortedpassed = sorted(lensortedpassed, key = lambda key : passedseqs[key], reverse = True)
    sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__)
    deconved = {}
    for seq in ratiolensortedpassed:  # reassign counts
        singles = seq.split(' ')
        first = 0
        rem_count = 0
        for sing in sortedsingles:
            if sing in singles:
                if covardict['singles'][sing] > 0:
                    if first == 0:
                        first = 1
                        rem_count = covardict['singles'][sing]
                        covardict['singles'][sing] = 0
                        deconved[seq] = rem_count
                    else:
                        covardict['singles'][sing] = covardict['singles'][sing] - rem_count
                else:
                    break
        sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__)

    sortedpreserved = sorted(preservedseqs, key = lambda key : covardict[key])

    for seq in sortedpreserved:
        singles = seq.split(' ')
        first = 0
        rem_count = 0
        for sing in sortedsingles:
            if sing in singles:
                if covardict['singles'][sing] > 0:
                    if first == 0:
                        first = 1
                        rem_count = covardict['singles'][sing]
                        covardict['singles'][sing] = 0
                        deconved[seq] = rem_count
                    else:
                        covardict['singles'][sing] = covardict['singles'][sing] - rem_count
                else:
                    break
        sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__)

    newtotal = sum(deconved.values())
    fh_deconv = open(args.outfile6, "w")
    fh_deconv.write(f"{args.samp}({covardict['total']}) | ({newtotal})\tCount\tAbundance\n")
    sorted_deconved = sorted(deconved, key = deconved.__getitem__, reverse = True)
    for seq in sorted_deconved:  # write deconv
        if deconved[seq] >= min_count:
            fh_deconv.write(f"{seq}\t{deconved[seq]}\t{(deconved[seq]/newtotal):.3f}\n")
    fh_deconv.close()
    # END COVAR DECONV OUT

    return()

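# Worked example (illustrative values): if singles '5A' and '11T' have
# abundances 0.4 and 0.3, the expected abundance of the covariant '5A 11T'
# is 0.4 * 0.3 = 0.12. With the default --beta 1 the covariant passes when
# its observed abundance is at least 0.12, and anything at or above the
# default --autopass .3 is preserved regardless of the expected value.
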
def dechim(args, seqs):  # processing sequence dictionary to remove chimeras
    total = seqs['total']
    del seqs['total']
    sorted_seqs = sorted(seqs, key=seqs.__getitem__)  # sort sequences by abundance, least to greatest
    chimeras = []
    for seq in sorted_seqs:
        pot_chim = ['Reference']+seq.split()+['Reference']
        chim_halves = []
        for i in range(0, len(pot_chim)-1):  # create dimera halves
            chim_halves.append([pot_chim[:i+1], pot_chim[i+1:]])
        parent_pairs = []

        for dimera in chim_halves:  # check for potential parents

            pot_left = []
            pot_rt = []
            lft_len = len(dimera[0])
            rt_len = len(dimera[1])
            for pseq in sorted_seqs:
                if not seq == pseq:
                    if (seqs[pseq] >= (seqs[seq] * args.foldab)):
                        pot_par = ['Reference']+pseq.split()+['Reference']
                        if dimera[0] == pot_par[:lft_len]:
                            pot_left.append(pot_par)
                        if ((len(pot_par) >= rt_len) and (dimera[1] == pot_par[(len(pot_par)-rt_len):])):
                            pot_rt.append(pot_par)

            if (len(pot_left) > 0 and len(pot_rt) > 0):
                for left_par in pot_left:  # check potential parents' pairing
                    for rt_par in pot_rt:
                        if left_par != rt_par:
                            left_break = left_par[lft_len]
                            rt_break = rt_par[(len(rt_par)-rt_len)-1]
                            if left_break == 'Reference' or rt_break == 'Reference':
                                parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])])
                            else:
                                left_break_POS = ''
                                for c in left_break:
                                    if c.isdigit():
                                        left_break_POS += c
                                    else:
                                        break

                                rt_break_POS = ''
                                for c in rt_break:
                                    if c.isdigit():
                                        rt_break_POS += c
                                    else:
                                        break

                                if int(left_break_POS) > int(rt_break_POS):
                                    parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])])

        par_tot_abund = 0
        pair_probs = []
        for parents in parent_pairs:  # calc 'expected' abundance
            pair_prob = (seqs[parents[0]] / total) * (seqs[parents[1]] / total)
            par_tot_abund += pair_prob
            pair_probs.append(pair_prob)

        recomb_count = par_tot_abund * total

        if not seqs[seq] >= recomb_count * args.alpha:  # chimera check
            redist_count = float(seqs[seq])
            chimeras.append(seq)
            seqs[seq] = 0
            if args.redist == 1:  # redist counts of chimera
                toadd = {}
                for i in range(0, len(parent_pairs)):
                    counts_to_redist = (redist_count * (pair_probs[i]/par_tot_abund))/2
                    seqs[parent_pairs[i][0]] += counts_to_redist
                    seqs[parent_pairs[i][1]] += counts_to_redist

    for chim in chimeras:  # remove chimeras
        del seqs[chim]

    total = sum(seqs.values())
    seqs['total'] = total

    return(seqs)

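# Worked example (illustrative values): suppose '5A 11T' has potential
# parents '5A' (abundance 0.30) and '11T' (abundance 0.20). The pair
# probability is 0.30 * 0.20 = 0.06, so recomb_count = 0.06 * total; with
# the default --alpha 1.2 the sequence is flagged as a chimera (and its
# counts redistributed when --redist 1) if its count falls below
# 1.2 * recomb_count.
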
def chimrm(args, seqs):  # chimera removal process

    pre_len = len(seqs)
    inf_loop_shield = 0
    while True:  # send sequences for chimera removal while chimeras are still found
        dechim(args, seqs)
        post_len = len(seqs)
        inf_loop_shield += 1
        if post_len >= pre_len:
            break
        if inf_loop_shield > 100:
            break
        pre_len = len(seqs)

    total = seqs['total']
    del seqs['total']
    if args.min_samp_abund < 1:
        min_count = args.min_samp_abund * total
    else:
        min_count = args.min_samp_abund
    sorted_seqs = sorted(seqs, key=seqs.__getitem__, reverse=True)
    fh_dechime = open(args.outfile5, 'w')
    fh_dechime.write(f"{args.samp}({int(total)})\n")
    fh_dechime.write("Sequences\tCount\tAbundance\n")
    for seq in sorted_seqs:  # write chim_rm seqs, most abundant first
        abund = seqs[seq]/total
        if seqs[seq] >= min_count:
            fh_dechime.write(f"{seq}\t{round(seqs[seq])}\t{abund:.3f}\n")

    fh_dechime.close()
    return()

def main():

    args = arg_parser()  # getting command line arguments

    covar_array = []
    seq_array = []
    total_mapped_reads = 0

    if args.ref:
        ref = get_ref(args.ref)  # get the reference ID and sequence from the FASTA file
        args.ref.close()
        if ref[1] == '':
            print('Reference not recognized as a Fasta format, skipping SAM parsing')
        else:
            refprot = ''
            if args.AAreport == 1:  # make an amino acid sequence based on the reference sequence
                for x in range(0, (len(ref[1])-1)//3):
                    AA = AAcall(ref[1][x*3]+ref[1][x*3+1]+ref[1][x*3+2])
                    refprot = refprot + AA
                if (len(ref[1])-1)%3 != 0:
                    refprot = refprot + '?'

            covar_array, seq_array, total_mapped_reads = SAMparse(args, ref, refprot, args.Sam_file)

    else:
        print('No reference provided, skipping SAM parsing')

    in_covars = {'total' : total_mapped_reads,
                 'singles' : {}
                 }
    in_seqs = {'total' : total_mapped_reads}
    # Begin chimera removal processing if enabled
    if args.chim_rm == 1 or args.deconv == 1:
        if args.deconv == 1:
            for line in covar_array:
                lineparts = line.split("\t")
                in_covars[lineparts[0]] = int(lineparts[1])
                if len(lineparts[0].split(' ')) == 1:
                    in_covars['singles'][lineparts[0]] = int(lineparts[1])

        for line in seq_array:
            lineparts = line.split("\t")
            in_seqs[lineparts[0]] = float(lineparts[1])

        if args.deconv == 1:
            cvdeconv(args, in_covars, in_seqs)

        if args.chim_rm == 1:
            chimrm(args, in_seqs)

if __name__ == '__main__':
    main()