Mercurial > repos > degregory > sam_refiner
comparison SAM_Refiner_Gal.py @ 0:22e3f843f1db draft
Uploaded main file
| author | degregory |
|---|---|
| date | Mon, 13 Sep 2021 14:12:21 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:22e3f843f1db |
|---|---|
| 1 #!/bin/env python3 | |
| 2 # Written by Devon Gregory with assistance from Christopher Bottoms | |
| 3 # University of Missouri | |
| 4 # Distributed under GNU GENERAL PUBLIC LICENSE v3 | |
| 5 import os | |
| 6 import sys | |
| 7 import argparse | |
| 8 import itertools | |
| 9 import time | |
| 10 from multiprocessing import Process | |
| 11 from pathlib import Path | |
| 12 | |
| 13 # process for collecting command line arguments | |
def arg_parser(cmd_args=None):
    """Collect and validate command line arguments.

    Args:
        cmd_args: optional list of argument strings to parse instead of
            sys.argv (useful for testing); None parses sys.argv as before.

    Returns:
        argparse.Namespace with the parsed options.  When --wgs 1 is set,
        --deconv and --chim_rm are forced to 0 (chimera removal assumes
        amplicon data).
    """

    parser = argparse.ArgumentParser(
        description='process Sam files for variant information'
    )

    parser.add_argument(
        '--samp',
        type=str,
        default='Sample',
        help='Sample Name'
    )
    parser.add_argument(
        '-r', '-reference',
        type=argparse.FileType('r'),
        dest='ref',
        help='reference fasta'
    )
    parser.add_argument(
        '-S', '--Sam_file',
        type=argparse.FileType('r'),
        dest='Sam_file',
        help='.sam file'
    )
    # the seven numbered output files all share the same option pattern
    for i in range(1, 8):
        parser.add_argument(
            f'-o{i}', f'--out_file{i}',
            dest=f'outfile{i}',
            help='output file'
        )
    parser.add_argument(
        '--use_count',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) use of counts in sequence IDs, default enabled (--use_count 1)'
    )
    parser.add_argument(
        '--min_count',
        type=int,
        default=10,
        help='Minimum observations required to be included in sample reports (default: 10)'
    )
    parser.add_argument(
        '--min_samp_abund',
        type=float,
        default=0.001,
        help='Minimum abundance required for inclusion in sample reports; %% observed (.1 = 10%%), (default: .001)'
    )
    parser.add_argument(
        '--ntabund',
        type=float,
        default=.001,
        help='Minimum abundance relative to total reads required for a position to be reported in the nt call output; must be non-negative and < 1, %% observed (.1 = 10%%), (default: .001)'
    )
    parser.add_argument(
        '--max_dist',
        type=int,
        default=40,
        help='Maximum number of variances from the reference a sequence can have to be considered in covars processing (default: 40)'
    )
    parser.add_argument(
        '--max_covar',
        type=int,
        default=8,
        help='Maximum number of variances from the reference to be reported in covars (default: 8)'
    )
    parser.add_argument(
        '--AAreport',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) amino acid reporting, default enabled (--AAreport 1)'
    )
    parser.add_argument(
        '--AAcodonasMNP',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) reporting multiple nt changes in a single codon as one polymorphism, default enabled (--AAcodonasMNP 1), requires AAreport enabled'
    )
    parser.add_argument(
        '--alpha',
        type=float,
        default=1.2,
        help='Modifier for chim_rm chimera checking, default 1.2. Higher = more sensitive, more false chimeras removed; lower = less sensitive, fewer chimeras removed'
    )
    parser.add_argument(
        '--foldab',
        type=float,
        default=1.8,
        help='Threshold for potential parent / chimera abundance ratio for chim_rm; default is 1.8'
    )
    parser.add_argument(
        '--redist',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) redistribution of chimera counts for chim_rm, default enabled (--redist 1)'
    )
    parser.add_argument(
        '--beta',
        type=float,
        default=1,
        help='Modifier for covar pass checking, default 1. Higher = more sensitive, more failed checks; lower = less sensitive, fewer failed checks'
    )
    parser.add_argument(
        '--autopass',
        type=float,
        default=.3,
        help='threshold for a sequence to automatically pass the covar pass checking'
    )
    parser.add_argument(
        '--nt_call',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) nt_call output, default enabled (--nt_call 1)'
    )
    parser.add_argument(
        '--ntvar',
        type=int,
        default=0,
        choices=[0, 1],
        help='Enable/Disable (1/0) nt_var output, default disabled (--ntvar 0)'
    )
    parser.add_argument(
        '--indel',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) indel output, default enabled (--indel 1)'
    )
    parser.add_argument(
        '--seq',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) unique seq output, default enabled (--seq 1)'
    )
    parser.add_argument(
        '--covar', type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) covar output, default enabled (--covar 1)'
    )
    parser.add_argument(
        '--chim_rm',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) chim_rm output, default enabled (--chim_rm 1)'
    )
    parser.add_argument(
        '--deconv',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) covar deconv, default enabled (--deconv 1)'
    )
    parser.add_argument(
        '--wgs',
        type=int,
        default=0,
        choices=[0, 1],
        help='Enable/Disable (1/0) whole genome sequencing mode, default disabled (--wgs 0)'
    )

    args = parser.parse_args(cmd_args)

    # checking for proper range of some parameters and consistency/compatibility:
    # chimera removal methods assume amplicon data, so WGS mode disables them
    if args.wgs == 1:
        if args.deconv == 1 or args.chim_rm == 1:
            args.deconv = 0
            args.chim_rm = 0
            print('WGS mode enabled, disabling chimera removal methods')

    return args
| 221 | |
def get_ref(ref):
    """Get the reference ID and sequence from the FASTA file. Will only get the first.

    Args:
        ref: open file-like handle to a FASTA file, or a falsy value
            (then nothing is read and empty strings are returned).

    Returns:
        (refID, refseq): header text of the first record without the
        leading '>', and its upper-cased sequence.
    """
    n = 0
    refID = ''
    refseq = ''
    if ref:
        for line in ref:
            if line.startswith('>'):
                n += 1
                if n > 1:  # stop at the second header: only the first record is used
                    break
                refID = line[1:].strip("\n\r")
            elif n == 1:
                refseq = refseq + line.strip("\n\r")
        # upper-case once after assembly instead of re-uppering per line
        refseq = refseq.upper()

    return refID, refseq
| 240 | |
def AAcall(codon):
    """Translate a three-letter codon into its one-letter amino acid code.

    Returns '?' for any string not found in the standard codon table
    (e.g. codons containing ambiguous or placeholder characters).
    """
    translation_table = {
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
        'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',

        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',

        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',

        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G',
    }
    # single lookup with a default instead of a membership test + assignment
    return translation_table.get(codon, '?')
| 316 | |
def singletCodon(ntPOS, nt, ref):  # process to return the AA and protein seq. position based on the reference and provided nt seq position and nt
    """Rebuild the codon containing a single nt substitution and translate it.

    Args:
        ntPOS: 1-based nucleotide position of the substitution.
        nt: the substituted nucleotide.
        ref: (refID, refseq) tuple; ref[1] supplies the unchanged codon bases.

    Returns:
        (1-based amino acid position, amino acid letter); when the codon
        runs off the end of the reference it becomes "XXX", which AAcall
        reports as '?'.
    """
    AAPOS = (ntPOS-1)//3
    AAmod = (ntPOS-1)%3  # offset of the substitution within its codon
    codon = ""
    try:
        if AAmod == 0:
            codon = nt+ref[1][ntPOS]+ref[1][ntPOS+1]
        elif AAmod == 1:
            codon = ref[1][ntPOS-2]+nt+ref[1][ntPOS]
        elif AAmod == 2:
            codon = ref[1][ntPOS-3]+ref[1][ntPOS-2]+nt
    except IndexError:
        # substitution sits at the reference end; codon can't be completed
        codon = "XXX"

    return (AAPOS+1, AAcall(codon))
| 332 | |
def getCombos(qlist, clen):  # returns combinations of single polymorphisms in a sequence
    """Return all space-joined combinations of qlist entries of size 1..clen.

    A clen of 0, or one larger than the list, is treated as len(qlist).
    """
    limit = len(qlist) if (clen == 0 or clen > len(qlist)) else clen
    return [
        ' '.join(combo)
        for size in range(1, limit + 1)
        for combo in itertools.combinations(qlist, size)
    ]
| 341 | |
| 342 def SAMparse(args, ref, refprot, file): # process SAM files | |
| 343 covar_array = [] | |
| 344 seq_array = [] | |
| 345 | |
| 346 nt_call_dict_dict = {} | |
| 347 indel_dict = {} | |
| 348 seq_species = {} | |
| 349 sam_read_count = 0 | |
| 350 sam_line_count = 0 | |
| 351 firstPOS = 0 | |
| 352 lastPOS = 0 | |
| 353 coverage = {} | |
| 354 for line in args.Sam_file: | |
| 355 if not line.startswith('@'): # ignore header lines | |
| 356 splitline = line.split("\t") | |
| 357 if ref[0].upper().startswith(splitline[2].upper()): # check map ID matches referecne ID | |
| 358 if int(splitline[4]) > 0: # Check mapping score is positive | |
| 359 | |
| 360 abund_count=1 | |
| 361 if args.use_count == 1: # get the unique sequence counts | |
| 362 if '-' in splitline[0] and '=' in splitline[0]: | |
| 363 eq_split = splitline[0].split('=') | |
| 364 dash_split = splitline[0].split('-') | |
| 365 if len(eq_split[-1]) > len(dash_split[-1]): | |
| 366 abund_count=int(dash_split[-1]) | |
| 367 else: | |
| 368 abund_count=int(eq_split[-1]) | |
| 369 | |
| 370 elif '-' in splitline[0]: | |
| 371 try: | |
| 372 abund_count=int(splitline[0].split('-')[-1]) | |
| 373 except: | |
| 374 pass | |
| 375 | |
| 376 elif '=' in splitline[0]: | |
| 377 try: | |
| 378 abund_count=int(splitline[0].split('=')[-1]) | |
| 379 except: | |
| 380 pass | |
| 381 | |
| 382 | |
| 383 sam_read_count += abund_count | |
| 384 | |
| 385 CIGAR = splitline[5] | |
| 386 POS = int(splitline[3]) | |
| 387 if firstPOS == 0: | |
| 388 firstPOS = POS | |
| 389 elif POS < firstPOS: | |
| 390 firstPOS = POS | |
| 391 | |
| 392 readID = splitline[0] | |
| 393 query_seq = splitline[9].upper() | |
| 394 run_length = 0 | |
| 395 query_seq_parsed = '' | |
| 396 query_pos = 0 | |
| 397 q_pars_pos = 0 | |
| 398 mutations = [] | |
| 399 | |
| 400 for C in CIGAR: # process sequence based on standard CIGAR line | |
| 401 if C == 'M' or C == 'I' or C == 'D' or C == 'S' or C == 'H': | |
| 402 if C == 'S': | |
| 403 query_pos = query_pos + run_length | |
| 404 | |
| 405 if C == 'I': | |
| 406 if query_pos > 0: | |
| 407 # add insertion to dict | |
| 408 iPOS = q_pars_pos+POS | |
| 409 | |
| 410 iSeq = query_seq[query_pos: query_pos+run_length] | |
| 411 istring = str(iPOS)+'-insert'+iSeq | |
| 412 | |
| 413 try: | |
| 414 indel_dict[istring] | |
| 415 except: | |
| 416 indel_dict[istring] = abund_count | |
| 417 else: | |
| 418 indel_dict[istring] += abund_count | |
| 419 | |
| 420 if args.AAreport == 1 and (run_length % 3 == 0): | |
| 421 | |
| 422 iProt = '' | |
| 423 if iPOS % 3 == 1: | |
| 424 for x in range(0, (run_length//3)): | |
| 425 AA = AAcall(iSeq[x*3]+iSeq[x*3+1]+iSeq[x*3+2]) | |
| 426 iProt = iProt + AA | |
| 427 mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')') | |
| 428 elif iPOS % 3 == 2: | |
| 429 if query_pos > 0: | |
| 430 ipSeq = query_seq[query_pos-1:query_pos+run_length+2] | |
| 431 else: | |
| 432 ipSeq = "XXX"+query_seq[query_pos+2:query_pos+run_length+2] | |
| 433 for x in range(0, (run_length//3)+1): | |
| 434 AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2]) | |
| 435 iProt = iProt + AA | |
| 436 mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')') | |
| 437 else: | |
| 438 if query_pos > 1: | |
| 439 ipSeq = query_seq[query_pos-2:query_pos+run_length+1] | |
| 440 else: | |
| 441 ipSeq = "XXX"+query_seq[query_pos+1:query_pos+run_length+1] | |
| 442 | |
| 443 for x in range(0, (run_length//3)+1): | |
| 444 AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2]) | |
| 445 iProt = iProt + AA | |
| 446 mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')') | |
| 447 else: | |
| 448 mutations.append(istring) | |
| 449 | |
| 450 query_pos = query_pos + run_length | |
| 451 | |
| 452 elif C == 'D': | |
| 453 for X in range(0, run_length): | |
| 454 query_seq_parsed += '-' | |
| 455 | |
| 456 delstring = str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del' | |
| 457 | |
| 458 if args.AAreport == 1 and (run_length % 3 == 0) and not ((q_pars_pos+POS) % 3 == 1 ): | |
| 459 if (q_pars_pos+POS) % 3 == 2: | |
| 460 newcodon = query_seq[query_pos-1:query_pos+2] | |
| 461 newAArefpos = (q_pars_pos+POS) // 3 | |
| 462 mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')') | |
| 463 else: | |
| 464 newcodon = query_seq[query_pos-2:query_pos+1] | |
| 465 newAArefpos = (q_pars_pos+POS) // 3 | |
| 466 mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')') | |
| 467 else: | |
| 468 mutations.append(delstring) | |
| 469 | |
| 470 if args.nt_call == 1: | |
| 471 for N in range(q_pars_pos+POS, q_pars_pos+POS+int(run_length)): | |
| 472 try: | |
| 473 nt_call_dict_dict[N] | |
| 474 except: | |
| 475 nt_call_dict_dict[N] = {'A' : 0, | |
| 476 'T' : 0, | |
| 477 'C' : 0, | |
| 478 'G' : 0, | |
| 479 '-' : 0} | |
| 480 nt_call_dict_dict[N]['-'] = abund_count | |
| 481 else: | |
| 482 nt_call_dict_dict[N]['-'] += abund_count | |
| 483 | |
| 484 try: | |
| 485 indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] | |
| 486 except: | |
| 487 indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] = int(abund_count) | |
| 488 else: | |
| 489 indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] += int(abund_count) | |
| 490 | |
| 491 q_pars_pos = q_pars_pos + run_length | |
| 492 | |
| 493 elif C == 'M': | |
| 494 offset = q_pars_pos-query_pos | |
| 495 refPOS = POS+offset | |
| 496 | |
| 497 for ntPOS in range(query_pos, query_pos+run_length): | |
| 498 if query_seq[ntPOS] == 'A' or query_seq[ntPOS] == 'T' or query_seq[ntPOS] == 'C' or query_seq[ntPOS] == 'G': | |
| 499 if query_seq[ntPOS] != ref[1][refPOS+ntPOS-1]: | |
| 500 if args.AAreport == 1 and args.AAcodonasMNP == 0: | |
| 501 AAinfo = singletCodon(refPOS+ntPOS, query_seq[ntPOS], ref) | |
| 502 mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')') | |
| 503 else: | |
| 504 mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS]) | |
| 505 if args.nt_call == 1: | |
| 506 try: | |
| 507 nt_call_dict_dict[refPOS+ntPOS] | |
| 508 except: | |
| 509 nt_call_dict_dict[refPOS+ntPOS] = {'A' : 0, | |
| 510 'T' : 0, | |
| 511 'C' : 0, | |
| 512 'G' : 0, | |
| 513 '-' : 0} | |
| 514 nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] = abund_count | |
| 515 else: | |
| 516 nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] += abund_count | |
| 517 | |
| 518 | |
| 519 q_pars_pos = q_pars_pos + run_length | |
| 520 query_pos = query_pos + run_length | |
| 521 | |
| 522 run_length = 0 | |
| 523 | |
| 524 | |
| 525 else: | |
| 526 run_length = (10 * run_length) + int(C) | |
| 527 # END CIGAR PARSE | |
| 528 | |
| 529 | |
| 530 | |
| 531 if len(mutations) == 0: # record reference counts | |
| 532 if args.wgs == 0: | |
| 533 try: | |
| 534 seq_species['Reference'] += abund_count | |
| 535 except: | |
| 536 seq_species['Reference'] = abund_count | |
| 537 else: | |
| 538 try: | |
| 539 seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] += abund_count | |
| 540 except: | |
| 541 seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] = abund_count | |
| 542 | |
| 543 else: # record variants and counts | |
| 544 if args.AAreport == 1 and args.AAcodonasMNP == 1: | |
| 545 codonchecked = [] | |
| 546 codon = '' | |
| 547 skip = 0 | |
| 548 MNP = '' | |
| 549 for i in range(0, len(mutations)): # checking for MNP | |
| 550 if 'Del' in mutations[i] or 'insert' in mutations[i]: | |
| 551 codonchecked.append(mutations[i]) | |
| 552 elif skip > 0: | |
| 553 skip -= 1 | |
| 554 else: | |
| 555 mut1POS = int(''.join([c for c in mutations[i] if c.isdigit()])) | |
| 556 try: | |
| 557 mutations[i+1] | |
| 558 except: | |
| 559 AAinfo = singletCodon(mut1POS, mutations[i][-1], ref) | |
| 560 codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')') | |
| 561 else: | |
| 562 | |
| 563 if mut1POS % 3 == 0: | |
| 564 AAinfo = singletCodon(mut1POS, mutations[i][-1], ref) | |
| 565 codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')') | |
| 566 else: | |
| 567 mut2POS = int(''.join([c for c in mutations[i+1] if c.isdigit()])) | |
| 568 if mut2POS - mut1POS < 3: | |
| 569 AAPOS = mut1POS // 3 | |
| 570 if mut1POS % 3 == 1: | |
| 571 if mut2POS % 3 == 0: | |
| 572 codon = mutations[i][-1]+ref[1][mut1POS]+mutations[i+1][-1] | |
| 573 MNP = mutations[i][-1]+'r'+mutations[i+1][-1] | |
| 574 skip = 1 | |
| 575 else: | |
| 576 try: | |
| 577 mutations[i+2] | |
| 578 except: | |
| 579 codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS] | |
| 580 MNP = mutations[i][-1]+mutations[i+1][-1]+'r' | |
| 581 skip = 1 | |
| 582 else: | |
| 583 mut3POS = int(''.join([c for c in mutations[i+2] if c.isdigit()])) | |
| 584 if mut2POS == mut3POS -1: | |
| 585 codon = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1] | |
| 586 MNP = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1] | |
| 587 skip = 2 | |
| 588 else: | |
| 589 codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS] | |
| 590 MNP = mutations[i][-1]+mutations[i+1][-1]+'r' | |
| 591 skip = 1 | |
| 592 codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')') | |
| 593 elif mut2POS - mut1POS == 1: | |
| 594 codon = ref[1][mut1POS-2]+mutations[i][-1]+mutations[i+1][-1] | |
| 595 MNP = mutations[i][-1]+mutations[i+1][-1] | |
| 596 skip = 1 | |
| 597 codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')') | |
| 598 else: | |
| 599 AAinfo = singletCodon(mut1POS, mutations[i][-1], ref) | |
| 600 codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')') | |
| 601 | |
| 602 | |
| 603 else: | |
| 604 AAinfo = singletCodon(mut1POS, mutations[i][-1], ref) | |
| 605 codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')') | |
| 606 mutations = " ".join(codonchecked) | |
| 607 else: | |
| 608 mutations = " ".join(mutations) | |
| 609 | |
| 610 if args.wgs == 0: | |
| 611 try: | |
| 612 seq_species[mutations] += abund_count | |
| 613 except: | |
| 614 seq_species[mutations] = abund_count | |
| 615 else: | |
| 616 try: | |
| 617 seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] += abund_count | |
| 618 except: | |
| 619 seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] = abund_count | |
| 620 | |
| 621 if lastPOS < POS+q_pars_pos: | |
| 622 lastPOS = POS+q_pars_pos | |
| 623 for i in range(POS, POS+q_pars_pos): # update coverage | |
| 624 try: | |
| 625 coverage[i] += abund_count | |
| 626 except: | |
| 627 coverage[i] = abund_count | |
| 628 | |
| 629 args.Sam_file.close() | |
| 630 # END SAM LINES | |
| 631 | |
| 632 total_mapped_reads = sam_read_count | |
| 633 | |
| 634 if args.seq == 1: # output the sequence | |
| 635 seq_fh = open(args.outfile1, "w") | |
| 636 seq_fh.write(args.samp+"("+str(sam_read_count)+")\n") | |
| 637 seq_fh.write("Unique Sequence\tCount\tAbundance\n") | |
| 638 | |
| 639 sorted_seq = sorted(seq_species, key=seq_species.__getitem__, reverse=True) | |
| 640 for key in sorted_seq: | |
| 641 if seq_species[key] >= args.min_count: | |
| 642 if (seq_species[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0: | |
| 643 seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}\n") | |
| 644 seq_array.append(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}") | |
| 645 elif args.wgs == 1: | |
| 646 splitseqs = key.split() | |
| 647 cov = [] | |
| 648 for x in range(int(splitseqs[0]), int(splitseqs[-1])): | |
| 649 cov.append(coverage[x]) | |
| 650 min_cov = min(cov) | |
| 651 if (seq_species[key]/min_cov >= args.min_samp_abund): | |
| 652 seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/min_cov):.3f}\n") | |
| 653 else: | |
| 654 break | |
| 655 | |
| 656 seq_fh.close() | |
| 657 # END SEQ OUT | |
| 658 | |
| 659 if args.indel == 1: # and len(indel_dict) > 0: # output indels, if there are any | |
| 660 sorted_indels = sorted(indel_dict, key=indel_dict.__getitem__, reverse=True) | |
| 661 indels_to_write = [] | |
| 662 for key in sorted_indels: | |
| 663 if indel_dict[key] >= args.min_count: | |
| 664 if indel_dict[key] / sam_read_count >= args.min_samp_abund and args.wgs == 0: | |
| 665 indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key]/sam_read_count):.3f}\n") | |
| 666 elif args.wgs == 1: | |
| 667 indelPOS = '' | |
| 668 for c in key: | |
| 669 if c.isdigit(): | |
| 670 indelPOS += c | |
| 671 else: | |
| 672 break | |
| 673 indelPOS = int(indelPOS) | |
| 674 if indel_dict[key] / coverage[indelPOS] >= args.min_samp_abund: | |
| 675 indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key] / coverage[indelPOS]):.3f}\n") | |
| 676 else: | |
| 677 break | |
| 678 # if len(indels_to_write) > 0: | |
| 679 indel_fh = open(args.outfile2, "w") | |
| 680 indel_fh.write(args.samp+"("+str(sam_read_count)+")\n") | |
| 681 indel_fh.write("Indel\tCount\tAbundance\n") | |
| 682 for indel_entry in indels_to_write: | |
| 683 indel_fh.write(indel_entry) | |
| 684 indel_fh.close() | |
| 685 # END INDEL OUT | |
| 686 | |
| 687 if args.nt_call == 1: # out put nt calls | |
| 688 ntcall_fh = open(args.outfile3, "w") | |
| 689 ntcall_fh.write(args.samp+"("+str(sam_read_count)+")\n") | |
| 690 if args.ntvar == 1: | |
| 691 ntcallv_fh = open(args.outfile7, "w") | |
| 692 ntcallv_fh.write(args.samp+"("+str(sam_read_count)+")\n") | |
| 693 sorted_POS = sorted(nt_call_dict_dict) | |
| 694 if args.AAreport == 1: | |
| 695 ntcall_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n") | |
| 696 if args.ntvar == 1: | |
| 697 ntcallv_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n") | |
| 698 for POS in sorted_POS: | |
| 699 try: | |
| 700 total = coverage[POS] | |
| 701 except: | |
| 702 total = 0 | |
| 703 if total >= (sam_read_count * args.ntabund): | |
| 704 AAinfo = singletCodon(POS, ref[1][POS-1], ref) | |
| 705 POS_calls = {} | |
| 706 for key in nt_call_dict_dict[POS]: | |
| 707 POS_calls[key] = nt_call_dict_dict[POS][key] | |
| 708 sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True) | |
| 709 | |
| 710 ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1]) | |
| 711 ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) | |
| 712 ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])) | |
| 713 ntcall_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}") | |
| 714 if sorted_calls[0] != ref[1][POS-1]: | |
| 715 if args.ntvar == 1: | |
| 716 ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1]) | |
| 717 ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) | |
| 718 ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])) | |
| 719 ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}") | |
| 720 | |
| 721 mod = (POS)%3 | |
| 722 | |
| 723 if mod == 0: | |
| 724 try: | |
| 725 codon = ref[1][POS-3]+ref[1][POS-2]+sorted_calls[0] | |
| 726 except: | |
| 727 codon = 'NNN' | |
| 728 elif mod == 2: | |
| 729 try: | |
| 730 codon = ref[1][POS-2]+sorted_calls[0]+ref[1][POS] | |
| 731 except: | |
| 732 codon = 'NNN' | |
| 733 elif mod == 1: | |
| 734 try: | |
| 735 codon = sorted_calls[0]+ref[1][POS]+ref[1][POS+1] | |
| 736 except: | |
| 737 codon = 'NNN' | |
| 738 ntcall_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1]) | |
| 739 if args.ntvar == 1: | |
| 740 ntcallv_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1]) | |
| 741 if (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total >= args.min_samp_abund): | |
| 742 if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1: | |
| 743 ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1]) | |
| 744 ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1]) | |
| 745 if nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count: | |
| 746 if nt_call_dict_dict[POS][sorted_calls[2]] / total >= args.min_samp_abund: | |
| 747 if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1: | |
| 748 ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}") | |
| 749 ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}") | |
| 750 | |
| 751 if args.ntvar == 1: | |
| 752 ntcallv_fh.write("\n") | |
| 753 elif (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] / total >= args.min_samp_abund): | |
| 754 if args.ntvar == 1: | |
| 755 ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1]) | |
| 756 ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) | |
| 757 ntcallv_fh.write("\t"+str(total)+"\t\t") | |
| 758 ntcallv_fh.write(f"\t") | |
| 759 ntcallv_fh.write("\t\t") | |
| 760 ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1]) | |
| 761 ntcall_fh.write("\t\t") | |
| 762 ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1]) | |
| 763 | |
| 764 if (nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total >= args.min_samp_abund): | |
| 765 ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}") | |
| 766 if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1: | |
| 767 ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}") | |
| 768 if args.ntvar == 1: | |
| 769 ntcallv_fh.write("\n") | |
| 770 | |
| 771 ntcall_fh.write("\n") | |
| 772 else: | |
| 773 ntcall_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n") | |
| 774 if args.ntvar == 1: | |
| 775 ntcallv_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n") | |
| 776 | |
| 777 for POS in sorted_POS: | |
| 778 try: | |
| 779 total = coverage[POS] # sum(nt_call_dict_dict[POS].values()) | |
| 780 except: | |
| 781 total = 0 | |
| 782 if total >= (sam_read_count * args.ntabund): | |
| 783 POS_calls = {} | |
| 784 for key in nt_call_dict_dict[POS]: | |
| 785 POS_calls[key] = nt_call_dict_dict[POS][key] | |
| 786 sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True) | |
| 787 | |
| 788 ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1]) | |
| 789 ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) | |
| 790 ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}") | |
| 791 | |
| 792 if sorted_calls[0] != ref[1][POS-1]: | |
| 793 if args.ntvar == 1: | |
| 794 ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]) | |
| 795 ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) | |
| 796 ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])) | |
| 797 ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}") | |
| 798 if (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund): | |
| 799 ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}") | |
| 800 if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1: | |
| 801 ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}") | |
| 802 if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund): | |
| 803 ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}") | |
| 804 if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1: | |
| 805 ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}") | |
| 806 if args.ntvar == 1: | |
| 807 ntcallv_fh.write("\n") | |
| 808 | |
| 809 elif (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund): | |
| 810 if args.ntvar == 1: | |
| 811 ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]) | |
| 812 ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) | |
| 813 ntcallv_fh.write("\t"+str(total)+"\t\t") | |
| 814 ntcallv_fh.write(f"\t") | |
| 815 ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}") | |
| 816 ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}") | |
| 817 if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and(nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund): | |
| 818 ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}") | |
| 819 if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1: | |
| 820 ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}") | |
| 821 if args.ntvar == 1: | |
| 822 ntcallv_fh.write("\n") | |
| 823 | |
| 824 ntcall_fh.write("\n") | |
| 825 | |
| 826 ntcall_fh.close() | |
| 827 if args.ntvar == 1: | |
| 828 ntcallv_fh.close() | |
| 829 # END NT CALL OUT | |
| 830 if args.covar == 1: # output covariants | |
| 831 testtrack = 0 | |
| 832 combinations = {} | |
| 833 for sequence in seq_species: | |
| 834 if args.wgs == 0: | |
| 835 singles = sequence.split() | |
| 836 else: | |
| 837 singles = (sequence.split())[1:-1] | |
| 838 if len(singles) <= args.max_dist and singles[0] != 'Ref': | |
| 839 for combo in getCombos(singles, args.max_covar): | |
| 840 if not combo in combinations: | |
| 841 combinations[combo] = seq_species[sequence] | |
| 842 else: | |
| 843 combinations[combo] += seq_species[sequence] | |
| 844 | |
| 845 covar_fh = open(args.outfile4, "w") | |
| 846 covar_fh.write(args.samp+"("+str(sam_read_count)+")\n") | |
| 847 covar_fh.write("Co-Variants\tCount\tAbundance\n") | |
| 848 sortedcombos = sorted(combinations, key=combinations.__getitem__, reverse=True) | |
| 849 for key in sortedcombos: | |
| 850 if (combinations[key] >= args.min_count): | |
| 851 if (combinations[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0: | |
| 852 covar_fh.write(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}\n") | |
| 853 covar_array.append(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}") | |
| 854 elif args.wgs == 1: | |
| 855 coveragepercent = 0 | |
| 856 splitcombos = key.split() | |
| 857 if len(splitcombos) == 1: | |
| 858 coveragePOS = '' | |
| 859 for c in key: | |
| 860 if c.isdigit(): | |
| 861 coveragePOS += c | |
| 862 else: | |
| 863 break | |
| 864 coveragepercent = combinations[key] / coverage[int(coveragePOS)] | |
| 865 else: | |
| 866 startcovPOS = '' | |
| 867 for c in splitcombos[0]: | |
| 868 if c.isdigit(): | |
| 869 startcovPOS += c | |
| 870 else: | |
| 871 break | |
| 872 endcovPOS = '' | |
| 873 for c in splitcombos[-1]: | |
| 874 if c.isdigit(): | |
| 875 endcovPOS += c | |
| 876 else: | |
| 877 break | |
| 878 coveragevals = [] | |
| 879 for i in range(int(startcovPOS), int(endcovPOS)+1): | |
| 880 coveragevals.append(coverage[i]) | |
| 881 mincov = min(coverval for coverval in coveragevals) | |
| 882 coveragepercent = combinations[key] / mincov | |
| 883 if coveragepercent >= args.min_samp_abund: | |
| 884 covar_fh.write(f"{key}\t{combinations[key]}\t{coveragepercent:.3f}\n") | |
| 885 | |
| 886 covar_fh.close() | |
| 887 | |
| 888 | |
| 889 # END COVAR OUT | |
| 890 | |
| 891 return(covar_array, seq_array, sam_read_count) | |
| 892 | |
def cvdeconv(args, covardict, seqdict): # covar deconvolution process
    """Deconvolute covariant counts into unique-sequence counts.

    covardict maps space-joined variant strings to counts and carries two
    bookkeeping entries: 'total' (mapped read count) and 'singles' (a dict
    of single-variant counts).  seqdict maps unique sequences to counts
    plus its own 'total'.  Sequences whose observed abundance meets or
    beats the product of their individual variant abundances (scaled by
    args.beta), or which exceed args.autopass outright, have the single
    counts reassigned to them.  Results are written to args.outfile6.
    Returns an empty tuple.
    """
    passedseqs = {}
    preservedseqs = {}
    for seq in seqdict: # pass check for actual : expected abundance
        if seq != 'total' and seq != 'singles':

            splitseq = seq.split(' ')
            abund = 1
            for sing in splitseq:
                # expected abundance = product of individual variant abundances;
                # fall back to the sequence's own abundance if a single is
                # missing from covardict (or 'total' is zero)
                try:
                    abund = abund * (covardict[sing] / covardict['total'])
                except (KeyError, ZeroDivisionError):
                    abund = abund * (seqdict[seq] / seqdict['total'])

            try:
                covarabund = covardict[seq]/covardict['total']
            except (KeyError, ZeroDivisionError):
                covarabund = seqdict[seq]/seqdict['total']
                covardict[seq] = seqdict[seq]

            if covarabund >= args.autopass:
                # abundant enough to skip the ratio check entirely
                preservedseqs[seq] = max(1, args.beta, (covarabund / abund))
            elif covarabund >= abund * args.beta:
                passedseqs[seq] = covarabund / abund
            elif len(seq.split(' ')) == 1:
                # single variants always pass
                passedseqs[seq] = max(1, args.beta)

    if args.min_samp_abund < 1:
        # fractional threshold: convert to an absolute count
        min_count = args.min_samp_abund * covardict['total']
    else:
        min_count = args.min_samp_abund

    # sort passed covars: longest first, then by observed/expected ratio
    lensortedpassed = sorted(passedseqs, key = lambda key : len(key.split(' ')), reverse=True)
    ratiolensortedpassed = sorted(lensortedpassed, key = lambda key : passedseqs[key], reverse = True)
    sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__)
    deconved = {}
    for seq in ratiolensortedpassed: # reassign counts to passed seqs first
        sortedsingles = _redistribute_singles(covardict, seq, sortedsingles, deconved)

    sortedpreserved = sorted(preservedseqs, key = lambda key : covardict[key])
    for seq in sortedpreserved: # then to autopassed seqs, least abundant first
        sortedsingles = _redistribute_singles(covardict, seq, sortedsingles, deconved)

    newtotal = sum(deconved.values())
    with open(args.outfile6, "w") as fh_deconv:
        fh_deconv.write(f"{args.samp}({covardict['total']}) | ({newtotal})\tCount\tAbundance\n")
        sorted_deconved = sorted(deconved, key = deconved.__getitem__, reverse = True)
        for seq in sorted_deconved: # write deconv
            if deconved[seq] >= min_count:
                fh_deconv.write(f"{seq}\t{deconved[seq]}\t{(deconved[seq]/newtotal):.3f}\n")
    # END COVAR DECONV OUT

    return()

def _redistribute_singles(covardict, seq, sortedsingles, deconved):
    """Assign a deconvoluted count to *seq* from its single-variant pools.

    The least-abundant constituent single still holding counts caps the
    assignment (rem_count); that single is zeroed, the seq's remaining
    singles are decremented by the same amount, and the pool order is
    re-sorted and returned.  Mutates covardict['singles'] and deconved.
    """
    singles = seq.split(' ')
    first = 0
    rem_count = 0
    for sing in sortedsingles:
        if sing in singles:
            if covardict['singles'][sing] > 0:
                if first == 0:
                    # first (least abundant) single sets the assigned count
                    first = 1
                    rem_count = covardict['singles'][sing]
                    covardict['singles'][sing] = 0
                    deconved[seq] = rem_count
                else:
                    covardict['singles'][sing] = covardict['singles'][sing] - rem_count
            else:
                # a constituent single is exhausted; nothing to assign
                break
    return sorted(covardict['singles'], key=covardict['singles'].__getitem__)
| 981 | |
def dechim(args, seqs): # processing sequence dictionary to remove chimeras
    """Flag and remove likely chimeric sequences from *seqs* in place.

    seqs maps space-delimited variant strings to counts plus a 'total'
    bookkeeping entry.  Scanning least to most abundant, every left/right
    split of a sequence is matched against sufficiently more-abundant
    sequences (>= count * args.foldab) to find potential parent pairs.
    If the sequence's count falls below the recombination-expected count
    scaled by args.alpha it is treated as a chimera: zeroed, optionally
    redistributed to its parent pairs (args.redist == 1), and deleted.
    Returns seqs with 'total' recomputed.
    """
    total = seqs['total']
    del seqs['total']  # keep only real sequences while scanning
    sorted_seqs = sorted(seqs, key=seqs.__getitem__) # sort sequences by abundance, least to greatest
    chimeras = []
    for seq in sorted_seqs:
        # pad with 'Reference' sentinels so splits at either end are valid
        pot_chim = ['Reference']+seq.split()+['Reference']
        chim_halves = []
        for i in range(0, len(pot_chim)-1): # create dimera halves
            chim_halves.append([pot_chim[:i+1], pot_chim[i+1:]])
        parent_pairs = []

        for dimera in chim_halves: # check for potential parents

            pot_left = []
            pot_rt = []
            lft_len = len(dimera[0])
            rt_len = len(dimera[1])
            for pseq in sorted_seqs:
                if not seq == pseq:
                    # only sequences sufficiently more abundant can be parents
                    if (seqs[pseq] >= (seqs[seq] * args.foldab)):
                        pot_par = ['Reference']+pseq.split()+['Reference']
                        if dimera[0] == pot_par[:lft_len]:
                            pot_left.append(pot_par)
                        if ((len(pot_par) >= rt_len) and (dimera[1] == pot_par[(len(pot_par)-rt_len):])):
                            pot_rt.append(pot_par)

            if (len(pot_left) > 0 and len(pot_rt) > 0 ):
                for left_par in pot_left: # check potential parents' pairing
                    for rt_par in pot_rt:
                        if left_par != rt_par:
                            # variants flanking the split point in each parent
                            left_break = left_par[lft_len]
                            rt_break = rt_par[(len(rt_par)-rt_len)-1]
                            if left_break == 'Reference' or rt_break == 'Reference':
                                parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])])
                            else:
                                # take the leading digits of each flanking variant
                                # label — presumably its reference position
                                left_break_POS = ''
                                for c in left_break:
                                    if c.isdigit():
                                        left_break_POS += c
                                    else:
                                        break

                                rt_break_POS = ''
                                for c in rt_break:
                                    if c.isdigit():
                                        rt_break_POS += c
                                    else:
                                        break

                                # pair only if the parents' variant runs overlap
                                # around the split point
                                if int(left_break_POS) > int(rt_break_POS):
                                    parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])])

        par_tot_abund = 0
        pair_probs = []
        for parents in parent_pairs: # calc 'expected' abundance
            pair_prob = (seqs[parents[0]] / total) * (seqs[parents[1]] / total)
            par_tot_abund += pair_prob
            pair_probs.append(pair_prob)

        recomb_count = par_tot_abund * total

        if not seqs[seq] >= recomb_count * args.alpha: # chimera check
            redist_count = float(seqs[seq])
            chimeras.append(seq)
            seqs[seq] = 0  # zeroed now, actually deleted after the scan
            if args.redist == 1: # redist counts of chimera
                toadd = {}  # NOTE(review): unused — left as-is
                for i in range(0, len(parent_pairs)):
                    # each parent of a pair receives half that pair's share
                    counts_to_redist = (redist_count * (pair_probs[i]/par_tot_abund))/2
                    seqs[parent_pairs[i][0]] += counts_to_redist
                    seqs[parent_pairs[i][1]] += counts_to_redist



    for chim in chimeras: # remove chimeras
        del seqs[chim]

    total = sum(seqs.values())


    # total = sum(seqs.values)
    seqs['total'] = total

    return(seqs)
| 1067 | |
def chimrm(args, seqs): # chimera removed process
    """Iteratively strip chimeras from *seqs* and write the survivors.

    Calls dechim() repeatedly until a pass removes no further sequences
    (or 100 iterations, as an infinite-loop guard), then writes the
    remaining sequences that meet the args.min_samp_abund threshold to
    args.outfile5.  Mutates seqs in place; returns an empty tuple.
    """
    pre_len = len(seqs)
    inf_loop_shield = 0
    while True: # send sequences for chimera removal while chimeras are still found
        dechim(args, seqs)
        post_len = len(seqs)
        inf_loop_shield += 1
        if post_len >= pre_len:
            break  # nothing removed this pass; converged
        if inf_loop_shield > 100:
            break  # safety valve against pathological oscillation
        pre_len = len(seqs)

    total = seqs['total']
    del seqs['total']
    if args.min_samp_abund < 1:
        # fractional threshold: convert to an absolute count
        min_count = args.min_samp_abund * total
    else:
        min_count = args.min_samp_abund
    sorted_seqs = sorted(seqs, key=seqs.__getitem__, reverse=True)
    with open(args.outfile5, 'w') as fh_dechime:
        fh_dechime.write(f"{args.samp}({int(total)})\n")
        fh_dechime.write("Sequences\tCount\tAbundance\n")
        # FIX: iterate sorted_seqs (previously computed but unused) so the
        # output is written most- to least-abundant as intended
        for seq in sorted_seqs: # write chim_rm seqs
            abund = seqs[seq]/total
            if seqs[seq] >= min_count:
                fh_dechime.write(f"{seq}\t{round(seqs[seq])}\t{abund:.3f}\n")

    return()
| 1099 | |
def main():
    """Entry point: parse arguments, run SAM parsing against the supplied
    reference, then optionally run covariant deconvolution and chimera
    removal on the parsed output."""

    args = arg_parser() # collect command line arguments

    covar_lines = []
    seq_lines = []
    mapped_reads = 0

    if not args.ref:
        print('No reference provided, skipping SAM parsing')
    else:
        ref = get_ref(args.ref) # reference ID and sequence from the FASTA file
        args.ref.close()
        if ref[1] == '':
            print('Reference not recognized as a Fasta format, skipping SAM parsing')
        else:
            refprot = ''
            if args.AAreport == 1: # translate the reference into an AA sequence
                codon_count = (len(ref[1])-1)//3
                refprot = ''.join(
                    AAcall(ref[1][i*3:i*3+3]) for i in range(codon_count)
                )
                if (len(ref[1])-1)%3 != 0: # trailing partial codon
                    refprot += '?'

            covar_lines, seq_lines, mapped_reads = SAMparse(args, ref, refprot, args.Sam_file)

    in_covars = {'total': mapped_reads, 'singles': {}}
    in_seqs = {'total': mapped_reads}

    # Post-processing only runs when deconvolution or chimera removal is on
    if args.chim_rm == 1 or args.deconv == 1:
        if args.deconv == 1:
            for line in covar_lines:
                name, count = line.split("\t")[0:2]
                in_covars[name] = int(count)
                if len(name.split(' ')) == 1: # single-variant entry
                    in_covars['singles'][name] = int(count)

        for line in seq_lines:
            fields = line.split("\t")
            in_seqs[fields[0]] = float(fields[1])

        if args.deconv == 1:
            cvdeconv(args, in_covars, in_seqs)

        if args.chim_rm == 1:
            chimrm(args, in_seqs)
| 1149 | |
# Run the full pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
