Mercurial > repos > degregory > sam_refiner
view SAM_Refiner_Gal.py @ 1:b68f7e37e70a draft default tip
Uploaded xml
author | degregory |
---|---|
date | Mon, 13 Sep 2021 14:14:20 +0000 |
parents | 22e3f843f1db |
children |
line wrap: on
line source
#!/bin/env python3 # Writen by Devon Gregory with assistance from Christopher Bottoms # University of Missouri # Distributed under GNU GENERAL PUBLIC LICENSE v3 import os import sys import argparse import itertools import time from multiprocessing import Process from pathlib import Path # process for collecing command line arguments def arg_parser(): parser = argparse.ArgumentParser( description='process Sam files for variant information' ) parser.add_argument( '--samp', type=str, default='Sample', help='Sample Name' ) parser.add_argument( '-r', '-reference', type=argparse.FileType('r'), dest='ref', help='reference fasta' ) parser.add_argument( '-S', '--Sam_file', type=argparse.FileType('r'), dest='Sam_file', help='.sam file' ) parser.add_argument( '-o1', '--out_file1', dest='outfile1', help='output file' ) parser.add_argument( '-o2', '--out_file2', dest='outfile2', help='output file' ) parser.add_argument( '-o3', '--out_file3', dest='outfile3', help='output file' ) parser.add_argument( '-o4', '--out_file4', dest='outfile4', help='output file' ) parser.add_argument( '-o5', '--out_file5', dest='outfile5', help='output file' ) parser.add_argument( '-o6', '--out_file6', dest='outfile6', help='output file' ) parser.add_argument( '-o7', '--out_file7', dest='outfile7', help='output file' ) parser.add_argument( '--use_count', type=int, default=1, choices=[0, 1], help='Enable/Disable (1/0) use of counts in sequence IDs, default enabled (--use_count 1)' ) parser.add_argument( '--min_count', type=int, default=10, help='Minimum observations required to be included in sample reports; >= 1 occurance count; < 1 %% observed (.1 = 10%%), (default: .001)' ) parser.add_argument( '--min_samp_abund', type=float, default=0.001, help='Minimum abundance required for inclusion in sample reports; %% observed (.1 = 10%%), (default: .001)' ) parser.add_argument( '--ntabund', type=float, default=.001, help='Minimum abundance relative to total reads required for a position to be reported in the nt call output; must be non-negative and < 1, %% observed (.1 = 10%%), (default: .001)' ) parser.add_argument( '--max_dist', type=int, default=40, help='Maximum number of variances from the reference a sequence can have to be consider in covars processing (default: 40)' ) parser.add_argument( '--max_covar', type=int, default=8, help='Maximum number of variances from the reference to be reported in covars (default: 8)' ) parser.add_argument( '--AAreport', type=int, default=1, choices=[0, 1], help='Enable/Disable (1/0) amino acid reporting, default enabled (--AAreport 1)' ) parser.add_argument( '--AAcodonasMNP', type=int, default=1, choices=[0, 1], help='Enable/Disable (1/0) reporting multiple nt changes in a single codon as one polymorphism, default enabled (--AAcodonasMNP 1), requires AAreport enabled' ) parser.add_argument( '--alpha', type=float, default=1.2, help='Modifier for chim_rm chimera checking, default 1.2. Higher = more sensitive, more false chimeras removed; lower = less sensitive, fewer chimeras removed' ) parser.add_argument( '--foldab', type=float, default=1.8, help='Threshold for potential parent / chimera abundance ratio for chim_rm; default is 1.8' ) parser.add_argument( '--redist', type=int, default=1, choices=[0, 1], help='Enable/Disable (1/0) redistribution of chimera counts for chim_rm, default enabled (--redist 1)' ) parser.add_argument( '--beta', type=float, default=1, help='Modifier for covar pass checking, default 1. Higher = more sensitive, more failed checks; lower = less sensitive, fewer failed checks' ) parser.add_argument( '--autopass', type=float, default=.3, help='threshold for a sequence to automatically pass the covar pass checking' ) parser.add_argument( '--nt_call', type=int, default=1, choices=[0, 1], help='Enable/Disable (1/0) nt_call output, default enabled (--nt_call 1)' ) parser.add_argument( '--ntvar', type=int, default=0, choices=[0, 1], help='Enable/Disable (1/0) nt_var output, default enabled (--nt_call 1)' ) parser.add_argument( '--indel', type=int, default=1, choices=[0, 1], help='Enable/Disable (1/0) indel output, default enabled (--indel 1)' ) parser.add_argument( '--seq', type=int, default=1, choices=[0, 1], help='Enable/Disable (1/0) unique seq output, default enabled (--seq 1)' ) parser.add_argument( '--covar', type=int, default=1, choices=[0, 1], help='Enable/Disable (1/0) covar output, default enabled (--covar 1)' ) parser.add_argument( '--chim_rm', type=int, default=1, choices=[0, 1], help='Enable/Disable (1/0) chim_rm output, default enabled (--chim_rm 1)' ) parser.add_argument( '--deconv', type=int, default=1, choices=[0, 1], help='Enable/Disable (1/0) covar deconv, default enabled (--deconv 1)' ) parser.add_argument( '--wgs', type=int, default=0, choices=[0, 1], help='Enable/Disable (1/0) covar deconv, default enabled (--deconv 1)' ) args = parser.parse_args() # checking for proper range of some parameters and consistency/compatibility if args.wgs == 1: if args.deconv == 1 or args.chim_rm == 1: args.deconv = 0 args.chim_rm = 0 print('WGS mode enabled, disabling chimera removal methods') return(args) def get_ref(ref): # get the reference ID and sequence from the FASTA file. Will only get the first. n=0 refID = '' refseq = '' if ref: for line in ref: if line.startswith('>'): n+=1 if n > 1: break refID = line[1:].strip("\n\r") elif n == 1: refseq = refseq + line.strip("\n\r") refseq = refseq.upper() return(refID, refseq) def AAcall(codon): # amino acid / codon dictionary to return encoded AAs AAdict = { 'TTT' : 'F', 'TTC' : 'F', 'TTA' : 'L', 'TTG' : 'L', 'TCT' : 'S', 'TCC' : 'S', 'TCA' : 'S', 'TCG' : 'S', 'TAT' : 'Y', 'TAC' : 'Y', 'TAA' : '*', 'TAG' : '*', 'TGT' : 'C', 'TGC' : 'C', 'TGA' : '*', 'TGG' : 'W', 'CTT' : 'L', 'CTC' : 'L', 'CTA' : 'L', 'CTG' : 'L', 'CCT' : 'P', 'CCC' : 'P', 'CCA' : 'P', 'CCG' : 'P', 'CAT' : 'H', 'CAC' : 'H', 'CAA' : 'Q', 'CAG' : 'Q', 'CGT' : 'R', 'CGC' : 'R', 'CGA' : 'R', 'CGG' : 'R', 'ATT' : 'I', 'ATC' : 'I', 'ATA' : 'I', 'ATG' : 'M', 'ACT' : 'T', 'ACC' : 'T', 'ACA' : 'T', 'ACG' : 'T', 'AAT' : 'N', 'AAC' : 'N', 'AAA' : 'K', 'AAG' : 'K', 'AGT' : 'S', 'AGC' : 'S', 'AGA' : 'R', 'AGG' : 'R', 'GTT' : 'V', 'GTC' : 'V', 'GTA' : 'V', 'GTG' : 'V', 'GCT' : 'A', 'GCC' : 'A', 'GCA' : 'A', 'GCG' : 'A', 'GAT' : 'D', 'GAC' : 'D', 'GAA' : 'E', 'GAG' : 'E', 'GGT' : 'G', 'GGC' : 'G', 'GGA' : 'G', 'GGG' : 'G' } AA = '?' if codon in AAdict: AA = AAdict[codon] return(AA) def singletCodon(ntPOS, nt, ref): # process to return the AA and protein seq. position based on the reference and provided nt seq position and nt AAPOS = (ntPOS-1)//3 AAmod = (ntPOS-1)%3 codon = "" try: if AAmod == 0: codon = nt+ref[1][ntPOS]+ref[1][ntPOS+1] elif AAmod == 1: codon = ref[1][ntPOS-2]+nt+ref[1][ntPOS] elif AAmod == 2: codon = ref[1][ntPOS-3]+ref[1][ntPOS-2]+nt except: codon = "XXX" return(AAPOS+1, AAcall(codon)) def getCombos(qlist, clen): # returns combinations of single polymorphisms in a sequence combos = [] if (clen == 0 or clen > len(qlist)): clen = len(qlist) for N in range(1, clen+1): for comb in itertools.combinations(qlist, N): combos.append(' '.join(comb)) return(combos) def SAMparse(args, ref, refprot, file): # process SAM files covar_array = [] seq_array = [] nt_call_dict_dict = {} indel_dict = {} seq_species = {} sam_read_count = 0 sam_line_count = 0 firstPOS = 0 lastPOS = 0 coverage = {} for line in args.Sam_file: if not line.startswith('@'): # ignore header lines splitline = line.split("\t") if ref[0].upper().startswith(splitline[2].upper()): # check map ID matches referecne ID if int(splitline[4]) > 0: # Check mapping score is positive abund_count=1 if args.use_count == 1: # get the unique sequence counts if '-' in splitline[0] and '=' in splitline[0]: eq_split = splitline[0].split('=') dash_split = splitline[0].split('-') if len(eq_split[-1]) > len(dash_split[-1]): abund_count=int(dash_split[-1]) else: abund_count=int(eq_split[-1]) elif '-' in splitline[0]: try: abund_count=int(splitline[0].split('-')[-1]) except: pass elif '=' in splitline[0]: try: abund_count=int(splitline[0].split('=')[-1]) except: pass sam_read_count += abund_count CIGAR = splitline[5] POS = int(splitline[3]) if firstPOS == 0: firstPOS = POS elif POS < firstPOS: firstPOS = POS readID = splitline[0] query_seq = splitline[9].upper() run_length = 0 query_seq_parsed = '' query_pos = 0 q_pars_pos = 0 mutations = [] for C in CIGAR: # process sequence based on standard CIGAR line if C == 'M' or C == 'I' or C == 'D' or C == 'S' or C == 'H': if C == 'S': query_pos = query_pos + run_length if C == 'I': if query_pos > 0: # add insertion to dict iPOS = q_pars_pos+POS iSeq = query_seq[query_pos: query_pos+run_length] istring = str(iPOS)+'-insert'+iSeq try: indel_dict[istring] except: indel_dict[istring] = abund_count else: indel_dict[istring] += abund_count if args.AAreport == 1 and (run_length % 3 == 0): iProt = '' if iPOS % 3 == 1: for x in range(0, (run_length//3)): AA = AAcall(iSeq[x*3]+iSeq[x*3+1]+iSeq[x*3+2]) iProt = iProt + AA mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')') elif iPOS % 3 == 2: if query_pos > 0: ipSeq = query_seq[query_pos-1:query_pos+run_length+2] else: ipSeq = "XXX"+query_seq[query_pos+2:query_pos+run_length+2] for x in range(0, (run_length//3)+1): AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2]) iProt = iProt + AA mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')') else: if query_pos > 1: ipSeq = query_seq[query_pos-2:query_pos+run_length+1] else: ipSeq = "XXX"+query_seq[query_pos+1:query_pos+run_length+1] for x in range(0, (run_length//3)+1): AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2]) iProt = iProt + AA mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')') else: mutations.append(istring) query_pos = query_pos + run_length elif C == 'D': for X in range(0, run_length): query_seq_parsed += '-' delstring = str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del' if args.AAreport == 1 and (run_length % 3 == 0) and not ((q_pars_pos+POS) % 3 == 1 ): if (q_pars_pos+POS) % 3 == 2: newcodon = query_seq[query_pos-1:query_pos+2] newAArefpos = (q_pars_pos+POS) // 3 mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')') else: newcodon = query_seq[query_pos-2:query_pos+1] newAArefpos = (q_pars_pos+POS) // 3 mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')') else: mutations.append(delstring) if args.nt_call == 1: for N in range(q_pars_pos+POS, q_pars_pos+POS+int(run_length)): try: nt_call_dict_dict[N] except: nt_call_dict_dict[N] = {'A' : 0, 'T' : 0, 'C' : 0, 'G' : 0, '-' : 0} nt_call_dict_dict[N]['-'] = abund_count else: nt_call_dict_dict[N]['-'] += abund_count try: indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] except: indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] = int(abund_count) else: indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] += int(abund_count) q_pars_pos = q_pars_pos + run_length elif C == 'M': offset = q_pars_pos-query_pos refPOS = POS+offset for ntPOS in range(query_pos, query_pos+run_length): if query_seq[ntPOS] == 'A' or query_seq[ntPOS] == 'T' or query_seq[ntPOS] == 'C' or query_seq[ntPOS] == 'G': if query_seq[ntPOS] != ref[1][refPOS+ntPOS-1]: if args.AAreport == 1 and args.AAcodonasMNP == 0: AAinfo = singletCodon(refPOS+ntPOS, query_seq[ntPOS], ref) mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')') else: mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS]) if args.nt_call == 1: try: nt_call_dict_dict[refPOS+ntPOS] except: nt_call_dict_dict[refPOS+ntPOS] = {'A' : 0, 'T' : 0, 'C' : 0, 'G' : 0, '-' : 0} nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] = abund_count else: nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] += abund_count q_pars_pos = q_pars_pos + run_length query_pos = query_pos + run_length run_length = 0 else: run_length = (10 * run_length) + int(C) # END CIGAR PARSE if len(mutations) == 0: # record reference counts if args.wgs == 0: try: seq_species['Reference'] += abund_count except: seq_species['Reference'] = abund_count else: try: seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] += abund_count except: seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] = abund_count else: # record variants and counts if args.AAreport == 1 and args.AAcodonasMNP == 1: codonchecked = [] codon = '' skip = 0 MNP = '' for i in range(0, len(mutations)): # checking for MNP if 'Del' in mutations[i] or 'insert' in mutations[i]: codonchecked.append(mutations[i]) elif skip > 0: skip -= 1 else: mut1POS = int(''.join([c for c in mutations[i] if c.isdigit()])) try: mutations[i+1] except: AAinfo = singletCodon(mut1POS, mutations[i][-1], ref) codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')') else: if mut1POS % 3 == 0: AAinfo = singletCodon(mut1POS, mutations[i][-1], ref) codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')') else: mut2POS = int(''.join([c for c in mutations[i+1] if c.isdigit()])) if mut2POS - mut1POS < 3: AAPOS = mut1POS // 3 if mut1POS % 3 == 1: if mut2POS % 3 == 0: codon = mutations[i][-1]+ref[1][mut1POS]+mutations[i+1][-1] MNP = mutations[i][-1]+'r'+mutations[i+1][-1] skip = 1 else: try: mutations[i+2] except: codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS] MNP = mutations[i][-1]+mutations[i+1][-1]+'r' skip = 1 else: mut3POS = int(''.join([c for c in mutations[i+2] if c.isdigit()])) if mut2POS == mut3POS -1: codon = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1] MNP = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1] skip = 2 else: codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS] MNP = mutations[i][-1]+mutations[i+1][-1]+'r' skip = 1 codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')') elif mut2POS - mut1POS == 1: codon = ref[1][mut1POS-2]+mutations[i][-1]+mutations[i+1][-1] MNP = mutations[i][-1]+mutations[i+1][-1] skip = 1 codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')') else: AAinfo = singletCodon(mut1POS, mutations[i][-1], ref) codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')') else: AAinfo = singletCodon(mut1POS, mutations[i][-1], ref) codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')') mutations = " ".join(codonchecked) else: mutations = " ".join(mutations) if args.wgs == 0: try: seq_species[mutations] += abund_count except: seq_species[mutations] = abund_count else: try: seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] += abund_count except: seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] = abund_count if lastPOS < POS+q_pars_pos: lastPOS = POS+q_pars_pos for i in range(POS, POS+q_pars_pos): # update coverage try: coverage[i] += abund_count except: coverage[i] = abund_count args.Sam_file.close() # END SAM LINES total_mapped_reads = sam_read_count if args.seq == 1: # output the sequence seq_fh = open(args.outfile1, "w") seq_fh.write(args.samp+"("+str(sam_read_count)+")\n") seq_fh.write("Unique Sequence\tCount\tAbundance\n") sorted_seq = sorted(seq_species, key=seq_species.__getitem__, reverse=True) for key in sorted_seq: if seq_species[key] >= args.min_count: if (seq_species[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0: seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}\n") seq_array.append(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}") elif args.wgs == 1: splitseqs = key.split() cov = [] for x in range(int(splitseqs[0]), int(splitseqs[-1])): cov.append(coverage[x]) min_cov = min(cov) if (seq_species[key]/min_cov >= args.min_samp_abund): seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/min_cov):.3f}\n") else: break seq_fh.close() # END SEQ OUT if args.indel == 1: # and len(indel_dict) > 0: # output indels, if there are any sorted_indels = sorted(indel_dict, key=indel_dict.__getitem__, reverse=True) indels_to_write = [] for key in sorted_indels: if indel_dict[key] >= args.min_count: if indel_dict[key] / sam_read_count >= args.min_samp_abund and args.wgs == 0: indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key]/sam_read_count):.3f}\n") elif args.wgs == 1: indelPOS = '' for c in key: if c.isdigit(): indelPOS += c else: break indelPOS = int(indelPOS) if indel_dict[key] / coverage[indelPOS] >= args.min_samp_abund: indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key] / coverage[indelPOS]):.3f}\n") else: break # if len(indels_to_write) > 0: indel_fh = open(args.outfile2, "w") indel_fh.write(args.samp+"("+str(sam_read_count)+")\n") indel_fh.write("Indel\tCount\tAbundance\n") for indel_entry in indels_to_write: indel_fh.write(indel_entry) indel_fh.close() # END INDEL OUT if args.nt_call == 1: # out put nt calls ntcall_fh = open(args.outfile3, "w") ntcall_fh.write(args.samp+"("+str(sam_read_count)+")\n") if args.ntvar == 1: ntcallv_fh = open(args.outfile7, "w") ntcallv_fh.write(args.samp+"("+str(sam_read_count)+")\n") sorted_POS = sorted(nt_call_dict_dict) if args.AAreport == 1: ntcall_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n") if args.ntvar == 1: ntcallv_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n") for POS in sorted_POS: try: total = coverage[POS] except: total = 0 if total >= (sam_read_count * args.ntabund): AAinfo = singletCodon(POS, ref[1][POS-1], ref) POS_calls = {} for key in nt_call_dict_dict[POS]: POS_calls[key] = nt_call_dict_dict[POS][key] sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True) ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1]) ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])) ntcall_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}") if sorted_calls[0] != ref[1][POS-1]: if args.ntvar == 1: ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1]) ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])) ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}") mod = (POS)%3 if mod == 0: try: codon = ref[1][POS-3]+ref[1][POS-2]+sorted_calls[0] except: codon = 'NNN' elif mod == 2: try: codon = ref[1][POS-2]+sorted_calls[0]+ref[1][POS] except: codon = 'NNN' elif mod == 1: try: codon = sorted_calls[0]+ref[1][POS]+ref[1][POS+1] except: codon = 'NNN' ntcall_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1]) if args.ntvar == 1: ntcallv_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1]) if (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total >= args.min_samp_abund): if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1: ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1]) ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1]) if nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count: if nt_call_dict_dict[POS][sorted_calls[2]] / total >= args.min_samp_abund: if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1: ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}") ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}") if args.ntvar == 1: ntcallv_fh.write("\n") elif (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] / total >= args.min_samp_abund): if args.ntvar == 1: ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1]) ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) ntcallv_fh.write("\t"+str(total)+"\t\t") ntcallv_fh.write(f"\t") ntcallv_fh.write("\t\t") ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1]) ntcall_fh.write("\t\t") ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1]) if (nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total >= args.min_samp_abund): ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}") if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1: ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}") if args.ntvar == 1: ntcallv_fh.write("\n") ntcall_fh.write("\n") else: ntcall_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n") if args.ntvar == 1: ntcallv_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n") for POS in sorted_POS: try: total = coverage[POS] # sum(nt_call_dict_dict[POS].values()) except: total = 0 if total >= (sam_read_count * args.ntabund): POS_calls = {} for key in nt_call_dict_dict[POS]: POS_calls[key] = nt_call_dict_dict[POS][key] sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True) ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1]) ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}") if sorted_calls[0] != ref[1][POS-1]: if args.ntvar == 1: ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]) ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])) ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}") if (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund): ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}") if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1: ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}") if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund): ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}") if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1: ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}") if args.ntvar == 1: ntcallv_fh.write("\n") elif (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund): if args.ntvar == 1: ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]) ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) ntcallv_fh.write("\t"+str(total)+"\t\t") ntcallv_fh.write(f"\t") ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}") ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}") if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and(nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund): ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}") if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1: ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}") if args.ntvar == 1: ntcallv_fh.write("\n") ntcall_fh.write("\n") ntcall_fh.close() if args.ntvar == 1: ntcallv_fh.close() # END NT CALL OUT if args.covar == 1: # output covariants testtrack = 0 combinations = {} for sequence in seq_species: if args.wgs == 0: singles = sequence.split() else: singles = (sequence.split())[1:-1] if len(singles) <= args.max_dist and singles[0] != 'Ref': for combo in getCombos(singles, args.max_covar): if not combo in combinations: combinations[combo] = seq_species[sequence] else: combinations[combo] += seq_species[sequence] covar_fh = open(args.outfile4, "w") covar_fh.write(args.samp+"("+str(sam_read_count)+")\n") covar_fh.write("Co-Variants\tCount\tAbundance\n") sortedcombos = sorted(combinations, key=combinations.__getitem__, reverse=True) for key in sortedcombos: if (combinations[key] >= args.min_count): if (combinations[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0: covar_fh.write(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}\n") covar_array.append(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}") elif args.wgs == 1: coveragepercent = 0 splitcombos = key.split() if len(splitcombos) == 1: coveragePOS = '' for c in key: if c.isdigit(): coveragePOS += c else: break coveragepercent = combinations[key] / coverage[int(coveragePOS)] else: startcovPOS = '' for c in splitcombos[0]: if c.isdigit(): startcovPOS += c else: break endcovPOS = '' for c in splitcombos[-1]: if c.isdigit(): endcovPOS += c else: break coveragevals = [] for i in range(int(startcovPOS), int(endcovPOS)+1): coveragevals.append(coverage[i]) mincov = min(coverval for coverval in coveragevals) coveragepercent = combinations[key] / mincov if coveragepercent >= args.min_samp_abund: covar_fh.write(f"{key}\t{combinations[key]}\t{coveragepercent:.3f}\n") covar_fh.close() # END COVAR OUT return(covar_array, seq_array, sam_read_count) def cvdeconv(args, covardict, seqdict): # covar deconvolution process passedseqs = {} preservedseqs = {} for seq in seqdict: # pass check for actual : expected abundance if seq != 'total' and seq != 'singles': splitseq = seq.split(' ') abund = 1 for sing in splitseq: try: abund = abund * (covardict[sing] / covardict['total']) except: abund = abund * (seqdict[seq] / seqdict['total']) try: covarabund = covardict[seq]/covardict['total'] except: covarabund = seqdict[seq]/seqdict['total'] covardict[seq] = seqdict[seq] if covarabund >= args.autopass: preservedseqs[seq] = max(1, args.beta, (covarabund / abund)) elif covarabund >= abund * args.beta: passedseqs[seq] = covarabund / abund elif len(seq.split(' ')) == 1: passedseqs[seq] = max(1, args.beta) if args.min_samp_abund < 1: min_count = args.min_samp_abund * covardict['total'] else: min_count = args.min_samp_abund # sort passed covars lensortedpassed = sorted(passedseqs, key = lambda key : len(key.split(' ')), reverse=True) ratiolensortedpassed = sorted(lensortedpassed, key = lambda key : passedseqs[key], reverse = True) sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__) deconved = {} for seq in ratiolensortedpassed: # reassign counts singles = seq.split(' ') first = 0 rem_count = 0 for sing in sortedsingles: if sing in singles: if covardict['singles'][sing] > 0: if first == 0: first = 1 rem_count = covardict['singles'][sing] covardict['singles'][sing] = 0 deconved[seq] = rem_count else: covardict['singles'][sing] = covardict['singles'][sing] - rem_count else: break sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__) sortedpreserved = sorted(preservedseqs, key = lambda key : covardict[key]) for seq in sortedpreserved: singles = seq.split(' ') first = 0 rem_count = 0 for sing in sortedsingles: if sing in singles: if covardict['singles'][sing] > 0: if first == 0: first = 1 rem_count = covardict['singles'][sing] covardict['singles'][sing] = 0 deconved[seq] = rem_count else: covardict['singles'][sing] = covardict['singles'][sing] - rem_count else: break sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__) newtotal = sum(deconved.values()) fh_deconv = open(args.outfile6, "w") fh_deconv.write(f"{args.samp}({covardict['total']}) | ({newtotal})\tCount\tAbundance\n") sorted_deconved = sorted(deconved, key = deconved.__getitem__, reverse = True) for seq in sorted_deconved: # write deconv if deconved[seq] >= min_count: fh_deconv.write(f"{seq}\t{deconved[seq]}\t{(deconved[seq]/newtotal):.3f}\n") fh_deconv.close() # END COVAR DECONV OUT return() def dechim(args, seqs): # processing sequence dictionary to remove chimeras total = seqs['total'] del seqs['total'] sorted_seqs = sorted(seqs, key=seqs.__getitem__) # sort sequences by abundance, least to greatest chimeras = [] for seq in sorted_seqs: pot_chim = ['Reference']+seq.split()+['Reference'] chim_halves = [] for i in range(0, len(pot_chim)-1): # create dimera halves chim_halves.append([pot_chim[:i+1], pot_chim[i+1:]]) parent_pairs = [] for dimera in chim_halves: # check for potential parents pot_left = [] pot_rt = [] lft_len = len(dimera[0]) rt_len = len(dimera[1]) for pseq in sorted_seqs: if not seq == pseq: if (seqs[pseq] >= (seqs[seq] * args.foldab)): pot_par = ['Reference']+pseq.split()+['Reference'] if dimera[0] == pot_par[:lft_len]: pot_left.append(pot_par) if ((len(pot_par) >= rt_len) and (dimera[1] == pot_par[(len(pot_par)-rt_len):])): pot_rt.append(pot_par) if (len(pot_left) > 0 and len(pot_rt) > 0 ): for left_par in pot_left: # check potential parents' pairing for rt_par in pot_rt: if left_par != rt_par: left_break = left_par[lft_len] rt_break = rt_par[(len(rt_par)-rt_len)-1] if left_break == 'Reference' or rt_break == 'Reference': parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])]) else: left_break_POS = '' for c in left_break: if c.isdigit(): left_break_POS += c else: break rt_break_POS = '' for c in rt_break: if c.isdigit(): rt_break_POS += c else: break if int(left_break_POS) > int(rt_break_POS): parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])]) par_tot_abund = 0 pair_probs = [] for parents in parent_pairs: # calc 'expected' abundance pair_prob = (seqs[parents[0]] / total) * (seqs[parents[1]] / total) par_tot_abund += pair_prob pair_probs.append(pair_prob) recomb_count = par_tot_abund * total if not seqs[seq] >= recomb_count * args.alpha: # chimera check redist_count = float(seqs[seq]) chimeras.append(seq) seqs[seq] = 0 if args.redist == 1: # redist counts of chimera toadd = {} for i in range(0, len(parent_pairs)): counts_to_redist = (redist_count * (pair_probs[i]/par_tot_abund))/2 seqs[parent_pairs[i][0]] += counts_to_redist seqs[parent_pairs[i][1]] += counts_to_redist for chim in chimeras: # remove chimeras del seqs[chim] total = sum(seqs.values()) # total = sum(seqs.values) seqs['total'] = total return(seqs) def chimrm(args, seqs): # chimera removed process pre_len = len(seqs) inf_loop_shield = 0 while True: # send sequences for chimera removal while chimeras are still found dechim(args, seqs) post_len = len(seqs) inf_loop_shield += 1 if post_len >= pre_len: break if inf_loop_shield > 100: break pre_len = len(seqs) total = seqs['total'] del seqs['total'] if args.min_samp_abund < 1: min_count = args.min_samp_abund * total else: min_count = args.min_samp_abund sorted_seqs = sorted(seqs, key=seqs.__getitem__, reverse=True) fh_dechime = open(args.outfile5, 'w') fh_dechime.write(f"{args.samp}({int(total)})\n") fh_dechime.write("Sequences\tCount\tAbundance\n") for seq in seqs: # write chim_rm seqs abund = seqs[seq]/total if seqs[seq] >= min_count: fh_dechime.write(f"{seq}\t{round(seqs[seq])}\t{abund:.3f}\n") fh_dechime.close() return() def main(): args = arg_parser() # getting command line arguments covar_array = [] seq_array = [] total_mapped_reads = 0 if args.ref: ref = get_ref(args.ref) # get the reference ID and sequence from the FASTA file args.ref.close() if ref[1] == '': print('Reference not recognized as a Fasta format, skipping SAM parsing') else: refprot = '' if args.AAreport == 1: # make an Amino Acid sequence based on the reference sequence for x in range(0, (len(ref[1])-1)//3): AA = AAcall(ref[1][x*3]+ref[1][x*3+1]+ref[1][x*3+2]) refprot = refprot + AA if (len(ref[1])-1)%3 != 0: refprot = refprot + '?' covar_array, seq_array, total_mapped_reads = SAMparse(args, ref, refprot, args.Sam_file) else: print('No reference provided, skipping SAM parsing') in_covars = {'total' : total_mapped_reads, 'singles' : {} } in_seqs = {'total' : total_mapped_reads} # Begin chimera removal if enabled if args.chim_rm == 1 or args.deconv == 1: if args.deconv == 1: for line in covar_array: lineparts = line.split("\t") in_covars[lineparts[0]] = int(lineparts[1]) if len(lineparts[0].split(' ')) == 1: in_covars['singles'][lineparts[0]] = int(lineparts[1]) for line in seq_array: lineparts = line.split("\t") in_seqs[lineparts[0]] = float(lineparts[1]) if args.deconv == 1: cvdeconv(args, in_covars, in_seqs) if args.chim_rm == 1: chimrm(args, in_seqs) if __name__ == '__main__': main()