Mercurial > repos > degregory > sam_refiner
diff SAM_Refiner_Gal.py @ 0:22e3f843f1db draft
Uploaded main file
author | degregory |
---|---|
date | Mon, 13 Sep 2021 14:12:21 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SAM_Refiner_Gal.py Mon Sep 13 14:12:21 2021 +0000 @@ -0,0 +1,1151 @@ +#!/bin/env python3 +# Writen by Devon Gregory with assistance from Christopher Bottoms +# University of Missouri +# Distributed under GNU GENERAL PUBLIC LICENSE v3 +import os +import sys +import argparse +import itertools +import time +from multiprocessing import Process +from pathlib import Path + +# process for collecing command line arguments +def arg_parser(): + + parser = argparse.ArgumentParser( + description='process Sam files for variant information' + ) + + parser.add_argument( + '--samp', + type=str, + default='Sample', + help='Sample Name' + ) + parser.add_argument( + '-r', '-reference', + type=argparse.FileType('r'), + dest='ref', + help='reference fasta' + ) + parser.add_argument( + '-S', '--Sam_file', + type=argparse.FileType('r'), + dest='Sam_file', + help='.sam file' + ) + parser.add_argument( + '-o1', '--out_file1', + dest='outfile1', + help='output file' + ) + parser.add_argument( + '-o2', '--out_file2', + dest='outfile2', + help='output file' + ) + parser.add_argument( + '-o3', '--out_file3', + dest='outfile3', + help='output file' + ) + parser.add_argument( + '-o4', '--out_file4', + dest='outfile4', + help='output file' + ) + parser.add_argument( + '-o5', '--out_file5', + dest='outfile5', + help='output file' + ) + parser.add_argument( + '-o6', '--out_file6', + dest='outfile6', + help='output file' + ) + parser.add_argument( + '-o7', '--out_file7', + dest='outfile7', + help='output file' + ) + parser.add_argument( + '--use_count', + type=int, + default=1, + choices=[0, 1], + help='Enable/Disable (1/0) use of counts in sequence IDs, default enabled (--use_count 1)' + ) + parser.add_argument( + '--min_count', + type=int, + default=10, + help='Minimum observations required to be included in sample reports; >= 1 occurance count; < 1 %% observed (.1 = 10%%), (default: .001)' + ) + parser.add_argument( + '--min_samp_abund', + type=float, + default=0.001, + help='Minimum abundance required for inclusion in sample reports; %% observed (.1 = 10%%), (default: .001)' + ) + parser.add_argument( + '--ntabund', + type=float, + default=.001, + help='Minimum abundance relative to total reads required for a position to be reported in the nt call output; must be non-negative and < 1, %% observed (.1 = 10%%), (default: .001)' + ) + parser.add_argument( + '--max_dist', + type=int, + default=40, + help='Maximum number of variances from the reference a sequence can have to be consider in covars processing (default: 40)' + ) + parser.add_argument( + '--max_covar', + type=int, + default=8, + help='Maximum number of variances from the reference to be reported in covars (default: 8)' + ) + parser.add_argument( + '--AAreport', + type=int, + default=1, + choices=[0, 1], + help='Enable/Disable (1/0) amino acid reporting, default enabled (--AAreport 1)' + ) + parser.add_argument( + '--AAcodonasMNP', + type=int, + default=1, + choices=[0, 1], + help='Enable/Disable (1/0) reporting multiple nt changes in a single codon as one polymorphism, default enabled (--AAcodonasMNP 1), requires AAreport enabled' + ) + parser.add_argument( + '--alpha', + type=float, + default=1.2, + help='Modifier for chim_rm chimera checking, default 1.2. Higher = more sensitive, more false chimeras removed; lower = less sensitive, fewer chimeras removed' + ) + parser.add_argument( + '--foldab', + type=float, + default=1.8, + help='Threshold for potential parent / chimera abundance ratio for chim_rm; default is 1.8' + ) + parser.add_argument( + '--redist', + type=int, + default=1, + choices=[0, 1], + help='Enable/Disable (1/0) redistribution of chimera counts for chim_rm, default enabled (--redist 1)' + ) + parser.add_argument( + '--beta', + type=float, + default=1, + help='Modifier for covar pass checking, default 1. Higher = more sensitive, more failed checks; lower = less sensitive, fewer failed checks' + ) + parser.add_argument( + '--autopass', + type=float, + default=.3, + help='threshold for a sequence to automatically pass the covar pass checking' + ) + parser.add_argument( + '--nt_call', + type=int, + default=1, + choices=[0, 1], + help='Enable/Disable (1/0) nt_call output, default enabled (--nt_call 1)' + ) + parser.add_argument( + '--ntvar', + type=int, + default=0, + choices=[0, 1], + help='Enable/Disable (1/0) nt_var output, default enabled (--nt_call 1)' + ) + parser.add_argument( + '--indel', + type=int, + default=1, + choices=[0, 1], + help='Enable/Disable (1/0) indel output, default enabled (--indel 1)' + ) + parser.add_argument( + '--seq', + type=int, + default=1, + choices=[0, 1], + help='Enable/Disable (1/0) unique seq output, default enabled (--seq 1)' + ) + parser.add_argument( + '--covar', type=int, + default=1, + choices=[0, 1], + help='Enable/Disable (1/0) covar output, default enabled (--covar 1)' + ) + parser.add_argument( + '--chim_rm', + type=int, + default=1, + choices=[0, 1], + help='Enable/Disable (1/0) chim_rm output, default enabled (--chim_rm 1)' + ) + parser.add_argument( + '--deconv', + type=int, + default=1, + choices=[0, 1], + help='Enable/Disable (1/0) covar deconv, default enabled (--deconv 1)' + ) + parser.add_argument( + '--wgs', + type=int, + default=0, + choices=[0, 1], + help='Enable/Disable (1/0) covar deconv, default enabled (--deconv 1)' + ) + + args = parser.parse_args() + +# checking for proper range of some parameters and consistency/compatibility + if args.wgs == 1: + if args.deconv == 1 or args.chim_rm == 1: + args.deconv = 0 + args.chim_rm = 0 + print('WGS mode enabled, disabling chimera removal methods') + + return(args) + +def get_ref(ref): # get the reference ID and sequence from the FASTA file. Will only get the first. + + n=0 + refID = '' + refseq = '' + if ref: + for line in ref: + if line.startswith('>'): + n+=1 + if n > 1: + break + refID = line[1:].strip("\n\r") + elif n == 1: + refseq = refseq + line.strip("\n\r") + refseq = refseq.upper() + + + return(refID, refseq) + +def AAcall(codon): # amino acid / codon dictionary to return encoded AAs + AAdict = { + 'TTT' : 'F', + 'TTC' : 'F', + 'TTA' : 'L', + 'TTG' : 'L', + 'TCT' : 'S', + 'TCC' : 'S', + 'TCA' : 'S', + 'TCG' : 'S', + 'TAT' : 'Y', + 'TAC' : 'Y', + 'TAA' : '*', + 'TAG' : '*', + 'TGT' : 'C', + 'TGC' : 'C', + 'TGA' : '*', + 'TGG' : 'W', + + 'CTT' : 'L', + 'CTC' : 'L', + 'CTA' : 'L', + 'CTG' : 'L', + 'CCT' : 'P', + 'CCC' : 'P', + 'CCA' : 'P', + 'CCG' : 'P', + 'CAT' : 'H', + 'CAC' : 'H', + 'CAA' : 'Q', + 'CAG' : 'Q', + 'CGT' : 'R', + 'CGC' : 'R', + 'CGA' : 'R', + 'CGG' : 'R', + + 'ATT' : 'I', + 'ATC' : 'I', + 'ATA' : 'I', + 'ATG' : 'M', + 'ACT' : 'T', + 'ACC' : 'T', + 'ACA' : 'T', + 'ACG' : 'T', + 'AAT' : 'N', + 'AAC' : 'N', + 'AAA' : 'K', + 'AAG' : 'K', + 'AGT' : 'S', + 'AGC' : 'S', + 'AGA' : 'R', + 'AGG' : 'R', + + 'GTT' : 'V', + 'GTC' : 'V', + 'GTA' : 'V', + 'GTG' : 'V', + 'GCT' : 'A', + 'GCC' : 'A', + 'GCA' : 'A', + 'GCG' : 'A', + 'GAT' : 'D', + 'GAC' : 'D', + 'GAA' : 'E', + 'GAG' : 'E', + 'GGT' : 'G', + 'GGC' : 'G', + 'GGA' : 'G', + 'GGG' : 'G' + } + AA = '?' + if codon in AAdict: + AA = AAdict[codon] + + return(AA) + +def singletCodon(ntPOS, nt, ref): # process to return the AA and protein seq. position based on the reference and provided nt seq position and nt + AAPOS = (ntPOS-1)//3 + AAmod = (ntPOS-1)%3 + codon = "" + try: + if AAmod == 0: + codon = nt+ref[1][ntPOS]+ref[1][ntPOS+1] + elif AAmod == 1: + codon = ref[1][ntPOS-2]+nt+ref[1][ntPOS] + elif AAmod == 2: + codon = ref[1][ntPOS-3]+ref[1][ntPOS-2]+nt + except: + codon = "XXX" + + return(AAPOS+1, AAcall(codon)) + +def getCombos(qlist, clen): # returns combinations of single polymorphisms in a sequence + combos = [] + if (clen == 0 or clen > len(qlist)): + clen = len(qlist) + for N in range(1, clen+1): + for comb in itertools.combinations(qlist, N): + combos.append(' '.join(comb)) + return(combos) + +def SAMparse(args, ref, refprot, file): # process SAM files + covar_array = [] + seq_array = [] + + nt_call_dict_dict = {} + indel_dict = {} + seq_species = {} + sam_read_count = 0 + sam_line_count = 0 + firstPOS = 0 + lastPOS = 0 + coverage = {} + for line in args.Sam_file: + if not line.startswith('@'): # ignore header lines + splitline = line.split("\t") + if ref[0].upper().startswith(splitline[2].upper()): # check map ID matches referecne ID + if int(splitline[4]) > 0: # Check mapping score is positive + + abund_count=1 + if args.use_count == 1: # get the unique sequence counts + if '-' in splitline[0] and '=' in splitline[0]: + eq_split = splitline[0].split('=') + dash_split = splitline[0].split('-') + if len(eq_split[-1]) > len(dash_split[-1]): + abund_count=int(dash_split[-1]) + else: + abund_count=int(eq_split[-1]) + + elif '-' in splitline[0]: + try: + abund_count=int(splitline[0].split('-')[-1]) + except: + pass + + elif '=' in splitline[0]: + try: + abund_count=int(splitline[0].split('=')[-1]) + except: + pass + + + sam_read_count += abund_count + + CIGAR = splitline[5] + POS = int(splitline[3]) + if firstPOS == 0: + firstPOS = POS + elif POS < firstPOS: + firstPOS = POS + + readID = splitline[0] + query_seq = splitline[9].upper() + run_length = 0 + query_seq_parsed = '' + query_pos = 0 + q_pars_pos = 0 + mutations = [] + + for C in CIGAR: # process sequence based on standard CIGAR line + if C == 'M' or C == 'I' or C == 'D' or C == 'S' or C == 'H': + if C == 'S': + query_pos = query_pos + run_length + + if C == 'I': + if query_pos > 0: + # add insertion to dict + iPOS = q_pars_pos+POS + + iSeq = query_seq[query_pos: query_pos+run_length] + istring = str(iPOS)+'-insert'+iSeq + + try: + indel_dict[istring] + except: + indel_dict[istring] = abund_count + else: + indel_dict[istring] += abund_count + + if args.AAreport == 1 and (run_length % 3 == 0): + + iProt = '' + if iPOS % 3 == 1: + for x in range(0, (run_length//3)): + AA = AAcall(iSeq[x*3]+iSeq[x*3+1]+iSeq[x*3+2]) + iProt = iProt + AA + mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')') + elif iPOS % 3 == 2: + if query_pos > 0: + ipSeq = query_seq[query_pos-1:query_pos+run_length+2] + else: + ipSeq = "XXX"+query_seq[query_pos+2:query_pos+run_length+2] + for x in range(0, (run_length//3)+1): + AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2]) + iProt = iProt + AA + mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')') + else: + if query_pos > 1: + ipSeq = query_seq[query_pos-2:query_pos+run_length+1] + else: + ipSeq = "XXX"+query_seq[query_pos+1:query_pos+run_length+1] + + for x in range(0, (run_length//3)+1): + AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2]) + iProt = iProt + AA + mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')') + else: + mutations.append(istring) + + query_pos = query_pos + run_length + + elif C == 'D': + for X in range(0, run_length): + query_seq_parsed += '-' + + delstring = str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del' + + if args.AAreport == 1 and (run_length % 3 == 0) and not ((q_pars_pos+POS) % 3 == 1 ): + if (q_pars_pos+POS) % 3 == 2: + newcodon = query_seq[query_pos-1:query_pos+2] + newAArefpos = (q_pars_pos+POS) // 3 + mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')') + else: + newcodon = query_seq[query_pos-2:query_pos+1] + newAArefpos = (q_pars_pos+POS) // 3 + mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')') + else: + mutations.append(delstring) + + if args.nt_call == 1: + for N in range(q_pars_pos+POS, q_pars_pos+POS+int(run_length)): + try: + nt_call_dict_dict[N] + except: + nt_call_dict_dict[N] = {'A' : 0, + 'T' : 0, + 'C' : 0, + 'G' : 0, + '-' : 0} + nt_call_dict_dict[N]['-'] = abund_count + else: + nt_call_dict_dict[N]['-'] += abund_count + + try: + indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] + except: + indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] = int(abund_count) + else: + indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] += int(abund_count) + + q_pars_pos = q_pars_pos + run_length + + elif C == 'M': + offset = q_pars_pos-query_pos + refPOS = POS+offset + + for ntPOS in range(query_pos, query_pos+run_length): + if query_seq[ntPOS] == 'A' or query_seq[ntPOS] == 'T' or query_seq[ntPOS] == 'C' or query_seq[ntPOS] == 'G': + if query_seq[ntPOS] != ref[1][refPOS+ntPOS-1]: + if args.AAreport == 1 and args.AAcodonasMNP == 0: + AAinfo = singletCodon(refPOS+ntPOS, query_seq[ntPOS], ref) + mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')') + else: + mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS]) + if args.nt_call == 1: + try: + nt_call_dict_dict[refPOS+ntPOS] + except: + nt_call_dict_dict[refPOS+ntPOS] = {'A' : 0, + 'T' : 0, + 'C' : 0, + 'G' : 0, + '-' : 0} + nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] = abund_count + else: + nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] += abund_count + + + q_pars_pos = q_pars_pos + run_length + query_pos = query_pos + run_length + + run_length = 0 + + + else: + run_length = (10 * run_length) + int(C) + # END CIGAR PARSE + + + + if len(mutations) == 0: # record reference counts + if args.wgs == 0: + try: + seq_species['Reference'] += abund_count + except: + seq_species['Reference'] = abund_count + else: + try: + seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] += abund_count + except: + seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] = abund_count + + else: # record variants and counts + if args.AAreport == 1 and args.AAcodonasMNP == 1: + codonchecked = [] + codon = '' + skip = 0 + MNP = '' + for i in range(0, len(mutations)): # checking for MNP + if 'Del' in mutations[i] or 'insert' in mutations[i]: + codonchecked.append(mutations[i]) + elif skip > 0: + skip -= 1 + else: + mut1POS = int(''.join([c for c in mutations[i] if c.isdigit()])) + try: + mutations[i+1] + except: + AAinfo = singletCodon(mut1POS, mutations[i][-1], ref) + codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')') + else: + + if mut1POS % 3 == 0: + AAinfo = singletCodon(mut1POS, mutations[i][-1], ref) + codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')') + else: + mut2POS = int(''.join([c for c in mutations[i+1] if c.isdigit()])) + if mut2POS - mut1POS < 3: + AAPOS = mut1POS // 3 + if mut1POS % 3 == 1: + if mut2POS % 3 == 0: + codon = mutations[i][-1]+ref[1][mut1POS]+mutations[i+1][-1] + MNP = mutations[i][-1]+'r'+mutations[i+1][-1] + skip = 1 + else: + try: + mutations[i+2] + except: + codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS] + MNP = mutations[i][-1]+mutations[i+1][-1]+'r' + skip = 1 + else: + mut3POS = int(''.join([c for c in mutations[i+2] if c.isdigit()])) + if mut2POS == mut3POS -1: + codon = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1] + MNP = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1] + skip = 2 + else: + codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS] + MNP = mutations[i][-1]+mutations[i+1][-1]+'r' + skip = 1 + codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')') + elif mut2POS - mut1POS == 1: + codon = ref[1][mut1POS-2]+mutations[i][-1]+mutations[i+1][-1] + MNP = mutations[i][-1]+mutations[i+1][-1] + skip = 1 + codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')') + else: + AAinfo = singletCodon(mut1POS, mutations[i][-1], ref) + codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')') + + + else: + AAinfo = singletCodon(mut1POS, mutations[i][-1], ref) + codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')') + mutations = " ".join(codonchecked) + else: + mutations = " ".join(mutations) + + if args.wgs == 0: + try: + seq_species[mutations] += abund_count + except: + seq_species[mutations] = abund_count + else: + try: + seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] += abund_count + except: + seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] = abund_count + + if lastPOS < POS+q_pars_pos: + lastPOS = POS+q_pars_pos + for i in range(POS, POS+q_pars_pos): # update coverage + try: + coverage[i] += abund_count + except: + coverage[i] = abund_count + + args.Sam_file.close() + # END SAM LINES + + total_mapped_reads = sam_read_count + + if args.seq == 1: # output the sequence + seq_fh = open(args.outfile1, "w") + seq_fh.write(args.samp+"("+str(sam_read_count)+")\n") + seq_fh.write("Unique Sequence\tCount\tAbundance\n") + + sorted_seq = sorted(seq_species, key=seq_species.__getitem__, reverse=True) + for key in sorted_seq: + if seq_species[key] >= args.min_count: + if (seq_species[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0: + seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}\n") + seq_array.append(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}") + elif args.wgs == 1: + splitseqs = key.split() + cov = [] + for x in range(int(splitseqs[0]), int(splitseqs[-1])): + cov.append(coverage[x]) + min_cov = min(cov) + if (seq_species[key]/min_cov >= args.min_samp_abund): + seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/min_cov):.3f}\n") + else: + break + + seq_fh.close() + # END SEQ OUT + + if args.indel == 1: # and len(indel_dict) > 0: # output indels, if there are any + sorted_indels = sorted(indel_dict, key=indel_dict.__getitem__, reverse=True) + indels_to_write = [] + for key in sorted_indels: + if indel_dict[key] >= args.min_count: + if indel_dict[key] / sam_read_count >= args.min_samp_abund and args.wgs == 0: + indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key]/sam_read_count):.3f}\n") + elif args.wgs == 1: + indelPOS = '' + for c in key: + if c.isdigit(): + indelPOS += c + else: + break + indelPOS = int(indelPOS) + if indel_dict[key] / coverage[indelPOS] >= args.min_samp_abund: + indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key] / coverage[indelPOS]):.3f}\n") + else: + break + # if len(indels_to_write) > 0: + indel_fh = open(args.outfile2, "w") + indel_fh.write(args.samp+"("+str(sam_read_count)+")\n") + indel_fh.write("Indel\tCount\tAbundance\n") + for indel_entry in indels_to_write: + indel_fh.write(indel_entry) + indel_fh.close() + # END INDEL OUT + + if args.nt_call == 1: # out put nt calls + ntcall_fh = open(args.outfile3, "w") + ntcall_fh.write(args.samp+"("+str(sam_read_count)+")\n") + if args.ntvar == 1: + ntcallv_fh = open(args.outfile7, "w") + ntcallv_fh.write(args.samp+"("+str(sam_read_count)+")\n") + sorted_POS = sorted(nt_call_dict_dict) + if args.AAreport == 1: + ntcall_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n") + if args.ntvar == 1: + ntcallv_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n") + for POS in sorted_POS: + try: + total = coverage[POS] + except: + total = 0 + if total >= (sam_read_count * args.ntabund): + AAinfo = singletCodon(POS, ref[1][POS-1], ref) + POS_calls = {} + for key in nt_call_dict_dict[POS]: + POS_calls[key] = nt_call_dict_dict[POS][key] + sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True) + + ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1]) + ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) + ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])) + ntcall_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}") + if sorted_calls[0] != ref[1][POS-1]: + if args.ntvar == 1: + ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1]) + ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) + ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])) + ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}") + + mod = (POS)%3 + + if mod == 0: + try: + codon = ref[1][POS-3]+ref[1][POS-2]+sorted_calls[0] + except: + codon = 'NNN' + elif mod == 2: + try: + codon = ref[1][POS-2]+sorted_calls[0]+ref[1][POS] + except: + codon = 'NNN' + elif mod == 1: + try: + codon = sorted_calls[0]+ref[1][POS]+ref[1][POS+1] + except: + codon = 'NNN' + ntcall_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1]) + if args.ntvar == 1: + ntcallv_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1]) + if (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total >= args.min_samp_abund): + if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1: + ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1]) + ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1]) + if nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count: + if nt_call_dict_dict[POS][sorted_calls[2]] / total >= args.min_samp_abund: + if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1: + ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}") + ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}") + + if args.ntvar == 1: + ntcallv_fh.write("\n") + elif (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] / total >= args.min_samp_abund): + if args.ntvar == 1: + ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1]) + ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) + ntcallv_fh.write("\t"+str(total)+"\t\t") + ntcallv_fh.write(f"\t") + ntcallv_fh.write("\t\t") + ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1]) + ntcall_fh.write("\t\t") + ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1]) + + if (nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total >= args.min_samp_abund): + ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}") + if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1: + ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}") + if args.ntvar == 1: + ntcallv_fh.write("\n") + + ntcall_fh.write("\n") + else: + ntcall_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n") + if args.ntvar == 1: + ntcallv_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n") + + for POS in sorted_POS: + try: + total = coverage[POS] # sum(nt_call_dict_dict[POS].values()) + except: + total = 0 + if total >= (sam_read_count * args.ntabund): + POS_calls = {} + for key in nt_call_dict_dict[POS]: + POS_calls[key] = nt_call_dict_dict[POS][key] + sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True) + + ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1]) + ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) + ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}") + + if sorted_calls[0] != ref[1][POS-1]: + if args.ntvar == 1: + ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]) + ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) + ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])) + ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}") + if (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund): + ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}") + if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1: + ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}") + if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund): + ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}") + if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1: + ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}") + if args.ntvar == 1: + ntcallv_fh.write("\n") + + elif (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund): + if args.ntvar == 1: + ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]) + ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-'])) + ntcallv_fh.write("\t"+str(total)+"\t\t") + ntcallv_fh.write(f"\t") + ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}") + ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}") + if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and(nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund): + ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}") + if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1: + ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}") + if args.ntvar == 1: + ntcallv_fh.write("\n") + + ntcall_fh.write("\n") + + ntcall_fh.close() + if args.ntvar == 1: + ntcallv_fh.close() + # END NT CALL OUT + if args.covar == 1: # output covariants + testtrack = 0 + combinations = {} + for sequence in seq_species: + if args.wgs == 0: + singles = sequence.split() + else: + singles = (sequence.split())[1:-1] + if len(singles) <= args.max_dist and singles[0] != 'Ref': + for combo in getCombos(singles, args.max_covar): + if not combo in combinations: + combinations[combo] = seq_species[sequence] + else: + combinations[combo] += seq_species[sequence] + + covar_fh = open(args.outfile4, "w") + covar_fh.write(args.samp+"("+str(sam_read_count)+")\n") + covar_fh.write("Co-Variants\tCount\tAbundance\n") + sortedcombos = sorted(combinations, key=combinations.__getitem__, reverse=True) + for key in sortedcombos: + if (combinations[key] >= args.min_count): + if (combinations[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0: + covar_fh.write(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}\n") + covar_array.append(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}") + elif args.wgs == 1: + coveragepercent = 0 + splitcombos = key.split() + if len(splitcombos) == 1: + coveragePOS = '' + for c in key: + if c.isdigit(): + coveragePOS += c + else: + break + coveragepercent = combinations[key] / coverage[int(coveragePOS)] + else: + startcovPOS = '' + for c in splitcombos[0]: + if c.isdigit(): + startcovPOS += c + else: + break + endcovPOS = '' + for c in splitcombos[-1]: + if c.isdigit(): + endcovPOS += c + else: + break + coveragevals = [] + for i in range(int(startcovPOS), int(endcovPOS)+1): + coveragevals.append(coverage[i]) + mincov = min(coverval for coverval in coveragevals) + coveragepercent = combinations[key] / mincov + if coveragepercent >= args.min_samp_abund: + covar_fh.write(f"{key}\t{combinations[key]}\t{coveragepercent:.3f}\n") + + covar_fh.close() + + + # END COVAR OUT + + return(covar_array, seq_array, sam_read_count) + +def cvdeconv(args, covardict, seqdict): # covar deconvolution process + + passedseqs = {} + preservedseqs = {} + for seq in seqdict: # pass check for actual : expected abundance + if seq != 'total' and seq != 'singles': + + splitseq = seq.split(' ') + abund = 1 + for sing in splitseq: + try: + abund = abund * (covardict[sing] / covardict['total']) + except: + abund = abund * (seqdict[seq] / seqdict['total']) + + try: + covarabund = covardict[seq]/covardict['total'] + except: + covarabund = seqdict[seq]/seqdict['total'] + covardict[seq] = seqdict[seq] + + if covarabund >= args.autopass: + preservedseqs[seq] = max(1, args.beta, (covarabund / abund)) + elif covarabund >= abund * args.beta: + passedseqs[seq] = covarabund / abund + elif len(seq.split(' ')) == 1: + passedseqs[seq] = max(1, args.beta) + + if args.min_samp_abund < 1: + min_count = args.min_samp_abund * covardict['total'] + else: + min_count = args.min_samp_abund + + # sort passed covars + lensortedpassed = sorted(passedseqs, key = lambda key : len(key.split(' ')), reverse=True) + ratiolensortedpassed = sorted(lensortedpassed, key = lambda key : passedseqs[key], reverse = True) + sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__) + deconved = {} + for seq in ratiolensortedpassed: # reassign counts + singles = seq.split(' ') + first = 0 + rem_count = 0 + for sing in sortedsingles: + if sing in singles: + if covardict['singles'][sing] > 0: + if first == 0: + first = 1 + rem_count = covardict['singles'][sing] + covardict['singles'][sing] = 0 + deconved[seq] = rem_count + else: + covardict['singles'][sing] = covardict['singles'][sing] - rem_count + else: + break + sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__) + + sortedpreserved = sorted(preservedseqs, key = lambda key : covardict[key]) + + for seq in sortedpreserved: + singles = seq.split(' ') + first = 0 + rem_count = 0 + for sing in sortedsingles: + if sing in singles: + if covardict['singles'][sing] > 0: + if first == 0: + first = 1 + rem_count = covardict['singles'][sing] + covardict['singles'][sing] = 0 + deconved[seq] = rem_count + else: + covardict['singles'][sing] = covardict['singles'][sing] - rem_count + else: + break + sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__) + + + newtotal = sum(deconved.values()) + fh_deconv = open(args.outfile6, "w") + fh_deconv.write(f"{args.samp}({covardict['total']}) | ({newtotal})\tCount\tAbundance\n") + sorted_deconved = sorted(deconved, key = deconved.__getitem__, reverse = True) + for seq in sorted_deconved: # write deconv + if deconved[seq] >= min_count: + fh_deconv.write(f"{seq}\t{deconved[seq]}\t{(deconved[seq]/newtotal):.3f}\n") + fh_deconv.close() +# END COVAR DECONV OUT + + return() + +def dechim(args, seqs): # processing sequence dictionary to remove chimeras + total = seqs['total'] + del seqs['total'] + sorted_seqs = sorted(seqs, key=seqs.__getitem__) # sort sequences by abundance, least to greatest + chimeras = [] + for seq in sorted_seqs: + pot_chim = ['Reference']+seq.split()+['Reference'] + chim_halves = [] + for i in range(0, len(pot_chim)-1): # create dimera halves + chim_halves.append([pot_chim[:i+1], pot_chim[i+1:]]) + parent_pairs = [] + + for dimera in chim_halves: # check for potential parents + + pot_left = [] + pot_rt = [] + lft_len = len(dimera[0]) + rt_len = len(dimera[1]) + for pseq in sorted_seqs: + if not seq == pseq: + if (seqs[pseq] >= (seqs[seq] * args.foldab)): + pot_par = ['Reference']+pseq.split()+['Reference'] + if dimera[0] == pot_par[:lft_len]: + pot_left.append(pot_par) + if ((len(pot_par) >= rt_len) and (dimera[1] == pot_par[(len(pot_par)-rt_len):])): + pot_rt.append(pot_par) + + if (len(pot_left) > 0 and len(pot_rt) > 0 ): + for left_par in pot_left: # check potential parents' pairing + for rt_par in pot_rt: + if left_par != rt_par: + left_break = left_par[lft_len] + rt_break = rt_par[(len(rt_par)-rt_len)-1] + if left_break == 'Reference' or rt_break == 'Reference': + parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])]) + else: + left_break_POS = '' + for c in left_break: + if c.isdigit(): + left_break_POS += c + else: + break + + rt_break_POS = '' + for c in rt_break: + if c.isdigit(): + rt_break_POS += c + else: + break + + if int(left_break_POS) > int(rt_break_POS): + parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])]) + + par_tot_abund = 0 + pair_probs = [] + for parents in parent_pairs: # calc 'expected' abundance + pair_prob = (seqs[parents[0]] / total) * (seqs[parents[1]] / total) + par_tot_abund += pair_prob + pair_probs.append(pair_prob) + + recomb_count = par_tot_abund * total + + if not seqs[seq] >= recomb_count * args.alpha: # chimera check + redist_count = float(seqs[seq]) + chimeras.append(seq) + seqs[seq] = 0 + if args.redist == 1: # redist counts of chimera + toadd = {} + for i in range(0, len(parent_pairs)): + counts_to_redist = (redist_count * (pair_probs[i]/par_tot_abund))/2 + seqs[parent_pairs[i][0]] += counts_to_redist + seqs[parent_pairs[i][1]] += counts_to_redist + + + + for chim in chimeras: # remove chimeras + del seqs[chim] + + total = sum(seqs.values()) + + + # total = sum(seqs.values) + seqs['total'] = total + + return(seqs) + +def chimrm(args, seqs): # chimera removed process + + pre_len = len(seqs) + inf_loop_shield = 0 + while True: # send sequences for chimera removal while chimeras are still found + dechim(args, seqs) + post_len = len(seqs) + inf_loop_shield += 1 + if post_len >= pre_len: + break + if inf_loop_shield > 100: + break + pre_len = len(seqs) + + total = seqs['total'] + del seqs['total'] + if args.min_samp_abund < 1: + min_count = args.min_samp_abund * total + else: + min_count = args.min_samp_abund + sorted_seqs = sorted(seqs, key=seqs.__getitem__, reverse=True) + fh_dechime = open(args.outfile5, 'w') + fh_dechime.write(f"{args.samp}({int(total)})\n") + fh_dechime.write("Sequences\tCount\tAbundance\n") + for seq in seqs: # write chim_rm seqs + abund = seqs[seq]/total + if seqs[seq] >= min_count: + fh_dechime.write(f"{seq}\t{round(seqs[seq])}\t{abund:.3f}\n") + + fh_dechime.close() + return() + +def main(): + + args = arg_parser() # getting command line arguments + + covar_array = [] + seq_array = [] + total_mapped_reads = 0 + + if args.ref: + ref = get_ref(args.ref) # get the reference ID and sequence from the FASTA file + args.ref.close() + if ref[1] == '': + print('Reference not recognized as a Fasta format, skipping SAM parsing') + else: + refprot = '' + if args.AAreport == 1: # make an Amino Acid sequence based on the reference sequence + for x in range(0, (len(ref[1])-1)//3): + AA = AAcall(ref[1][x*3]+ref[1][x*3+1]+ref[1][x*3+2]) + refprot = refprot + AA + if (len(ref[1])-1)%3 != 0: + refprot = refprot + '?' + + covar_array, seq_array, total_mapped_reads = SAMparse(args, ref, refprot, args.Sam_file) + + else: + print('No reference provided, skipping SAM parsing') + + in_covars = {'total' : total_mapped_reads, + 'singles' : {} + } + in_seqs = {'total' : total_mapped_reads} + # Begin chimera removal if enabled + if args.chim_rm == 1 or args.deconv == 1: + if args.deconv == 1: + for line in covar_array: + lineparts = line.split("\t") + in_covars[lineparts[0]] = int(lineparts[1]) + if len(lineparts[0].split(' ')) == 1: + in_covars['singles'][lineparts[0]] = int(lineparts[1]) + + for line in seq_array: + lineparts = line.split("\t") + in_seqs[lineparts[0]] = float(lineparts[1]) + + if args.deconv == 1: + cvdeconv(args, in_covars, in_seqs) + + if args.chim_rm == 1: + chimrm(args, in_seqs) + +if __name__ == '__main__': + main() \ No newline at end of file