Mercurial > repos > degregory > sam_refiner

diff SAM_Refiner_Gal.py @ 0:22e3f843f1db draft
Uploaded main file
author: degregory
date: Mon, 13 Sep 2021 14:12:21 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SAM_Refiner_Gal.py	Mon Sep 13 14:12:21 2021 +0000
@@ -0,0 +1,1151 @@
+#!/bin/env python3
+# Writen by Devon Gregory with assistance from Christopher Bottoms
+# University of Missouri
+# Distributed under GNU GENERAL PUBLIC LICENSE v3
+import os
+import sys
+import argparse
+import itertools
+import time
+from multiprocessing import Process
+from pathlib import Path
+
+# process for collecing command line arguments
+def arg_parser():
+
+    parser = argparse.ArgumentParser(
+        description='process Sam files for variant information'
+    )
+
+    parser.add_argument(
+        '--samp',
+        type=str,
+        default='Sample',
+        help='Sample Name'
+    )
+    parser.add_argument(
+        '-r', '-reference',
+        type=argparse.FileType('r'),
+        dest='ref',
+        help='reference fasta'
+    )
+    parser.add_argument(
+        '-S', '--Sam_file',
+        type=argparse.FileType('r'),
+        dest='Sam_file',
+        help='.sam file'
+    )
+    parser.add_argument(
+        '-o1', '--out_file1',
+        dest='outfile1',
+        help='output file'
+    )
+    parser.add_argument(
+        '-o2', '--out_file2',
+        dest='outfile2',
+        help='output file'
+    )
+    parser.add_argument(
+        '-o3', '--out_file3',
+        dest='outfile3',
+        help='output file'
+    )
+    parser.add_argument(
+        '-o4', '--out_file4',
+        dest='outfile4',
+        help='output file'
+    )
+    parser.add_argument(
+        '-o5', '--out_file5',
+        dest='outfile5',
+        help='output file'
+    )
+    parser.add_argument(
+        '-o6', '--out_file6',
+        dest='outfile6',
+        help='output file'
+    )
+    parser.add_argument(
+        '-o7', '--out_file7',
+        dest='outfile7',
+        help='output file'
+    )
+    parser.add_argument(
+        '--use_count',
+        type=int,
+        default=1,
+        choices=[0, 1],
+        help='Enable/Disable (1/0) use of counts in sequence IDs, default enabled (--use_count 1)'
+    )
+    parser.add_argument(
+        '--min_count',
+        type=int,
+        default=10,
+        help='Minimum observations required to be included in sample reports; >= 1 occurance count; < 1 %% observed (.1 = 10%%), (default: .001)'
+    )
+    parser.add_argument(
+        '--min_samp_abund',
+        type=float,
+        default=0.001,
+        help='Minimum abundance required for inclusion in sample reports; %% observed (.1 = 10%%), (default: .001)'
+    )
+    parser.add_argument(
+        '--ntabund',
+        type=float,
+        default=.001,
+        help='Minimum abundance relative to total reads required for a position to be reported in the nt call output; must be non-negative and < 1, %% observed (.1 = 10%%), (default: .001)'
+    )
+    parser.add_argument(
+        '--max_dist',
+        type=int,
+        default=40,
+        help='Maximum number of variances from the reference a sequence can have to be consider in covars processing (default: 40)'
+    )
+    parser.add_argument(
+        '--max_covar',
+        type=int,
+        default=8,
+        help='Maximum number of variances from the reference to be reported in covars (default: 8)'
+    )
+    parser.add_argument(
+        '--AAreport',
+        type=int,
+        default=1,
+        choices=[0, 1],
+        help='Enable/Disable (1/0) amino acid reporting, default enabled (--AAreport 1)'
+    )
+    parser.add_argument(
+        '--AAcodonasMNP',
+        type=int,
+        default=1,
+        choices=[0, 1],
+        help='Enable/Disable (1/0) reporting multiple nt changes in a single codon as one polymorphism, default enabled (--AAcodonasMNP 1), requires AAreport enabled'
+    )
+    parser.add_argument(
+        '--alpha',
+        type=float,
+        default=1.2,
+        help='Modifier for chim_rm chimera checking, default 1.2.  Higher = more sensitive, more false chimeras removed; lower = less sensitive, fewer chimeras removed'
+    )
+    parser.add_argument(
+        '--foldab',
+        type=float,
+        default=1.8,
+        help='Threshold for potential parent / chimera abundance ratio for chim_rm; default is 1.8'
+    )
+    parser.add_argument(
+        '--redist',
+        type=int,
+        default=1,
+        choices=[0, 1],
+        help='Enable/Disable (1/0) redistribution of chimera counts for chim_rm, default enabled (--redist 1)'
+    )
+    parser.add_argument(
+        '--beta',
+        type=float,
+        default=1,
+        help='Modifier for covar pass checking, default 1.  Higher = more sensitive, more failed checks; lower = less sensitive, fewer failed checks'
+    )
+    parser.add_argument(
+        '--autopass',
+        type=float,
+        default=.3,
+        help='threshold for a sequence to automatically pass the covar pass checking'
+    )
+    parser.add_argument(
+        '--nt_call',
+        type=int,
+        default=1,
+        choices=[0, 1],
+        help='Enable/Disable (1/0) nt_call output, default enabled (--nt_call 1)'
+    )
+    parser.add_argument(
+        '--ntvar',
+        type=int,
+        default=0,
+        choices=[0, 1],
+        help='Enable/Disable (1/0) nt_var output, default enabled (--nt_call 1)'
+    )
+    parser.add_argument(
+        '--indel',
+        type=int,
+        default=1,
+        choices=[0, 1],
+        help='Enable/Disable (1/0) indel output, default enabled (--indel 1)'
+        )
+    parser.add_argument(
+        '--seq',
+        type=int,
+        default=1,
+        choices=[0, 1],
+        help='Enable/Disable (1/0) unique seq output, default enabled (--seq 1)'
+    )
+    parser.add_argument(
+        '--covar', type=int,
+        default=1,
+        choices=[0, 1],
+        help='Enable/Disable (1/0) covar output, default enabled (--covar 1)'
+    )
+    parser.add_argument(
+        '--chim_rm',
+        type=int,
+        default=1,
+        choices=[0, 1],
+        help='Enable/Disable (1/0) chim_rm output, default enabled (--chim_rm 1)'
+    )
+    parser.add_argument(
+        '--deconv',
+        type=int,
+        default=1,
+        choices=[0, 1],
+        help='Enable/Disable (1/0) covar deconv, default enabled (--deconv 1)'
+    )
+    parser.add_argument(
+        '--wgs',
+        type=int,
+        default=0,
+        choices=[0, 1],
+        help='Enable/Disable (1/0) covar deconv, default enabled (--deconv 1)'
+    )
+
+    args = parser.parse_args()
+
+# checking for proper range of some parameters and consistency/compatibility
+    if args.wgs == 1:
+        if args.deconv == 1 or args.chim_rm == 1:
+            args.deconv = 0
+            args.chim_rm = 0
+            print('WGS mode enabled, disabling chimera removal methods')
+
+    return(args)
+
+def get_ref(ref): # get the reference ID and sequence from the FASTA file.  Will only get the first.
+
+    n=0
+    refID = ''
+    refseq = ''
+    if ref:
+        for line in ref:
+            if line.startswith('>'):
+                n+=1
+                if n > 1:
+                    break
+                refID = line[1:].strip("\n\r")
+            elif n == 1:
+                refseq = refseq + line.strip("\n\r")
+        refseq = refseq.upper()
+
+
+    return(refID, refseq)
+
+def AAcall(codon): # amino acid / codon dictionary to return encoded AAs
+    AAdict = {
+        'TTT' : 'F',
+        'TTC' : 'F',
+        'TTA' : 'L',
+        'TTG' : 'L',
+        'TCT' : 'S',
+        'TCC' : 'S',
+        'TCA' : 'S',
+        'TCG' : 'S',
+        'TAT' : 'Y',
+        'TAC' : 'Y',
+        'TAA' : '*',
+        'TAG' : '*',
+        'TGT' : 'C',
+        'TGC' : 'C',
+        'TGA' : '*',
+        'TGG' : 'W',
+
+        'CTT' : 'L',
+        'CTC' : 'L',
+        'CTA' : 'L',
+        'CTG' : 'L',
+        'CCT' : 'P',
+        'CCC' : 'P',
+        'CCA' : 'P',
+        'CCG' : 'P',
+        'CAT' : 'H',
+        'CAC' : 'H',
+        'CAA' : 'Q',
+        'CAG' : 'Q',
+        'CGT' : 'R',
+        'CGC' : 'R',
+        'CGA' : 'R',
+        'CGG' : 'R',
+
+        'ATT' : 'I',
+        'ATC' : 'I',
+        'ATA' : 'I',
+        'ATG' : 'M',
+        'ACT' : 'T',
+        'ACC' : 'T',
+        'ACA' : 'T',
+        'ACG' : 'T',
+        'AAT' : 'N',
+        'AAC' : 'N',
+        'AAA' : 'K',
+        'AAG' : 'K',
+        'AGT' : 'S',
+        'AGC' : 'S',
+        'AGA' : 'R',
+        'AGG' : 'R',
+
+        'GTT' : 'V',
+        'GTC' : 'V',
+        'GTA' : 'V',
+        'GTG' : 'V',
+        'GCT' : 'A',
+        'GCC' : 'A',
+        'GCA' : 'A',
+        'GCG' : 'A',
+        'GAT' : 'D',
+        'GAC' : 'D',
+        'GAA' : 'E',
+        'GAG' : 'E',
+        'GGT' : 'G',
+        'GGC' : 'G',
+        'GGA' : 'G',
+        'GGG' : 'G'
+    }
+    AA = '?'
+    if codon in AAdict:
+        AA = AAdict[codon]
+
+    return(AA)
+
+def singletCodon(ntPOS, nt, ref): # process to return the AA and protein seq. position based on the reference and provided nt seq position and nt
+    AAPOS = (ntPOS-1)//3
+    AAmod = (ntPOS-1)%3
+    codon = ""
+    try:
+        if AAmod == 0:
+            codon = nt+ref[1][ntPOS]+ref[1][ntPOS+1]
+        elif AAmod == 1:
+            codon = ref[1][ntPOS-2]+nt+ref[1][ntPOS]
+        elif AAmod == 2:
+            codon = ref[1][ntPOS-3]+ref[1][ntPOS-2]+nt
+    except:
+        codon = "XXX"
+
+    return(AAPOS+1, AAcall(codon))
+
+def getCombos(qlist, clen): # returns combinations of single polymorphisms in a sequence
+    combos = []
+    if (clen == 0 or clen > len(qlist)):
+        clen = len(qlist)
+    for N in range(1, clen+1):
+        for comb in itertools.combinations(qlist, N):
+            combos.append(' '.join(comb))
+    return(combos)
+
+def SAMparse(args, ref, refprot, file): # process SAM files
+    covar_array = []
+    seq_array = []
+
+    nt_call_dict_dict = {}
+    indel_dict = {}
+    seq_species = {}
+    sam_read_count = 0
+    sam_line_count = 0
+    firstPOS = 0
+    lastPOS = 0
+    coverage = {}
+    for line in args.Sam_file:
+        if not line.startswith('@'): # ignore header lines
+            splitline = line.split("\t")
+            if ref[0].upper().startswith(splitline[2].upper()): # check map ID matches referecne ID
+                if int(splitline[4]) > 0:  # Check mapping score is positive
+
+                    abund_count=1
+                    if args.use_count == 1: # get the unique sequence counts
+                        if '-' in splitline[0] and '=' in splitline[0]:
+                            eq_split = splitline[0].split('=')
+                            dash_split = splitline[0].split('-')
+                            if len(eq_split[-1]) > len(dash_split[-1]):
+                                abund_count=int(dash_split[-1])
+                            else:
+                                abund_count=int(eq_split[-1])
+
+                        elif '-' in splitline[0]:
+                            try:
+                                abund_count=int(splitline[0].split('-')[-1])
+                            except:
+                                pass
+
+                        elif '=' in splitline[0]:
+                            try:
+                                abund_count=int(splitline[0].split('=')[-1])
+                            except:
+                                pass
+
+
+                    sam_read_count += abund_count
+
+                    CIGAR = splitline[5]
+                    POS = int(splitline[3])
+                    if firstPOS == 0:
+                        firstPOS = POS
+                    elif POS < firstPOS:
+                        firstPOS = POS
+
+                    readID = splitline[0]
+                    query_seq = splitline[9].upper()
+                    run_length = 0
+                    query_seq_parsed = ''
+                    query_pos = 0
+                    q_pars_pos = 0
+                    mutations = []
+
+                    for C in CIGAR: # process sequence based on standard CIGAR line
+                        if C == 'M' or C == 'I' or C == 'D' or C == 'S' or C == 'H':
+                            if C == 'S':
+                                query_pos = query_pos + run_length
+
+                            if C == 'I':
+                                if query_pos > 0:
+                                    # add insertion to dict
+                                    iPOS = q_pars_pos+POS
+
+                                    iSeq = query_seq[query_pos: query_pos+run_length]
+                                    istring = str(iPOS)+'-insert'+iSeq
+
+                                    try:
+                                        indel_dict[istring]
+                                    except:
+                                        indel_dict[istring] = abund_count
+                                    else:
+                                        indel_dict[istring] += abund_count
+
+                                    if args.AAreport == 1 and (run_length % 3 == 0):
+                                        
+                                        iProt = ''
+                                        if iPOS % 3 == 1:
+                                            for x in range(0, (run_length//3)):
+                                                AA = AAcall(iSeq[x*3]+iSeq[x*3+1]+iSeq[x*3+2])
+                                                iProt = iProt + AA
+                                            mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
+                                        elif iPOS % 3 == 2:
+                                            if query_pos > 0:
+                                                ipSeq = query_seq[query_pos-1:query_pos+run_length+2]
+                                            else:
+                                                ipSeq = "XXX"+query_seq[query_pos+2:query_pos+run_length+2]
+                                            for x in range(0, (run_length//3)+1):
+                                                AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2])
+                                                iProt = iProt + AA
+                                            mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
+                                        else:
+                                            if query_pos > 1:
+                                                ipSeq = query_seq[query_pos-2:query_pos+run_length+1]
+                                            else:
+                                                ipSeq = "XXX"+query_seq[query_pos+1:query_pos+run_length+1]
+                                            
+                                            for x in range(0, (run_length//3)+1):
+                                                AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2])
+                                                iProt = iProt + AA
+                                            mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
+                                    else:
+                                        mutations.append(istring)
+
+                                    query_pos = query_pos + run_length
+
+                            elif C == 'D':
+                                for X in range(0, run_length):
+                                    query_seq_parsed += '-'
+
+                                delstring = str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'
+
+                                if args.AAreport == 1 and (run_length % 3 == 0) and not ((q_pars_pos+POS) % 3 == 1 ):
+                                    if (q_pars_pos+POS) % 3 == 2:
+                                        newcodon = query_seq[query_pos-1:query_pos+2]
+                                        newAArefpos = (q_pars_pos+POS) // 3
+                                        mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')')
+                                    else:
+                                        newcodon = query_seq[query_pos-2:query_pos+1]
+                                        newAArefpos = (q_pars_pos+POS) // 3
+                                        mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')')
+                                else:
+                                    mutations.append(delstring)
+
+                                if args.nt_call == 1:
+                                    for N in range(q_pars_pos+POS, q_pars_pos+POS+int(run_length)):
+                                        try:
+                                            nt_call_dict_dict[N]
+                                        except:
+                                            nt_call_dict_dict[N] = {'A' : 0,
+                                                                    'T' : 0,
+                                                                    'C' : 0,
+                                                                    'G' : 0,
+                                                                    '-' : 0}
+                                            nt_call_dict_dict[N]['-'] = abund_count
+                                        else:
+                                            nt_call_dict_dict[N]['-'] += abund_count
+
+                                try:
+                                    indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del']
+                                except:
+                                    indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] = int(abund_count)
+                                else:
+                                    indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] += int(abund_count)
+
+                                q_pars_pos = q_pars_pos + run_length
+
+                            elif C == 'M':
+                                offset = q_pars_pos-query_pos
+                                refPOS = POS+offset
+
+                                for ntPOS in range(query_pos, query_pos+run_length):
+                                    if query_seq[ntPOS] == 'A' or query_seq[ntPOS] == 'T' or query_seq[ntPOS] == 'C' or query_seq[ntPOS] == 'G':
+                                        if query_seq[ntPOS] != ref[1][refPOS+ntPOS-1]:
+                                            if args.AAreport == 1 and args.AAcodonasMNP == 0:
+                                                AAinfo = singletCodon(refPOS+ntPOS, query_seq[ntPOS], ref)
+                                                mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
+                                            else:
+                                                mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS])
+                                        if args.nt_call == 1:
+                                            try:
+                                                nt_call_dict_dict[refPOS+ntPOS]
+                                            except:
+                                                nt_call_dict_dict[refPOS+ntPOS] = {'A' : 0,
+                                                                                   'T' : 0,
+                                                                                   'C' : 0,
+                                                                                   'G' : 0,
+                                                                                   '-' : 0}
+                                                nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] = abund_count
+                                            else:
+                                                nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] += abund_count
+
+
+                                q_pars_pos = q_pars_pos + run_length
+                                query_pos = query_pos + run_length
+
+                            run_length = 0
+
+
+                        else:
+                            run_length = (10 * run_length) + int(C)
+                    # END CIGAR PARSE
+
+
+
+                    if len(mutations) == 0: # record reference counts
+                        if args.wgs == 0:
+                            try:
+                                seq_species['Reference'] += abund_count
+                            except:
+                                seq_species['Reference'] = abund_count
+                        else:
+                            try:
+                                seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] += abund_count
+                            except:
+                                seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] = abund_count
+
+                    else: # record variants and counts
+                        if args.AAreport == 1 and args.AAcodonasMNP == 1:
+                            codonchecked = []
+                            codon = ''
+                            skip = 0
+                            MNP = ''
+                            for i in range(0, len(mutations)): # checking for MNP
+                                if 'Del' in mutations[i] or 'insert' in mutations[i]:
+                                    codonchecked.append(mutations[i])
+                                elif skip > 0:
+                                    skip -= 1
+                                else:
+                                    mut1POS = int(''.join([c for c in mutations[i] if c.isdigit()]))
+                                    try:
+                                        mutations[i+1]
+                                    except:
+                                        AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
+                                        codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
+                                    else:
+
+                                        if mut1POS % 3 == 0:
+                                            AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
+                                            codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
+                                        else:
+                                            mut2POS = int(''.join([c for c in mutations[i+1] if c.isdigit()]))
+                                            if mut2POS - mut1POS < 3:
+                                                AAPOS = mut1POS // 3
+                                                if mut1POS % 3 == 1:
+                                                    if mut2POS % 3 == 0:
+                                                        codon = mutations[i][-1]+ref[1][mut1POS]+mutations[i+1][-1]
+                                                        MNP = mutations[i][-1]+'r'+mutations[i+1][-1]
+                                                        skip = 1
+                                                    else:
+                                                        try:
+                                                            mutations[i+2]
+                                                        except:
+                                                            codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS]
+                                                            MNP = mutations[i][-1]+mutations[i+1][-1]+'r'
+                                                            skip = 1
+                                                        else:
+                                                            mut3POS = int(''.join([c for c in mutations[i+2] if c.isdigit()]))
+                                                            if mut2POS == mut3POS -1:
+                                                                codon = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1]
+                                                                MNP = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1]
+                                                                skip = 2
+                                                            else:
+                                                                codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS]
+                                                                MNP = mutations[i][-1]+mutations[i+1][-1]+'r'
+                                                                skip = 1
+                                                    codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')')
+                                                elif mut2POS - mut1POS == 1:
+                                                    codon = ref[1][mut1POS-2]+mutations[i][-1]+mutations[i+1][-1]
+                                                    MNP = mutations[i][-1]+mutations[i+1][-1]
+                                                    skip = 1
+                                                    codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')')
+                                                else:
+                                                    AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
+                                                    codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
+
+
+                                            else:
+                                                AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
+                                                codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
+                            mutations = " ".join(codonchecked)
+                        else:
+                            mutations = " ".join(mutations)
+
+                        if args.wgs == 0:
+                            try:
+                                seq_species[mutations] += abund_count
+                            except:
+                                seq_species[mutations] = abund_count
+                        else:
+                            try:
+                                seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] += abund_count
+                            except:
+                                seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] = abund_count
+
+                    if lastPOS < POS+q_pars_pos:
+                        lastPOS = POS+q_pars_pos
+                    for i in range(POS, POS+q_pars_pos): # update coverage
+                        try:
+                            coverage[i] += abund_count
+                        except:
+                            coverage[i] = abund_count
+
+    args.Sam_file.close()
+    # END SAM LINES
+
+    total_mapped_reads = sam_read_count
+
+    if args.seq == 1: # output the sequence
+        seq_fh = open(args.outfile1, "w")
+        seq_fh.write(args.samp+"("+str(sam_read_count)+")\n")
+        seq_fh.write("Unique Sequence\tCount\tAbundance\n")
+
+        sorted_seq = sorted(seq_species, key=seq_species.__getitem__, reverse=True)
+        for key in sorted_seq:
+            if seq_species[key] >= args.min_count:
+                if (seq_species[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0:
+                    seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}\n")
+                    seq_array.append(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}")
+                elif args.wgs == 1:
+                    splitseqs = key.split()
+                    cov = []
+                    for x in range(int(splitseqs[0]), int(splitseqs[-1])):
+                        cov.append(coverage[x])
+                    min_cov = min(cov)
+                    if (seq_species[key]/min_cov >= args.min_samp_abund):
+                        seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/min_cov):.3f}\n")
+            else:
+                break
+
+        seq_fh.close()
+        # END SEQ OUT
+
+    if args.indel == 1: # and len(indel_dict) > 0: # output indels, if there are any
+        sorted_indels = sorted(indel_dict, key=indel_dict.__getitem__, reverse=True)
+        indels_to_write = []
+        for key in sorted_indels:
+            if indel_dict[key] >= args.min_count:
+                if indel_dict[key] / sam_read_count >= args.min_samp_abund and args.wgs == 0:
+                    indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key]/sam_read_count):.3f}\n")
+                elif args.wgs == 1:
+                    indelPOS = ''
+                    for c in key:
+                        if c.isdigit():
+                            indelPOS += c
+                        else:
+                            break
+                    indelPOS = int(indelPOS)
+                    if indel_dict[key] / coverage[indelPOS] >= args.min_samp_abund:
+                        indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key] / coverage[indelPOS]):.3f}\n")
+            else:
+                break
+        # if len(indels_to_write) > 0:
+        indel_fh = open(args.outfile2, "w")
+        indel_fh.write(args.samp+"("+str(sam_read_count)+")\n")
+        indel_fh.write("Indel\tCount\tAbundance\n")
+        for indel_entry in indels_to_write:
+            indel_fh.write(indel_entry)
+        indel_fh.close()
+        # END INDEL OUT
+
+    if args.nt_call == 1: # out put nt calls
+        ntcall_fh = open(args.outfile3, "w")
+        ntcall_fh.write(args.samp+"("+str(sam_read_count)+")\n")
+        if args.ntvar == 1:
+            ntcallv_fh = open(args.outfile7, "w")
+            ntcallv_fh.write(args.samp+"("+str(sam_read_count)+")\n")
+        sorted_POS = sorted(nt_call_dict_dict)
+        if args.AAreport == 1:
+            ntcall_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n")
+            if args.ntvar == 1:
+                ntcallv_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n")
+            for POS in sorted_POS:
+                try:
+                    total = coverage[POS]
+                except:
+                    total = 0
+                if total >= (sam_read_count * args.ntabund):
+                    AAinfo = singletCodon(POS, ref[1][POS-1], ref)
+                    POS_calls = {}
+                    for key in nt_call_dict_dict[POS]:
+                        POS_calls[key] = nt_call_dict_dict[POS][key]
+                    sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True)
+
+                    ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
+                    ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
+                    ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
+                    ntcall_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
+                    if sorted_calls[0] != ref[1][POS-1]:
+                        if args.ntvar == 1:
+                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
+                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
+                            ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
+                            ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
+
+                        mod = (POS)%3
+
+                        if mod == 0:
+                            try:
+                                codon = ref[1][POS-3]+ref[1][POS-2]+sorted_calls[0]
+                            except:
+                                codon = 'NNN'
+                        elif mod == 2:
+                            try:
+                                codon = ref[1][POS-2]+sorted_calls[0]+ref[1][POS]
+                            except:
+                                codon = 'NNN'
+                        elif mod == 1:
+                            try:
+                                codon = sorted_calls[0]+ref[1][POS]+ref[1][POS+1]
+                            except:
+                                codon = 'NNN'
+                        ntcall_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1])
+                        if args.ntvar == 1:
+                            ntcallv_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1])
+                        if (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total >= args.min_samp_abund):
+                                if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1:
+                                    ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
+                                ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
+                                if nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count:
+                                    if nt_call_dict_dict[POS][sorted_calls[2]] / total  >= args.min_samp_abund:
+                                        if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
+                                            ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
+                                        ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
+
+                        if args.ntvar == 1:
+                            ntcallv_fh.write("\n")
+                    elif (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] / total >= args.min_samp_abund):
+                        if args.ntvar == 1:
+                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
+                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
+                            ntcallv_fh.write("\t"+str(total)+"\t\t")
+                            ntcallv_fh.write(f"\t")
+                            ntcallv_fh.write("\t\t")
+                            ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
+                        ntcall_fh.write("\t\t")
+                        ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
+
+                        if (nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total >= args.min_samp_abund):
+                            ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
+                            if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
+                                ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
+                        if args.ntvar == 1:
+                            ntcallv_fh.write("\n")
+
+                    ntcall_fh.write("\n")
+        else:
+            ntcall_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n")
+            if args.ntvar == 1:
+                ntcallv_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n")
+
+            for POS in sorted_POS:
+                try:
+                    total = coverage[POS] # sum(nt_call_dict_dict[POS].values())
+                except:
+                    total = 0
+                if total >= (sam_read_count * args.ntabund):
+                    POS_calls = {}
+                    for key in nt_call_dict_dict[POS]:
+                        POS_calls[key] = nt_call_dict_dict[POS][key]
+                    sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True)
+
+                    ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1])
+                    ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
+                    ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
+
+                    if sorted_calls[0] != ref[1][POS-1]:
+                        if args.ntvar == 1:
+                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1])
+                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
+                            ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
+                            ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
+                        if (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund):
+                            ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
+                            if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1:
+                                ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
+                            if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund):
+                                ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
+                                if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
+                                    ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
+                        if args.ntvar == 1:
+                            ntcallv_fh.write("\n")
+
+                    elif (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund):
+                        if args.ntvar == 1:
+                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1])
+                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
+                            ntcallv_fh.write("\t"+str(total)+"\t\t")
+                            ntcallv_fh.write(f"\t")
+                            ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
+                        ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
+                        if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and(nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund):
+                            ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
+                            if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
+                                ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
+                        if args.ntvar == 1:
+                            ntcallv_fh.write("\n")
+
+                    ntcall_fh.write("\n")
+
+        ntcall_fh.close()
+        if args.ntvar == 1:
+            ntcallv_fh.close()
+        # END NT CALL OUT
+    if args.covar == 1: # output covariants
+        testtrack = 0
+        combinations = {}
+        for sequence in seq_species:
+            if args.wgs == 0:
+                singles = sequence.split()
+            else:
+                singles = (sequence.split())[1:-1]
+            if len(singles) <= args.max_dist and singles[0] != 'Ref':
+                for combo in getCombos(singles, args.max_covar):
+                    if not combo in combinations:
+                        combinations[combo] = seq_species[sequence]
+                    else:
+                        combinations[combo] += seq_species[sequence]
+
+        covar_fh = open(args.outfile4, "w")
+        covar_fh.write(args.samp+"("+str(sam_read_count)+")\n")
+        covar_fh.write("Co-Variants\tCount\tAbundance\n")
+        sortedcombos = sorted(combinations, key=combinations.__getitem__, reverse=True)
+        for key in sortedcombos:
+            if (combinations[key] >= args.min_count):
+                if (combinations[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0:
+                    covar_fh.write(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}\n")
+                    covar_array.append(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}")
+                elif args.wgs == 1:
+                    coveragepercent = 0
+                    splitcombos = key.split()
+                    if len(splitcombos) == 1:
+                        coveragePOS = ''
+                        for c in key:
+                            if c.isdigit():
+                                coveragePOS += c
+                            else:
+                                break
+                        coveragepercent = combinations[key] / coverage[int(coveragePOS)]
+                    else:
+                        startcovPOS = ''
+                        for c in splitcombos[0]:
+                            if c.isdigit():
+                                startcovPOS += c
+                            else:
+                                break
+                        endcovPOS = ''
+                        for c in splitcombos[-1]:
+                            if c.isdigit():
+                                endcovPOS += c
+                            else:
+                                break
+                        coveragevals = []
+                        for i in range(int(startcovPOS), int(endcovPOS)+1):
+                            coveragevals.append(coverage[i])
+                        mincov = min(coverval for coverval in coveragevals)
+                        coveragepercent = combinations[key] / mincov
+                    if coveragepercent >= args.min_samp_abund:
+                        covar_fh.write(f"{key}\t{combinations[key]}\t{coveragepercent:.3f}\n")
+
+        covar_fh.close()
+
+
+        # END COVAR OUT
+
+    return(covar_array, seq_array, sam_read_count)
+
+def cvdeconv(args, covardict, seqdict): # covar deconvolution process
+
+    passedseqs = {}
+    preservedseqs = {}
+    for seq in seqdict: # pass check for actual : expected abundance
+        if seq != 'total' and seq != 'singles':
+         
+            splitseq = seq.split(' ')
+            abund = 1
+            for sing in splitseq:
+                try:
+                    abund = abund * (covardict[sing] / covardict['total'])
+                except:
+                    abund = abund * (seqdict[seq] / seqdict['total'])
+
+            try:
+                covarabund = covardict[seq]/covardict['total']
+            except:
+                covarabund = seqdict[seq]/seqdict['total']
+                covardict[seq] = seqdict[seq]
+
+            if covarabund >= args.autopass:
+                preservedseqs[seq] = max(1, args.beta, (covarabund / abund))
+            elif covarabund >= abund * args.beta:
+                passedseqs[seq] = covarabund / abund
+            elif len(seq.split(' ')) == 1:
+                passedseqs[seq] = max(1, args.beta)
+
+    if args.min_samp_abund < 1:
+        min_count = args.min_samp_abund * covardict['total']
+    else:
+        min_count = args.min_samp_abund
+
+    # sort passed covars
+    lensortedpassed = sorted(passedseqs, key = lambda key : len(key.split(' ')), reverse=True)
+    ratiolensortedpassed = sorted(lensortedpassed, key = lambda key : passedseqs[key], reverse = True)
+    sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__)
+    deconved = {}
+    for seq in ratiolensortedpassed: # reassign counts
+        singles = seq.split(' ')
+        first = 0
+        rem_count = 0
+        for sing in sortedsingles:
+            if sing in singles:
+                if covardict['singles'][sing] > 0:
+                    if first == 0:
+                        first = 1
+                        rem_count = covardict['singles'][sing]
+                        covardict['singles'][sing] = 0
+                        deconved[seq] = rem_count
+                    else:
+                        covardict['singles'][sing] = covardict['singles'][sing] - rem_count
+                else:
+                    break
+        sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__)
+
+    sortedpreserved = sorted(preservedseqs, key = lambda key : covardict[key])
+ 
+    for seq in sortedpreserved:
+        singles = seq.split(' ')
+        first = 0
+        rem_count = 0
+        for sing in sortedsingles:
+            if sing in singles:
+                if covardict['singles'][sing] > 0:
+                    if first == 0:
+                        first = 1
+                        rem_count = covardict['singles'][sing]
+                        covardict['singles'][sing] = 0
+                        deconved[seq] = rem_count
+                    else:
+                        covardict['singles'][sing] = covardict['singles'][sing] - rem_count
+                else:
+                    break
+        sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__)
+            
+
+    newtotal = sum(deconved.values())
+    fh_deconv = open(args.outfile6, "w")
+    fh_deconv.write(f"{args.samp}({covardict['total']}) | ({newtotal})\tCount\tAbundance\n")
+    sorted_deconved = sorted(deconved, key = deconved.__getitem__, reverse = True)
+    for seq in sorted_deconved: # write deconv
+        if deconved[seq] >= min_count:
+            fh_deconv.write(f"{seq}\t{deconved[seq]}\t{(deconved[seq]/newtotal):.3f}\n")
+    fh_deconv.close()
+# END COVAR DECONV OUT
+
+    return()
+
+def dechim(args, seqs): # processing sequence dictionary to remove chimeras
+    total = seqs['total']
+    del seqs['total']
+    sorted_seqs = sorted(seqs, key=seqs.__getitem__) # sort sequences by abundance, least to greatest
+    chimeras = []
+    for seq in sorted_seqs:
+        pot_chim = ['Reference']+seq.split()+['Reference']
+        chim_halves = []
+        for i in range(0, len(pot_chim)-1): # create dimera halves
+            chim_halves.append([pot_chim[:i+1], pot_chim[i+1:]])
+        parent_pairs = []
+
+        for dimera in chim_halves: # check for potential parents
+
+            pot_left = []
+            pot_rt = []
+            lft_len = len(dimera[0])
+            rt_len = len(dimera[1])
+            for pseq in sorted_seqs:
+                if not seq == pseq:
+                    if (seqs[pseq] >= (seqs[seq] * args.foldab)):
+                        pot_par = ['Reference']+pseq.split()+['Reference']
+                        if dimera[0] == pot_par[:lft_len]:
+                            pot_left.append(pot_par)
+                        if ((len(pot_par) >= rt_len) and (dimera[1] == pot_par[(len(pot_par)-rt_len):])):
+                            pot_rt.append(pot_par)
+
+            if (len(pot_left) > 0 and len(pot_rt) > 0 ):
+                for left_par in pot_left: # check potential parents' pairing
+                    for rt_par in pot_rt:
+                        if left_par != rt_par:
+                            left_break = left_par[lft_len]
+                            rt_break = rt_par[(len(rt_par)-rt_len)-1]
+                            if left_break == 'Reference' or rt_break == 'Reference':
+                                parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])])
+                            else:
+                                left_break_POS = ''
+                                for c in left_break:
+                                    if c.isdigit():
+                                        left_break_POS += c
+                                    else:
+                                        break
+
+                                rt_break_POS = ''
+                                for c in rt_break:
+                                    if c.isdigit():
+                                        rt_break_POS += c
+                                    else:
+                                        break
+
+                                if int(left_break_POS) > int(rt_break_POS):
+                                    parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])])
+
+        par_tot_abund = 0
+        pair_probs = []
+        for parents in parent_pairs: # calc 'expected' abundance
+            pair_prob = (seqs[parents[0]] / total) * (seqs[parents[1]] / total)
+            par_tot_abund += pair_prob
+            pair_probs.append(pair_prob)
+
+        recomb_count = par_tot_abund * total
+
+        if not seqs[seq] >= recomb_count * args.alpha: # chimera check
+            redist_count = float(seqs[seq])
+            chimeras.append(seq)
+            seqs[seq] = 0
+            if args.redist == 1: # redist counts of chimera
+                toadd = {}
+                for i in range(0, len(parent_pairs)):
+                    counts_to_redist = (redist_count * (pair_probs[i]/par_tot_abund))/2
+                    seqs[parent_pairs[i][0]] += counts_to_redist
+                    seqs[parent_pairs[i][1]] += counts_to_redist
+
+
+
+    for chim in chimeras: # remove chimeras
+        del seqs[chim]
+
+    total = sum(seqs.values())
+
+
+    # total = sum(seqs.values)
+    seqs['total'] = total
+
+    return(seqs)
+
+def chimrm(args, seqs): # chimera removed process
+
+    pre_len = len(seqs)
+    inf_loop_shield = 0
+    while True: # send sequences for chimera removal while chimeras are still found
+        dechim(args, seqs)
+        post_len = len(seqs)
+        inf_loop_shield += 1
+        if post_len >= pre_len:
+            break
+        if inf_loop_shield > 100:
+            break
+        pre_len = len(seqs)
+
+    total = seqs['total']
+    del seqs['total']
+    if args.min_samp_abund < 1:
+        min_count = args.min_samp_abund * total
+    else:
+        min_count = args.min_samp_abund
+    sorted_seqs = sorted(seqs, key=seqs.__getitem__, reverse=True)
+    fh_dechime = open(args.outfile5, 'w')
+    fh_dechime.write(f"{args.samp}({int(total)})\n")
+    fh_dechime.write("Sequences\tCount\tAbundance\n")
+    for seq in seqs: # write chim_rm seqs
+        abund = seqs[seq]/total
+        if seqs[seq] >= min_count:
+            fh_dechime.write(f"{seq}\t{round(seqs[seq])}\t{abund:.3f}\n")
+
+    fh_dechime.close()
+    return()
+
+def main():
+
+    args = arg_parser() # getting command line arguments
+
+    covar_array = []
+    seq_array = []
+    total_mapped_reads = 0
+    
+    if args.ref:
+        ref = get_ref(args.ref) # get the reference ID and sequence from the FASTA file
+        args.ref.close()
+        if ref[1] == '':
+            print('Reference not recognized as a Fasta format, skipping SAM parsing')
+        else:
+            refprot = ''
+            if args.AAreport == 1: # make an Amino Acid sequence based on the reference sequence
+                for x in range(0, (len(ref[1])-1)//3):
+                    AA = AAcall(ref[1][x*3]+ref[1][x*3+1]+ref[1][x*3+2])
+                    refprot = refprot + AA
+                if (len(ref[1])-1)%3 != 0:
+                    refprot = refprot + '?'
+
+            covar_array, seq_array, total_mapped_reads = SAMparse(args, ref, refprot, args.Sam_file)
+                
+    else:
+        print('No reference provided, skipping SAM parsing')
+
+    in_covars = {'total' : total_mapped_reads,
+                  'singles' : {}
+                  }
+    in_seqs = {'total' : total_mapped_reads}
+    # Begin chimera removal if enabled
+    if args.chim_rm == 1 or args.deconv == 1:
+        if args.deconv == 1:
+            for line in covar_array:
+                lineparts = line.split("\t")
+                in_covars[lineparts[0]] = int(lineparts[1])
+                if len(lineparts[0].split(' ')) == 1:
+                    in_covars['singles'][lineparts[0]] = int(lineparts[1])
+
+        for line in seq_array:
+            lineparts = line.split("\t")
+            in_seqs[lineparts[0]] = float(lineparts[1])
+
+    if args.deconv == 1:
+        cvdeconv(args, in_covars, in_seqs)
+
+    if args.chim_rm == 1:
+        chimrm(args, in_seqs)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
author	degregory
date	Mon, 13 Sep 2021 14:12:21 +0000
parents
children