view SAM_Refiner_Gal.py @ 1:b68f7e37e70a draft default tip

Uploaded xml
author degregory
date Mon, 13 Sep 2021 14:14:20 +0000
parents 22e3f843f1db
children
line wrap: on
line source

#!/bin/env python3
# Writen by Devon Gregory with assistance from Christopher Bottoms
# University of Missouri
# Distributed under GNU GENERAL PUBLIC LICENSE v3
import os
import sys
import argparse
import itertools
import time
from multiprocessing import Process
from pathlib import Path

# process for collecing command line arguments
def arg_parser():

    parser = argparse.ArgumentParser(
        description='process Sam files for variant information'
    )

    parser.add_argument(
        '--samp',
        type=str,
        default='Sample',
        help='Sample Name'
    )
    parser.add_argument(
        '-r', '-reference',
        type=argparse.FileType('r'),
        dest='ref',
        help='reference fasta'
    )
    parser.add_argument(
        '-S', '--Sam_file',
        type=argparse.FileType('r'),
        dest='Sam_file',
        help='.sam file'
    )
    parser.add_argument(
        '-o1', '--out_file1',
        dest='outfile1',
        help='output file'
    )
    parser.add_argument(
        '-o2', '--out_file2',
        dest='outfile2',
        help='output file'
    )
    parser.add_argument(
        '-o3', '--out_file3',
        dest='outfile3',
        help='output file'
    )
    parser.add_argument(
        '-o4', '--out_file4',
        dest='outfile4',
        help='output file'
    )
    parser.add_argument(
        '-o5', '--out_file5',
        dest='outfile5',
        help='output file'
    )
    parser.add_argument(
        '-o6', '--out_file6',
        dest='outfile6',
        help='output file'
    )
    parser.add_argument(
        '-o7', '--out_file7',
        dest='outfile7',
        help='output file'
    )
    parser.add_argument(
        '--use_count',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) use of counts in sequence IDs, default enabled (--use_count 1)'
    )
    parser.add_argument(
        '--min_count',
        type=int,
        default=10,
        help='Minimum observations required to be included in sample reports; >= 1 occurance count; < 1 %% observed (.1 = 10%%), (default: .001)'
    )
    parser.add_argument(
        '--min_samp_abund',
        type=float,
        default=0.001,
        help='Minimum abundance required for inclusion in sample reports; %% observed (.1 = 10%%), (default: .001)'
    )
    parser.add_argument(
        '--ntabund',
        type=float,
        default=.001,
        help='Minimum abundance relative to total reads required for a position to be reported in the nt call output; must be non-negative and < 1, %% observed (.1 = 10%%), (default: .001)'
    )
    parser.add_argument(
        '--max_dist',
        type=int,
        default=40,
        help='Maximum number of variances from the reference a sequence can have to be consider in covars processing (default: 40)'
    )
    parser.add_argument(
        '--max_covar',
        type=int,
        default=8,
        help='Maximum number of variances from the reference to be reported in covars (default: 8)'
    )
    parser.add_argument(
        '--AAreport',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) amino acid reporting, default enabled (--AAreport 1)'
    )
    parser.add_argument(
        '--AAcodonasMNP',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) reporting multiple nt changes in a single codon as one polymorphism, default enabled (--AAcodonasMNP 1), requires AAreport enabled'
    )
    parser.add_argument(
        '--alpha',
        type=float,
        default=1.2,
        help='Modifier for chim_rm chimera checking, default 1.2.  Higher = more sensitive, more false chimeras removed; lower = less sensitive, fewer chimeras removed'
    )
    parser.add_argument(
        '--foldab',
        type=float,
        default=1.8,
        help='Threshold for potential parent / chimera abundance ratio for chim_rm; default is 1.8'
    )
    parser.add_argument(
        '--redist',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) redistribution of chimera counts for chim_rm, default enabled (--redist 1)'
    )
    parser.add_argument(
        '--beta',
        type=float,
        default=1,
        help='Modifier for covar pass checking, default 1.  Higher = more sensitive, more failed checks; lower = less sensitive, fewer failed checks'
    )
    parser.add_argument(
        '--autopass',
        type=float,
        default=.3,
        help='threshold for a sequence to automatically pass the covar pass checking'
    )
    parser.add_argument(
        '--nt_call',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) nt_call output, default enabled (--nt_call 1)'
    )
    parser.add_argument(
        '--ntvar',
        type=int,
        default=0,
        choices=[0, 1],
        help='Enable/Disable (1/0) nt_var output, default enabled (--nt_call 1)'
    )
    parser.add_argument(
        '--indel',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) indel output, default enabled (--indel 1)'
        )
    parser.add_argument(
        '--seq',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) unique seq output, default enabled (--seq 1)'
    )
    parser.add_argument(
        '--covar', type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) covar output, default enabled (--covar 1)'
    )
    parser.add_argument(
        '--chim_rm',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) chim_rm output, default enabled (--chim_rm 1)'
    )
    parser.add_argument(
        '--deconv',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) covar deconv, default enabled (--deconv 1)'
    )
    parser.add_argument(
        '--wgs',
        type=int,
        default=0,
        choices=[0, 1],
        help='Enable/Disable (1/0) covar deconv, default enabled (--deconv 1)'
    )

    args = parser.parse_args()

# checking for proper range of some parameters and consistency/compatibility
    if args.wgs == 1:
        if args.deconv == 1 or args.chim_rm == 1:
            args.deconv = 0
            args.chim_rm = 0
            print('WGS mode enabled, disabling chimera removal methods')

    return(args)

def get_ref(ref): # get the reference ID and sequence from the FASTA file.  Will only get the first.

    n=0
    refID = ''
    refseq = ''
    if ref:
        for line in ref:
            if line.startswith('>'):
                n+=1
                if n > 1:
                    break
                refID = line[1:].strip("\n\r")
            elif n == 1:
                refseq = refseq + line.strip("\n\r")
        refseq = refseq.upper()


    return(refID, refseq)

def AAcall(codon): # amino acid / codon dictionary to return encoded AAs
    AAdict = {
        'TTT' : 'F',
        'TTC' : 'F',
        'TTA' : 'L',
        'TTG' : 'L',
        'TCT' : 'S',
        'TCC' : 'S',
        'TCA' : 'S',
        'TCG' : 'S',
        'TAT' : 'Y',
        'TAC' : 'Y',
        'TAA' : '*',
        'TAG' : '*',
        'TGT' : 'C',
        'TGC' : 'C',
        'TGA' : '*',
        'TGG' : 'W',

        'CTT' : 'L',
        'CTC' : 'L',
        'CTA' : 'L',
        'CTG' : 'L',
        'CCT' : 'P',
        'CCC' : 'P',
        'CCA' : 'P',
        'CCG' : 'P',
        'CAT' : 'H',
        'CAC' : 'H',
        'CAA' : 'Q',
        'CAG' : 'Q',
        'CGT' : 'R',
        'CGC' : 'R',
        'CGA' : 'R',
        'CGG' : 'R',

        'ATT' : 'I',
        'ATC' : 'I',
        'ATA' : 'I',
        'ATG' : 'M',
        'ACT' : 'T',
        'ACC' : 'T',
        'ACA' : 'T',
        'ACG' : 'T',
        'AAT' : 'N',
        'AAC' : 'N',
        'AAA' : 'K',
        'AAG' : 'K',
        'AGT' : 'S',
        'AGC' : 'S',
        'AGA' : 'R',
        'AGG' : 'R',

        'GTT' : 'V',
        'GTC' : 'V',
        'GTA' : 'V',
        'GTG' : 'V',
        'GCT' : 'A',
        'GCC' : 'A',
        'GCA' : 'A',
        'GCG' : 'A',
        'GAT' : 'D',
        'GAC' : 'D',
        'GAA' : 'E',
        'GAG' : 'E',
        'GGT' : 'G',
        'GGC' : 'G',
        'GGA' : 'G',
        'GGG' : 'G'
    }
    AA = '?'
    if codon in AAdict:
        AA = AAdict[codon]

    return(AA)

def singletCodon(ntPOS, nt, ref): # process to return the AA and protein seq. position based on the reference and provided nt seq position and nt
    AAPOS = (ntPOS-1)//3
    AAmod = (ntPOS-1)%3
    codon = ""
    try:
        if AAmod == 0:
            codon = nt+ref[1][ntPOS]+ref[1][ntPOS+1]
        elif AAmod == 1:
            codon = ref[1][ntPOS-2]+nt+ref[1][ntPOS]
        elif AAmod == 2:
            codon = ref[1][ntPOS-3]+ref[1][ntPOS-2]+nt
    except:
        codon = "XXX"

    return(AAPOS+1, AAcall(codon))

def getCombos(qlist, clen): # returns combinations of single polymorphisms in a sequence
    combos = []
    if (clen == 0 or clen > len(qlist)):
        clen = len(qlist)
    for N in range(1, clen+1):
        for comb in itertools.combinations(qlist, N):
            combos.append(' '.join(comb))
    return(combos)

def SAMparse(args, ref, refprot, file): # process SAM files
    covar_array = []
    seq_array = []

    nt_call_dict_dict = {}
    indel_dict = {}
    seq_species = {}
    sam_read_count = 0
    sam_line_count = 0
    firstPOS = 0
    lastPOS = 0
    coverage = {}
    for line in args.Sam_file:
        if not line.startswith('@'): # ignore header lines
            splitline = line.split("\t")
            if ref[0].upper().startswith(splitline[2].upper()): # check map ID matches referecne ID
                if int(splitline[4]) > 0:  # Check mapping score is positive

                    abund_count=1
                    if args.use_count == 1: # get the unique sequence counts
                        if '-' in splitline[0] and '=' in splitline[0]:
                            eq_split = splitline[0].split('=')
                            dash_split = splitline[0].split('-')
                            if len(eq_split[-1]) > len(dash_split[-1]):
                                abund_count=int(dash_split[-1])
                            else:
                                abund_count=int(eq_split[-1])

                        elif '-' in splitline[0]:
                            try:
                                abund_count=int(splitline[0].split('-')[-1])
                            except:
                                pass

                        elif '=' in splitline[0]:
                            try:
                                abund_count=int(splitline[0].split('=')[-1])
                            except:
                                pass


                    sam_read_count += abund_count

                    CIGAR = splitline[5]
                    POS = int(splitline[3])
                    if firstPOS == 0:
                        firstPOS = POS
                    elif POS < firstPOS:
                        firstPOS = POS

                    readID = splitline[0]
                    query_seq = splitline[9].upper()
                    run_length = 0
                    query_seq_parsed = ''
                    query_pos = 0
                    q_pars_pos = 0
                    mutations = []

                    for C in CIGAR: # process sequence based on standard CIGAR line
                        if C == 'M' or C == 'I' or C == 'D' or C == 'S' or C == 'H':
                            if C == 'S':
                                query_pos = query_pos + run_length

                            if C == 'I':
                                if query_pos > 0:
                                    # add insertion to dict
                                    iPOS = q_pars_pos+POS

                                    iSeq = query_seq[query_pos: query_pos+run_length]
                                    istring = str(iPOS)+'-insert'+iSeq

                                    try:
                                        indel_dict[istring]
                                    except:
                                        indel_dict[istring] = abund_count
                                    else:
                                        indel_dict[istring] += abund_count

                                    if args.AAreport == 1 and (run_length % 3 == 0):
                                        
                                        iProt = ''
                                        if iPOS % 3 == 1:
                                            for x in range(0, (run_length//3)):
                                                AA = AAcall(iSeq[x*3]+iSeq[x*3+1]+iSeq[x*3+2])
                                                iProt = iProt + AA
                                            mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
                                        elif iPOS % 3 == 2:
                                            if query_pos > 0:
                                                ipSeq = query_seq[query_pos-1:query_pos+run_length+2]
                                            else:
                                                ipSeq = "XXX"+query_seq[query_pos+2:query_pos+run_length+2]
                                            for x in range(0, (run_length//3)+1):
                                                AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2])
                                                iProt = iProt + AA
                                            mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
                                        else:
                                            if query_pos > 1:
                                                ipSeq = query_seq[query_pos-2:query_pos+run_length+1]
                                            else:
                                                ipSeq = "XXX"+query_seq[query_pos+1:query_pos+run_length+1]
                                            
                                            for x in range(0, (run_length//3)+1):
                                                AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2])
                                                iProt = iProt + AA
                                            mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
                                    else:
                                        mutations.append(istring)

                                    query_pos = query_pos + run_length

                            elif C == 'D':
                                for X in range(0, run_length):
                                    query_seq_parsed += '-'

                                delstring = str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'

                                if args.AAreport == 1 and (run_length % 3 == 0) and not ((q_pars_pos+POS) % 3 == 1 ):
                                    if (q_pars_pos+POS) % 3 == 2:
                                        newcodon = query_seq[query_pos-1:query_pos+2]
                                        newAArefpos = (q_pars_pos+POS) // 3
                                        mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')')
                                    else:
                                        newcodon = query_seq[query_pos-2:query_pos+1]
                                        newAArefpos = (q_pars_pos+POS) // 3
                                        mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')')
                                else:
                                    mutations.append(delstring)

                                if args.nt_call == 1:
                                    for N in range(q_pars_pos+POS, q_pars_pos+POS+int(run_length)):
                                        try:
                                            nt_call_dict_dict[N]
                                        except:
                                            nt_call_dict_dict[N] = {'A' : 0,
                                                                    'T' : 0,
                                                                    'C' : 0,
                                                                    'G' : 0,
                                                                    '-' : 0}
                                            nt_call_dict_dict[N]['-'] = abund_count
                                        else:
                                            nt_call_dict_dict[N]['-'] += abund_count

                                try:
                                    indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del']
                                except:
                                    indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] = int(abund_count)
                                else:
                                    indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] += int(abund_count)

                                q_pars_pos = q_pars_pos + run_length

                            elif C == 'M':
                                offset = q_pars_pos-query_pos
                                refPOS = POS+offset

                                for ntPOS in range(query_pos, query_pos+run_length):
                                    if query_seq[ntPOS] == 'A' or query_seq[ntPOS] == 'T' or query_seq[ntPOS] == 'C' or query_seq[ntPOS] == 'G':
                                        if query_seq[ntPOS] != ref[1][refPOS+ntPOS-1]:
                                            if args.AAreport == 1 and args.AAcodonasMNP == 0:
                                                AAinfo = singletCodon(refPOS+ntPOS, query_seq[ntPOS], ref)
                                                mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
                                            else:
                                                mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS])
                                        if args.nt_call == 1:
                                            try:
                                                nt_call_dict_dict[refPOS+ntPOS]
                                            except:
                                                nt_call_dict_dict[refPOS+ntPOS] = {'A' : 0,
                                                                                   'T' : 0,
                                                                                   'C' : 0,
                                                                                   'G' : 0,
                                                                                   '-' : 0}
                                                nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] = abund_count
                                            else:
                                                nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] += abund_count


                                q_pars_pos = q_pars_pos + run_length
                                query_pos = query_pos + run_length

                            run_length = 0


                        else:
                            run_length = (10 * run_length) + int(C)
                    # END CIGAR PARSE



                    if len(mutations) == 0: # record reference counts
                        if args.wgs == 0:
                            try:
                                seq_species['Reference'] += abund_count
                            except:
                                seq_species['Reference'] = abund_count
                        else:
                            try:
                                seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] += abund_count
                            except:
                                seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] = abund_count

                    else: # record variants and counts
                        if args.AAreport == 1 and args.AAcodonasMNP == 1:
                            codonchecked = []
                            codon = ''
                            skip = 0
                            MNP = ''
                            for i in range(0, len(mutations)): # checking for MNP
                                if 'Del' in mutations[i] or 'insert' in mutations[i]:
                                    codonchecked.append(mutations[i])
                                elif skip > 0:
                                    skip -= 1
                                else:
                                    mut1POS = int(''.join([c for c in mutations[i] if c.isdigit()]))
                                    try:
                                        mutations[i+1]
                                    except:
                                        AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
                                        codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
                                    else:

                                        if mut1POS % 3 == 0:
                                            AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
                                            codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
                                        else:
                                            mut2POS = int(''.join([c for c in mutations[i+1] if c.isdigit()]))
                                            if mut2POS - mut1POS < 3:
                                                AAPOS = mut1POS // 3
                                                if mut1POS % 3 == 1:
                                                    if mut2POS % 3 == 0:
                                                        codon = mutations[i][-1]+ref[1][mut1POS]+mutations[i+1][-1]
                                                        MNP = mutations[i][-1]+'r'+mutations[i+1][-1]
                                                        skip = 1
                                                    else:
                                                        try:
                                                            mutations[i+2]
                                                        except:
                                                            codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS]
                                                            MNP = mutations[i][-1]+mutations[i+1][-1]+'r'
                                                            skip = 1
                                                        else:
                                                            mut3POS = int(''.join([c for c in mutations[i+2] if c.isdigit()]))
                                                            if mut2POS == mut3POS -1:
                                                                codon = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1]
                                                                MNP = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1]
                                                                skip = 2
                                                            else:
                                                                codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS]
                                                                MNP = mutations[i][-1]+mutations[i+1][-1]+'r'
                                                                skip = 1
                                                    codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')')
                                                elif mut2POS - mut1POS == 1:
                                                    codon = ref[1][mut1POS-2]+mutations[i][-1]+mutations[i+1][-1]
                                                    MNP = mutations[i][-1]+mutations[i+1][-1]
                                                    skip = 1
                                                    codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')')
                                                else:
                                                    AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
                                                    codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')


                                            else:
                                                AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
                                                codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
                            mutations = " ".join(codonchecked)
                        else:
                            mutations = " ".join(mutations)

                        if args.wgs == 0:
                            try:
                                seq_species[mutations] += abund_count
                            except:
                                seq_species[mutations] = abund_count
                        else:
                            try:
                                seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] += abund_count
                            except:
                                seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] = abund_count

                    if lastPOS < POS+q_pars_pos:
                        lastPOS = POS+q_pars_pos
                    for i in range(POS, POS+q_pars_pos): # update coverage
                        try:
                            coverage[i] += abund_count
                        except:
                            coverage[i] = abund_count

    args.Sam_file.close()
    # END SAM LINES

    total_mapped_reads = sam_read_count

    if args.seq == 1: # output the sequence
        seq_fh = open(args.outfile1, "w")
        seq_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        seq_fh.write("Unique Sequence\tCount\tAbundance\n")

        sorted_seq = sorted(seq_species, key=seq_species.__getitem__, reverse=True)
        for key in sorted_seq:
            if seq_species[key] >= args.min_count:
                if (seq_species[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0:
                    seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}\n")
                    seq_array.append(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}")
                elif args.wgs == 1:
                    splitseqs = key.split()
                    cov = []
                    for x in range(int(splitseqs[0]), int(splitseqs[-1])):
                        cov.append(coverage[x])
                    min_cov = min(cov)
                    if (seq_species[key]/min_cov >= args.min_samp_abund):
                        seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/min_cov):.3f}\n")
            else:
                break

        seq_fh.close()
        # END SEQ OUT

    if args.indel == 1: # and len(indel_dict) > 0: # output indels, if there are any
        sorted_indels = sorted(indel_dict, key=indel_dict.__getitem__, reverse=True)
        indels_to_write = []
        for key in sorted_indels:
            if indel_dict[key] >= args.min_count:
                if indel_dict[key] / sam_read_count >= args.min_samp_abund and args.wgs == 0:
                    indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key]/sam_read_count):.3f}\n")
                elif args.wgs == 1:
                    indelPOS = ''
                    for c in key:
                        if c.isdigit():
                            indelPOS += c
                        else:
                            break
                    indelPOS = int(indelPOS)
                    if indel_dict[key] / coverage[indelPOS] >= args.min_samp_abund:
                        indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key] / coverage[indelPOS]):.3f}\n")
            else:
                break
        # if len(indels_to_write) > 0:
        indel_fh = open(args.outfile2, "w")
        indel_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        indel_fh.write("Indel\tCount\tAbundance\n")
        for indel_entry in indels_to_write:
            indel_fh.write(indel_entry)
        indel_fh.close()
        # END INDEL OUT

    if args.nt_call == 1: # out put nt calls
        ntcall_fh = open(args.outfile3, "w")
        ntcall_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        if args.ntvar == 1:
            ntcallv_fh = open(args.outfile7, "w")
            ntcallv_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        sorted_POS = sorted(nt_call_dict_dict)
        if args.AAreport == 1:
            ntcall_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n")
            if args.ntvar == 1:
                ntcallv_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n")
            for POS in sorted_POS:
                try:
                    total = coverage[POS]
                except:
                    total = 0
                if total >= (sam_read_count * args.ntabund):
                    AAinfo = singletCodon(POS, ref[1][POS-1], ref)
                    POS_calls = {}
                    for key in nt_call_dict_dict[POS]:
                        POS_calls[key] = nt_call_dict_dict[POS][key]
                    sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True)

                    ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
                    ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                    ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
                    ntcall_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
                    if sorted_calls[0] != ref[1][POS-1]:
                        if args.ntvar == 1:
                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                            ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
                            ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")

                        mod = (POS)%3

                        if mod == 0:
                            try:
                                codon = ref[1][POS-3]+ref[1][POS-2]+sorted_calls[0]
                            except:
                                codon = 'NNN'
                        elif mod == 2:
                            try:
                                codon = ref[1][POS-2]+sorted_calls[0]+ref[1][POS]
                            except:
                                codon = 'NNN'
                        elif mod == 1:
                            try:
                                codon = sorted_calls[0]+ref[1][POS]+ref[1][POS+1]
                            except:
                                codon = 'NNN'
                        ntcall_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1])
                        if args.ntvar == 1:
                            ntcallv_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1])
                        if (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total >= args.min_samp_abund):
                                if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1:
                                    ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
                                ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
                                if nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count:
                                    if nt_call_dict_dict[POS][sorted_calls[2]] / total  >= args.min_samp_abund:
                                        if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
                                            ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
                                        ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")

                        if args.ntvar == 1:
                            ntcallv_fh.write("\n")
                    elif (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] / total >= args.min_samp_abund):
                        if args.ntvar == 1:
                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                            ntcallv_fh.write("\t"+str(total)+"\t\t")
                            ntcallv_fh.write(f"\t")
                            ntcallv_fh.write("\t\t")
                            ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
                        ntcall_fh.write("\t\t")
                        ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])

                        if (nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total >= args.min_samp_abund):
                            ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
                            if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
                                ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
                        if args.ntvar == 1:
                            ntcallv_fh.write("\n")

                    ntcall_fh.write("\n")
        else:
            ntcall_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n")
            if args.ntvar == 1:
                ntcallv_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n")

            for POS in sorted_POS:
                try:
                    total = coverage[POS] # sum(nt_call_dict_dict[POS].values())
                except:
                    total = 0
                if total >= (sam_read_count * args.ntabund):
                    POS_calls = {}
                    for key in nt_call_dict_dict[POS]:
                        POS_calls[key] = nt_call_dict_dict[POS][key]
                    sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True)

                    ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1])
                    ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                    ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")

                    if sorted_calls[0] != ref[1][POS-1]:
                        if args.ntvar == 1:
                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1])
                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                            ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
                            ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
                        if (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund):
                            ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
                            if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1:
                                ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
                            if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund):
                                ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
                                if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
                                    ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
                        if args.ntvar == 1:
                            ntcallv_fh.write("\n")

                    elif (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund):
                        if args.ntvar == 1:
                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1])
                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                            ntcallv_fh.write("\t"+str(total)+"\t\t")
                            ntcallv_fh.write(f"\t")
                            ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
                        ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
                        if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and(nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund):
                            ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
                            if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
                                ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
                        if args.ntvar == 1:
                            ntcallv_fh.write("\n")

                    ntcall_fh.write("\n")

        ntcall_fh.close()
        if args.ntvar == 1:
            ntcallv_fh.close()
        # END NT CALL OUT
    if args.covar == 1: # output covariants
        testtrack = 0
        combinations = {}
        for sequence in seq_species:
            if args.wgs == 0:
                singles = sequence.split()
            else:
                singles = (sequence.split())[1:-1]
            if len(singles) <= args.max_dist and singles[0] != 'Ref':
                for combo in getCombos(singles, args.max_covar):
                    if not combo in combinations:
                        combinations[combo] = seq_species[sequence]
                    else:
                        combinations[combo] += seq_species[sequence]

        covar_fh = open(args.outfile4, "w")
        covar_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        covar_fh.write("Co-Variants\tCount\tAbundance\n")
        sortedcombos = sorted(combinations, key=combinations.__getitem__, reverse=True)
        for key in sortedcombos:
            if (combinations[key] >= args.min_count):
                if (combinations[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0:
                    covar_fh.write(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}\n")
                    covar_array.append(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}")
                elif args.wgs == 1:
                    coveragepercent = 0
                    splitcombos = key.split()
                    if len(splitcombos) == 1:
                        coveragePOS = ''
                        for c in key:
                            if c.isdigit():
                                coveragePOS += c
                            else:
                                break
                        coveragepercent = combinations[key] / coverage[int(coveragePOS)]
                    else:
                        startcovPOS = ''
                        for c in splitcombos[0]:
                            if c.isdigit():
                                startcovPOS += c
                            else:
                                break
                        endcovPOS = ''
                        for c in splitcombos[-1]:
                            if c.isdigit():
                                endcovPOS += c
                            else:
                                break
                        coveragevals = []
                        for i in range(int(startcovPOS), int(endcovPOS)+1):
                            coveragevals.append(coverage[i])
                        mincov = min(coverval for coverval in coveragevals)
                        coveragepercent = combinations[key] / mincov
                    if coveragepercent >= args.min_samp_abund:
                        covar_fh.write(f"{key}\t{combinations[key]}\t{coveragepercent:.3f}\n")

        covar_fh.close()


        # END COVAR OUT

    return(covar_array, seq_array, sam_read_count)

def cvdeconv(args, covardict, seqdict): # covar deconvolution process

    passedseqs = {}
    preservedseqs = {}
    for seq in seqdict: # pass check for actual : expected abundance
        if seq != 'total' and seq != 'singles':
         
            splitseq = seq.split(' ')
            abund = 1
            for sing in splitseq:
                try:
                    abund = abund * (covardict[sing] / covardict['total'])
                except:
                    abund = abund * (seqdict[seq] / seqdict['total'])

            try:
                covarabund = covardict[seq]/covardict['total']
            except:
                covarabund = seqdict[seq]/seqdict['total']
                covardict[seq] = seqdict[seq]

            if covarabund >= args.autopass:
                preservedseqs[seq] = max(1, args.beta, (covarabund / abund))
            elif covarabund >= abund * args.beta:
                passedseqs[seq] = covarabund / abund
            elif len(seq.split(' ')) == 1:
                passedseqs[seq] = max(1, args.beta)

    if args.min_samp_abund < 1:
        min_count = args.min_samp_abund * covardict['total']
    else:
        min_count = args.min_samp_abund

    # sort passed covars
    lensortedpassed = sorted(passedseqs, key = lambda key : len(key.split(' ')), reverse=True)
    ratiolensortedpassed = sorted(lensortedpassed, key = lambda key : passedseqs[key], reverse = True)
    sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__)
    deconved = {}
    for seq in ratiolensortedpassed: # reassign counts
        singles = seq.split(' ')
        first = 0
        rem_count = 0
        for sing in sortedsingles:
            if sing in singles:
                if covardict['singles'][sing] > 0:
                    if first == 0:
                        first = 1
                        rem_count = covardict['singles'][sing]
                        covardict['singles'][sing] = 0
                        deconved[seq] = rem_count
                    else:
                        covardict['singles'][sing] = covardict['singles'][sing] - rem_count
                else:
                    break
        sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__)

    sortedpreserved = sorted(preservedseqs, key = lambda key : covardict[key])
 
    for seq in sortedpreserved:
        singles = seq.split(' ')
        first = 0
        rem_count = 0
        for sing in sortedsingles:
            if sing in singles:
                if covardict['singles'][sing] > 0:
                    if first == 0:
                        first = 1
                        rem_count = covardict['singles'][sing]
                        covardict['singles'][sing] = 0
                        deconved[seq] = rem_count
                    else:
                        covardict['singles'][sing] = covardict['singles'][sing] - rem_count
                else:
                    break
        sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__)
            

    newtotal = sum(deconved.values())
    fh_deconv = open(args.outfile6, "w")
    fh_deconv.write(f"{args.samp}({covardict['total']}) | ({newtotal})\tCount\tAbundance\n")
    sorted_deconved = sorted(deconved, key = deconved.__getitem__, reverse = True)
    for seq in sorted_deconved: # write deconv
        if deconved[seq] >= min_count:
            fh_deconv.write(f"{seq}\t{deconved[seq]}\t{(deconved[seq]/newtotal):.3f}\n")
    fh_deconv.close()
# END COVAR DECONV OUT

    return()

def dechim(args, seqs): # processing sequence dictionary to remove chimeras
    total = seqs['total']
    del seqs['total']
    sorted_seqs = sorted(seqs, key=seqs.__getitem__) # sort sequences by abundance, least to greatest
    chimeras = []
    for seq in sorted_seqs:
        pot_chim = ['Reference']+seq.split()+['Reference']
        chim_halves = []
        for i in range(0, len(pot_chim)-1): # create dimera halves
            chim_halves.append([pot_chim[:i+1], pot_chim[i+1:]])
        parent_pairs = []

        for dimera in chim_halves: # check for potential parents

            pot_left = []
            pot_rt = []
            lft_len = len(dimera[0])
            rt_len = len(dimera[1])
            for pseq in sorted_seqs:
                if not seq == pseq:
                    if (seqs[pseq] >= (seqs[seq] * args.foldab)):
                        pot_par = ['Reference']+pseq.split()+['Reference']
                        if dimera[0] == pot_par[:lft_len]:
                            pot_left.append(pot_par)
                        if ((len(pot_par) >= rt_len) and (dimera[1] == pot_par[(len(pot_par)-rt_len):])):
                            pot_rt.append(pot_par)

            if (len(pot_left) > 0 and len(pot_rt) > 0 ):
                for left_par in pot_left: # check potential parents' pairing
                    for rt_par in pot_rt:
                        if left_par != rt_par:
                            left_break = left_par[lft_len]
                            rt_break = rt_par[(len(rt_par)-rt_len)-1]
                            if left_break == 'Reference' or rt_break == 'Reference':
                                parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])])
                            else:
                                left_break_POS = ''
                                for c in left_break:
                                    if c.isdigit():
                                        left_break_POS += c
                                    else:
                                        break

                                rt_break_POS = ''
                                for c in rt_break:
                                    if c.isdigit():
                                        rt_break_POS += c
                                    else:
                                        break

                                if int(left_break_POS) > int(rt_break_POS):
                                    parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])])

        par_tot_abund = 0
        pair_probs = []
        for parents in parent_pairs: # calc 'expected' abundance
            pair_prob = (seqs[parents[0]] / total) * (seqs[parents[1]] / total)
            par_tot_abund += pair_prob
            pair_probs.append(pair_prob)

        recomb_count = par_tot_abund * total

        if not seqs[seq] >= recomb_count * args.alpha: # chimera check
            redist_count = float(seqs[seq])
            chimeras.append(seq)
            seqs[seq] = 0
            if args.redist == 1: # redist counts of chimera
                toadd = {}
                for i in range(0, len(parent_pairs)):
                    counts_to_redist = (redist_count * (pair_probs[i]/par_tot_abund))/2
                    seqs[parent_pairs[i][0]] += counts_to_redist
                    seqs[parent_pairs[i][1]] += counts_to_redist



    for chim in chimeras: # remove chimeras
        del seqs[chim]

    total = sum(seqs.values())


    # total = sum(seqs.values)
    seqs['total'] = total

    return(seqs)

def chimrm(args, seqs): # chimera removed process

    pre_len = len(seqs)
    inf_loop_shield = 0
    while True: # send sequences for chimera removal while chimeras are still found
        dechim(args, seqs)
        post_len = len(seqs)
        inf_loop_shield += 1
        if post_len >= pre_len:
            break
        if inf_loop_shield > 100:
            break
        pre_len = len(seqs)

    total = seqs['total']
    del seqs['total']
    if args.min_samp_abund < 1:
        min_count = args.min_samp_abund * total
    else:
        min_count = args.min_samp_abund
    sorted_seqs = sorted(seqs, key=seqs.__getitem__, reverse=True)
    fh_dechime = open(args.outfile5, 'w')
    fh_dechime.write(f"{args.samp}({int(total)})\n")
    fh_dechime.write("Sequences\tCount\tAbundance\n")
    for seq in seqs: # write chim_rm seqs
        abund = seqs[seq]/total
        if seqs[seq] >= min_count:
            fh_dechime.write(f"{seq}\t{round(seqs[seq])}\t{abund:.3f}\n")

    fh_dechime.close()
    return()

def main():

    args = arg_parser() # getting command line arguments

    covar_array = []
    seq_array = []
    total_mapped_reads = 0
    
    if args.ref:
        ref = get_ref(args.ref) # get the reference ID and sequence from the FASTA file
        args.ref.close()
        if ref[1] == '':
            print('Reference not recognized as a Fasta format, skipping SAM parsing')
        else:
            refprot = ''
            if args.AAreport == 1: # make an Amino Acid sequence based on the reference sequence
                for x in range(0, (len(ref[1])-1)//3):
                    AA = AAcall(ref[1][x*3]+ref[1][x*3+1]+ref[1][x*3+2])
                    refprot = refprot + AA
                if (len(ref[1])-1)%3 != 0:
                    refprot = refprot + '?'

            covar_array, seq_array, total_mapped_reads = SAMparse(args, ref, refprot, args.Sam_file)
                
    else:
        print('No reference provided, skipping SAM parsing')

    in_covars = {'total' : total_mapped_reads,
                  'singles' : {}
                  }
    in_seqs = {'total' : total_mapped_reads}
    # Begin chimera removal if enabled
    if args.chim_rm == 1 or args.deconv == 1:
        if args.deconv == 1:
            for line in covar_array:
                lineparts = line.split("\t")
                in_covars[lineparts[0]] = int(lineparts[1])
                if len(lineparts[0].split(' ')) == 1:
                    in_covars['singles'][lineparts[0]] = int(lineparts[1])

        for line in seq_array:
            lineparts = line.split("\t")
            in_seqs[lineparts[0]] = float(lineparts[1])

    if args.deconv == 1:
        cvdeconv(args, in_covars, in_seqs)

    if args.chim_rm == 1:
        chimrm(args, in_seqs)

if __name__ == '__main__':
    main()