sam_refiner (Mercurial repository, degregory)
SAM_Refiner_Gal.py @ changeset 0:22e3f843f1db (draft)

Uploaded main file

author: degregory
date:   Mon, 13 Sep 2021 14:12:21 +0000
#!/bin/env python3
# Written by Devon Gregory with assistance from Christopher Bottoms
# University of Missouri
# Distributed under GNU GENERAL PUBLIC LICENSE v3
import os
import sys
import argparse
import itertools
import time
from multiprocessing import Process
from pathlib import Path

# process for collecting command line arguments
def arg_parser():

    parser = argparse.ArgumentParser(
        description='process Sam files for variant information'
    )

    parser.add_argument(
        '--samp',
        type=str,
        default='Sample',
        help='Sample Name'
    )
    parser.add_argument(
        '-r', '-reference',
        type=argparse.FileType('r'),
        dest='ref',
        help='reference fasta'
    )
    parser.add_argument(
        '-S', '--Sam_file',
        type=argparse.FileType('r'),
        dest='Sam_file',
        help='.sam file'
    )
    parser.add_argument(
        '-o1', '--out_file1',
        dest='outfile1',
        help='output file for unique sequences'
    )
    parser.add_argument(
        '-o2', '--out_file2',
        dest='outfile2',
        help='output file for indels'
    )
    parser.add_argument(
        '-o3', '--out_file3',
        dest='outfile3',
        help='output file for nt calls'
    )
    parser.add_argument(
        '-o4', '--out_file4',
        dest='outfile4',
        help='output file for covariants'
    )
    parser.add_argument(
        '-o5', '--out_file5',
        dest='outfile5',
        help='output file for chimera removed sequences'
    )
    parser.add_argument(
        '-o6', '--out_file6',
        dest='outfile6',
        help='output file for deconvoluted covariants'
    )
    parser.add_argument(
        '-o7', '--out_file7',
        dest='outfile7',
        help='output file for nt variants'
    )
    parser.add_argument(
        '--use_count',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) use of counts in sequence IDs, default enabled (--use_count 1)'
    )
    parser.add_argument(
        '--min_count',
        type=int,
        default=10,
        help='Minimum observation count required to be included in sample reports (default: 10)'
    )
    parser.add_argument(
        '--min_samp_abund',
        type=float,
        default=0.001,
        help='Minimum abundance required for inclusion in sample reports; %% observed (.1 = 10%%), (default: .001)'
    )
    parser.add_argument(
        '--ntabund',
        type=float,
        default=.001,
        help='Minimum abundance relative to total reads required for a position to be reported in the nt call output; must be non-negative and < 1, %% observed (.1 = 10%%), (default: .001)'
    )
    parser.add_argument(
        '--max_dist',
        type=int,
        default=40,
        help='Maximum number of variances from the reference a sequence can have to be considered in covars processing (default: 40)'
    )
    parser.add_argument(
        '--max_covar',
        type=int,
        default=8,
        help='Maximum number of variances from the reference to be reported in covars (default: 8)'
    )
    parser.add_argument(
        '--AAreport',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) amino acid reporting, default enabled (--AAreport 1)'
    )
    parser.add_argument(
        '--AAcodonasMNP',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) reporting multiple nt changes in a single codon as one polymorphism, default enabled (--AAcodonasMNP 1), requires AAreport enabled'
    )
    parser.add_argument(
        '--alpha',
        type=float,
        default=1.2,
        help='Modifier for chim_rm chimera checking, default 1.2. Higher = more sensitive, more false chimeras removed; lower = less sensitive, fewer chimeras removed'
    )
    parser.add_argument(
        '--foldab',
        type=float,
        default=1.8,
        help='Threshold for potential parent / chimera abundance ratio for chim_rm; default is 1.8'
    )
    parser.add_argument(
        '--redist',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) redistribution of chimera counts for chim_rm, default enabled (--redist 1)'
    )
    parser.add_argument(
        '--beta',
        type=float,
        default=1,
        help='Modifier for covar pass checking, default 1. Higher = more sensitive, more failed checks; lower = less sensitive, fewer failed checks'
    )
    parser.add_argument(
        '--autopass',
        type=float,
        default=.3,
        help='threshold for a sequence to automatically pass the covar pass checking'
    )
    parser.add_argument(
        '--nt_call',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) nt_call output, default enabled (--nt_call 1)'
    )
    parser.add_argument(
        '--ntvar',
        type=int,
        default=0,
        choices=[0, 1],
        help='Enable/Disable (1/0) nt_var output, default disabled (--ntvar 0)'
    )
    parser.add_argument(
        '--indel',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) indel output, default enabled (--indel 1)'
    )
    parser.add_argument(
        '--seq',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) unique seq output, default enabled (--seq 1)'
    )
    parser.add_argument(
        '--covar', type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) covar output, default enabled (--covar 1)'
    )
    parser.add_argument(
        '--chim_rm',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) chim_rm output, default enabled (--chim_rm 1)'
    )
    parser.add_argument(
        '--deconv',
        type=int,
        default=1,
        choices=[0, 1],
        help='Enable/Disable (1/0) covar deconv, default enabled (--deconv 1)'
    )
    parser.add_argument(
        '--wgs',
        type=int,
        default=0,
        choices=[0, 1],
        help='Enable/Disable (1/0) whole genome sequencing mode, default disabled (--wgs 0)'
    )

    args = parser.parse_args()

    # checking for proper range of some parameters and consistency/compatibility
    if args.wgs == 1:
        if args.deconv == 1 or args.chim_rm == 1:
            args.deconv = 0
            args.chim_rm = 0
            print('WGS mode enabled, disabling chimera removal methods')

    return(args)

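# Illustrative invocation (hypothetical file names; flags as defined above).
# The seven output paths map to the reports written below: unique sequences,
# indels, nt calls, covariants, chim_rm, covar deconvolution, and nt variants:
#   python SAM_Refiner_Gal.py --samp Sample1 -r reference.fasta -S aligned.sam \
#       -o1 seqs.tsv -o2 indels.tsv -o3 nt_calls.tsv -o4 covars.tsv \
#       -o5 chim_rm.tsv -o6 deconv.tsv -o7 nt_vars.tsv
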
def get_ref(ref):  # get the reference ID and sequence from the FASTA file. Will only get the first.

    n = 0
    refID = ''
    refseq = ''
    if ref:
        for line in ref:
            if line.startswith('>'):
                n += 1
                if n > 1:
                    break
                refID = line[1:].strip("\n\r")
            elif n == 1:
                refseq = refseq + line.strip("\n\r")
        refseq = refseq.upper()

    return(refID, refseq)

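# Example (illustrative): given a FASTA stream
#   >ref_id description
#   acgt
#   ACGT
# get_ref returns ('ref_id description', 'ACGTACGT'); any records after the
# first are ignored.
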
def AAcall(codon):  # amino acid / codon dictionary to return encoded AAs
    AAdict = {
        'TTT' : 'F',
        'TTC' : 'F',
        'TTA' : 'L',
        'TTG' : 'L',
        'TCT' : 'S',
        'TCC' : 'S',
        'TCA' : 'S',
        'TCG' : 'S',
        'TAT' : 'Y',
        'TAC' : 'Y',
        'TAA' : '*',
        'TAG' : '*',
        'TGT' : 'C',
        'TGC' : 'C',
        'TGA' : '*',
        'TGG' : 'W',

        'CTT' : 'L',
        'CTC' : 'L',
        'CTA' : 'L',
        'CTG' : 'L',
        'CCT' : 'P',
        'CCC' : 'P',
        'CCA' : 'P',
        'CCG' : 'P',
        'CAT' : 'H',
        'CAC' : 'H',
        'CAA' : 'Q',
        'CAG' : 'Q',
        'CGT' : 'R',
        'CGC' : 'R',
        'CGA' : 'R',
        'CGG' : 'R',

        'ATT' : 'I',
        'ATC' : 'I',
        'ATA' : 'I',
        'ATG' : 'M',
        'ACT' : 'T',
        'ACC' : 'T',
        'ACA' : 'T',
        'ACG' : 'T',
        'AAT' : 'N',
        'AAC' : 'N',
        'AAA' : 'K',
        'AAG' : 'K',
        'AGT' : 'S',
        'AGC' : 'S',
        'AGA' : 'R',
        'AGG' : 'R',

        'GTT' : 'V',
        'GTC' : 'V',
        'GTA' : 'V',
        'GTG' : 'V',
        'GCT' : 'A',
        'GCC' : 'A',
        'GCA' : 'A',
        'GCG' : 'A',
        'GAT' : 'D',
        'GAC' : 'D',
        'GAA' : 'E',
        'GAG' : 'E',
        'GGT' : 'G',
        'GGC' : 'G',
        'GGA' : 'G',
        'GGG' : 'G'
    }
    AA = '?'
    if codon in AAdict:
        AA = AAdict[codon]

    return(AA)

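# Examples (illustrative): AAcall('ATG') returns 'M', AAcall('TGA') returns
# '*' (stop), and any codon not in the table, such as one containing 'N' or
# '-', returns '?'.
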
def singletCodon(ntPOS, nt, ref):  # process to return the AA and protein seq. position based on the reference and provided nt seq position and nt
    AAPOS = (ntPOS-1)//3
    AAmod = (ntPOS-1)%3
    codon = ""
    try:
        if AAmod == 0:
            codon = nt+ref[1][ntPOS]+ref[1][ntPOS+1]
        elif AAmod == 1:
            codon = ref[1][ntPOS-2]+nt+ref[1][ntPOS]
        elif AAmod == 2:
            codon = ref[1][ntPOS-3]+ref[1][ntPOS-2]+nt
    except:
        codon = "XXX"

    return(AAPOS+1, AAcall(codon))

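# Worked example (illustrative): with ref = ('id', 'ATGAAA'), a T at reference
# position 4 starts the second codon, so singletCodon(4, 'T', ref) builds
# 'T' + ref[1][4] + ref[1][5] = 'TAA' and returns (2, '*'). Positions falling
# past the end of the reference produce the placeholder codon "XXX" -> '?'.
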
def getCombos(qlist, clen):  # returns combinations of single polymorphisms in a sequence
    combos = []
    if (clen == 0 or clen > len(qlist)):
        clen = len(qlist)
    for N in range(1, clen+1):
        for comb in itertools.combinations(qlist, N):
            combos.append(' '.join(comb))
    return(combos)

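# Example (illustrative): getCombos(['5A', '10T'], 8) caps clen at 2 and
# returns ['5A', '10T', '5A 10T'], i.e. every combination of 1 up to clen
# polymorphisms, joined with spaces.
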
def SAMparse(args, ref, refprot, file):  # process SAM files
    covar_array = []
    seq_array = []

    nt_call_dict_dict = {}
    indel_dict = {}
    seq_species = {}
    sam_read_count = 0
    sam_line_count = 0
    firstPOS = 0
    lastPOS = 0
    coverage = {}
    for line in args.Sam_file:
        if not line.startswith('@'):  # ignore header lines
            splitline = line.split("\t")
            if ref[0].upper().startswith(splitline[2].upper()):  # check map ID matches reference ID
                if int(splitline[4]) > 0:  # Check mapping score is positive

                    abund_count = 1
                    if args.use_count == 1:  # get the unique sequence counts
                        if '-' in splitline[0] and '=' in splitline[0]:
                            eq_split = splitline[0].split('=')
                            dash_split = splitline[0].split('-')
                            if len(eq_split[-1]) > len(dash_split[-1]):
                                abund_count = int(dash_split[-1])
                            else:
                                abund_count = int(eq_split[-1])

                        elif '-' in splitline[0]:
                            try:
                                abund_count = int(splitline[0].split('-')[-1])
                            except ValueError:
                                pass

                        elif '=' in splitline[0]:
                            try:
                                abund_count = int(splitline[0].split('=')[-1])
                            except ValueError:
                                pass

                    sam_read_count += abund_count

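                    # Illustrative read IDs (assuming upstream read-collapsing
                    # tools append counts): with --use_count 1, 'seq105-1173'
                    # and 'seq105=1173' both yield abund_count 1173; when both
                    # separators appear, the shorter trailing field is taken;
                    # IDs with no parsable trailing count fall back to 1.
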
                    CIGAR = splitline[5]
                    POS = int(splitline[3])
                    if firstPOS == 0:
                        firstPOS = POS
                    elif POS < firstPOS:
                        firstPOS = POS

                    readID = splitline[0]
                    query_seq = splitline[9].upper()
                    run_length = 0
                    query_seq_parsed = ''
                    query_pos = 0
                    q_pars_pos = 0
                    mutations = []

                    for C in CIGAR:  # process sequence based on standard CIGAR line
                        if C == 'M' or C == 'I' or C == 'D' or C == 'S' or C == 'H':
                            if C == 'S':
                                query_pos = query_pos + run_length

                            if C == 'I':
                                if query_pos > 0:
                                    # add insertion to dict
                                    iPOS = q_pars_pos+POS

                                    iSeq = query_seq[query_pos: query_pos+run_length]
                                    istring = str(iPOS)+'-insert'+iSeq

                                    try:
                                        indel_dict[istring]
                                    except KeyError:
                                        indel_dict[istring] = abund_count
                                    else:
                                        indel_dict[istring] += abund_count

                                    if args.AAreport == 1 and (run_length % 3 == 0):

                                        iProt = ''
                                        if iPOS % 3 == 1:
                                            for x in range(0, (run_length//3)):
                                                AA = AAcall(iSeq[x*3]+iSeq[x*3+1]+iSeq[x*3+2])
                                                iProt = iProt + AA
                                            mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
                                        elif iPOS % 3 == 2:
                                            if query_pos > 0:
                                                ipSeq = query_seq[query_pos-1:query_pos+run_length+2]
                                            else:
                                                ipSeq = "XXX"+query_seq[query_pos+2:query_pos+run_length+2]
                                            for x in range(0, (run_length//3)+1):
                                                AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2])
                                                iProt = iProt + AA
                                            mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
                                        else:
                                            if query_pos > 1:
                                                ipSeq = query_seq[query_pos-2:query_pos+run_length+1]
                                            else:
                                                ipSeq = "XXX"+query_seq[query_pos+1:query_pos+run_length+1]

                                            for x in range(0, (run_length//3)+1):
                                                AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2])
                                                iProt = iProt + AA
                                            mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
                                    else:
                                        mutations.append(istring)

                                query_pos = query_pos + run_length

                            elif C == 'D':
                                for X in range(0, run_length):
                                    query_seq_parsed += '-'

                                delstring = str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'

                                if args.AAreport == 1 and (run_length % 3 == 0) and not ((q_pars_pos+POS) % 3 == 1):
                                    if (q_pars_pos+POS) % 3 == 2:
                                        newcodon = query_seq[query_pos-1:query_pos+2]
                                        newAArefpos = (q_pars_pos+POS) // 3
                                        mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')')
                                    else:
                                        newcodon = query_seq[query_pos-2:query_pos+1]
                                        newAArefpos = (q_pars_pos+POS) // 3
                                        mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')')
                                else:
                                    mutations.append(delstring)

                                if args.nt_call == 1:
                                    for N in range(q_pars_pos+POS, q_pars_pos+POS+int(run_length)):
                                        try:
                                            nt_call_dict_dict[N]
                                        except KeyError:
                                            nt_call_dict_dict[N] = {'A' : 0,
                                                                    'T' : 0,
                                                                    'C' : 0,
                                                                    'G' : 0,
                                                                    '-' : 0}
                                            nt_call_dict_dict[N]['-'] = abund_count
                                        else:
                                            nt_call_dict_dict[N]['-'] += abund_count

                                try:
                                    indel_dict[delstring]
                                except KeyError:
                                    indel_dict[delstring] = int(abund_count)
                                else:
                                    indel_dict[delstring] += int(abund_count)

                                q_pars_pos = q_pars_pos + run_length

                            elif C == 'M':
                                offset = q_pars_pos-query_pos
                                refPOS = POS+offset

                                for ntPOS in range(query_pos, query_pos+run_length):
                                    if query_seq[ntPOS] == 'A' or query_seq[ntPOS] == 'T' or query_seq[ntPOS] == 'C' or query_seq[ntPOS] == 'G':
                                        if query_seq[ntPOS] != ref[1][refPOS+ntPOS-1]:
                                            if args.AAreport == 1 and args.AAcodonasMNP == 0:
                                                AAinfo = singletCodon(refPOS+ntPOS, query_seq[ntPOS], ref)
                                                mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
                                            else:
                                                mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS])
                                        if args.nt_call == 1:
                                            try:
                                                nt_call_dict_dict[refPOS+ntPOS]
                                            except KeyError:
                                                nt_call_dict_dict[refPOS+ntPOS] = {'A' : 0,
                                                                                   'T' : 0,
                                                                                   'C' : 0,
                                                                                   'G' : 0,
                                                                                   '-' : 0}
                                                nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] = abund_count
                                            else:
                                                nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] += abund_count

                                q_pars_pos = q_pars_pos + run_length
                                query_pos = query_pos + run_length

                            run_length = 0

                        else:
                            run_length = (10 * run_length) + int(C)
                    # END CIGAR PARSE

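                    # Worked example (illustrative): for CIGAR '4S96M', the
                    # digit characters accumulate run_length (4, then 96);
                    # 'S' skips the 4 soft-clipped bases by advancing
                    # query_pos, and 'M' compares the next 96 query bases to
                    # the reference, recording mismatches in mutations and
                    # per-position counts in nt_call_dict_dict.
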
                    if len(mutations) == 0:  # record reference counts
                        if args.wgs == 0:
                            try:
                                seq_species['Reference'] += abund_count
                            except KeyError:
                                seq_species['Reference'] = abund_count
                        else:
                            try:
                                seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] += abund_count
                            except KeyError:
                                seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] = abund_count

                    else:  # record variants and counts
                        if args.AAreport == 1 and args.AAcodonasMNP == 1:
                            codonchecked = []
                            codon = ''
                            skip = 0
                            MNP = ''
                            for i in range(0, len(mutations)):  # checking for MNP
                                if 'Del' in mutations[i] or 'insert' in mutations[i]:
                                    codonchecked.append(mutations[i])
                                elif skip > 0:
                                    skip -= 1
                                else:
                                    mut1POS = int(''.join([c for c in mutations[i] if c.isdigit()]))
                                    try:
                                        mutations[i+1]
                                    except IndexError:
                                        AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
                                        codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
                                    else:
                                        if mut1POS % 3 == 0:
                                            AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
                                            codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
                                        else:
                                            mut2POS = int(''.join([c for c in mutations[i+1] if c.isdigit()]))
                                            if mut2POS - mut1POS < 3:
                                                AAPOS = mut1POS // 3
                                                if mut1POS % 3 == 1:
                                                    if mut2POS % 3 == 0:
                                                        codon = mutations[i][-1]+ref[1][mut1POS]+mutations[i+1][-1]
                                                        MNP = mutations[i][-1]+'r'+mutations[i+1][-1]
                                                        skip = 1
                                                    else:
                                                        try:
                                                            mutations[i+2]
                                                        except IndexError:
                                                            codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS]
                                                            MNP = mutations[i][-1]+mutations[i+1][-1]+'r'
                                                            skip = 1
                                                        else:
                                                            mut3POS = int(''.join([c for c in mutations[i+2] if c.isdigit()]))
                                                            if mut2POS == mut3POS - 1:
                                                                codon = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1]
                                                                MNP = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1]
                                                                skip = 2
                                                            else:
                                                                codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS]
                                                                MNP = mutations[i][-1]+mutations[i+1][-1]+'r'
                                                                skip = 1
                                                    codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')')
                                                elif mut2POS - mut1POS == 1:
                                                    codon = ref[1][mut1POS-2]+mutations[i][-1]+mutations[i+1][-1]
                                                    MNP = mutations[i][-1]+mutations[i+1][-1]
                                                    skip = 1
                                                    codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')')
                                                else:
                                                    AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
                                                    codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')

                                            else:
                                                AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
                                                codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
                            mutations = " ".join(codonchecked)
                        else:
                            mutations = " ".join(mutations)

                        if args.wgs == 0:
                            try:
                                seq_species[mutations] += abund_count
                            except KeyError:
                                seq_species[mutations] = abund_count
                        else:
                            try:
                                seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] += abund_count
                            except KeyError:
                                seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] = abund_count

                    if lastPOS < POS+q_pars_pos:
                        lastPOS = POS+q_pars_pos
                    for i in range(POS, POS+q_pars_pos):  # update coverage
                        try:
                            coverage[i] += abund_count
                        except KeyError:
                            coverage[i] = abund_count

    args.Sam_file.close()
    # END SAM LINES

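    # Worked example (illustrative): with ref[1] beginning 'ATGAAAGGG'
    # (refprot 'MKG'), the single nt variants '4T' and '5C' fall in one
    # codon, so with --AAcodonasMNP 1 they merge into '4TCr(K2S)': codon 2
    # becomes 'TCA', the trailing 'r' marks the third base left at the
    # reference, and K2S is the resulting amino acid change.
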
    total_mapped_reads = sam_read_count

    if args.seq == 1:  # output the unique sequences
        seq_fh = open(args.outfile1, "w")
        seq_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        seq_fh.write("Unique Sequence\tCount\tAbundance\n")

        sorted_seq = sorted(seq_species, key=seq_species.__getitem__, reverse=True)
        for key in sorted_seq:
            if seq_species[key] >= args.min_count:
                if (seq_species[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0:
                    seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}\n")
                    seq_array.append(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}")
                elif args.wgs == 1:
                    splitseqs = key.split()
                    cov = []
                    for x in range(int(splitseqs[0]), int(splitseqs[-1])):
                        cov.append(coverage[x])
                    min_cov = min(cov)
                    if (seq_species[key]/min_cov >= args.min_samp_abund):
                        seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/min_cov):.3f}\n")
            else:
                break

        seq_fh.close()
    # END SEQ OUT

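    # Illustrative output (outfile1), assuming a sample named 'Sample1' with
    # 1000 mapped reads and one variant seen 250 times (tab-separated):
    #   Sample1(1000)
    #   Unique Sequence\tCount\tAbundance
    #   4TCr(K2S)\t250\t0.250
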
    if args.indel == 1:  # and len(indel_dict) > 0: # output indels, if there are any
        sorted_indels = sorted(indel_dict, key=indel_dict.__getitem__, reverse=True)
        indels_to_write = []
        for key in sorted_indels:
            if indel_dict[key] >= args.min_count:
                if indel_dict[key] / sam_read_count >= args.min_samp_abund and args.wgs == 0:
                    indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key]/sam_read_count):.3f}\n")
                elif args.wgs == 1:
                    indelPOS = ''
                    for c in key:
                        if c.isdigit():
                            indelPOS += c
                        else:
                            break
                    indelPOS = int(indelPOS)
                    if indel_dict[key] / coverage[indelPOS] >= args.min_samp_abund:
                        indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key] / coverage[indelPOS]):.3f}\n")
            else:
                break
        # if len(indels_to_write) > 0:
        indel_fh = open(args.outfile2, "w")
        indel_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        indel_fh.write("Indel\tCount\tAbundance\n")
        for indel_entry in indels_to_write:
            indel_fh.write(indel_entry)
        indel_fh.close()
    # END INDEL OUT

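    # Key formats (from the CIGAR parsing above, values illustrative):
    # insertions are recorded as '<POS>-insert<SEQ>' (e.g. '100-insertTTT')
    # and deletions as '<start>-<end>Del' (e.g. '100-102Del'); outfile2
    # reports each key with its count and abundance.
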
    if args.nt_call == 1:  # output nt calls
        ntcall_fh = open(args.outfile3, "w")
        ntcall_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        if args.ntvar == 1:
            ntcallv_fh = open(args.outfile7, "w")
            ntcallv_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        sorted_POS = sorted(nt_call_dict_dict)
        if args.AAreport == 1:
            ntcall_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n")
            if args.ntvar == 1:
                ntcallv_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n")
            for POS in sorted_POS:
                try:
                    total = coverage[POS]
                except KeyError:
                    total = 0
                if total >= (sam_read_count * args.ntabund):
                    AAinfo = singletCodon(POS, ref[1][POS-1], ref)
                    POS_calls = {}
                    for key in nt_call_dict_dict[POS]:
                        POS_calls[key] = nt_call_dict_dict[POS][key]
                    sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True)

                    ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
                    ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                    ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
                    ntcall_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
                    if sorted_calls[0] != ref[1][POS-1]:
                        if args.ntvar == 1:
                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                            ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
                            ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")

                        mod = (POS)%3

                        if mod == 0:
                            try:
                                codon = ref[1][POS-3]+ref[1][POS-2]+sorted_calls[0]
                            except IndexError:
                                codon = 'NNN'
                        elif mod == 2:
                            try:
                                codon = ref[1][POS-2]+sorted_calls[0]+ref[1][POS]
                            except IndexError:
                                codon = 'NNN'
                        elif mod == 1:
                            try:
                                codon = sorted_calls[0]+ref[1][POS]+ref[1][POS+1]
                            except IndexError:
                                codon = 'NNN'
                        ntcall_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1])
                        if args.ntvar == 1:
                            ntcallv_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1])
                        if (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total >= args.min_samp_abund):
                            if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1:
                                ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
                            ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
                            if nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count:
                                if nt_call_dict_dict[POS][sorted_calls[2]] / total >= args.min_samp_abund:
                                    if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
                                        ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
                                    ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")

                        if args.ntvar == 1:
                            ntcallv_fh.write("\n")
                    elif (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] / total >= args.min_samp_abund):
                        if args.ntvar == 1:
                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                            ntcallv_fh.write("\t"+str(total)+"\t\t")
                            ntcallv_fh.write(f"\t")
                            ntcallv_fh.write("\t\t")
                            ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
                        ntcall_fh.write("\t\t")
                        ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])

                        if (nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total >= args.min_samp_abund):
                            ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
                            if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
                                ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
                        if args.ntvar == 1:
                            ntcallv_fh.write("\n")

                    ntcall_fh.write("\n")
        else:
            ntcall_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n")
            if args.ntvar == 1:
                ntcallv_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n")

            for POS in sorted_POS:
                try:
                    total = coverage[POS]  # sum(nt_call_dict_dict[POS].values())
                except KeyError:
                    total = 0
                if total >= (sam_read_count * args.ntabund):
                    POS_calls = {}
                    for key in nt_call_dict_dict[POS]:
                        POS_calls[key] = nt_call_dict_dict[POS][key]
                    sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True)

                    ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1])
                    ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                    ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")

                    if sorted_calls[0] != ref[1][POS-1]:
                        if args.ntvar == 1:
                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1])
                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                            ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
                            ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
                        if (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund):
                            ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
                            if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1:
                                ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
                            if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund):
                                ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
                                if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
                                    ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
                        if args.ntvar == 1:
                            ntcallv_fh.write("\n")

                    elif (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund):
                        if args.ntvar == 1:
                            ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1])
                            ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
                            ntcallv_fh.write("\t"+str(total)+"\t\t")
                            ntcallv_fh.write(f"\t")
                            ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
                        ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
                        if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund):
                            ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
                            if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
                                ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
                        if args.ntvar == 1:
                            ntcallv_fh.write("\n")

                    ntcall_fh.write("\n")

        ntcall_fh.close()
        if args.ntvar == 1:
            ntcallv_fh.close()
    # END NT CALL OUT
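    # Note (illustrative arithmetic): a position is reported only when its
    # coverage reaches sam_read_count * ntabund; with 10,000 mapped reads and
    # the default --ntabund .001, that means a coverage of at least 10 reads.
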
    if args.covar == 1:  # output covariants
        testtrack = 0
        combinations = {}
        for sequence in seq_species:
            if args.wgs == 0:
                singles = sequence.split()
            else:
                singles = (sequence.split())[1:-1]
            if len(singles) <= args.max_dist and singles[0] != 'Ref':
                for combo in getCombos(singles, args.max_covar):
                    if not combo in combinations:
                        combinations[combo] = seq_species[sequence]
                    else:
                        combinations[combo] += seq_species[sequence]

        covar_fh = open(args.outfile4, "w")
        covar_fh.write(args.samp+"("+str(sam_read_count)+")\n")
        covar_fh.write("Co-Variants\tCount\tAbundance\n")
        sortedcombos = sorted(combinations, key=combinations.__getitem__, reverse=True)
        for key in sortedcombos:
            if (combinations[key] >= args.min_count):
                if (combinations[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0:
                    covar_fh.write(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}\n")
                    covar_array.append(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}")
                elif args.wgs == 1:
                    coveragepercent = 0
                    splitcombos = key.split()
                    if len(splitcombos) == 1:
                        coveragePOS = ''
                        for c in key:
                            if c.isdigit():
                                coveragePOS += c
                            else:
                                break
                        coveragepercent = combinations[key] / coverage[int(coveragePOS)]
                    else:
                        startcovPOS = ''
                        for c in splitcombos[0]:
                            if c.isdigit():
                                startcovPOS += c
                            else:
                                break
                        endcovPOS = ''
                        for c in splitcombos[-1]:
                            if c.isdigit():
                                endcovPOS += c
                            else:
                                break
                        coveragevals = []
                        for i in range(int(startcovPOS), int(endcovPOS)+1):
                            coveragevals.append(coverage[i])
                        mincov = min(coveragevals)
                        coveragepercent = combinations[key] / mincov
                    if coveragepercent >= args.min_samp_abund:
                        covar_fh.write(f"{key}\t{combinations[key]}\t{coveragepercent:.3f}\n")

        covar_fh.close()

    # END COVAR OUT

    return(covar_array, seq_array, sam_read_count)

def cvdeconv(args, covardict, seqdict):  # covar deconvolution process

    passedseqs = {}
    preservedseqs = {}
    for seq in seqdict:  # pass check for actual : expected abundance
        if seq != 'total' and seq != 'singles':

            splitseq = seq.split(' ')
            abund = 1
            for sing in splitseq:
                try:
                    abund = abund * (covardict[sing] / covardict['total'])
                except KeyError:
                    abund = abund * (seqdict[seq] / seqdict['total'])

            try:
                covarabund = covardict[seq]/covardict['total']
            except KeyError:
                covarabund = seqdict[seq]/seqdict['total']
                covardict[seq] = seqdict[seq]

            if covarabund >= args.autopass:
                preservedseqs[seq] = max(1, args.beta, (covarabund / abund))
            elif covarabund >= abund * args.beta:
                passedseqs[seq] = covarabund / abund
            elif len(seq.split(' ')) == 1:
                passedseqs[seq] = max(1, args.beta)

    if args.min_samp_abund < 1:
        min_count = args.min_samp_abund * covardict['total']
    else:
        min_count = args.min_samp_abund

    # sort passed covars
    lensortedpassed = sorted(passedseqs, key = lambda key : len(key.split(' ')), reverse=True)
    ratiolensortedpassed = sorted(lensortedpassed, key = lambda key : passedseqs[key], reverse = True)
    sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__)
    deconved = {}
    for seq in ratiolensortedpassed:  # reassign counts
        singles = seq.split(' ')
        first = 0
        rem_count = 0
        for sing in sortedsingles:
            if sing in singles:
                if covardict['singles'][sing] > 0:
                    if first == 0:
                        first = 1
                        rem_count = covardict['singles'][sing]
                        covardict['singles'][sing] = 0
                        deconved[seq] = rem_count
                    else:
                        covardict['singles'][sing] = covardict['singles'][sing] - rem_count
                else:
                    break
        sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__)

    sortedpreserved = sorted(preservedseqs, key = lambda key : covardict[key])

    for seq in sortedpreserved:
        singles = seq.split(' ')
        first = 0
        rem_count = 0
        for sing in sortedsingles:
            if sing in singles:
                if covardict['singles'][sing] > 0:
                    if first == 0:
                        first = 1
                        rem_count = covardict['singles'][sing]
                        covardict['singles'][sing] = 0
                        deconved[seq] = rem_count
                    else:
                        covardict['singles'][sing] = covardict['singles'][sing] - rem_count
                else:
                    break
        sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__)

    newtotal = sum(deconved.values())
    fh_deconv = open(args.outfile6, "w")
    fh_deconv.write(f"{args.samp}({covardict['total']}) | ({newtotal})\tCount\tAbundance\n")
    sorted_deconved = sorted(deconved, key = deconved.__getitem__, reverse = True)
    for seq in sorted_deconved:  # write deconv
        if deconved[seq] >= min_count:
            fh_deconv.write(f"{seq}\t{deconved[seq]}\t{(deconved[seq]/newtotal):.3f}\n")
    fh_deconv.close()
    # END COVAR DECONV OUT

    return()

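# Worked example (illustrative values): if singles '5A' and '11T' have
# abundances 0.4 and 0.3, the expected abundance of the covariant '5A 11T'
# is 0.4 * 0.3 = 0.12. With the default --beta 1 the covariant passes when
# its observed abundance is at least 0.12, and anything at or above the
# default --autopass .3 is preserved regardless of the expected value.
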
def dechim(args, seqs):  # processing sequence dictionary to remove chimeras
    total = seqs['total']
    del seqs['total']
    sorted_seqs = sorted(seqs, key=seqs.__getitem__)  # sort sequences by abundance, least to greatest
    chimeras = []
    for seq in sorted_seqs:
        pot_chim = ['Reference']+seq.split()+['Reference']
        chim_halves = []
        for i in range(0, len(pot_chim)-1):  # create dimera halves
            chim_halves.append([pot_chim[:i+1], pot_chim[i+1:]])
        parent_pairs = []

        for dimera in chim_halves:  # check for potential parents

            pot_left = []
            pot_rt = []
            lft_len = len(dimera[0])
            rt_len = len(dimera[1])
            for pseq in sorted_seqs:
                if not seq == pseq:
                    if (seqs[pseq] >= (seqs[seq] * args.foldab)):
                        pot_par = ['Reference']+pseq.split()+['Reference']
                        if dimera[0] == pot_par[:lft_len]:
                            pot_left.append(pot_par)
                        if ((len(pot_par) >= rt_len) and (dimera[1] == pot_par[(len(pot_par)-rt_len):])):
                            pot_rt.append(pot_par)

            if (len(pot_left) > 0 and len(pot_rt) > 0):
                for left_par in pot_left:  # check potential parents' pairing
                    for rt_par in pot_rt:
                        if left_par != rt_par:
                            left_break = left_par[lft_len]
                            rt_break = rt_par[(len(rt_par)-rt_len)-1]
                            if left_break == 'Reference' or rt_break == 'Reference':
                                parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])])
                            else:
                                left_break_POS = ''
                                for c in left_break:
                                    if c.isdigit():
                                        left_break_POS += c
                                    else:
                                        break

                                rt_break_POS = ''
                                for c in rt_break:
                                    if c.isdigit():
                                        rt_break_POS += c
                                    else:
                                        break

                                if int(left_break_POS) > int(rt_break_POS):
                                    parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])])

        par_tot_abund = 0
        pair_probs = []
        for parents in parent_pairs:  # calc 'expected' abundance
            pair_prob = (seqs[parents[0]] / total) * (seqs[parents[1]] / total)
            par_tot_abund += pair_prob
            pair_probs.append(pair_prob)

        recomb_count = par_tot_abund * total

        if not seqs[seq] >= recomb_count * args.alpha:  # chimera check
            redist_count = float(seqs[seq])
            chimeras.append(seq)
            seqs[seq] = 0
            if args.redist == 1:  # redist counts of chimera
                toadd = {}
                for i in range(0, len(parent_pairs)):
                    counts_to_redist = (redist_count * (pair_probs[i]/par_tot_abund))/2
                    seqs[parent_pairs[i][0]] += counts_to_redist
                    seqs[parent_pairs[i][1]] += counts_to_redist

    for chim in chimeras:  # remove chimeras
        del seqs[chim]

    total = sum(seqs.values())
    seqs['total'] = total

    return(seqs)

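# Worked example (illustrative values): suppose '5A 11T' has potential
# parents '5A' (abundance 0.30) and '11T' (abundance 0.20). The pair
# probability is 0.30 * 0.20 = 0.06, so recomb_count = 0.06 * total; with
# the default --alpha 1.2 the sequence is flagged as a chimera (and its
# counts redistributed when --redist 1) if its count falls below
# 1.2 * recomb_count.
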
def chimrm(args, seqs):  # chimera removal process

    pre_len = len(seqs)
    inf_loop_shield = 0
    while True:  # send sequences for chimera removal while chimeras are still found
        dechim(args, seqs)
        post_len = len(seqs)
        inf_loop_shield += 1
        if post_len >= pre_len:
            break
        if inf_loop_shield > 100:
            break
        pre_len = len(seqs)

    total = seqs['total']
    del seqs['total']
    if args.min_samp_abund < 1:
        min_count = args.min_samp_abund * total
    else:
        min_count = args.min_samp_abund
    sorted_seqs = sorted(seqs, key=seqs.__getitem__, reverse=True)
    fh_dechime = open(args.outfile5, 'w')
    fh_dechime.write(f"{args.samp}({int(total)})\n")
    fh_dechime.write("Sequences\tCount\tAbundance\n")
    for seq in sorted_seqs:  # write chim_rm seqs, most abundant first
        abund = seqs[seq]/total
        if seqs[seq] >= min_count:
            fh_dechime.write(f"{seq}\t{round(seqs[seq])}\t{abund:.3f}\n")

    fh_dechime.close()
    return()

def main():

    args = arg_parser()  # getting command line arguments

    covar_array = []
    seq_array = []
    total_mapped_reads = 0

    if args.ref:
        ref = get_ref(args.ref)  # get the reference ID and sequence from the FASTA file
        args.ref.close()
        if ref[1] == '':
            print('Reference not recognized as a Fasta format, skipping SAM parsing')
        else:
            refprot = ''
            if args.AAreport == 1:  # make an amino acid sequence based on the reference sequence
                for x in range(0, (len(ref[1])-1)//3):
                    AA = AAcall(ref[1][x*3]+ref[1][x*3+1]+ref[1][x*3+2])
                    refprot = refprot + AA
                if (len(ref[1])-1)%3 != 0:
                    refprot = refprot + '?'

            covar_array, seq_array, total_mapped_reads = SAMparse(args, ref, refprot, args.Sam_file)

    else:
        print('No reference provided, skipping SAM parsing')

    in_covars = {'total' : total_mapped_reads,
                 'singles' : {}
                 }
    in_seqs = {'total' : total_mapped_reads}
    # Begin chimera removal processing if enabled
    if args.chim_rm == 1 or args.deconv == 1:
        if args.deconv == 1:
            for line in covar_array:
                lineparts = line.split("\t")
                in_covars[lineparts[0]] = int(lineparts[1])
                if len(lineparts[0].split(' ')) == 1:
                    in_covars['singles'][lineparts[0]] = int(lineparts[1])

        for line in seq_array:
            lineparts = line.split("\t")
            in_seqs[lineparts[0]] = float(lineparts[1])

        if args.deconv == 1:
            cvdeconv(args, in_covars, in_seqs)

        if args.chim_rm == 1:
            chimrm(args, in_seqs)

if __name__ == '__main__':
    main()