annotate SAM_Refiner_Gal.py @ 1:b68f7e37e70a draft default tip

Uploaded xml
author degregory
date Mon, 13 Sep 2021 14:14:20 +0000
parents 22e3f843f1db
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1 #!/bin/env python3
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
2 # Writen by Devon Gregory with assistance from Christopher Bottoms
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
3 # University of Missouri
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
4 # Distributed under GNU GENERAL PUBLIC LICENSE v3
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
5 import os
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
6 import sys
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
7 import argparse
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
8 import itertools
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
9 import time
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
10 from multiprocessing import Process
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
11 from pathlib import Path
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
12
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
13 # process for collecing command line arguments
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
14 def arg_parser():
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
15
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
16 parser = argparse.ArgumentParser(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
17 description='process Sam files for variant information'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
18 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
19
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
20 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
21 '--samp',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
22 type=str,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
23 default='Sample',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
24 help='Sample Name'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
25 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
26 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
27 '-r', '-reference',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
28 type=argparse.FileType('r'),
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
29 dest='ref',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
30 help='reference fasta'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
31 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
32 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
33 '-S', '--Sam_file',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
34 type=argparse.FileType('r'),
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
35 dest='Sam_file',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
36 help='.sam file'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
37 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
38 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
39 '-o1', '--out_file1',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
40 dest='outfile1',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
41 help='output file'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
42 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
43 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
44 '-o2', '--out_file2',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
45 dest='outfile2',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
46 help='output file'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
47 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
48 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
49 '-o3', '--out_file3',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
50 dest='outfile3',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
51 help='output file'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
52 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
53 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
54 '-o4', '--out_file4',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
55 dest='outfile4',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
56 help='output file'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
57 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
58 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
59 '-o5', '--out_file5',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
60 dest='outfile5',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
61 help='output file'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
62 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
63 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
64 '-o6', '--out_file6',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
65 dest='outfile6',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
66 help='output file'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
67 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
68 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
69 '-o7', '--out_file7',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
70 dest='outfile7',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
71 help='output file'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
72 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
73 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
74 '--use_count',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
75 type=int,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
76 default=1,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
77 choices=[0, 1],
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
78 help='Enable/Disable (1/0) use of counts in sequence IDs, default enabled (--use_count 1)'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
79 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
80 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
81 '--min_count',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
82 type=int,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
83 default=10,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
84 help='Minimum observations required to be included in sample reports; >= 1 occurance count; < 1 %% observed (.1 = 10%%), (default: .001)'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
85 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
86 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
87 '--min_samp_abund',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
88 type=float,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
89 default=0.001,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
90 help='Minimum abundance required for inclusion in sample reports; %% observed (.1 = 10%%), (default: .001)'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
91 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
92 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
93 '--ntabund',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
94 type=float,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
95 default=.001,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
96 help='Minimum abundance relative to total reads required for a position to be reported in the nt call output; must be non-negative and < 1, %% observed (.1 = 10%%), (default: .001)'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
97 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
98 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
99 '--max_dist',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
100 type=int,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
101 default=40,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
102 help='Maximum number of variances from the reference a sequence can have to be consider in covars processing (default: 40)'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
103 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
104 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
105 '--max_covar',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
106 type=int,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
107 default=8,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
108 help='Maximum number of variances from the reference to be reported in covars (default: 8)'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
109 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
110 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
111 '--AAreport',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
112 type=int,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
113 default=1,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
114 choices=[0, 1],
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
115 help='Enable/Disable (1/0) amino acid reporting, default enabled (--AAreport 1)'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
116 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
117 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
118 '--AAcodonasMNP',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
119 type=int,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
120 default=1,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
121 choices=[0, 1],
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
122 help='Enable/Disable (1/0) reporting multiple nt changes in a single codon as one polymorphism, default enabled (--AAcodonasMNP 1), requires AAreport enabled'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
123 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
124 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
125 '--alpha',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
126 type=float,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
127 default=1.2,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
128 help='Modifier for chim_rm chimera checking, default 1.2. Higher = more sensitive, more false chimeras removed; lower = less sensitive, fewer chimeras removed'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
129 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
130 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
131 '--foldab',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
132 type=float,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
133 default=1.8,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
134 help='Threshold for potential parent / chimera abundance ratio for chim_rm; default is 1.8'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
135 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
136 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
137 '--redist',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
138 type=int,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
139 default=1,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
140 choices=[0, 1],
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
141 help='Enable/Disable (1/0) redistribution of chimera counts for chim_rm, default enabled (--redist 1)'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
142 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
143 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
144 '--beta',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
145 type=float,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
146 default=1,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
147 help='Modifier for covar pass checking, default 1. Higher = more sensitive, more failed checks; lower = less sensitive, fewer failed checks'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
148 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
149 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
150 '--autopass',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
151 type=float,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
152 default=.3,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
153 help='threshold for a sequence to automatically pass the covar pass checking'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
154 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
155 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
156 '--nt_call',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
157 type=int,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
158 default=1,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
159 choices=[0, 1],
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
160 help='Enable/Disable (1/0) nt_call output, default enabled (--nt_call 1)'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
161 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
162 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
163 '--ntvar',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
164 type=int,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
165 default=0,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
166 choices=[0, 1],
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
167 help='Enable/Disable (1/0) nt_var output, default enabled (--nt_call 1)'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
168 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
169 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
170 '--indel',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
171 type=int,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
172 default=1,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
173 choices=[0, 1],
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
174 help='Enable/Disable (1/0) indel output, default enabled (--indel 1)'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
175 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
176 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
177 '--seq',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
178 type=int,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
179 default=1,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
180 choices=[0, 1],
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
181 help='Enable/Disable (1/0) unique seq output, default enabled (--seq 1)'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
182 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
183 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
184 '--covar', type=int,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
185 default=1,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
186 choices=[0, 1],
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
187 help='Enable/Disable (1/0) covar output, default enabled (--covar 1)'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
188 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
189 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
190 '--chim_rm',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
191 type=int,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
192 default=1,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
193 choices=[0, 1],
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
194 help='Enable/Disable (1/0) chim_rm output, default enabled (--chim_rm 1)'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
195 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
196 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
197 '--deconv',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
198 type=int,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
199 default=1,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
200 choices=[0, 1],
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
201 help='Enable/Disable (1/0) covar deconv, default enabled (--deconv 1)'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
202 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
203 parser.add_argument(
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
204 '--wgs',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
205 type=int,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
206 default=0,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
207 choices=[0, 1],
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
208 help='Enable/Disable (1/0) covar deconv, default enabled (--deconv 1)'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
209 )
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
210
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
211 args = parser.parse_args()
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
212
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
213 # checking for proper range of some parameters and consistency/compatibility
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
214 if args.wgs == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
215 if args.deconv == 1 or args.chim_rm == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
216 args.deconv = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
217 args.chim_rm = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
218 print('WGS mode enabled, disabling chimera removal methods')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
219
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
220 return(args)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
221
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
222 def get_ref(ref): # get the reference ID and sequence from the FASTA file. Will only get the first.
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
223
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
224 n=0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
225 refID = ''
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
226 refseq = ''
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
227 if ref:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
228 for line in ref:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
229 if line.startswith('>'):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
230 n+=1
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
231 if n > 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
232 break
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
233 refID = line[1:].strip("\n\r")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
234 elif n == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
235 refseq = refseq + line.strip("\n\r")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
236 refseq = refseq.upper()
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
237
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
238
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
239 return(refID, refseq)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
240
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
241 def AAcall(codon): # amino acid / codon dictionary to return encoded AAs
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
242 AAdict = {
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
243 'TTT' : 'F',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
244 'TTC' : 'F',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
245 'TTA' : 'L',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
246 'TTG' : 'L',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
247 'TCT' : 'S',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
248 'TCC' : 'S',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
249 'TCA' : 'S',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
250 'TCG' : 'S',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
251 'TAT' : 'Y',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
252 'TAC' : 'Y',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
253 'TAA' : '*',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
254 'TAG' : '*',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
255 'TGT' : 'C',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
256 'TGC' : 'C',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
257 'TGA' : '*',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
258 'TGG' : 'W',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
259
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
260 'CTT' : 'L',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
261 'CTC' : 'L',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
262 'CTA' : 'L',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
263 'CTG' : 'L',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
264 'CCT' : 'P',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
265 'CCC' : 'P',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
266 'CCA' : 'P',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
267 'CCG' : 'P',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
268 'CAT' : 'H',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
269 'CAC' : 'H',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
270 'CAA' : 'Q',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
271 'CAG' : 'Q',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
272 'CGT' : 'R',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
273 'CGC' : 'R',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
274 'CGA' : 'R',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
275 'CGG' : 'R',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
276
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
277 'ATT' : 'I',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
278 'ATC' : 'I',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
279 'ATA' : 'I',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
280 'ATG' : 'M',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
281 'ACT' : 'T',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
282 'ACC' : 'T',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
283 'ACA' : 'T',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
284 'ACG' : 'T',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
285 'AAT' : 'N',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
286 'AAC' : 'N',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
287 'AAA' : 'K',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
288 'AAG' : 'K',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
289 'AGT' : 'S',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
290 'AGC' : 'S',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
291 'AGA' : 'R',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
292 'AGG' : 'R',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
293
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
294 'GTT' : 'V',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
295 'GTC' : 'V',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
296 'GTA' : 'V',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
297 'GTG' : 'V',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
298 'GCT' : 'A',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
299 'GCC' : 'A',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
300 'GCA' : 'A',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
301 'GCG' : 'A',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
302 'GAT' : 'D',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
303 'GAC' : 'D',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
304 'GAA' : 'E',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
305 'GAG' : 'E',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
306 'GGT' : 'G',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
307 'GGC' : 'G',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
308 'GGA' : 'G',
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
309 'GGG' : 'G'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
310 }
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
311 AA = '?'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
312 if codon in AAdict:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
313 AA = AAdict[codon]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
314
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
315 return(AA)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
316
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
317 def singletCodon(ntPOS, nt, ref): # process to return the AA and protein seq. position based on the reference and provided nt seq position and nt
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
318 AAPOS = (ntPOS-1)//3
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
319 AAmod = (ntPOS-1)%3
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
320 codon = ""
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
321 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
322 if AAmod == 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
323 codon = nt+ref[1][ntPOS]+ref[1][ntPOS+1]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
324 elif AAmod == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
325 codon = ref[1][ntPOS-2]+nt+ref[1][ntPOS]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
326 elif AAmod == 2:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
327 codon = ref[1][ntPOS-3]+ref[1][ntPOS-2]+nt
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
328 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
329 codon = "XXX"
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
330
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
331 return(AAPOS+1, AAcall(codon))
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
332
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
333 def getCombos(qlist, clen): # returns combinations of single polymorphisms in a sequence
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
334 combos = []
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
335 if (clen == 0 or clen > len(qlist)):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
336 clen = len(qlist)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
337 for N in range(1, clen+1):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
338 for comb in itertools.combinations(qlist, N):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
339 combos.append(' '.join(comb))
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
340 return(combos)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
341
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
342 def SAMparse(args, ref, refprot, file): # process SAM files
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
343 covar_array = []
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
344 seq_array = []
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
345
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
346 nt_call_dict_dict = {}
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
347 indel_dict = {}
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
348 seq_species = {}
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
349 sam_read_count = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
350 sam_line_count = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
351 firstPOS = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
352 lastPOS = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
353 coverage = {}
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
354 for line in args.Sam_file:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
355 if not line.startswith('@'): # ignore header lines
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
356 splitline = line.split("\t")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
357 if ref[0].upper().startswith(splitline[2].upper()): # check map ID matches referecne ID
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
358 if int(splitline[4]) > 0: # Check mapping score is positive
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
359
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
360 abund_count=1
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
361 if args.use_count == 1: # get the unique sequence counts
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
362 if '-' in splitline[0] and '=' in splitline[0]:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
363 eq_split = splitline[0].split('=')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
364 dash_split = splitline[0].split('-')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
365 if len(eq_split[-1]) > len(dash_split[-1]):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
366 abund_count=int(dash_split[-1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
367 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
368 abund_count=int(eq_split[-1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
369
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
370 elif '-' in splitline[0]:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
371 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
372 abund_count=int(splitline[0].split('-')[-1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
373 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
374 pass
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
375
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
376 elif '=' in splitline[0]:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
377 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
378 abund_count=int(splitline[0].split('=')[-1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
379 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
380 pass
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
381
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
382
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
383 sam_read_count += abund_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
384
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
385 CIGAR = splitline[5]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
386 POS = int(splitline[3])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
387 if firstPOS == 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
388 firstPOS = POS
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
389 elif POS < firstPOS:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
390 firstPOS = POS
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
391
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
392 readID = splitline[0]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
393 query_seq = splitline[9].upper()
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
394 run_length = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
395 query_seq_parsed = ''
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
396 query_pos = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
397 q_pars_pos = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
398 mutations = []
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
399
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
400 for C in CIGAR: # process sequence based on standard CIGAR line
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
401 if C == 'M' or C == 'I' or C == 'D' or C == 'S' or C == 'H':
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
402 if C == 'S':
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
403 query_pos = query_pos + run_length
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
404
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
405 if C == 'I':
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
406 if query_pos > 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
407 # add insertion to dict
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
408 iPOS = q_pars_pos+POS
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
409
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
410 iSeq = query_seq[query_pos: query_pos+run_length]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
411 istring = str(iPOS)+'-insert'+iSeq
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
412
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
413 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
414 indel_dict[istring]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
415 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
416 indel_dict[istring] = abund_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
417 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
418 indel_dict[istring] += abund_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
419
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
420 if args.AAreport == 1 and (run_length % 3 == 0):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
421
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
422 iProt = ''
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
423 if iPOS % 3 == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
424 for x in range(0, (run_length//3)):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
425 AA = AAcall(iSeq[x*3]+iSeq[x*3+1]+iSeq[x*3+2])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
426 iProt = iProt + AA
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
427 mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
428 elif iPOS % 3 == 2:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
429 if query_pos > 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
430 ipSeq = query_seq[query_pos-1:query_pos+run_length+2]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
431 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
432 ipSeq = "XXX"+query_seq[query_pos+2:query_pos+run_length+2]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
433 for x in range(0, (run_length//3)+1):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
434 AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
435 iProt = iProt + AA
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
436 mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
437 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
438 if query_pos > 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
439 ipSeq = query_seq[query_pos-2:query_pos+run_length+1]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
440 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
441 ipSeq = "XXX"+query_seq[query_pos+1:query_pos+run_length+1]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
442
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
443 for x in range(0, (run_length//3)+1):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
444 AA = AAcall(ipSeq[x*3]+ipSeq[x*3+1]+ipSeq[x*3+2])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
445 iProt = iProt + AA
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
446 mutations.append(istring + '(' + str((iPOS//3)+1) + iProt + ')')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
447 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
448 mutations.append(istring)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
449
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
450 query_pos = query_pos + run_length
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
451
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
452 elif C == 'D':
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
453 for X in range(0, run_length):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
454 query_seq_parsed += '-'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
455
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
456 delstring = str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
457
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
458 if args.AAreport == 1 and (run_length % 3 == 0) and not ((q_pars_pos+POS) % 3 == 1 ):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
459 if (q_pars_pos+POS) % 3 == 2:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
460 newcodon = query_seq[query_pos-1:query_pos+2]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
461 newAArefpos = (q_pars_pos+POS) // 3
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
462 mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
463 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
464 newcodon = query_seq[query_pos-2:query_pos+1]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
465 newAArefpos = (q_pars_pos+POS) // 3
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
466 mutations.append(delstring + '(' + refprot[newAArefpos] + str(newAArefpos+1) + AAcall(newcodon) + ')')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
467 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
468 mutations.append(delstring)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
469
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
470 if args.nt_call == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
471 for N in range(q_pars_pos+POS, q_pars_pos+POS+int(run_length)):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
472 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
473 nt_call_dict_dict[N]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
474 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
475 nt_call_dict_dict[N] = {'A' : 0,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
476 'T' : 0,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
477 'C' : 0,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
478 'G' : 0,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
479 '-' : 0}
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
480 nt_call_dict_dict[N]['-'] = abund_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
481 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
482 nt_call_dict_dict[N]['-'] += abund_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
483
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
484 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
485 indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del']
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
486 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
487 indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] = int(abund_count)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
488 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
489 indel_dict[str(q_pars_pos+POS)+'-'+str(q_pars_pos+POS+run_length-1)+'Del'] += int(abund_count)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
490
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
491 q_pars_pos = q_pars_pos + run_length
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
492
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
493 elif C == 'M':
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
494 offset = q_pars_pos-query_pos
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
495 refPOS = POS+offset
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
496
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
497 for ntPOS in range(query_pos, query_pos+run_length):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
498 if query_seq[ntPOS] == 'A' or query_seq[ntPOS] == 'T' or query_seq[ntPOS] == 'C' or query_seq[ntPOS] == 'G':
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
499 if query_seq[ntPOS] != ref[1][refPOS+ntPOS-1]:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
500 if args.AAreport == 1 and args.AAcodonasMNP == 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
501 AAinfo = singletCodon(refPOS+ntPOS, query_seq[ntPOS], ref)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
502 mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
503 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
504 mutations.append(str(refPOS+ntPOS)+query_seq[ntPOS])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
505 if args.nt_call == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
506 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
507 nt_call_dict_dict[refPOS+ntPOS]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
508 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
509 nt_call_dict_dict[refPOS+ntPOS] = {'A' : 0,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
510 'T' : 0,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
511 'C' : 0,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
512 'G' : 0,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
513 '-' : 0}
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
514 nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] = abund_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
515 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
516 nt_call_dict_dict[refPOS+ntPOS][query_seq[ntPOS]] += abund_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
517
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
518
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
519 q_pars_pos = q_pars_pos + run_length
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
520 query_pos = query_pos + run_length
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
521
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
522 run_length = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
523
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
524
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
525 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
526 run_length = (10 * run_length) + int(C)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
527 # END CIGAR PARSE
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
528
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
529
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
530
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
531 if len(mutations) == 0: # record reference counts
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
532 if args.wgs == 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
533 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
534 seq_species['Reference'] += abund_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
535 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
536 seq_species['Reference'] = abund_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
537 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
538 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
539 seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] += abund_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
540 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
541 seq_species[str(POS)+' Ref '+str(POS+q_pars_pos)] = abund_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
542
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
543 else: # record variants and counts
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
544 if args.AAreport == 1 and args.AAcodonasMNP == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
545 codonchecked = []
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
546 codon = ''
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
547 skip = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
548 MNP = ''
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
549 for i in range(0, len(mutations)): # checking for MNP
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
550 if 'Del' in mutations[i] or 'insert' in mutations[i]:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
551 codonchecked.append(mutations[i])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
552 elif skip > 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
553 skip -= 1
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
554 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
555 mut1POS = int(''.join([c for c in mutations[i] if c.isdigit()]))
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
556 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
557 mutations[i+1]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
558 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
559 AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
560 codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
561 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
562
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
563 if mut1POS % 3 == 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
564 AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
565 codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
566 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
567 mut2POS = int(''.join([c for c in mutations[i+1] if c.isdigit()]))
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
568 if mut2POS - mut1POS < 3:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
569 AAPOS = mut1POS // 3
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
570 if mut1POS % 3 == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
571 if mut2POS % 3 == 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
572 codon = mutations[i][-1]+ref[1][mut1POS]+mutations[i+1][-1]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
573 MNP = mutations[i][-1]+'r'+mutations[i+1][-1]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
574 skip = 1
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
575 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
576 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
577 mutations[i+2]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
578 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
579 codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
580 MNP = mutations[i][-1]+mutations[i+1][-1]+'r'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
581 skip = 1
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
582 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
583 mut3POS = int(''.join([c for c in mutations[i+2] if c.isdigit()]))
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
584 if mut2POS == mut3POS -1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
585 codon = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
586 MNP = mutations[i][-1]+mutations[i+1][-1]+mutations[i+2][-1]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
587 skip = 2
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
588 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
589 codon = mutations[i][-1]+mutations[i+1][-1]+ref[1][mut2POS]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
590 MNP = mutations[i][-1]+mutations[i+1][-1]+'r'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
591 skip = 1
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
592 codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
593 elif mut2POS - mut1POS == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
594 codon = ref[1][mut1POS-2]+mutations[i][-1]+mutations[i+1][-1]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
595 MNP = mutations[i][-1]+mutations[i+1][-1]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
596 skip = 1
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
597 codonchecked.append(str(mut1POS)+MNP+'('+refprot[AAPOS]+str(AAPOS+1)+AAcall(codon)+')')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
598 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
599 AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
600 codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
601
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
602
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
603 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
604 AAinfo = singletCodon(mut1POS, mutations[i][-1], ref)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
605 codonchecked.append(mutations[i]+'('+refprot[AAinfo[0]-1]+str(AAinfo[0])+AAinfo[1]+')')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
606 mutations = " ".join(codonchecked)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
607 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
608 mutations = " ".join(mutations)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
609
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
610 if args.wgs == 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
611 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
612 seq_species[mutations] += abund_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
613 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
614 seq_species[mutations] = abund_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
615 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
616 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
617 seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] += abund_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
618 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
619 seq_species[str(POS)+' '+mutations+' '+str(POS+q_pars_pos)] = abund_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
620
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
621 if lastPOS < POS+q_pars_pos:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
622 lastPOS = POS+q_pars_pos
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
623 for i in range(POS, POS+q_pars_pos): # update coverage
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
624 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
625 coverage[i] += abund_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
626 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
627 coverage[i] = abund_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
628
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
629 args.Sam_file.close()
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
630 # END SAM LINES
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
631
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
632 total_mapped_reads = sam_read_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
633
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
634 if args.seq == 1: # output the sequence
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
635 seq_fh = open(args.outfile1, "w")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
636 seq_fh.write(args.samp+"("+str(sam_read_count)+")\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
637 seq_fh.write("Unique Sequence\tCount\tAbundance\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
638
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
639 sorted_seq = sorted(seq_species, key=seq_species.__getitem__, reverse=True)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
640 for key in sorted_seq:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
641 if seq_species[key] >= args.min_count:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
642 if (seq_species[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
643 seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
644 seq_array.append(f"{key}\t{seq_species[key]}\t{(seq_species[key]/sam_read_count):.3f}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
645 elif args.wgs == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
646 splitseqs = key.split()
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
647 cov = []
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
648 for x in range(int(splitseqs[0]), int(splitseqs[-1])):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
649 cov.append(coverage[x])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
650 min_cov = min(cov)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
651 if (seq_species[key]/min_cov >= args.min_samp_abund):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
652 seq_fh.write(f"{key}\t{seq_species[key]}\t{(seq_species[key]/min_cov):.3f}\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
653 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
654 break
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
655
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
656 seq_fh.close()
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
657 # END SEQ OUT
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
658
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
659 if args.indel == 1: # and len(indel_dict) > 0: # output indels, if there are any
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
660 sorted_indels = sorted(indel_dict, key=indel_dict.__getitem__, reverse=True)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
661 indels_to_write = []
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
662 for key in sorted_indels:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
663 if indel_dict[key] >= args.min_count:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
664 if indel_dict[key] / sam_read_count >= args.min_samp_abund and args.wgs == 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
665 indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key]/sam_read_count):.3f}\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
666 elif args.wgs == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
667 indelPOS = ''
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
668 for c in key:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
669 if c.isdigit():
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
670 indelPOS += c
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
671 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
672 break
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
673 indelPOS = int(indelPOS)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
674 if indel_dict[key] / coverage[indelPOS] >= args.min_samp_abund:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
675 indels_to_write.append(f"{key}\t{indel_dict[key]}\t{(indel_dict[key] / coverage[indelPOS]):.3f}\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
676 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
677 break
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
678 # if len(indels_to_write) > 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
679 indel_fh = open(args.outfile2, "w")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
680 indel_fh.write(args.samp+"("+str(sam_read_count)+")\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
681 indel_fh.write("Indel\tCount\tAbundance\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
682 for indel_entry in indels_to_write:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
683 indel_fh.write(indel_entry)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
684 indel_fh.close()
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
685 # END INDEL OUT
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
686
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
687 if args.nt_call == 1: # out put nt calls
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
688 ntcall_fh = open(args.outfile3, "w")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
689 ntcall_fh.write(args.samp+"("+str(sam_read_count)+")\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
690 if args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
691 ntcallv_fh = open(args.outfile7, "w")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
692 ntcallv_fh.write(args.samp+"("+str(sam_read_count)+")\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
693 sorted_POS = sorted(nt_call_dict_dict)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
694 if args.AAreport == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
695 ntcall_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
696 if args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
697 ntcallv_fh.write("Position\tref NT\tAA POS\tref AA\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tPrimary Seq AA\tsingle nt AA\tSecondary NT\tCounts\tAbundance\tAA\tTertiary NT\tCounts\tAbundance\tAA\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
698 for POS in sorted_POS:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
699 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
700 total = coverage[POS]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
701 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
702 total = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
703 if total >= (sam_read_count * args.ntabund):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
704 AAinfo = singletCodon(POS, ref[1][POS-1], ref)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
705 POS_calls = {}
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
706 for key in nt_call_dict_dict[POS]:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
707 POS_calls[key] = nt_call_dict_dict[POS][key]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
708 sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
709
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
710 ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
711 ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
712 ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
713 ntcall_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
714 if sorted_calls[0] != ref[1][POS-1]:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
715 if args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
716 ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
717 ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
718 ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
719 ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
720
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
721 mod = (POS)%3
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
722
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
723 if mod == 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
724 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
725 codon = ref[1][POS-3]+ref[1][POS-2]+sorted_calls[0]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
726 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
727 codon = 'NNN'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
728 elif mod == 2:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
729 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
730 codon = ref[1][POS-2]+sorted_calls[0]+ref[1][POS]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
731 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
732 codon = 'NNN'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
733 elif mod == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
734 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
735 codon = sorted_calls[0]+ref[1][POS]+ref[1][POS+1]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
736 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
737 codon = 'NNN'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
738 ntcall_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
739 if args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
740 ntcallv_fh.write("\t"+AAcall(codon)+"\t"+singletCodon(POS, sorted_calls[0], ref)[1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
741 if (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total >= args.min_samp_abund):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
742 if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
743 ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
744 ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
745 if nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
746 if nt_call_dict_dict[POS][sorted_calls[2]] / total >= args.min_samp_abund:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
747 if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
748 ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
749 ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
750
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
751 if args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
752 ntcallv_fh.write("\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
753 elif (nt_call_dict_dict[POS][sorted_calls[1]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] / total >= args.min_samp_abund):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
754 if args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
755 ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1]+"\t"+str(AAinfo[0])+"\t"+AAinfo[1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
756 ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
757 ntcallv_fh.write("\t"+str(total)+"\t\t")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
758 ntcallv_fh.write(f"\t")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
759 ntcallv_fh.write("\t\t")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
760 ntcallv_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
761 ntcall_fh.write("\t\t")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
762 ntcall_fh.write(f"\t{sorted_calls[1]}\t{nt_call_dict_dict[POS][sorted_calls[1]]}\t{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}"+"\t"+singletCodon(POS, sorted_calls[1], ref)[1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
763
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
764 if (nt_call_dict_dict[POS][sorted_calls[2]] >= args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total >= args.min_samp_abund):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
765 ntcall_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
766 if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
767 ntcallv_fh.write(f"\t{sorted_calls[2]}\t{nt_call_dict_dict[POS][sorted_calls[2]]}\t{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}\t{singletCodon(POS, sorted_calls[2], ref)[1]}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
768 if args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
769 ntcallv_fh.write("\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
770
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
771 ntcall_fh.write("\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
772 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
773 ntcall_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
774 if args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
775 ntcallv_fh.write("Position\tref NT\tA\tT\tC\tG\t-\tTotal\tPrimary NT\tCounts\tAbundance\tSecondary NT\tCounts\tAbundance\tTertiary NT\tCounts\tAbundance\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
776
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
777 for POS in sorted_POS:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
778 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
779 total = coverage[POS] # sum(nt_call_dict_dict[POS].values())
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
780 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
781 total = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
782 if total >= (sam_read_count * args.ntabund):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
783 POS_calls = {}
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
784 for key in nt_call_dict_dict[POS]:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
785 POS_calls[key] = nt_call_dict_dict[POS][key]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
786 sorted_calls = sorted(POS_calls, key=POS_calls.__getitem__, reverse=True)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
787
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
788 ntcall_fh.write(str(POS)+"\t"+ref[1][POS-1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
789 ntcall_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
790 ntcall_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
791
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
792 if sorted_calls[0] != ref[1][POS-1]:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
793 if args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
794 ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
795 ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
796 ntcallv_fh.write("\t"+str(total)+"\t"+sorted_calls[0]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[0]]))
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
797 ntcallv_fh.write(f"\t{(nt_call_dict_dict[POS][sorted_calls[0]]/total):.3f}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
798 if (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
799 ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
800 if sorted_calls[1] != ref[1][POS-1] and args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
801 ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
802 if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
803 ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
804 if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
805 ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
806 if args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
807 ntcallv_fh.write("\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
808
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
809 elif (nt_call_dict_dict[POS][sorted_calls[1]] > args.min_count) and (nt_call_dict_dict[POS][sorted_calls[1]] /total > args.min_samp_abund):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
810 if args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
811 ntcallv_fh.write(str(POS)+"\t"+ref[1][POS-1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
812 ntcallv_fh.write("\t"+str(nt_call_dict_dict[POS]['A'])+"\t"+str(nt_call_dict_dict[POS]['T'])+"\t"+str(nt_call_dict_dict[POS]['C'])+"\t"+str(nt_call_dict_dict[POS]['G'])+"\t"+str(nt_call_dict_dict[POS]['-']))
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
813 ntcallv_fh.write("\t"+str(total)+"\t\t")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
814 ntcallv_fh.write(f"\t")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
815 ntcallv_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
816 ntcall_fh.write("\t"+sorted_calls[1]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[1]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[1]]/total):.3f}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
817 if (nt_call_dict_dict[POS][sorted_calls[2]] > args.min_count) and(nt_call_dict_dict[POS][sorted_calls[2]] /total > args.min_samp_abund):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
818 ntcall_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
819 if sorted_calls[2] != ref[1][POS-1] and args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
820 ntcallv_fh.write("\t"+sorted_calls[2]+"\t"+str(nt_call_dict_dict[POS][sorted_calls[2]])+"\t"+f"{(nt_call_dict_dict[POS][sorted_calls[2]]/total):.3f}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
821 if args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
822 ntcallv_fh.write("\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
823
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
824 ntcall_fh.write("\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
825
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
826 ntcall_fh.close()
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
827 if args.ntvar == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
828 ntcallv_fh.close()
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
829 # END NT CALL OUT
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
830 if args.covar == 1: # output covariants
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
831 testtrack = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
832 combinations = {}
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
833 for sequence in seq_species:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
834 if args.wgs == 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
835 singles = sequence.split()
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
836 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
837 singles = (sequence.split())[1:-1]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
838 if len(singles) <= args.max_dist and singles[0] != 'Ref':
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
839 for combo in getCombos(singles, args.max_covar):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
840 if not combo in combinations:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
841 combinations[combo] = seq_species[sequence]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
842 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
843 combinations[combo] += seq_species[sequence]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
844
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
845 covar_fh = open(args.outfile4, "w")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
846 covar_fh.write(args.samp+"("+str(sam_read_count)+")\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
847 covar_fh.write("Co-Variants\tCount\tAbundance\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
848 sortedcombos = sorted(combinations, key=combinations.__getitem__, reverse=True)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
849 for key in sortedcombos:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
850 if (combinations[key] >= args.min_count):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
851 if (combinations[key] / sam_read_count >= args.min_samp_abund) and args.wgs == 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
852 covar_fh.write(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
853 covar_array.append(key+"\t"+str(combinations[key])+"\t"+f"{(combinations[key]/sam_read_count):.3f}")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
854 elif args.wgs == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
855 coveragepercent = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
856 splitcombos = key.split()
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
857 if len(splitcombos) == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
858 coveragePOS = ''
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
859 for c in key:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
860 if c.isdigit():
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
861 coveragePOS += c
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
862 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
863 break
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
864 coveragepercent = combinations[key] / coverage[int(coveragePOS)]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
865 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
866 startcovPOS = ''
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
867 for c in splitcombos[0]:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
868 if c.isdigit():
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
869 startcovPOS += c
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
870 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
871 break
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
872 endcovPOS = ''
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
873 for c in splitcombos[-1]:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
874 if c.isdigit():
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
875 endcovPOS += c
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
876 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
877 break
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
878 coveragevals = []
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
879 for i in range(int(startcovPOS), int(endcovPOS)+1):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
880 coveragevals.append(coverage[i])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
881 mincov = min(coverval for coverval in coveragevals)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
882 coveragepercent = combinations[key] / mincov
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
883 if coveragepercent >= args.min_samp_abund:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
884 covar_fh.write(f"{key}\t{combinations[key]}\t{coveragepercent:.3f}\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
885
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
886 covar_fh.close()
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
887
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
888
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
889 # END COVAR OUT
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
890
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
891 return(covar_array, seq_array, sam_read_count)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
892
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
893 def cvdeconv(args, covardict, seqdict): # covar deconvolution process
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
894
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
895 passedseqs = {}
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
896 preservedseqs = {}
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
897 for seq in seqdict: # pass check for actual : expected abundance
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
898 if seq != 'total' and seq != 'singles':
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
899
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
900 splitseq = seq.split(' ')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
901 abund = 1
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
902 for sing in splitseq:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
903 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
904 abund = abund * (covardict[sing] / covardict['total'])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
905 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
906 abund = abund * (seqdict[seq] / seqdict['total'])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
907
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
908 try:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
909 covarabund = covardict[seq]/covardict['total']
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
910 except:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
911 covarabund = seqdict[seq]/seqdict['total']
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
912 covardict[seq] = seqdict[seq]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
913
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
914 if covarabund >= args.autopass:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
915 preservedseqs[seq] = max(1, args.beta, (covarabund / abund))
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
916 elif covarabund >= abund * args.beta:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
917 passedseqs[seq] = covarabund / abund
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
918 elif len(seq.split(' ')) == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
919 passedseqs[seq] = max(1, args.beta)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
920
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
921 if args.min_samp_abund < 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
922 min_count = args.min_samp_abund * covardict['total']
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
923 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
924 min_count = args.min_samp_abund
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
925
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
926 # sort passed covars
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
927 lensortedpassed = sorted(passedseqs, key = lambda key : len(key.split(' ')), reverse=True)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
928 ratiolensortedpassed = sorted(lensortedpassed, key = lambda key : passedseqs[key], reverse = True)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
929 sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
930 deconved = {}
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
931 for seq in ratiolensortedpassed: # reassign counts
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
932 singles = seq.split(' ')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
933 first = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
934 rem_count = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
935 for sing in sortedsingles:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
936 if sing in singles:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
937 if covardict['singles'][sing] > 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
938 if first == 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
939 first = 1
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
940 rem_count = covardict['singles'][sing]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
941 covardict['singles'][sing] = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
942 deconved[seq] = rem_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
943 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
944 covardict['singles'][sing] = covardict['singles'][sing] - rem_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
945 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
946 break
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
947 sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
948
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
949 sortedpreserved = sorted(preservedseqs, key = lambda key : covardict[key])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
950
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
951 for seq in sortedpreserved:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
952 singles = seq.split(' ')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
953 first = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
954 rem_count = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
955 for sing in sortedsingles:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
956 if sing in singles:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
957 if covardict['singles'][sing] > 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
958 if first == 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
959 first = 1
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
960 rem_count = covardict['singles'][sing]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
961 covardict['singles'][sing] = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
962 deconved[seq] = rem_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
963 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
964 covardict['singles'][sing] = covardict['singles'][sing] - rem_count
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
965 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
966 break
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
967 sortedsingles = sorted(covardict['singles'], key = covardict['singles'].__getitem__)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
968
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
969
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
970 newtotal = sum(deconved.values())
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
971 fh_deconv = open(args.outfile6, "w")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
972 fh_deconv.write(f"{args.samp}({covardict['total']}) | ({newtotal})\tCount\tAbundance\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
973 sorted_deconved = sorted(deconved, key = deconved.__getitem__, reverse = True)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
974 for seq in sorted_deconved: # write deconv
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
975 if deconved[seq] >= min_count:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
976 fh_deconv.write(f"{seq}\t{deconved[seq]}\t{(deconved[seq]/newtotal):.3f}\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
977 fh_deconv.close()
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
978 # END COVAR DECONV OUT
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
979
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
980 return()
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
981
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
982 def dechim(args, seqs): # processing sequence dictionary to remove chimeras
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
983 total = seqs['total']
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
984 del seqs['total']
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
985 sorted_seqs = sorted(seqs, key=seqs.__getitem__) # sort sequences by abundance, least to greatest
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
986 chimeras = []
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
987 for seq in sorted_seqs:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
988 pot_chim = ['Reference']+seq.split()+['Reference']
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
989 chim_halves = []
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
990 for i in range(0, len(pot_chim)-1): # create dimera halves
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
991 chim_halves.append([pot_chim[:i+1], pot_chim[i+1:]])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
992 parent_pairs = []
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
993
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
994 for dimera in chim_halves: # check for potential parents
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
995
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
996 pot_left = []
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
997 pot_rt = []
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
998 lft_len = len(dimera[0])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
999 rt_len = len(dimera[1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1000 for pseq in sorted_seqs:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1001 if not seq == pseq:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1002 if (seqs[pseq] >= (seqs[seq] * args.foldab)):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1003 pot_par = ['Reference']+pseq.split()+['Reference']
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1004 if dimera[0] == pot_par[:lft_len]:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1005 pot_left.append(pot_par)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1006 if ((len(pot_par) >= rt_len) and (dimera[1] == pot_par[(len(pot_par)-rt_len):])):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1007 pot_rt.append(pot_par)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1008
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1009 if (len(pot_left) > 0 and len(pot_rt) > 0 ):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1010 for left_par in pot_left: # check potential parents' pairing
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1011 for rt_par in pot_rt:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1012 if left_par != rt_par:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1013 left_break = left_par[lft_len]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1014 rt_break = rt_par[(len(rt_par)-rt_len)-1]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1015 if left_break == 'Reference' or rt_break == 'Reference':
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1016 parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1017 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1018 left_break_POS = ''
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1019 for c in left_break:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1020 if c.isdigit():
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1021 left_break_POS += c
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1022 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1023 break
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1024
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1025 rt_break_POS = ''
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1026 for c in rt_break:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1027 if c.isdigit():
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1028 rt_break_POS += c
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1029 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1030 break
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1031
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1032 if int(left_break_POS) > int(rt_break_POS):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1033 parent_pairs.append([' '.join(left_par[1:-1]), ' '.join(rt_par[1:-1])])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1034
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1035 par_tot_abund = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1036 pair_probs = []
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1037 for parents in parent_pairs: # calc 'expected' abundance
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1038 pair_prob = (seqs[parents[0]] / total) * (seqs[parents[1]] / total)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1039 par_tot_abund += pair_prob
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1040 pair_probs.append(pair_prob)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1041
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1042 recomb_count = par_tot_abund * total
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1043
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1044 if not seqs[seq] >= recomb_count * args.alpha: # chimera check
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1045 redist_count = float(seqs[seq])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1046 chimeras.append(seq)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1047 seqs[seq] = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1048 if args.redist == 1: # redist counts of chimera
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1049 toadd = {}
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1050 for i in range(0, len(parent_pairs)):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1051 counts_to_redist = (redist_count * (pair_probs[i]/par_tot_abund))/2
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1052 seqs[parent_pairs[i][0]] += counts_to_redist
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1053 seqs[parent_pairs[i][1]] += counts_to_redist
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1054
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1055
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1056
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1057 for chim in chimeras: # remove chimeras
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1058 del seqs[chim]
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1059
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1060 total = sum(seqs.values())
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1061
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1062
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1063 # total = sum(seqs.values)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1064 seqs['total'] = total
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1065
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1066 return(seqs)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1067
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1068 def chimrm(args, seqs): # chimera removed process
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1069
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1070 pre_len = len(seqs)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1071 inf_loop_shield = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1072 while True: # send sequences for chimera removal while chimeras are still found
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1073 dechim(args, seqs)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1074 post_len = len(seqs)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1075 inf_loop_shield += 1
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1076 if post_len >= pre_len:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1077 break
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1078 if inf_loop_shield > 100:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1079 break
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1080 pre_len = len(seqs)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1081
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1082 total = seqs['total']
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1083 del seqs['total']
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1084 if args.min_samp_abund < 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1085 min_count = args.min_samp_abund * total
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1086 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1087 min_count = args.min_samp_abund
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1088 sorted_seqs = sorted(seqs, key=seqs.__getitem__, reverse=True)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1089 fh_dechime = open(args.outfile5, 'w')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1090 fh_dechime.write(f"{args.samp}({int(total)})\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1091 fh_dechime.write("Sequences\tCount\tAbundance\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1092 for seq in seqs: # write chim_rm seqs
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1093 abund = seqs[seq]/total
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1094 if seqs[seq] >= min_count:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1095 fh_dechime.write(f"{seq}\t{round(seqs[seq])}\t{abund:.3f}\n")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1096
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1097 fh_dechime.close()
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1098 return()
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1099
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1100 def main():
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1101
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1102 args = arg_parser() # getting command line arguments
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1103
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1104 covar_array = []
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1105 seq_array = []
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1106 total_mapped_reads = 0
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1107
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1108 if args.ref:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1109 ref = get_ref(args.ref) # get the reference ID and sequence from the FASTA file
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1110 args.ref.close()
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1111 if ref[1] == '':
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1112 print('Reference not recognized as a Fasta format, skipping SAM parsing')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1113 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1114 refprot = ''
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1115 if args.AAreport == 1: # make an Amino Acid sequence based on the reference sequence
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1116 for x in range(0, (len(ref[1])-1)//3):
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1117 AA = AAcall(ref[1][x*3]+ref[1][x*3+1]+ref[1][x*3+2])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1118 refprot = refprot + AA
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1119 if (len(ref[1])-1)%3 != 0:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1120 refprot = refprot + '?'
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1121
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1122 covar_array, seq_array, total_mapped_reads = SAMparse(args, ref, refprot, args.Sam_file)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1123
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1124 else:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1125 print('No reference provided, skipping SAM parsing')
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1126
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1127 in_covars = {'total' : total_mapped_reads,
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1128 'singles' : {}
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1129 }
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1130 in_seqs = {'total' : total_mapped_reads}
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1131 # Begin chimera removal if enabled
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1132 if args.chim_rm == 1 or args.deconv == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1133 if args.deconv == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1134 for line in covar_array:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1135 lineparts = line.split("\t")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1136 in_covars[lineparts[0]] = int(lineparts[1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1137 if len(lineparts[0].split(' ')) == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1138 in_covars['singles'][lineparts[0]] = int(lineparts[1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1139
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1140 for line in seq_array:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1141 lineparts = line.split("\t")
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1142 in_seqs[lineparts[0]] = float(lineparts[1])
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1143
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1144 if args.deconv == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1145 cvdeconv(args, in_covars, in_seqs)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1146
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1147 if args.chim_rm == 1:
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1148 chimrm(args, in_seqs)
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1149
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1150 if __name__ == '__main__':
22e3f843f1db Uploaded main file
degregory
parents:
diff changeset
1151 main()