comparison core.py @ 2:d0350fe29fdf draft

planemo upload commit c50df40caef2fb97c178d6890961e0e527992324
author cstrittmatter
date Mon, 27 Apr 2020 01:11:53 -0400
#!/usr/bin/env python3


import gzip
import io
import pickle
import os
import sys

from argparse import ArgumentParser
try:
    from .version import SalmID_version
except ImportError:
    SalmID_version = "version unknown"


def reverse_complement(sequence):
    """Return the reverse complement of a nucleotide sequence (including IUPAC ambiguous nucleotide codes)."""
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'M': 'K', 'R': 'Y', 'W': 'W',
                  'S': 'S', 'Y': 'R', 'K': 'M', 'V': 'B', 'H': 'D', 'D': 'H', 'B': 'V'}
    return "".join(complement[base] for base in reversed(sequence))
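
# Illustrative example (not part of the original module): for 'ACGTN' the
# reversed sequence is 'NTGCA' and complementing each base gives 'NACGT';
# ambiguous IUPAC codes map to their complements, e.g. R (A/G) -> Y (C/T).
#
#     reverse_complement('ACGTN')  # -> 'NACGT'
#     reverse_complement('AR')     # -> 'YT'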


def parse_args():
    "Parse the input arguments, use '-h' for help."
    parser = ArgumentParser(description='SalmID - rapid Kmer based Salmonella identifier from sequence data')
    # inputs
    parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + SalmID_version)
    parser.add_argument(
        '-i', '--input_file', type=str, required=False, default='None', metavar='your_fastqgz',
        help='Single fastq.gz file input; include the path to the file if it is not in the current directory.')
    parser.add_argument(
        '-e', '--extension', type=str, required=False, default='.fastq.gz', metavar='file_extension',
        help='File extension; if specified without "--input_dir", SalmID will attempt to ID all files\n'
             'with this extension in the current directory, otherwise files in the input directory.')

    parser.add_argument(
        '-d', '--input_dir', type=str, required=False, default='.', metavar='directory',
        help='Directory containing the data for identification; when not specified, files in the current directory are analyzed.')
    parser.add_argument(
        '-r', '--report', type=str, required=False, default='percentage', metavar='percentage, coverage or taxonomy',
        help='Report either the percentage ("percentage") of clade-specific kmers recovered, the average kmer coverage '
             '("coverage"), or the taxonomy ("taxonomy": taxonomic species ID plus observed mean k-mer coverages and '
             'expected coverage).')
    parser.add_argument(
        '-m', '--mode', type=str, required=False, default='quick', metavar='quick or thorough',
        help='Quick [quick] or thorough [thorough] mode')
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)
    return parser.parse_args()
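
# Illustrative invocations (assuming this file is run directly as a script and
# the pickled kmer dictionaries sit next to it, as main() expects; file and
# directory names below are placeholders):
#
#     python core.py -i sample.fastq.gz -r taxonomy
#     python core.py -d /path/to/fastq_dir -e .fastq.gz -r percentage -m thorough
#
# With no arguments at all, parse_args() prints the help text and exits.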


def get_av_read_length(file):
    """Sample the first 100 reads of a fastq(.gz) file and return the average read length."""
    i = 1
    n_reads = 0
    total_length = 0
    if file.endswith(".gz"):
        file_content = io.BufferedReader(gzip.open(file))
    else:
        file_content = open(file, "r").readlines()
    for line in file_content:
        if i % 4 == 2:
            total_length += len(line.strip())
            n_reads += 1
        i += 1
        if n_reads == 100:
            break
    # divide by the number of reads actually sampled rather than a fixed 100,
    # so files with fewer than 100 reads still yield a correct average
    return total_length / n_reads if n_reads else 0


def createKmerDict_reads(list_of_strings, kmer):
    """Count occurrences of K-mers in a list of strings.

    Args:
        list_of_strings(list of str): nucleotide sequences as a list of strings
        kmer(int): length of the K-mer to count

    Returns:
        dict: dictionary with kmers as keys, counts for each kmer as values"""
    kmer_table = {}
    for string in list_of_strings:
        sequence = string.strip('\n')
        if len(sequence) >= kmer:
            for i in range(len(sequence) - kmer + 1):
                # upper-case once so membership tests and stored keys stay consistent
                new_mer = sequence[i:i + kmer].upper()
                new_mer_rc = reverse_complement(new_mer)
                if new_mer in kmer_table:
                    kmer_table[new_mer] += 1
                else:
                    kmer_table[new_mer] = 1
                if new_mer_rc in kmer_table:
                    kmer_table[new_mer_rc] += 1
                else:
                    kmer_table[new_mer_rc] = 1
    return kmer_table
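
# Illustrative example (not part of the original module): counting 3-mers in a
# single read also records each k-mer's reverse complement, so
#
#     createKmerDict_reads(['ATGGC'], 3)
#     # -> {'ATG': 1, 'CAT': 1, 'TGG': 1, 'CCA': 1, 'GGC': 1, 'GCC': 1}
#
# i.e. both strands contribute to the counts used for the coverage estimates.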


def target_read_kmerizer_multi(file, k, kmerDict_1, kmerDict_2, mode):
    mean_1 = None
    mean_2 = None
    i = 1
    n_reads_1 = 0
    n_reads_2 = 0
    total_coverage_1 = 0
    total_coverage_2 = 0
    reads_1 = []
    reads_2 = []
    total_reads = 0
    if file.endswith(".gz"):
        file_content = io.BufferedReader(gzip.open(file))
    else:
        file_content = open(file, "r").readlines()
    for line in file_content:
        start = int((len(line) - k) // 2)
        if i % 4 == 2:
            total_reads += 1
            if file.endswith(".gz"):
                s1 = line[start:k + start].decode()
                line = line.decode()
            else:
                s1 = line[start:k + start]
            if s1 in kmerDict_1:
                n_reads_1 += 1
                total_coverage_1 += len(line)
                reads_1.append(line)
            if s1 in kmerDict_2:
                n_reads_2 += 1
                total_coverage_2 += len(line)
                reads_2.append(line)
        i += 1
        if mode == 'quick':
            if total_coverage_2 >= 800000:
                break

    if len(reads_1) == 0:
        kmer_Dict1 = {}
    else:
        kmer_Dict1 = createKmerDict_reads(reads_1, k)
        mers_1 = set([key for key in kmer_Dict1])
        mean_1 = sum([kmer_Dict1[key] for key in kmer_Dict1]) / len(mers_1)
    if len(reads_2) == 0:
        kmer_Dict2 = {}
    else:
        kmer_Dict2 = createKmerDict_reads(reads_2, k)
        mers_2 = set([key for key in kmer_Dict2])
        mean_2 = sum([kmer_Dict2[key] for key in kmer_Dict2]) / len(mers_2)
    return kmer_Dict1, kmer_Dict2, mean_1, mean_2, total_reads
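
# Note (added for clarity, not in the original source): each read is screened by
# looking up the k-mer at the centre of the read (s1) in the two k-mer sets; at
# the call site in kmer_lists(), kmerDict_1 and kmerDict_2 are the invA and rpoB
# "allmers" sets. In 'quick' mode the scan stops once the reads matching the
# second set (rpoB) amount to roughly 800 kb of sequence, e.g. after about
# 5,300 matching 150 bp reads.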


def mean_cov_selected_kmers(iterable, kmer_dict, clade_specific_kmers):
    '''
    Given an iterable (list, set, dictionary), return the mean coverage for the kmers in the iterable.
    :param iterable: set, list or dictionary containing kmers
    :param kmer_dict: dictionary with kmers as keys, kmer-frequency as value
    :param clade_specific_kmers: list, dict or set of clade specific kmers
    :return: mean frequency as float
    '''
    if len(iterable) == 0:
        return 0
    return sum([kmer_dict[value] for value in iterable]) / len(clade_specific_kmers)
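
# Illustrative example (not part of the original module): the denominator is the
# full clade-specific set, not just the kmers that were observed, so missing
# kmers pull the mean coverage down.
#
#     observed = {'AAA': 4, 'AAT': 6}
#     clade = {'AAA', 'AAT', 'AAC', 'AAG'}  # 4 clade-specific kmers
#     mean_cov_selected_kmers(observed.keys() & clade, observed, clade)  # -> 2.5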


def kmer_lists(query_fastq_gz, k,
               allmers, allmers_rpoB,
               uniqmers_bongori,
               uniqmers_I,
               uniqmers_IIa,
               uniqmers_IIb,
               uniqmers_IIIa,
               uniqmers_IIIb,
               uniqmers_IV,
               uniqmers_VI,
               uniqmers_VII,
               uniqmers_VIII,
               uniqmers_bongori_rpoB,
               uniqmers_S_enterica_rpoB,
               uniqmers_Escherichia_rpoB,
               uniqmers_Listeria_ss_rpoB,
               uniqmers_Lmono_rpoB,
               mode):
    dict_invA, dict_rpoB, mean_invA, mean_rpoB, total_reads = target_read_kmerizer_multi(query_fastq_gz, k, allmers,
                                                                                          allmers_rpoB, mode)
    target_mers_invA = set([key for key in dict_invA])
    target_mers_rpoB = set([key for key in dict_rpoB])
    # the original compared the set itself to 0 (never true); test its size instead and keep the
    # score/coverage calculations unconditional, since empty intersections simply yield 0
    if len(target_mers_invA) == 0:
        print('No reads found matching invA, no Salmonella in sample?')
    p_bongori = (len(uniqmers_bongori & target_mers_invA) / len(uniqmers_bongori)) * 100
    p_I = (len(uniqmers_I & target_mers_invA) / len(uniqmers_I)) * 100
    p_IIa = (len(uniqmers_IIa & target_mers_invA) / len(uniqmers_IIa)) * 100
    p_IIb = (len(uniqmers_IIb & target_mers_invA) / len(uniqmers_IIb)) * 100
    p_IIIa = (len(uniqmers_IIIa & target_mers_invA) / len(uniqmers_IIIa)) * 100
    p_IIIb = (len(uniqmers_IIIb & target_mers_invA) / len(uniqmers_IIIb)) * 100
    p_VI = (len(uniqmers_VI & target_mers_invA) / len(uniqmers_VI)) * 100
    p_IV = (len(uniqmers_IV & target_mers_invA) / len(uniqmers_IV)) * 100
    p_VII = (len(uniqmers_VII & target_mers_invA) / len(uniqmers_VII)) * 100
    p_VIII = (len(uniqmers_VIII & target_mers_invA) / len(uniqmers_VIII)) * 100
    p_bongori_rpoB = (len(uniqmers_bongori_rpoB & target_mers_rpoB) / len(uniqmers_bongori_rpoB)) * 100
    p_Senterica = (len(uniqmers_S_enterica_rpoB & target_mers_rpoB) / len(uniqmers_S_enterica_rpoB)) * 100
    p_Escherichia = (len(uniqmers_Escherichia_rpoB & target_mers_rpoB) / len(uniqmers_Escherichia_rpoB)) * 100
    p_Listeria_ss = (len(uniqmers_Listeria_ss_rpoB & target_mers_rpoB) / len(uniqmers_Listeria_ss_rpoB)) * 100
    p_Lmono = (len(uniqmers_Lmono_rpoB & target_mers_rpoB) / len(uniqmers_Lmono_rpoB)) * 100
    bongori_invA_cov = mean_cov_selected_kmers(uniqmers_bongori & target_mers_invA, dict_invA, uniqmers_bongori)
    I_invA_cov = mean_cov_selected_kmers(uniqmers_I & target_mers_invA, dict_invA, uniqmers_I)
    IIa_invA_cov = mean_cov_selected_kmers(uniqmers_IIa & target_mers_invA, dict_invA, uniqmers_IIa)
    IIb_invA_cov = mean_cov_selected_kmers(uniqmers_IIb & target_mers_invA, dict_invA, uniqmers_IIb)
    IIIa_invA_cov = mean_cov_selected_kmers(uniqmers_IIIa & target_mers_invA, dict_invA, uniqmers_IIIa)
    IIIb_invA_cov = mean_cov_selected_kmers(uniqmers_IIIb & target_mers_invA, dict_invA, uniqmers_IIIb)
    IV_invA_cov = mean_cov_selected_kmers(uniqmers_IV & target_mers_invA, dict_invA, uniqmers_IV)
    VI_invA_cov = mean_cov_selected_kmers(uniqmers_VI & target_mers_invA, dict_invA, uniqmers_VI)
    VII_invA_cov = mean_cov_selected_kmers(uniqmers_VII & target_mers_invA, dict_invA, uniqmers_VII)
    VIII_invA_cov = mean_cov_selected_kmers(uniqmers_VIII & target_mers_invA, dict_invA, uniqmers_VIII)
    S_enterica_rpoB_cov = mean_cov_selected_kmers((uniqmers_S_enterica_rpoB & target_mers_rpoB), dict_rpoB,
                                                  uniqmers_S_enterica_rpoB)
    S_bongori_rpoB_cov = mean_cov_selected_kmers((uniqmers_bongori_rpoB & target_mers_rpoB), dict_rpoB,
                                                 uniqmers_bongori_rpoB)
    Escherichia_rpoB_cov = mean_cov_selected_kmers((uniqmers_Escherichia_rpoB & target_mers_rpoB), dict_rpoB,
                                                   uniqmers_Escherichia_rpoB)
    Listeria_ss_rpoB_cov = mean_cov_selected_kmers((uniqmers_Listeria_ss_rpoB & target_mers_rpoB), dict_rpoB,
                                                   uniqmers_Listeria_ss_rpoB)
    Lmono_rpoB_cov = mean_cov_selected_kmers((uniqmers_Lmono_rpoB & target_mers_rpoB), dict_rpoB,
                                             uniqmers_Lmono_rpoB)
    coverages = [Listeria_ss_rpoB_cov, Lmono_rpoB_cov, Escherichia_rpoB_cov, S_bongori_rpoB_cov,
                 S_enterica_rpoB_cov, bongori_invA_cov, I_invA_cov, IIa_invA_cov, IIb_invA_cov,
                 IIIa_invA_cov, IIIb_invA_cov, IV_invA_cov, VI_invA_cov, VII_invA_cov, VIII_invA_cov]
    locus_scores = [p_Listeria_ss, p_Lmono, p_Escherichia, p_bongori_rpoB, p_Senterica, p_bongori,
                    p_I, p_IIa, p_IIb, p_IIIa, p_IIIb, p_IV, p_VI, p_VII, p_VIII]
    return locus_scores, coverages, total_reads
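
# Note (added for clarity, not in the original source): locus_scores and coverages
# are parallel 15-element lists ordered Listeria ss, L. monocytogenes, Escherichia,
# S. bongori (rpoB), S. enterica (rpoB), then the ten invA clades (bongori, I, IIa,
# IIb, IIIa, IIIb, IV, VI, VII, VIII); report_taxon() and the tab-separated report
# headers printed in main() rely on exactly this ordering.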


def report_taxon(locus_covs, average_read_length, number_of_reads):
    list_taxa = ['Listeria ss', 'Listeria monocytogenes', 'Escherichia sp.',
                 'Salmonella bongori (rpoB)', 'Salmonella enterica (rpoB)',
                 'Salmonella bongori (invA)', 'S. enterica subsp. enterica (invA)',
                 'S. enterica subsp. salamae (invA: clade a)', 'S. enterica subsp. salamae (invA: clade b)',
                 'S. enterica subsp. arizonae (invA)', 'S. enterica subsp. diarizonae (invA)',
                 'S. enterica subsp. houtenae (invA)', 'S. enterica subsp. indica (invA)',
                 'S. enterica subsp. VII (invA)', 'S. enterica subsp. salamae (invA: clade VIII)']
    if sum(locus_covs) < 1:
        rpoB = ('No rpoB matches!', 0)
        invA = ('No invA matches!', 0)
        return rpoB, invA, 0.0
    else:
        # given list of scores get taxon
        if sum(locus_covs[0:5]) > 0:
            best_rpoB = max(range(len(locus_covs[1:5])), key=lambda x: locus_covs[1:5][x]) + 1
            all_rpoB = max(range(len(locus_covs[0:5])), key=lambda x: locus_covs[0:5][x])
            if (locus_covs[best_rpoB] != 0) & (all_rpoB == 0):
                rpoB = (list_taxa[best_rpoB], locus_covs[best_rpoB])
            elif (all_rpoB == 0) & (round(sum(locus_covs[1:5]), 1) < 1):
                rpoB = (list_taxa[0], locus_covs[0])
            else:
                rpoB = (list_taxa[best_rpoB], locus_covs[best_rpoB])
        else:
            rpoB = ('No rpoB matches!', 0)
        if sum(locus_covs[5:]) > 0:
            best_invA = max(range(len(locus_covs[5:])), key=lambda x: locus_covs[5:][x]) + 5
            invA = (list_taxa[best_invA], locus_covs[best_invA])
        else:
            invA = ('No invA matches!', 0)
        if 'Listeria' in rpoB[0]:
            return rpoB, invA, (average_read_length * number_of_reads) / 3000000
        else:
            return rpoB, invA, (average_read_length * number_of_reads) / 5000000
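
# Worked example (added for clarity, not in the original source): the third value
# returned is the expected genome-wide coverage, i.e. total sequenced bases divided
# by an assumed genome size of ~3 Mb for Listeria or ~5 Mb otherwise. For instance,
# 500,000 reads of 100 bp from a Salmonella sample give
# (100 * 500000) / 5000000 = 10-fold expected coverage.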


def main():
    ex_dir = os.path.dirname(os.path.realpath(__file__))
    args = parse_args()
    input_file = args.input_file
    if input_file != 'None':
        files = [input_file]
    else:
        extension = args.extension
        inputdir = args.input_dir
        files = [inputdir + '/' + f for f in os.listdir(inputdir) if f.endswith(extension)]
    report = args.report
    mode = args.mode
    f_invA = open(ex_dir + "/invA_mers_dict", "rb")
    sets_dict_invA = pickle.load(f_invA)
    f_invA.close()
    allmers = sets_dict_invA['allmers']
    uniqmers_I = sets_dict_invA['uniqmers_I']
    uniqmers_IIa = sets_dict_invA['uniqmers_IIa']
    uniqmers_IIb = sets_dict_invA['uniqmers_IIb']
    uniqmers_IIIa = sets_dict_invA['uniqmers_IIIa']
    uniqmers_IIIb = sets_dict_invA['uniqmers_IIIb']
    uniqmers_IV = sets_dict_invA['uniqmers_IV']
    uniqmers_VI = sets_dict_invA['uniqmers_VI']
    uniqmers_VII = sets_dict_invA['uniqmers_VII']
    uniqmers_VIII = sets_dict_invA['uniqmers_VIII']
    uniqmers_bongori = sets_dict_invA['uniqmers_bongori']

    f = open(ex_dir + "/rpoB_mers_dict", "rb")
    sets_dict = pickle.load(f)
    f.close()

    allmers_rpoB = sets_dict['allmers']
    uniqmers_bongori_rpoB = sets_dict['uniqmers_bongori']
    uniqmers_S_enterica_rpoB = sets_dict['uniqmers_S_enterica']
    uniqmers_Escherichia_rpoB = sets_dict['uniqmers_Escherichia']
    uniqmers_Listeria_ss_rpoB = sets_dict['uniqmers_Listeria_ss']
    uniqmers_Lmono_rpoB = sets_dict['uniqmers_L_mono']
    # todo: run kmer_lists() once, create a list of tuples containing data to be used for different reports
    if report == 'taxonomy':
        print('file\trpoB\tinvA\texpected coverage')
        for f in files:
            locus_scores, coverages, reads = kmer_lists(f, 27,
                                                        allmers, allmers_rpoB,
                                                        uniqmers_bongori,
                                                        uniqmers_I,
                                                        uniqmers_IIa,
                                                        uniqmers_IIb,
                                                        uniqmers_IIIa,
                                                        uniqmers_IIIb,
                                                        uniqmers_IV,
                                                        uniqmers_VI,
                                                        uniqmers_VII,
                                                        uniqmers_VIII,
                                                        uniqmers_bongori_rpoB,
                                                        uniqmers_S_enterica_rpoB,
                                                        uniqmers_Escherichia_rpoB,
                                                        uniqmers_Listeria_ss_rpoB,
                                                        uniqmers_Lmono_rpoB,
                                                        mode)
            pretty_covs = [round(cov, 1) for cov in coverages]
            # use a distinct name so the '--report' choice is not shadowed
            taxon_report = report_taxon(pretty_covs, get_av_read_length(f), reads)
            print(f.split('/')[-1] + '\t' + taxon_report[0][0] + '[' + str(taxon_report[0][1]) + ']' + '\t' +
                  taxon_report[1][0] + '[' + str(taxon_report[1][1]) + ']' +
                  '\t' + str(round(taxon_report[2], 1)))
    else:
        print(
            'file\tListeria sensu stricto (rpoB)\tL. monocytogenes (rpoB)\tEscherichia spp. (rpoB)\tS. bongori (rpoB)\tS. enterica ' +
            '(rpoB)\tS. bongori (invA)\tsubsp. I (invA)\tsubsp. II (clade a: invA)\tsubsp. II' +
            ' (clade b: invA)\tsubsp. IIIa (invA)\tsubsp. IIIb (invA)\tsubsp. IV (invA)\tsubsp. VI (invA)\tsubsp. VII (invA)' +
            '\tsubsp. II (clade VIII: invA)')
        if report == 'percentage':
            for f in files:
                locus_scores, coverages, reads = kmer_lists(f, 27,
                                                            allmers, allmers_rpoB,
                                                            uniqmers_bongori,
                                                            uniqmers_I,
                                                            uniqmers_IIa,
                                                            uniqmers_IIb,
                                                            uniqmers_IIIa,
                                                            uniqmers_IIIb,
                                                            uniqmers_IV,
                                                            uniqmers_VI,
                                                            uniqmers_VII,
                                                            uniqmers_VIII,
                                                            uniqmers_bongori_rpoB,
                                                            uniqmers_S_enterica_rpoB,
                                                            uniqmers_Escherichia_rpoB,
                                                            uniqmers_Listeria_ss_rpoB,
                                                            uniqmers_Lmono_rpoB,
                                                            mode)
                pretty_scores = [str(round(score)) for score in locus_scores]
                print(f.split('/')[-1] + '\t' + '\t'.join(pretty_scores))
        else:
            for f in files:
                locus_scores, coverages, reads = kmer_lists(f, 27,
                                                            allmers, allmers_rpoB,
                                                            uniqmers_bongori,
                                                            uniqmers_I,
                                                            uniqmers_IIa,
                                                            uniqmers_IIb,
                                                            uniqmers_IIIa,
                                                            uniqmers_IIIb,
                                                            uniqmers_IV,
                                                            uniqmers_VI,
                                                            uniqmers_VII,
                                                            uniqmers_VIII,
                                                            uniqmers_bongori_rpoB,
                                                            uniqmers_S_enterica_rpoB,
                                                            uniqmers_Escherichia_rpoB,
                                                            uniqmers_Listeria_ss_rpoB,
                                                            uniqmers_Lmono_rpoB,
                                                            mode)
                pretty_covs = [str(round(cov, 1)) for cov in coverages]
                print(f.split('/')[-1] + '\t' + '\t'.join(pretty_covs))


if __name__ == '__main__':
    main()