Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
comparison scripts/patho_typing.py @ 0:965517909457 draft
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author | cstrittmatter |
---|---|
date | Wed, 22 Jan 2020 08:41:44 -0500 |
parents | |
children | 0cbed1c0a762 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:965517909457 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # -*- coding: utf-8 -*- | |
4 | |
5 """ | |
6 patho_typing.py - In silico pathogenic typing directly from raw | |
7 Illumina reads | |
8 <https://github.com/B-UMMI/patho_typing/> | |
9 | |
10 Copyright (C) 2017 Miguel Machado <mpmachado@medicina.ulisboa.pt> | |
11 | |
12 Last modified: July 06, 2017 | |
13 | |
14 This program is free software: you can redistribute it and/or modify | |
15 it under the terms of the GNU General Public License as published by | |
16 the Free Software Foundation, either version 3 of the License, or | |
17 (at your option) any later version. | |
18 | |
19 This program is distributed in the hope that it will be useful, | |
20 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
22 GNU General Public License for more details. | |
23 | |
24 You should have received a copy of the GNU General Public License | |
25 along with this program. If not, see <http://www.gnu.org/licenses/>. | |
26 | |
27 2017-12-05 ISS | |
28 In order to use the module within the EURL_WGS_Typer pipeline with a | |
29 different virulence database for E coli, mapping against the | |
30 typing_rules.tab was deactivated. | |
31 """ | |
32 | |
33 import argparse | |
34 import os | |
35 import time | |
36 import sys | |
37 | |
38 import modules.utils as utils | |
39 import modules.run_rematch as run_rematch | |
40 import modules.typing as typing | |
41 | |
42 version = '0.3' | |
43 | |
44 | |
45 def set_reference(species, outdir, script_path, trueCoverage): | |
46 reference_file = None | |
47 trueCoverage_file = None | |
48 trueCoverage_sequences = None | |
49 trueCoverage_headers = None | |
50 trueCoverage_config = None | |
51 typing_file = None | |
52 typing_sequences = None | |
53 typing_headers = None | |
54 typing_rules = None | |
55 typing_config = None | |
56 | |
57 species_folder = os.path.join(os.path.dirname(script_path), 'modules', 'seq_conf', '_'.join([s.lower() for s in species]), '') | |
58 | |
59 if os.path.isdir(species_folder): | |
60 typing_rules = os.path.join(species_folder, 'typing_rules.tab') | |
61 typing_file = os.path.join(species_folder, 'typing.fasta') | |
62 typing_sequences, ignore = utils.get_sequence_information(typing_file, 0) | |
63 typing_sequences, typing_headers = utils.clean_headers_sequences(typing_sequences) | |
64 typing_sequences = utils.simplify_sequence_dict(typing_sequences) | |
65 typing_config = os.path.join(species_folder, 'typing.config') | |
66 if trueCoverage: | |
67 trueCoverage_file = os.path.join(species_folder, 'trueCoverage.fasta') | |
68 trueCoverage_sequences, ignore = utils.get_sequence_information(trueCoverage_file, 0) | |
69 trueCoverage_sequences, trueCoverage_headers = utils.clean_headers_sequences(trueCoverage_sequences) | |
70 trueCoverage_sequences = utils.simplify_sequence_dict(trueCoverage_sequences) | |
71 trueCoverage_config = os.path.join(species_folder, 'trueCoverage.config') | |
72 | |
73 trueCoverage_typing_sequences = trueCoverage_sequences.copy() | |
74 for header in typing_sequences: | |
75 if header not in trueCoverage_sequences: | |
76 trueCoverage_typing_sequences[header] = typing_sequences[header] | |
77 else: | |
78 print 'Sequence {header} of typing.fasta already present in trueCoverage.fasta'.format(header=header) | |
79 | |
80 reference_file = os.path.join(outdir, 'trueCoverage_typing.fasta') | |
81 write_sequeces(reference_file, trueCoverage_typing_sequences) | |
82 else: | |
83 reference_file = os.path.join(outdir, 'typing.fasta') | |
84 write_sequeces(reference_file, typing_sequences) | |
85 else: | |
86 species_present = [] | |
87 seq_conf_folder = os.path.join(os.path.dirname(script_path), 'modules', 'seq_conf', '') | |
88 species_folder = [d for d in os.listdir(seq_conf_folder) if not d.startswith('.') and os.path.isdir(os.path.join(seq_conf_folder, d, ''))] | |
89 for species in species_folder: | |
90 species = species.split('_') | |
91 species[0] = species[0][0].upper() + species[0][1:] | |
92 species_present.append(' '.join(species)) | |
93 sys.exit('Only these species are available:' + '\n' + '\n'.join(species_present)) | |
94 | |
95 return reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, typing_file, typing_sequences, typing_headers, typing_rules, typing_config | |
96 | |
97 | |
98 def confirm_genes_fasta_rules(typing_fasta_headers, typing_rules_file): | |
99 with open(typing_rules_file, 'rtU') as reader: | |
100 genes = [] | |
101 for line in reader: | |
102 line = line.splitlines()[0] | |
103 if len(line) > 0: | |
104 line = line.split('\t') | |
105 if line[0].startswith('#'): | |
106 genes = line[1:] | |
107 break | |
108 missing_genes = [] | |
109 for gene in genes: | |
110 if gene.lower() not in typing_fasta_headers: | |
111 missing_genes.append(gene) | |
112 if len(missing_genes) > 0: | |
113 sys.exit('The following genes required for typing are not present in typing fasta file: {missing_genes}'.format(missing_genes=', '.join(missing_genes))) | |
114 | |
115 | |
116 def index_fasta_samtools(fasta, region_None, region_outfile_none, print_comand_True): | |
117 command = ['samtools', 'faidx', fasta, '', '', ''] | |
118 shell_true = False | |
119 if region_None is not None: | |
120 command[3] = region_None | |
121 if region_outfile_none is not None: | |
122 command[4] = '>' | |
123 command[5] = region_outfile_none | |
124 shell_true = True | |
125 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, shell_true, None, print_comand_True) | |
126 return run_successfully, stdout | |
127 | |
128 | |
129 def indexSequenceBowtie2(referenceFile, threads): | |
130 if os.path.isfile(str(referenceFile + '.1.bt2')): | |
131 run_successfully = True | |
132 else: | |
133 command = ['bowtie2-build', '--threads', str(threads), referenceFile, referenceFile] | |
134 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) | |
135 return run_successfully | |
136 | |
137 | |
138 def run_bowtie(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc): | |
139 sam_file = os.path.join(outdir, str('alignment.sam')) | |
140 | |
141 run_successfully = indexSequenceBowtie2(referenceFile, threads) | |
142 if run_successfully: | |
143 command = ['bowtie2', '-k', str(numMapLoc), '-q', '', '--threads', str(threads), '-x', referenceFile, '', '--no-unal', '-S', sam_file] | |
144 | |
145 if len(fastq_files) == 1: | |
146 command[9] = '-U ' + fastq_files[0] | |
147 elif len(fastq_files) == 2: | |
148 command[9] = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1] | |
149 else: | |
150 return False, None | |
151 | |
152 if conserved_True: | |
153 command[4] = '--sensitive' | |
154 else: | |
155 command[4] = '--very-sensitive-local' | |
156 | |
157 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) | |
158 | |
159 if not run_successfully: | |
160 sam_file = None | |
161 | |
162 return run_successfully, sam_file | |
163 | |
164 | |
165 def sortAlignment(alignment_file, output_file, sortByName_True, threads): | |
166 outFormat_string = os.path.splitext(output_file)[1][1:].lower() | |
167 command = ['samtools', 'sort', '-o', output_file, '-O', outFormat_string, '', '-@', str(threads), alignment_file] | |
168 if sortByName_True: | |
169 command[6] = '-n' | |
170 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) | |
171 if not run_successfully: | |
172 output_file = None | |
173 return run_successfully, output_file | |
174 | |
175 | |
176 def indexAlignment(alignment_file): | |
177 command = ['samtools', 'index', alignment_file] | |
178 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) | |
179 return run_successfully | |
180 | |
181 | |
182 def mapping_reads(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc): | |
183 print '\n' + 'Mapping the reads' + '\n' | |
184 run_successfully, sam_file = run_bowtie(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc) | |
185 if run_successfully: | |
186 run_successfully, bam_file = sortAlignment(sam_file, str(os.path.splitext(sam_file)[0] + '.bam'), False, threads) | |
187 if run_successfully: | |
188 os.remove(sam_file) | |
189 run_successfully = indexAlignment(bam_file) | |
190 if run_successfully: | |
191 index_fasta_samtools(referenceFile, None, None, True) | |
192 return run_successfully, bam_file | |
193 | |
194 | |
195 def include_rematch_dependencies_path(): | |
196 command = ['which', 'rematch.py'] | |
197 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False) | |
198 if run_successfully: | |
199 rematch = stdout.splitlines()[0] | |
200 utils.setPATHvariable(False, rematch) | |
201 return rematch | |
202 | |
203 | |
204 def split_bam(bam_file, list_sequences, outdir, threads): | |
205 new_bam = os.path.join(outdir, 'partial.bam') | |
206 command = ['samtools', 'view', '-b', '-u', '-h', '-o', new_bam, '-@', str(threads), bam_file, ' '.join(list_sequences)] | |
207 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) | |
208 return run_successfully, new_bam | |
209 | |
210 | |
211 def parse_config(config_file): | |
212 config = {'reference_file': None, 'length_extra_seq': None, 'maximum_number_absent_genes': None, 'maximum_number_genes_multiple_alleles': None, 'minimum_read_coverage': None, 'minimum_depth_presence': None, 'minimum_depth_call': None, 'minimum_depth_frequency_dominant_allele': None, 'minimum_gene_coverage': None, 'minimum_gene_identity': None} | |
213 | |
214 with open(config_file, 'rtU') as reader: | |
215 field = None | |
216 for line in reader: | |
217 line = line.splitlines()[0] | |
218 if len(line) > 0: | |
219 line = line.split(' ')[0] | |
220 if line.startswith('#'): | |
221 line = line[1:].split(' ')[0] | |
222 field = line | |
223 else: | |
224 if field is not None: | |
225 if field in ['length_extra_seq', 'maximum_number_absent_genes', 'maximum_number_genes_multiple_alleles', 'minimum_read_coverage', 'minimum_depth_presence', 'minimum_depth_call', 'minimum_gene_coverage', 'minimum_gene_identity']: | |
226 line = int(line) | |
227 if field in ['minimum_gene_coverage', 'minimum_gene_identity']: | |
228 if line < 0 or line > 100: | |
229 sys.exit('minimum_gene_coverage in trueCoverage_rematch config file must be an integer between 0 and 100') | |
230 elif field == 'minimum_depth_frequency_dominant_allele': | |
231 line = float(line) | |
232 if line < 0 or line > 1: | |
233 sys.exit('minimum_depth_frequency_dominant_allele in trueCoverage_rematch config file must be a double between 0 and 1') | |
234 config[field] = line | |
235 field = None | |
236 | |
237 for field in config: | |
238 if config[field] is None: | |
239 sys.exit(field + ' in trueCoverage_rematch config file is missing') | |
240 | |
241 return config | |
242 | |
243 | |
244 def clean_pathotyping_folder(outdir, reference_file, debug_mode_true): | |
245 if not debug_mode_true: | |
246 files = [f for f in os.listdir(outdir) if not f.startswith('.') and os.path.isfile(os.path.join(outdir, f))] | |
247 for file_found in files: | |
248 if file_found.startswith(('alignment.', os.path.splitext(os.path.basename(reference_file))[0])): | |
249 file_found = os.path.join(outdir, file_found) | |
250 os.remove(file_found) | |
251 | |
252 | |
253 def write_sequeces(out_file, sequences_dict): | |
254 with open(out_file, 'wt') as writer: | |
255 for header in sequences_dict: | |
256 writer.write('>' + header + '\n') | |
257 writer.write('\n'.join(utils.chunkstring(sequences_dict[header]['sequence'], 80)) + '\n') | |
258 | |
259 | |
260 def main(): | |
261 parser = argparse.ArgumentParser(prog='patho_typing.py', description='In silico pathogenic typing directly from raw Illumina reads', formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |
262 parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) | |
263 | |
264 parser_required = parser.add_argument_group('Required options') | |
265 parser_required.add_argument('-f', '--fastq', nargs='+', action=utils.required_length((1, 2), '--fastq'), type=argparse.FileType('r'), metavar=('/path/to/input/file.fq.gz'), help='Path to single OR paired-end fastq files. If two files are passed, they will be assumed as being the paired fastq files', required=True) | |
266 parser_required.add_argument('-s', '--species', nargs=2, type=str, metavar=('Yersinia', 'enterocolitica'), help='Species name', required=True) | |
267 | |
268 parser_optional_general = parser.add_argument_group('General facultative options') | |
269 parser_optional_general.add_argument('-o', '--outdir', type=str, metavar='/path/to/output/directory/', help='Path to the directory where the information will be stored', required=False, default='.') | |
270 parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use', required=False, default=1) | |
271 parser_optional_general.add_argument('--trueCoverage', action='store_true', help='Assess true coverage before continue typing') | |
272 parser_optional_general.add_argument('--noCheckPoint', action='store_true', help='Ignore the true coverage checking point') | |
273 parser_optional_general.add_argument('--minGeneCoverage', type=int, metavar='N', help='Minimum typing percentage of target reference gene sequence covered to consider a gene to be present (value between [0, 100])', required=False) | |
274 parser_optional_general.add_argument('--minGeneIdentity', type=int, metavar='N', help='Minimum typing percentage of identity of reference gene sequence covered to consider a gene to be present (value between [0, 100]). One INDEL will be considered as one difference', required=False) | |
275 parser_optional_general.add_argument('--minGeneDepth', type=int, metavar='N', help='Minimum typing gene average coverage depth of present positions to consider a gene to be present (default 15, or 1/3 of average sample coverage assessed by true coverage analysis)', required=False) | |
276 parser_optional_general.add_argument('--doNotRemoveConsensus', action='store_true', help='Do not remove ReMatCh consensus sequences') | |
277 parser_optional_general.add_argument('--debug', action='store_true', help='DeBug Mode: do not remove temporary files') | |
278 | |
279 args = parser.parse_args() | |
280 | |
281 if args.minGeneCoverage is not None and (args.minGeneCoverage < 0 or args.minGeneCoverage > 100): | |
282 parser.error('--minGeneCoverage should be a value between [0, 100]') | |
283 if args.minGeneIdentity is not None and (args.minGeneIdentity < 0 or args.minGeneIdentity > 100): | |
284 parser.error('--minGeneIdentity should be a value between [0, 100]') | |
285 | |
286 start_time = time.time() | |
287 | |
288 args.outdir = os.path.abspath(args.outdir) | |
289 if not os.path.isdir(args.outdir): | |
290 os.makedirs(args.outdir) | |
291 | |
292 # Start logger | |
293 logfile, time_str = utils.start_logger(args.outdir) | |
294 | |
295 script_path = utils.general_information(logfile, version, args.outdir, time_str) | |
296 print '\n' | |
297 | |
298 rematch = include_rematch_dependencies_path() | |
299 | |
300 args.fastq = [fastq.name for fastq in args.fastq] | |
301 | |
302 reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, typing_file, typing_sequences, typing_headers, typing_rules, typing_config = set_reference(args.species, args.outdir, script_path, args.trueCoverage) | |
303 original_reference_file = str(reference_file) | |
304 | |
305 #confirm_genes_fasta_rules(typing_headers, typing_rules) | |
306 | |
307 run_successfully, bam_file = mapping_reads(args.fastq, reference_file, args.threads, args.outdir, False, 1) | |
308 if run_successfully: | |
309 rematch_dir = os.path.join(args.outdir, 'rematch', '') | |
310 if not os.path.isdir(rematch_dir): | |
311 os.makedirs(rematch_dir) | |
312 | |
313 if args.trueCoverage: | |
314 trueCoverage_dir = os.path.join(rematch_dir, 'trueCoverage', '') | |
315 if not os.path.isdir(trueCoverage_dir): | |
316 os.makedirs(trueCoverage_dir) | |
317 | |
318 print '\n' | |
319 run_successfully, trueCoverage_bam = split_bam(bam_file, trueCoverage_headers, trueCoverage_dir, args.threads) | |
320 if run_successfully: | |
321 run_successfully = indexAlignment(trueCoverage_bam) | |
322 if run_successfully: | |
323 reference_file = os.path.join(trueCoverage_dir, 'reference.fasta') | |
324 write_sequeces(reference_file, trueCoverage_sequences) | |
325 index_fasta_samtools(reference_file, None, None, True) | |
326 config = parse_config(trueCoverage_config) | |
327 runtime, run_successfully, sample_data_general, data_by_gene = run_rematch.run_rematch(rematch, trueCoverage_dir, reference_file, trueCoverage_bam, args.threads, config['length_extra_seq'], config['minimum_depth_presence'], config['minimum_depth_call'], config['minimum_depth_frequency_dominant_allele'], config['minimum_gene_coverage'], config['minimum_gene_identity'], args.debug, args.doNotRemoveConsensus) | |
328 | |
329 if run_successfully and sample_data_general['mean_sample_coverage'] is not None and sample_data_general['number_absent_genes'] is not None and sample_data_general['number_genes_multiple_alleles'] is not None: | |
330 if args.minGeneDepth is None: | |
331 args.minGeneDepth = sample_data_general['mean_sample_coverage'] / 3 | |
332 | |
333 exit_info = [] | |
334 if sample_data_general['mean_sample_coverage'] < config['minimum_read_coverage']: | |
335 exit_info.append('Sample coverage ({mean_sample_coverage}) lower than the minimum required ({minimum_read_coverage})'.format(mean_sample_coverage=sample_data_general['mean_sample_coverage'], minimum_read_coverage=config['minimum_read_coverage'])) | |
336 if sample_data_general['number_absent_genes'] > config['maximum_number_absent_genes']: | |
337 exit_info.append('Number of absent genes ({number_absent_genes}) higher than the maximum allowed ({maximum_number_absent_genes})'.format(number_absent_genes=sample_data_general['number_absent_genes'], maximum_number_absent_genes=config['maximum_number_absent_genes'])) | |
338 if sample_data_general['number_genes_multiple_alleles'] > config['maximum_number_genes_multiple_alleles']: | |
339 exit_info.append('Number of genes with multiple alleles ({number_genes_multiple_alleles}) higher than the maximum allowed ({maximum_number_genes_multiple_alleles})'.format(number_genes_multiple_alleles=sample_data_general['number_genes_multiple_alleles'], maximum_number_genes_multiple_alleles=config['maximum_number_genes_multiple_alleles'])) | |
340 | |
341 if len(exit_info) > 0: | |
342 print '\n' + '\n'.join(exit_info) + '\n' | |
343 e = 'TrueCoverage requirements not fulfilled' | |
344 print '\n' + e + '\n' | |
345 if not args.noCheckPoint: | |
346 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) | |
347 time_taken = utils.runTime(start_time) | |
348 sys.exit(e) | |
349 else: | |
350 e = 'TrueCoverage module did not run successfully' | |
351 print '\n' + e + '\n' | |
352 if not args.noCheckPoint: | |
353 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) | |
354 time_taken = utils.runTime(start_time) | |
355 sys.exit(e) | |
356 | |
357 print '\n' | |
358 typing_dir = os.path.join(rematch_dir, 'typing', '') | |
359 if not os.path.isdir(typing_dir): | |
360 os.makedirs(typing_dir) | |
361 run_successfully, bam_file = split_bam(bam_file, typing_headers, typing_dir, args.threads) | |
362 if run_successfully: | |
363 run_successfully = indexAlignment(bam_file) | |
364 if run_successfully: | |
365 reference_file = os.path.join(typing_dir, 'reference.fasta') | |
366 write_sequeces(reference_file, typing_sequences) | |
367 index_fasta_samtools(reference_file, None, None, True) | |
368 rematch_dir = str(typing_dir) | |
369 if not run_successfully: | |
370 if args.noCheckPoint: | |
371 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) | |
372 time_taken = utils.runTime(start_time) | |
373 sys.exit('Something in the required TrueCoverage analysis went wrong') | |
374 | |
375 if run_successfully: | |
376 config = parse_config(typing_config) | |
377 if args.minGeneCoverage is not None: | |
378 config['minimum_gene_coverage'] = args.minGeneCoverage | |
379 if args.minGeneIdentity is not None: | |
380 config['minimum_gene_identity'] = args.minGeneIdentity | |
381 | |
382 runtime, run_successfully, sample_data_general, data_by_gene = run_rematch.run_rematch(rematch, rematch_dir, reference_file, bam_file, args.threads, config['length_extra_seq'], config['minimum_depth_presence'], config['minimum_depth_call'], config['minimum_depth_frequency_dominant_allele'], config['minimum_gene_coverage'], config['minimum_gene_identity'], args.debug, args.doNotRemoveConsensus) | |
383 if run_successfully and data_by_gene is not None: | |
384 if args.minGeneDepth is None: | |
385 args.minGeneDepth = 15 | |
386 | |
387 #runtime, ignore, ignore = typing.typing(data_by_gene, typing_rules, config['minimum_gene_coverage'], config['minimum_gene_identity'], args.minGeneDepth, args.outdir) | |
388 else: | |
389 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) | |
390 time_taken = utils.runTime(start_time) | |
391 sys.exit('ReMatCh run for pathotyping did not run successfully') | |
392 else: | |
393 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) | |
394 time_taken = utils.runTime(start_time) | |
395 sys.exit('Something did not run successfully') | |
396 | |
397 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) | |
398 | |
399 print '\n' | |
400 time_taken = utils.runTime(start_time) | |
401 del time_taken | |
402 | |
403 | |
404 if __name__ == "__main__": | |
405 main() |