comparison scripts/patho_typing.py @ 3:0cbed1c0a762 draft default tip

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Tue, 28 Jan 2020 10:42:31 -0500
parents 965517909457
children
comparison
equal deleted inserted replaced
2:6837f733b4aa 3:0cbed1c0a762
1 #!/usr/bin/env python 1 #!/usr/bin/env python3
2 2
3 # -*- coding: utf-8 -*- 3 # -*- coding: utf-8 -*-
4 4
5 """ 5 """
6 patho_typing.py - In silico pathogenic typing directly from raw 6 patho_typing.py - In silico pathogenic typing directly from raw
7 Illumina reads 7 Illumina reads
8 <https://github.com/B-UMMI/patho_typing/> 8 <https://github.com/B-UMMI/patho_typing/>
9 9
10 Copyright (C) 2017 Miguel Machado <mpmachado@medicina.ulisboa.pt> 10 Copyright (C) 2018 Miguel Machado <mpmachado@medicina.ulisboa.pt>
11 11
12 Last modified: July 06, 2017 12 Last modified: October 15, 2018
13 13
14 This program is free software: you can redistribute it and/or modify 14 This program is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by 15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or 16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version. 17 (at your option) any later version.
22 GNU General Public License for more details. 22 GNU General Public License for more details.
23 23
24 You should have received a copy of the GNU General Public License 24 You should have received a copy of the GNU General Public License
25 along with this program. If not, see <http://www.gnu.org/licenses/>. 25 along with this program. If not, see <http://www.gnu.org/licenses/>.
26 26
27 2017-12-05 ISS 27 2020-01-13 ISS
28 In order to use the module within the EURL_WGS_Typer pipeline with a 28 In order to use the module within the EURL_WGS_Typer pipeline with a
29 different virulence database for E coli, mapping against the 29 different virulence database for E coli, mapping against the
30 typing_rules.tab was deactivated. 30 typing_rules.tab was deactivated.
31 """ 31 """
32 32
33 import argparse 33 import argparse
34 import os 34 import os
35 import time 35 import time
36 import sys 36 import sys
37 37 from pkg_resources import resource_filename
38 import modules.utils as utils 38
39 import modules.run_rematch as run_rematch 39 try:
40 import modules.typing as typing 40 from __init__ import __version__
41 41
42 version = '0.3' 42 import modules.utils as utils
43 import modules.run_rematch as run_rematch
44 import modules.typing as typing
45 except ImportError:
46 from pathotyping.__init__ import __version__
47
48 from pathotyping.modules import utils as utils
49 from pathotyping.modules import run_rematch as run_rematch
50 from pathotyping.modules import typing as typing
43 51
44 52
45 def set_reference(species, outdir, script_path, trueCoverage): 53 def set_reference(species, outdir, script_path, trueCoverage):
46 reference_file = None 54 reference_file = None
47 trueCoverage_file = None 55 trueCoverage_file = None
52 typing_sequences = None 60 typing_sequences = None
53 typing_headers = None 61 typing_headers = None
54 typing_rules = None 62 typing_rules = None
55 typing_config = None 63 typing_config = None
56 64
57 species_folder = os.path.join(os.path.dirname(script_path), 'modules', 'seq_conf', '_'.join([s.lower() for s in species]), '') 65 species_folder = os.path.join(os.path.dirname(script_path), 'modules', 'seq_conf',
66 '_'.join([s.lower() for s in species]), '')
58 67
59 if os.path.isdir(species_folder): 68 if os.path.isdir(species_folder):
60 typing_rules = os.path.join(species_folder, 'typing_rules.tab') 69 typing_rules = os.path.join(species_folder, 'typing_rules.tab')
61 typing_file = os.path.join(species_folder, 'typing.fasta') 70 typing_file = os.path.join(species_folder, 'typing.fasta')
62 typing_sequences, ignore = utils.get_sequence_information(typing_file, 0) 71 typing_sequences, ignore = utils.get_sequence_information(typing_file, 0)
63 typing_sequences, typing_headers = utils.clean_headers_sequences(typing_sequences) 72 typing_sequences, typing_headers = utils.clean_headers_sequences(typing_sequences)
64 typing_sequences = utils.simplify_sequence_dict(typing_sequences) 73 typing_sequences = utils.simplify_sequence_dict(typing_sequences)
65 typing_config = os.path.join(species_folder, 'typing.config') 74 typing_config = os.path.join(species_folder, 'typing.config')
66 if trueCoverage: 75 if trueCoverage:
67 trueCoverage_file = os.path.join(species_folder, 'trueCoverage.fasta') 76 if os.path.isfile(os.path.join(species_folder, 'trueCoverage.fasta')):
68 trueCoverage_sequences, ignore = utils.get_sequence_information(trueCoverage_file, 0) 77 trueCoverage_file = os.path.join(species_folder, 'trueCoverage.fasta')
69 trueCoverage_sequences, trueCoverage_headers = utils.clean_headers_sequences(trueCoverage_sequences) 78 trueCoverage_sequences, ignore = utils.get_sequence_information(trueCoverage_file, 0)
70 trueCoverage_sequences = utils.simplify_sequence_dict(trueCoverage_sequences) 79 trueCoverage_sequences, trueCoverage_headers = utils.clean_headers_sequences(trueCoverage_sequences)
71 trueCoverage_config = os.path.join(species_folder, 'trueCoverage.config') 80 trueCoverage_sequences = utils.simplify_sequence_dict(trueCoverage_sequences)
72 81 trueCoverage_config = os.path.join(species_folder, 'trueCoverage.config')
73 trueCoverage_typing_sequences = trueCoverage_sequences.copy() 82
74 for header in typing_sequences: 83 trueCoverage_typing_sequences = trueCoverage_sequences.copy()
75 if header not in trueCoverage_sequences: 84 for header in typing_sequences:
76 trueCoverage_typing_sequences[header] = typing_sequences[header] 85 if header not in trueCoverage_sequences:
77 else: 86 trueCoverage_typing_sequences[header] = typing_sequences[header]
78 print 'Sequence {header} of typing.fasta already present in trueCoverage.fasta'.format(header=header) 87 else:
79 88 print('Sequence {header} of typing.fasta already present in'
80 reference_file = os.path.join(outdir, 'trueCoverage_typing.fasta') 89 ' trueCoverage.fasta'.format(header=header))
81 write_sequeces(reference_file, trueCoverage_typing_sequences) 90
91 reference_file = os.path.join(outdir, 'trueCoverage_typing.fasta')
92 write_sequeces(reference_file, trueCoverage_typing_sequences)
82 else: 93 else:
83 reference_file = os.path.join(outdir, 'typing.fasta') 94 reference_file = os.path.join(outdir, 'typing.fasta')
84 write_sequeces(reference_file, typing_sequences) 95 write_sequeces(reference_file, typing_sequences)
85 else: 96 else:
86 species_present = [] 97 species_present = []
87 seq_conf_folder = os.path.join(os.path.dirname(script_path), 'modules', 'seq_conf', '') 98 seq_conf_folder = os.path.join(os.path.dirname(script_path), 'modules', 'seq_conf', '')
88 species_folder = [d for d in os.listdir(seq_conf_folder) if not d.startswith('.') and os.path.isdir(os.path.join(seq_conf_folder, d, ''))] 99 species_folder = [d for d in os.listdir(seq_conf_folder) if
100 not d.startswith('.') and os.path.isdir(os.path.join(seq_conf_folder, d, ''))]
89 for species in species_folder: 101 for species in species_folder:
90 species = species.split('_') 102 species = species.split('_')
91 species[0] = species[0][0].upper() + species[0][1:] 103 species[0] = species[0][0].upper() + species[0][1:]
92 species_present.append(' '.join(species)) 104 species_present.append(' '.join(species))
93 sys.exit('Only these species are available:' + '\n' + '\n'.join(species_present)) 105 sys.exit('Only these species are available:' + '\n' + '\n'.join(species_present))
94 106
95 return reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, typing_file, typing_sequences, typing_headers, typing_rules, typing_config 107 return reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, \
96 108 typing_file, typing_sequences, typing_headers, typing_rules, typing_config
97
98 def confirm_genes_fasta_rules(typing_fasta_headers, typing_rules_file):
99 with open(typing_rules_file, 'rtU') as reader:
100 genes = []
101 for line in reader:
102 line = line.splitlines()[0]
103 if len(line) > 0:
104 line = line.split('\t')
105 if line[0].startswith('#'):
106 genes = line[1:]
107 break
108 missing_genes = []
109 for gene in genes:
110 if gene.lower() not in typing_fasta_headers:
111 missing_genes.append(gene)
112 if len(missing_genes) > 0:
113 sys.exit('The following genes required for typing are not present in typing fasta file: {missing_genes}'.format(missing_genes=', '.join(missing_genes)))
114 109
115 110
116 def index_fasta_samtools(fasta, region_None, region_outfile_none, print_comand_True): 111 def index_fasta_samtools(fasta, region_None, region_outfile_none, print_comand_True):
117 command = ['samtools', 'faidx', fasta, '', '', ''] 112 command = ['samtools', 'faidx', fasta, '', '', '']
118 shell_true = False 113 shell_true = False
138 def run_bowtie(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc): 133 def run_bowtie(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc):
139 sam_file = os.path.join(outdir, str('alignment.sam')) 134 sam_file = os.path.join(outdir, str('alignment.sam'))
140 135
141 run_successfully = indexSequenceBowtie2(referenceFile, threads) 136 run_successfully = indexSequenceBowtie2(referenceFile, threads)
142 if run_successfully: 137 if run_successfully:
143 command = ['bowtie2', '-k', str(numMapLoc), '-q', '', '--threads', str(threads), '-x', referenceFile, '', '--no-unal', '-S', sam_file] 138 command = ['bowtie2', '-k', str(numMapLoc), '-q', '', '--threads', str(threads), '-x', referenceFile, '',
139 '--no-unal', '-S', sam_file]
144 140
145 if len(fastq_files) == 1: 141 if len(fastq_files) == 1:
146 command[9] = '-U ' + fastq_files[0] 142 command[9] = '-U ' + fastq_files[0]
147 elif len(fastq_files) == 2: 143 elif len(fastq_files) == 2:
148 command[9] = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1] 144 command[9] = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1]
178 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) 174 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
179 return run_successfully 175 return run_successfully
180 176
181 177
182 def mapping_reads(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc): 178 def mapping_reads(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc):
183 print '\n' + 'Mapping the reads' + '\n' 179 print('\n' + 'Mapping the reads' + '\n')
184 run_successfully, sam_file = run_bowtie(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc) 180 run_successfully, sam_file = run_bowtie(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc)
181 bam_file = None
185 if run_successfully: 182 if run_successfully:
186 run_successfully, bam_file = sortAlignment(sam_file, str(os.path.splitext(sam_file)[0] + '.bam'), False, threads) 183 run_successfully, bam_file = sortAlignment(sam_file, str(os.path.splitext(sam_file)[0] + '.bam'), False,
184 threads)
187 if run_successfully: 185 if run_successfully:
188 os.remove(sam_file) 186 os.remove(sam_file)
189 run_successfully = indexAlignment(bam_file) 187 run_successfully = indexAlignment(bam_file)
190 if run_successfully: 188 if run_successfully:
191 index_fasta_samtools(referenceFile, None, None, True) 189 index_fasta_samtools(referenceFile, None, None, True)
192 return run_successfully, bam_file 190 return run_successfully, bam_file
193 191
194 192
195 def include_rematch_dependencies_path(): 193 def include_rematch_dependencies_path():
194 original_rematch = None
196 command = ['which', 'rematch.py'] 195 command = ['which', 'rematch.py']
197 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False) 196 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False)
198 if run_successfully: 197 if run_successfully:
199 rematch = stdout.splitlines()[0] 198 original_rematch = stdout.splitlines()[0]
200 utils.setPATHvariable(False, rematch) 199
201 return rematch 200 resource_rematch = None
201 try:
202 resource_rematch = resource_filename('ReMatCh', 'rematch.py')
203 except ModuleNotFoundError:
204 resource_rematch = original_rematch
205 else:
206 print('\n'
207 'Using ReMatCh "{resource_rematch}" via "{original_rematch}"\n'.format(resource_rematch=resource_rematch,
208 original_rematch=original_rematch))
209
210 if resource_rematch is not None:
211 utils.setPATHvariable(False, resource_rematch)
212 else:
213 sys.exit('ReMatCh not found in the PATH')
214
215 return resource_rematch
202 216
203 217
204 def split_bam(bam_file, list_sequences, outdir, threads): 218 def split_bam(bam_file, list_sequences, outdir, threads):
205 new_bam = os.path.join(outdir, 'partial.bam') 219 new_bam = os.path.join(outdir, 'partial.bam')
206 command = ['samtools', 'view', '-b', '-u', '-h', '-o', new_bam, '-@', str(threads), bam_file, ' '.join(list_sequences)] 220 command = ['samtools', 'view', '-b', '-u', '-h', '-o', new_bam, '-@', str(threads), bam_file,
221 ' '.join(list_sequences)]
207 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) 222 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
208 return run_successfully, new_bam 223 return run_successfully, new_bam
209 224
210 225
211 def parse_config(config_file): 226 def parse_config(config_file):
212 config = {'reference_file': None, 'length_extra_seq': None, 'maximum_number_absent_genes': None, 'maximum_number_genes_multiple_alleles': None, 'minimum_read_coverage': None, 'minimum_depth_presence': None, 'minimum_depth_call': None, 'minimum_depth_frequency_dominant_allele': None, 'minimum_gene_coverage': None, 'minimum_gene_identity': None} 227 config = {'reference_file': None, 'length_extra_seq': None, 'maximum_number_absent_genes': None,
213 228 'maximum_number_genes_multiple_alleles': None, 'minimum_read_coverage': None,
214 with open(config_file, 'rtU') as reader: 229 'minimum_depth_presence': None, 'minimum_depth_call': None,
230 'minimum_depth_frequency_dominant_allele': None, 'minimum_gene_coverage': None,
231 'minimum_gene_identity': None}
232
233 with open(config_file, 'rt') as reader:
215 field = None 234 field = None
216 for line in reader: 235 for line in reader:
217 line = line.splitlines()[0] 236 line = line.splitlines()[0]
218 if len(line) > 0: 237 if len(line) > 0:
219 line = line.split(' ')[0] 238 line = line.split(' ')[0]
220 if line.startswith('#'): 239 if line.startswith('#'):
221 line = line[1:].split(' ')[0] 240 line = line[1:].split(' ')[0]
222 field = line 241 field = line
223 else: 242 else:
224 if field is not None: 243 if field is not None:
225 if field in ['length_extra_seq', 'maximum_number_absent_genes', 'maximum_number_genes_multiple_alleles', 'minimum_read_coverage', 'minimum_depth_presence', 'minimum_depth_call', 'minimum_gene_coverage', 'minimum_gene_identity']: 244 if field in ['length_extra_seq', 'maximum_number_absent_genes',
245 'maximum_number_genes_multiple_alleles', 'minimum_read_coverage',
246 'minimum_depth_presence', 'minimum_depth_call', 'minimum_gene_coverage',
247 'minimum_gene_identity']:
226 line = int(line) 248 line = int(line)
227 if field in ['minimum_gene_coverage', 'minimum_gene_identity']: 249 if field in ['minimum_gene_coverage', 'minimum_gene_identity']:
228 if line < 0 or line > 100: 250 if line < 0 or line > 100:
229 sys.exit('minimum_gene_coverage in trueCoverage_rematch config file must be an integer between 0 and 100') 251 sys.exit('minimum_gene_coverage in trueCoverage_rematch config file must be an'
252 ' integer between 0 and 100')
230 elif field == 'minimum_depth_frequency_dominant_allele': 253 elif field == 'minimum_depth_frequency_dominant_allele':
231 line = float(line) 254 line = float(line)
232 if line < 0 or line > 1: 255 if line < 0 or line > 1:
233 sys.exit('minimum_depth_frequency_dominant_allele in trueCoverage_rematch config file must be a double between 0 and 1') 256 sys.exit('minimum_depth_frequency_dominant_allele in trueCoverage_rematch config file'
257 ' must be a double between 0 and 1')
234 config[field] = line 258 config[field] = line
235 field = None 259 field = None
236 260
237 for field in config: 261 for field in config:
238 if config[field] is None: 262 if config[field] is None:
256 writer.write('>' + header + '\n') 280 writer.write('>' + header + '\n')
257 writer.write('\n'.join(utils.chunkstring(sequences_dict[header]['sequence'], 80)) + '\n') 281 writer.write('\n'.join(utils.chunkstring(sequences_dict[header]['sequence'], 80)) + '\n')
258 282
259 283
260 def main(): 284 def main():
261 parser = argparse.ArgumentParser(prog='patho_typing.py', description='In silico pathogenic typing directly from raw Illumina reads', formatter_class=argparse.ArgumentDefaultsHelpFormatter) 285 parser = argparse.ArgumentParser(prog='patho_typing.py',
262 parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) 286 description='In silico pathogenic typing directly from raw Illumina reads',
287 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
288 parser.add_argument('--version', help='Version information', action='version',
289 version='{prog} v{version}'.format(prog=parser.prog, version=__version__))
263 290
264 parser_required = parser.add_argument_group('Required options') 291 parser_required = parser.add_argument_group('Required options')
265 parser_required.add_argument('-f', '--fastq', nargs='+', action=utils.required_length((1, 2), '--fastq'), type=argparse.FileType('r'), metavar=('/path/to/input/file.fq.gz'), help='Path to single OR paired-end fastq files. If two files are passed, they will be assumed as being the paired fastq files', required=True) 292 parser_required.add_argument('-f', '--fastq', nargs='+', action=utils.required_length((1, 2), '--fastq'),
266 parser_required.add_argument('-s', '--species', nargs=2, type=str, metavar=('Yersinia', 'enterocolitica'), help='Species name', required=True) 293 type=argparse.FileType('r'), metavar=('/path/to/input/file.fq.gz'),
294 help='Path to single OR paired-end fastq files. If two files are passed, they will be'
295 ' assumed as being the paired fastq files', required=True)
296 parser_required.add_argument('-s', '--species', nargs=2, type=str, metavar=('Yersinia', 'enterocolitica'),
297 help='Species name', required=True)
267 298
268 parser_optional_general = parser.add_argument_group('General facultative options') 299 parser_optional_general = parser.add_argument_group('General facultative options')
269 parser_optional_general.add_argument('-o', '--outdir', type=str, metavar='/path/to/output/directory/', help='Path to the directory where the information will be stored', required=False, default='.') 300 parser_optional_general.add_argument('-o', '--outdir', type=str, metavar='/path/to/output/directory/',
270 parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use', required=False, default=1) 301 help='Path to the directory where the information will be stored',
271 parser_optional_general.add_argument('--trueCoverage', action='store_true', help='Assess true coverage before continue typing') 302 required=False, default='.')
272 parser_optional_general.add_argument('--noCheckPoint', action='store_true', help='Ignore the true coverage checking point') 303 parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use',
273 parser_optional_general.add_argument('--minGeneCoverage', type=int, metavar='N', help='Minimum typing percentage of target reference gene sequence covered to consider a gene to be present (value between [0, 100])', required=False) 304 required=False, default=1)
274 parser_optional_general.add_argument('--minGeneIdentity', type=int, metavar='N', help='Minimum typing percentage of identity of reference gene sequence covered to consider a gene to be present (value between [0, 100]). One INDEL will be considered as one difference', required=False) 305 parser_optional_general.add_argument('--trueCoverage', action='store_true',
275 parser_optional_general.add_argument('--minGeneDepth', type=int, metavar='N', help='Minimum typing gene average coverage depth of present positions to consider a gene to be present (default 15, or 1/3 of average sample coverage assessed by true coverage analysis)', required=False) 306 help='Assess true coverage before continue typing')
276 parser_optional_general.add_argument('--doNotRemoveConsensus', action='store_true', help='Do not remove ReMatCh consensus sequences') 307 parser_optional_general.add_argument('--noCheckPoint', action='store_true',
277 parser_optional_general.add_argument('--debug', action='store_true', help='DeBug Mode: do not remove temporary files') 308 help='Ignore the true coverage checking point')
309 parser_optional_general.add_argument('--minGeneCoverage', type=int, metavar='N',
310 help='Minimum typing percentage of target reference gene sequence covered to'
311 ' consider a gene to be present (value between [0, 100])', required=False)
312 parser_optional_general.add_argument('--minGeneIdentity', type=int, metavar='N',
313 help='Minimum typing percentage of identity of reference gene sequence covered'
314 ' to consider a gene to be present (value between [0, 100]). One INDEL'
315 ' will be considered as one difference', required=False)
316 parser_optional_general.add_argument('--minGeneDepth', type=int, metavar='N',
317 help='Minimum typing gene average coverage depth of present positions to'
318 ' consider a gene to be present (default is 1/3 of average sample'
319 ' coverage or 15x)', required=False)
320 parser_optional_general.add_argument('--doNotRemoveConsensus', action='store_true',
321 help='Do not remove ReMatCh consensus sequences')
322 parser_optional_general.add_argument('--debug', action='store_true',
323 help='DeBug Mode: do not remove temporary files')
278 324
279 args = parser.parse_args() 325 args = parser.parse_args()
280 326
281 if args.minGeneCoverage is not None and (args.minGeneCoverage < 0 or args.minGeneCoverage > 100): 327 if args.minGeneCoverage is not None and (args.minGeneCoverage < 0 or args.minGeneCoverage > 100):
282 parser.error('--minGeneCoverage should be a value between [0, 100]') 328 parser.error('--minGeneCoverage should be a value between [0, 100]')
290 os.makedirs(args.outdir) 336 os.makedirs(args.outdir)
291 337
292 # Start logger 338 # Start logger
293 logfile, time_str = utils.start_logger(args.outdir) 339 logfile, time_str = utils.start_logger(args.outdir)
294 340
295 script_path = utils.general_information(logfile, version, args.outdir, time_str) 341 script_path = utils.general_information(logfile, __version__, args.outdir, time_str)
296 print '\n' 342 print('\n')
297 343
298 rematch = include_rematch_dependencies_path() 344 rematch = include_rematch_dependencies_path()
299 345
300 args.fastq = [fastq.name for fastq in args.fastq] 346 args.fastq = [fastq.name for fastq in args.fastq]
301 347
302 reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, typing_file, typing_sequences, typing_headers, typing_rules, typing_config = set_reference(args.species, args.outdir, script_path, args.trueCoverage) 348 reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, typing_file, \
349 typing_sequences, typing_headers, typing_rules, typing_config = \
350 set_reference(args.species, args.outdir, script_path, args.trueCoverage)
303 original_reference_file = str(reference_file) 351 original_reference_file = str(reference_file)
304
305 #confirm_genes_fasta_rules(typing_headers, typing_rules)
306 352
307 run_successfully, bam_file = mapping_reads(args.fastq, reference_file, args.threads, args.outdir, False, 1) 353 run_successfully, bam_file = mapping_reads(args.fastq, reference_file, args.threads, args.outdir, False, 1)
308 if run_successfully: 354 if run_successfully:
309 rematch_dir = os.path.join(args.outdir, 'rematch', '') 355 rematch_dir = os.path.join(args.outdir, 'rematch', '')
310 if not os.path.isdir(rematch_dir): 356 if not os.path.isdir(rematch_dir):
311 os.makedirs(rematch_dir) 357 os.makedirs(rematch_dir)
312 358
313 if args.trueCoverage: 359 if args.trueCoverage:
314 trueCoverage_dir = os.path.join(rematch_dir, 'trueCoverage', '') 360 if trueCoverage_file is not None:
315 if not os.path.isdir(trueCoverage_dir): 361 trueCoverage_dir = os.path.join(rematch_dir, 'trueCoverage', '')
316 os.makedirs(trueCoverage_dir) 362 if not os.path.isdir(trueCoverage_dir):
317 363 os.makedirs(trueCoverage_dir)
318 print '\n' 364
319 run_successfully, trueCoverage_bam = split_bam(bam_file, trueCoverage_headers, trueCoverage_dir, args.threads) 365 print('\n')
320 if run_successfully: 366 run_successfully, trueCoverage_bam = split_bam(bam_file, trueCoverage_headers, trueCoverage_dir,
321 run_successfully = indexAlignment(trueCoverage_bam) 367 args.threads)
322 if run_successfully: 368 if run_successfully:
323 reference_file = os.path.join(trueCoverage_dir, 'reference.fasta') 369 run_successfully = indexAlignment(trueCoverage_bam)
324 write_sequeces(reference_file, trueCoverage_sequences) 370 if run_successfully:
325 index_fasta_samtools(reference_file, None, None, True) 371 reference_file = os.path.join(trueCoverage_dir, 'reference.fasta')
326 config = parse_config(trueCoverage_config) 372 write_sequeces(reference_file, trueCoverage_sequences)
327 runtime, run_successfully, sample_data_general, data_by_gene = run_rematch.run_rematch(rematch, trueCoverage_dir, reference_file, trueCoverage_bam, args.threads, config['length_extra_seq'], config['minimum_depth_presence'], config['minimum_depth_call'], config['minimum_depth_frequency_dominant_allele'], config['minimum_gene_coverage'], config['minimum_gene_identity'], args.debug, args.doNotRemoveConsensus) 373 index_fasta_samtools(reference_file, None, None, True)
328 374 config = parse_config(trueCoverage_config)
329 if run_successfully and sample_data_general['mean_sample_coverage'] is not None and sample_data_general['number_absent_genes'] is not None and sample_data_general['number_genes_multiple_alleles'] is not None: 375 runtime, run_successfully, sample_data_general, data_by_gene = \
330 if args.minGeneDepth is None: 376 run_rematch.run_rematch(rematch, trueCoverage_dir, reference_file, trueCoverage_bam,
331 args.minGeneDepth = sample_data_general['mean_sample_coverage'] / 3 377 args.threads, config['length_extra_seq'],
332 378 config['minimum_depth_presence'], config['minimum_depth_call'],
333 exit_info = [] 379 config['minimum_depth_frequency_dominant_allele'],
334 if sample_data_general['mean_sample_coverage'] < config['minimum_read_coverage']: 380 config['minimum_gene_coverage'], config['minimum_gene_identity'],
335 exit_info.append('Sample coverage ({mean_sample_coverage}) lower than the minimum required ({minimum_read_coverage})'.format(mean_sample_coverage=sample_data_general['mean_sample_coverage'], minimum_read_coverage=config['minimum_read_coverage'])) 381 args.debug, args.doNotRemoveConsensus)
336 if sample_data_general['number_absent_genes'] > config['maximum_number_absent_genes']: 382
337 exit_info.append('Number of absent genes ({number_absent_genes}) higher than the maximum allowed ({maximum_number_absent_genes})'.format(number_absent_genes=sample_data_general['number_absent_genes'], maximum_number_absent_genes=config['maximum_number_absent_genes'])) 383 if run_successfully and sample_data_general['mean_sample_coverage'] is not None and \
338 if sample_data_general['number_genes_multiple_alleles'] > config['maximum_number_genes_multiple_alleles']: 384 sample_data_general['number_absent_genes'] is not None and \
339 exit_info.append('Number of genes with multiple alleles ({number_genes_multiple_alleles}) higher than the maximum allowed ({maximum_number_genes_multiple_alleles})'.format(number_genes_multiple_alleles=sample_data_general['number_genes_multiple_alleles'], maximum_number_genes_multiple_alleles=config['maximum_number_genes_multiple_alleles'])) 385 sample_data_general['number_genes_multiple_alleles'] is not None:
340 386 if args.minGeneDepth is None:
341 if len(exit_info) > 0: 387 args.minGeneDepth = sample_data_general['mean_sample_coverage'] / 3 if \
342 print '\n' + '\n'.join(exit_info) + '\n' 388 sample_data_general['mean_sample_coverage'] / 3 > 15 else \
343 e = 'TrueCoverage requirements not fulfilled' 389 15
344 print '\n' + e + '\n' 390
391 exit_info = []
392 if sample_data_general['mean_sample_coverage'] < config['minimum_read_coverage']:
393 exit_info.append('Sample coverage ({mean}) lower than the minimum'
394 ' required ({minimum})'
395 ''.format(mean=sample_data_general['mean_sample_coverage'],
396 minimum=config['minimum_read_coverage']))
397 if sample_data_general['number_absent_genes'] > config['maximum_number_absent_genes']:
398 exit_info.append('Number of absent genes ({number}) higher than the'
399 ' maximum allowed ({maximum})'
400 ''.format(number=sample_data_general['number_absent_genes'],
401 maximum=config['maximum_number_absent_genes']))
402 if sample_data_general['number_genes_multiple_alleles'] > \
403 config['maximum_number_genes_multiple_alleles']:
404 exit_info.append('Number of genes with multiple alleles'
405 ' ({number}) higher than the maximum'
406 ' allowed ({maximum})'
407 ''.format(number=sample_data_general['number_genes_multiple_alleles'],
408 maximum=config['maximum_number_genes_multiple_alleles']))
409
410 if len(exit_info) > 0:
411 print('\n' + '\n'.join(exit_info) + '\n')
412 e = 'TrueCoverage requirements not fulfilled'
413 print('\n' + e + '\n')
414 if not args.noCheckPoint:
415 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
416 _ = utils.runTime(start_time)
417 sys.exit(e)
418 else:
419 e = 'TrueCoverage module did not run successfully'
420 print('\n' + e + '\n')
345 if not args.noCheckPoint: 421 if not args.noCheckPoint:
346 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) 422 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
347 time_taken = utils.runTime(start_time) 423 _ = utils.runTime(start_time)
348 sys.exit(e) 424 sys.exit(e)
349 else: 425
350 e = 'TrueCoverage module did not run successfully' 426 print('\n')
351 print '\n' + e + '\n' 427 typing_dir = os.path.join(rematch_dir, 'typing', '')
352 if not args.noCheckPoint: 428 if not os.path.isdir(typing_dir):
353 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) 429 os.makedirs(typing_dir)
354 time_taken = utils.runTime(start_time) 430 run_successfully, bam_file = split_bam(bam_file, typing_headers, typing_dir, args.threads)
355 sys.exit(e)
356
357 print '\n'
358 typing_dir = os.path.join(rematch_dir, 'typing', '')
359 if not os.path.isdir(typing_dir):
360 os.makedirs(typing_dir)
361 run_successfully, bam_file = split_bam(bam_file, typing_headers, typing_dir, args.threads)
362 if run_successfully:
363 run_successfully = indexAlignment(bam_file)
364 if run_successfully: 431 if run_successfully:
365 reference_file = os.path.join(typing_dir, 'reference.fasta') 432 run_successfully = indexAlignment(bam_file)
366 write_sequeces(reference_file, typing_sequences) 433 if run_successfully:
367 index_fasta_samtools(reference_file, None, None, True) 434 reference_file = os.path.join(typing_dir, 'reference.fasta')
368 rematch_dir = str(typing_dir) 435 write_sequeces(reference_file, typing_sequences)
369 if not run_successfully: 436 index_fasta_samtools(reference_file, None, None, True)
370 if args.noCheckPoint: 437 rematch_dir = str(typing_dir)
371 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) 438 if not run_successfully:
372 time_taken = utils.runTime(start_time) 439 if args.noCheckPoint:
373 sys.exit('Something in the required TrueCoverage analysis went wrong') 440 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
441 _ = utils.runTime(start_time)
442 sys.exit('Something in the required TrueCoverage analysis went wrong')
443 else:
444 print('\n'
445 'WARNING: it was not found trueCoverage target files. trueCoverage will not run.'
446 '\n')
374 447
375 if run_successfully: 448 if run_successfully:
376 config = parse_config(typing_config) 449 config = parse_config(typing_config)
377 if args.minGeneCoverage is not None: 450 if args.minGeneCoverage is not None:
378 config['minimum_gene_coverage'] = args.minGeneCoverage 451 config['minimum_gene_coverage'] = args.minGeneCoverage
379 if args.minGeneIdentity is not None: 452 if args.minGeneIdentity is not None:
380 config['minimum_gene_identity'] = args.minGeneIdentity 453 config['minimum_gene_identity'] = args.minGeneIdentity
381 454
382 runtime, run_successfully, sample_data_general, data_by_gene = run_rematch.run_rematch(rematch, rematch_dir, reference_file, bam_file, args.threads, config['length_extra_seq'], config['minimum_depth_presence'], config['minimum_depth_call'], config['minimum_depth_frequency_dominant_allele'], config['minimum_gene_coverage'], config['minimum_gene_identity'], args.debug, args.doNotRemoveConsensus) 455 runtime, run_successfully, sample_data_general, data_by_gene = \
456 run_rematch.run_rematch(rematch, rematch_dir, reference_file, bam_file, args.threads,
457 config['length_extra_seq'], config['minimum_depth_presence'],
458 config['minimum_depth_call'], config['minimum_depth_frequency_dominant_allele'],
459 config['minimum_gene_coverage'], config['minimum_gene_identity'],
460 args.debug, args.doNotRemoveConsensus)
383 if run_successfully and data_by_gene is not None: 461 if run_successfully and data_by_gene is not None:
384 if args.minGeneDepth is None: 462 if args.minGeneDepth is None:
385 args.minGeneDepth = 15 463 args.minGeneDepth = sample_data_general['mean_sample_coverage'] / 3 if \
386 464 sample_data_general['mean_sample_coverage'] / 3 > 15 else \
387 #runtime, ignore, ignore = typing.typing(data_by_gene, typing_rules, config['minimum_gene_coverage'], config['minimum_gene_identity'], args.minGeneDepth, args.outdir) 465 15
388 else: 466 else:
389 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) 467 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
390 time_taken = utils.runTime(start_time) 468 _ = utils.runTime(start_time)
391 sys.exit('ReMatCh run for pathotyping did not run successfully') 469 sys.exit('ReMatCh run for pathotyping did not run successfully')
392 else: 470 else:
393 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) 471 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
394 time_taken = utils.runTime(start_time) 472 _ = utils.runTime(start_time)
395 sys.exit('Something did not run successfully') 473 sys.exit('Something did not run successfully')
396 474
397 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) 475 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
398 476
399 print '\n' 477 print('\n')
400 time_taken = utils.runTime(start_time) 478 _ = utils.runTime(start_time)
401 del time_taken
402 479
403 480
404 if __name__ == "__main__": 481 if __name__ == "__main__":
405 main() 482 main()