Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
comparison scripts/patho_typing.py @ 3:0cbed1c0a762 draft default tip
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
| author | cstrittmatter |
|---|---|
| date | Tue, 28 Jan 2020 10:42:31 -0500 |
| parents | 965517909457 |
| children |
comparison
equal
deleted
inserted
replaced
| 2:6837f733b4aa | 3:0cbed1c0a762 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python3 |
| 2 | 2 |
| 3 # -*- coding: utf-8 -*- | 3 # -*- coding: utf-8 -*- |
| 4 | 4 |
| 5 """ | 5 """ |
| 6 patho_typing.py - In silico pathogenic typing directly from raw | 6 patho_typing.py - In silico pathogenic typing directly from raw |
| 7 Illumina reads | 7 Illumina reads |
| 8 <https://github.com/B-UMMI/patho_typing/> | 8 <https://github.com/B-UMMI/patho_typing/> |
| 9 | 9 |
| 10 Copyright (C) 2017 Miguel Machado <mpmachado@medicina.ulisboa.pt> | 10 Copyright (C) 2018 Miguel Machado <mpmachado@medicina.ulisboa.pt> |
| 11 | 11 |
| 12 Last modified: July 06, 2017 | 12 Last modified: October 15, 2018 |
| 13 | 13 |
| 14 This program is free software: you can redistribute it and/or modify | 14 This program is free software: you can redistribute it and/or modify |
| 15 it under the terms of the GNU General Public License as published by | 15 it under the terms of the GNU General Public License as published by |
| 16 the Free Software Foundation, either version 3 of the License, or | 16 the Free Software Foundation, either version 3 of the License, or |
| 17 (at your option) any later version. | 17 (at your option) any later version. |
| 22 GNU General Public License for more details. | 22 GNU General Public License for more details. |
| 23 | 23 |
| 24 You should have received a copy of the GNU General Public License | 24 You should have received a copy of the GNU General Public License |
| 25 along with this program. If not, see <http://www.gnu.org/licenses/>. | 25 along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 26 | 26 |
| 27 2017-12-05 ISS | 27 2020-01-13 ISS |
| 28 In order to use the module within the EURL_WGS_Typer pipeline with a | 28 In order to use the module within the EURL_WGS_Typer pipeline with a |
| 29 different virulence database for E coli, mapping against the | 29 different virulence database for E coli, mapping against the |
| 30 typing_rules.tab was deactivated. | 30 typing_rules.tab was deactivated. |
| 31 """ | 31 """ |
| 32 | 32 |
| 33 import argparse | 33 import argparse |
| 34 import os | 34 import os |
| 35 import time | 35 import time |
| 36 import sys | 36 import sys |
| 37 | 37 from pkg_resources import resource_filename |
| 38 import modules.utils as utils | 38 |
| 39 import modules.run_rematch as run_rematch | 39 try: |
| 40 import modules.typing as typing | 40 from __init__ import __version__ |
| 41 | 41 |
| 42 version = '0.3' | 42 import modules.utils as utils |
| 43 import modules.run_rematch as run_rematch | |
| 44 import modules.typing as typing | |
| 45 except ImportError: | |
| 46 from pathotyping.__init__ import __version__ | |
| 47 | |
| 48 from pathotyping.modules import utils as utils | |
| 49 from pathotyping.modules import run_rematch as run_rematch | |
| 50 from pathotyping.modules import typing as typing | |
| 43 | 51 |
| 44 | 52 |
| 45 def set_reference(species, outdir, script_path, trueCoverage): | 53 def set_reference(species, outdir, script_path, trueCoverage): |
| 46 reference_file = None | 54 reference_file = None |
| 47 trueCoverage_file = None | 55 trueCoverage_file = None |
| 52 typing_sequences = None | 60 typing_sequences = None |
| 53 typing_headers = None | 61 typing_headers = None |
| 54 typing_rules = None | 62 typing_rules = None |
| 55 typing_config = None | 63 typing_config = None |
| 56 | 64 |
| 57 species_folder = os.path.join(os.path.dirname(script_path), 'modules', 'seq_conf', '_'.join([s.lower() for s in species]), '') | 65 species_folder = os.path.join(os.path.dirname(script_path), 'modules', 'seq_conf', |
| 66 '_'.join([s.lower() for s in species]), '') | |
| 58 | 67 |
| 59 if os.path.isdir(species_folder): | 68 if os.path.isdir(species_folder): |
| 60 typing_rules = os.path.join(species_folder, 'typing_rules.tab') | 69 typing_rules = os.path.join(species_folder, 'typing_rules.tab') |
| 61 typing_file = os.path.join(species_folder, 'typing.fasta') | 70 typing_file = os.path.join(species_folder, 'typing.fasta') |
| 62 typing_sequences, ignore = utils.get_sequence_information(typing_file, 0) | 71 typing_sequences, ignore = utils.get_sequence_information(typing_file, 0) |
| 63 typing_sequences, typing_headers = utils.clean_headers_sequences(typing_sequences) | 72 typing_sequences, typing_headers = utils.clean_headers_sequences(typing_sequences) |
| 64 typing_sequences = utils.simplify_sequence_dict(typing_sequences) | 73 typing_sequences = utils.simplify_sequence_dict(typing_sequences) |
| 65 typing_config = os.path.join(species_folder, 'typing.config') | 74 typing_config = os.path.join(species_folder, 'typing.config') |
| 66 if trueCoverage: | 75 if trueCoverage: |
| 67 trueCoverage_file = os.path.join(species_folder, 'trueCoverage.fasta') | 76 if os.path.isfile(os.path.join(species_folder, 'trueCoverage.fasta')): |
| 68 trueCoverage_sequences, ignore = utils.get_sequence_information(trueCoverage_file, 0) | 77 trueCoverage_file = os.path.join(species_folder, 'trueCoverage.fasta') |
| 69 trueCoverage_sequences, trueCoverage_headers = utils.clean_headers_sequences(trueCoverage_sequences) | 78 trueCoverage_sequences, ignore = utils.get_sequence_information(trueCoverage_file, 0) |
| 70 trueCoverage_sequences = utils.simplify_sequence_dict(trueCoverage_sequences) | 79 trueCoverage_sequences, trueCoverage_headers = utils.clean_headers_sequences(trueCoverage_sequences) |
| 71 trueCoverage_config = os.path.join(species_folder, 'trueCoverage.config') | 80 trueCoverage_sequences = utils.simplify_sequence_dict(trueCoverage_sequences) |
| 72 | 81 trueCoverage_config = os.path.join(species_folder, 'trueCoverage.config') |
| 73 trueCoverage_typing_sequences = trueCoverage_sequences.copy() | 82 |
| 74 for header in typing_sequences: | 83 trueCoverage_typing_sequences = trueCoverage_sequences.copy() |
| 75 if header not in trueCoverage_sequences: | 84 for header in typing_sequences: |
| 76 trueCoverage_typing_sequences[header] = typing_sequences[header] | 85 if header not in trueCoverage_sequences: |
| 77 else: | 86 trueCoverage_typing_sequences[header] = typing_sequences[header] |
| 78 print 'Sequence {header} of typing.fasta already present in trueCoverage.fasta'.format(header=header) | 87 else: |
| 79 | 88 print('Sequence {header} of typing.fasta already present in' |
| 80 reference_file = os.path.join(outdir, 'trueCoverage_typing.fasta') | 89 ' trueCoverage.fasta'.format(header=header)) |
| 81 write_sequeces(reference_file, trueCoverage_typing_sequences) | 90 |
| 91 reference_file = os.path.join(outdir, 'trueCoverage_typing.fasta') | |
| 92 write_sequeces(reference_file, trueCoverage_typing_sequences) | |
| 82 else: | 93 else: |
| 83 reference_file = os.path.join(outdir, 'typing.fasta') | 94 reference_file = os.path.join(outdir, 'typing.fasta') |
| 84 write_sequeces(reference_file, typing_sequences) | 95 write_sequeces(reference_file, typing_sequences) |
| 85 else: | 96 else: |
| 86 species_present = [] | 97 species_present = [] |
| 87 seq_conf_folder = os.path.join(os.path.dirname(script_path), 'modules', 'seq_conf', '') | 98 seq_conf_folder = os.path.join(os.path.dirname(script_path), 'modules', 'seq_conf', '') |
| 88 species_folder = [d for d in os.listdir(seq_conf_folder) if not d.startswith('.') and os.path.isdir(os.path.join(seq_conf_folder, d, ''))] | 99 species_folder = [d for d in os.listdir(seq_conf_folder) if |
| 100 not d.startswith('.') and os.path.isdir(os.path.join(seq_conf_folder, d, ''))] | |
| 89 for species in species_folder: | 101 for species in species_folder: |
| 90 species = species.split('_') | 102 species = species.split('_') |
| 91 species[0] = species[0][0].upper() + species[0][1:] | 103 species[0] = species[0][0].upper() + species[0][1:] |
| 92 species_present.append(' '.join(species)) | 104 species_present.append(' '.join(species)) |
| 93 sys.exit('Only these species are available:' + '\n' + '\n'.join(species_present)) | 105 sys.exit('Only these species are available:' + '\n' + '\n'.join(species_present)) |
| 94 | 106 |
| 95 return reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, typing_file, typing_sequences, typing_headers, typing_rules, typing_config | 107 return reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, \ |
| 96 | 108 typing_file, typing_sequences, typing_headers, typing_rules, typing_config |
| 97 | |
| 98 def confirm_genes_fasta_rules(typing_fasta_headers, typing_rules_file): | |
| 99 with open(typing_rules_file, 'rtU') as reader: | |
| 100 genes = [] | |
| 101 for line in reader: | |
| 102 line = line.splitlines()[0] | |
| 103 if len(line) > 0: | |
| 104 line = line.split('\t') | |
| 105 if line[0].startswith('#'): | |
| 106 genes = line[1:] | |
| 107 break | |
| 108 missing_genes = [] | |
| 109 for gene in genes: | |
| 110 if gene.lower() not in typing_fasta_headers: | |
| 111 missing_genes.append(gene) | |
| 112 if len(missing_genes) > 0: | |
| 113 sys.exit('The following genes required for typing are not present in typing fasta file: {missing_genes}'.format(missing_genes=', '.join(missing_genes))) | |
| 114 | 109 |
| 115 | 110 |
| 116 def index_fasta_samtools(fasta, region_None, region_outfile_none, print_comand_True): | 111 def index_fasta_samtools(fasta, region_None, region_outfile_none, print_comand_True): |
| 117 command = ['samtools', 'faidx', fasta, '', '', ''] | 112 command = ['samtools', 'faidx', fasta, '', '', ''] |
| 118 shell_true = False | 113 shell_true = False |
| 138 def run_bowtie(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc): | 133 def run_bowtie(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc): |
| 139 sam_file = os.path.join(outdir, str('alignment.sam')) | 134 sam_file = os.path.join(outdir, str('alignment.sam')) |
| 140 | 135 |
| 141 run_successfully = indexSequenceBowtie2(referenceFile, threads) | 136 run_successfully = indexSequenceBowtie2(referenceFile, threads) |
| 142 if run_successfully: | 137 if run_successfully: |
| 143 command = ['bowtie2', '-k', str(numMapLoc), '-q', '', '--threads', str(threads), '-x', referenceFile, '', '--no-unal', '-S', sam_file] | 138 command = ['bowtie2', '-k', str(numMapLoc), '-q', '', '--threads', str(threads), '-x', referenceFile, '', |
| 139 '--no-unal', '-S', sam_file] | |
| 144 | 140 |
| 145 if len(fastq_files) == 1: | 141 if len(fastq_files) == 1: |
| 146 command[9] = '-U ' + fastq_files[0] | 142 command[9] = '-U ' + fastq_files[0] |
| 147 elif len(fastq_files) == 2: | 143 elif len(fastq_files) == 2: |
| 148 command[9] = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1] | 144 command[9] = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1] |
| 178 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) | 174 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) |
| 179 return run_successfully | 175 return run_successfully |
| 180 | 176 |
| 181 | 177 |
| 182 def mapping_reads(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc): | 178 def mapping_reads(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc): |
| 183 print '\n' + 'Mapping the reads' + '\n' | 179 print('\n' + 'Mapping the reads' + '\n') |
| 184 run_successfully, sam_file = run_bowtie(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc) | 180 run_successfully, sam_file = run_bowtie(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc) |
| 181 bam_file = None | |
| 185 if run_successfully: | 182 if run_successfully: |
| 186 run_successfully, bam_file = sortAlignment(sam_file, str(os.path.splitext(sam_file)[0] + '.bam'), False, threads) | 183 run_successfully, bam_file = sortAlignment(sam_file, str(os.path.splitext(sam_file)[0] + '.bam'), False, |
| 184 threads) | |
| 187 if run_successfully: | 185 if run_successfully: |
| 188 os.remove(sam_file) | 186 os.remove(sam_file) |
| 189 run_successfully = indexAlignment(bam_file) | 187 run_successfully = indexAlignment(bam_file) |
| 190 if run_successfully: | 188 if run_successfully: |
| 191 index_fasta_samtools(referenceFile, None, None, True) | 189 index_fasta_samtools(referenceFile, None, None, True) |
| 192 return run_successfully, bam_file | 190 return run_successfully, bam_file |
| 193 | 191 |
| 194 | 192 |
| 195 def include_rematch_dependencies_path(): | 193 def include_rematch_dependencies_path(): |
| 194 original_rematch = None | |
| 196 command = ['which', 'rematch.py'] | 195 command = ['which', 'rematch.py'] |
| 197 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False) | 196 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False) |
| 198 if run_successfully: | 197 if run_successfully: |
| 199 rematch = stdout.splitlines()[0] | 198 original_rematch = stdout.splitlines()[0] |
| 200 utils.setPATHvariable(False, rematch) | 199 |
| 201 return rematch | 200 resource_rematch = None |
| 201 try: | |
| 202 resource_rematch = resource_filename('ReMatCh', 'rematch.py') | |
| 203 except ModuleNotFoundError: | |
| 204 resource_rematch = original_rematch | |
| 205 else: | |
| 206 print('\n' | |
| 207 'Using ReMatCh "{resource_rematch}" via "{original_rematch}"\n'.format(resource_rematch=resource_rematch, | |
| 208 original_rematch=original_rematch)) | |
| 209 | |
| 210 if resource_rematch is not None: | |
| 211 utils.setPATHvariable(False, resource_rematch) | |
| 212 else: | |
| 213 sys.exit('ReMatCh not found in the PATH') | |
| 214 | |
| 215 return resource_rematch | |
| 202 | 216 |
| 203 | 217 |
| 204 def split_bam(bam_file, list_sequences, outdir, threads): | 218 def split_bam(bam_file, list_sequences, outdir, threads): |
| 205 new_bam = os.path.join(outdir, 'partial.bam') | 219 new_bam = os.path.join(outdir, 'partial.bam') |
| 206 command = ['samtools', 'view', '-b', '-u', '-h', '-o', new_bam, '-@', str(threads), bam_file, ' '.join(list_sequences)] | 220 command = ['samtools', 'view', '-b', '-u', '-h', '-o', new_bam, '-@', str(threads), bam_file, |
| 221 ' '.join(list_sequences)] | |
| 207 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) | 222 run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True) |
| 208 return run_successfully, new_bam | 223 return run_successfully, new_bam |
| 209 | 224 |
| 210 | 225 |
| 211 def parse_config(config_file): | 226 def parse_config(config_file): |
| 212 config = {'reference_file': None, 'length_extra_seq': None, 'maximum_number_absent_genes': None, 'maximum_number_genes_multiple_alleles': None, 'minimum_read_coverage': None, 'minimum_depth_presence': None, 'minimum_depth_call': None, 'minimum_depth_frequency_dominant_allele': None, 'minimum_gene_coverage': None, 'minimum_gene_identity': None} | 227 config = {'reference_file': None, 'length_extra_seq': None, 'maximum_number_absent_genes': None, |
| 213 | 228 'maximum_number_genes_multiple_alleles': None, 'minimum_read_coverage': None, |
| 214 with open(config_file, 'rtU') as reader: | 229 'minimum_depth_presence': None, 'minimum_depth_call': None, |
| 230 'minimum_depth_frequency_dominant_allele': None, 'minimum_gene_coverage': None, | |
| 231 'minimum_gene_identity': None} | |
| 232 | |
| 233 with open(config_file, 'rt') as reader: | |
| 215 field = None | 234 field = None |
| 216 for line in reader: | 235 for line in reader: |
| 217 line = line.splitlines()[0] | 236 line = line.splitlines()[0] |
| 218 if len(line) > 0: | 237 if len(line) > 0: |
| 219 line = line.split(' ')[0] | 238 line = line.split(' ')[0] |
| 220 if line.startswith('#'): | 239 if line.startswith('#'): |
| 221 line = line[1:].split(' ')[0] | 240 line = line[1:].split(' ')[0] |
| 222 field = line | 241 field = line |
| 223 else: | 242 else: |
| 224 if field is not None: | 243 if field is not None: |
| 225 if field in ['length_extra_seq', 'maximum_number_absent_genes', 'maximum_number_genes_multiple_alleles', 'minimum_read_coverage', 'minimum_depth_presence', 'minimum_depth_call', 'minimum_gene_coverage', 'minimum_gene_identity']: | 244 if field in ['length_extra_seq', 'maximum_number_absent_genes', |
| 245 'maximum_number_genes_multiple_alleles', 'minimum_read_coverage', | |
| 246 'minimum_depth_presence', 'minimum_depth_call', 'minimum_gene_coverage', | |
| 247 'minimum_gene_identity']: | |
| 226 line = int(line) | 248 line = int(line) |
| 227 if field in ['minimum_gene_coverage', 'minimum_gene_identity']: | 249 if field in ['minimum_gene_coverage', 'minimum_gene_identity']: |
| 228 if line < 0 or line > 100: | 250 if line < 0 or line > 100: |
| 229 sys.exit('minimum_gene_coverage in trueCoverage_rematch config file must be an integer between 0 and 100') | 251 sys.exit('minimum_gene_coverage in trueCoverage_rematch config file must be an' |
| 252 ' integer between 0 and 100') | |
| 230 elif field == 'minimum_depth_frequency_dominant_allele': | 253 elif field == 'minimum_depth_frequency_dominant_allele': |
| 231 line = float(line) | 254 line = float(line) |
| 232 if line < 0 or line > 1: | 255 if line < 0 or line > 1: |
| 233 sys.exit('minimum_depth_frequency_dominant_allele in trueCoverage_rematch config file must be a double between 0 and 1') | 256 sys.exit('minimum_depth_frequency_dominant_allele in trueCoverage_rematch config file' |
| 257 ' must be a double between 0 and 1') | |
| 234 config[field] = line | 258 config[field] = line |
| 235 field = None | 259 field = None |
| 236 | 260 |
| 237 for field in config: | 261 for field in config: |
| 238 if config[field] is None: | 262 if config[field] is None: |
| 256 writer.write('>' + header + '\n') | 280 writer.write('>' + header + '\n') |
| 257 writer.write('\n'.join(utils.chunkstring(sequences_dict[header]['sequence'], 80)) + '\n') | 281 writer.write('\n'.join(utils.chunkstring(sequences_dict[header]['sequence'], 80)) + '\n') |
| 258 | 282 |
| 259 | 283 |
| 260 def main(): | 284 def main(): |
| 261 parser = argparse.ArgumentParser(prog='patho_typing.py', description='In silico pathogenic typing directly from raw Illumina reads', formatter_class=argparse.ArgumentDefaultsHelpFormatter) | 285 parser = argparse.ArgumentParser(prog='patho_typing.py', |
| 262 parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) | 286 description='In silico pathogenic typing directly from raw Illumina reads', |
| 287 formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |
| 288 parser.add_argument('--version', help='Version information', action='version', | |
| 289 version='{prog} v{version}'.format(prog=parser.prog, version=__version__)) | |
| 263 | 290 |
| 264 parser_required = parser.add_argument_group('Required options') | 291 parser_required = parser.add_argument_group('Required options') |
| 265 parser_required.add_argument('-f', '--fastq', nargs='+', action=utils.required_length((1, 2), '--fastq'), type=argparse.FileType('r'), metavar=('/path/to/input/file.fq.gz'), help='Path to single OR paired-end fastq files. If two files are passed, they will be assumed as being the paired fastq files', required=True) | 292 parser_required.add_argument('-f', '--fastq', nargs='+', action=utils.required_length((1, 2), '--fastq'), |
| 266 parser_required.add_argument('-s', '--species', nargs=2, type=str, metavar=('Yersinia', 'enterocolitica'), help='Species name', required=True) | 293 type=argparse.FileType('r'), metavar=('/path/to/input/file.fq.gz'), |
| 294 help='Path to single OR paired-end fastq files. If two files are passed, they will be' | |
| 295 ' assumed as being the paired fastq files', required=True) | |
| 296 parser_required.add_argument('-s', '--species', nargs=2, type=str, metavar=('Yersinia', 'enterocolitica'), | |
| 297 help='Species name', required=True) | |
| 267 | 298 |
| 268 parser_optional_general = parser.add_argument_group('General facultative options') | 299 parser_optional_general = parser.add_argument_group('General facultative options') |
| 269 parser_optional_general.add_argument('-o', '--outdir', type=str, metavar='/path/to/output/directory/', help='Path to the directory where the information will be stored', required=False, default='.') | 300 parser_optional_general.add_argument('-o', '--outdir', type=str, metavar='/path/to/output/directory/', |
| 270 parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use', required=False, default=1) | 301 help='Path to the directory where the information will be stored', |
| 271 parser_optional_general.add_argument('--trueCoverage', action='store_true', help='Assess true coverage before continue typing') | 302 required=False, default='.') |
| 272 parser_optional_general.add_argument('--noCheckPoint', action='store_true', help='Ignore the true coverage checking point') | 303 parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use', |
| 273 parser_optional_general.add_argument('--minGeneCoverage', type=int, metavar='N', help='Minimum typing percentage of target reference gene sequence covered to consider a gene to be present (value between [0, 100])', required=False) | 304 required=False, default=1) |
| 274 parser_optional_general.add_argument('--minGeneIdentity', type=int, metavar='N', help='Minimum typing percentage of identity of reference gene sequence covered to consider a gene to be present (value between [0, 100]). One INDEL will be considered as one difference', required=False) | 305 parser_optional_general.add_argument('--trueCoverage', action='store_true', |
| 275 parser_optional_general.add_argument('--minGeneDepth', type=int, metavar='N', help='Minimum typing gene average coverage depth of present positions to consider a gene to be present (default 15, or 1/3 of average sample coverage assessed by true coverage analysis)', required=False) | 306 help='Assess true coverage before continue typing') |
| 276 parser_optional_general.add_argument('--doNotRemoveConsensus', action='store_true', help='Do not remove ReMatCh consensus sequences') | 307 parser_optional_general.add_argument('--noCheckPoint', action='store_true', |
| 277 parser_optional_general.add_argument('--debug', action='store_true', help='DeBug Mode: do not remove temporary files') | 308 help='Ignore the true coverage checking point') |
| 309 parser_optional_general.add_argument('--minGeneCoverage', type=int, metavar='N', | |
| 310 help='Minimum typing percentage of target reference gene sequence covered to' | |
| 311 ' consider a gene to be present (value between [0, 100])', required=False) | |
| 312 parser_optional_general.add_argument('--minGeneIdentity', type=int, metavar='N', | |
| 313 help='Minimum typing percentage of identity of reference gene sequence covered' | |
| 314 ' to consider a gene to be present (value between [0, 100]). One INDEL' | |
| 315 ' will be considered as one difference', required=False) | |
| 316 parser_optional_general.add_argument('--minGeneDepth', type=int, metavar='N', | |
| 317 help='Minimum typing gene average coverage depth of present positions to' | |
| 318 ' consider a gene to be present (default is 1/3 of average sample' | |
| 319 ' coverage or 15x)', required=False) | |
| 320 parser_optional_general.add_argument('--doNotRemoveConsensus', action='store_true', | |
| 321 help='Do not remove ReMatCh consensus sequences') | |
| 322 parser_optional_general.add_argument('--debug', action='store_true', | |
| 323 help='DeBug Mode: do not remove temporary files') | |
| 278 | 324 |
| 279 args = parser.parse_args() | 325 args = parser.parse_args() |
| 280 | 326 |
| 281 if args.minGeneCoverage is not None and (args.minGeneCoverage < 0 or args.minGeneCoverage > 100): | 327 if args.minGeneCoverage is not None and (args.minGeneCoverage < 0 or args.minGeneCoverage > 100): |
| 282 parser.error('--minGeneCoverage should be a value between [0, 100]') | 328 parser.error('--minGeneCoverage should be a value between [0, 100]') |
| 290 os.makedirs(args.outdir) | 336 os.makedirs(args.outdir) |
| 291 | 337 |
| 292 # Start logger | 338 # Start logger |
| 293 logfile, time_str = utils.start_logger(args.outdir) | 339 logfile, time_str = utils.start_logger(args.outdir) |
| 294 | 340 |
| 295 script_path = utils.general_information(logfile, version, args.outdir, time_str) | 341 script_path = utils.general_information(logfile, __version__, args.outdir, time_str) |
| 296 print '\n' | 342 print('\n') |
| 297 | 343 |
| 298 rematch = include_rematch_dependencies_path() | 344 rematch = include_rematch_dependencies_path() |
| 299 | 345 |
| 300 args.fastq = [fastq.name for fastq in args.fastq] | 346 args.fastq = [fastq.name for fastq in args.fastq] |
| 301 | 347 |
| 302 reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, typing_file, typing_sequences, typing_headers, typing_rules, typing_config = set_reference(args.species, args.outdir, script_path, args.trueCoverage) | 348 reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, typing_file, \ |
| 349 typing_sequences, typing_headers, typing_rules, typing_config = \ | |
| 350 set_reference(args.species, args.outdir, script_path, args.trueCoverage) | |
| 303 original_reference_file = str(reference_file) | 351 original_reference_file = str(reference_file) |
| 304 | |
| 305 #confirm_genes_fasta_rules(typing_headers, typing_rules) | |
| 306 | 352 |
| 307 run_successfully, bam_file = mapping_reads(args.fastq, reference_file, args.threads, args.outdir, False, 1) | 353 run_successfully, bam_file = mapping_reads(args.fastq, reference_file, args.threads, args.outdir, False, 1) |
| 308 if run_successfully: | 354 if run_successfully: |
| 309 rematch_dir = os.path.join(args.outdir, 'rematch', '') | 355 rematch_dir = os.path.join(args.outdir, 'rematch', '') |
| 310 if not os.path.isdir(rematch_dir): | 356 if not os.path.isdir(rematch_dir): |
| 311 os.makedirs(rematch_dir) | 357 os.makedirs(rematch_dir) |
| 312 | 358 |
| 313 if args.trueCoverage: | 359 if args.trueCoverage: |
| 314 trueCoverage_dir = os.path.join(rematch_dir, 'trueCoverage', '') | 360 if trueCoverage_file is not None: |
| 315 if not os.path.isdir(trueCoverage_dir): | 361 trueCoverage_dir = os.path.join(rematch_dir, 'trueCoverage', '') |
| 316 os.makedirs(trueCoverage_dir) | 362 if not os.path.isdir(trueCoverage_dir): |
| 317 | 363 os.makedirs(trueCoverage_dir) |
| 318 print '\n' | 364 |
| 319 run_successfully, trueCoverage_bam = split_bam(bam_file, trueCoverage_headers, trueCoverage_dir, args.threads) | 365 print('\n') |
| 320 if run_successfully: | 366 run_successfully, trueCoverage_bam = split_bam(bam_file, trueCoverage_headers, trueCoverage_dir, |
| 321 run_successfully = indexAlignment(trueCoverage_bam) | 367 args.threads) |
| 322 if run_successfully: | 368 if run_successfully: |
| 323 reference_file = os.path.join(trueCoverage_dir, 'reference.fasta') | 369 run_successfully = indexAlignment(trueCoverage_bam) |
| 324 write_sequeces(reference_file, trueCoverage_sequences) | 370 if run_successfully: |
| 325 index_fasta_samtools(reference_file, None, None, True) | 371 reference_file = os.path.join(trueCoverage_dir, 'reference.fasta') |
| 326 config = parse_config(trueCoverage_config) | 372 write_sequeces(reference_file, trueCoverage_sequences) |
| 327 runtime, run_successfully, sample_data_general, data_by_gene = run_rematch.run_rematch(rematch, trueCoverage_dir, reference_file, trueCoverage_bam, args.threads, config['length_extra_seq'], config['minimum_depth_presence'], config['minimum_depth_call'], config['minimum_depth_frequency_dominant_allele'], config['minimum_gene_coverage'], config['minimum_gene_identity'], args.debug, args.doNotRemoveConsensus) | 373 index_fasta_samtools(reference_file, None, None, True) |
| 328 | 374 config = parse_config(trueCoverage_config) |
| 329 if run_successfully and sample_data_general['mean_sample_coverage'] is not None and sample_data_general['number_absent_genes'] is not None and sample_data_general['number_genes_multiple_alleles'] is not None: | 375 runtime, run_successfully, sample_data_general, data_by_gene = \ |
| 330 if args.minGeneDepth is None: | 376 run_rematch.run_rematch(rematch, trueCoverage_dir, reference_file, trueCoverage_bam, |
| 331 args.minGeneDepth = sample_data_general['mean_sample_coverage'] / 3 | 377 args.threads, config['length_extra_seq'], |
| 332 | 378 config['minimum_depth_presence'], config['minimum_depth_call'], |
| 333 exit_info = [] | 379 config['minimum_depth_frequency_dominant_allele'], |
| 334 if sample_data_general['mean_sample_coverage'] < config['minimum_read_coverage']: | 380 config['minimum_gene_coverage'], config['minimum_gene_identity'], |
| 335 exit_info.append('Sample coverage ({mean_sample_coverage}) lower than the minimum required ({minimum_read_coverage})'.format(mean_sample_coverage=sample_data_general['mean_sample_coverage'], minimum_read_coverage=config['minimum_read_coverage'])) | 381 args.debug, args.doNotRemoveConsensus) |
| 336 if sample_data_general['number_absent_genes'] > config['maximum_number_absent_genes']: | 382 |
| 337 exit_info.append('Number of absent genes ({number_absent_genes}) higher than the maximum allowed ({maximum_number_absent_genes})'.format(number_absent_genes=sample_data_general['number_absent_genes'], maximum_number_absent_genes=config['maximum_number_absent_genes'])) | 383 if run_successfully and sample_data_general['mean_sample_coverage'] is not None and \ |
| 338 if sample_data_general['number_genes_multiple_alleles'] > config['maximum_number_genes_multiple_alleles']: | 384 sample_data_general['number_absent_genes'] is not None and \ |
| 339 exit_info.append('Number of genes with multiple alleles ({number_genes_multiple_alleles}) higher than the maximum allowed ({maximum_number_genes_multiple_alleles})'.format(number_genes_multiple_alleles=sample_data_general['number_genes_multiple_alleles'], maximum_number_genes_multiple_alleles=config['maximum_number_genes_multiple_alleles'])) | 385 sample_data_general['number_genes_multiple_alleles'] is not None: |
| 340 | 386 if args.minGeneDepth is None: |
| 341 if len(exit_info) > 0: | 387 args.minGeneDepth = sample_data_general['mean_sample_coverage'] / 3 if \ |
| 342 print '\n' + '\n'.join(exit_info) + '\n' | 388 sample_data_general['mean_sample_coverage'] / 3 > 15 else \ |
| 343 e = 'TrueCoverage requirements not fulfilled' | 389 15 |
| 344 print '\n' + e + '\n' | 390 |
| 391 exit_info = [] | |
| 392 if sample_data_general['mean_sample_coverage'] < config['minimum_read_coverage']: | |
| 393 exit_info.append('Sample coverage ({mean}) lower than the minimum' | |
| 394 ' required ({minimum})' | |
| 395 ''.format(mean=sample_data_general['mean_sample_coverage'], | |
| 396 minimum=config['minimum_read_coverage'])) | |
| 397 if sample_data_general['number_absent_genes'] > config['maximum_number_absent_genes']: | |
| 398 exit_info.append('Number of absent genes ({number}) higher than the' | |
| 399 ' maximum allowed ({maximum})' | |
| 400 ''.format(number=sample_data_general['number_absent_genes'], | |
| 401 maximum=config['maximum_number_absent_genes'])) | |
| 402 if sample_data_general['number_genes_multiple_alleles'] > \ | |
| 403 config['maximum_number_genes_multiple_alleles']: | |
| 404 exit_info.append('Number of genes with multiple alleles' | |
| 405 ' ({number}) higher than the maximum' | |
| 406 ' allowed ({maximum})' | |
| 407 ''.format(number=sample_data_general['number_genes_multiple_alleles'], | |
| 408 maximum=config['maximum_number_genes_multiple_alleles'])) | |
| 409 | |
| 410 if len(exit_info) > 0: | |
| 411 print('\n' + '\n'.join(exit_info) + '\n') | |
| 412 e = 'TrueCoverage requirements not fulfilled' | |
| 413 print('\n' + e + '\n') | |
| 414 if not args.noCheckPoint: | |
| 415 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) | |
| 416 _ = utils.runTime(start_time) | |
| 417 sys.exit(e) | |
| 418 else: | |
| 419 e = 'TrueCoverage module did not run successfully' | |
| 420 print('\n' + e + '\n') | |
| 345 if not args.noCheckPoint: | 421 if not args.noCheckPoint: |
| 346 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) | 422 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) |
| 347 time_taken = utils.runTime(start_time) | 423 _ = utils.runTime(start_time) |
| 348 sys.exit(e) | 424 sys.exit(e) |
| 349 else: | 425 |
| 350 e = 'TrueCoverage module did not run successfully' | 426 print('\n') |
| 351 print '\n' + e + '\n' | 427 typing_dir = os.path.join(rematch_dir, 'typing', '') |
| 352 if not args.noCheckPoint: | 428 if not os.path.isdir(typing_dir): |
| 353 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) | 429 os.makedirs(typing_dir) |
| 354 time_taken = utils.runTime(start_time) | 430 run_successfully, bam_file = split_bam(bam_file, typing_headers, typing_dir, args.threads) |
| 355 sys.exit(e) | |
| 356 | |
| 357 print '\n' | |
| 358 typing_dir = os.path.join(rematch_dir, 'typing', '') | |
| 359 if not os.path.isdir(typing_dir): | |
| 360 os.makedirs(typing_dir) | |
| 361 run_successfully, bam_file = split_bam(bam_file, typing_headers, typing_dir, args.threads) | |
| 362 if run_successfully: | |
| 363 run_successfully = indexAlignment(bam_file) | |
| 364 if run_successfully: | 431 if run_successfully: |
| 365 reference_file = os.path.join(typing_dir, 'reference.fasta') | 432 run_successfully = indexAlignment(bam_file) |
| 366 write_sequeces(reference_file, typing_sequences) | 433 if run_successfully: |
| 367 index_fasta_samtools(reference_file, None, None, True) | 434 reference_file = os.path.join(typing_dir, 'reference.fasta') |
| 368 rematch_dir = str(typing_dir) | 435 write_sequeces(reference_file, typing_sequences) |
| 369 if not run_successfully: | 436 index_fasta_samtools(reference_file, None, None, True) |
| 370 if args.noCheckPoint: | 437 rematch_dir = str(typing_dir) |
| 371 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) | 438 if not run_successfully: |
| 372 time_taken = utils.runTime(start_time) | 439 if args.noCheckPoint: |
| 373 sys.exit('Something in the required TrueCoverage analysis went wrong') | 440 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) |
| 441 _ = utils.runTime(start_time) | |
| 442 sys.exit('Something in the required TrueCoverage analysis went wrong') | |
| 443 else: | |
| 444 print('\n' | |
| 445 'WARNING: it was not found trueCoverage target files. trueCoverage will not run.' | |
| 446 '\n') | |
| 374 | 447 |
| 375 if run_successfully: | 448 if run_successfully: |
| 376 config = parse_config(typing_config) | 449 config = parse_config(typing_config) |
| 377 if args.minGeneCoverage is not None: | 450 if args.minGeneCoverage is not None: |
| 378 config['minimum_gene_coverage'] = args.minGeneCoverage | 451 config['minimum_gene_coverage'] = args.minGeneCoverage |
| 379 if args.minGeneIdentity is not None: | 452 if args.minGeneIdentity is not None: |
| 380 config['minimum_gene_identity'] = args.minGeneIdentity | 453 config['minimum_gene_identity'] = args.minGeneIdentity |
| 381 | 454 |
| 382 runtime, run_successfully, sample_data_general, data_by_gene = run_rematch.run_rematch(rematch, rematch_dir, reference_file, bam_file, args.threads, config['length_extra_seq'], config['minimum_depth_presence'], config['minimum_depth_call'], config['minimum_depth_frequency_dominant_allele'], config['minimum_gene_coverage'], config['minimum_gene_identity'], args.debug, args.doNotRemoveConsensus) | 455 runtime, run_successfully, sample_data_general, data_by_gene = \ |
| 456 run_rematch.run_rematch(rematch, rematch_dir, reference_file, bam_file, args.threads, | |
| 457 config['length_extra_seq'], config['minimum_depth_presence'], | |
| 458 config['minimum_depth_call'], config['minimum_depth_frequency_dominant_allele'], | |
| 459 config['minimum_gene_coverage'], config['minimum_gene_identity'], | |
| 460 args.debug, args.doNotRemoveConsensus) | |
| 383 if run_successfully and data_by_gene is not None: | 461 if run_successfully and data_by_gene is not None: |
| 384 if args.minGeneDepth is None: | 462 if args.minGeneDepth is None: |
| 385 args.minGeneDepth = 15 | 463 args.minGeneDepth = sample_data_general['mean_sample_coverage'] / 3 if \ |
| 386 | 464 sample_data_general['mean_sample_coverage'] / 3 > 15 else \ |
| 387 #runtime, ignore, ignore = typing.typing(data_by_gene, typing_rules, config['minimum_gene_coverage'], config['minimum_gene_identity'], args.minGeneDepth, args.outdir) | 465 15 |
| 388 else: | 466 else: |
| 389 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) | 467 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) |
| 390 time_taken = utils.runTime(start_time) | 468 _ = utils.runTime(start_time) |
| 391 sys.exit('ReMatCh run for pathotyping did not run successfully') | 469 sys.exit('ReMatCh run for pathotyping did not run successfully') |
| 392 else: | 470 else: |
| 393 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) | 471 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) |
| 394 time_taken = utils.runTime(start_time) | 472 _ = utils.runTime(start_time) |
| 395 sys.exit('Something did not run successfully') | 473 sys.exit('Something did not run successfully') |
| 396 | 474 |
| 397 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) | 475 clean_pathotyping_folder(args.outdir, original_reference_file, args.debug) |
| 398 | 476 |
| 399 print '\n' | 477 print('\n') |
| 400 time_taken = utils.runTime(start_time) | 478 _ = utils.runTime(start_time) |
| 401 del time_taken | |
| 402 | 479 |
| 403 | 480 |
| 404 if __name__ == "__main__": | 481 if __name__ == "__main__": |
| 405 main() | 482 main() |
