Mercurial > repos > iss > eurl_vtec_wgs_pt
comparison scripts/ReMatCh/rematch.py @ 0:c6bab5103a14 draft
"planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
| author | iss |
|---|---|
| date | Mon, 21 Mar 2022 15:23:09 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c6bab5103a14 |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 | |
| 3 # -*- coding: utf-8 -*- | |
| 4 | |
| 5 """ | |
| 6 rematch.py - Reads mapping against target sequences, checking mapping | |
| 7 and consensus sequences production | |
| 8 <https://github.com/B-UMMI/ReMatCh/> | |
| 9 | |
| 10 Copyright (C) 2019 Miguel Machado <mpmachado@medicina.ulisboa.pt> | |
| 11 | |
| 12 Last modified: August 08, 2019 | |
| 13 | |
| 14 This program is free software: you can redistribute it and/or modify | |
| 15 it under the terms of the GNU General Public License as published by | |
| 16 the Free Software Foundation, either version 3 of the License, or | |
| 17 (at your option) any later version. | |
| 18 | |
| 19 This program is distributed in the hope that it will be useful, | |
| 20 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 22 GNU General Public License for more details. | |
| 23 | |
| 24 You should have received a copy of the GNU General Public License | |
| 25 along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 26 """ | |
| 27 | |
| 28 import os | |
| 29 import sys | |
| 30 import time | |
| 31 import argparse | |
| 32 | |
| 33 try: | |
| 34 from __init__ import __version__ | |
| 35 | |
| 36 import modules.utils as utils | |
| 37 import modules.seqFromWebTaxon as seq_from_web_taxon | |
| 38 import modules.download as download | |
| 39 import modules.rematch_module as rematch_module | |
| 40 import modules.checkMLST as check_mlst | |
| 41 except ImportError: | |
| 42 from ReMatCh.__init__ import __version__ | |
| 43 | |
| 44 from ReMatCh.modules import utils as utils | |
| 45 from ReMatCh.modules import seqFromWebTaxon as seq_from_web_taxon | |
| 46 from ReMatCh.modules import download as download | |
| 47 from ReMatCh.modules import rematch_module as rematch_module | |
| 48 from ReMatCh.modules import checkMLST as check_mlst | |
| 49 | |
| 50 | |
| 51 def search_fastq_files(directory): | |
| 52 files_extensions = ['.fastq.gz', '.fq.gz'] | |
| 53 pair_end_files_separation = [['_R1_001.f', '_R2_001.f'], ['_1.f', '_2.f']] | |
| 54 | |
| 55 list_ids = {} | |
| 56 directories = [d for d in os.listdir(directory) if | |
| 57 not d.startswith('.') and os.path.isdir(os.path.join(directory, d, ''))] | |
| 58 for directory_found in directories: | |
| 59 if directory_found != 'pubmlst': | |
| 60 directory_path = os.path.join(directory, directory_found, '') | |
| 61 | |
| 62 fastq_found = [] | |
| 63 files = [f for f in os.listdir(directory_path) if | |
| 64 not f.startswith('.') and os.path.isfile(os.path.join(directory_path, f))] | |
| 65 for file_found in files: | |
| 66 if file_found.endswith(tuple(files_extensions)): | |
| 67 fastq_found.append(file_found) | |
| 68 | |
| 69 if len(fastq_found) == 1: | |
| 70 list_ids[directory_found] = [os.path.join(directory_path, f) for f in fastq_found] | |
| 71 elif len(fastq_found) >= 2: | |
| 72 file_pair = [] | |
| 73 | |
| 74 # Search pairs | |
| 75 for pe_separation in pair_end_files_separation: | |
| 76 for fastq in fastq_found: | |
| 77 if pe_separation[0] in fastq or pe_separation[1] in fastq: | |
| 78 file_pair.append(fastq) | |
| 79 | |
| 80 if len(file_pair) == 2: | |
| 81 break | |
| 82 else: | |
| 83 file_pair = [] | |
| 84 | |
| 85 # Search single | |
| 86 if len(file_pair) == 0: | |
| 87 for pe_separation in pair_end_files_separation: | |
| 88 for fastq in fastq_found: | |
| 89 if pe_separation[0] not in fastq or pe_separation[1] not in fastq: | |
| 90 file_pair.append(fastq) | |
| 91 | |
| 92 if len(file_pair) >= 1: | |
| 93 file_pair = file_pair[0] | |
| 94 | |
| 95 if len(file_pair) >= 1: | |
| 96 list_ids[directory_found] = [os.path.join(directory_path, f) for f in file_pair] | |
| 97 | |
| 98 return list_ids | |
| 99 | |
| 100 | |
| 101 def get_list_ids_from_file(file_list_ids): | |
| 102 list_ids = [] | |
| 103 | |
| 104 with open(file_list_ids, 'rtU') as lines: | |
| 105 for line in lines: | |
| 106 line = line.rstrip('\r\n') | |
| 107 if len(line) > 0: | |
| 108 list_ids.append(line) | |
| 109 | |
| 110 if len(list_ids) == 0: | |
| 111 sys.exit('No runIDs were found in ' + file_list_ids) | |
| 112 | |
| 113 return list_ids | |
| 114 | |
| 115 | |
| 116 def get_taxon_run_ids(taxon_name, outputfile): | |
| 117 seq_from_web_taxon.run_seq_from_web_taxon(taxon_name, outputfile, True, True, True, False) | |
| 118 | |
| 119 run_ids = [] | |
| 120 with open(outputfile, 'rtU') as reader: | |
| 121 for line in reader: | |
| 122 line = line.rstrip('\r\n') | |
| 123 if len(line) > 0: | |
| 124 if not line.startswith('#'): | |
| 125 line = line.split('\t') | |
| 126 run_ids.append(line[0]) | |
| 127 | |
| 128 return run_ids | |
| 129 | |
| 130 | |
| 131 def get_list_ids(workdir, file_list_ids, taxon_name): | |
| 132 searched_fastq_files = False | |
| 133 list_ids = [] | |
| 134 if file_list_ids is None and taxon_name is None: | |
| 135 list_ids = search_fastq_files(workdir) | |
| 136 searched_fastq_files = True | |
| 137 elif file_list_ids is not None: | |
| 138 list_ids = get_list_ids_from_file(os.path.abspath(file_list_ids)) | |
| 139 elif taxon_name is not None and file_list_ids is None: | |
| 140 list_ids = get_taxon_run_ids(taxon_name, os.path.join(workdir, 'IDs_list.seqFromWebTaxon.tab')) | |
| 141 | |
| 142 if len(list_ids) == 0: | |
| 143 sys.exit('No IDs were found') | |
| 144 return list_ids, searched_fastq_files | |
| 145 | |
| 146 | |
| 147 def format_gene_info(gene_specific_info, minimum_gene_coverage, minimum_gene_identity, reported_data_type, summary, | |
| 148 sample, genes_present): | |
| 149 info = None | |
| 150 if gene_specific_info['gene_coverage'] >= minimum_gene_coverage and \ | |
| 151 gene_specific_info['gene_identity'] >= minimum_gene_identity: | |
| 152 if summary and sample not in genes_present: | |
| 153 genes_present[sample] = {} | |
| 154 | |
| 155 if gene_specific_info['gene_number_positions_multiple_alleles'] == 0: | |
| 156 s = str(gene_specific_info[reported_data_type]) | |
| 157 info = str(s) | |
| 158 if summary: | |
| 159 genes_present[sample][gene_specific_info['header']] = str(s) | |
| 160 else: | |
| 161 s = 'multiAlleles_' + str(gene_specific_info[reported_data_type]) | |
| 162 info = str(s) | |
| 163 if summary: | |
| 164 genes_present[sample][gene_specific_info['header']] = str(s) | |
| 165 else: | |
| 166 info = 'absent_' + str(gene_specific_info[reported_data_type]) | |
| 167 | |
| 168 return info, genes_present | |
| 169 | |
| 170 | |
| 171 def write_data_by_gene(gene_list_reference, minimum_gene_coverage, sample, data_by_gene, outdir, time_str, run_times, | |
| 172 minimum_gene_identity, reported_data_type, summary, genes_present): | |
| 173 combined_report = \ | |
| 174 os.path.join(outdir, | |
| 175 'combined_report.data_by_gene.' + run_times + '.' + reported_data_type + '.' + time_str + '.tab') | |
| 176 | |
| 177 if reported_data_type == 'coverage_depth': | |
| 178 reported_data_type = 'gene_mean_read_coverage' | |
| 179 elif reported_data_type == 'sequence_coverage': | |
| 180 reported_data_type = 'gene_coverage' | |
| 181 | |
| 182 combined_report_exist = os.path.isfile(combined_report) | |
| 183 with open(combined_report, 'at') as writer: | |
| 184 seq_list = sorted(gene_list_reference.keys()) | |
| 185 if not combined_report_exist: | |
| 186 writer.write('#sample' + '\t' + '\t'.join([gene_list_reference[seq] for seq in seq_list]) + '\n') | |
| 187 | |
| 188 results = {} | |
| 189 headers = [] | |
| 190 for i in data_by_gene: | |
| 191 results[data_by_gene[i]['header']], genes_present = format_gene_info(data_by_gene[i], minimum_gene_coverage, | |
| 192 minimum_gene_identity, | |
| 193 reported_data_type, summary, sample, | |
| 194 genes_present) | |
| 195 headers.append(data_by_gene[i]['header']) | |
| 196 | |
| 197 if len(headers) != gene_list_reference: | |
| 198 for gene in gene_list_reference: | |
| 199 if gene not in headers: | |
| 200 results[gene] = 'NA' | |
| 201 | |
| 202 writer.write(sample + '\t' + '\t'.join([results[seq] for seq in seq_list]) + '\n') | |
| 203 | |
| 204 return genes_present | |
| 205 | |
| 206 | |
| 207 def write_sample_report(sample, outdir, time_str, file_size, run_successfully_fastq, run_successfully_rematch_first, | |
| 208 run_successfully_rematch_second, time_taken_fastq, time_taken_rematch_first, | |
| 209 time_taken_rematch_second, time_taken_sample, sequencing_information, sample_data_general_first, | |
| 210 sample_data_general_second, fastq_used): | |
| 211 sample_report = os.path.join(outdir, 'sample_report.' + time_str + '.tab') | |
| 212 report_exist = os.path.isfile(sample_report) | |
| 213 | |
| 214 header_general = ['sample', 'sample_run_successfully', 'sample_run_time', 'files_size', 'download_run_successfully', | |
| 215 'download_run_time', 'rematch_run_successfully_first', 'rematch_run_time_first', | |
| 216 'rematch_run_successfully_second', 'rematch_run_time_second'] | |
| 217 header_data_general = ['number_absent_genes', 'number_genes_multiple_alleles', 'mean_sample_coverage'] | |
| 218 header_sequencing = ['run_accession', 'instrument_platform', 'instrument_model', 'library_layout', 'library_source', | |
| 219 'extra_run_accession', 'nominal_length', 'read_count', 'base_count', 'date_download'] | |
| 220 | |
| 221 with open(sample_report, 'at') as writer: | |
| 222 if not report_exist: | |
| 223 writer.write('#' + '\t'.join(header_general) + '\t' + '_first\t'.join(header_data_general) + '_first\t' + | |
| 224 '_second\t'.join(header_data_general) + '_second\t' + '\t'.join(header_sequencing) + '\t' + | |
| 225 'fastq_used' + '\n') | |
| 226 | |
| 227 writer.write('\t'.join([sample, | |
| 228 str(all([run_successfully_fastq is not False, | |
| 229 run_successfully_rematch_first is not False, | |
| 230 run_successfully_rematch_second is not False])), | |
| 231 str(time_taken_sample), | |
| 232 str(file_size), | |
| 233 str(run_successfully_fastq), | |
| 234 str(time_taken_fastq), | |
| 235 str(run_successfully_rematch_first), | |
| 236 str(time_taken_rematch_first), | |
| 237 str(run_successfully_rematch_second), | |
| 238 str(time_taken_rematch_second)]) + | |
| 239 '\t' + '\t'.join([str(sample_data_general_first[i]) for i in header_data_general]) + | |
| 240 '\t' + '\t'.join([str(sample_data_general_second[i]) for i in header_data_general]) + | |
| 241 '\t' + '\t'.join([str(sequencing_information[i]) for i in header_sequencing]) + | |
| 242 '\t' + ','.join(fastq_used) + '\n') | |
| 243 | |
| 244 | |
| 245 def concatenate_extra_seq_2_consensus(consensus_sequence, reference_sequence, extra_seq_length, outdir): | |
| 246 reference_dict, ignore, ignore = rematch_module.get_sequence_information(reference_sequence, extra_seq_length) | |
| 247 consensus_dict, genes, ignore = rematch_module.get_sequence_information(consensus_sequence, 0) | |
| 248 number_consensus_with_sequences = 0 | |
| 249 for k, values_consensus in list(consensus_dict.items()): | |
| 250 for values_reference in list(reference_dict.values()): | |
| 251 if values_reference['header'] == values_consensus['header']: | |
| 252 if len(set(consensus_dict[k]['sequence'])) > 1: | |
| 253 number_consensus_with_sequences += 1 | |
| 254 if extra_seq_length <= len(values_reference['sequence']): | |
| 255 right_extra_seq = \ | |
| 256 '' if extra_seq_length == 0 else values_reference['sequence'][-extra_seq_length:] | |
| 257 consensus_dict[k]['sequence'] = \ | |
| 258 values_reference['sequence'][:extra_seq_length] + \ | |
| 259 consensus_dict[k]['sequence'] + \ | |
| 260 right_extra_seq | |
| 261 consensus_dict[k]['length'] += extra_seq_length + len(right_extra_seq) | |
| 262 | |
| 263 consensus_concatenated = os.path.join(outdir, 'consensus_concatenated_extraSeq.fasta') | |
| 264 with open(consensus_concatenated, 'wt') as writer: | |
| 265 for i in consensus_dict: | |
| 266 writer.write('>' + consensus_dict[i]['header'] + '\n') | |
| 267 fasta_sequence_lines = rematch_module.chunkstring(consensus_dict[i]['sequence'], 80) | |
| 268 for line in fasta_sequence_lines: | |
| 269 writer.write(line + '\n') | |
| 270 | |
| 271 return consensus_concatenated, genes, consensus_dict, number_consensus_with_sequences | |
| 272 | |
| 273 | |
| 274 def clean_headers_reference_file(reference_file, outdir, extra_seq): | |
| 275 problematic_characters = ["|", " ", ",", ".", "(", ")", "'", "/", ":"] | |
| 276 print('Checking if reference sequences contain ' + str(problematic_characters) + '\n') | |
| 277 # headers_changed = False | |
| 278 new_reference_file = str(reference_file) | |
| 279 sequences, genes, headers_changed = rematch_module.get_sequence_information(reference_file, extra_seq) | |
| 280 if headers_changed: | |
| 281 print('At least one of the those characters was found. Replacing those with _' + '\n') | |
| 282 new_reference_file = \ | |
| 283 os.path.join(outdir, os.path.splitext(os.path.basename(reference_file))[0] + '.headers_renamed.fasta') | |
| 284 with open(new_reference_file, 'wt') as writer: | |
| 285 for i in sequences: | |
| 286 writer.write('>' + sequences[i]['header'] + '\n') | |
| 287 fasta_sequence_lines = rematch_module.chunkstring(sequences[i]['sequence'], 80) | |
| 288 for line in fasta_sequence_lines: | |
| 289 writer.write(line + '\n') | |
| 290 return new_reference_file, genes, sequences | |
| 291 | |
| 292 | |
| 293 def write_mlst_report(sample, run_times, consensus_type, st, alleles_profile, loci_order, outdir, time_str): | |
| 294 mlst_report = os.path.join(outdir, 'mlst_report.' + time_str + '.tab') | |
| 295 mlst_report_exist = os.path.isfile(mlst_report) | |
| 296 with open(mlst_report, 'at') as writer: | |
| 297 if not mlst_report_exist: | |
| 298 writer.write('\t'.join(['#sample', 'ReMatCh_run', 'consensus_type', 'ST'] + loci_order) + '\n') | |
| 299 writer.write('\t'.join([sample, run_times, consensus_type, str(st)] + alleles_profile.split(',')) + '\n') | |
| 300 | |
| 301 | |
| 302 def run_get_st(sample, mlst_dicts, consensus_sequences, mlst_consensus, run_times, outdir, time_str): | |
| 303 if mlst_consensus == 'all': | |
| 304 for consensus_type in consensus_sequences: | |
| 305 print('Searching MLST for ' + consensus_type + ' consensus') | |
| 306 st, alleles_profile = check_mlst.get_st(mlst_dicts, consensus_sequences[consensus_type]) | |
| 307 write_mlst_report(sample, run_times, consensus_type, st, alleles_profile, mlst_dicts[2], outdir, time_str) | |
| 308 print('ST found: ' + str(st) + ' (' + alleles_profile + ')') | |
| 309 else: | |
| 310 st, alleles_profile = check_mlst.get_st(mlst_dicts, consensus_sequences[mlst_consensus]) | |
| 311 write_mlst_report(sample, run_times, mlst_consensus, st, alleles_profile, mlst_dicts[2], outdir, time_str) | |
| 312 print('ST found for ' + mlst_consensus + ' consensus: ' + str(st) + ' (' + alleles_profile + ')') | |
| 313 | |
| 314 | |
| 315 def write_summary_report(outdir, reported_data_type, time_str, gene_list_reference, genes_present): | |
| 316 with open(os.path.join(outdir, | |
| 317 'summary.{reported_data_type}.{time_str}.tab'.format(reported_data_type=reported_data_type, | |
| 318 time_str=time_str)), 'wt') as writer: | |
| 319 seq_list = [] | |
| 320 for info in list(genes_present.values()): | |
| 321 seq_list.extend(list(info.keys())) | |
| 322 seq_list = list(set(seq_list)) | |
| 323 writer.write('#sample' + '\t' + '\t'.join([gene_list_reference[seq] for seq in sorted(seq_list)]) + '\n') | |
| 324 for sample, info in list(genes_present.items()): | |
| 325 data = [] | |
| 326 for seq in sorted(seq_list): | |
| 327 if seq in info: | |
| 328 data.append(info[seq]) | |
| 329 else: | |
| 330 data.append('NF') | |
| 331 writer.write(sample + '\t' + '\t'.join(data) + '\n') | |
| 332 | |
| 333 | |
| 334 def run_rematch(args): | |
| 335 workdir = os.path.abspath(args.workdir) | |
| 336 if not os.path.isdir(workdir): | |
| 337 os.makedirs(workdir) | |
| 338 | |
| 339 aspera_key = os.path.abspath(args.asperaKey.name) if args.asperaKey is not None else None | |
| 340 | |
| 341 # Start logger | |
| 342 logfile, time_str = utils.start_logger(workdir) | |
| 343 | |
| 344 # Get general information | |
| 345 script_path = utils.general_information(logfile, __version__, workdir, time_str, args.doNotUseProvidedSoftware, | |
| 346 aspera_key, args.downloadCramBam, args.SRA, args.SRAopt) | |
| 347 | |
| 348 # Set list_ids | |
| 349 list_ids, searched_fastq_files = get_list_ids(workdir, args.listIDs.name if args.listIDs is not None else None, | |
| 350 args.taxon) | |
| 351 | |
| 352 mlst_sequences = None | |
| 353 mlst_dicts = None | |
| 354 if args.mlst is not None: | |
| 355 time_taken_pub_mlst, mlst_dicts, mlst_sequences = check_mlst.download_pub_mlst_xml(args.mlst, | |
| 356 args.mlstSchemaNumber, | |
| 357 workdir) | |
| 358 args.softClip_recodeRun = 'first' | |
| 359 | |
| 360 if args.reference is None: | |
| 361 if args.mlst is not None: | |
| 362 reference_file = check_mlst.check_existing_schema(args.mlst, args.mlstSchemaNumber, script_path) | |
| 363 args.extraSeq = 200 | |
| 364 if reference_file is None: | |
| 365 print('It was not found provided MLST scheme sequences for ' + args.mlst) | |
| 366 print('Trying to obtain reference MLST sequences from PubMLST') | |
| 367 if len(mlst_sequences) > 0: | |
| 368 reference_file = check_mlst.write_mlst_reference(args.mlst, mlst_sequences, workdir, time_str) | |
| 369 args.extraSeq = 0 | |
| 370 else: | |
| 371 sys.exit('It was not possible to download MLST sequences from PubMLST!') | |
| 372 else: | |
| 373 print('Using provided scheme as referece: ' + reference_file) | |
| 374 else: | |
| 375 sys.exit('Need to provide at least one of the following options: "--reference" and "--mlst"') | |
| 376 else: | |
| 377 reference_file = os.path.abspath(args.reference.name) | |
| 378 | |
| 379 # Run ReMatCh for each sample | |
| 380 print('\n' + 'STARTING ReMatCh' + '\n') | |
| 381 | |
| 382 # Clean sequences headers | |
| 383 reference_file, gene_list_reference, reference_dict = clean_headers_reference_file(reference_file, workdir, | |
| 384 args.extraSeq) | |
| 385 | |
| 386 if args.mlst is not None: | |
| 387 problem_genes = False | |
| 388 for header in mlst_sequences: | |
| 389 if header not in gene_list_reference: | |
| 390 print('MLST gene {header} not found between reference sequences'.format(header=header)) | |
| 391 problem_genes = True | |
| 392 if problem_genes: | |
| 393 sys.exit('Missing MLST genes from reference sequences (at least sequences names do not match)!') | |
| 394 | |
| 395 if len(gene_list_reference) == 0: | |
| 396 sys.exit('No sequences left') | |
| 397 | |
| 398 # To use in combined report | |
| 399 | |
| 400 number_samples_successfully = 0 | |
| 401 genes_present_coverage_depth = {} | |
| 402 genes_present_sequence_coverage = {} | |
| 403 for sample in list_ids: | |
| 404 sample_start_time = time.time() | |
| 405 print('\n\n' + 'Sample ID: ' + sample) | |
| 406 | |
| 407 # Create sample outdir | |
| 408 sample_outdir = os.path.join(workdir, sample, '') | |
| 409 if not os.path.isdir(sample_outdir): | |
| 410 os.mkdir(sample_outdir) | |
| 411 | |
| 412 run_successfully_fastq = None | |
| 413 time_taken_fastq = 0 | |
| 414 sequencing_information = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None, | |
| 415 'library_layout': None, 'library_source': None, 'extra_run_accession': None, | |
| 416 'nominal_length': None, 'read_count': None, 'base_count': None, 'date_download': None} | |
| 417 if not searched_fastq_files: | |
| 418 # Download Files | |
| 419 time_taken_fastq, run_successfully_fastq, fastq_files, sequencing_information = \ | |
| 420 download.run_download(sample, args.downloadLibrariesType, aspera_key, sample_outdir, | |
| 421 args.downloadCramBam, args.threads, args.downloadInstrumentPlatform, args.SRA, | |
| 422 args.SRAopt) | |
| 423 else: | |
| 424 fastq_files = list_ids[sample] | |
| 425 | |
| 426 file_size = None | |
| 427 | |
| 428 run_successfully_rematch_first = None | |
| 429 run_successfully_rematch_second = None | |
| 430 time_taken_rematch_first = 0 | |
| 431 time_taken_rematch_second = 0 | |
| 432 sample_data_general_first = None | |
| 433 sample_data_general_second = None | |
| 434 if run_successfully_fastq is not False: | |
| 435 file_size = sum(os.path.getsize(fastq) for fastq in fastq_files) | |
| 436 # Run ReMatCh | |
| 437 time_taken_rematch_first, run_successfully_rematch_first, data_by_gene, sample_data_general_first, \ | |
| 438 consensus_files, consensus_sequences = \ | |
| 439 rematch_module.run_rematch_module(sample, fastq_files, reference_file, args.threads, sample_outdir, | |
| 440 args.extraSeq, args.minCovPresence, args.minCovCall, | |
| 441 args.minFrequencyDominantAllele, args.minGeneCoverage, | |
| 442 args.debug, args.numMapLoc, args.minGeneIdentity, | |
| 443 'first', args.softClip_baseQuality, args.softClip_recodeRun, | |
| 444 reference_dict, args.softClip_cigarFlagRecode, | |
| 445 args.bowtieAlgo, args.bowtieOPT, | |
| 446 gene_list_reference, args.notWriteConsensus, clean_run=True) | |
| 447 if run_successfully_rematch_first: | |
| 448 if args.mlst is not None and (args.mlstRun == 'first' or args.mlstRun == 'all'): | |
| 449 run_get_st(sample, mlst_dicts, consensus_sequences, args.mlstConsensus, 'first', workdir, time_str) | |
| 450 genes_present_coverage_depth = write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, | |
| 451 data_by_gene, workdir, time_str, 'first_run', | |
| 452 args.minGeneIdentity, 'coverage_depth', args.summary, | |
| 453 genes_present_coverage_depth) | |
| 454 if args.reportSequenceCoverage: | |
| 455 genes_present_sequence_coverage = write_data_by_gene(gene_list_reference, args.minGeneCoverage, | |
| 456 sample, data_by_gene, workdir, time_str, | |
| 457 'first_run', args.minGeneIdentity, | |
| 458 'sequence_coverage', args.summary, | |
| 459 genes_present_sequence_coverage) | |
| 460 if args.doubleRun: | |
| 461 rematch_second_outdir = os.path.join(sample_outdir, 'rematch_second_run', '') | |
| 462 if not os.path.isdir(rematch_second_outdir): | |
| 463 os.mkdir(rematch_second_outdir) | |
| 464 consensus_concatenated_fasta, consensus_concatenated_gene_list, consensus_concatenated_dict, \ | |
| 465 number_consensus_with_sequences = \ | |
| 466 concatenate_extra_seq_2_consensus(consensus_files['noMatter'], reference_file, args.extraSeq, | |
| 467 rematch_second_outdir) | |
| 468 if len(consensus_concatenated_gene_list) > 0: | |
| 469 if args.mlst is None or \ | |
| 470 (args.mlst is not None and number_consensus_with_sequences == len(gene_list_reference)): | |
| 471 time_taken_rematch_second, run_successfully_rematch_second, data_by_gene, \ | |
| 472 sample_data_general_second, consensus_files, consensus_sequences = \ | |
| 473 rematch_module.run_rematch_module(sample, fastq_files, consensus_concatenated_fasta, | |
| 474 args.threads, rematch_second_outdir, args.extraSeq, | |
| 475 args.minCovPresence, args.minCovCall, | |
| 476 args.minFrequencyDominantAllele, args.minGeneCoverage, | |
| 477 args.debug, args.numMapLoc, | |
| 478 args.minGeneIdentity, 'second', | |
| 479 args.softClip_baseQuality, args.softClip_recodeRun, | |
| 480 consensus_concatenated_dict, | |
| 481 args.softClip_cigarFlagRecode, | |
| 482 args.bowtieAlgo, args.bowtieOPT, | |
| 483 gene_list_reference, args.notWriteConsensus, | |
| 484 clean_run=True) | |
| 485 if not args.debug: | |
| 486 os.remove(consensus_concatenated_fasta) | |
| 487 if run_successfully_rematch_second: | |
| 488 if args.mlst is not None and (args.mlstRun == 'second' or args.mlstRun == 'all'): | |
| 489 run_get_st(sample, mlst_dicts, consensus_sequences, args.mlstConsensus, 'second', | |
| 490 workdir, time_str) | |
| 491 _ = write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene, | |
| 492 workdir, time_str, 'second_run', args.minGeneIdentity, | |
| 493 'coverage_depth', False, {}) | |
| 494 if args.reportSequenceCoverage: | |
| 495 _ = write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, | |
| 496 data_by_gene, workdir, time_str, 'second_run', | |
| 497 args.minGeneIdentity, 'sequence_coverage', False, {}) | |
| 498 else: | |
| 499 print('Some sequences missing after ReMatCh module first run. Second run will not be' | |
| 500 ' performed') | |
| 501 if os.path.isfile(consensus_concatenated_fasta): | |
| 502 os.remove(consensus_concatenated_fasta) | |
| 503 if os.path.isdir(rematch_second_outdir): | |
| 504 utils.remove_directory(rematch_second_outdir) | |
| 505 else: | |
| 506 print('No sequences left after ReMatCh module first run. Second run will not be performed') | |
| 507 if os.path.isfile(consensus_concatenated_fasta): | |
| 508 os.remove(consensus_concatenated_fasta) | |
| 509 if os.path.isdir(rematch_second_outdir): | |
| 510 utils.remove_directory(rematch_second_outdir) | |
| 511 | |
| 512 if not searched_fastq_files and not args.keepDownloadedFastq and fastq_files is not None: | |
| 513 for fastq in fastq_files: | |
| 514 if os.path.isfile(fastq): | |
| 515 os.remove(fastq) | |
| 516 | |
| 517 time_taken = utils.run_time(sample_start_time) | |
| 518 | |
| 519 write_sample_report(sample, workdir, time_str, file_size, run_successfully_fastq, | |
| 520 run_successfully_rematch_first, run_successfully_rematch_second, time_taken_fastq, | |
| 521 time_taken_rematch_first, time_taken_rematch_second, time_taken, sequencing_information, | |
| 522 sample_data_general_first if run_successfully_rematch_first else | |
| 523 {'number_absent_genes': None, 'number_genes_multiple_alleles': None, | |
| 524 'mean_sample_coverage': None}, | |
| 525 sample_data_general_second if run_successfully_rematch_second else | |
| 526 {'number_absent_genes': None, 'number_genes_multiple_alleles': None, | |
| 527 'mean_sample_coverage': None}, | |
| 528 fastq_files if fastq_files is not None else '') | |
| 529 | |
| 530 if all([run_successfully_fastq is not False, | |
| 531 run_successfully_rematch_first is not False, | |
| 532 run_successfully_rematch_second is not False]): | |
| 533 number_samples_successfully += 1 | |
| 534 | |
| 535 if args.summary: | |
| 536 write_summary_report(workdir, 'coverage_depth', time_str, gene_list_reference, genes_present_coverage_depth) | |
| 537 if args.reportSequenceCoverage: | |
| 538 write_summary_report(workdir, 'sequence_coverage', time_str, gene_list_reference, | |
| 539 genes_present_sequence_coverage) | |
| 540 | |
| 541 return number_samples_successfully, len(list_ids) | |
| 542 | |
| 543 | |
| 544 def main(): | |
| 545 if sys.version_info[0] < 3: | |
| 546 sys.exit('Must be using Python 3. Try calling "python3 rematch.py"') | |
| 547 | |
| 548 parser = argparse.ArgumentParser(prog='rematch.py', | |
| 549 description='Reads mapping against target sequences, checking mapping and' | |
| 550 ' consensus sequences production', | |
| 551 formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |
| 552 parser.add_argument('--version', help='Version information', action='version', | |
| 553 version='{prog} v{version}'.format(prog=parser.prog, version=__version__)) | |
| 554 | |
| 555 parser_optional_general = parser.add_argument_group('General facultative options') | |
| 556 parser_optional_general.add_argument('-r', '--reference', type=argparse.FileType('r'), | |
| 557 metavar='/path/to/reference_sequence.fasta', | |
| 558 help='Fasta file containing reference sequences', required=False) | |
| 559 parser_optional_general.add_argument('-w', '--workdir', type=str, metavar='/path/to/workdir/directory/', | |
| 560 help='Path to the directory where ReMatCh will run and produce the outputs' | |
| 561 ' with reads (ended with fastq.gz/fq.gz and, in case of PE data, pair-end' | |
| 562 ' direction coded as _R1_001 / _R2_001 or _1 / _2) already' | |
| 563 ' present (organized in sample folders) or to be downloaded', | |
| 564 required=False, default='.') | |
| 565 parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use', | |
| 566 required=False, default=1) | |
| 567 parser_optional_general.add_argument('--mlst', type=str, metavar='"Streptococcus agalactiae"', | |
| 568 help='Species name (same as in PubMLST) to be used in MLST' | |
| 569 ' determination. ReMatCh will use Bowtie2 very-sensitive-local mapping' | |
| 570 ' parameters and will recode the soft clip CIGAR flags of the first run', | |
| 571 required=False) | |
| 572 parser_optional_general.add_argument('--doNotUseProvidedSoftware', action='store_true', | |
| 573 help='Tells ReMatCh to not use Bowtie2, Samtools and Bcftools that are' | |
| 574 ' provided with it') | |
| 575 | |
| 576 parser_optional_download_exclusive = parser.add_mutually_exclusive_group() | |
| 577 parser_optional_download_exclusive.add_argument('-l', '--listIDs', type=argparse.FileType('r'), | |
| 578 metavar='/path/to/list_IDs.txt', | |
| 579 help='Path to list containing the IDs to be' | |
| 580 ' downloaded (one per line)', required=False) | |
| 581 parser_optional_download_exclusive.add_argument('-t', '--taxon', type=str, metavar='"Streptococcus agalactiae"', | |
| 582 help='Taxon name for which ReMatCh will download fastq files', | |
| 583 required=False) | |
| 584 | |
| 585 parser_optional_rematch = parser.add_argument_group('ReMatCh module facultative options') | |
| 586 parser_optional_rematch.add_argument('--extraSeq', type=int, metavar='N', | |
| 587 help='Sequence length added to both ends of target sequences (usefull to' | |
| 588 ' improve reads mapping to the target one) that will be trimmed in' | |
| 589 ' ReMatCh outputs', required=False, default=0) | |
| 590 parser_optional_rematch.add_argument('--minCovPresence', type=int, metavar='N', | |
| 591 help='Reference position minimum coverage depth to consider the position to be' | |
| 592 ' present in the sample', required=False, default=5) | |
| 593 parser_optional_rematch.add_argument('--minCovCall', type=int, metavar='N', | |
| 594 help='Reference position minimum coverage depth to perform a base call. Lower' | |
| 595 ' coverage will be coded as N', required=False, default=10) | |
| 596 parser_optional_rematch.add_argument('--minFrequencyDominantAllele', type=float, metavar='0.6', | |
| 597 help='Minimum relative frequency of the dominant allele coverage depth (value' | |
| 598 ' between [0, 1]). Positions with lower values will be considered as' | |
| 599 ' having multiple alleles (and will be coded as N)', required=False, | |
| 600 default=0.6) | |
| 601 parser_optional_rematch.add_argument('--minGeneCoverage', type=int, metavar='N', | |
| 602 help='Minimum percentage of target reference gene sequence covered' | |
| 603 ' by --minCovPresence to consider a gene to be present (value' | |
| 604 ' between [0, 100])', required=False, default=70) | |
| 605 parser_optional_rematch.add_argument('--minGeneIdentity', type=int, metavar='N', | |
| 606 help='Minimum percentage of identity of reference gene sequence covered' | |
| 607 ' by --minCovCall to consider a gene to be present (value' | |
| 608 ' between [0, 100]). One INDEL will be considered as one difference', | |
| 609 required=False, default=80) | |
| 610 parser_optional_rematch.add_argument('--numMapLoc', type=int, metavar='N', help=argparse.SUPPRESS, required=False, | |
| 611 default=1) | |
| 612 # parser_optional_rematch.add_argument('--numMapLoc', type=int, metavar='N', help='Maximum number of locations to which a read can map (sometimes useful when mapping against similar sequences)', required=False, default=1) | |
| 613 parser_optional_rematch.add_argument('--doubleRun', action='store_true', | |
| 614 help='Tells ReMatCh to run a second time using as reference the noMatter' | |
| 615 ' consensus sequence produced in the first run. This will improve' | |
| 616 ' consensus sequence determination for sequences with high percentage of' | |
| 617 ' target reference gene sequence covered') | |
| 618 parser_optional_rematch.add_argument('--reportSequenceCoverage', action='store_true', | |
| 619 help='Produce an extra combined_report.data_by_gene with the sequence coverage' | |
| 620 ' instead of coverage depth') | |
| 621 parser_optional_rematch.add_argument('--summary', action='store_true', | |
| 622 help='Produce extra report files containing only sequences present in at least' | |
| 623 ' one sample (usefull when using a large number of reference' | |
| 624 ' sequences, and only for first run)') | |
| 625 parser_optional_rematch.add_argument('--notWriteConsensus', action='store_true', | |
| 626 help='Do not write consensus sequences') | |
| 627 parser_optional_rematch.add_argument('--bowtieAlgo', type=str, metavar='"--very-sensitive-local"', | |
| 628 help='Bowtie2 alignment mode. It can be an end-to-end alignment (unclipped' | |
| 629 ' alignment) or local alignment (soft clipped alignment). Also, can' | |
| 630 ' choose between fast or sensitive alignments. Please check Bowtie2' | |
| 631 ' manual for extra' | |
| 632 ' information: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml .' | |
| 633 ' This option should be provided between quotes and starting with' | |
| 634 ' an empty space (like --bowtieAlgo " --very-fast") or using equal' | |
| 635 ' sign (like --bowtieAlgo="--very-fast")', | |
| 636 required=False, default='--very-sensitive-local') | |
| 637 parser_optional_rematch.add_argument('--bowtieOPT', type=str, metavar='"--no-mixed"', | |
| 638 help='Extra Bowtie2 options. This option should be provided between quotes and' | |
| 639 ' starting with an empty space (like --bowtieOPT " --no-mixed") or using' | |
| 640 ' equal sign (like --bowtieOPT="--no-mixed")', | |
| 641 required=False) | |
| 642 parser_optional_rematch.add_argument('--debug', action='store_true', | |
| 643 help='DeBug Mode: do not remove temporary files') | |
| 644 | |
| 645 parser_optional_mlst = parser.add_argument_group('MLST facultative options') | |
| 646 parser_optional_rematch.add_argument('--mlstReference', action='store_true', | |
| 647 help='If the curated scheme for MLST alleles is available, tells ReMatCh to' | |
| 648 ' use these as reference (force Bowtie2 to run with very-sensitive-local' | |
| 649 ' parameters, and sets --extraSeq to 200), otherwise ReMatCh uses the' | |
| 650 ' first alleles of each MLST gene fragment in PubMLST as reference' | |
| 651 ' sequences (force Bowtie2 to run with very-sensitive-local parameters,' | |
| 652 ' and sets --extraSeq to 0)') | |
| 653 parser_optional_mlst.add_argument('--mlstSchemaNumber', type=int, metavar='N', | |
| 654 help='Number of the species PubMLST schema to be used in case of multiple schemes' | |
| 655 ' available (by default will use the first schema)', required=False) | |
| 656 parser_optional_mlst.add_argument('--mlstConsensus', choices=['noMatter', 'correct', 'alignment', 'all'], type=str, | |
| 657 metavar='noMatter', | |
| 658 help='Consensus sequence to be used in MLST' | |
| 659 ' determination (available options: %(choices)s)', required=False, | |
| 660 default='noMatter') | |
| 661 parser_optional_mlst.add_argument('--mlstRun', choices=['first', 'second', 'all'], type=str, metavar='first', | |
| 662 help='ReMatCh run outputs to be used in MLST determination (available' | |
| 663 ' options: %(choices)s)', required=False, default='all') | |
| 664 | |
| 665 parser_optional_download = parser.add_argument_group('Download facultative options') | |
| 666 parser_optional_download.add_argument('-a', '--asperaKey', type=argparse.FileType('r'), | |
| 667 metavar='/path/to/asperaweb_id_dsa.openssh', | |
| 668 help='Tells ReMatCh to download fastq files from ENA using Aspera' | |
| 669 ' Connect. With this option, the path to Private-key file' | |
| 670 ' asperaweb_id_dsa.openssh must be provided (normaly found in' | |
| 671 ' ~/.aspera/connect/etc/asperaweb_id_dsa.openssh).', required=False) | |
| 672 parser_optional_download.add_argument('-k', '--keepDownloadedFastq', action='store_true', | |
| 673 help='Tells ReMatCh to keep the fastq files downloaded') | |
| 674 parser_optional_download.add_argument('--downloadLibrariesType', type=str, metavar='PAIRED', | |
| 675 help='Tells ReMatCh to download files with specific library' | |
| 676 ' layout (available options: %(choices)s)', | |
| 677 choices=['PAIRED', 'SINGLE', 'BOTH'], required=False, default='BOTH') | |
| 678 parser_optional_download.add_argument('--downloadInstrumentPlatform', type=str, metavar='ILLUMINA', | |
| 679 help='Tells ReMatCh to download files with specific library layout (available' | |
| 680 ' options: %(choices)s)', choices=['ILLUMINA', 'ALL'], required=False, | |
| 681 default='ILLUMINA') | |
| 682 parser_optional_download.add_argument('--downloadCramBam', action='store_true', | |
| 683 help='Tells ReMatCh to also download cram/bam files and convert them to fastq' | |
| 684 ' files') | |
| 685 | |
| 686 parser_optional_sra = parser.add_mutually_exclusive_group() | |
| 687 parser_optional_sra.add_argument('--SRA', action='store_true', | |
| 688 help='Tells getSeqENA.py to download reads in fastq format only from NCBI SRA' | |
| 689 ' database (not recommended)') | |
| 690 parser_optional_sra.add_argument('--SRAopt', action='store_true', | |
| 691 help='Tells getSeqENA.py to download reads from NCBI SRA if the download from ENA' | |
| 692 ' fails') | |
| 693 | |
| 694 parser_optional_soft_clip = parser.add_argument_group('Soft clip facultative options') | |
| 695 parser_optional_soft_clip.add_argument('--softClip_baseQuality', type=int, metavar='N', | |
| 696 help='Base quality phred score in reads soft clipped regions', | |
| 697 required=False, | |
| 698 default=7) | |
| 699 parser_optional_soft_clip.add_argument('--softClip_recodeRun', type=str, metavar='first', | |
| 700 help='ReMatCh run to recode soft clipped regions (available' | |
| 701 ' options: %(choices)s)', choices=['first', 'second', 'both', 'none'], | |
| 702 required=False, default='none') | |
| 703 parser_optional_soft_clip.add_argument('--softClip_cigarFlagRecode', type=str, metavar='M', | |
| 704 help='CIGAR flag to recode CIGAR soft clip (available options: %(choices)s)', | |
| 705 choices=['M', 'I', 'X'], required=False, default='X') | |
| 706 | |
| 707 args = parser.parse_args() | |
| 708 | |
| 709 msg = [] | |
| 710 if args.reference is None and not args.mlstReference: | |
| 711 msg.append('At least --reference or --mlstReference should be provided') | |
| 712 elif args.reference is not None and args.mlstReference: | |
| 713 msg.append('Only --reference or --mlstReference should be provided') | |
| 714 else: | |
| 715 if args.mlstReference: | |
| 716 if args.mlst is None: | |
| 717 msg.append('Please provide species name using --mlst') | |
| 718 if args.minFrequencyDominantAllele < 0 or args.minFrequencyDominantAllele > 1: | |
| 719 msg.append('--minFrequencyDominantAllele should be a value between [0, 1]') | |
| 720 if args.minGeneCoverage < 0 or args.minGeneCoverage > 100: | |
| 721 msg.append('--minGeneCoverage should be a value between [0, 100]') | |
| 722 if args.minGeneIdentity < 0 or args.minGeneIdentity > 100: | |
| 723 msg.append('--minGeneIdentity should be a value between [0, 100]') | |
| 724 if args.notWriteConsensus and args.doubleRun: | |
| 725 msg.append('--notWriteConsensus and --doubleRun cannot be used together.' | |
| 726 ' Maybe you only want to use --doubleRun') | |
| 727 | |
| 728 if len(msg) > 0: | |
| 729 argparse.ArgumentParser.error('\n'.join(msg)) | |
| 730 | |
| 731 start_time = time.time() | |
| 732 | |
| 733 number_samples_successfully, samples_total_number = run_rematch(args) | |
| 734 | |
| 735 print('\n' + 'END ReMatCh') | |
| 736 print('\n' + | |
| 737 str(number_samples_successfully) + ' samples out of ' + str(samples_total_number) + ' run successfully') | |
| 738 time_taken = utils.run_time(start_time) | |
| 739 del time_taken | |
| 740 | |
| 741 if number_samples_successfully == 0: | |
| 742 sys.exit('No samples run successfully!') | |
| 743 | |
| 744 | |
| 745 if __name__ == "__main__": | |
| 746 main() |
