Mercurial > repos > cstrittmatter > test_eurl_vtec_wgs_pt
comparison scripts/ReMatCh/rematch.py @ 0:965517909457 draft
planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author | cstrittmatter |
---|---|
date | Wed, 22 Jan 2020 08:41:44 -0500 |
parents | |
children | 0cbed1c0a762 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:965517909457 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # -*- coding: utf-8 -*- | |
4 | |
5 """ | |
6 rematch.py - Reads mapping against target sequences, checking mapping | |
7 and consensus sequences production | |
8 <https://github.com/B-UMMI/ReMatCh/> | |
9 | |
10 Copyright (C) 2017 Miguel Machado <mpmachado@medicina.ulisboa.pt> | |
11 | |
12 Last modified: April 12, 2017 | |
13 | |
14 This program is free software: you can redistribute it and/or modify | |
15 it under the terms of the GNU General Public License as published by | |
16 the Free Software Foundation, either version 3 of the License, or | |
17 (at your option) any later version. | |
18 | |
19 This program is distributed in the hope that it will be useful, | |
20 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
22 GNU General Public License for more details. | |
23 | |
24 You should have received a copy of the GNU General Public License | |
25 along with this program. If not, see <http://www.gnu.org/licenses/>. | |
26 """ | |
27 | |
28 import os | |
29 import sys | |
30 import time | |
31 import argparse | |
32 import modules.utils as utils | |
33 import modules.seqFromWebTaxon as seqFromWebTaxon | |
34 import modules.download as download | |
35 import modules.rematch_module as rematch_module | |
36 import modules.checkMLST as checkMLST | |
37 | |
38 | |
39 version = '3.2' | |
40 | |
41 | |
42 def searchFastqFiles(directory): | |
43 filesExtensions = ['.fastq.gz', '.fq.gz'] | |
44 pairEnd_filesSeparation = [['_R1_001.f', '_R2_001.f'], ['_1.f', '_2.f']] | |
45 | |
46 listIDs = {} | |
47 directories = [d for d in os.listdir(directory) if not d.startswith('.') and os.path.isdir(os.path.join(directory, d, ''))] | |
48 for directory_found in directories: | |
49 if directory_found != 'pubmlst': | |
50 directory_path = os.path.join(directory, directory_found, '') | |
51 | |
52 fastqFound = [] | |
53 files = [f for f in os.listdir(directory_path) if not f.startswith('.') and os.path.isfile(os.path.join(directory_path, f))] | |
54 for file_found in files: | |
55 if file_found.endswith(tuple(filesExtensions)): | |
56 fastqFound.append(file_found) | |
57 | |
58 if len(fastqFound) == 1: | |
59 listIDs[directory_found] = [os.path.join(directory_path, f) for f in fastqFound] | |
60 elif len(fastqFound) >= 2: | |
61 file_pair = [] | |
62 | |
63 # Search pairs | |
64 for PE_separation in pairEnd_filesSeparation: | |
65 for fastq in fastqFound: | |
66 if PE_separation[0] in fastq or PE_separation[1] in fastq: | |
67 file_pair.append(fastq) | |
68 | |
69 if len(file_pair) == 2: | |
70 break | |
71 else: | |
72 file_pair = [] | |
73 | |
74 # Search single | |
75 if len(file_pair) == 0: | |
76 for PE_separation in pairEnd_filesSeparation: | |
77 for fastq in fastqFound: | |
78 if PE_separation[0] not in fastq or PE_separation[1] not in fastq: | |
79 file_pair.append(fastq) | |
80 | |
81 if len(file_pair) >= 1: | |
82 file_pair = file_pair[0] | |
83 | |
84 if len(file_pair) >= 1: | |
85 listIDs[directory_found] = [os.path.join(directory_path, f) for f in file_pair] | |
86 | |
87 return listIDs | |
88 | |
89 | |
90 def getListIDs_fromFile(fileListIDs): | |
91 list_ids = [] | |
92 | |
93 with open(fileListIDs, 'rtU') as lines: | |
94 for line in lines: | |
95 line = line.splitlines()[0] | |
96 if len(line) > 0: | |
97 list_ids.append(line) | |
98 | |
99 if len(list_ids) == 0: | |
100 sys.exit('No runIDs were found in ' + fileListIDs) | |
101 | |
102 return list_ids | |
103 | |
104 | |
105 def getTaxonRunIDs(taxon_name, outputfile): | |
106 seqFromWebTaxon.runSeqFromWebTaxon(taxon_name, outputfile, True, True, True, False) | |
107 | |
108 runIDs = [] | |
109 with open(outputfile, 'rtU') as reader: | |
110 for line in reader: | |
111 line = line.splitlines()[0] | |
112 if len(line) > 0: | |
113 if not line.startswith('#'): | |
114 line = line.split('\t') | |
115 runIDs.append(line[0]) | |
116 | |
117 return runIDs | |
118 | |
119 | |
120 def getListIDs(workdir, fileListIDs, taxon_name): | |
121 searched_fastq_files = False | |
122 listIDs = [] | |
123 if fileListIDs is None and taxon_name is None: | |
124 listIDs = searchFastqFiles(workdir) | |
125 searched_fastq_files = True | |
126 elif fileListIDs is not None: | |
127 listIDs = getListIDs_fromFile(os.path.abspath(fileListIDs)) | |
128 elif taxon_name is not None and fileListIDs is None: | |
129 listIDs = getTaxonRunIDs(taxon_name, os.path.join(workdir, 'IDs_list.seqFromWebTaxon.tab')) | |
130 | |
131 if len(listIDs) == 0: | |
132 sys.exit('No IDs were found') | |
133 return listIDs, searched_fastq_files | |
134 | |
135 | |
136 def format_gene_info(gene_specific_info, minimum_gene_coverage, minimum_gene_identity, reported_data_type): | |
137 info = None | |
138 if gene_specific_info['gene_coverage'] >= minimum_gene_coverage and gene_specific_info['gene_identity'] >= minimum_gene_identity: | |
139 if gene_specific_info['gene_number_positions_multiple_alleles'] == 0: | |
140 info = str(gene_specific_info[reported_data_type]) | |
141 else: | |
142 info = 'multiAlleles_' + str(gene_specific_info[reported_data_type]) | |
143 else: | |
144 info = 'absent_' + str(gene_specific_info[reported_data_type]) | |
145 | |
146 return info | |
147 | |
148 | |
149 def write_data_by_gene(gene_list_reference, minimum_gene_coverage, sample, data_by_gene, outdir, time_str, run_times, minimum_gene_identity, reported_data_type): | |
150 combined_report = os.path.join(outdir, 'combined_report.data_by_gene.' + run_times + '.' + reported_data_type + '.' + time_str + '.tab') | |
151 | |
152 if reported_data_type == 'coverage_depth': | |
153 reported_data_type = 'gene_mean_read_coverage' | |
154 elif reported_data_type == 'sequence_coverage': | |
155 reported_data_type = 'gene_coverage' | |
156 | |
157 combined_report_exist = os.path.isfile(combined_report) | |
158 with open(combined_report, 'at') as writer: | |
159 seq_list = gene_list_reference.keys() | |
160 if not combined_report_exist: | |
161 writer.write('#sample' + '\t' + '\t'.join([gene_list_reference[seq] for seq in seq_list]) + '\n') | |
162 | |
163 results = {} | |
164 headers = [] | |
165 for i in data_by_gene: | |
166 results[data_by_gene[i]['header']] = format_gene_info(data_by_gene[i], minimum_gene_coverage, minimum_gene_identity, reported_data_type) | |
167 headers.append(data_by_gene[i]['header']) | |
168 | |
169 if len(headers) != gene_list_reference: | |
170 for gene in gene_list_reference: | |
171 if gene not in headers: | |
172 results[gene] = 'NA' | |
173 | |
174 writer.write(sample + '\t' + '\t'.join([results[seq] for seq in seq_list]) + '\n') | |
175 | |
176 | |
177 def write_sample_report(sample, outdir, time_str, fileSize, run_successfully_fastq, run_successfully_rematch_first, run_successfully_rematch_second, time_taken_fastq, time_taken_rematch_first, time_taken_rematch_second, time_taken_sample, sequencingInformation, sample_data_general_first, sample_data_general_second, fastq_used): | |
178 sample_report = os.path.join(outdir, 'sample_report.' + time_str + '.tab') | |
179 report_exist = os.path.isfile(sample_report) | |
180 | |
181 header_general = ['sample', 'sample_run_successfully', 'sample_run_time', 'files_size', 'download_run_successfully', 'download_run_time', 'rematch_run_successfully_first', 'rematch_run_time_first', 'rematch_run_successfully_second', 'rematch_run_time_second'] | |
182 header_data_general = ['number_absent_genes', 'number_genes_multiple_alleles', 'mean_sample_coverage'] | |
183 header_sequencing = ['run_accession', 'instrument_platform', 'instrument_model', 'library_layout', 'library_source', 'extra_run_accession', 'nominal_length', 'read_count', 'base_count', 'date_download'] | |
184 | |
185 with open(sample_report, 'at') as writer: | |
186 if not report_exist: | |
187 writer.write('#' + '\t'.join(header_general) + '\t' + '_first\t'.join(header_data_general) + '_first\t' + '_second\t'.join(header_data_general) + '_second\t' + '\t'.join(header_sequencing) + '\t' + 'fastq_used' + '\n') | |
188 | |
189 writer.write('\t'.join([sample, str(all([run_successfully_fastq is not False, run_successfully_rematch_first is not False, run_successfully_rematch_second is not False])), str(time_taken_sample), str(fileSize), str(run_successfully_fastq), str(time_taken_fastq), str(run_successfully_rematch_first), str(time_taken_rematch_first), str(run_successfully_rematch_second), str(time_taken_rematch_second)]) + '\t' + '\t'.join([str(sample_data_general_first[i]) for i in header_data_general]) + '\t' + '\t'.join([str(sample_data_general_second[i]) for i in header_data_general]) + '\t' + '\t'.join([str(sequencingInformation[i]) for i in header_sequencing]) + '\t' + ','.join(fastq_used) + '\n') | |
190 | |
191 | |
192 def concatenate_extraSeq_2_consensus(consensus_sequence, reference_sequence, extraSeq_length, outdir): | |
193 reference_dict, ignore, ignore = rematch_module.get_sequence_information(reference_sequence, extraSeq_length) | |
194 consensus_dict, genes, ignore = rematch_module.get_sequence_information(consensus_sequence, 0) | |
195 for k, values_consensus in consensus_dict.items(): | |
196 for values_reference in reference_dict.values(): | |
197 if values_reference['header'] == values_consensus['header']: | |
198 if extraSeq_length <= len(values_reference['sequence']): | |
199 right_extra_seq = '' if extraSeq_length == 0 else values_reference['sequence'][-extraSeq_length:] | |
200 consensus_dict[k]['sequence'] = values_reference['sequence'][:extraSeq_length] + consensus_dict[k]['sequence'] + right_extra_seq | |
201 consensus_dict[k]['length'] += extraSeq_length + len(right_extra_seq) | |
202 | |
203 consensus_concatenated = os.path.join(outdir, 'consensus_concatenated_extraSeq.fasta') | |
204 with open(consensus_concatenated, 'wt') as writer: | |
205 for i in consensus_dict: | |
206 writer.write('>' + consensus_dict[i]['header'] + '\n') | |
207 fasta_sequence_lines = rematch_module.chunkstring(consensus_dict[i]['sequence'], 80) | |
208 for line in fasta_sequence_lines: | |
209 writer.write(line + '\n') | |
210 | |
211 return consensus_concatenated, genes, consensus_dict | |
212 | |
213 | |
214 def clean_headers_reference_file(reference_file, outdir, extraSeq): | |
215 problematic_characters = ["|", " ", ",", ".", "(", ")", "'", "/", ":"] | |
216 print 'Checking if reference sequences contain ' + str(problematic_characters) + '\n' | |
217 headers_changed = False | |
218 new_reference_file = str(reference_file) | |
219 sequences, genes, headers_changed = rematch_module.get_sequence_information(reference_file, extraSeq) | |
220 if headers_changed: | |
221 print 'At least one of the those characters was found. Replacing those with _' + '\n' | |
222 new_reference_file = os.path.join(outdir, os.path.splitext(os.path.basename(reference_file))[0] + '.headers_renamed.fasta') | |
223 with open(new_reference_file, 'wt') as writer: | |
224 for i in sequences: | |
225 writer.write('>' + sequences[i]['header'] + '\n') | |
226 fasta_sequence_lines = rematch_module.chunkstring(sequences[i]['sequence'], 80) | |
227 for line in fasta_sequence_lines: | |
228 writer.write(line + '\n') | |
229 return new_reference_file, genes, sequences | |
230 | |
231 | |
232 def write_mlst_report(sample, run_times, consensus_type, st, alleles_profile, lociOrder, outdir, time_str): | |
233 mlst_report = os.path.join(outdir, 'mlst_report.' + time_str + '.tab') | |
234 mlst_report_exist = os.path.isfile(mlst_report) | |
235 with open(mlst_report, 'at') as writer: | |
236 if not mlst_report_exist: | |
237 writer.write('\t'.join(['#sample', 'ReMatCh_run', 'consensus_type', 'ST'] + lociOrder) + '\n') | |
238 writer.write('\t'.join([sample, run_times, consensus_type, str(st)] + alleles_profile.split(',')) + '\n') | |
239 | |
240 | |
241 def run_get_st(sample, mlst_dicts, consensus_sequences, mlstConsensus, run_times, outdir, time_str): | |
242 if mlstConsensus == 'all': | |
243 for consensus_type in consensus_sequences: | |
244 print 'Searching MLST for ' + consensus_type + ' consensus' | |
245 st, alleles_profile = checkMLST.getST(mlst_dicts, consensus_sequences[consensus_type]) | |
246 write_mlst_report(sample, run_times, consensus_type, st, alleles_profile, mlst_dicts[2], outdir, time_str) | |
247 print 'ST found: ' + str(st) + ' (' + alleles_profile + ')' | |
248 else: | |
249 st, alleles_profile = checkMLST.getST(mlst_dicts, consensus_sequences[mlstConsensus]) | |
250 write_mlst_report(sample, run_times, mlstConsensus, st, alleles_profile, mlst_dicts[2], outdir, time_str) | |
251 print 'ST found for ' + mlstConsensus + ' consensus: ' + str(st) + ' (' + alleles_profile + ')' | |
252 | |
253 | |
254 def runRematch(args): | |
255 workdir = os.path.abspath(args.workdir) | |
256 if not os.path.isdir(workdir): | |
257 os.makedirs(workdir) | |
258 | |
259 asperaKey = os.path.abspath(args.asperaKey.name) if args.asperaKey is not None else None | |
260 | |
261 # Start logger | |
262 logfile, time_str = utils.start_logger(workdir) | |
263 | |
264 # Get general information | |
265 script_path = utils.general_information(logfile, version, workdir, time_str, args.doNotUseProvidedSoftware, asperaKey, args.downloadCramBam) | |
266 | |
267 # Set listIDs | |
268 listIDs, searched_fastq_files = getListIDs(workdir, args.listIDs.name if args.listIDs is not None else None, args.taxon) | |
269 | |
270 if args.mlst is not None: | |
271 time_taken_PubMLST, mlst_dicts, mlst_sequences = checkMLST.downloadPubMLSTxml(args.mlst, args.mlstSchemaNumber, workdir) | |
272 args.softClip_recodeRun = 'first' | |
273 args.conservedSeq = False | |
274 | |
275 if args.reference is None: | |
276 reference_file = checkMLST.check_existing_schema(args.mlst, args.mlstSchemaNumber, script_path) | |
277 args.extraSeq = 200 | |
278 if reference_file is None: | |
279 print 'It was not found provided MLST scheme sequences for ' + args.mlst | |
280 print 'Trying to obtain reference MLST sequences from PubMLST' | |
281 if len(mlst_sequences) > 0: | |
282 reference_file = checkMLST.write_mlst_reference(args.mlst, mlst_sequences, workdir, time_str) | |
283 args.extraSeq = 0 | |
284 else: | |
285 sys.exit('It was not possible to download MLST sequences from PubMLST!') | |
286 else: | |
287 print 'Using provided scheme as referece: ' + reference_file | |
288 else: | |
289 reference_file = os.path.abspath(args.reference.name) | |
290 | |
291 # Run ReMatCh for each sample | |
292 print '\n' + 'STARTING ReMatCh' + '\n' | |
293 | |
294 # Clean sequences headers | |
295 reference_file, gene_list_reference, reference_dict = clean_headers_reference_file(reference_file, workdir, args.extraSeq) | |
296 | |
297 if args.mlst is not None: | |
298 problem_genes = False | |
299 for header in mlst_sequences: | |
300 if header not in gene_list_reference: | |
301 print 'MLST gene {header} not found between reference sequences'.format(header=header) | |
302 problem_genes = True | |
303 if problem_genes: | |
304 sys.exit('Missing MLST genes from reference sequences (at least sequences names do not match)!') | |
305 | |
306 if len(gene_list_reference) == 0: | |
307 sys.exit('No sequences left') | |
308 | |
309 # To use in combined report | |
310 | |
311 number_samples_successfully = 0 | |
312 for sample in listIDs: | |
313 sample_start_time = time.time() | |
314 print '\n\n' + 'Sample ID: ' + sample | |
315 | |
316 # Create sample outdir | |
317 sample_outdir = os.path.join(workdir, sample, '') | |
318 if not os.path.isdir(sample_outdir): | |
319 os.mkdir(sample_outdir) | |
320 | |
321 run_successfully_fastq = None | |
322 time_taken_fastq = 0 | |
323 sequencingInformation = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None, 'library_layout': None, 'library_source': None, 'extra_run_accession': None, 'nominal_length': None, 'read_count': None, 'base_count': None, 'date_download': None} | |
324 if not searched_fastq_files: | |
325 # Download Files | |
326 time_taken_fastq, run_successfully_fastq, fastq_files, sequencingInformation = download.runDownload(sample, args.downloadLibrariesType, asperaKey, sample_outdir, args.downloadCramBam, args.threads, args.downloadInstrumentPlatform) | |
327 else: | |
328 fastq_files = listIDs[sample] | |
329 | |
330 fileSize = None | |
331 | |
332 run_successfully_rematch_first = None | |
333 run_successfully_rematch_second = None | |
334 time_taken_rematch_first = 0 | |
335 time_taken_rematch_second = 0 | |
336 if run_successfully_fastq is not False: | |
337 fileSize = sum(os.path.getsize(fastq) for fastq in fastq_files) | |
338 # Run ReMatCh | |
339 time_taken_rematch_first, run_successfully_rematch_first, data_by_gene, sample_data_general_first, consensus_files, consensus_sequences = rematch_module.runRematchModule(sample, fastq_files, reference_file, args.threads, sample_outdir, args.extraSeq, args.minCovPresence, args.minCovCall, args.minFrequencyDominantAllele, args.minGeneCoverage, args.conservedSeq, args.debug, args.numMapLoc, args.minGeneIdentity, 'first', args.softClip_baseQuality, args.softClip_recodeRun, reference_dict, args.softClip_cigarFlagRecode, args.bowtieOPT, gene_list_reference, args.notWriteConsensus) | |
340 if run_successfully_rematch_first: | |
341 if args.mlst is not None and (args.mlstRun == 'first' or args.mlstRun == 'all'): | |
342 run_get_st(sample, mlst_dicts, consensus_sequences, args.mlstConsensus, 'first', workdir, time_str) | |
343 write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene, workdir, time_str, 'first_run', args.minGeneIdentity, 'coverage_depth') | |
344 if args.reportSequenceCoverage: | |
345 write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene, workdir, time_str, 'first_run', args.minGeneIdentity, 'sequence_coverage') | |
346 if args.doubleRun: | |
347 rematch_second_outdir = os.path.join(sample_outdir, 'rematch_second_run', '') | |
348 if not os.path.isdir(rematch_second_outdir): | |
349 os.mkdir(rematch_second_outdir) | |
350 consensus_concatenated_fasta, consensus_concatenated_gene_list, consensus_concatenated_dict = concatenate_extraSeq_2_consensus(consensus_files['noMatter'], reference_file, args.extraSeq, rematch_second_outdir) | |
351 if len(consensus_concatenated_gene_list) > 0: | |
352 time_taken_rematch_second, run_successfully_rematch_second, data_by_gene, sample_data_general_second, consensus_files, consensus_sequences = rematch_module.runRematchModule(sample, fastq_files, consensus_concatenated_fasta, args.threads, rematch_second_outdir, args.extraSeq, args.minCovPresence, args.minCovCall, args.minFrequencyDominantAllele, args.minGeneCoverage, args.conservedSeq, args.debug, args.numMapLoc, args.minGeneIdentity, 'second', args.softClip_baseQuality, args.softClip_recodeRun, consensus_concatenated_dict, args.softClip_cigarFlagRecode, args.bowtieOPT, gene_list_reference, args.notWriteConsensus) | |
353 if not args.debug: | |
354 os.remove(consensus_concatenated_fasta) | |
355 if run_successfully_rematch_second: | |
356 if args.mlst is not None and (args.mlstRun == 'second' or args.mlstRun == 'all'): | |
357 run_get_st(sample, mlst_dicts, consensus_sequences, args.mlstConsensus, 'second', workdir, time_str) | |
358 write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene, workdir, time_str, 'second_run', args.minGeneIdentity, 'coverage_depth') | |
359 if args.reportSequenceCoverage: | |
360 write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene, workdir, time_str, 'second_run', args.minGeneIdentity, 'sequence_coverage') | |
361 else: | |
362 print 'No sequences left after ReMatCh module first run. Second run will not be performed' | |
363 if os.path.isfile(consensus_concatenated_fasta): | |
364 os.remove(consensus_concatenated_fasta) | |
365 if os.path.isdir(rematch_second_outdir): | |
366 utils.removeDirectory(rematch_second_outdir) | |
367 | |
368 if not searched_fastq_files and not args.keepDownloadedFastq and fastq_files is not None: | |
369 for fastq in fastq_files: | |
370 if os.path.isfile(fastq): | |
371 os.remove(fastq) | |
372 | |
373 time_taken = utils.runTime(sample_start_time) | |
374 | |
375 write_sample_report(sample, workdir, time_str, fileSize, run_successfully_fastq, run_successfully_rematch_first, run_successfully_rematch_second, time_taken_fastq, time_taken_rematch_first, time_taken_rematch_second, time_taken, sequencingInformation, sample_data_general_first if run_successfully_rematch_first else {'number_absent_genes': None, 'number_genes_multiple_alleles': None, 'mean_sample_coverage': None}, sample_data_general_second if run_successfully_rematch_second else {'number_absent_genes': None, 'number_genes_multiple_alleles': None, 'mean_sample_coverage': None}, fastq_files if fastq_files is not None else '') | |
376 | |
377 if all([run_successfully_fastq is not False, run_successfully_rematch_first is not False, run_successfully_rematch_second is not False]): | |
378 number_samples_successfully += 1 | |
379 | |
380 return number_samples_successfully, len(listIDs) | |
381 | |
382 | |
383 def main(): | |
384 parser = argparse.ArgumentParser(prog='rematch.py', description='Reads mapping against target sequences, checking mapping and consensus sequences production', formatter_class=argparse.ArgumentDefaultsHelpFormatter) | |
385 parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) | |
386 | |
387 parser_optional_general = parser.add_argument_group('General facultative options') | |
388 parser_optional_general.add_argument('-r', '--reference', type=argparse.FileType('r'), metavar='/path/to/reference_sequence.fasta', help='Fasta file containing reference sequences', required=False) | |
389 parser_optional_general.add_argument('-w', '--workdir', type=str, metavar='/path/to/workdir/directory/', help='Path to the directory where ReMatCh will run and produce the outputs with reads (ended with fastq.gz/fq.gz and, in case of PE data, pair-end direction coded as _R1_001 / _R2_001 or _1 / _2) already present (organized in sample folders) or to be downloaded', required=False, default='.') | |
390 parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use', required=False, default=1) | |
391 parser_optional_general.add_argument('--mlst', type=str, metavar='"Streptococcus agalactiae"', help='Species name (same as in PubMLST) to be used in MLST determination. ReMatCh will use Bowtie2 very-sensitive-local mapping parameters and will recode the soft clip CIGAR flags of the first run', required=False) | |
392 parser_optional_general.add_argument('--doNotUseProvidedSoftware', action='store_true', help='Tells ReMatCh to not use Bowtie2, Samtools and Bcftools that are provided with it') | |
393 | |
394 parser_optional_rematch = parser.add_argument_group('ReMatCh module facultative options') | |
395 parser_optional_rematch.add_argument('--conservedSeq', action='store_true', help=argparse.SUPPRESS) | |
396 # parser_optional_rematch.add_argument('--conservedSeq', action='store_true', help='This option can be used with conserved sequences like MLST genes to speedup the analysis by alignning reads using Bowtie2 sensitive algorithm') | |
397 parser_optional_rematch.add_argument('--extraSeq', type=int, metavar='N', help='Sequence length added to both ends of target sequences (usefull to improve reads mapping to the target one) that will be trimmed in ReMatCh outputs', required=False, default=0) | |
398 parser_optional_rematch.add_argument('--minCovPresence', type=int, metavar='N', help='Reference position minimum coverage depth to consider the position to be present in the sample', required=False, default=5) | |
399 parser_optional_rematch.add_argument('--minCovCall', type=int, metavar='N', help='Reference position minimum coverage depth to perform a base call. Lower coverage will be coded as N', required=False, default=10) | |
400 parser_optional_rematch.add_argument('--minFrequencyDominantAllele', type=float, metavar='0.6', help='Minimum relative frequency of the dominant allele coverage depth (value between [0, 1]). Positions with lower values will be considered as having multiple alleles (and will be coded as N)', required=False, default=0.6) | |
401 parser_optional_rematch.add_argument('--minGeneCoverage', type=int, metavar='N', help='Minimum percentage of target reference gene sequence covered by --minCovPresence to consider a gene to be present (value between [0, 100])', required=False, default=70) | |
402 parser_optional_rematch.add_argument('--minGeneIdentity', type=int, metavar='N', help='Minimum percentage of identity of reference gene sequence covered by --minCovCall to consider a gene to be present (value between [0, 100]). One INDEL will be considered as one difference', required=False, default=80) | |
403 parser_optional_rematch.add_argument('--numMapLoc', type=int, metavar='N', help=argparse.SUPPRESS, required=False, default=1) | |
404 # parser_optional_rematch.add_argument('--numMapLoc', type=int, metavar='N', help='Maximum number of locations to which a read can map (sometimes useful when mapping against similar sequences)', required=False, default=1) | |
405 parser_optional_rematch.add_argument('--doubleRun', action='store_true', help='Tells ReMatCh to run a second time using as reference the noMatter consensus sequence produced in the first run. This will improve consensus sequence determination for sequences with high percentage of target reference gene sequence covered') | |
406 parser_optional_rematch.add_argument('--reportSequenceCoverage', action='store_true', help='Produce an extra combined_report.data_by_gene with the sequence coverage instead of coverage depth') | |
407 parser_optional_rematch.add_argument('--notWriteConsensus', action='store_true', help='Do not write consensus sequences') | |
408 parser_optional_rematch.add_argument('--bowtieOPT', type=str, metavar='"--no-mixed"', help='Extra Bowtie2 options', required=False) | |
409 parser_optional_rematch.add_argument('--debug', action='store_true', help='DeBug Mode: do not remove temporary files') | |
410 | |
411 parser_optional_mlst = parser.add_argument_group('MLST facultative options') | |
412 parser_optional_rematch.add_argument('--mlstReference', action='store_true', help='If the curated scheme for MLST alleles is available, tells ReMatCh to use these as reference (force Bowtie2 to run with very-sensitive-local parameters, and sets --extraSeq to 200), otherwise ReMatCh uses the first alleles of each MLST gene fragment in PubMLST as reference sequences (force Bowtie2 to run with very-sensitive-local parameters, and sets --extraSeq to 0)') | |
413 parser_optional_mlst.add_argument('--mlstSchemaNumber', type=int, metavar='N', help='Number of the species PubMLST schema to be used in case of multiple schemes available (by default will use the first schema)', required=False) | |
414 parser_optional_mlst.add_argument('--mlstConsensus', choices=['noMatter', 'correct', 'alignment', 'all'], type=str, metavar='noMatter', help='Consensus sequence to be used in MLST determination (available options: %(choices)s)', required=False, default='noMatter') | |
415 parser_optional_mlst.add_argument('--mlstRun', choices=['first', 'second', 'all'], type=str, metavar='first', help='ReMatCh run outputs to be used in MLST determination (available options: %(choices)s)', required=False, default='all') | |
416 | |
417 parser_optional_download = parser.add_argument_group('Download facultative options') | |
418 parser_optional_download.add_argument('-a', '--asperaKey', type=argparse.FileType('r'), metavar='/path/to/asperaweb_id_dsa.openssh', help='Tells ReMatCh to download fastq files from ENA using Aspera Connect. With this option, the path to Private-key file asperaweb_id_dsa.openssh must be provided (normaly found in ~/.aspera/connect/etc/asperaweb_id_dsa.openssh).', required=False) | |
419 parser_optional_download.add_argument('-k', '--keepDownloadedFastq', action='store_true', help='Tells ReMatCh to keep the fastq files downloaded') | |
420 parser_optional_download.add_argument('--downloadLibrariesType', type=str, metavar='PAIRED', help='Tells ReMatCh to download files with specific library layout (available options: %(choices)s)', choices=['PAIRED', 'SINGLE', 'BOTH'], required=False, default='BOTH') | |
421 parser_optional_download.add_argument('--downloadInstrumentPlatform', type=str, metavar='ILLUMINA', help='Tells ReMatCh to download files with specific library layout (available options: %(choices)s)', choices=['ILLUMINA', 'ALL'], required=False, default='ILLUMINA') | |
422 parser_optional_download.add_argument('--downloadCramBam', action='store_true', help='Tells ReMatCh to also download cram/bam files and convert them to fastq files') | |
423 | |
424 parser_optional_softClip = parser.add_argument_group('Soft clip facultative options') | |
425 parser_optional_softClip.add_argument('--softClip_baseQuality', type=int, metavar='N', help='Base quality phred score in reads soft clipped regions', required=False, default=7) | |
426 parser_optional_download.add_argument('--softClip_recodeRun', type=str, metavar='first', help='ReMatCh run to recode soft clipped regions (available options: %(choices)s)', choices=['first', 'second', 'both', 'none'], required=False, default='none') | |
427 parser_optional_download.add_argument('--softClip_cigarFlagRecode', type=str, metavar='M', help='CIGAR flag to recode CIGAR soft clip (available options: %(choices)s)', choices=['M', 'I', 'X'], required=False, default='X') | |
428 | |
429 parser_optional_download_exclusive = parser.add_mutually_exclusive_group() | |
430 parser_optional_download_exclusive.add_argument('-l', '--listIDs', type=argparse.FileType('r'), metavar='/path/to/list_IDs.txt', help='Path to list containing the IDs to be downloaded (one per line)', required=False) | |
431 parser_optional_download_exclusive.add_argument('-t', '--taxon', type=str, metavar='"Streptococcus agalactiae"', help='Taxon name for which ReMatCh will download fastq files', required=False) | |
432 | |
433 args = parser.parse_args() | |
434 | |
435 if args.reference is None and not args.mlstReference: | |
436 parser.error('At least --reference or --mlstReference should be provided') | |
437 elif args.reference is not None and args.mlstReference: | |
438 parser.error('Only --reference or --mlstReference should be provided') | |
439 else: | |
440 if args.mlstReference: | |
441 if args.mlst is None: | |
442 parser.error('Please provide species name using --mlst') | |
443 | |
444 if args.minFrequencyDominantAllele < 0 or args.minFrequencyDominantAllele > 1: | |
445 parser.error('--minFrequencyDominantAllele should be a value between [0, 1]') | |
446 | |
447 if args.minGeneCoverage < 0 or args.minGeneCoverage > 100: | |
448 parser.error('--minGeneCoverage should be a value between [0, 100]') | |
449 if args.minGeneIdentity < 0 or args.minGeneIdentity > 100: | |
450 parser.error('--minGeneIdentity should be a value between [0, 100]') | |
451 | |
452 start_time = time.time() | |
453 | |
454 number_samples_successfully, samples_total_number = runRematch(args) | |
455 | |
456 print '\n' + 'END ReMatCh' | |
457 print '\n' + str(number_samples_successfully) + ' samples out of ' + str(samples_total_number) + ' run successfully' | |
458 time_taken = utils.runTime(start_time) | |
459 del time_taken | |
460 | |
461 if number_samples_successfully == 0: | |
462 sys.exit('No samples run successfully!') | |
463 | |
464 | |
465 if __name__ == "__main__": | |
466 main() |