comparison scripts/ReMatCh/rematch.py @ 3:0cbed1c0a762 draft default tip

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Tue, 28 Jan 2020 10:42:31 -0500
parents 965517909457
children
comparison
equal deleted inserted replaced
2:6837f733b4aa 3:0cbed1c0a762
1 #!/usr/bin/env python 1 #!/usr/bin/env python3
2 2
3 # -*- coding: utf-8 -*- 3 # -*- coding: utf-8 -*-
4 4
5 """ 5 """
6 rematch.py - Reads mapping against target sequences, checking mapping 6 rematch.py - Reads mapping against target sequences, checking mapping
7 and consensus sequences production 7 and consensus sequences production
8 <https://github.com/B-UMMI/ReMatCh/> 8 <https://github.com/B-UMMI/ReMatCh/>
9 9
10 Copyright (C) 2017 Miguel Machado <mpmachado@medicina.ulisboa.pt> 10 Copyright (C) 2019 Miguel Machado <mpmachado@medicina.ulisboa.pt>
11 11
12 Last modified: April 12, 2017 12 Last modified: August 08, 2019
13 13
14 This program is free software: you can redistribute it and/or modify 14 This program is free software: you can redistribute it and/or modify
15 it under the terms of the GNU General Public License as published by 15 it under the terms of the GNU General Public License as published by
16 the Free Software Foundation, either version 3 of the License, or 16 the Free Software Foundation, either version 3 of the License, or
17 (at your option) any later version. 17 (at your option) any later version.
27 27
28 import os 28 import os
29 import sys 29 import sys
30 import time 30 import time
31 import argparse 31 import argparse
32 import modules.utils as utils 32
33 import modules.seqFromWebTaxon as seqFromWebTaxon 33 try:
34 import modules.download as download 34 from __init__ import __version__
35 import modules.rematch_module as rematch_module 35
36 import modules.checkMLST as checkMLST 36 import modules.utils as utils
37 37 import modules.seqFromWebTaxon as seq_from_web_taxon
38 38 import modules.download as download
39 version = '3.2' 39 import modules.rematch_module as rematch_module
40 40 import modules.checkMLST as check_mlst
41 41 except ImportError:
42 def searchFastqFiles(directory): 42 from ReMatCh.__init__ import __version__
43 filesExtensions = ['.fastq.gz', '.fq.gz'] 43
44 pairEnd_filesSeparation = [['_R1_001.f', '_R2_001.f'], ['_1.f', '_2.f']] 44 from ReMatCh.modules import utils as utils
45 45 from ReMatCh.modules import seqFromWebTaxon as seq_from_web_taxon
46 listIDs = {} 46 from ReMatCh.modules import download as download
47 directories = [d for d in os.listdir(directory) if not d.startswith('.') and os.path.isdir(os.path.join(directory, d, ''))] 47 from ReMatCh.modules import rematch_module as rematch_module
48 from ReMatCh.modules import checkMLST as check_mlst
49
50
51 def search_fastq_files(directory):
52 files_extensions = ['.fastq.gz', '.fq.gz']
53 pair_end_files_separation = [['_R1_001.f', '_R2_001.f'], ['_1.f', '_2.f']]
54
55 list_ids = {}
56 directories = [d for d in os.listdir(directory) if
57 not d.startswith('.') and os.path.isdir(os.path.join(directory, d, ''))]
48 for directory_found in directories: 58 for directory_found in directories:
49 if directory_found != 'pubmlst': 59 if directory_found != 'pubmlst':
50 directory_path = os.path.join(directory, directory_found, '') 60 directory_path = os.path.join(directory, directory_found, '')
51 61
52 fastqFound = [] 62 fastq_found = []
53 files = [f for f in os.listdir(directory_path) if not f.startswith('.') and os.path.isfile(os.path.join(directory_path, f))] 63 files = [f for f in os.listdir(directory_path) if
64 not f.startswith('.') and os.path.isfile(os.path.join(directory_path, f))]
54 for file_found in files: 65 for file_found in files:
55 if file_found.endswith(tuple(filesExtensions)): 66 if file_found.endswith(tuple(files_extensions)):
56 fastqFound.append(file_found) 67 fastq_found.append(file_found)
57 68
58 if len(fastqFound) == 1: 69 if len(fastq_found) == 1:
59 listIDs[directory_found] = [os.path.join(directory_path, f) for f in fastqFound] 70 list_ids[directory_found] = [os.path.join(directory_path, f) for f in fastq_found]
60 elif len(fastqFound) >= 2: 71 elif len(fastq_found) >= 2:
61 file_pair = [] 72 file_pair = []
62 73
63 # Search pairs 74 # Search pairs
64 for PE_separation in pairEnd_filesSeparation: 75 for pe_separation in pair_end_files_separation:
65 for fastq in fastqFound: 76 for fastq in fastq_found:
66 if PE_separation[0] in fastq or PE_separation[1] in fastq: 77 if pe_separation[0] in fastq or pe_separation[1] in fastq:
67 file_pair.append(fastq) 78 file_pair.append(fastq)
68 79
69 if len(file_pair) == 2: 80 if len(file_pair) == 2:
70 break 81 break
71 else: 82 else:
72 file_pair = [] 83 file_pair = []
73 84
74 # Search single 85 # Search single
75 if len(file_pair) == 0: 86 if len(file_pair) == 0:
76 for PE_separation in pairEnd_filesSeparation: 87 for pe_separation in pair_end_files_separation:
77 for fastq in fastqFound: 88 for fastq in fastq_found:
78 if PE_separation[0] not in fastq or PE_separation[1] not in fastq: 89 if pe_separation[0] not in fastq or pe_separation[1] not in fastq:
79 file_pair.append(fastq) 90 file_pair.append(fastq)
80 91
81 if len(file_pair) >= 1: 92 if len(file_pair) >= 1:
82 file_pair = file_pair[0] 93 file_pair = file_pair[0]
83 94
84 if len(file_pair) >= 1: 95 if len(file_pair) >= 1:
85 listIDs[directory_found] = [os.path.join(directory_path, f) for f in file_pair] 96 list_ids[directory_found] = [os.path.join(directory_path, f) for f in file_pair]
86 97
87 return listIDs 98 return list_ids
88 99
89 100
90 def getListIDs_fromFile(fileListIDs): 101 def get_list_ids_from_file(file_list_ids):
91 list_ids = [] 102 list_ids = []
92 103
93 with open(fileListIDs, 'rtU') as lines: 104 with open(file_list_ids, 'rtU') as lines:
94 for line in lines: 105 for line in lines:
95 line = line.splitlines()[0] 106 line = line.rstrip('\r\n')
96 if len(line) > 0: 107 if len(line) > 0:
97 list_ids.append(line) 108 list_ids.append(line)
98 109
99 if len(list_ids) == 0: 110 if len(list_ids) == 0:
100 sys.exit('No runIDs were found in ' + fileListIDs) 111 sys.exit('No runIDs were found in ' + file_list_ids)
101 112
102 return list_ids 113 return list_ids
103 114
104 115
105 def getTaxonRunIDs(taxon_name, outputfile): 116 def get_taxon_run_ids(taxon_name, outputfile):
106 seqFromWebTaxon.runSeqFromWebTaxon(taxon_name, outputfile, True, True, True, False) 117 seq_from_web_taxon.run_seq_from_web_taxon(taxon_name, outputfile, True, True, True, False)
107 118
108 runIDs = [] 119 run_ids = []
109 with open(outputfile, 'rtU') as reader: 120 with open(outputfile, 'rtU') as reader:
110 for line in reader: 121 for line in reader:
111 line = line.splitlines()[0] 122 line = line.rstrip('\r\n')
112 if len(line) > 0: 123 if len(line) > 0:
113 if not line.startswith('#'): 124 if not line.startswith('#'):
114 line = line.split('\t') 125 line = line.split('\t')
115 runIDs.append(line[0]) 126 run_ids.append(line[0])
116 127
117 return runIDs 128 return run_ids
118 129
119 130
120 def getListIDs(workdir, fileListIDs, taxon_name): 131 def get_list_ids(workdir, file_list_ids, taxon_name):
121 searched_fastq_files = False 132 searched_fastq_files = False
122 listIDs = [] 133 list_ids = []
123 if fileListIDs is None and taxon_name is None: 134 if file_list_ids is None and taxon_name is None:
124 listIDs = searchFastqFiles(workdir) 135 list_ids = search_fastq_files(workdir)
125 searched_fastq_files = True 136 searched_fastq_files = True
126 elif fileListIDs is not None: 137 elif file_list_ids is not None:
127 listIDs = getListIDs_fromFile(os.path.abspath(fileListIDs)) 138 list_ids = get_list_ids_from_file(os.path.abspath(file_list_ids))
128 elif taxon_name is not None and fileListIDs is None: 139 elif taxon_name is not None and file_list_ids is None:
129 listIDs = getTaxonRunIDs(taxon_name, os.path.join(workdir, 'IDs_list.seqFromWebTaxon.tab')) 140 list_ids = get_taxon_run_ids(taxon_name, os.path.join(workdir, 'IDs_list.seqFromWebTaxon.tab'))
130 141
131 if len(listIDs) == 0: 142 if len(list_ids) == 0:
132 sys.exit('No IDs were found') 143 sys.exit('No IDs were found')
133 return listIDs, searched_fastq_files 144 return list_ids, searched_fastq_files
134 145
135 146
136 def format_gene_info(gene_specific_info, minimum_gene_coverage, minimum_gene_identity, reported_data_type): 147 def format_gene_info(gene_specific_info, minimum_gene_coverage, minimum_gene_identity, reported_data_type, summary,
148 sample, genes_present):
137 info = None 149 info = None
138 if gene_specific_info['gene_coverage'] >= minimum_gene_coverage and gene_specific_info['gene_identity'] >= minimum_gene_identity: 150 if gene_specific_info['gene_coverage'] >= minimum_gene_coverage and \
151 gene_specific_info['gene_identity'] >= minimum_gene_identity:
152 if summary and sample not in genes_present:
153 genes_present[sample] = {}
154
139 if gene_specific_info['gene_number_positions_multiple_alleles'] == 0: 155 if gene_specific_info['gene_number_positions_multiple_alleles'] == 0:
140 info = str(gene_specific_info[reported_data_type]) 156 s = str(gene_specific_info[reported_data_type])
157 info = str(s)
158 if summary:
159 genes_present[sample][gene_specific_info['header']] = str(s)
141 else: 160 else:
142 info = 'multiAlleles_' + str(gene_specific_info[reported_data_type]) 161 s = 'multiAlleles_' + str(gene_specific_info[reported_data_type])
162 info = str(s)
163 if summary:
164 genes_present[sample][gene_specific_info['header']] = str(s)
143 else: 165 else:
144 info = 'absent_' + str(gene_specific_info[reported_data_type]) 166 info = 'absent_' + str(gene_specific_info[reported_data_type])
145 167
146 return info 168 return info, genes_present
147 169
148 170
149 def write_data_by_gene(gene_list_reference, minimum_gene_coverage, sample, data_by_gene, outdir, time_str, run_times, minimum_gene_identity, reported_data_type): 171 def write_data_by_gene(gene_list_reference, minimum_gene_coverage, sample, data_by_gene, outdir, time_str, run_times,
150 combined_report = os.path.join(outdir, 'combined_report.data_by_gene.' + run_times + '.' + reported_data_type + '.' + time_str + '.tab') 172 minimum_gene_identity, reported_data_type, summary, genes_present):
173 combined_report = \
174 os.path.join(outdir,
175 'combined_report.data_by_gene.' + run_times + '.' + reported_data_type + '.' + time_str + '.tab')
151 176
152 if reported_data_type == 'coverage_depth': 177 if reported_data_type == 'coverage_depth':
153 reported_data_type = 'gene_mean_read_coverage' 178 reported_data_type = 'gene_mean_read_coverage'
154 elif reported_data_type == 'sequence_coverage': 179 elif reported_data_type == 'sequence_coverage':
155 reported_data_type = 'gene_coverage' 180 reported_data_type = 'gene_coverage'
156 181
157 combined_report_exist = os.path.isfile(combined_report) 182 combined_report_exist = os.path.isfile(combined_report)
158 with open(combined_report, 'at') as writer: 183 with open(combined_report, 'at') as writer:
159 seq_list = gene_list_reference.keys() 184 seq_list = sorted(gene_list_reference.keys())
160 if not combined_report_exist: 185 if not combined_report_exist:
161 writer.write('#sample' + '\t' + '\t'.join([gene_list_reference[seq] for seq in seq_list]) + '\n') 186 writer.write('#sample' + '\t' + '\t'.join([gene_list_reference[seq] for seq in seq_list]) + '\n')
162 187
163 results = {} 188 results = {}
164 headers = [] 189 headers = []
165 for i in data_by_gene: 190 for i in data_by_gene:
166 results[data_by_gene[i]['header']] = format_gene_info(data_by_gene[i], minimum_gene_coverage, minimum_gene_identity, reported_data_type) 191 results[data_by_gene[i]['header']], genes_present = format_gene_info(data_by_gene[i], minimum_gene_coverage,
192 minimum_gene_identity,
193 reported_data_type, summary, sample,
194 genes_present)
167 headers.append(data_by_gene[i]['header']) 195 headers.append(data_by_gene[i]['header'])
168 196
169 if len(headers) != gene_list_reference: 197 if len(headers) != gene_list_reference:
170 for gene in gene_list_reference: 198 for gene in gene_list_reference:
171 if gene not in headers: 199 if gene not in headers:
172 results[gene] = 'NA' 200 results[gene] = 'NA'
173 201
174 writer.write(sample + '\t' + '\t'.join([results[seq] for seq in seq_list]) + '\n') 202 writer.write(sample + '\t' + '\t'.join([results[seq] for seq in seq_list]) + '\n')
175 203
176 204 return genes_present
177 def write_sample_report(sample, outdir, time_str, fileSize, run_successfully_fastq, run_successfully_rematch_first, run_successfully_rematch_second, time_taken_fastq, time_taken_rematch_first, time_taken_rematch_second, time_taken_sample, sequencingInformation, sample_data_general_first, sample_data_general_second, fastq_used): 205
206
207 def write_sample_report(sample, outdir, time_str, file_size, run_successfully_fastq, run_successfully_rematch_first,
208 run_successfully_rematch_second, time_taken_fastq, time_taken_rematch_first,
209 time_taken_rematch_second, time_taken_sample, sequencing_information, sample_data_general_first,
210 sample_data_general_second, fastq_used):
178 sample_report = os.path.join(outdir, 'sample_report.' + time_str + '.tab') 211 sample_report = os.path.join(outdir, 'sample_report.' + time_str + '.tab')
179 report_exist = os.path.isfile(sample_report) 212 report_exist = os.path.isfile(sample_report)
180 213
181 header_general = ['sample', 'sample_run_successfully', 'sample_run_time', 'files_size', 'download_run_successfully', 'download_run_time', 'rematch_run_successfully_first', 'rematch_run_time_first', 'rematch_run_successfully_second', 'rematch_run_time_second'] 214 header_general = ['sample', 'sample_run_successfully', 'sample_run_time', 'files_size', 'download_run_successfully',
215 'download_run_time', 'rematch_run_successfully_first', 'rematch_run_time_first',
216 'rematch_run_successfully_second', 'rematch_run_time_second']
182 header_data_general = ['number_absent_genes', 'number_genes_multiple_alleles', 'mean_sample_coverage'] 217 header_data_general = ['number_absent_genes', 'number_genes_multiple_alleles', 'mean_sample_coverage']
183 header_sequencing = ['run_accession', 'instrument_platform', 'instrument_model', 'library_layout', 'library_source', 'extra_run_accession', 'nominal_length', 'read_count', 'base_count', 'date_download'] 218 header_sequencing = ['run_accession', 'instrument_platform', 'instrument_model', 'library_layout', 'library_source',
219 'extra_run_accession', 'nominal_length', 'read_count', 'base_count', 'date_download']
184 220
185 with open(sample_report, 'at') as writer: 221 with open(sample_report, 'at') as writer:
186 if not report_exist: 222 if not report_exist:
187 writer.write('#' + '\t'.join(header_general) + '\t' + '_first\t'.join(header_data_general) + '_first\t' + '_second\t'.join(header_data_general) + '_second\t' + '\t'.join(header_sequencing) + '\t' + 'fastq_used' + '\n') 223 writer.write('#' + '\t'.join(header_general) + '\t' + '_first\t'.join(header_data_general) + '_first\t' +
188 224 '_second\t'.join(header_data_general) + '_second\t' + '\t'.join(header_sequencing) + '\t' +
189 writer.write('\t'.join([sample, str(all([run_successfully_fastq is not False, run_successfully_rematch_first is not False, run_successfully_rematch_second is not False])), str(time_taken_sample), str(fileSize), str(run_successfully_fastq), str(time_taken_fastq), str(run_successfully_rematch_first), str(time_taken_rematch_first), str(run_successfully_rematch_second), str(time_taken_rematch_second)]) + '\t' + '\t'.join([str(sample_data_general_first[i]) for i in header_data_general]) + '\t' + '\t'.join([str(sample_data_general_second[i]) for i in header_data_general]) + '\t' + '\t'.join([str(sequencingInformation[i]) for i in header_sequencing]) + '\t' + ','.join(fastq_used) + '\n') 225 'fastq_used' + '\n')
190 226
191 227 writer.write('\t'.join([sample,
192 def concatenate_extraSeq_2_consensus(consensus_sequence, reference_sequence, extraSeq_length, outdir): 228 str(all([run_successfully_fastq is not False,
193 reference_dict, ignore, ignore = rematch_module.get_sequence_information(reference_sequence, extraSeq_length) 229 run_successfully_rematch_first is not False,
230 run_successfully_rematch_second is not False])),
231 str(time_taken_sample),
232 str(file_size),
233 str(run_successfully_fastq),
234 str(time_taken_fastq),
235 str(run_successfully_rematch_first),
236 str(time_taken_rematch_first),
237 str(run_successfully_rematch_second),
238 str(time_taken_rematch_second)]) +
239 '\t' + '\t'.join([str(sample_data_general_first[i]) for i in header_data_general]) +
240 '\t' + '\t'.join([str(sample_data_general_second[i]) for i in header_data_general]) +
241 '\t' + '\t'.join([str(sequencing_information[i]) for i in header_sequencing]) +
242 '\t' + ','.join(fastq_used) + '\n')
243
244
245 def concatenate_extra_seq_2_consensus(consensus_sequence, reference_sequence, extra_seq_length, outdir):
246 reference_dict, ignore, ignore = rematch_module.get_sequence_information(reference_sequence, extra_seq_length)
194 consensus_dict, genes, ignore = rematch_module.get_sequence_information(consensus_sequence, 0) 247 consensus_dict, genes, ignore = rematch_module.get_sequence_information(consensus_sequence, 0)
195 for k, values_consensus in consensus_dict.items(): 248 number_consensus_with_sequences = 0
196 for values_reference in reference_dict.values(): 249 for k, values_consensus in list(consensus_dict.items()):
250 for values_reference in list(reference_dict.values()):
197 if values_reference['header'] == values_consensus['header']: 251 if values_reference['header'] == values_consensus['header']:
198 if extraSeq_length <= len(values_reference['sequence']): 252 if len(set(consensus_dict[k]['sequence'])) > 1:
199 right_extra_seq = '' if extraSeq_length == 0 else values_reference['sequence'][-extraSeq_length:] 253 number_consensus_with_sequences += 1
200 consensus_dict[k]['sequence'] = values_reference['sequence'][:extraSeq_length] + consensus_dict[k]['sequence'] + right_extra_seq 254 if extra_seq_length <= len(values_reference['sequence']):
201 consensus_dict[k]['length'] += extraSeq_length + len(right_extra_seq) 255 right_extra_seq = \
256 '' if extra_seq_length == 0 else values_reference['sequence'][-extra_seq_length:]
257 consensus_dict[k]['sequence'] = \
258 values_reference['sequence'][:extra_seq_length] + \
259 consensus_dict[k]['sequence'] + \
260 right_extra_seq
261 consensus_dict[k]['length'] += extra_seq_length + len(right_extra_seq)
202 262
203 consensus_concatenated = os.path.join(outdir, 'consensus_concatenated_extraSeq.fasta') 263 consensus_concatenated = os.path.join(outdir, 'consensus_concatenated_extraSeq.fasta')
204 with open(consensus_concatenated, 'wt') as writer: 264 with open(consensus_concatenated, 'wt') as writer:
205 for i in consensus_dict: 265 for i in consensus_dict:
206 writer.write('>' + consensus_dict[i]['header'] + '\n') 266 writer.write('>' + consensus_dict[i]['header'] + '\n')
207 fasta_sequence_lines = rematch_module.chunkstring(consensus_dict[i]['sequence'], 80) 267 fasta_sequence_lines = rematch_module.chunkstring(consensus_dict[i]['sequence'], 80)
208 for line in fasta_sequence_lines: 268 for line in fasta_sequence_lines:
209 writer.write(line + '\n') 269 writer.write(line + '\n')
210 270
211 return consensus_concatenated, genes, consensus_dict 271 return consensus_concatenated, genes, consensus_dict, number_consensus_with_sequences
212 272
213 273
214 def clean_headers_reference_file(reference_file, outdir, extraSeq): 274 def clean_headers_reference_file(reference_file, outdir, extra_seq):
215 problematic_characters = ["|", " ", ",", ".", "(", ")", "'", "/", ":"] 275 problematic_characters = ["|", " ", ",", ".", "(", ")", "'", "/", ":"]
216 print 'Checking if reference sequences contain ' + str(problematic_characters) + '\n' 276 print('Checking if reference sequences contain ' + str(problematic_characters) + '\n')
217 headers_changed = False 277 # headers_changed = False
218 new_reference_file = str(reference_file) 278 new_reference_file = str(reference_file)
219 sequences, genes, headers_changed = rematch_module.get_sequence_information(reference_file, extraSeq) 279 sequences, genes, headers_changed = rematch_module.get_sequence_information(reference_file, extra_seq)
220 if headers_changed: 280 if headers_changed:
221 print 'At least one of the those characters was found. Replacing those with _' + '\n' 281 print('At least one of the those characters was found. Replacing those with _' + '\n')
222 new_reference_file = os.path.join(outdir, os.path.splitext(os.path.basename(reference_file))[0] + '.headers_renamed.fasta') 282 new_reference_file = \
283 os.path.join(outdir, os.path.splitext(os.path.basename(reference_file))[0] + '.headers_renamed.fasta')
223 with open(new_reference_file, 'wt') as writer: 284 with open(new_reference_file, 'wt') as writer:
224 for i in sequences: 285 for i in sequences:
225 writer.write('>' + sequences[i]['header'] + '\n') 286 writer.write('>' + sequences[i]['header'] + '\n')
226 fasta_sequence_lines = rematch_module.chunkstring(sequences[i]['sequence'], 80) 287 fasta_sequence_lines = rematch_module.chunkstring(sequences[i]['sequence'], 80)
227 for line in fasta_sequence_lines: 288 for line in fasta_sequence_lines:
228 writer.write(line + '\n') 289 writer.write(line + '\n')
229 return new_reference_file, genes, sequences 290 return new_reference_file, genes, sequences
230 291
231 292
232 def write_mlst_report(sample, run_times, consensus_type, st, alleles_profile, lociOrder, outdir, time_str): 293 def write_mlst_report(sample, run_times, consensus_type, st, alleles_profile, loci_order, outdir, time_str):
233 mlst_report = os.path.join(outdir, 'mlst_report.' + time_str + '.tab') 294 mlst_report = os.path.join(outdir, 'mlst_report.' + time_str + '.tab')
234 mlst_report_exist = os.path.isfile(mlst_report) 295 mlst_report_exist = os.path.isfile(mlst_report)
235 with open(mlst_report, 'at') as writer: 296 with open(mlst_report, 'at') as writer:
236 if not mlst_report_exist: 297 if not mlst_report_exist:
237 writer.write('\t'.join(['#sample', 'ReMatCh_run', 'consensus_type', 'ST'] + lociOrder) + '\n') 298 writer.write('\t'.join(['#sample', 'ReMatCh_run', 'consensus_type', 'ST'] + loci_order) + '\n')
238 writer.write('\t'.join([sample, run_times, consensus_type, str(st)] + alleles_profile.split(',')) + '\n') 299 writer.write('\t'.join([sample, run_times, consensus_type, str(st)] + alleles_profile.split(',')) + '\n')
239 300
240 301
241 def run_get_st(sample, mlst_dicts, consensus_sequences, mlstConsensus, run_times, outdir, time_str): 302 def run_get_st(sample, mlst_dicts, consensus_sequences, mlst_consensus, run_times, outdir, time_str):
242 if mlstConsensus == 'all': 303 if mlst_consensus == 'all':
243 for consensus_type in consensus_sequences: 304 for consensus_type in consensus_sequences:
244 print 'Searching MLST for ' + consensus_type + ' consensus' 305 print('Searching MLST for ' + consensus_type + ' consensus')
245 st, alleles_profile = checkMLST.getST(mlst_dicts, consensus_sequences[consensus_type]) 306 st, alleles_profile = check_mlst.get_st(mlst_dicts, consensus_sequences[consensus_type])
246 write_mlst_report(sample, run_times, consensus_type, st, alleles_profile, mlst_dicts[2], outdir, time_str) 307 write_mlst_report(sample, run_times, consensus_type, st, alleles_profile, mlst_dicts[2], outdir, time_str)
247 print 'ST found: ' + str(st) + ' (' + alleles_profile + ')' 308 print('ST found: ' + str(st) + ' (' + alleles_profile + ')')
248 else: 309 else:
249 st, alleles_profile = checkMLST.getST(mlst_dicts, consensus_sequences[mlstConsensus]) 310 st, alleles_profile = check_mlst.get_st(mlst_dicts, consensus_sequences[mlst_consensus])
250 write_mlst_report(sample, run_times, mlstConsensus, st, alleles_profile, mlst_dicts[2], outdir, time_str) 311 write_mlst_report(sample, run_times, mlst_consensus, st, alleles_profile, mlst_dicts[2], outdir, time_str)
251 print 'ST found for ' + mlstConsensus + ' consensus: ' + str(st) + ' (' + alleles_profile + ')' 312 print('ST found for ' + mlst_consensus + ' consensus: ' + str(st) + ' (' + alleles_profile + ')')
252 313
253 314
254 def runRematch(args): 315 def write_summary_report(outdir, reported_data_type, time_str, gene_list_reference, genes_present):
316 with open(os.path.join(outdir,
317 'summary.{reported_data_type}.{time_str}.tab'.format(reported_data_type=reported_data_type,
318 time_str=time_str)), 'wt') as writer:
319 seq_list = []
320 for info in list(genes_present.values()):
321 seq_list.extend(list(info.keys()))
322 seq_list = list(set(seq_list))
323 writer.write('#sample' + '\t' + '\t'.join([gene_list_reference[seq] for seq in sorted(seq_list)]) + '\n')
324 for sample, info in list(genes_present.items()):
325 data = []
326 for seq in sorted(seq_list):
327 if seq in info:
328 data.append(info[seq])
329 else:
330 data.append('NF')
331 writer.write(sample + '\t' + '\t'.join(data) + '\n')
332
333
334 def run_rematch(args):
255 workdir = os.path.abspath(args.workdir) 335 workdir = os.path.abspath(args.workdir)
256 if not os.path.isdir(workdir): 336 if not os.path.isdir(workdir):
257 os.makedirs(workdir) 337 os.makedirs(workdir)
258 338
259 asperaKey = os.path.abspath(args.asperaKey.name) if args.asperaKey is not None else None 339 aspera_key = os.path.abspath(args.asperaKey.name) if args.asperaKey is not None else None
260 340
261 # Start logger 341 # Start logger
262 logfile, time_str = utils.start_logger(workdir) 342 logfile, time_str = utils.start_logger(workdir)
263 343
264 # Get general information 344 # Get general information
265 script_path = utils.general_information(logfile, version, workdir, time_str, args.doNotUseProvidedSoftware, asperaKey, args.downloadCramBam) 345 script_path = utils.general_information(logfile, __version__, workdir, time_str, args.doNotUseProvidedSoftware,
266 346 aspera_key, args.downloadCramBam, args.SRA, args.SRAopt)
267 # Set listIDs 347
268 listIDs, searched_fastq_files = getListIDs(workdir, args.listIDs.name if args.listIDs is not None else None, args.taxon) 348 # Set list_ids
269 349 list_ids, searched_fastq_files = get_list_ids(workdir, args.listIDs.name if args.listIDs is not None else None,
350 args.taxon)
351
352 mlst_sequences = None
353 mlst_dicts = None
270 if args.mlst is not None: 354 if args.mlst is not None:
271 time_taken_PubMLST, mlst_dicts, mlst_sequences = checkMLST.downloadPubMLSTxml(args.mlst, args.mlstSchemaNumber, workdir) 355 time_taken_pub_mlst, mlst_dicts, mlst_sequences = check_mlst.download_pub_mlst_xml(args.mlst,
356 args.mlstSchemaNumber,
357 workdir)
272 args.softClip_recodeRun = 'first' 358 args.softClip_recodeRun = 'first'
273 args.conservedSeq = False
274 359
275 if args.reference is None: 360 if args.reference is None:
276 reference_file = checkMLST.check_existing_schema(args.mlst, args.mlstSchemaNumber, script_path) 361 if args.mlst is not None:
277 args.extraSeq = 200 362 reference_file = check_mlst.check_existing_schema(args.mlst, args.mlstSchemaNumber, script_path)
278 if reference_file is None: 363 args.extraSeq = 200
279 print 'It was not found provided MLST scheme sequences for ' + args.mlst 364 if reference_file is None:
280 print 'Trying to obtain reference MLST sequences from PubMLST' 365 print('It was not found provided MLST scheme sequences for ' + args.mlst)
281 if len(mlst_sequences) > 0: 366 print('Trying to obtain reference MLST sequences from PubMLST')
282 reference_file = checkMLST.write_mlst_reference(args.mlst, mlst_sequences, workdir, time_str) 367 if len(mlst_sequences) > 0:
283 args.extraSeq = 0 368 reference_file = check_mlst.write_mlst_reference(args.mlst, mlst_sequences, workdir, time_str)
369 args.extraSeq = 0
370 else:
371 sys.exit('It was not possible to download MLST sequences from PubMLST!')
284 else: 372 else:
285 sys.exit('It was not possible to download MLST sequences from PubMLST!') 373 print('Using provided scheme as referece: ' + reference_file)
286 else: 374 else:
287 print 'Using provided scheme as referece: ' + reference_file 375 sys.exit('Need to provide at least one of the following options: "--reference" and "--mlst"')
288 else: 376 else:
289 reference_file = os.path.abspath(args.reference.name) 377 reference_file = os.path.abspath(args.reference.name)
290 378
291 # Run ReMatCh for each sample 379 # Run ReMatCh for each sample
292 print '\n' + 'STARTING ReMatCh' + '\n' 380 print('\n' + 'STARTING ReMatCh' + '\n')
293 381
294 # Clean sequences headers 382 # Clean sequences headers
295 reference_file, gene_list_reference, reference_dict = clean_headers_reference_file(reference_file, workdir, args.extraSeq) 383 reference_file, gene_list_reference, reference_dict = clean_headers_reference_file(reference_file, workdir,
384 args.extraSeq)
296 385
297 if args.mlst is not None: 386 if args.mlst is not None:
298 problem_genes = False 387 problem_genes = False
299 for header in mlst_sequences: 388 for header in mlst_sequences:
300 if header not in gene_list_reference: 389 if header not in gene_list_reference:
301 print 'MLST gene {header} not found between reference sequences'.format(header=header) 390 print('MLST gene {header} not found between reference sequences'.format(header=header))
302 problem_genes = True 391 problem_genes = True
303 if problem_genes: 392 if problem_genes:
304 sys.exit('Missing MLST genes from reference sequences (at least sequences names do not match)!') 393 sys.exit('Missing MLST genes from reference sequences (at least sequences names do not match)!')
305 394
306 if len(gene_list_reference) == 0: 395 if len(gene_list_reference) == 0:
307 sys.exit('No sequences left') 396 sys.exit('No sequences left')
308 397
309 # To use in combined report 398 # To use in combined report
310 399
311 number_samples_successfully = 0 400 number_samples_successfully = 0
312 for sample in listIDs: 401 genes_present_coverage_depth = {}
402 genes_present_sequence_coverage = {}
403 for sample in list_ids:
313 sample_start_time = time.time() 404 sample_start_time = time.time()
314 print '\n\n' + 'Sample ID: ' + sample 405 print('\n\n' + 'Sample ID: ' + sample)
315 406
316 # Create sample outdir 407 # Create sample outdir
317 sample_outdir = os.path.join(workdir, sample, '') 408 sample_outdir = os.path.join(workdir, sample, '')
318 if not os.path.isdir(sample_outdir): 409 if not os.path.isdir(sample_outdir):
319 os.mkdir(sample_outdir) 410 os.mkdir(sample_outdir)
320 411
321 run_successfully_fastq = None 412 run_successfully_fastq = None
322 time_taken_fastq = 0 413 time_taken_fastq = 0
323 sequencingInformation = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None, 'library_layout': None, 'library_source': None, 'extra_run_accession': None, 'nominal_length': None, 'read_count': None, 'base_count': None, 'date_download': None} 414 sequencing_information = {'run_accession': None, 'instrument_platform': None, 'instrument_model': None,
415 'library_layout': None, 'library_source': None, 'extra_run_accession': None,
416 'nominal_length': None, 'read_count': None, 'base_count': None, 'date_download': None}
324 if not searched_fastq_files: 417 if not searched_fastq_files:
325 # Download Files 418 # Download Files
326 time_taken_fastq, run_successfully_fastq, fastq_files, sequencingInformation = download.runDownload(sample, args.downloadLibrariesType, asperaKey, sample_outdir, args.downloadCramBam, args.threads, args.downloadInstrumentPlatform) 419 time_taken_fastq, run_successfully_fastq, fastq_files, sequencing_information = \
420 download.run_download(sample, args.downloadLibrariesType, aspera_key, sample_outdir,
421 args.downloadCramBam, args.threads, args.downloadInstrumentPlatform, args.SRA,
422 args.SRAopt)
327 else: 423 else:
328 fastq_files = listIDs[sample] 424 fastq_files = list_ids[sample]
329 425
330 fileSize = None 426 file_size = None
331 427
332 run_successfully_rematch_first = None 428 run_successfully_rematch_first = None
333 run_successfully_rematch_second = None 429 run_successfully_rematch_second = None
334 time_taken_rematch_first = 0 430 time_taken_rematch_first = 0
335 time_taken_rematch_second = 0 431 time_taken_rematch_second = 0
432 sample_data_general_first = None
433 sample_data_general_second = None
336 if run_successfully_fastq is not False: 434 if run_successfully_fastq is not False:
337 fileSize = sum(os.path.getsize(fastq) for fastq in fastq_files) 435 file_size = sum(os.path.getsize(fastq) for fastq in fastq_files)
338 # Run ReMatCh 436 # Run ReMatCh
339 time_taken_rematch_first, run_successfully_rematch_first, data_by_gene, sample_data_general_first, consensus_files, consensus_sequences = rematch_module.runRematchModule(sample, fastq_files, reference_file, args.threads, sample_outdir, args.extraSeq, args.minCovPresence, args.minCovCall, args.minFrequencyDominantAllele, args.minGeneCoverage, args.conservedSeq, args.debug, args.numMapLoc, args.minGeneIdentity, 'first', args.softClip_baseQuality, args.softClip_recodeRun, reference_dict, args.softClip_cigarFlagRecode, args.bowtieOPT, gene_list_reference, args.notWriteConsensus) 437 time_taken_rematch_first, run_successfully_rematch_first, data_by_gene, sample_data_general_first, \
438 consensus_files, consensus_sequences = \
439 rematch_module.run_rematch_module(sample, fastq_files, reference_file, args.threads, sample_outdir,
440 args.extraSeq, args.minCovPresence, args.minCovCall,
441 args.minFrequencyDominantAllele, args.minGeneCoverage,
442 args.debug, args.numMapLoc, args.minGeneIdentity,
443 'first', args.softClip_baseQuality, args.softClip_recodeRun,
444 reference_dict, args.softClip_cigarFlagRecode,
445 args.bowtieAlgo, args.bowtieOPT,
446 gene_list_reference, args.notWriteConsensus, clean_run=True)
340 if run_successfully_rematch_first: 447 if run_successfully_rematch_first:
341 if args.mlst is not None and (args.mlstRun == 'first' or args.mlstRun == 'all'): 448 if args.mlst is not None and (args.mlstRun == 'first' or args.mlstRun == 'all'):
342 run_get_st(sample, mlst_dicts, consensus_sequences, args.mlstConsensus, 'first', workdir, time_str) 449 run_get_st(sample, mlst_dicts, consensus_sequences, args.mlstConsensus, 'first', workdir, time_str)
343 write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene, workdir, time_str, 'first_run', args.minGeneIdentity, 'coverage_depth') 450 genes_present_coverage_depth = write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample,
451 data_by_gene, workdir, time_str, 'first_run',
452 args.minGeneIdentity, 'coverage_depth', args.summary,
453 genes_present_coverage_depth)
344 if args.reportSequenceCoverage: 454 if args.reportSequenceCoverage:
345 write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene, workdir, time_str, 'first_run', args.minGeneIdentity, 'sequence_coverage') 455 genes_present_sequence_coverage = write_data_by_gene(gene_list_reference, args.minGeneCoverage,
456 sample, data_by_gene, workdir, time_str,
457 'first_run', args.minGeneIdentity,
458 'sequence_coverage', args.summary,
459 genes_present_sequence_coverage)
346 if args.doubleRun: 460 if args.doubleRun:
347 rematch_second_outdir = os.path.join(sample_outdir, 'rematch_second_run', '') 461 rematch_second_outdir = os.path.join(sample_outdir, 'rematch_second_run', '')
348 if not os.path.isdir(rematch_second_outdir): 462 if not os.path.isdir(rematch_second_outdir):
349 os.mkdir(rematch_second_outdir) 463 os.mkdir(rematch_second_outdir)
350 consensus_concatenated_fasta, consensus_concatenated_gene_list, consensus_concatenated_dict = concatenate_extraSeq_2_consensus(consensus_files['noMatter'], reference_file, args.extraSeq, rematch_second_outdir) 464 consensus_concatenated_fasta, consensus_concatenated_gene_list, consensus_concatenated_dict, \
465 number_consensus_with_sequences = \
466 concatenate_extra_seq_2_consensus(consensus_files['noMatter'], reference_file, args.extraSeq,
467 rematch_second_outdir)
351 if len(consensus_concatenated_gene_list) > 0: 468 if len(consensus_concatenated_gene_list) > 0:
352 time_taken_rematch_second, run_successfully_rematch_second, data_by_gene, sample_data_general_second, consensus_files, consensus_sequences = rematch_module.runRematchModule(sample, fastq_files, consensus_concatenated_fasta, args.threads, rematch_second_outdir, args.extraSeq, args.minCovPresence, args.minCovCall, args.minFrequencyDominantAllele, args.minGeneCoverage, args.conservedSeq, args.debug, args.numMapLoc, args.minGeneIdentity, 'second', args.softClip_baseQuality, args.softClip_recodeRun, consensus_concatenated_dict, args.softClip_cigarFlagRecode, args.bowtieOPT, gene_list_reference, args.notWriteConsensus) 469 if args.mlst is None or \
353 if not args.debug: 470 (args.mlst is not None and number_consensus_with_sequences == len(gene_list_reference)):
354 os.remove(consensus_concatenated_fasta) 471 time_taken_rematch_second, run_successfully_rematch_second, data_by_gene, \
355 if run_successfully_rematch_second: 472 sample_data_general_second, consensus_files, consensus_sequences = \
356 if args.mlst is not None and (args.mlstRun == 'second' or args.mlstRun == 'all'): 473 rematch_module.run_rematch_module(sample, fastq_files, consensus_concatenated_fasta,
357 run_get_st(sample, mlst_dicts, consensus_sequences, args.mlstConsensus, 'second', workdir, time_str) 474 args.threads, rematch_second_outdir, args.extraSeq,
358 write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene, workdir, time_str, 'second_run', args.minGeneIdentity, 'coverage_depth') 475 args.minCovPresence, args.minCovCall,
359 if args.reportSequenceCoverage: 476 args.minFrequencyDominantAllele, args.minGeneCoverage,
360 write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene, workdir, time_str, 'second_run', args.minGeneIdentity, 'sequence_coverage') 477 args.debug, args.numMapLoc,
478 args.minGeneIdentity, 'second',
479 args.softClip_baseQuality, args.softClip_recodeRun,
480 consensus_concatenated_dict,
481 args.softClip_cigarFlagRecode,
482 args.bowtieAlgo, args.bowtieOPT,
483 gene_list_reference, args.notWriteConsensus,
484 clean_run=True)
485 if not args.debug:
486 os.remove(consensus_concatenated_fasta)
487 if run_successfully_rematch_second:
488 if args.mlst is not None and (args.mlstRun == 'second' or args.mlstRun == 'all'):
489 run_get_st(sample, mlst_dicts, consensus_sequences, args.mlstConsensus, 'second',
490 workdir, time_str)
491 _ = write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample, data_by_gene,
492 workdir, time_str, 'second_run', args.minGeneIdentity,
493 'coverage_depth', False, {})
494 if args.reportSequenceCoverage:
495 _ = write_data_by_gene(gene_list_reference, args.minGeneCoverage, sample,
496 data_by_gene, workdir, time_str, 'second_run',
497 args.minGeneIdentity, 'sequence_coverage', False, {})
498 else:
499 print('Some sequences missing after ReMatCh module first run. Second run will not be'
500 ' performed')
501 if os.path.isfile(consensus_concatenated_fasta):
502 os.remove(consensus_concatenated_fasta)
503 if os.path.isdir(rematch_second_outdir):
504 utils.remove_directory(rematch_second_outdir)
361 else: 505 else:
362 print 'No sequences left after ReMatCh module first run. Second run will not be performed' 506 print('No sequences left after ReMatCh module first run. Second run will not be performed')
363 if os.path.isfile(consensus_concatenated_fasta): 507 if os.path.isfile(consensus_concatenated_fasta):
364 os.remove(consensus_concatenated_fasta) 508 os.remove(consensus_concatenated_fasta)
365 if os.path.isdir(rematch_second_outdir): 509 if os.path.isdir(rematch_second_outdir):
366 utils.removeDirectory(rematch_second_outdir) 510 utils.remove_directory(rematch_second_outdir)
367 511
368 if not searched_fastq_files and not args.keepDownloadedFastq and fastq_files is not None: 512 if not searched_fastq_files and not args.keepDownloadedFastq and fastq_files is not None:
369 for fastq in fastq_files: 513 for fastq in fastq_files:
370 if os.path.isfile(fastq): 514 if os.path.isfile(fastq):
371 os.remove(fastq) 515 os.remove(fastq)
372 516
373 time_taken = utils.runTime(sample_start_time) 517 time_taken = utils.run_time(sample_start_time)
374 518
375 write_sample_report(sample, workdir, time_str, fileSize, run_successfully_fastq, run_successfully_rematch_first, run_successfully_rematch_second, time_taken_fastq, time_taken_rematch_first, time_taken_rematch_second, time_taken, sequencingInformation, sample_data_general_first if run_successfully_rematch_first else {'number_absent_genes': None, 'number_genes_multiple_alleles': None, 'mean_sample_coverage': None}, sample_data_general_second if run_successfully_rematch_second else {'number_absent_genes': None, 'number_genes_multiple_alleles': None, 'mean_sample_coverage': None}, fastq_files if fastq_files is not None else '') 519 write_sample_report(sample, workdir, time_str, file_size, run_successfully_fastq,
376 520 run_successfully_rematch_first, run_successfully_rematch_second, time_taken_fastq,
377 if all([run_successfully_fastq is not False, run_successfully_rematch_first is not False, run_successfully_rematch_second is not False]): 521 time_taken_rematch_first, time_taken_rematch_second, time_taken, sequencing_information,
522 sample_data_general_first if run_successfully_rematch_first else
523 {'number_absent_genes': None, 'number_genes_multiple_alleles': None,
524 'mean_sample_coverage': None},
525 sample_data_general_second if run_successfully_rematch_second else
526 {'number_absent_genes': None, 'number_genes_multiple_alleles': None,
527 'mean_sample_coverage': None},
528 fastq_files if fastq_files is not None else '')
529
530 if all([run_successfully_fastq is not False,
531 run_successfully_rematch_first is not False,
532 run_successfully_rematch_second is not False]):
378 number_samples_successfully += 1 533 number_samples_successfully += 1
379 534
380 return number_samples_successfully, len(listIDs) 535 if args.summary:
536 write_summary_report(workdir, 'coverage_depth', time_str, gene_list_reference, genes_present_coverage_depth)
537 if args.reportSequenceCoverage:
538 write_summary_report(workdir, 'sequence_coverage', time_str, gene_list_reference,
539 genes_present_sequence_coverage)
540
541 return number_samples_successfully, len(list_ids)
381 542
382 543
383 def main(): 544 def main():
384 parser = argparse.ArgumentParser(prog='rematch.py', description='Reads mapping against target sequences, checking mapping and consensus sequences production', formatter_class=argparse.ArgumentDefaultsHelpFormatter) 545 if sys.version_info[0] < 3:
385 parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version)) 546 sys.exit('Must be using Python 3. Try calling "python3 rematch.py"')
547
548 parser = argparse.ArgumentParser(prog='rematch.py',
549 description='Reads mapping against target sequences, checking mapping and'
550 ' consensus sequences production',
551 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
552 parser.add_argument('--version', help='Version information', action='version',
553 version='{prog} v{version}'.format(prog=parser.prog, version=__version__))
386 554
387 parser_optional_general = parser.add_argument_group('General facultative options') 555 parser_optional_general = parser.add_argument_group('General facultative options')
388 parser_optional_general.add_argument('-r', '--reference', type=argparse.FileType('r'), metavar='/path/to/reference_sequence.fasta', help='Fasta file containing reference sequences', required=False) 556 parser_optional_general.add_argument('-r', '--reference', type=argparse.FileType('r'),
389 parser_optional_general.add_argument('-w', '--workdir', type=str, metavar='/path/to/workdir/directory/', help='Path to the directory where ReMatCh will run and produce the outputs with reads (ended with fastq.gz/fq.gz and, in case of PE data, pair-end direction coded as _R1_001 / _R2_001 or _1 / _2) already present (organized in sample folders) or to be downloaded', required=False, default='.') 557 metavar='/path/to/reference_sequence.fasta',
390 parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use', required=False, default=1) 558 help='Fasta file containing reference sequences', required=False)
391 parser_optional_general.add_argument('--mlst', type=str, metavar='"Streptococcus agalactiae"', help='Species name (same as in PubMLST) to be used in MLST determination. ReMatCh will use Bowtie2 very-sensitive-local mapping parameters and will recode the soft clip CIGAR flags of the first run', required=False) 559 parser_optional_general.add_argument('-w', '--workdir', type=str, metavar='/path/to/workdir/directory/',
392 parser_optional_general.add_argument('--doNotUseProvidedSoftware', action='store_true', help='Tells ReMatCh to not use Bowtie2, Samtools and Bcftools that are provided with it') 560 help='Path to the directory where ReMatCh will run and produce the outputs'
561 ' with reads (ended with fastq.gz/fq.gz and, in case of PE data, pair-end'
562 ' direction coded as _R1_001 / _R2_001 or _1 / _2) already'
563 ' present (organized in sample folders) or to be downloaded',
564 required=False, default='.')
565 parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use',
566 required=False, default=1)
567 parser_optional_general.add_argument('--mlst', type=str, metavar='"Streptococcus agalactiae"',
568 help='Species name (same as in PubMLST) to be used in MLST'
569 ' determination. ReMatCh will use Bowtie2 very-sensitive-local mapping'
570 ' parameters and will recode the soft clip CIGAR flags of the first run',
571 required=False)
572 parser_optional_general.add_argument('--doNotUseProvidedSoftware', action='store_true',
573 help='Tells ReMatCh to not use Bowtie2, Samtools and Bcftools that are'
574 ' provided with it')
575
576 parser_optional_download_exclusive = parser.add_mutually_exclusive_group()
577 parser_optional_download_exclusive.add_argument('-l', '--listIDs', type=argparse.FileType('r'),
578 metavar='/path/to/list_IDs.txt',
579 help='Path to list containing the IDs to be'
580 ' downloaded (one per line)', required=False)
581 parser_optional_download_exclusive.add_argument('-t', '--taxon', type=str, metavar='"Streptococcus agalactiae"',
582 help='Taxon name for which ReMatCh will download fastq files',
583 required=False)
393 584
394 parser_optional_rematch = parser.add_argument_group('ReMatCh module facultative options') 585 parser_optional_rematch = parser.add_argument_group('ReMatCh module facultative options')
395 parser_optional_rematch.add_argument('--conservedSeq', action='store_true', help=argparse.SUPPRESS) 586 parser_optional_rematch.add_argument('--extraSeq', type=int, metavar='N',
396 # parser_optional_rematch.add_argument('--conservedSeq', action='store_true', help='This option can be used with conserved sequences like MLST genes to speedup the analysis by alignning reads using Bowtie2 sensitive algorithm') 587 help='Sequence length added to both ends of target sequences (usefull to'
397 parser_optional_rematch.add_argument('--extraSeq', type=int, metavar='N', help='Sequence length added to both ends of target sequences (usefull to improve reads mapping to the target one) that will be trimmed in ReMatCh outputs', required=False, default=0) 588 ' improve reads mapping to the target one) that will be trimmed in'
398 parser_optional_rematch.add_argument('--minCovPresence', type=int, metavar='N', help='Reference position minimum coverage depth to consider the position to be present in the sample', required=False, default=5) 589 ' ReMatCh outputs', required=False, default=0)
399 parser_optional_rematch.add_argument('--minCovCall', type=int, metavar='N', help='Reference position minimum coverage depth to perform a base call. Lower coverage will be coded as N', required=False, default=10) 590 parser_optional_rematch.add_argument('--minCovPresence', type=int, metavar='N',
400 parser_optional_rematch.add_argument('--minFrequencyDominantAllele', type=float, metavar='0.6', help='Minimum relative frequency of the dominant allele coverage depth (value between [0, 1]). Positions with lower values will be considered as having multiple alleles (and will be coded as N)', required=False, default=0.6) 591 help='Reference position minimum coverage depth to consider the position to be'
401 parser_optional_rematch.add_argument('--minGeneCoverage', type=int, metavar='N', help='Minimum percentage of target reference gene sequence covered by --minCovPresence to consider a gene to be present (value between [0, 100])', required=False, default=70) 592 ' present in the sample', required=False, default=5)
402 parser_optional_rematch.add_argument('--minGeneIdentity', type=int, metavar='N', help='Minimum percentage of identity of reference gene sequence covered by --minCovCall to consider a gene to be present (value between [0, 100]). One INDEL will be considered as one difference', required=False, default=80) 593 parser_optional_rematch.add_argument('--minCovCall', type=int, metavar='N',
403 parser_optional_rematch.add_argument('--numMapLoc', type=int, metavar='N', help=argparse.SUPPRESS, required=False, default=1) 594 help='Reference position minimum coverage depth to perform a base call. Lower'
595 ' coverage will be coded as N', required=False, default=10)
596 parser_optional_rematch.add_argument('--minFrequencyDominantAllele', type=float, metavar='0.6',
597 help='Minimum relative frequency of the dominant allele coverage depth (value'
598 ' between [0, 1]). Positions with lower values will be considered as'
599 ' having multiple alleles (and will be coded as N)', required=False,
600 default=0.6)
601 parser_optional_rematch.add_argument('--minGeneCoverage', type=int, metavar='N',
602 help='Minimum percentage of target reference gene sequence covered'
603 ' by --minCovPresence to consider a gene to be present (value'
604 ' between [0, 100])', required=False, default=70)
605 parser_optional_rematch.add_argument('--minGeneIdentity', type=int, metavar='N',
606 help='Minimum percentage of identity of reference gene sequence covered'
607 ' by --minCovCall to consider a gene to be present (value'
608 ' between [0, 100]). One INDEL will be considered as one difference',
609 required=False, default=80)
610 parser_optional_rematch.add_argument('--numMapLoc', type=int, metavar='N', help=argparse.SUPPRESS, required=False,
611 default=1)
404 # parser_optional_rematch.add_argument('--numMapLoc', type=int, metavar='N', help='Maximum number of locations to which a read can map (sometimes useful when mapping against similar sequences)', required=False, default=1) 612 # parser_optional_rematch.add_argument('--numMapLoc', type=int, metavar='N', help='Maximum number of locations to which a read can map (sometimes useful when mapping against similar sequences)', required=False, default=1)
405 parser_optional_rematch.add_argument('--doubleRun', action='store_true', help='Tells ReMatCh to run a second time using as reference the noMatter consensus sequence produced in the first run. This will improve consensus sequence determination for sequences with high percentage of target reference gene sequence covered') 613 parser_optional_rematch.add_argument('--doubleRun', action='store_true',
406 parser_optional_rematch.add_argument('--reportSequenceCoverage', action='store_true', help='Produce an extra combined_report.data_by_gene with the sequence coverage instead of coverage depth') 614 help='Tells ReMatCh to run a second time using as reference the noMatter'
407 parser_optional_rematch.add_argument('--notWriteConsensus', action='store_true', help='Do not write consensus sequences') 615 ' consensus sequence produced in the first run. This will improve'
408 parser_optional_rematch.add_argument('--bowtieOPT', type=str, metavar='"--no-mixed"', help='Extra Bowtie2 options', required=False) 616 ' consensus sequence determination for sequences with high percentage of'
409 parser_optional_rematch.add_argument('--debug', action='store_true', help='DeBug Mode: do not remove temporary files') 617 ' target reference gene sequence covered')
618 parser_optional_rematch.add_argument('--reportSequenceCoverage', action='store_true',
619 help='Produce an extra combined_report.data_by_gene with the sequence coverage'
620 ' instead of coverage depth')
621 parser_optional_rematch.add_argument('--summary', action='store_true',
622 help='Produce extra report files containing only sequences present in at least'
623 ' one sample (usefull when using a large number of reference'
624 ' sequences, and only for first run)')
625 parser_optional_rematch.add_argument('--notWriteConsensus', action='store_true',
626 help='Do not write consensus sequences')
627 parser_optional_rematch.add_argument('--bowtieAlgo', type=str, metavar='"--very-sensitive-local"',
628 help='Bowtie2 alignment mode. It can be an end-to-end alignment (unclipped'
629 ' alignment) or local alignment (soft clipped alignment). Also, can'
630 ' choose between fast or sensitive alignments. Please check Bowtie2'
631 ' manual for extra'
632 ' information: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml .'
633 ' This option should be provided between quotes and starting with'
634 ' an empty space (like --bowtieAlgo " --very-fast") or using equal'
635 ' sign (like --bowtieAlgo="--very-fast")',
636 required=False, default='--very-sensitive-local')
637 parser_optional_rematch.add_argument('--bowtieOPT', type=str, metavar='"--no-mixed"',
638 help='Extra Bowtie2 options. This option should be provided between quotes and'
639 ' starting with an empty space (like --bowtieOPT " --no-mixed") or using'
640 ' equal sign (like --bowtieOPT="--no-mixed")',
641 required=False)
642 parser_optional_rematch.add_argument('--debug', action='store_true',
643 help='DeBug Mode: do not remove temporary files')
410 644
411 parser_optional_mlst = parser.add_argument_group('MLST facultative options') 645 parser_optional_mlst = parser.add_argument_group('MLST facultative options')
412 parser_optional_rematch.add_argument('--mlstReference', action='store_true', help='If the curated scheme for MLST alleles is available, tells ReMatCh to use these as reference (force Bowtie2 to run with very-sensitive-local parameters, and sets --extraSeq to 200), otherwise ReMatCh uses the first alleles of each MLST gene fragment in PubMLST as reference sequences (force Bowtie2 to run with very-sensitive-local parameters, and sets --extraSeq to 0)') 646 parser_optional_rematch.add_argument('--mlstReference', action='store_true',
413 parser_optional_mlst.add_argument('--mlstSchemaNumber', type=int, metavar='N', help='Number of the species PubMLST schema to be used in case of multiple schemes available (by default will use the first schema)', required=False) 647 help='If the curated scheme for MLST alleles is available, tells ReMatCh to'
414 parser_optional_mlst.add_argument('--mlstConsensus', choices=['noMatter', 'correct', 'alignment', 'all'], type=str, metavar='noMatter', help='Consensus sequence to be used in MLST determination (available options: %(choices)s)', required=False, default='noMatter') 648 ' use these as reference (force Bowtie2 to run with very-sensitive-local'
415 parser_optional_mlst.add_argument('--mlstRun', choices=['first', 'second', 'all'], type=str, metavar='first', help='ReMatCh run outputs to be used in MLST determination (available options: %(choices)s)', required=False, default='all') 649 ' parameters, and sets --extraSeq to 200), otherwise ReMatCh uses the'
650 ' first alleles of each MLST gene fragment in PubMLST as reference'
651 ' sequences (force Bowtie2 to run with very-sensitive-local parameters,'
652 ' and sets --extraSeq to 0)')
653 parser_optional_mlst.add_argument('--mlstSchemaNumber', type=int, metavar='N',
654 help='Number of the species PubMLST schema to be used in case of multiple schemes'
655 ' available (by default will use the first schema)', required=False)
656 parser_optional_mlst.add_argument('--mlstConsensus', choices=['noMatter', 'correct', 'alignment', 'all'], type=str,
657 metavar='noMatter',
658 help='Consensus sequence to be used in MLST'
659 ' determination (available options: %(choices)s)', required=False,
660 default='noMatter')
661 parser_optional_mlst.add_argument('--mlstRun', choices=['first', 'second', 'all'], type=str, metavar='first',
662 help='ReMatCh run outputs to be used in MLST determination (available'
663 ' options: %(choices)s)', required=False, default='all')
416 664
417 parser_optional_download = parser.add_argument_group('Download facultative options') 665 parser_optional_download = parser.add_argument_group('Download facultative options')
418 parser_optional_download.add_argument('-a', '--asperaKey', type=argparse.FileType('r'), metavar='/path/to/asperaweb_id_dsa.openssh', help='Tells ReMatCh to download fastq files from ENA using Aspera Connect. With this option, the path to Private-key file asperaweb_id_dsa.openssh must be provided (normaly found in ~/.aspera/connect/etc/asperaweb_id_dsa.openssh).', required=False) 666 parser_optional_download.add_argument('-a', '--asperaKey', type=argparse.FileType('r'),
419 parser_optional_download.add_argument('-k', '--keepDownloadedFastq', action='store_true', help='Tells ReMatCh to keep the fastq files downloaded') 667 metavar='/path/to/asperaweb_id_dsa.openssh',
420 parser_optional_download.add_argument('--downloadLibrariesType', type=str, metavar='PAIRED', help='Tells ReMatCh to download files with specific library layout (available options: %(choices)s)', choices=['PAIRED', 'SINGLE', 'BOTH'], required=False, default='BOTH') 668 help='Tells ReMatCh to download fastq files from ENA using Aspera'
421 parser_optional_download.add_argument('--downloadInstrumentPlatform', type=str, metavar='ILLUMINA', help='Tells ReMatCh to download files with specific library layout (available options: %(choices)s)', choices=['ILLUMINA', 'ALL'], required=False, default='ILLUMINA') 669 ' Connect. With this option, the path to Private-key file'
422 parser_optional_download.add_argument('--downloadCramBam', action='store_true', help='Tells ReMatCh to also download cram/bam files and convert them to fastq files') 670 ' asperaweb_id_dsa.openssh must be provided (normaly found in'
423 671 ' ~/.aspera/connect/etc/asperaweb_id_dsa.openssh).', required=False)
424 parser_optional_softClip = parser.add_argument_group('Soft clip facultative options') 672 parser_optional_download.add_argument('-k', '--keepDownloadedFastq', action='store_true',
425 parser_optional_softClip.add_argument('--softClip_baseQuality', type=int, metavar='N', help='Base quality phred score in reads soft clipped regions', required=False, default=7) 673 help='Tells ReMatCh to keep the fastq files downloaded')
426 parser_optional_download.add_argument('--softClip_recodeRun', type=str, metavar='first', help='ReMatCh run to recode soft clipped regions (available options: %(choices)s)', choices=['first', 'second', 'both', 'none'], required=False, default='none') 674 parser_optional_download.add_argument('--downloadLibrariesType', type=str, metavar='PAIRED',
427 parser_optional_download.add_argument('--softClip_cigarFlagRecode', type=str, metavar='M', help='CIGAR flag to recode CIGAR soft clip (available options: %(choices)s)', choices=['M', 'I', 'X'], required=False, default='X') 675 help='Tells ReMatCh to download files with specific library'
428 676 ' layout (available options: %(choices)s)',
429 parser_optional_download_exclusive = parser.add_mutually_exclusive_group() 677 choices=['PAIRED', 'SINGLE', 'BOTH'], required=False, default='BOTH')
430 parser_optional_download_exclusive.add_argument('-l', '--listIDs', type=argparse.FileType('r'), metavar='/path/to/list_IDs.txt', help='Path to list containing the IDs to be downloaded (one per line)', required=False) 678 parser_optional_download.add_argument('--downloadInstrumentPlatform', type=str, metavar='ILLUMINA',
431 parser_optional_download_exclusive.add_argument('-t', '--taxon', type=str, metavar='"Streptococcus agalactiae"', help='Taxon name for which ReMatCh will download fastq files', required=False) 679 help='Tells ReMatCh to download files with specific library layout (available'
680 ' options: %(choices)s)', choices=['ILLUMINA', 'ALL'], required=False,
681 default='ILLUMINA')
682 parser_optional_download.add_argument('--downloadCramBam', action='store_true',
683 help='Tells ReMatCh to also download cram/bam files and convert them to fastq'
684 ' files')
685
686 parser_optional_sra = parser.add_mutually_exclusive_group()
687 parser_optional_sra.add_argument('--SRA', action='store_true',
688 help='Tells getSeqENA.py to download reads in fastq format only from NCBI SRA'
689 ' database (not recommended)')
690 parser_optional_sra.add_argument('--SRAopt', action='store_true',
691 help='Tells getSeqENA.py to download reads from NCBI SRA if the download from ENA'
692 ' fails')
693
694 parser_optional_soft_clip = parser.add_argument_group('Soft clip facultative options')
695 parser_optional_soft_clip.add_argument('--softClip_baseQuality', type=int, metavar='N',
696 help='Base quality phred score in reads soft clipped regions',
697 required=False,
698 default=7)
699 parser_optional_soft_clip.add_argument('--softClip_recodeRun', type=str, metavar='first',
700 help='ReMatCh run to recode soft clipped regions (available'
701 ' options: %(choices)s)', choices=['first', 'second', 'both', 'none'],
702 required=False, default='none')
703 parser_optional_soft_clip.add_argument('--softClip_cigarFlagRecode', type=str, metavar='M',
704 help='CIGAR flag to recode CIGAR soft clip (available options: %(choices)s)',
705 choices=['M', 'I', 'X'], required=False, default='X')
432 706
433 args = parser.parse_args() 707 args = parser.parse_args()
434 708
709 msg = []
435 if args.reference is None and not args.mlstReference: 710 if args.reference is None and not args.mlstReference:
436 parser.error('At least --reference or --mlstReference should be provided') 711 msg.append('At least --reference or --mlstReference should be provided')
437 elif args.reference is not None and args.mlstReference: 712 elif args.reference is not None and args.mlstReference:
438 parser.error('Only --reference or --mlstReference should be provided') 713 msg.append('Only --reference or --mlstReference should be provided')
439 else: 714 else:
440 if args.mlstReference: 715 if args.mlstReference:
441 if args.mlst is None: 716 if args.mlst is None:
442 parser.error('Please provide species name using --mlst') 717 msg.append('Please provide species name using --mlst')
443
444 if args.minFrequencyDominantAllele < 0 or args.minFrequencyDominantAllele > 1: 718 if args.minFrequencyDominantAllele < 0 or args.minFrequencyDominantAllele > 1:
445 parser.error('--minFrequencyDominantAllele should be a value between [0, 1]') 719 msg.append('--minFrequencyDominantAllele should be a value between [0, 1]')
446
447 if args.minGeneCoverage < 0 or args.minGeneCoverage > 100: 720 if args.minGeneCoverage < 0 or args.minGeneCoverage > 100:
448 parser.error('--minGeneCoverage should be a value between [0, 100]') 721 msg.append('--minGeneCoverage should be a value between [0, 100]')
449 if args.minGeneIdentity < 0 or args.minGeneIdentity > 100: 722 if args.minGeneIdentity < 0 or args.minGeneIdentity > 100:
450 parser.error('--minGeneIdentity should be a value between [0, 100]') 723 msg.append('--minGeneIdentity should be a value between [0, 100]')
724 if args.notWriteConsensus and args.doubleRun:
725 msg.append('--notWriteConsensus and --doubleRun cannot be used together.'
726 ' Maybe you only want to use --doubleRun')
727
728 if len(msg) > 0:
729 argparse.ArgumentParser.error('\n'.join(msg))
451 730
452 start_time = time.time() 731 start_time = time.time()
453 732
454 number_samples_successfully, samples_total_number = runRematch(args) 733 number_samples_successfully, samples_total_number = run_rematch(args)
455 734
456 print '\n' + 'END ReMatCh' 735 print('\n' + 'END ReMatCh')
457 print '\n' + str(number_samples_successfully) + ' samples out of ' + str(samples_total_number) + ' run successfully' 736 print('\n' +
458 time_taken = utils.runTime(start_time) 737 str(number_samples_successfully) + ' samples out of ' + str(samples_total_number) + ' run successfully')
738 time_taken = utils.run_time(start_time)
459 del time_taken 739 del time_taken
460 740
461 if number_samples_successfully == 0: 741 if number_samples_successfully == 0:
462 sys.exit('No samples run successfully!') 742 sys.exit('No samples run successfully!')
463 743