6
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 Predictive serotyping for _E. coli_.
|
|
4 """
|
|
5 import logging
|
|
6 import os
|
|
7 import sys
|
|
8 import tempfile
|
|
9 import datetime
|
|
10 from urllib.request import urlretrieve
|
|
11
|
|
12 from ectyper import (blastFunctions, commandLineOptions, definitions,
|
|
13 genomeFunctions, loggingFunctions, predictionFunctions,
|
|
14 speciesIdentification)
|
|
15
|
|
16 LOG_FILE = loggingFunctions.initialize_logging()
|
|
17 LOG = logging.getLogger(__name__)
|
|
18
|
|
19
|
|
20 def run_program():
|
|
21 """
|
|
22 Wrapper for both the serotyping and virulence finder
|
|
23 The program needs to do the following
|
|
24 (1) Filter genome files based on format
|
|
25 (2) Filter genome files based on species
|
|
26 (3) Map FASTQ files to FASTA seq
|
|
27 (1) Get names of all genomes being tested
|
|
28 (2) Create a BLAST database of those genomes
|
|
29 (3) Query for serotype
|
|
30 (4) Parse the results
|
|
31 (5) Display the results
|
|
32 :return: success or failure
|
|
33
|
|
34 """
|
|
35 # Initialize the program
|
|
36 args = commandLineOptions.parse_command_line()
|
|
37 LOG.info('Starting ectyper\nSerotype prediction with input:\n \
|
|
38 {0}\n \
|
|
39 Log file is: {1}'.format(args, LOG_FILE))
|
|
40 LOG.debug(args)
|
|
41
|
|
42 ## Initialize temporary directories for the scope of this program
|
|
43 with tempfile.TemporaryDirectory() as temp_dir:
|
|
44 temp_files = create_tmp_files(temp_dir, output_dir=args.output)
|
|
45 LOG.debug(temp_files)
|
|
46
|
|
47 # Download refseq file if speicies identification is enabled
|
|
48 if args.species:
|
|
49 download_refseq()
|
|
50
|
|
51 LOG.info("Gathering genome files")
|
|
52 raw_genome_files = genomeFunctions.get_files_as_list(args.input)
|
|
53 LOG.debug(raw_genome_files)
|
|
54
|
|
55 LOG.info("Removing invalid file types")
|
|
56 raw_files_dict = get_raw_files(raw_genome_files)
|
|
57 LOG.debug(raw_files_dict)
|
|
58
|
|
59 # Assembling fastq + verify ecoli genome
|
|
60 LOG.info("Preparing genome files for blast alignment")
|
|
61 final_fasta_files = filter_for_ecoli_files(
|
|
62 raw_files_dict, temp_files, verify=args.verify, species=args.species
|
|
63 )
|
|
64 LOG.debug(final_fasta_files)
|
|
65 if len(final_fasta_files) is 0:
|
|
66 LOG.info("No valid genome files. Terminating the program.")
|
|
67 exit(0)
|
|
68
|
|
69 LOG.info("Standardizing the genome headers")
|
|
70 (all_genomes_list, all_genomes_files) = \
|
|
71 genomeFunctions.get_genome_names_from_files(
|
|
72 final_fasta_files, temp_files['fasta_temp_dir'])
|
|
73 LOG.debug(all_genomes_list)
|
|
74 LOG.debug(all_genomes_files)
|
|
75
|
|
76 # Main prediction function
|
|
77 predictions_file = run_prediction(all_genomes_files, args,
|
|
78 temp_files['output_file'])
|
|
79
|
|
80 # Add empty rows for genomes without blast result
|
|
81 predictions_file = predictionFunctions.add_non_predicted(
|
|
82 all_genomes_list, predictions_file)
|
|
83 LOG.info('Outputs stored in {0}'.format(temp_files['output_dir']))
|
|
84
|
|
85 # Store most recent result in working directory
|
|
86 LOG.info('\nReporting result...')
|
|
87 predictionFunctions.report_result(predictions_file)
|
|
88
|
|
89 def download_refseq():
|
|
90 '''Download refseq file with progress bar
|
|
91 '''
|
|
92
|
|
93
|
|
94 def reporthook(blocknum, blocksize, totalsize):
|
|
95 '''
|
|
96 https://stackoverflow.com/questions/15644964/python-progress-bar-and-downloads
|
|
97 '''
|
|
98 readsofar = blocknum * blocksize
|
|
99 if totalsize > 0:
|
|
100 s = "\r {:5.1%} {:{}d} / {:d}".format(
|
|
101 readsofar/totalsize, readsofar,
|
|
102 len(str(totalsize)),
|
|
103 totalsize
|
|
104 )
|
|
105 sys.stderr.write(s)
|
|
106 if readsofar >= totalsize: # near the end
|
|
107 sys.stderr.write("\n")
|
|
108 else: # total size is unknown
|
|
109 sys.stderr.write("read {}\n".format(readsofar))
|
|
110
|
|
111 if not os.path.isfile(definitions.REFSEQ_SKETCH):
|
|
112 refseq_url = 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh'
|
|
113 LOG.info("No refseq found. Downloading reference file for species identification...")
|
|
114 urlretrieve(refseq_url, definitions.REFSEQ_SKETCH, reporthook)
|
|
115 LOG.info("Download complete.")
|
|
116
|
|
117 def create_tmp_files(temp_dir, output_dir=None):
|
|
118 """Create a dictionary of temporary files used by ectyper
|
|
119
|
|
120 Args:
|
|
121 temp_dir: program scope temporary directory
|
|
122 output_dir(str, optional):
|
|
123 directory to store output
|
|
124
|
|
125 Return:
|
|
126 a dictionary of temporary files
|
|
127 example:
|
|
128 {'assemble_temp_dir': 'test/temp/assemblies',
|
|
129 'fasta_temp_dir': 'test/temp/fastas',
|
|
130 'output_dir': os.path.abspath('output')+'/',
|
|
131 'output_file': os.path.abspath('output/output.csv')}
|
|
132 """
|
|
133
|
|
134 # Get the correct files and directories
|
|
135 files_and_dirs = {
|
|
136 'assemble_temp_dir': os.path.join(temp_dir, 'assemblies'),
|
|
137 'fasta_temp_dir': os.path.join(temp_dir, 'fastas'),
|
|
138 }
|
|
139 if output_dir is None:
|
|
140 output_dir = ''.join([
|
|
141 str(datetime.datetime.now().date()),
|
|
142 '_',
|
|
143 str(datetime.datetime.now().time()).replace(':', '.')
|
|
144 ])
|
|
145 if os.path.isabs(output_dir):
|
|
146 pass
|
|
147 else:
|
|
148 output_dir = os.path.join(definitions.WORKPLACE_DIR, 'output', output_dir)
|
|
149
|
|
150 output_file = os.path.join(output_dir, 'output.csv')
|
|
151 if os.path.isfile(output_file):
|
|
152 os.remove(output_file)
|
|
153 for d in [
|
|
154 output_dir, files_and_dirs['assemble_temp_dir'],
|
|
155 files_and_dirs['fasta_temp_dir']
|
|
156 ]:
|
|
157 if not os.path.exists(d):
|
|
158 os.makedirs(d)
|
|
159
|
|
160 # Finalize the tmp_files dictionary
|
|
161 files_and_dirs['output_file'] = output_file
|
|
162 files_and_dirs['output_dir'] = output_dir
|
|
163
|
|
164 LOG.info("Temporary files and directory created")
|
|
165 return files_and_dirs
|
|
166
|
|
167
|
|
168 def run_prediction(genome_files, args, predictions_file):
|
|
169 '''Core prediction functionality
|
|
170
|
|
171 Args:
|
|
172 genome_files:
|
|
173 list of genome files
|
|
174 args:
|
|
175 commandline arguments
|
|
176 predictions_file:
|
|
177 filename of prediction output
|
|
178
|
|
179 Returns:
|
|
180 predictions_file with prediction written in it
|
|
181 '''
|
|
182 query_file = definitions.SEROTYPE_FILE
|
|
183 ectyper_dict_file = definitions.SEROTYPE_ALLELE_JSON
|
|
184 # create a temp dir for blastdb
|
|
185 with tempfile.TemporaryDirectory() as temp_dir:
|
|
186 # Divide genome files into chunks
|
|
187 chunk_size = 50
|
|
188 genome_chunks = [
|
|
189 genome_files[i:i + chunk_size]
|
|
190 for i in range(0, len(genome_files), chunk_size)
|
|
191 ]
|
|
192 for index, chunk in enumerate(genome_chunks):
|
|
193 LOG.info("Start creating blast database #{0}".format(index + 1))
|
|
194 blast_db = blastFunctions.create_blast_db(chunk, temp_dir)
|
|
195
|
|
196 LOG.info("Start blast alignment on database #{0}".format(index + 1))
|
|
197 blast_output_file = blastFunctions.run_blast(query_file, blast_db, args)
|
|
198 LOG.info("Start serotype prediction for database #{0}".format(index + 1))
|
|
199 predictions_file = predictionFunctions.predict_serotype(
|
|
200 blast_output_file, ectyper_dict_file, predictions_file,
|
|
201 args.detailed)
|
|
202 return predictions_file
|
|
203
|
|
204
|
|
205 def get_raw_files(raw_files):
|
|
206 """Take all the raw files, and filter not fasta / fastq
|
|
207
|
|
208 Args:
|
|
209 raw_files(str): list of files from user input
|
|
210
|
|
211 Returns:
|
|
212 A dictitionary collection of fasta and fastq files
|
|
213 example:
|
|
214 {'raw_fasta_files':[],
|
|
215 'raw_fastq_files':[]}
|
|
216 """
|
|
217 fasta_files = []
|
|
218 fastq_files = []
|
|
219
|
|
220 for file in raw_files:
|
|
221 file_format = genomeFunctions.get_valid_format(file)
|
|
222 if file_format == 'fasta':
|
|
223 fasta_files.append(file)
|
|
224 elif file_format == 'fastq':
|
|
225 fastq_files.append(file)
|
|
226
|
|
227 LOG.debug('raw fasta files: {}'.format(fasta_files))
|
|
228 LOG.debug('raw fastq files: {}'.format(fastq_files))
|
|
229
|
|
230 return({'fasta':fasta_files, 'fastq':fastq_files})
|
|
231
|
|
232
|
|
233 def filter_for_ecoli_files(raw_dict, temp_files, verify=False, species=False):
|
|
234 """filter ecoli, identify species, assemble fastq
|
|
235 Assemble fastq files to fasta files,
|
|
236 then filter all files by reference method if verify is enabled,
|
|
237 if identified as non-ecoli, identify species by mash method if species is enabled.
|
|
238
|
|
239 Args:
|
|
240 raw_dict{fasta:list_of_files, fastq:list_of_files}:
|
|
241 dictionary collection of fasta and fastq files
|
|
242 temp_file: temporary directory
|
|
243 verify(bool):
|
|
244 whether to perform ecoli verification
|
|
245 species(bool):
|
|
246 whether to perform species identification for non-ecoli genome
|
|
247 Returns:
|
|
248 List of filtered and assembled genome files in fasta format
|
|
249 """
|
|
250 final_files = []
|
|
251 for f in raw_dict.keys():
|
|
252 temp_dir = temp_files['fasta_temp_dir'] if f == "fasta" else temp_files['assemble_temp_dir']
|
|
253
|
|
254 for ffile in raw_dict[f]:
|
|
255 filtered_file = filter_file_by_species(
|
|
256 ffile, f, temp_dir, verify=verify, species=species)
|
|
257 if filtered_file is not None and \
|
|
258 genomeFunctions.get_valid_format(filtered_file) is not None:
|
|
259 final_files.append(filtered_file)
|
|
260
|
|
261 LOG.info('{} final fasta files'.format(len(final_files)))
|
|
262 return final_files
|
|
263
|
|
264 def filter_file_by_species(genome_file, genome_format, temp_dir, verify=False, species=False):
|
|
265 """filter ecoli, identify species, assemble fastq
|
|
266 Assemble fastq file to fasta file,
|
|
267 then filter the file by reference method if verify is enabled,
|
|
268 if identified as non-ecoli, identify species by mash method if species is enabled.
|
|
269
|
|
270 Args:
|
|
271 genome_file: input genome file
|
|
272 genome_format(str): fasta or fastq
|
|
273 temp_file: temporary directory
|
|
274 verify(bool):
|
|
275 whether to perform ecoli verification
|
|
276 species(bool):
|
|
277 whether to perform species identification for non-ecoli genome
|
|
278 Returns:
|
|
279 The filtered and assembled genome files in fasta format
|
|
280 """
|
|
281 combined_file = definitions.COMBINED
|
|
282 filtered_file = None
|
|
283 if genome_format == 'fastq':
|
|
284 iden_file, pred_file = \
|
|
285 genomeFunctions.assemble_reads(genome_file, combined_file, temp_dir)
|
|
286 # If no alignment resut, the file is definitely not E.Coli
|
|
287 if genomeFunctions.get_valid_format(iden_file) is None:
|
|
288 LOG.warning(
|
|
289 "{} is filtered out because no identification alignment found".format(genome_file))
|
|
290 return filtered_file
|
|
291 if not (verify or species) or speciesIdentification.is_ecoli_genome(
|
|
292 iden_file, genome_file, mash=species):
|
|
293 # final check before adding the alignment for prediction
|
|
294 if genomeFunctions.get_valid_format(iden_file) != 'fasta':
|
|
295 LOG.warning(
|
|
296 "{0} is filtered out because no prediction alignment found".format(genome_file))
|
|
297 return filtered_file
|
|
298 filtered_file = pred_file
|
|
299 if genome_format == 'fasta':
|
|
300 if not (verify or species) \
|
|
301 or speciesIdentification.is_ecoli_genome(genome_file, mash=species):
|
|
302 filtered_file = genome_file
|
|
303 return filtered_file
|