comparison ecoli_serotyping/ectyper.py @ 6:fe3ceb5c4214 draft

Uploaded
author jpetteng
date Fri, 05 Jan 2018 15:43:14 -0500
parents
children
comparison
equal deleted inserted replaced
5:a202cc394af8 6:fe3ceb5c4214
1 #!/usr/bin/env python
2 """
3 Predictive serotyping for _E. coli_.
4 """
5 import logging
6 import os
7 import sys
8 import tempfile
9 import datetime
10 from urllib.request import urlretrieve
11
12 from ectyper import (blastFunctions, commandLineOptions, definitions,
13 genomeFunctions, loggingFunctions, predictionFunctions,
14 speciesIdentification)
15
16 LOG_FILE = loggingFunctions.initialize_logging()
17 LOG = logging.getLogger(__name__)
18
19
20 def run_program():
21 """
22 Wrapper for both the serotyping and virulence finder
23 The program needs to do the following
24 (1) Filter genome files based on format
25 (2) Filter genome files based on species
26 (3) Map FASTQ files to FASTA seq
27 (1) Get names of all genomes being tested
28 (2) Create a BLAST database of those genomes
29 (3) Query for serotype
30 (4) Parse the results
31 (5) Display the results
32 :return: success or failure
33
34 """
35 # Initialize the program
36 args = commandLineOptions.parse_command_line()
37 LOG.info('Starting ectyper\nSerotype prediction with input:\n \
38 {0}\n \
39 Log file is: {1}'.format(args, LOG_FILE))
40 LOG.debug(args)
41
42 ## Initialize temporary directories for the scope of this program
43 with tempfile.TemporaryDirectory() as temp_dir:
44 temp_files = create_tmp_files(temp_dir, output_dir=args.output)
45 LOG.debug(temp_files)
46
47 # Download refseq file if speicies identification is enabled
48 if args.species:
49 download_refseq()
50
51 LOG.info("Gathering genome files")
52 raw_genome_files = genomeFunctions.get_files_as_list(args.input)
53 LOG.debug(raw_genome_files)
54
55 LOG.info("Removing invalid file types")
56 raw_files_dict = get_raw_files(raw_genome_files)
57 LOG.debug(raw_files_dict)
58
59 # Assembling fastq + verify ecoli genome
60 LOG.info("Preparing genome files for blast alignment")
61 final_fasta_files = filter_for_ecoli_files(
62 raw_files_dict, temp_files, verify=args.verify, species=args.species
63 )
64 LOG.debug(final_fasta_files)
65 if len(final_fasta_files) is 0:
66 LOG.info("No valid genome files. Terminating the program.")
67 exit(0)
68
69 LOG.info("Standardizing the genome headers")
70 (all_genomes_list, all_genomes_files) = \
71 genomeFunctions.get_genome_names_from_files(
72 final_fasta_files, temp_files['fasta_temp_dir'])
73 LOG.debug(all_genomes_list)
74 LOG.debug(all_genomes_files)
75
76 # Main prediction function
77 predictions_file = run_prediction(all_genomes_files, args,
78 temp_files['output_file'])
79
80 # Add empty rows for genomes without blast result
81 predictions_file = predictionFunctions.add_non_predicted(
82 all_genomes_list, predictions_file)
83 LOG.info('Outputs stored in {0}'.format(temp_files['output_dir']))
84
85 # Store most recent result in working directory
86 LOG.info('\nReporting result...')
87 predictionFunctions.report_result(predictions_file)
88
89 def download_refseq():
90 '''Download refseq file with progress bar
91 '''
92
93
94 def reporthook(blocknum, blocksize, totalsize):
95 '''
96 https://stackoverflow.com/questions/15644964/python-progress-bar-and-downloads
97 '''
98 readsofar = blocknum * blocksize
99 if totalsize > 0:
100 s = "\r {:5.1%} {:{}d} / {:d}".format(
101 readsofar/totalsize, readsofar,
102 len(str(totalsize)),
103 totalsize
104 )
105 sys.stderr.write(s)
106 if readsofar >= totalsize: # near the end
107 sys.stderr.write("\n")
108 else: # total size is unknown
109 sys.stderr.write("read {}\n".format(readsofar))
110
111 if not os.path.isfile(definitions.REFSEQ_SKETCH):
112 refseq_url = 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh'
113 LOG.info("No refseq found. Downloading reference file for species identification...")
114 urlretrieve(refseq_url, definitions.REFSEQ_SKETCH, reporthook)
115 LOG.info("Download complete.")
116
117 def create_tmp_files(temp_dir, output_dir=None):
118 """Create a dictionary of temporary files used by ectyper
119
120 Args:
121 temp_dir: program scope temporary directory
122 output_dir(str, optional):
123 directory to store output
124
125 Return:
126 a dictionary of temporary files
127 example:
128 {'assemble_temp_dir': 'test/temp/assemblies',
129 'fasta_temp_dir': 'test/temp/fastas',
130 'output_dir': os.path.abspath('output')+'/',
131 'output_file': os.path.abspath('output/output.csv')}
132 """
133
134 # Get the correct files and directories
135 files_and_dirs = {
136 'assemble_temp_dir': os.path.join(temp_dir, 'assemblies'),
137 'fasta_temp_dir': os.path.join(temp_dir, 'fastas'),
138 }
139 if output_dir is None:
140 output_dir = ''.join([
141 str(datetime.datetime.now().date()),
142 '_',
143 str(datetime.datetime.now().time()).replace(':', '.')
144 ])
145 if os.path.isabs(output_dir):
146 pass
147 else:
148 output_dir = os.path.join(definitions.WORKPLACE_DIR, 'output', output_dir)
149
150 output_file = os.path.join(output_dir, 'output.csv')
151 if os.path.isfile(output_file):
152 os.remove(output_file)
153 for d in [
154 output_dir, files_and_dirs['assemble_temp_dir'],
155 files_and_dirs['fasta_temp_dir']
156 ]:
157 if not os.path.exists(d):
158 os.makedirs(d)
159
160 # Finalize the tmp_files dictionary
161 files_and_dirs['output_file'] = output_file
162 files_and_dirs['output_dir'] = output_dir
163
164 LOG.info("Temporary files and directory created")
165 return files_and_dirs
166
167
168 def run_prediction(genome_files, args, predictions_file):
169 '''Core prediction functionality
170
171 Args:
172 genome_files:
173 list of genome files
174 args:
175 commandline arguments
176 predictions_file:
177 filename of prediction output
178
179 Returns:
180 predictions_file with prediction written in it
181 '''
182 query_file = definitions.SEROTYPE_FILE
183 ectyper_dict_file = definitions.SEROTYPE_ALLELE_JSON
184 # create a temp dir for blastdb
185 with tempfile.TemporaryDirectory() as temp_dir:
186 # Divide genome files into chunks
187 chunk_size = 50
188 genome_chunks = [
189 genome_files[i:i + chunk_size]
190 for i in range(0, len(genome_files), chunk_size)
191 ]
192 for index, chunk in enumerate(genome_chunks):
193 LOG.info("Start creating blast database #{0}".format(index + 1))
194 blast_db = blastFunctions.create_blast_db(chunk, temp_dir)
195
196 LOG.info("Start blast alignment on database #{0}".format(index + 1))
197 blast_output_file = blastFunctions.run_blast(query_file, blast_db, args)
198 LOG.info("Start serotype prediction for database #{0}".format(index + 1))
199 predictions_file = predictionFunctions.predict_serotype(
200 blast_output_file, ectyper_dict_file, predictions_file,
201 args.detailed)
202 return predictions_file
203
204
205 def get_raw_files(raw_files):
206 """Take all the raw files, and filter not fasta / fastq
207
208 Args:
209 raw_files(str): list of files from user input
210
211 Returns:
212 A dictitionary collection of fasta and fastq files
213 example:
214 {'raw_fasta_files':[],
215 'raw_fastq_files':[]}
216 """
217 fasta_files = []
218 fastq_files = []
219
220 for file in raw_files:
221 file_format = genomeFunctions.get_valid_format(file)
222 if file_format == 'fasta':
223 fasta_files.append(file)
224 elif file_format == 'fastq':
225 fastq_files.append(file)
226
227 LOG.debug('raw fasta files: {}'.format(fasta_files))
228 LOG.debug('raw fastq files: {}'.format(fastq_files))
229
230 return({'fasta':fasta_files, 'fastq':fastq_files})
231
232
233 def filter_for_ecoli_files(raw_dict, temp_files, verify=False, species=False):
234 """filter ecoli, identify species, assemble fastq
235 Assemble fastq files to fasta files,
236 then filter all files by reference method if verify is enabled,
237 if identified as non-ecoli, identify species by mash method if species is enabled.
238
239 Args:
240 raw_dict{fasta:list_of_files, fastq:list_of_files}:
241 dictionary collection of fasta and fastq files
242 temp_file: temporary directory
243 verify(bool):
244 whether to perform ecoli verification
245 species(bool):
246 whether to perform species identification for non-ecoli genome
247 Returns:
248 List of filtered and assembled genome files in fasta format
249 """
250 final_files = []
251 for f in raw_dict.keys():
252 temp_dir = temp_files['fasta_temp_dir'] if f == "fasta" else temp_files['assemble_temp_dir']
253
254 for ffile in raw_dict[f]:
255 filtered_file = filter_file_by_species(
256 ffile, f, temp_dir, verify=verify, species=species)
257 if filtered_file is not None and \
258 genomeFunctions.get_valid_format(filtered_file) is not None:
259 final_files.append(filtered_file)
260
261 LOG.info('{} final fasta files'.format(len(final_files)))
262 return final_files
263
264 def filter_file_by_species(genome_file, genome_format, temp_dir, verify=False, species=False):
265 """filter ecoli, identify species, assemble fastq
266 Assemble fastq file to fasta file,
267 then filter the file by reference method if verify is enabled,
268 if identified as non-ecoli, identify species by mash method if species is enabled.
269
270 Args:
271 genome_file: input genome file
272 genome_format(str): fasta or fastq
273 temp_file: temporary directory
274 verify(bool):
275 whether to perform ecoli verification
276 species(bool):
277 whether to perform species identification for non-ecoli genome
278 Returns:
279 The filtered and assembled genome files in fasta format
280 """
281 combined_file = definitions.COMBINED
282 filtered_file = None
283 if genome_format == 'fastq':
284 iden_file, pred_file = \
285 genomeFunctions.assemble_reads(genome_file, combined_file, temp_dir)
286 # If no alignment resut, the file is definitely not E.Coli
287 if genomeFunctions.get_valid_format(iden_file) is None:
288 LOG.warning(
289 "{} is filtered out because no identification alignment found".format(genome_file))
290 return filtered_file
291 if not (verify or species) or speciesIdentification.is_ecoli_genome(
292 iden_file, genome_file, mash=species):
293 # final check before adding the alignment for prediction
294 if genomeFunctions.get_valid_format(iden_file) != 'fasta':
295 LOG.warning(
296 "{0} is filtered out because no prediction alignment found".format(genome_file))
297 return filtered_file
298 filtered_file = pred_file
299 if genome_format == 'fasta':
300 if not (verify or species) \
301 or speciesIdentification.is_ecoli_genome(genome_file, mash=species):
302 filtered_file = genome_file
303 return filtered_file