Mercurial > repos > jpetteng > ectyper
comparison ecoli_serotyping/ectyper.py @ 6:fe3ceb5c4214 draft
Uploaded
author | jpetteng |
---|---|
date | Fri, 05 Jan 2018 15:43:14 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:a202cc394af8 | 6:fe3ceb5c4214 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Predictive serotyping for _E. coli_. | |
4 """ | |
5 import logging | |
6 import os | |
7 import sys | |
8 import tempfile | |
9 import datetime | |
10 from urllib.request import urlretrieve | |
11 | |
12 from ectyper import (blastFunctions, commandLineOptions, definitions, | |
13 genomeFunctions, loggingFunctions, predictionFunctions, | |
14 speciesIdentification) | |
15 | |
16 LOG_FILE = loggingFunctions.initialize_logging() | |
17 LOG = logging.getLogger(__name__) | |
18 | |
19 | |
20 def run_program(): | |
21 """ | |
22 Wrapper for both the serotyping and virulence finder | |
23 The program needs to do the following | |
24 (1) Filter genome files based on format | |
25 (2) Filter genome files based on species | |
26 (3) Map FASTQ files to FASTA seq | |
27 (1) Get names of all genomes being tested | |
28 (2) Create a BLAST database of those genomes | |
29 (3) Query for serotype | |
30 (4) Parse the results | |
31 (5) Display the results | |
32 :return: success or failure | |
33 | |
34 """ | |
35 # Initialize the program | |
36 args = commandLineOptions.parse_command_line() | |
37 LOG.info('Starting ectyper\nSerotype prediction with input:\n \ | |
38 {0}\n \ | |
39 Log file is: {1}'.format(args, LOG_FILE)) | |
40 LOG.debug(args) | |
41 | |
42 ## Initialize temporary directories for the scope of this program | |
43 with tempfile.TemporaryDirectory() as temp_dir: | |
44 temp_files = create_tmp_files(temp_dir, output_dir=args.output) | |
45 LOG.debug(temp_files) | |
46 | |
47 # Download refseq file if speicies identification is enabled | |
48 if args.species: | |
49 download_refseq() | |
50 | |
51 LOG.info("Gathering genome files") | |
52 raw_genome_files = genomeFunctions.get_files_as_list(args.input) | |
53 LOG.debug(raw_genome_files) | |
54 | |
55 LOG.info("Removing invalid file types") | |
56 raw_files_dict = get_raw_files(raw_genome_files) | |
57 LOG.debug(raw_files_dict) | |
58 | |
59 # Assembling fastq + verify ecoli genome | |
60 LOG.info("Preparing genome files for blast alignment") | |
61 final_fasta_files = filter_for_ecoli_files( | |
62 raw_files_dict, temp_files, verify=args.verify, species=args.species | |
63 ) | |
64 LOG.debug(final_fasta_files) | |
65 if len(final_fasta_files) is 0: | |
66 LOG.info("No valid genome files. Terminating the program.") | |
67 exit(0) | |
68 | |
69 LOG.info("Standardizing the genome headers") | |
70 (all_genomes_list, all_genomes_files) = \ | |
71 genomeFunctions.get_genome_names_from_files( | |
72 final_fasta_files, temp_files['fasta_temp_dir']) | |
73 LOG.debug(all_genomes_list) | |
74 LOG.debug(all_genomes_files) | |
75 | |
76 # Main prediction function | |
77 predictions_file = run_prediction(all_genomes_files, args, | |
78 temp_files['output_file']) | |
79 | |
80 # Add empty rows for genomes without blast result | |
81 predictions_file = predictionFunctions.add_non_predicted( | |
82 all_genomes_list, predictions_file) | |
83 LOG.info('Outputs stored in {0}'.format(temp_files['output_dir'])) | |
84 | |
85 # Store most recent result in working directory | |
86 LOG.info('\nReporting result...') | |
87 predictionFunctions.report_result(predictions_file) | |
88 | |
89 def download_refseq(): | |
90 '''Download refseq file with progress bar | |
91 ''' | |
92 | |
93 | |
94 def reporthook(blocknum, blocksize, totalsize): | |
95 ''' | |
96 https://stackoverflow.com/questions/15644964/python-progress-bar-and-downloads | |
97 ''' | |
98 readsofar = blocknum * blocksize | |
99 if totalsize > 0: | |
100 s = "\r {:5.1%} {:{}d} / {:d}".format( | |
101 readsofar/totalsize, readsofar, | |
102 len(str(totalsize)), | |
103 totalsize | |
104 ) | |
105 sys.stderr.write(s) | |
106 if readsofar >= totalsize: # near the end | |
107 sys.stderr.write("\n") | |
108 else: # total size is unknown | |
109 sys.stderr.write("read {}\n".format(readsofar)) | |
110 | |
111 if not os.path.isfile(definitions.REFSEQ_SKETCH): | |
112 refseq_url = 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' | |
113 LOG.info("No refseq found. Downloading reference file for species identification...") | |
114 urlretrieve(refseq_url, definitions.REFSEQ_SKETCH, reporthook) | |
115 LOG.info("Download complete.") | |
116 | |
117 def create_tmp_files(temp_dir, output_dir=None): | |
118 """Create a dictionary of temporary files used by ectyper | |
119 | |
120 Args: | |
121 temp_dir: program scope temporary directory | |
122 output_dir(str, optional): | |
123 directory to store output | |
124 | |
125 Return: | |
126 a dictionary of temporary files | |
127 example: | |
128 {'assemble_temp_dir': 'test/temp/assemblies', | |
129 'fasta_temp_dir': 'test/temp/fastas', | |
130 'output_dir': os.path.abspath('output')+'/', | |
131 'output_file': os.path.abspath('output/output.csv')} | |
132 """ | |
133 | |
134 # Get the correct files and directories | |
135 files_and_dirs = { | |
136 'assemble_temp_dir': os.path.join(temp_dir, 'assemblies'), | |
137 'fasta_temp_dir': os.path.join(temp_dir, 'fastas'), | |
138 } | |
139 if output_dir is None: | |
140 output_dir = ''.join([ | |
141 str(datetime.datetime.now().date()), | |
142 '_', | |
143 str(datetime.datetime.now().time()).replace(':', '.') | |
144 ]) | |
145 if os.path.isabs(output_dir): | |
146 pass | |
147 else: | |
148 output_dir = os.path.join(definitions.WORKPLACE_DIR, 'output', output_dir) | |
149 | |
150 output_file = os.path.join(output_dir, 'output.csv') | |
151 if os.path.isfile(output_file): | |
152 os.remove(output_file) | |
153 for d in [ | |
154 output_dir, files_and_dirs['assemble_temp_dir'], | |
155 files_and_dirs['fasta_temp_dir'] | |
156 ]: | |
157 if not os.path.exists(d): | |
158 os.makedirs(d) | |
159 | |
160 # Finalize the tmp_files dictionary | |
161 files_and_dirs['output_file'] = output_file | |
162 files_and_dirs['output_dir'] = output_dir | |
163 | |
164 LOG.info("Temporary files and directory created") | |
165 return files_and_dirs | |
166 | |
167 | |
168 def run_prediction(genome_files, args, predictions_file): | |
169 '''Core prediction functionality | |
170 | |
171 Args: | |
172 genome_files: | |
173 list of genome files | |
174 args: | |
175 commandline arguments | |
176 predictions_file: | |
177 filename of prediction output | |
178 | |
179 Returns: | |
180 predictions_file with prediction written in it | |
181 ''' | |
182 query_file = definitions.SEROTYPE_FILE | |
183 ectyper_dict_file = definitions.SEROTYPE_ALLELE_JSON | |
184 # create a temp dir for blastdb | |
185 with tempfile.TemporaryDirectory() as temp_dir: | |
186 # Divide genome files into chunks | |
187 chunk_size = 50 | |
188 genome_chunks = [ | |
189 genome_files[i:i + chunk_size] | |
190 for i in range(0, len(genome_files), chunk_size) | |
191 ] | |
192 for index, chunk in enumerate(genome_chunks): | |
193 LOG.info("Start creating blast database #{0}".format(index + 1)) | |
194 blast_db = blastFunctions.create_blast_db(chunk, temp_dir) | |
195 | |
196 LOG.info("Start blast alignment on database #{0}".format(index + 1)) | |
197 blast_output_file = blastFunctions.run_blast(query_file, blast_db, args) | |
198 LOG.info("Start serotype prediction for database #{0}".format(index + 1)) | |
199 predictions_file = predictionFunctions.predict_serotype( | |
200 blast_output_file, ectyper_dict_file, predictions_file, | |
201 args.detailed) | |
202 return predictions_file | |
203 | |
204 | |
205 def get_raw_files(raw_files): | |
206 """Take all the raw files, and filter not fasta / fastq | |
207 | |
208 Args: | |
209 raw_files(str): list of files from user input | |
210 | |
211 Returns: | |
212 A dictitionary collection of fasta and fastq files | |
213 example: | |
214 {'raw_fasta_files':[], | |
215 'raw_fastq_files':[]} | |
216 """ | |
217 fasta_files = [] | |
218 fastq_files = [] | |
219 | |
220 for file in raw_files: | |
221 file_format = genomeFunctions.get_valid_format(file) | |
222 if file_format == 'fasta': | |
223 fasta_files.append(file) | |
224 elif file_format == 'fastq': | |
225 fastq_files.append(file) | |
226 | |
227 LOG.debug('raw fasta files: {}'.format(fasta_files)) | |
228 LOG.debug('raw fastq files: {}'.format(fastq_files)) | |
229 | |
230 return({'fasta':fasta_files, 'fastq':fastq_files}) | |
231 | |
232 | |
233 def filter_for_ecoli_files(raw_dict, temp_files, verify=False, species=False): | |
234 """filter ecoli, identify species, assemble fastq | |
235 Assemble fastq files to fasta files, | |
236 then filter all files by reference method if verify is enabled, | |
237 if identified as non-ecoli, identify species by mash method if species is enabled. | |
238 | |
239 Args: | |
240 raw_dict{fasta:list_of_files, fastq:list_of_files}: | |
241 dictionary collection of fasta and fastq files | |
242 temp_file: temporary directory | |
243 verify(bool): | |
244 whether to perform ecoli verification | |
245 species(bool): | |
246 whether to perform species identification for non-ecoli genome | |
247 Returns: | |
248 List of filtered and assembled genome files in fasta format | |
249 """ | |
250 final_files = [] | |
251 for f in raw_dict.keys(): | |
252 temp_dir = temp_files['fasta_temp_dir'] if f == "fasta" else temp_files['assemble_temp_dir'] | |
253 | |
254 for ffile in raw_dict[f]: | |
255 filtered_file = filter_file_by_species( | |
256 ffile, f, temp_dir, verify=verify, species=species) | |
257 if filtered_file is not None and \ | |
258 genomeFunctions.get_valid_format(filtered_file) is not None: | |
259 final_files.append(filtered_file) | |
260 | |
261 LOG.info('{} final fasta files'.format(len(final_files))) | |
262 return final_files | |
263 | |
264 def filter_file_by_species(genome_file, genome_format, temp_dir, verify=False, species=False): | |
265 """filter ecoli, identify species, assemble fastq | |
266 Assemble fastq file to fasta file, | |
267 then filter the file by reference method if verify is enabled, | |
268 if identified as non-ecoli, identify species by mash method if species is enabled. | |
269 | |
270 Args: | |
271 genome_file: input genome file | |
272 genome_format(str): fasta or fastq | |
273 temp_file: temporary directory | |
274 verify(bool): | |
275 whether to perform ecoli verification | |
276 species(bool): | |
277 whether to perform species identification for non-ecoli genome | |
278 Returns: | |
279 The filtered and assembled genome files in fasta format | |
280 """ | |
281 combined_file = definitions.COMBINED | |
282 filtered_file = None | |
283 if genome_format == 'fastq': | |
284 iden_file, pred_file = \ | |
285 genomeFunctions.assemble_reads(genome_file, combined_file, temp_dir) | |
286 # If no alignment resut, the file is definitely not E.Coli | |
287 if genomeFunctions.get_valid_format(iden_file) is None: | |
288 LOG.warning( | |
289 "{} is filtered out because no identification alignment found".format(genome_file)) | |
290 return filtered_file | |
291 if not (verify or species) or speciesIdentification.is_ecoli_genome( | |
292 iden_file, genome_file, mash=species): | |
293 # final check before adding the alignment for prediction | |
294 if genomeFunctions.get_valid_format(iden_file) != 'fasta': | |
295 LOG.warning( | |
296 "{0} is filtered out because no prediction alignment found".format(genome_file)) | |
297 return filtered_file | |
298 filtered_file = pred_file | |
299 if genome_format == 'fasta': | |
300 if not (verify or species) \ | |
301 or speciesIdentification.is_ecoli_genome(genome_file, mash=species): | |
302 filtered_file = genome_file | |
303 return filtered_file |