annotate ecoli_serotyping/ectyper.py @ 9:148199949636 draft

Uploaded
author jpetteng
date Fri, 05 Jan 2018 16:25:18 -0500
parents fe3ceb5c4214
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
1 #!/usr/bin/env python
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
2 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
3 Predictive serotyping for _E. coli_.
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
4 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
5 import logging
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
6 import os
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
7 import sys
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
8 import tempfile
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
9 import datetime
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
10 from urllib.request import urlretrieve
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
11
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
12 from ectyper import (blastFunctions, commandLineOptions, definitions,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
13 genomeFunctions, loggingFunctions, predictionFunctions,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
14 speciesIdentification)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
15
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
16 LOG_FILE = loggingFunctions.initialize_logging()
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
17 LOG = logging.getLogger(__name__)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
18
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
19
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
20 def run_program():
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
21 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
22 Wrapper for both the serotyping and virulence finder
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
23 The program needs to do the following
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
24 (1) Filter genome files based on format
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
25 (2) Filter genome files based on species
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
26 (3) Map FASTQ files to FASTA seq
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
27 (1) Get names of all genomes being tested
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
28 (2) Create a BLAST database of those genomes
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
29 (3) Query for serotype
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
30 (4) Parse the results
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
31 (5) Display the results
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
32 :return: success or failure
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
33
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
34 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
35 # Initialize the program
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
36 args = commandLineOptions.parse_command_line()
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
37 LOG.info('Starting ectyper\nSerotype prediction with input:\n \
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
38 {0}\n \
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
39 Log file is: {1}'.format(args, LOG_FILE))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
40 LOG.debug(args)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
41
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
42 ## Initialize temporary directories for the scope of this program
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
43 with tempfile.TemporaryDirectory() as temp_dir:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
44 temp_files = create_tmp_files(temp_dir, output_dir=args.output)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
45 LOG.debug(temp_files)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
46
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
47 # Download refseq file if speicies identification is enabled
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
48 if args.species:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
49 download_refseq()
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
50
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
51 LOG.info("Gathering genome files")
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
52 raw_genome_files = genomeFunctions.get_files_as_list(args.input)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
53 LOG.debug(raw_genome_files)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
54
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
55 LOG.info("Removing invalid file types")
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
56 raw_files_dict = get_raw_files(raw_genome_files)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
57 LOG.debug(raw_files_dict)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
58
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
59 # Assembling fastq + verify ecoli genome
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
60 LOG.info("Preparing genome files for blast alignment")
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
61 final_fasta_files = filter_for_ecoli_files(
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
62 raw_files_dict, temp_files, verify=args.verify, species=args.species
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
63 )
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
64 LOG.debug(final_fasta_files)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
65 if len(final_fasta_files) is 0:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
66 LOG.info("No valid genome files. Terminating the program.")
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
67 exit(0)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
68
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
69 LOG.info("Standardizing the genome headers")
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
70 (all_genomes_list, all_genomes_files) = \
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
71 genomeFunctions.get_genome_names_from_files(
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
72 final_fasta_files, temp_files['fasta_temp_dir'])
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
73 LOG.debug(all_genomes_list)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
74 LOG.debug(all_genomes_files)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
75
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
76 # Main prediction function
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
77 predictions_file = run_prediction(all_genomes_files, args,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
78 temp_files['output_file'])
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
79
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
80 # Add empty rows for genomes without blast result
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
81 predictions_file = predictionFunctions.add_non_predicted(
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
82 all_genomes_list, predictions_file)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
83 LOG.info('Outputs stored in {0}'.format(temp_files['output_dir']))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
84
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
85 # Store most recent result in working directory
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
86 LOG.info('\nReporting result...')
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
87 predictionFunctions.report_result(predictions_file)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
88
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
89 def download_refseq():
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
90 '''Download refseq file with progress bar
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
91 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
92
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
93
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
94 def reporthook(blocknum, blocksize, totalsize):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
95 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
96 https://stackoverflow.com/questions/15644964/python-progress-bar-and-downloads
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
97 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
98 readsofar = blocknum * blocksize
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
99 if totalsize > 0:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
100 s = "\r {:5.1%} {:{}d} / {:d}".format(
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
101 readsofar/totalsize, readsofar,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
102 len(str(totalsize)),
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
103 totalsize
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
104 )
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
105 sys.stderr.write(s)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
106 if readsofar >= totalsize: # near the end
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
107 sys.stderr.write("\n")
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
108 else: # total size is unknown
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
109 sys.stderr.write("read {}\n".format(readsofar))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
110
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
111 if not os.path.isfile(definitions.REFSEQ_SKETCH):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
112 refseq_url = 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh'
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
113 LOG.info("No refseq found. Downloading reference file for species identification...")
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
114 urlretrieve(refseq_url, definitions.REFSEQ_SKETCH, reporthook)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
115 LOG.info("Download complete.")
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
116
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
117 def create_tmp_files(temp_dir, output_dir=None):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
118 """Create a dictionary of temporary files used by ectyper
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
119
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
120 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
121 temp_dir: program scope temporary directory
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
122 output_dir(str, optional):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
123 directory to store output
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
124
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
125 Return:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
126 a dictionary of temporary files
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
127 example:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
128 {'assemble_temp_dir': 'test/temp/assemblies',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
129 'fasta_temp_dir': 'test/temp/fastas',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
130 'output_dir': os.path.abspath('output')+'/',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
131 'output_file': os.path.abspath('output/output.csv')}
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
132 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
133
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
134 # Get the correct files and directories
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
135 files_and_dirs = {
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
136 'assemble_temp_dir': os.path.join(temp_dir, 'assemblies'),
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
137 'fasta_temp_dir': os.path.join(temp_dir, 'fastas'),
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
138 }
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
139 if output_dir is None:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
140 output_dir = ''.join([
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
141 str(datetime.datetime.now().date()),
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
142 '_',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
143 str(datetime.datetime.now().time()).replace(':', '.')
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
144 ])
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
145 if os.path.isabs(output_dir):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
146 pass
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
147 else:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
148 output_dir = os.path.join(definitions.WORKPLACE_DIR, 'output', output_dir)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
149
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
150 output_file = os.path.join(output_dir, 'output.csv')
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
151 if os.path.isfile(output_file):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
152 os.remove(output_file)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
153 for d in [
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
154 output_dir, files_and_dirs['assemble_temp_dir'],
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
155 files_and_dirs['fasta_temp_dir']
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
156 ]:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
157 if not os.path.exists(d):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
158 os.makedirs(d)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
159
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
160 # Finalize the tmp_files dictionary
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
161 files_and_dirs['output_file'] = output_file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
162 files_and_dirs['output_dir'] = output_dir
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
163
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
164 LOG.info("Temporary files and directory created")
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
165 return files_and_dirs
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
166
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
167
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
168 def run_prediction(genome_files, args, predictions_file):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
169 '''Core prediction functionality
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
170
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
171 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
172 genome_files:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
173 list of genome files
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
174 args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
175 commandline arguments
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
176 predictions_file:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
177 filename of prediction output
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
178
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
179 Returns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
180 predictions_file with prediction written in it
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
181 '''
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
182 query_file = definitions.SEROTYPE_FILE
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
183 ectyper_dict_file = definitions.SEROTYPE_ALLELE_JSON
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
184 # create a temp dir for blastdb
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
185 with tempfile.TemporaryDirectory() as temp_dir:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
186 # Divide genome files into chunks
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
187 chunk_size = 50
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
188 genome_chunks = [
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
189 genome_files[i:i + chunk_size]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
190 for i in range(0, len(genome_files), chunk_size)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
191 ]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
192 for index, chunk in enumerate(genome_chunks):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
193 LOG.info("Start creating blast database #{0}".format(index + 1))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
194 blast_db = blastFunctions.create_blast_db(chunk, temp_dir)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
195
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
196 LOG.info("Start blast alignment on database #{0}".format(index + 1))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
197 blast_output_file = blastFunctions.run_blast(query_file, blast_db, args)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
198 LOG.info("Start serotype prediction for database #{0}".format(index + 1))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
199 predictions_file = predictionFunctions.predict_serotype(
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
200 blast_output_file, ectyper_dict_file, predictions_file,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
201 args.detailed)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
202 return predictions_file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
203
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
204
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
205 def get_raw_files(raw_files):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
206 """Take all the raw files, and filter not fasta / fastq
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
207
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
208 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
209 raw_files(str): list of files from user input
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
210
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
211 Returns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
212 A dictitionary collection of fasta and fastq files
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
213 example:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
214 {'raw_fasta_files':[],
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
215 'raw_fastq_files':[]}
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
216 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
217 fasta_files = []
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
218 fastq_files = []
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
219
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
220 for file in raw_files:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
221 file_format = genomeFunctions.get_valid_format(file)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
222 if file_format == 'fasta':
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
223 fasta_files.append(file)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
224 elif file_format == 'fastq':
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
225 fastq_files.append(file)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
226
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
227 LOG.debug('raw fasta files: {}'.format(fasta_files))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
228 LOG.debug('raw fastq files: {}'.format(fastq_files))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
229
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
230 return({'fasta':fasta_files, 'fastq':fastq_files})
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
231
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
232
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
233 def filter_for_ecoli_files(raw_dict, temp_files, verify=False, species=False):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
234 """filter ecoli, identify species, assemble fastq
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
235 Assemble fastq files to fasta files,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
236 then filter all files by reference method if verify is enabled,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
237 if identified as non-ecoli, identify species by mash method if species is enabled.
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
238
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
239 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
240 raw_dict{fasta:list_of_files, fastq:list_of_files}:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
241 dictionary collection of fasta and fastq files
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
242 temp_file: temporary directory
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
243 verify(bool):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
244 whether to perform ecoli verification
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
245 species(bool):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
246 whether to perform species identification for non-ecoli genome
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
247 Returns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
248 List of filtered and assembled genome files in fasta format
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
249 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
250 final_files = []
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
251 for f in raw_dict.keys():
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
252 temp_dir = temp_files['fasta_temp_dir'] if f == "fasta" else temp_files['assemble_temp_dir']
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
253
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
254 for ffile in raw_dict[f]:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
255 filtered_file = filter_file_by_species(
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
256 ffile, f, temp_dir, verify=verify, species=species)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
257 if filtered_file is not None and \
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
258 genomeFunctions.get_valid_format(filtered_file) is not None:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
259 final_files.append(filtered_file)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
260
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
261 LOG.info('{} final fasta files'.format(len(final_files)))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
262 return final_files
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
263
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
264 def filter_file_by_species(genome_file, genome_format, temp_dir, verify=False, species=False):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
265 """filter ecoli, identify species, assemble fastq
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
266 Assemble fastq file to fasta file,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
267 then filter the file by reference method if verify is enabled,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
268 if identified as non-ecoli, identify species by mash method if species is enabled.
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
269
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
270 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
271 genome_file: input genome file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
272 genome_format(str): fasta or fastq
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
273 temp_file: temporary directory
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
274 verify(bool):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
275 whether to perform ecoli verification
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
276 species(bool):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
277 whether to perform species identification for non-ecoli genome
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
278 Returns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
279 The filtered and assembled genome files in fasta format
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
280 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
281 combined_file = definitions.COMBINED
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
282 filtered_file = None
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
283 if genome_format == 'fastq':
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
284 iden_file, pred_file = \
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
285 genomeFunctions.assemble_reads(genome_file, combined_file, temp_dir)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
286 # If no alignment resut, the file is definitely not E.Coli
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
287 if genomeFunctions.get_valid_format(iden_file) is None:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
288 LOG.warning(
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
289 "{} is filtered out because no identification alignment found".format(genome_file))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
290 return filtered_file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
291 if not (verify or species) or speciesIdentification.is_ecoli_genome(
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
292 iden_file, genome_file, mash=species):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
293 # final check before adding the alignment for prediction
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
294 if genomeFunctions.get_valid_format(iden_file) != 'fasta':
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
295 LOG.warning(
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
296 "{0} is filtered out because no prediction alignment found".format(genome_file))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
297 return filtered_file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
298 filtered_file = pred_file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
299 if genome_format == 'fasta':
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
300 if not (verify or species) \
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
301 or speciesIdentification.is_ecoli_genome(genome_file, mash=species):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
302 filtered_file = genome_file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
303 return filtered_file