Mercurial > repos > jpetteng > ectyper
comparison ecoli_serotyping/ectyper.py @ 6:fe3ceb5c4214 draft
Uploaded
| author | jpetteng |
|---|---|
| date | Fri, 05 Jan 2018 15:43:14 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 5:a202cc394af8 | 6:fe3ceb5c4214 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 Predictive serotyping for _E. coli_. | |
| 4 """ | |
| 5 import logging | |
| 6 import os | |
| 7 import sys | |
| 8 import tempfile | |
| 9 import datetime | |
| 10 from urllib.request import urlretrieve | |
| 11 | |
| 12 from ectyper import (blastFunctions, commandLineOptions, definitions, | |
| 13 genomeFunctions, loggingFunctions, predictionFunctions, | |
| 14 speciesIdentification) | |
| 15 | |
| 16 LOG_FILE = loggingFunctions.initialize_logging() | |
| 17 LOG = logging.getLogger(__name__) | |
| 18 | |
| 19 | |
| 20 def run_program(): | |
| 21 """ | |
| 22 Wrapper for both the serotyping and virulence finder | |
| 23 The program needs to do the following | |
| 24 (1) Filter genome files based on format | |
| 25 (2) Filter genome files based on species | |
| 26 (3) Map FASTQ files to FASTA seq | |
| 27 (1) Get names of all genomes being tested | |
| 28 (2) Create a BLAST database of those genomes | |
| 29 (3) Query for serotype | |
| 30 (4) Parse the results | |
| 31 (5) Display the results | |
| 32 :return: success or failure | |
| 33 | |
| 34 """ | |
| 35 # Initialize the program | |
| 36 args = commandLineOptions.parse_command_line() | |
| 37 LOG.info('Starting ectyper\nSerotype prediction with input:\n \ | |
| 38 {0}\n \ | |
| 39 Log file is: {1}'.format(args, LOG_FILE)) | |
| 40 LOG.debug(args) | |
| 41 | |
| 42 ## Initialize temporary directories for the scope of this program | |
| 43 with tempfile.TemporaryDirectory() as temp_dir: | |
| 44 temp_files = create_tmp_files(temp_dir, output_dir=args.output) | |
| 45 LOG.debug(temp_files) | |
| 46 | |
| 47 # Download refseq file if speicies identification is enabled | |
| 48 if args.species: | |
| 49 download_refseq() | |
| 50 | |
| 51 LOG.info("Gathering genome files") | |
| 52 raw_genome_files = genomeFunctions.get_files_as_list(args.input) | |
| 53 LOG.debug(raw_genome_files) | |
| 54 | |
| 55 LOG.info("Removing invalid file types") | |
| 56 raw_files_dict = get_raw_files(raw_genome_files) | |
| 57 LOG.debug(raw_files_dict) | |
| 58 | |
| 59 # Assembling fastq + verify ecoli genome | |
| 60 LOG.info("Preparing genome files for blast alignment") | |
| 61 final_fasta_files = filter_for_ecoli_files( | |
| 62 raw_files_dict, temp_files, verify=args.verify, species=args.species | |
| 63 ) | |
| 64 LOG.debug(final_fasta_files) | |
| 65 if len(final_fasta_files) is 0: | |
| 66 LOG.info("No valid genome files. Terminating the program.") | |
| 67 exit(0) | |
| 68 | |
| 69 LOG.info("Standardizing the genome headers") | |
| 70 (all_genomes_list, all_genomes_files) = \ | |
| 71 genomeFunctions.get_genome_names_from_files( | |
| 72 final_fasta_files, temp_files['fasta_temp_dir']) | |
| 73 LOG.debug(all_genomes_list) | |
| 74 LOG.debug(all_genomes_files) | |
| 75 | |
| 76 # Main prediction function | |
| 77 predictions_file = run_prediction(all_genomes_files, args, | |
| 78 temp_files['output_file']) | |
| 79 | |
| 80 # Add empty rows for genomes without blast result | |
| 81 predictions_file = predictionFunctions.add_non_predicted( | |
| 82 all_genomes_list, predictions_file) | |
| 83 LOG.info('Outputs stored in {0}'.format(temp_files['output_dir'])) | |
| 84 | |
| 85 # Store most recent result in working directory | |
| 86 LOG.info('\nReporting result...') | |
| 87 predictionFunctions.report_result(predictions_file) | |
| 88 | |
| 89 def download_refseq(): | |
| 90 '''Download refseq file with progress bar | |
| 91 ''' | |
| 92 | |
| 93 | |
| 94 def reporthook(blocknum, blocksize, totalsize): | |
| 95 ''' | |
| 96 https://stackoverflow.com/questions/15644964/python-progress-bar-and-downloads | |
| 97 ''' | |
| 98 readsofar = blocknum * blocksize | |
| 99 if totalsize > 0: | |
| 100 s = "\r {:5.1%} {:{}d} / {:d}".format( | |
| 101 readsofar/totalsize, readsofar, | |
| 102 len(str(totalsize)), | |
| 103 totalsize | |
| 104 ) | |
| 105 sys.stderr.write(s) | |
| 106 if readsofar >= totalsize: # near the end | |
| 107 sys.stderr.write("\n") | |
| 108 else: # total size is unknown | |
| 109 sys.stderr.write("read {}\n".format(readsofar)) | |
| 110 | |
| 111 if not os.path.isfile(definitions.REFSEQ_SKETCH): | |
| 112 refseq_url = 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh' | |
| 113 LOG.info("No refseq found. Downloading reference file for species identification...") | |
| 114 urlretrieve(refseq_url, definitions.REFSEQ_SKETCH, reporthook) | |
| 115 LOG.info("Download complete.") | |
| 116 | |
| 117 def create_tmp_files(temp_dir, output_dir=None): | |
| 118 """Create a dictionary of temporary files used by ectyper | |
| 119 | |
| 120 Args: | |
| 121 temp_dir: program scope temporary directory | |
| 122 output_dir(str, optional): | |
| 123 directory to store output | |
| 124 | |
| 125 Return: | |
| 126 a dictionary of temporary files | |
| 127 example: | |
| 128 {'assemble_temp_dir': 'test/temp/assemblies', | |
| 129 'fasta_temp_dir': 'test/temp/fastas', | |
| 130 'output_dir': os.path.abspath('output')+'/', | |
| 131 'output_file': os.path.abspath('output/output.csv')} | |
| 132 """ | |
| 133 | |
| 134 # Get the correct files and directories | |
| 135 files_and_dirs = { | |
| 136 'assemble_temp_dir': os.path.join(temp_dir, 'assemblies'), | |
| 137 'fasta_temp_dir': os.path.join(temp_dir, 'fastas'), | |
| 138 } | |
| 139 if output_dir is None: | |
| 140 output_dir = ''.join([ | |
| 141 str(datetime.datetime.now().date()), | |
| 142 '_', | |
| 143 str(datetime.datetime.now().time()).replace(':', '.') | |
| 144 ]) | |
| 145 if os.path.isabs(output_dir): | |
| 146 pass | |
| 147 else: | |
| 148 output_dir = os.path.join(definitions.WORKPLACE_DIR, 'output', output_dir) | |
| 149 | |
| 150 output_file = os.path.join(output_dir, 'output.csv') | |
| 151 if os.path.isfile(output_file): | |
| 152 os.remove(output_file) | |
| 153 for d in [ | |
| 154 output_dir, files_and_dirs['assemble_temp_dir'], | |
| 155 files_and_dirs['fasta_temp_dir'] | |
| 156 ]: | |
| 157 if not os.path.exists(d): | |
| 158 os.makedirs(d) | |
| 159 | |
| 160 # Finalize the tmp_files dictionary | |
| 161 files_and_dirs['output_file'] = output_file | |
| 162 files_and_dirs['output_dir'] = output_dir | |
| 163 | |
| 164 LOG.info("Temporary files and directory created") | |
| 165 return files_and_dirs | |
| 166 | |
| 167 | |
| 168 def run_prediction(genome_files, args, predictions_file): | |
| 169 '''Core prediction functionality | |
| 170 | |
| 171 Args: | |
| 172 genome_files: | |
| 173 list of genome files | |
| 174 args: | |
| 175 commandline arguments | |
| 176 predictions_file: | |
| 177 filename of prediction output | |
| 178 | |
| 179 Returns: | |
| 180 predictions_file with prediction written in it | |
| 181 ''' | |
| 182 query_file = definitions.SEROTYPE_FILE | |
| 183 ectyper_dict_file = definitions.SEROTYPE_ALLELE_JSON | |
| 184 # create a temp dir for blastdb | |
| 185 with tempfile.TemporaryDirectory() as temp_dir: | |
| 186 # Divide genome files into chunks | |
| 187 chunk_size = 50 | |
| 188 genome_chunks = [ | |
| 189 genome_files[i:i + chunk_size] | |
| 190 for i in range(0, len(genome_files), chunk_size) | |
| 191 ] | |
| 192 for index, chunk in enumerate(genome_chunks): | |
| 193 LOG.info("Start creating blast database #{0}".format(index + 1)) | |
| 194 blast_db = blastFunctions.create_blast_db(chunk, temp_dir) | |
| 195 | |
| 196 LOG.info("Start blast alignment on database #{0}".format(index + 1)) | |
| 197 blast_output_file = blastFunctions.run_blast(query_file, blast_db, args) | |
| 198 LOG.info("Start serotype prediction for database #{0}".format(index + 1)) | |
| 199 predictions_file = predictionFunctions.predict_serotype( | |
| 200 blast_output_file, ectyper_dict_file, predictions_file, | |
| 201 args.detailed) | |
| 202 return predictions_file | |
| 203 | |
| 204 | |
| 205 def get_raw_files(raw_files): | |
| 206 """Take all the raw files, and filter not fasta / fastq | |
| 207 | |
| 208 Args: | |
| 209 raw_files(str): list of files from user input | |
| 210 | |
| 211 Returns: | |
| 212 A dictitionary collection of fasta and fastq files | |
| 213 example: | |
| 214 {'raw_fasta_files':[], | |
| 215 'raw_fastq_files':[]} | |
| 216 """ | |
| 217 fasta_files = [] | |
| 218 fastq_files = [] | |
| 219 | |
| 220 for file in raw_files: | |
| 221 file_format = genomeFunctions.get_valid_format(file) | |
| 222 if file_format == 'fasta': | |
| 223 fasta_files.append(file) | |
| 224 elif file_format == 'fastq': | |
| 225 fastq_files.append(file) | |
| 226 | |
| 227 LOG.debug('raw fasta files: {}'.format(fasta_files)) | |
| 228 LOG.debug('raw fastq files: {}'.format(fastq_files)) | |
| 229 | |
| 230 return({'fasta':fasta_files, 'fastq':fastq_files}) | |
| 231 | |
| 232 | |
| 233 def filter_for_ecoli_files(raw_dict, temp_files, verify=False, species=False): | |
| 234 """filter ecoli, identify species, assemble fastq | |
| 235 Assemble fastq files to fasta files, | |
| 236 then filter all files by reference method if verify is enabled, | |
| 237 if identified as non-ecoli, identify species by mash method if species is enabled. | |
| 238 | |
| 239 Args: | |
| 240 raw_dict{fasta:list_of_files, fastq:list_of_files}: | |
| 241 dictionary collection of fasta and fastq files | |
| 242 temp_file: temporary directory | |
| 243 verify(bool): | |
| 244 whether to perform ecoli verification | |
| 245 species(bool): | |
| 246 whether to perform species identification for non-ecoli genome | |
| 247 Returns: | |
| 248 List of filtered and assembled genome files in fasta format | |
| 249 """ | |
| 250 final_files = [] | |
| 251 for f in raw_dict.keys(): | |
| 252 temp_dir = temp_files['fasta_temp_dir'] if f == "fasta" else temp_files['assemble_temp_dir'] | |
| 253 | |
| 254 for ffile in raw_dict[f]: | |
| 255 filtered_file = filter_file_by_species( | |
| 256 ffile, f, temp_dir, verify=verify, species=species) | |
| 257 if filtered_file is not None and \ | |
| 258 genomeFunctions.get_valid_format(filtered_file) is not None: | |
| 259 final_files.append(filtered_file) | |
| 260 | |
| 261 LOG.info('{} final fasta files'.format(len(final_files))) | |
| 262 return final_files | |
| 263 | |
| 264 def filter_file_by_species(genome_file, genome_format, temp_dir, verify=False, species=False): | |
| 265 """filter ecoli, identify species, assemble fastq | |
| 266 Assemble fastq file to fasta file, | |
| 267 then filter the file by reference method if verify is enabled, | |
| 268 if identified as non-ecoli, identify species by mash method if species is enabled. | |
| 269 | |
| 270 Args: | |
| 271 genome_file: input genome file | |
| 272 genome_format(str): fasta or fastq | |
| 273 temp_file: temporary directory | |
| 274 verify(bool): | |
| 275 whether to perform ecoli verification | |
| 276 species(bool): | |
| 277 whether to perform species identification for non-ecoli genome | |
| 278 Returns: | |
| 279 The filtered and assembled genome files in fasta format | |
| 280 """ | |
| 281 combined_file = definitions.COMBINED | |
| 282 filtered_file = None | |
| 283 if genome_format == 'fastq': | |
| 284 iden_file, pred_file = \ | |
| 285 genomeFunctions.assemble_reads(genome_file, combined_file, temp_dir) | |
| 286 # If no alignment resut, the file is definitely not E.Coli | |
| 287 if genomeFunctions.get_valid_format(iden_file) is None: | |
| 288 LOG.warning( | |
| 289 "{} is filtered out because no identification alignment found".format(genome_file)) | |
| 290 return filtered_file | |
| 291 if not (verify or species) or speciesIdentification.is_ecoli_genome( | |
| 292 iden_file, genome_file, mash=species): | |
| 293 # final check before adding the alignment for prediction | |
| 294 if genomeFunctions.get_valid_format(iden_file) != 'fasta': | |
| 295 LOG.warning( | |
| 296 "{0} is filtered out because no prediction alignment found".format(genome_file)) | |
| 297 return filtered_file | |
| 298 filtered_file = pred_file | |
| 299 if genome_format == 'fasta': | |
| 300 if not (verify or species) \ | |
| 301 or speciesIdentification.is_ecoli_genome(genome_file, mash=species): | |
| 302 filtered_file = genome_file | |
| 303 return filtered_file |
