ectyper: ecoli_serotyping/ectyper.py comparison

comparison ecoli_serotyping/ectyper.py @ 6:fe3ceb5c4214 draft

Uploaded

author	jpetteng
date	Fri, 05 Jan 2018 15:43:14 -0500
parents
children

comparison

equal deleted inserted replaced

-:a202cc394af8
+:fe3ceb5c4214
+#!/usr/bin/env python
+"""
+Predictive serotyping for _E. coli_.
+"""
+import logging
+import os
+import sys
+import tempfile
+import datetime
+from urllib.request import urlretrieve
+from ectyper import (blastFunctions, commandLineOptions, definitions,
+genomeFunctions, loggingFunctions, predictionFunctions,
+speciesIdentification)
+LOG_FILE = loggingFunctions.initialize_logging()
+LOG = logging.getLogger(__name__)
+def run_program():
+"""
+Wrapper for both the serotyping and virulence finder
+The program needs to do the following
+(1) Filter genome files based on format
+(2) Filter genome files based on species
+(3) Map FASTQ files to FASTA seq
+(1) Get names of all genomes being tested
+(2) Create a BLAST database of those genomes
+(3) Query for serotype
+(4) Parse the results
+(5) Display the results
+:return: success or failure
+"""
+# Initialize the program
+args = commandLineOptions.parse_command_line()
+LOG.info('Starting ectyper\nSerotype prediction with input:\n \
+{0}\n \
+Log file is: {1}'.format(args, LOG_FILE))
+LOG.debug(args)
+## Initialize temporary directories for the scope of this program
+with tempfile.TemporaryDirectory() as temp_dir:
+temp_files = create_tmp_files(temp_dir, output_dir=args.output)
+LOG.debug(temp_files)
+# Download refseq file if speicies identification is enabled
+if args.species:
+download_refseq()
+LOG.info("Gathering genome files")
+raw_genome_files = genomeFunctions.get_files_as_list(args.input)
+LOG.debug(raw_genome_files)
+LOG.info("Removing invalid file types")
+raw_files_dict = get_raw_files(raw_genome_files)
+LOG.debug(raw_files_dict)
+# Assembling fastq + verify ecoli genome
+LOG.info("Preparing genome files for blast alignment")
+final_fasta_files = filter_for_ecoli_files(
+raw_files_dict, temp_files, verify=args.verify, species=args.species
+)
+LOG.debug(final_fasta_files)
+if len(final_fasta_files) is 0:
+LOG.info("No valid genome files. Terminating the program.")
+exit(0)
+LOG.info("Standardizing the genome headers")
+(all_genomes_list, all_genomes_files) = \
+genomeFunctions.get_genome_names_from_files(
+final_fasta_files, temp_files['fasta_temp_dir'])
+LOG.debug(all_genomes_list)
+LOG.debug(all_genomes_files)
+# Main prediction function
+predictions_file = run_prediction(all_genomes_files, args,
+temp_files['output_file'])
+# Add empty rows for genomes without blast result
+predictions_file = predictionFunctions.add_non_predicted(
+all_genomes_list, predictions_file)
+LOG.info('Outputs stored in {0}'.format(temp_files['output_dir']))
+# Store most recent result in working directory
+LOG.info('\nReporting result...')
+predictionFunctions.report_result(predictions_file)
+def download_refseq():
+'''Download refseq file with progress bar
+'''
+def reporthook(blocknum, blocksize, totalsize):
+'''
+https://stackoverflow.com/questions/15644964/python-progress-bar-and-downloads
+'''
+readsofar = blocknum * blocksize
+if totalsize > 0:
+s = "\r {:5.1%} {:{}d} / {:d}".format(
+readsofar/totalsize, readsofar,
+len(str(totalsize)),
+totalsize
+)
+sys.stderr.write(s)
+if readsofar >= totalsize: # near the end
+sys.stderr.write("\n")
+else: # total size is unknown
+sys.stderr.write("read {}\n".format(readsofar))
+if not os.path.isfile(definitions.REFSEQ_SKETCH):
+refseq_url = 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh'
+LOG.info("No refseq found. Downloading reference file for species identification...")
+urlretrieve(refseq_url, definitions.REFSEQ_SKETCH, reporthook)
+LOG.info("Download complete.")
+def create_tmp_files(temp_dir, output_dir=None):
+"""Create a dictionary of temporary files used by ectyper
+Args:
+temp_dir: program scope temporary directory
+output_dir(str, optional):
+directory to store output
+Return:
+a dictionary of temporary files
+example:
+{'assemble_temp_dir': 'test/temp/assemblies',
+'fasta_temp_dir': 'test/temp/fastas',
+'output_dir': os.path.abspath('output')+'/',
+'output_file': os.path.abspath('output/output.csv')}
+"""
+# Get the correct files and directories
+files_and_dirs = {
+'assemble_temp_dir': os.path.join(temp_dir, 'assemblies'),
+'fasta_temp_dir': os.path.join(temp_dir, 'fastas'),
+}
+if output_dir is None:
+output_dir = ''.join([
+str(datetime.datetime.now().date()),
+'_',
+str(datetime.datetime.now().time()).replace(':', '.')
+])
+if os.path.isabs(output_dir):
+pass
+else:
+output_dir = os.path.join(definitions.WORKPLACE_DIR, 'output', output_dir)
+output_file = os.path.join(output_dir, 'output.csv')
+if os.path.isfile(output_file):
+os.remove(output_file)
+for d in [
+output_dir, files_and_dirs['assemble_temp_dir'],
+files_and_dirs['fasta_temp_dir']
+]:
+if not os.path.exists(d):
+os.makedirs(d)
+# Finalize the tmp_files dictionary
+files_and_dirs['output_file'] = output_file
+files_and_dirs['output_dir'] = output_dir
+LOG.info("Temporary files and directory created")
+return files_and_dirs
+def run_prediction(genome_files, args, predictions_file):
+'''Core prediction functionality
+Args:
+genome_files:
+list of genome files
+args:
+commandline arguments
+predictions_file:
+filename of prediction output
+Returns:
+predictions_file with prediction written in it
+'''
+query_file = definitions.SEROTYPE_FILE
+ectyper_dict_file = definitions.SEROTYPE_ALLELE_JSON
+# create a temp dir for blastdb
+with tempfile.TemporaryDirectory() as temp_dir:
+# Divide genome files into chunks
+chunk_size = 50
+genome_chunks = [
+genome_files[i:i + chunk_size]
+for i in range(0, len(genome_files), chunk_size)
+]
+for index, chunk in enumerate(genome_chunks):
+LOG.info("Start creating blast database #{0}".format(index + 1))
+blast_db = blastFunctions.create_blast_db(chunk, temp_dir)
+LOG.info("Start blast alignment on database #{0}".format(index + 1))
+blast_output_file = blastFunctions.run_blast(query_file, blast_db, args)
+LOG.info("Start serotype prediction for database #{0}".format(index + 1))
+predictions_file = predictionFunctions.predict_serotype(
+blast_output_file, ectyper_dict_file, predictions_file,
+args.detailed)
+return predictions_file
+def get_raw_files(raw_files):
+"""Take all the raw files, and filter not fasta / fastq
+Args:
+raw_files(str): list of files from user input
+Returns:
+A dictitionary collection of fasta and fastq files
+example:
+{'raw_fasta_files':[],
+'raw_fastq_files':[]}
+"""
+fasta_files = []
+fastq_files = []
+for file in raw_files:
+file_format = genomeFunctions.get_valid_format(file)
+if file_format == 'fasta':
+fasta_files.append(file)
+elif file_format == 'fastq':
+fastq_files.append(file)
+LOG.debug('raw fasta files: {}'.format(fasta_files))
+LOG.debug('raw fastq files: {}'.format(fastq_files))
+return({'fasta':fasta_files, 'fastq':fastq_files})
+def filter_for_ecoli_files(raw_dict, temp_files, verify=False, species=False):
+"""filter ecoli, identify species, assemble fastq
+Assemble fastq files to fasta files,
+then filter all files by reference method if verify is enabled,
+if identified as non-ecoli, identify species by mash method if species is enabled.
+Args:
+raw_dict{fasta:list_of_files, fastq:list_of_files}:
+dictionary collection of fasta and fastq files
+temp_file: temporary directory
+verify(bool):
+whether to perform ecoli verification
+species(bool):
+whether to perform species identification for non-ecoli genome
+Returns:
+List of filtered and assembled genome files in fasta format
+"""
+final_files = []
+for f in raw_dict.keys():
+temp_dir = temp_files['fasta_temp_dir'] if f == "fasta" else temp_files['assemble_temp_dir']
+for ffile in raw_dict[f]:
+filtered_file = filter_file_by_species(
+ffile, f, temp_dir, verify=verify, species=species)
+if filtered_file is not None and \
+genomeFunctions.get_valid_format(filtered_file) is not None:
+final_files.append(filtered_file)
+LOG.info('{} final fasta files'.format(len(final_files)))
+return final_files
+def filter_file_by_species(genome_file, genome_format, temp_dir, verify=False, species=False):
+"""filter ecoli, identify species, assemble fastq
+Assemble fastq file to fasta file,
+then filter the file by reference method if verify is enabled,
+if identified as non-ecoli, identify species by mash method if species is enabled.
+Args:
+genome_file: input genome file
+genome_format(str): fasta or fastq
+temp_file: temporary directory
+verify(bool):
+whether to perform ecoli verification
+species(bool):
+whether to perform species identification for non-ecoli genome
+Returns:
+The filtered and assembled genome files in fasta format
+"""
+combined_file = definitions.COMBINED
+filtered_file = None
+if genome_format == 'fastq':
+iden_file, pred_file = \
+genomeFunctions.assemble_reads(genome_file, combined_file, temp_dir)
+# If no alignment resut, the file is definitely not E.Coli
+if genomeFunctions.get_valid_format(iden_file) is None:
+LOG.warning(
+"{} is filtered out because no identification alignment found".format(genome_file))
+return filtered_file
+if not (verify or species) or speciesIdentification.is_ecoli_genome(
+iden_file, genome_file, mash=species):
+# final check before adding the alignment for prediction
+if genomeFunctions.get_valid_format(iden_file) != 'fasta':
+LOG.warning(
+"{0} is filtered out because no prediction alignment found".format(genome_file))
+return filtered_file
+filtered_file = pred_file
+if genome_format == 'fasta':
+if not (verify or species) \
+or speciesIdentification.is_ecoli_genome(genome_file, mash=species):
+filtered_file = genome_file
+return filtered_file

Mercurial > repos > jpetteng > ectyper

comparison ecoli_serotyping/ectyper.py @ 6:fe3ceb5c4214 draft