diff ecoli_serotyping/ectyper.py @ 6:fe3ceb5c4214 draft

Uploaded
author jpetteng
date Fri, 05 Jan 2018 15:43:14 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/ectyper.py	Fri Jan 05 15:43:14 2018 -0500
@@ -0,0 +1,303 @@
+#!/usr/bin/env python
+"""
+    Predictive serotyping for _E. coli_.
+"""
+import logging
+import os
+import sys
+import tempfile
+import datetime
+from urllib.request import urlretrieve
+
+from ectyper import (blastFunctions, commandLineOptions, definitions,
+                     genomeFunctions, loggingFunctions, predictionFunctions,
+                     speciesIdentification)
+
+LOG_FILE = loggingFunctions.initialize_logging()
+LOG = logging.getLogger(__name__)
+
+
+def run_program():
+    """
+    Wrapper for both the serotyping and virulence finder
+    The program needs to do the following
+    (1) Filter genome files based on format
+    (2) Filter genome files based on species
+    (3) Map FASTQ files to FASTA seq
+    (1) Get names of all genomes being tested
+    (2) Create a BLAST database of those genomes
+    (3) Query for serotype
+    (4) Parse the results
+    (5) Display the results
+    :return: success or failure
+
+    """
+    # Initialize the program
+    args = commandLineOptions.parse_command_line()
+    LOG.info('Starting ectyper\nSerotype prediction with input:\n \
+            {0}\n \
+            Log file is: {1}'.format(args, LOG_FILE))
+    LOG.debug(args)
+
+    ## Initialize temporary directories for the scope of this program
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_files = create_tmp_files(temp_dir, output_dir=args.output)
+        LOG.debug(temp_files)
+
+        # Download refseq file if speicies identification is enabled
+        if args.species:
+            download_refseq()
+
+        LOG.info("Gathering genome files")
+        raw_genome_files = genomeFunctions.get_files_as_list(args.input)
+        LOG.debug(raw_genome_files)
+
+        LOG.info("Removing invalid file types")
+        raw_files_dict = get_raw_files(raw_genome_files)
+        LOG.debug(raw_files_dict)
+
+        # Assembling fastq + verify ecoli genome
+        LOG.info("Preparing genome files for blast alignment")
+        final_fasta_files = filter_for_ecoli_files(
+            raw_files_dict, temp_files, verify=args.verify, species=args.species
+        )
+        LOG.debug(final_fasta_files)
+        if len(final_fasta_files) is 0:
+            LOG.info("No valid genome files. Terminating the program.")
+            exit(0)
+
+        LOG.info("Standardizing the genome headers")
+        (all_genomes_list, all_genomes_files) = \
+            genomeFunctions.get_genome_names_from_files(
+                final_fasta_files, temp_files['fasta_temp_dir'])
+        LOG.debug(all_genomes_list)
+        LOG.debug(all_genomes_files)
+
+        # Main prediction function
+        predictions_file = run_prediction(all_genomes_files, args,
+                                          temp_files['output_file'])
+
+        # Add empty rows for genomes without blast result
+        predictions_file = predictionFunctions.add_non_predicted(
+            all_genomes_list, predictions_file)
+        LOG.info('Outputs stored in {0}'.format(temp_files['output_dir']))
+
+        # Store most recent result in working directory
+        LOG.info('\nReporting result...')
+        predictionFunctions.report_result(predictions_file)
+
+def download_refseq():
+    '''Download refseq file with progress bar
+    '''
+
+
+    def reporthook(blocknum, blocksize, totalsize):
+        '''
+        https://stackoverflow.com/questions/15644964/python-progress-bar-and-downloads
+        '''
+        readsofar = blocknum * blocksize
+        if totalsize > 0:
+            s = "\r {:5.1%} {:{}d} / {:d}".format(
+                readsofar/totalsize, readsofar,
+                len(str(totalsize)),
+                totalsize
+            )
+            sys.stderr.write(s)
+            if readsofar >= totalsize: # near the end
+                sys.stderr.write("\n")
+        else: # total size is unknown
+            sys.stderr.write("read {}\n".format(readsofar))
+
+    if not os.path.isfile(definitions.REFSEQ_SKETCH):
+        refseq_url = 'https://gembox.cbcb.umd.edu/mash/refseq.genomes.k21s1000.msh'
+        LOG.info("No refseq found. Downloading reference file for species identification...")
+        urlretrieve(refseq_url, definitions.REFSEQ_SKETCH, reporthook)
+        LOG.info("Download complete.")
+
+def create_tmp_files(temp_dir, output_dir=None):
+    """Create a dictionary of temporary files used by ectyper
+
+    Args:
+        temp_dir: program scope temporary directory
+        output_dir(str, optional):
+            directory to store output
+
+    Return:
+        a dictionary of temporary files
+        example:
+            {'assemble_temp_dir': 'test/temp/assemblies',
+             'fasta_temp_dir': 'test/temp/fastas',
+             'output_dir': os.path.abspath('output')+'/',
+             'output_file': os.path.abspath('output/output.csv')}
+    """
+
+    # Get the correct files and directories
+    files_and_dirs = {
+        'assemble_temp_dir': os.path.join(temp_dir, 'assemblies'),
+        'fasta_temp_dir': os.path.join(temp_dir, 'fastas'),
+    }
+    if output_dir is None:
+        output_dir = ''.join([
+            str(datetime.datetime.now().date()),
+            '_',
+            str(datetime.datetime.now().time()).replace(':', '.')
+        ])
+    if os.path.isabs(output_dir):
+        pass
+    else:
+        output_dir = os.path.join(definitions.WORKPLACE_DIR, 'output', output_dir)
+
+    output_file = os.path.join(output_dir, 'output.csv')
+    if os.path.isfile(output_file):
+        os.remove(output_file)
+    for d in [
+            output_dir, files_and_dirs['assemble_temp_dir'],
+            files_and_dirs['fasta_temp_dir']
+    ]:
+        if not os.path.exists(d):
+            os.makedirs(d)
+
+    # Finalize the tmp_files dictionary
+    files_and_dirs['output_file'] = output_file
+    files_and_dirs['output_dir'] = output_dir
+
+    LOG.info("Temporary files and directory created")
+    return files_and_dirs
+
+
+def run_prediction(genome_files, args, predictions_file):
+    '''Core prediction functionality
+    
+    Args:
+        genome_files:
+            list of genome files
+        args:
+            commandline arguments
+        predictions_file:
+            filename of prediction output
+    
+    Returns:
+        predictions_file with prediction written in it
+    '''
+    query_file = definitions.SEROTYPE_FILE
+    ectyper_dict_file = definitions.SEROTYPE_ALLELE_JSON
+    # create a temp dir for blastdb
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Divide genome files into chunks
+        chunk_size = 50
+        genome_chunks = [
+            genome_files[i:i + chunk_size]
+            for i in range(0, len(genome_files), chunk_size)
+        ]
+        for index, chunk in enumerate(genome_chunks):
+            LOG.info("Start creating blast database #{0}".format(index + 1))
+            blast_db = blastFunctions.create_blast_db(chunk, temp_dir)
+
+            LOG.info("Start blast alignment on database #{0}".format(index + 1))
+            blast_output_file = blastFunctions.run_blast(query_file, blast_db, args)
+            LOG.info("Start serotype prediction for database #{0}".format(index + 1))
+            predictions_file = predictionFunctions.predict_serotype(
+                blast_output_file, ectyper_dict_file, predictions_file,
+                args.detailed)
+        return predictions_file
+
+
+def get_raw_files(raw_files):
+    """Take all the raw files, and filter not fasta / fastq
+    
+    Args:
+        raw_files(str): list of files from user input
+    
+    Returns:
+        A dictitionary collection of fasta and fastq files
+        example:
+        {'raw_fasta_files':[],
+         'raw_fastq_files':[]}
+    """
+    fasta_files = []
+    fastq_files = []
+
+    for file in raw_files:
+        file_format = genomeFunctions.get_valid_format(file)
+        if file_format == 'fasta':
+            fasta_files.append(file)
+        elif file_format == 'fastq':
+            fastq_files.append(file)
+
+    LOG.debug('raw fasta files: {}'.format(fasta_files))
+    LOG.debug('raw fastq files: {}'.format(fastq_files))
+
+    return({'fasta':fasta_files, 'fastq':fastq_files})
+
+
+def filter_for_ecoli_files(raw_dict, temp_files, verify=False, species=False):
+    """filter ecoli, identify species, assemble fastq
+    Assemble fastq files to fasta files,
+    then filter all files by reference method if verify is enabled,
+    if identified as non-ecoli, identify species by mash method if species is enabled.
+    
+    Args:
+        raw_dict{fasta:list_of_files, fastq:list_of_files}:
+            dictionary collection of fasta and fastq files
+        temp_file: temporary directory
+        verify(bool):
+            whether to perform ecoli verification
+        species(bool):
+            whether to perform species identification for non-ecoli genome
+    Returns:
+        List of filtered and assembled genome files in fasta format
+    """
+    final_files = []
+    for f in raw_dict.keys():
+        temp_dir = temp_files['fasta_temp_dir'] if f == "fasta" else temp_files['assemble_temp_dir']
+
+        for ffile in raw_dict[f]:
+            filtered_file = filter_file_by_species(
+                ffile, f, temp_dir, verify=verify, species=species)
+            if filtered_file is not None and \
+            genomeFunctions.get_valid_format(filtered_file) is not None:
+                final_files.append(filtered_file)
+
+    LOG.info('{} final fasta files'.format(len(final_files)))
+    return final_files
+
+def filter_file_by_species(genome_file, genome_format, temp_dir, verify=False, species=False):
+    """filter ecoli, identify species, assemble fastq
+    Assemble fastq file to fasta file,
+    then filter the file by reference method if verify is enabled,
+    if identified as non-ecoli, identify species by mash method if species is enabled.
+    
+    Args:
+        genome_file: input genome file
+        genome_format(str): fasta or fastq
+        temp_file: temporary directory
+        verify(bool):
+            whether to perform ecoli verification
+        species(bool):
+            whether to perform species identification for non-ecoli genome
+    Returns:
+        The filtered and assembled genome files in fasta format
+    """
+    combined_file = definitions.COMBINED
+    filtered_file = None
+    if genome_format == 'fastq':
+        iden_file, pred_file = \
+            genomeFunctions.assemble_reads(genome_file, combined_file, temp_dir)
+        # If no alignment resut, the file is definitely not E.Coli
+        if genomeFunctions.get_valid_format(iden_file) is None:
+            LOG.warning(
+                "{} is filtered out because no identification alignment found".format(genome_file))
+            return filtered_file
+        if not (verify or species) or speciesIdentification.is_ecoli_genome(
+                iden_file, genome_file, mash=species):
+            # final check before adding the alignment for prediction
+            if genomeFunctions.get_valid_format(iden_file) != 'fasta':
+                LOG.warning(
+                    "{0} is filtered out because no prediction alignment found".format(genome_file))
+                return filtered_file
+            filtered_file = pred_file
+    if genome_format == 'fasta':
+        if not (verify or species) \
+        or speciesIdentification.is_ecoli_genome(genome_file, mash=species):
+            filtered_file = genome_file
+    return filtered_file