changeset 7:b9146bf9de08 draft

Uploaded
author jpetteng
date Fri, 05 Jan 2018 15:58:51 -0500
parents fe3ceb5c4214
children 06e7c1355431
files ._ecoli_serotyping ecoli_serotyping/._bin ecoli_serotyping/bin/._blastFunctions.py ecoli_serotyping/bin/._commandLineOptions.py ecoli_serotyping/bin/._definitions.py ecoli_serotyping/bin/._genomeFunctions.py ecoli_serotyping/bin/._loggingFunctions.py ecoli_serotyping/bin/._predictionFunctions.py ecoli_serotyping/bin/._speciesIdentification.py ecoli_serotyping/bin/._subprocess_util.py ecoli_serotyping/bin/blastFunctions.py ecoli_serotyping/bin/commandLineOptions.py ecoli_serotyping/bin/definitions.py ecoli_serotyping/bin/genomeFunctions.py ecoli_serotyping/bin/loggingFunctions.py ecoli_serotyping/bin/predictionFunctions.py ecoli_serotyping/bin/speciesIdentification.py ecoli_serotyping/bin/subprocess_util.py
diffstat 18 files changed, 960 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
Binary file ._ecoli_serotyping has changed
Binary file ecoli_serotyping/._bin has changed
Binary file ecoli_serotyping/bin/._blastFunctions.py has changed
Binary file ecoli_serotyping/bin/._commandLineOptions.py has changed
Binary file ecoli_serotyping/bin/._definitions.py has changed
Binary file ecoli_serotyping/bin/._genomeFunctions.py has changed
Binary file ecoli_serotyping/bin/._loggingFunctions.py has changed
Binary file ecoli_serotyping/bin/._predictionFunctions.py has changed
Binary file ecoli_serotyping/bin/._speciesIdentification.py has changed
Binary file ecoli_serotyping/bin/._subprocess_util.py has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/bin/blastFunctions.py	Fri Jan 05 15:58:51 2018 -0500
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+
+"""
+Functions for setting up, running, and parsing blast
+"""
+import logging
+import os
+
+from ectyper import subprocess_util
+
+LOG = logging.getLogger(__name__)
+
+
+def create_blast_db(filelist, temp_dir):
+    """http://stackoverflow.com/questions/23944657/typeerror-method-takes-1-positional-argument-but-2-were-given
+    Creating a blast DB using the makeblastdb command.
+    The database is created in the temporary folder of the system.
+
+    Args:
+        filelist: list of genomes that was given by the user on the command line.
+        temp_dir: temporary directory to store the blastdb in.
+
+    Returns:
+        Full path to the DB
+    """
+    blast_db_path = os.path.join(temp_dir, 'ectyper_blastdb')
+
+    LOG.debug("Generating the blast db at {0}".format(blast_db_path))
+    cmd = [
+        "makeblastdb",
+        "-in", ' '.join(filelist),
+        "-dbtype", "nucl",
+        "-title", "ectyper_blastdb",
+        "-out", blast_db_path]
+    subprocess_util.run_subprocess(cmd)
+
+    return blast_db_path
+
+
+def run_blast(query_file, blast_db, args):
+    """
+    Execute a blastn run given the query files and blastdb
+
+    Args:
+        query_file (str): one or both of the VF / Serotype input files
+        blast_db (str): validated fasta files from the user, in DB form
+        args (Namespace object): parsed commadnline options from the user
+        chunck_size: number of genomes in the database
+
+    Returns:
+        The blast output file
+    """
+    percent_identity = args.percentIdentity
+    percent_length = args.percentLength
+
+    LOG.debug('Running blast query {0} against database {1} '.format(
+        query_file, blast_db))
+
+    blast_output_file = blast_db + '.output'
+
+    cmd = [
+        "blastn",
+        "-query", query_file,
+        "-db", blast_db,
+        "-out", blast_output_file,
+        '-perc_identity', str(percent_identity),
+        '-qcov_hsp_perc', str(percent_length),
+        '-max_hsps', '1', # each allele only need to hit once
+        # use default max_target_seqs=500
+        "-outfmt",
+        '6 qseqid qlen sseqid length pident sstart send sframe qcovhsp',
+        "-word_size", "11"
+    ]
+    subprocess_util.run_subprocess(cmd)
+    with open(blast_output_file, mode='rb') as fh:
+        for line in fh:
+            LOG.debug(line.decode('ascii'))
+    return blast_output_file
+
+def run_blast_for_identification(query_file, blast_db):
+    """
+    Execute a blastn run given the query files and blastdb
+    with special configuration for high performance identification
+
+    Args:
+        query_file: one or both of the VF / Serotype input files
+        blast_db: validated fasta files from the user, in DB form
+
+    Returns:
+        blast_output_file (str): path to the blast output file
+    """
+
+    LOG.debug('Running blast query {0} against database {1} '.format(
+        query_file, blast_db))
+
+    blast_output_file = blast_db + '.output'
+
+    cmd = [
+        "blastn",
+        "-query", query_file,
+        "-db", blast_db,
+        "-out", blast_output_file,
+        '-perc_identity', '90',
+        '-qcov_hsp_perc', '90',
+        '-max_target_seqs', '1',  # we only want to know hit/no hit
+        # 10 query seq, we want at most 1 hit each
+        "-outfmt",
+        '6 qseqid qlen sseqid length pident sstart send sframe',
+        "-word_size", "11"
+    ]
+    subprocess_util.run_subprocess(cmd)
+
+    return blast_output_file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/bin/commandLineOptions.py	Fri Jan 05 15:58:51 2018 -0500
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+
+import argparse
+
+
+def parse_command_line(args=None):
+    """
+    The options for both the serotyper, and virulence finder.
+    The returned object is used by both, but the options do not
+    necessarily apply to both.
+
+    Args:
+        args: Optional args to be passed to argparse.parse_args()
+
+    Returns:
+        The populated argparse Namespace
+    """
+
+    def check_percentage(value):
+        """
+        type checker for percentage input
+        """
+        ivalue = int(value)
+        if ivalue <= 0 or ivalue > 100:
+            raise argparse.ArgumentTypeError(
+                "{0} is an invalid positive int percentage value".format(value)
+            )
+        return ivalue
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-i",
+        "--input",
+        help="Location of new file(s). Can be a single file or \
+            a directory",
+        required=True
+    )
+
+    parser.add_argument(
+        "-d",
+        "--percentIdentity",
+        type=check_percentage,
+        help="Percentage of identity wanted to use against the\
+                  database. From 0 to 100, default is 90%%.",
+        default=90
+    )
+
+    parser.add_argument(
+        "-l",
+        "--percentLength",
+        type=check_percentage,
+        help="Percentage of length wanted to use against the \
+                  database. From 0 to 100, default is 50%%.",
+        default=50
+    )
+
+    parser.add_argument(
+        "--verify",
+        action="store_true",
+        help="Enable E. Coli. verification"
+    )
+
+    parser.add_argument(
+        "-s",
+        "--species",
+        action="store_true",
+        help="Enable species identification when non-ecoli genome is found\n\
+            Note: refseq downloading is required when running this option for the first time."
+    )
+
+    parser.add_argument(
+        '--detailed',
+        action='store_true',
+        help='Enable detailed output'
+    )
+
+    parser.add_argument(
+        "-o",
+        "--output",
+        help="Directory location of output files."
+    )
+
+    if args is None:
+        return parser.parse_args()
+    return parser.parse_args(args)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/bin/definitions.py	Fri Jan 05 15:58:51 2018 -0500
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+
+"""
+    Definitions for the ectyper project
+"""
+
+import os
+
+ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+DATA_DIR = os.path.join(ROOT_DIR, 'Data')
+WORKPLACE_DIR = os.getcwd()
+
+SEROTYPE_FILE = os.path.join(DATA_DIR, 'ectyper_data.fasta')
+SEROTYPE_ALLELE_JSON = os.path.join(DATA_DIR, 'ectyper_dict.json')
+COMBINED = os.path.join(DATA_DIR, 'combined.fasta')
+
+ECOLI_MARKERS = os.path.join(DATA_DIR, 'ecoli_specific_markers.fasta')
+SAMTOOLS = 'samtools'
+REFSEQ_SUMMARY = os.path.join(DATA_DIR, 'assembly_summary_refseq.txt')
+REFSEQ_SKETCH = os.path.join(DATA_DIR, 'refseq.genomes.k21s1000.msh')
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/bin/genomeFunctions.py	Fri Jan 05 15:58:51 2018 -0500
@@ -0,0 +1,280 @@
+'''
+Genome Utilities
+'''
+#!/usr/bin/env python
+
+import logging
+import os
+import re
+import tempfile
+from tarfile import is_tarfile
+
+from Bio import SeqIO
+from ectyper import definitions, subprocess_util
+
+LOG = logging.getLogger(__name__)
+
+
+def get_files_as_list(file_or_directory):
+    """
+    Creates a list of files from either the given file, or all files within the
+    directory specified (where each file name is its absolute path).
+
+    Args:
+        file_or_directory (str): file or directory name given on commandline
+
+    Returns:
+        files_list (list(str)): List of all the files found.
+
+    """
+
+    files_list = []
+    if file_or_directory == '':
+        return files_list
+
+    if os.path.isdir(file_or_directory):
+        LOG.info("Gathering genomes from directory " + file_or_directory)
+
+        # Create a list containing the file names
+        for root, dirs, files in os.walk(file_or_directory):
+            for filename in files:
+                files_list.append(os.path.join(root, filename))
+    # check if input is concatenated file locations
+    elif ',' in file_or_directory:
+        LOG.info("Using genomes in the input list")
+        for filename in file_or_directory.split(','):
+            files_list.append(os.path.abspath(filename))
+    else:
+        LOG.info("Using genomes in file " + file_or_directory)
+        files_list.append(os.path.abspath(file_or_directory))
+
+    return sorted(files_list)
+
+
+def get_valid_format(file):
+    """
+    Check using SeqIO if files are valid fasta/fastq format, returns the format.
+
+    Args:
+        file (str): path of file
+
+    Returns:
+        fm (str or None): the file format if 'fasta' or 'fastq', otherwise None
+    """
+    for fm in ['fastq', 'fasta']:
+        try:
+            with open(file, "r") as handle:
+                data = SeqIO.parse(handle, fm)
+                if any(data):
+                    if is_tarfile(file):
+                        LOG.warning("Compressed file is not supported: {}".format(file))
+                        return None
+                    return fm
+        except FileNotFoundError as err:
+            LOG.warning("{0} is not found".format(file))
+            return None
+        except UnicodeDecodeError as err:
+            LOG.warning("{0} is not a valid file".format(file))
+            return None
+        except:
+            LOG.warning("{0} is an unexpected file".format(file))
+            return None
+    LOG.warning("{0} is not a fasta/fastq file".format(file))
+    return None
+
+
+def get_genome_names_from_files(files, temp_dir):
+    """
+    For each file:
+    Takes the first header from a fasta file and sends it to the get_genome_name
+    function. Returns the name of the genome. If the name of the file is to be
+    used as the genome name, creates a temporary file using >lcl|filename as the
+    first part of the header.
+
+    Args:
+        files (list): The list of files to get the genome names for
+        tempdir (str): A tempdir where the copied files will be stored
+
+    Returns:
+        tuple(list(str), list(str)): first list is genome names, second list is file names
+    """
+
+    list_of_genomes = []
+    list_of_files = []
+    for file in files:
+        # Always to use the filename as the genome name.
+        # This means we also need to create a new file adding
+        # the filename to each of the headers, so that downstream applications
+        # (eg. BLAST) can be used with the filename as genome name.
+
+        # get only the name of the file for use in the fasta header
+        file_base_name = os.path.basename(file)
+        file_path_name = os.path.splitext(file_base_name)[0]
+        n_name = file_path_name.replace(' ', '_')
+
+        # create a new file for the updated fasta headers
+        new_file = tempfile.NamedTemporaryFile(dir=temp_dir, delete=False).name
+
+        # add the new name to the list of files and genomes
+        list_of_files.append(new_file)
+        list_of_genomes.append(n_name)
+
+        with open(new_file, "w") as outfile:
+            with open(file) as infile:
+                for record in SeqIO.parse(infile, "fasta"):
+                    outfile.write(">lcl|" + n_name + "|" + record.description + "\n")
+                    outfile.write(str(record.seq) + "\n")
+
+    return list_of_genomes, list_of_files
+
+
+def get_genome_name(header):
+    """
+    Getting the name of the genome by hierarchy. This requires reading the first
+    fasta header from the file. It also assumes a single genome per file.
+
+    Args:
+        header (str): The header containing the record.
+
+    Returns:
+        genomeName (str): Name of the genome contained in the header.
+    """
+
+    re_patterns = (
+        # Look for lcl followed by the possible genome name
+        re.compile('(lcl\|[\w\-\.]+)'),
+
+        # Look for contigs in the wwwwdddddd format
+        re.compile('([A-Za-z]{4}\d{2})\d{6}'),
+
+        # Look for a possible genome name at the beginning of the record ID
+        re.compile('^(\w{8}\.\d)'),
+
+        # Look for ref, gb, emb or dbj followed by the possible genome name
+        re.compile('(ref\|\w{2}_\w{6}|gb\|\w{8}|emb\|\w{8}|dbj\|\w{8})'),
+
+        # Look for gi followed by the possible genome name
+        re.compile('(gi\|\d{8})'),
+
+
+        # Look for name followed by space, then description
+        re.compile('^([\w\-\.]+)\s+[\w\-\.]+')
+    )
+
+    # if nothing matches, use the full header as genome_name
+    genome_name = header
+    for rep in re_patterns:
+        m = rep.search(header)
+
+        if m:
+            genome_name = m.group(1)
+            break
+
+    return str(genome_name)
+
+def assemble_reads(reads, reference, temp_dir):
+    '''
+    Return path to assembled reads.
+    Assemble short reads by mapping to a reference genome.
+    Default output is the same as reads file
+        (basename+'iden.fasta' and basename+'pred.fasta').
+
+    Args:
+        reads (str): FASTQ/FQ format reads file
+        reference (str): FASTA format reference file
+        temp_dir (str): temp_dir for storing assembled files
+
+    Returns:
+        tuple(str, str): identifcation and prediction fasta file
+    '''
+    output = os.path.join(
+        temp_dir,
+        os.path.splitext(os.path.basename(reads))[0]+'.fasta'
+    )
+
+    # run once if index reference does not exist
+    index_path = \
+        os.path.join(
+            definitions.DATA_DIR,
+            'bowtie_index',
+            os.path.splitext(os.path.basename(reference))[0]
+        )
+    index_dir = os.path.split(index_path)[0]
+    if not os.path.isdir(index_dir):
+        LOG.info('Reference index does not exist. Creating new reference index at {}'.format(index_dir))
+        if not os.path.exists(index_dir):
+            os.makedirs(index_dir)
+        cmd1 = [
+            'bowtie2-build',
+            reference,
+            index_path
+        ]
+        subprocess_util.run_subprocess(cmd1)
+
+    cmd2 = [
+        'bowtie2',
+        '--score-min L,1,-0.5',
+        '--np 5',
+        '-x', index_path,
+        '-U', reads,
+        '-S', os.path.join(temp_dir, 'reads.sam')
+    ]
+    subprocess_util.run_subprocess(cmd2)
+
+    cmd3 = [
+        definitions.SAMTOOLS, 'view',
+        '-F 4',
+        '-q 1',
+        '-bS', os.path.join(temp_dir, 'reads.sam'),
+        '-o', os.path.join(temp_dir, 'reads.bam')
+    ]
+    subprocess_util.run_subprocess(cmd3)
+    cmd4 = [
+        definitions.SAMTOOLS, 'sort',
+        os.path.join(temp_dir, 'reads.bam'),
+        '-o', os.path.join(temp_dir, 'reads.sorted.bam'),
+    ]
+    subprocess_util.run_subprocess(cmd4)
+
+    shell_cmd = [
+        definitions.SAMTOOLS+' mpileup -uf', # mpileup
+        reference,
+        os.path.join(temp_dir, 'reads.sorted.bam'),
+        '|',
+        'bcftools call -c', # variant calling
+        '|',
+        'vcfutils.pl vcf2fq', # vcf to fq
+        '|',
+        'seqtk seq -A -', # fq to fasta
+        '>',
+        output
+    ]
+    subprocess_util.run_subprocess(' '.join(shell_cmd), is_shell=True)
+    return split_mapped_output(output)
+
+def split_mapped_output(file):
+    '''
+    Splits given fasta files into two file based on 'lcl' tags
+        in the seq header
+
+    Args:
+        file (str): path to input fasta file
+
+    Returns:
+        (str): path to ecoli identification fasta seq
+        (str): path to serotype prediction fasta seq
+    '''
+    identif_file = os.path.splitext(file)[0]+'.iden.fasta'
+    predict_file = os.path.splitext(file)[0]+'.pred.fasta'
+    identif_seqs = []
+    predict_seqs = []
+    for record in SeqIO.parse(file, 'fasta'):
+        if 'lcl' in record.description:
+            identif_seqs.append(record)
+        else:
+            predict_seqs.append(record)
+    with open(identif_file, "w") as output_handle:
+        SeqIO.write(identif_seqs, output_handle, "fasta")
+    with open(predict_file, "w") as output_handle:
+        SeqIO.write(predict_seqs, output_handle, "fasta")
+    return identif_file, predict_file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/bin/loggingFunctions.py	Fri Jan 05 15:58:51 2018 -0500
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+"""
+    Set up the logging
+"""
+
+import logging
+import os
+
+import ectyper.definitions as D
+
+
+def initialize_logging():
+    """
+    Set up the screen and file logging.
+
+    Args:
+        None
+        
+    Returns:
+        log_file (str): The log filename
+    """
+
+    # set up DEBUG logging to file, INFO logging to STDERR
+    log_file = os.path.join(D.WORKPLACE_DIR, 'ectyper.log')
+
+    formatter = logging.Formatter(
+        '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
+
+    # logging to file
+    logging.basicConfig(
+        level=logging.DEBUG,
+        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
+        datefmt='%m-%d %H:%M',
+        filename=log_file,
+        filemode='w')
+
+    # define a Handler which writes INFO messages or higher to the sys.stderr
+    console = logging.StreamHandler()
+    console.setFormatter(formatter)
+    console.setLevel(logging.INFO)
+
+    # add the handler to the root logger
+    logging.getLogger('').addHandler(console)
+
+    return log_file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/bin/predictionFunctions.py	Fri Jan 05 15:58:51 2018 -0500
@@ -0,0 +1,231 @@
+#!/usr/bin/env python
+
+import json
+import logging
+import os
+from collections import defaultdict
+
+import pandas as pd
+
+LOG = logging.getLogger(__name__)
+
+"""
+    Serotype prediction for E. coli
+"""
+
+def predict_serotype(blast_output_file, ectyper_dict_file, predictions_file, detailed=False):
+    """Make serotype prediction for all genomes based on blast output
+    
+    Args:
+        blast_output_file(str):
+            blastn output with outfmt:
+                "6 qseqid qlen sseqid length pident sstart send sframe qcovhsp -word_size 11"
+    ectyper_dict_file(str):
+        mapping file used to associate allele id to allele informations
+    predictions_file(str):
+        csv file to store result
+    detailed(bool, optional):
+        whether to generate detailed output or not
+    
+    Returns:
+        predictions_file
+    """
+    basename, extension = os.path.splitext(predictions_file)
+    parsed_output_file = ''.join([basename, '_raw', extension])
+
+    LOG.info("Predicting serotype from blast output")
+    output_df = blast_output_to_df(blast_output_file)
+    ectyper_df = ectyper_dict_to_df(ectyper_dict_file)
+    # Merge output_df and ectyper_df
+    output_df = output_df.merge(ectyper_df, left_on='qseqid', right_on='name', how='left')
+    predictions_dict = {}
+    # Select individual genomes
+    output_df['genome_name'] = output_df['sseqid'].str.split('|').str[1]
+    # Initialize constants
+    gene_pairs = {'wzx':'wzy', 'wzy':'wzx', 'wzm':'wzt', 'wzt':'wzm'}
+    predictions_columns = ['O_prediction', 'O_info', 'H_prediction', 'H_info']
+    gene_list = ['wzx', 'wzy', 'wzm', 'wzt', 'fliC', 'fllA', 'flkA', 'flmA', 'flnA']
+    if detailed:
+        # Add gene lists for detailed output report
+        for gene in gene_list:
+            predictions_columns.append(gene)
+    for genome_name, per_genome_df in output_df.groupby('genome_name'):
+        # Make prediction for each genome based on blast output
+        predictions_dict[genome_name] = get_prediction(
+            per_genome_df, predictions_columns, gene_pairs, detailed)
+    predictions_df = pd.DataFrame(predictions_dict).transpose()
+    if predictions_df.empty:
+        predictions_df = pd.DataFrame(columns=predictions_columns)
+    predictions_df = predictions_df[predictions_columns]
+    store_df(output_df, parsed_output_file)
+    store_df(predictions_df, predictions_file)
+    LOG.info("Serotype prediction completed")
+    return predictions_file
+
+def get_prediction(per_genome_df, predictions_columns, gene_pairs, detailed, ):
+    """Make serotype prediction for single genomes based on blast output
+    
+    Args:
+        per_genome_df(DataFrame):
+            blastn outputs for the given genome
+        predictions_columns(dict):
+            columns to be filled by prediction function
+        gene_pairs(dict):
+            dict of pair genes used for paired logic
+        detailed(bool):
+            whether to generate detailed output or not
+    Return:
+        Prediction dictionary for the given genome
+    """
+    # Extract potential predictors
+    useful_columns = [
+        'gene', 'serotype', 'score', 'name', 'desc', 'pident', 'qcovhsp', 'qseqid', 'sseqid'
+    ]
+    per_genome_df = per_genome_df.sort_values(['gene', 'serotype', 'score'], ascending=False)
+    per_genome_df = per_genome_df[~per_genome_df.duplicated(['gene', 'serotype'])]
+    predictors_df = per_genome_df[useful_columns]
+    predictors_df = predictors_df.sort_values('score', ascending=False)
+    predictions = {}
+    for column in predictions_columns:
+        predictions[column] = None
+    for predicting_antigen in ['O', 'H']:
+        genes_pool = defaultdict(list)
+        for index, row in predictors_df.iterrows():
+            gene = row['gene']
+            if detailed:
+                predictions[gene] = True
+            if not predictions[predicting_antigen+'_prediction']:
+                serotype = row['serotype']
+                if serotype[0] is not predicting_antigen:
+                    continue
+                genes_pool[gene].append(serotype)
+                prediction = None
+                if len(serotype) < 1:
+                    continue
+                antigen = serotype[0].upper()
+                if antigen != predicting_antigen:
+                    continue
+                if gene in gene_pairs.keys():
+                    predictions[antigen+'_info'] = 'Only unpaired alignments found'
+                    # Pair gene logic
+                    potential_pairs = genes_pool.get(gene_pairs.get(gene))
+                    if potential_pairs is None:
+                        continue
+                    if serotype in potential_pairs:
+                        prediction = serotype
+                else:
+                    # Normal logic
+                    prediction = serotype
+                if prediction is None:
+                    continue
+                predictions[antigen+'_info'] = 'Alignment found'
+                predictions[predicting_antigen+'_prediction'] = prediction
+        # No alignment found
+        ## Make prediction based on non-paired gene
+        ##   if only one non-paired gene is avaliable
+        if predictions.get(predicting_antigen+'_prediction') is None:
+            if len(genes_pool) == 1:
+                serotypes = list(genes_pool.values())[0]
+                if len(serotypes) == 1:
+                    predictions[antigen+'_info'] = 'Lone unpaired alignment found'
+                    predictions[predicting_antigen+'_prediction'] = serotypes[0]
+    return predictions
+
+def blast_output_to_df(blast_output_file):
+    '''Convert raw blast output file to DataFrame
+    Args:
+        blast_output_file(str): location of blast output
+    
+    Returns:
+        DataFrame:
+            DataFrame that contains all informations from blast output
+    '''
+    # Load blast output
+    output_data = []
+    with open(blast_output_file, 'r') as fh:
+        for line in fh:
+            fields = line.strip().split()
+            entry = {
+                'qseqid': fields[0],
+                'qlen': fields[1],
+                'sseqid': fields[2],
+                'length': fields[3],
+                'pident': fields[4],
+                'sstart': fields[5],
+                'send': fields[6],
+                'sframe': fields[7],
+                'qcovhsp': fields[8]
+            }
+            output_data.append(entry)
+    df = pd.DataFrame(output_data)
+    if not output_data:
+        LOG.info("No hit found for this blast query")
+        # Return empty dataframe with correct columns
+        return pd.DataFrame(
+            columns=[
+                'length', 'pident', 'qcovhsp',
+                'qlen', 'qseqid', 'send',
+                'sframe', 'sseqid', 'sstart'
+            ])
+    df['score'] = df['pident'].astype(float)*df['qcovhsp'].astype(float)/10000
+    return df
+
+def ectyper_dict_to_df(ectyper_dict_file):
+    # Read ectyper dict
+    with open(ectyper_dict_file) as fh:
+        ectyper_dict = json.load(fh)
+        temp_list = []
+        for antigen, alleles in ectyper_dict.items():
+            for name, allele in alleles.items():
+                new_entry = {
+                    'serotype': allele.get('allele'),
+                    'name': name,
+                    'gene': allele.get('gene'),
+                    'desc': allele.get('desc')
+                }
+                temp_list.append(new_entry)
+        df = pd.DataFrame(temp_list)
+        return df
+
+def store_df(src_df, dst_file):
+    """Append dataframe to a file if it exists, otherwise, make a new file
+    Args:
+        src_df(str): dataframe object to be stored
+        dst_file(str): dst_file to be modified/created
+    """
+    if os.path.isfile(dst_file):
+        with open(dst_file, 'a') as fh:
+            src_df.to_csv(fh, header=False)
+    else:
+        with open(dst_file, 'w') as fh:
+            src_df.to_csv(fh, header=True, index_label='genome')
+
+def report_result(csv_file):
+    '''Report the content of dataframe in log
+    
+    Args:
+        csv_file(str): location of the prediction file
+    '''
+    df = pd.read_csv(csv_file)
+    if df.empty:
+        LOG.info('No prediction was made becuase no alignment was found')
+        return
+    LOG.info('\n{0}'.format(df.to_string(index=False)))
+
+def add_non_predicted(all_genomes_list, predictions_file):
+    '''Add genomes that do not show up in blast result to prediction file
+    
+    Args:
+        all_genome_list(list):
+            list of genomes from user input
+        predictions_file(str):
+            location of the prediction file
+    Returns:
+        str: location of the prediction file
+    '''
+    df = pd.read_csv(predictions_file)
+    df = df.merge(pd.DataFrame(all_genomes_list, columns=['genome']), on='genome', how='outer')
+    df.fillna({'O_info':'No alignment found', 'H_info':'No alignment found'}, inplace=True)
+    df.fillna('-', inplace=True)
+    df.to_csv(predictions_file, index=False)
+    return predictions_file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/bin/speciesIdentification.py	Fri Jan 05 15:58:51 2018 -0500
@@ -0,0 +1,152 @@
+import logging
+import multiprocessing
+import os
+import tempfile
+from Bio import SeqIO
+
+from ectyper import genomeFunctions, blastFunctions, definitions, subprocess_util
+
+LOG = logging.getLogger(__name__)
+
+def is_ecoli_genome(iden_file, genome_file=None, mash=False):
+    '''
+    Return True if file is classified as ecoli by ecoli markers, otherwise False
+
+    Args:
+        iden_file (str): path to valid fasta genome file
+        genome_file (str): Optional path to valid fastq file for reads
+        mash (bool): Optional input to decide whether to use mash if genome is
+                     identified as non-ecoli
+
+    Returns:
+        bool: True if iden_file is ecoli, False otherwise
+    '''
+    if genome_file is None:
+        genome_file = iden_file
+    num_hit = get_num_hits(iden_file)
+    if num_hit < 3:
+        LOG.info(
+            "{0} is identified as "
+            "an invalid e.coli genome file "
+            "by marker approach".format(os.path.basename(iden_file)))
+        if mash:
+            species = get_species(genome_file)
+            LOG.info(
+                "{0} is identified as genome of "
+                "{1} by mash approach".format(os.path.basename(iden_file), species))
+        return False
+    LOG.debug("{0} is a valid e.coli genome file".format(os.path.basename(iden_file)))
+    return True
+
+def get_num_hits(target):
+    '''
+    Return number of matching hits when query the reference genome
+        on the target genome
+
+    Args:
+        target (str): target genome
+
+    Returns:
+        int: number of hits found
+    '''
+    num_hit = 0
+    try:
+        with tempfile.TemporaryDirectory() as temp_dir:
+            blast_db = blastFunctions.create_blast_db([target], temp_dir)
+            result = blastFunctions.run_blast_for_identification(
+                definitions.ECOLI_MARKERS,
+                blast_db
+            )
+            with open(result) as handler:
+                LOG.debug("get_num_hits() output:")
+                for line in handler:
+                    LOG.debug(line)
+                    num_hit += 1
+            LOG.debug("{0} aligned to {1} marker sequences".format(target, num_hit))
+    except SystemExit:
+        pass
+    return num_hit
+
+def get_species(file):
+    '''
+    Given a fasta/fastq file, return the most likely species identification
+
+    Args:
+        file (str): fasta/fastq file input
+
+    Returns:
+        str: name of estimated species
+    '''
+    LOG.info("Identifying species for {0}".format(file))
+    if not os.path.isfile(definitions.REFSEQ_SKETCH):
+        LOG.warning("No refseq found.")
+        return None
+    species = 'unknown'
+    if genomeFunctions.get_valid_format(file) == 'fasta':
+        tmp_file = tempfile.NamedTemporaryFile().name
+        basename = os.path.basename(file).replace(' ', '_')
+        with open(tmp_file, 'w') as new_fh:
+            header = '> {0}\n'.format(basename)
+            new_fh.write(header)
+            for record in SeqIO.parse(file, 'fasta'):
+                new_fh.write(str(record.seq))
+                new_fh.write('nnnnnnnnnnnnnnnnnnnn')
+        try:
+            species = get_species_helper(tmp_file)
+        except Exception:
+            pass
+    if genomeFunctions.get_valid_format(file) == 'fastq':
+        species = get_species_helper(file)
+    return species
+
+def get_species_helper(file):
+    '''
+    Given a fasta/fastq file with one sequence, return the most likely species
+    identification
+
+    Args:
+        file (str): fasta/fastq file input
+
+    Returns:
+        str: name of estimated species
+    '''
+    species = 'unknown'
+    cmd = [
+        'mash', 'dist',
+        file,
+        definitions.REFSEQ_SKETCH,
+        '|',
+        'sort -gk3 -',
+        '|',
+        'head -1 -'
+    ]
+    try:
+        mash_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True)
+        ass_acc_num = '_'.join(mash_output.split('\t')[1].split('_')[:2])
+        cmd = [
+            'grep -E',
+            ass_acc_num,
+            definitions.REFSEQ_SUMMARY
+        ]
+        summary_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True)
+        species = summary_output.split('\t')[7]
+        return species
+    except Exception as err:
+        LOG.warning('No matching species found with distance estimation:{0}'.format(err))
+        try:
+            cmd = [
+                'mash screen',
+                '-w',
+                '-p', str(multiprocessing.cpu_count()//2),
+                definitions.REFSEQ_SKETCH,
+                file,
+                '| sort -gr - | head -1 -'
+            ]
+            screen_output = subprocess_util.run_subprocess(' '.join(cmd), is_shell=True)
+            LOG.debug(screen_output.split('\t'))
+            species = screen_output.split('\t')[5].split('\n')[0]
+        except Exception as err2:
+            LOG.warning(
+                'No matching species found with distance screening either:{}'.format(err2)
+            )
+    return species
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/bin/subprocess_util.py	Fri Jan 05 15:58:51 2018 -0500
@@ -0,0 +1,34 @@
+'''
+Utilities
+'''
+import logging
+import subprocess
+import timeit
+
+LOG = logging.getLogger(__name__)
+
+def run_subprocess(cmd, is_shell=False):
+    '''
+    Run cmd command on subprocess
+
+    Args:
+        cmd (str): cmd command
+
+    Returns:
+        stdout (str): The stdout of cmd
+    '''
+    start_time = timeit.default_timer()
+    LOG.debug("Running: {0}".format(cmd))
+    comp_proc = subprocess.run(
+        cmd,
+        shell=is_shell,
+        universal_newlines=True,
+        check=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE
+    )
+    stderr = comp_proc.stderr
+    stdout = comp_proc.stdout
+    elapsed_time = timeit.default_timer() - start_time
+    LOG.debug("Subprocess finished successfully in {:0.3f} sec.".format(elapsed_time))
+    return stdout