diff ecoli_serotyping/blastFunctions.py @ 6:fe3ceb5c4214 draft

Uploaded
author jpetteng
date Fri, 05 Jan 2018 15:43:14 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ecoli_serotyping/blastFunctions.py	Fri Jan 05 15:43:14 2018 -0500
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+
+"""
+Functions for setting up, running, and parsing blast
+"""
+import logging
+import os
+
+from ectyper import subprocess_util
+
+LOG = logging.getLogger(__name__)
+
+
+def create_blast_db(filelist, temp_dir):
+    """http://stackoverflow.com/questions/23944657/typeerror-method-takes-1-positional-argument-but-2-were-given
+    Creating a blast DB using the makeblastdb command.
+    The database is created in the temporary folder of the system.
+
+    Args:
+        filelist: list of genomes that was given by the user on the command line.
+        temp_dir: temporary directory to store the blastdb in.
+
+    Returns:
+        Full path to the DB
+    """
+    blast_db_path = os.path.join(temp_dir, 'ectyper_blastdb')
+
+    LOG.debug("Generating the blast db at {0}".format(blast_db_path))
+    cmd = [
+        "makeblastdb",
+        "-in", ' '.join(filelist),
+        "-dbtype", "nucl",
+        "-title", "ectyper_blastdb",
+        "-out", blast_db_path]
+    subprocess_util.run_subprocess(cmd)
+
+    return blast_db_path
+
+
+def run_blast(query_file, blast_db, args):
+    """
+    Execute a blastn run given the query files and blastdb
+
+    Args:
+        query_file (str): one or both of the VF / Serotype input files
+        blast_db (str): validated fasta files from the user, in DB form
+        args (Namespace object): parsed commadnline options from the user
+        chunck_size: number of genomes in the database
+
+    Returns:
+        The blast output file
+    """
+    percent_identity = args.percentIdentity
+    percent_length = args.percentLength
+
+    LOG.debug('Running blast query {0} against database {1} '.format(
+        query_file, blast_db))
+
+    blast_output_file = blast_db + '.output'
+
+    cmd = [
+        "blastn",
+        "-query", query_file,
+        "-db", blast_db,
+        "-out", blast_output_file,
+        '-perc_identity', str(percent_identity),
+        '-qcov_hsp_perc', str(percent_length),
+        '-max_hsps', '1', # each allele only need to hit once
+        # use default max_target_seqs=500
+        "-outfmt",
+        '6 qseqid qlen sseqid length pident sstart send sframe qcovhsp',
+        "-word_size", "11"
+    ]
+    subprocess_util.run_subprocess(cmd)
+    with open(blast_output_file, mode='rb') as fh:
+        for line in fh:
+            LOG.debug(line.decode('ascii'))
+    return blast_output_file
+
+def run_blast_for_identification(query_file, blast_db):
+    """
+    Execute a blastn run given the query files and blastdb
+    with special configuration for high performance identification
+
+    Args:
+        query_file: one or both of the VF / Serotype input files
+        blast_db: validated fasta files from the user, in DB form
+
+    Returns:
+        blast_output_file (str): path to the blast output file
+    """
+
+    LOG.debug('Running blast query {0} against database {1} '.format(
+        query_file, blast_db))
+
+    blast_output_file = blast_db + '.output'
+
+    cmd = [
+        "blastn",
+        "-query", query_file,
+        "-db", blast_db,
+        "-out", blast_output_file,
+        '-perc_identity', '90',
+        '-qcov_hsp_perc', '90',
+        '-max_target_seqs', '1',  # we only want to know hit/no hit
+        # 10 query seq, we want at most 1 hit each
+        "-outfmt",
+        '6 qseqid qlen sseqid length pident sstart send sframe',
+        "-word_size", "11"
+    ]
+    subprocess_util.run_subprocess(cmd)
+
+    return blast_output_file