Mercurial > repos > jpetteng > ectyper
diff ecoli_serotyping/blastFunctions.py @ 6:fe3ceb5c4214 draft
Uploaded
author | jpetteng |
---|---|
date | Fri, 05 Jan 2018 15:43:14 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ecoli_serotyping/blastFunctions.py Fri Jan 05 15:43:14 2018 -0500 @@ -0,0 +1,113 @@ +#!/usr/bin/env python + +""" +Functions for setting up, running, and parsing blast +""" +import logging +import os + +from ectyper import subprocess_util + +LOG = logging.getLogger(__name__) + + +def create_blast_db(filelist, temp_dir): + """http://stackoverflow.com/questions/23944657/typeerror-method-takes-1-positional-argument-but-2-were-given + Creating a blast DB using the makeblastdb command. + The database is created in the temporary folder of the system. + + Args: + filelist: list of genomes that was given by the user on the command line. + temp_dir: temporary directory to store the blastdb in. + + Returns: + Full path to the DB + """ + blast_db_path = os.path.join(temp_dir, 'ectyper_blastdb') + + LOG.debug("Generating the blast db at {0}".format(blast_db_path)) + cmd = [ + "makeblastdb", + "-in", ' '.join(filelist), + "-dbtype", "nucl", + "-title", "ectyper_blastdb", + "-out", blast_db_path] + subprocess_util.run_subprocess(cmd) + + return blast_db_path + + +def run_blast(query_file, blast_db, args): + """ + Execute a blastn run given the query files and blastdb + + Args: + query_file (str): one or both of the VF / Serotype input files + blast_db (str): validated fasta files from the user, in DB form + args (Namespace object): parsed commadnline options from the user + chunck_size: number of genomes in the database + + Returns: + The blast output file + """ + percent_identity = args.percentIdentity + percent_length = args.percentLength + + LOG.debug('Running blast query {0} against database {1} '.format( + query_file, blast_db)) + + blast_output_file = blast_db + '.output' + + cmd = [ + "blastn", + "-query", query_file, + "-db", blast_db, + "-out", blast_output_file, + '-perc_identity', str(percent_identity), + '-qcov_hsp_perc', str(percent_length), + '-max_hsps', '1', # each allele only need to hit once + # use default max_target_seqs=500 + "-outfmt", + '6 qseqid qlen sseqid length pident sstart send sframe qcovhsp', + "-word_size", "11" + ] + subprocess_util.run_subprocess(cmd) + with open(blast_output_file, mode='rb') as fh: + for line in fh: + LOG.debug(line.decode('ascii')) + return blast_output_file + +def run_blast_for_identification(query_file, blast_db): + """ + Execute a blastn run given the query files and blastdb + with special configuration for high performance identification + + Args: + query_file: one or both of the VF / Serotype input files + blast_db: validated fasta files from the user, in DB form + + Returns: + blast_output_file (str): path to the blast output file + """ + + LOG.debug('Running blast query {0} against database {1} '.format( + query_file, blast_db)) + + blast_output_file = blast_db + '.output' + + cmd = [ + "blastn", + "-query", query_file, + "-db", blast_db, + "-out", blast_output_file, + '-perc_identity', '90', + '-qcov_hsp_perc', '90', + '-max_target_seqs', '1', # we only want to know hit/no hit + # 10 query seq, we want at most 1 hit each + "-outfmt", + '6 qseqid qlen sseqid length pident sstart send sframe', + "-word_size", "11" + ] + subprocess_util.run_subprocess(cmd) + + return blast_output_file