| 6 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 """ | 
|  | 4 Functions for setting up, running, and parsing blast | 
|  | 5 """ | 
|  | 6 import logging | 
|  | 7 import os | 
|  | 8 | 
|  | 9 from ectyper import subprocess_util | 
|  | 10 | 
|  | 11 LOG = logging.getLogger(__name__) | 
|  | 12 | 
|  | 13 | 
|  | 14 def create_blast_db(filelist, temp_dir): | 
|  | 15     """http://stackoverflow.com/questions/23944657/typeerror-method-takes-1-positional-argument-but-2-were-given | 
|  | 16     Creating a blast DB using the makeblastdb command. | 
|  | 17     The database is created in the temporary folder of the system. | 
|  | 18 | 
|  | 19     Args: | 
|  | 20         filelist: list of genomes that was given by the user on the command line. | 
|  | 21         temp_dir: temporary directory to store the blastdb in. | 
|  | 22 | 
|  | 23     Returns: | 
|  | 24         Full path to the DB | 
|  | 25     """ | 
|  | 26     blast_db_path = os.path.join(temp_dir, 'ectyper_blastdb') | 
|  | 27 | 
|  | 28     LOG.debug("Generating the blast db at {0}".format(blast_db_path)) | 
|  | 29     cmd = [ | 
|  | 30         "makeblastdb", | 
|  | 31         "-in", ' '.join(filelist), | 
|  | 32         "-dbtype", "nucl", | 
|  | 33         "-title", "ectyper_blastdb", | 
|  | 34         "-out", blast_db_path] | 
|  | 35     subprocess_util.run_subprocess(cmd) | 
|  | 36 | 
|  | 37     return blast_db_path | 
|  | 38 | 
|  | 39 | 
|  | 40 def run_blast(query_file, blast_db, args): | 
|  | 41     """ | 
|  | 42     Execute a blastn run given the query files and blastdb | 
|  | 43 | 
|  | 44     Args: | 
|  | 45         query_file (str): one or both of the VF / Serotype input files | 
|  | 46         blast_db (str): validated fasta files from the user, in DB form | 
|  | 47         args (Namespace object): parsed commadnline options from the user | 
|  | 48         chunck_size: number of genomes in the database | 
|  | 49 | 
|  | 50     Returns: | 
|  | 51         The blast output file | 
|  | 52     """ | 
|  | 53     percent_identity = args.percentIdentity | 
|  | 54     percent_length = args.percentLength | 
|  | 55 | 
|  | 56     LOG.debug('Running blast query {0} against database {1} '.format( | 
|  | 57         query_file, blast_db)) | 
|  | 58 | 
|  | 59     blast_output_file = blast_db + '.output' | 
|  | 60 | 
|  | 61     cmd = [ | 
|  | 62         "blastn", | 
|  | 63         "-query", query_file, | 
|  | 64         "-db", blast_db, | 
|  | 65         "-out", blast_output_file, | 
|  | 66         '-perc_identity', str(percent_identity), | 
|  | 67         '-qcov_hsp_perc', str(percent_length), | 
|  | 68         '-max_hsps', '1', # each allele only need to hit once | 
|  | 69         # use default max_target_seqs=500 | 
|  | 70         "-outfmt", | 
|  | 71         '6 qseqid qlen sseqid length pident sstart send sframe qcovhsp', | 
|  | 72         "-word_size", "11" | 
|  | 73     ] | 
|  | 74     subprocess_util.run_subprocess(cmd) | 
|  | 75     with open(blast_output_file, mode='rb') as fh: | 
|  | 76         for line in fh: | 
|  | 77             LOG.debug(line.decode('ascii')) | 
|  | 78     return blast_output_file | 
|  | 79 | 
|  | 80 def run_blast_for_identification(query_file, blast_db): | 
|  | 81     """ | 
|  | 82     Execute a blastn run given the query files and blastdb | 
|  | 83     with special configuration for high performance identification | 
|  | 84 | 
|  | 85     Args: | 
|  | 86         query_file: one or both of the VF / Serotype input files | 
|  | 87         blast_db: validated fasta files from the user, in DB form | 
|  | 88 | 
|  | 89     Returns: | 
|  | 90         blast_output_file (str): path to the blast output file | 
|  | 91     """ | 
|  | 92 | 
|  | 93     LOG.debug('Running blast query {0} against database {1} '.format( | 
|  | 94         query_file, blast_db)) | 
|  | 95 | 
|  | 96     blast_output_file = blast_db + '.output' | 
|  | 97 | 
|  | 98     cmd = [ | 
|  | 99         "blastn", | 
|  | 100         "-query", query_file, | 
|  | 101         "-db", blast_db, | 
|  | 102         "-out", blast_output_file, | 
|  | 103         '-perc_identity', '90', | 
|  | 104         '-qcov_hsp_perc', '90', | 
|  | 105         '-max_target_seqs', '1',  # we only want to know hit/no hit | 
|  | 106         # 10 query seq, we want at most 1 hit each | 
|  | 107         "-outfmt", | 
|  | 108         '6 qseqid qlen sseqid length pident sstart send sframe', | 
|  | 109         "-word_size", "11" | 
|  | 110     ] | 
|  | 111     subprocess_util.run_subprocess(cmd) | 
|  | 112 | 
|  | 113     return blast_output_file |