6
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 Functions for setting up, running, and parsing blast
|
|
5 """
|
|
6 import logging
|
|
7 import os
|
|
8
|
|
9 from ectyper import subprocess_util
|
|
10
|
|
11 LOG = logging.getLogger(__name__)
|
|
12
|
|
13
|
|
14 def create_blast_db(filelist, temp_dir):
|
|
15 """http://stackoverflow.com/questions/23944657/typeerror-method-takes-1-positional-argument-but-2-were-given
|
|
16 Creating a blast DB using the makeblastdb command.
|
|
17 The database is created in the temporary folder of the system.
|
|
18
|
|
19 Args:
|
|
20 filelist: list of genomes that was given by the user on the command line.
|
|
21 temp_dir: temporary directory to store the blastdb in.
|
|
22
|
|
23 Returns:
|
|
24 Full path to the DB
|
|
25 """
|
|
26 blast_db_path = os.path.join(temp_dir, 'ectyper_blastdb')
|
|
27
|
|
28 LOG.debug("Generating the blast db at {0}".format(blast_db_path))
|
|
29 cmd = [
|
|
30 "makeblastdb",
|
|
31 "-in", ' '.join(filelist),
|
|
32 "-dbtype", "nucl",
|
|
33 "-title", "ectyper_blastdb",
|
|
34 "-out", blast_db_path]
|
|
35 subprocess_util.run_subprocess(cmd)
|
|
36
|
|
37 return blast_db_path
|
|
38
|
|
39
|
|
40 def run_blast(query_file, blast_db, args):
|
|
41 """
|
|
42 Execute a blastn run given the query files and blastdb
|
|
43
|
|
44 Args:
|
|
45 query_file (str): one or both of the VF / Serotype input files
|
|
46 blast_db (str): validated fasta files from the user, in DB form
|
|
47 args (Namespace object): parsed commadnline options from the user
|
|
48 chunck_size: number of genomes in the database
|
|
49
|
|
50 Returns:
|
|
51 The blast output file
|
|
52 """
|
|
53 percent_identity = args.percentIdentity
|
|
54 percent_length = args.percentLength
|
|
55
|
|
56 LOG.debug('Running blast query {0} against database {1} '.format(
|
|
57 query_file, blast_db))
|
|
58
|
|
59 blast_output_file = blast_db + '.output'
|
|
60
|
|
61 cmd = [
|
|
62 "blastn",
|
|
63 "-query", query_file,
|
|
64 "-db", blast_db,
|
|
65 "-out", blast_output_file,
|
|
66 '-perc_identity', str(percent_identity),
|
|
67 '-qcov_hsp_perc', str(percent_length),
|
|
68 '-max_hsps', '1', # each allele only need to hit once
|
|
69 # use default max_target_seqs=500
|
|
70 "-outfmt",
|
|
71 '6 qseqid qlen sseqid length pident sstart send sframe qcovhsp',
|
|
72 "-word_size", "11"
|
|
73 ]
|
|
74 subprocess_util.run_subprocess(cmd)
|
|
75 with open(blast_output_file, mode='rb') as fh:
|
|
76 for line in fh:
|
|
77 LOG.debug(line.decode('ascii'))
|
|
78 return blast_output_file
|
|
79
|
|
80 def run_blast_for_identification(query_file, blast_db):
|
|
81 """
|
|
82 Execute a blastn run given the query files and blastdb
|
|
83 with special configuration for high performance identification
|
|
84
|
|
85 Args:
|
|
86 query_file: one or both of the VF / Serotype input files
|
|
87 blast_db: validated fasta files from the user, in DB form
|
|
88
|
|
89 Returns:
|
|
90 blast_output_file (str): path to the blast output file
|
|
91 """
|
|
92
|
|
93 LOG.debug('Running blast query {0} against database {1} '.format(
|
|
94 query_file, blast_db))
|
|
95
|
|
96 blast_output_file = blast_db + '.output'
|
|
97
|
|
98 cmd = [
|
|
99 "blastn",
|
|
100 "-query", query_file,
|
|
101 "-db", blast_db,
|
|
102 "-out", blast_output_file,
|
|
103 '-perc_identity', '90',
|
|
104 '-qcov_hsp_perc', '90',
|
|
105 '-max_target_seqs', '1', # we only want to know hit/no hit
|
|
106 # 10 query seq, we want at most 1 hit each
|
|
107 "-outfmt",
|
|
108 '6 qseqid qlen sseqid length pident sstart send sframe',
|
|
109 "-word_size", "11"
|
|
110 ]
|
|
111 subprocess_util.run_subprocess(cmd)
|
|
112
|
|
113 return blast_output_file
|