annotate ecoli_serotyping/blastFunctions.py @ 8:06e7c1355431 draft

Uploaded
author jpetteng
date Fri, 05 Jan 2018 16:21:19 -0500
parents fe3ceb5c4214
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
1 #!/usr/bin/env python
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
2
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
3 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
4 Functions for setting up, running, and parsing blast
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
5 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
6 import logging
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
7 import os
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
8
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
9 from ectyper import subprocess_util
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
10
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
11 LOG = logging.getLogger(__name__)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
12
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
13
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
14 def create_blast_db(filelist, temp_dir):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
15 """http://stackoverflow.com/questions/23944657/typeerror-method-takes-1-positional-argument-but-2-were-given
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
16 Creating a blast DB using the makeblastdb command.
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
17 The database is created in the temporary folder of the system.
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
18
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
19 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
20 filelist: list of genomes that was given by the user on the command line.
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
21 temp_dir: temporary directory to store the blastdb in.
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
22
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
23 Returns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
24 Full path to the DB
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
25 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
26 blast_db_path = os.path.join(temp_dir, 'ectyper_blastdb')
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
27
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
28 LOG.debug("Generating the blast db at {0}".format(blast_db_path))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
29 cmd = [
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
30 "makeblastdb",
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
31 "-in", ' '.join(filelist),
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
32 "-dbtype", "nucl",
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
33 "-title", "ectyper_blastdb",
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
34 "-out", blast_db_path]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
35 subprocess_util.run_subprocess(cmd)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
36
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
37 return blast_db_path
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
38
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
39
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
40 def run_blast(query_file, blast_db, args):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
41 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
42 Execute a blastn run given the query files and blastdb
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
43
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
44 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
45 query_file (str): one or both of the VF / Serotype input files
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
46 blast_db (str): validated fasta files from the user, in DB form
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
47 args (Namespace object): parsed commadnline options from the user
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
48 chunck_size: number of genomes in the database
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
49
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
50 Returns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
51 The blast output file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
52 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
53 percent_identity = args.percentIdentity
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
54 percent_length = args.percentLength
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
55
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
56 LOG.debug('Running blast query {0} against database {1} '.format(
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
57 query_file, blast_db))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
58
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
59 blast_output_file = blast_db + '.output'
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
60
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
61 cmd = [
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
62 "blastn",
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
63 "-query", query_file,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
64 "-db", blast_db,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
65 "-out", blast_output_file,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
66 '-perc_identity', str(percent_identity),
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
67 '-qcov_hsp_perc', str(percent_length),
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
68 '-max_hsps', '1', # each allele only need to hit once
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
69 # use default max_target_seqs=500
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
70 "-outfmt",
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
71 '6 qseqid qlen sseqid length pident sstart send sframe qcovhsp',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
72 "-word_size", "11"
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
73 ]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
74 subprocess_util.run_subprocess(cmd)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
75 with open(blast_output_file, mode='rb') as fh:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
76 for line in fh:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
77 LOG.debug(line.decode('ascii'))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
78 return blast_output_file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
79
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
80 def run_blast_for_identification(query_file, blast_db):
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
81 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
82 Execute a blastn run given the query files and blastdb
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
83 with special configuration for high performance identification
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
84
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
85 Args:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
86 query_file: one or both of the VF / Serotype input files
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
87 blast_db: validated fasta files from the user, in DB form
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
88
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
89 Returns:
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
90 blast_output_file (str): path to the blast output file
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
91 """
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
92
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
93 LOG.debug('Running blast query {0} against database {1} '.format(
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
94 query_file, blast_db))
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
95
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
96 blast_output_file = blast_db + '.output'
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
97
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
98 cmd = [
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
99 "blastn",
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
100 "-query", query_file,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
101 "-db", blast_db,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
102 "-out", blast_output_file,
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
103 '-perc_identity', '90',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
104 '-qcov_hsp_perc', '90',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
105 '-max_target_seqs', '1', # we only want to know hit/no hit
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
106 # 10 query seq, we want at most 1 hit each
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
107 "-outfmt",
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
108 '6 qseqid qlen sseqid length pident sstart send sframe',
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
109 "-word_size", "11"
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
110 ]
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
111 subprocess_util.run_subprocess(cmd)
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
112
fe3ceb5c4214 Uploaded
jpetteng
parents:
diff changeset
113 return blast_output_file