comparison tools/blast_rbh/blast_rbh.py @ 1:ff0b814c1320 draft

Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
author peterjc
date Fri, 21 Nov 2014 06:57:14 -0500
parents b828ca44a313
children 14b2e159b310
comparison
equal deleted inserted replaced
0:b828ca44a313 1:ff0b814c1320
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files. 2 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files.
3 3
4 Takes the following command line options, 4 Run "blast_rbh.py -h" to see the help text, or read the associated
5 1. FASTA filename of species A 5 README.rst file which is also available on GitHub at:
6 2. FASTA filename of species B 6 https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh
7 3. Sequence type (prot/nucl) 7
8 4. BLAST type (e.g. blastn, or blastp) consistent with sequence type 8 This requires Python and the NCBI BLAST+ tools to be installed
9 5. Minimum BLAST Percentage identity 9 and on the $PATH.
10 6. Minimum BLAST query coverage 10
11 7. Output filename 11 You can also run this tool via Galaxy using the "blast_rbh.xml"
12 definition file. This is available as a package on the Galaxy
13 Tool Shed: http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh
12 """ 14 """
13 15
14 # TODO - Output more columns, e.g. pident, qcovs, descriptions? 16 # TODO - Output more columns, e.g. pident, qcovs, descriptions?
15 17
16 import os 18 import os
28 if return_code: 30 if return_code:
29 stop_err("Error %i from: %s" % (return_code, cmd)) 31 stop_err("Error %i from: %s" % (return_code, cmd))
30 32
31 if "--version" in sys.argv[1:]: 33 if "--version" in sys.argv[1:]:
32 #TODO - Capture version of BLAST+ binaries too? 34 #TODO - Capture version of BLAST+ binaries too?
33 print "BLAST RBH v0.1.2" 35 print "BLAST RBH v0.1.5"
34 sys.exit(0) 36 sys.exit(0)
35 37
36 #Parse Command Line 38 #Parse Command Line
37 usage = """Use as follows: 39 usage = """Use as follows:
38 40
45 help="Alphabet type (nucl or prot)") 47 help="Alphabet type (nucl or prot)")
46 parser.add_option("-t", "--task", dest="task", 48 parser.add_option("-t", "--task", dest="task",
47 default=None, 49 default=None,
48 help="BLAST task (e.g. blastp, blastn, megablast)") 50 help="BLAST task (e.g. blastp, blastn, megablast)")
49 parser.add_option("-i","--identity", dest="min_identity", 51 parser.add_option("-i","--identity", dest="min_identity",
50 default="0", 52 default="70",
51 help="Minimum percentage identity (optional, default 0)") 53 help="Minimum percentage identity (optional, default 70)")
52 parser.add_option("-c", "--coverage", dest="min_coverage", 54 parser.add_option("-c", "--coverage", dest="min_coverage",
53 default="0", 55 default="50",
54 help="Minimum HSP coverage (optional, default 0)") 56 help="Minimum HSP coverage (optional, default 50)")
57 parser.add_option("--nr", dest="nr", default=False, action="store_true",
58 help="Preprocess FASTA files to collapse identifical "
59 "entries (make sequences non-redundant)")
55 parser.add_option("-o", "--output", dest="output", 60 parser.add_option("-o", "--output", dest="output",
56 default=None, metavar="FILE", 61 default=None, metavar="FILE",
57 help="Output filename") 62 help="Output filename")
58 options, args = parser.parse_args() 63 options, args = parser.parse_args()
59 64
115 assert 1 <= threads, threads 120 assert 1 <= threads, threads
116 121
117 makeblastdb_exe = "makeblastdb" 122 makeblastdb_exe = "makeblastdb"
118 123
119 base_path = tempfile.mkdtemp() 124 base_path = tempfile.mkdtemp()
125 tmp_a = os.path.join(base_path, "SpeciesA.fasta")
120 db_a = os.path.join(base_path, "SpeciesA") 126 db_a = os.path.join(base_path, "SpeciesA")
121 db_b = os.path.join(base_path, "SpeciesB")
122 a_vs_b = os.path.join(base_path, "A_vs_B.tabular") 127 a_vs_b = os.path.join(base_path, "A_vs_B.tabular")
123 b_vs_a = os.path.join(base_path, "B_vs_A.tabular") 128 if self_comparison:
129 tmp_b = tmp_a
130 db_b = db_a
131 b_vs_a = a_vs_b
132 else:
133 tmp_b = os.path.join(base_path, "SpeciesB.fasta")
134 db_b = os.path.join(base_path, "SpeciesB")
135 b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
124 log = os.path.join(base_path, "blast.log") 136 log = os.path.join(base_path, "blast.log")
125 137
126 cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs? 138 cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs?
127 c_query = 0 139 c_query = 0
128 c_match = 1 140 c_match = 1
200 yield current, list(best.values())[0] 212 yield current, list(best.values())[0]
201 else: 213 else:
202 #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) 214 #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
203 tie_warning += 1 215 tie_warning += 1
204 216
217 def check_duplicate_ids(filename):
218 # Copied from tools/ncbi_blast_plus/check_no_duplicates.py
219 # TODO - just use Biopython's FASTA parser?
220 if not os.path.isfile(filename):
221 stop_err("Missing FASTA file %r" % filename, 2)
222 identifiers = set()
223 handle = open(filename)
224 for line in handle:
225 if line.startswith(">"):
226 # The split will also take care of the new line character,
227 # e.g. ">test\n" and ">test description here\n" both give "test"
228 seq_id = line[1:].split(None, 1)[0]
229 if seq_id in identifiers:
230 handle.close()
231 stop_err("Repeated identifiers, e.g. %r" % seq_id, 3)
232 identifiers.add(seq_id)
233 handle.close()
234
235 def make_nr(input_fasta, output_fasta, sep=";"):
236 #TODO - seq-hash based to avoid loading everything into RAM?
237 by_seq = dict()
238 try:
239 from Bio import SeqIO
240 except KeyError:
241 stop_err("Missing Biopython")
242 for record in SeqIO.parse(input_fasta, "fasta"):
243 s = str(record.seq).upper()
244 try:
245 by_seq[s].append(record.id)
246 except KeyError:
247 by_seq[s] = [record.id]
248 unique = 0
249 representatives = dict()
250 duplicates = set()
251 for cluster in by_seq.values():
252 if len(cluster) > 1:
253 representatives[cluster[0]] = cluster
254 duplicates.update(cluster[1:])
255 else:
256 unique += 1
257 del by_seq
258 if duplicates:
259 #TODO - refactor as a generator with single SeqIO.write(...) call
260 with open(output_fasta, "w") as handle:
261 for record in SeqIO.parse(input_fasta, "fasta"):
262 if record.id in representatives:
263 cluster = representatives[record.id]
264 record.id = sep.join(cluster)
265 record.description = "representing %i records" % len(cluster)
266 elif record.id in duplicates:
267 continue
268 SeqIO.write(record, handle, "fasta")
269 print("%i unique entries; removed %i duplicates leaving %i representative records" % (unique, len(duplicates), len(representatives)))
270 else:
271 os.symlink(os.path.abspath(input_fasta), output_fasta)
272 print("No perfect duplicates in file, %i unique entries" % unique)
205 273
206 #print("Starting...") 274 #print("Starting...")
275 check_duplicate_ids(fasta_a)
276 if not self_comparison:
277 check_duplicate_ids(fasta_b)
278
279 if options.nr:
280 make_nr(fasta_a, tmp_a)
281 if not self_comparison:
282 make_nr(fasta_b, tmp_b)
283 fasta_a = tmp_a
284 fasta_b = tmp_b
285
207 #TODO - Report log in case of error? 286 #TODO - Report log in case of error?
208 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log)) 287 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log))
209 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log)) 288 if not self_comparison:
289 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log))
210 #print("BLAST databases prepared.") 290 #print("BLAST databases prepared.")
211 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' 291 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
212 % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads)) 292 % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads))
213 #print("BLAST species A vs species B done.") 293 #print("BLAST species A vs species B done.")
214 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' 294 if not self_comparison:
215 % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads)) 295 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
216 #print("BLAST species B vs species A done.") 296 % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads))
297 #print("BLAST species B vs species A done.")
217 298
218 299
219 best_b_vs_a = dict(best_hits(b_vs_a, self_comparison)) 300 best_b_vs_a = dict(best_hits(b_vs_a, self_comparison))
220 301
221 302