Mercurial > repos > peterjc > blast_rbh
comparison tools/blast_rbh/blast_rbh.py @ 1:ff0b814c1320 draft
Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
author | peterjc |
---|---|
date | Fri, 21 Nov 2014 06:57:14 -0500 |
parents | b828ca44a313 |
children | 14b2e159b310 |
comparison
equal
deleted
inserted
replaced
0:b828ca44a313 | 1:ff0b814c1320 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files. | 2 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files. |
3 | 3 |
4 Takes the following command line options, | 4 Run "blast_rbh.py -h" to see the help text, or read the associated |
5 1. FASTA filename of species A | 5 README.rst file which is also available on GitHub at: |
6 2. FASTA filename of species B | 6 https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh |
7 3. Sequence type (prot/nucl) | 7 |
8 4. BLAST type (e.g. blastn, or blastp) consistent with sequence type | 8 This requires Python and the NCBI BLAST+ tools to be installed |
9 5. Minimum BLAST Percentage identity | 9 and on the $PATH. |
10 6. Minimum BLAST query coverage | 10 |
11 7. Output filename | 11 You can also run this tool via Galaxy using the "blast_rbh.xml" |
12 definition file. This is available as a package on the Galaxy | |
13 Tool Shed: http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh | |
12 """ | 14 """ |
13 | 15 |
14 # TODO - Output more columns, e.g. pident, qcovs, descriptions? | 16 # TODO - Output more columns, e.g. pident, qcovs, descriptions? |
15 | 17 |
16 import os | 18 import os |
28 if return_code: | 30 if return_code: |
29 stop_err("Error %i from: %s" % (return_code, cmd)) | 31 stop_err("Error %i from: %s" % (return_code, cmd)) |
30 | 32 |
31 if "--version" in sys.argv[1:]: | 33 if "--version" in sys.argv[1:]: |
32 #TODO - Capture version of BLAST+ binaries too? | 34 #TODO - Capture version of BLAST+ binaries too? |
33 print "BLAST RBH v0.1.2" | 35 print "BLAST RBH v0.1.5" |
34 sys.exit(0) | 36 sys.exit(0) |
35 | 37 |
36 #Parse Command Line | 38 #Parse Command Line |
37 usage = """Use as follows: | 39 usage = """Use as follows: |
38 | 40 |
45 help="Alphabet type (nucl or prot)") | 47 help="Alphabet type (nucl or prot)") |
46 parser.add_option("-t", "--task", dest="task", | 48 parser.add_option("-t", "--task", dest="task", |
47 default=None, | 49 default=None, |
48 help="BLAST task (e.g. blastp, blastn, megablast)") | 50 help="BLAST task (e.g. blastp, blastn, megablast)") |
49 parser.add_option("-i","--identity", dest="min_identity", | 51 parser.add_option("-i","--identity", dest="min_identity", |
50 default="0", | 52 default="70", |
51 help="Minimum percentage identity (optional, default 0)") | 53 help="Minimum percentage identity (optional, default 70)") |
52 parser.add_option("-c", "--coverage", dest="min_coverage", | 54 parser.add_option("-c", "--coverage", dest="min_coverage", |
53 default="0", | 55 default="50", |
54 help="Minimum HSP coverage (optional, default 0)") | 56 help="Minimum HSP coverage (optional, default 50)") |
57 parser.add_option("--nr", dest="nr", default=False, action="store_true", | |
58 help="Preprocess FASTA files to collapse identifical " | |
59 "entries (make sequences non-redundant)") | |
55 parser.add_option("-o", "--output", dest="output", | 60 parser.add_option("-o", "--output", dest="output", |
56 default=None, metavar="FILE", | 61 default=None, metavar="FILE", |
57 help="Output filename") | 62 help="Output filename") |
58 options, args = parser.parse_args() | 63 options, args = parser.parse_args() |
59 | 64 |
115 assert 1 <= threads, threads | 120 assert 1 <= threads, threads |
116 | 121 |
117 makeblastdb_exe = "makeblastdb" | 122 makeblastdb_exe = "makeblastdb" |
118 | 123 |
119 base_path = tempfile.mkdtemp() | 124 base_path = tempfile.mkdtemp() |
125 tmp_a = os.path.join(base_path, "SpeciesA.fasta") | |
120 db_a = os.path.join(base_path, "SpeciesA") | 126 db_a = os.path.join(base_path, "SpeciesA") |
121 db_b = os.path.join(base_path, "SpeciesB") | |
122 a_vs_b = os.path.join(base_path, "A_vs_B.tabular") | 127 a_vs_b = os.path.join(base_path, "A_vs_B.tabular") |
123 b_vs_a = os.path.join(base_path, "B_vs_A.tabular") | 128 if self_comparison: |
129 tmp_b = tmp_a | |
130 db_b = db_a | |
131 b_vs_a = a_vs_b | |
132 else: | |
133 tmp_b = os.path.join(base_path, "SpeciesB.fasta") | |
134 db_b = os.path.join(base_path, "SpeciesB") | |
135 b_vs_a = os.path.join(base_path, "B_vs_A.tabular") | |
124 log = os.path.join(base_path, "blast.log") | 136 log = os.path.join(base_path, "blast.log") |
125 | 137 |
126 cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs? | 138 cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs? |
127 c_query = 0 | 139 c_query = 0 |
128 c_match = 1 | 140 c_match = 1 |
200 yield current, list(best.values())[0] | 212 yield current, list(best.values())[0] |
201 else: | 213 else: |
202 #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) | 214 #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) |
203 tie_warning += 1 | 215 tie_warning += 1 |
204 | 216 |
217 def check_duplicate_ids(filename): | |
218 # Copied from tools/ncbi_blast_plus/check_no_duplicates.py | |
219 # TODO - just use Biopython's FASTA parser? | |
220 if not os.path.isfile(filename): | |
221 stop_err("Missing FASTA file %r" % filename, 2) | |
222 identifiers = set() | |
223 handle = open(filename) | |
224 for line in handle: | |
225 if line.startswith(">"): | |
226 # The split will also take care of the new line character, | |
227 # e.g. ">test\n" and ">test description here\n" both give "test" | |
228 seq_id = line[1:].split(None, 1)[0] | |
229 if seq_id in identifiers: | |
230 handle.close() | |
231 stop_err("Repeated identifiers, e.g. %r" % seq_id, 3) | |
232 identifiers.add(seq_id) | |
233 handle.close() | |
234 | |
235 def make_nr(input_fasta, output_fasta, sep=";"): | |
236 #TODO - seq-hash based to avoid loading everything into RAM? | |
237 by_seq = dict() | |
238 try: | |
239 from Bio import SeqIO | |
240 except KeyError: | |
241 stop_err("Missing Biopython") | |
242 for record in SeqIO.parse(input_fasta, "fasta"): | |
243 s = str(record.seq).upper() | |
244 try: | |
245 by_seq[s].append(record.id) | |
246 except KeyError: | |
247 by_seq[s] = [record.id] | |
248 unique = 0 | |
249 representatives = dict() | |
250 duplicates = set() | |
251 for cluster in by_seq.values(): | |
252 if len(cluster) > 1: | |
253 representatives[cluster[0]] = cluster | |
254 duplicates.update(cluster[1:]) | |
255 else: | |
256 unique += 1 | |
257 del by_seq | |
258 if duplicates: | |
259 #TODO - refactor as a generator with single SeqIO.write(...) call | |
260 with open(output_fasta, "w") as handle: | |
261 for record in SeqIO.parse(input_fasta, "fasta"): | |
262 if record.id in representatives: | |
263 cluster = representatives[record.id] | |
264 record.id = sep.join(cluster) | |
265 record.description = "representing %i records" % len(cluster) | |
266 elif record.id in duplicates: | |
267 continue | |
268 SeqIO.write(record, handle, "fasta") | |
269 print("%i unique entries; removed %i duplicates leaving %i representative records" % (unique, len(duplicates), len(representatives))) | |
270 else: | |
271 os.symlink(os.path.abspath(input_fasta), output_fasta) | |
272 print("No perfect duplicates in file, %i unique entries" % unique) | |
205 | 273 |
206 #print("Starting...") | 274 #print("Starting...") |
275 check_duplicate_ids(fasta_a) | |
276 if not self_comparison: | |
277 check_duplicate_ids(fasta_b) | |
278 | |
279 if options.nr: | |
280 make_nr(fasta_a, tmp_a) | |
281 if not self_comparison: | |
282 make_nr(fasta_b, tmp_b) | |
283 fasta_a = tmp_a | |
284 fasta_b = tmp_b | |
285 | |
207 #TODO - Report log in case of error? | 286 #TODO - Report log in case of error? |
208 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log)) | 287 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log)) |
209 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log)) | 288 if not self_comparison: |
289 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log)) | |
210 #print("BLAST databases prepared.") | 290 #print("BLAST databases prepared.") |
211 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' | 291 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' |
212 % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads)) | 292 % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads)) |
213 #print("BLAST species A vs species B done.") | 293 #print("BLAST species A vs species B done.") |
214 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' | 294 if not self_comparison: |
215 % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads)) | 295 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' |
216 #print("BLAST species B vs species A done.") | 296 % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads)) |
297 #print("BLAST species B vs species A done.") | |
217 | 298 |
218 | 299 |
219 best_b_vs_a = dict(best_hits(b_vs_a, self_comparison)) | 300 best_b_vs_a = dict(best_hits(b_vs_a, self_comparison)) |
220 | 301 |
221 | 302 |