Mercurial > repos > peterjc > blast_rbh
comparison tools/blast_rbh/blast_rbh.py @ 1:ff0b814c1320 draft
Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc
| author | peterjc |
|---|---|
| date | Fri, 21 Nov 2014 06:57:14 -0500 |
| parents | b828ca44a313 |
| children | 14b2e159b310 |
comparison
equal
deleted
inserted
replaced
| 0:b828ca44a313 | 1:ff0b814c1320 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files. | 2 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files. |
| 3 | 3 |
| 4 Takes the following command line options, | 4 Run "blast_rbh.py -h" to see the help text, or read the associated |
| 5 1. FASTA filename of species A | 5 README.rst file which is also available on GitHub at: |
| 6 2. FASTA filename of species B | 6 https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh |
| 7 3. Sequence type (prot/nucl) | 7 |
| 8 4. BLAST type (e.g. blastn, or blastp) consistent with sequence type | 8 This requires Python and the NCBI BLAST+ tools to be installed |
| 9 5. Minimum BLAST Percentage identity | 9 and on the $PATH. |
| 10 6. Minimum BLAST query coverage | 10 |
| 11 7. Output filename | 11 You can also run this tool via Galaxy using the "blast_rbh.xml" |
| 12 definition file. This is available as a package on the Galaxy | |
| 13 Tool Shed: http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh | |
| 12 """ | 14 """ |
| 13 | 15 |
| 14 # TODO - Output more columns, e.g. pident, qcovs, descriptions? | 16 # TODO - Output more columns, e.g. pident, qcovs, descriptions? |
| 15 | 17 |
| 16 import os | 18 import os |
| 28 if return_code: | 30 if return_code: |
| 29 stop_err("Error %i from: %s" % (return_code, cmd)) | 31 stop_err("Error %i from: %s" % (return_code, cmd)) |
| 30 | 32 |
| 31 if "--version" in sys.argv[1:]: | 33 if "--version" in sys.argv[1:]: |
| 32 #TODO - Capture version of BLAST+ binaries too? | 34 #TODO - Capture version of BLAST+ binaries too? |
| 33 print "BLAST RBH v0.1.2" | 35 print "BLAST RBH v0.1.5" |
| 34 sys.exit(0) | 36 sys.exit(0) |
| 35 | 37 |
| 36 #Parse Command Line | 38 #Parse Command Line |
| 37 usage = """Use as follows: | 39 usage = """Use as follows: |
| 38 | 40 |
| 45 help="Alphabet type (nucl or prot)") | 47 help="Alphabet type (nucl or prot)") |
| 46 parser.add_option("-t", "--task", dest="task", | 48 parser.add_option("-t", "--task", dest="task", |
| 47 default=None, | 49 default=None, |
| 48 help="BLAST task (e.g. blastp, blastn, megablast)") | 50 help="BLAST task (e.g. blastp, blastn, megablast)") |
| 49 parser.add_option("-i","--identity", dest="min_identity", | 51 parser.add_option("-i","--identity", dest="min_identity", |
| 50 default="0", | 52 default="70", |
| 51 help="Minimum percentage identity (optional, default 0)") | 53 help="Minimum percentage identity (optional, default 70)") |
| 52 parser.add_option("-c", "--coverage", dest="min_coverage", | 54 parser.add_option("-c", "--coverage", dest="min_coverage", |
| 53 default="0", | 55 default="50", |
| 54 help="Minimum HSP coverage (optional, default 0)") | 56 help="Minimum HSP coverage (optional, default 50)") |
| 57 parser.add_option("--nr", dest="nr", default=False, action="store_true", | |
| 58 help="Preprocess FASTA files to collapse identifical " | |
| 59 "entries (make sequences non-redundant)") | |
| 55 parser.add_option("-o", "--output", dest="output", | 60 parser.add_option("-o", "--output", dest="output", |
| 56 default=None, metavar="FILE", | 61 default=None, metavar="FILE", |
| 57 help="Output filename") | 62 help="Output filename") |
| 58 options, args = parser.parse_args() | 63 options, args = parser.parse_args() |
| 59 | 64 |
| 115 assert 1 <= threads, threads | 120 assert 1 <= threads, threads |
| 116 | 121 |
| 117 makeblastdb_exe = "makeblastdb" | 122 makeblastdb_exe = "makeblastdb" |
| 118 | 123 |
| 119 base_path = tempfile.mkdtemp() | 124 base_path = tempfile.mkdtemp() |
| 125 tmp_a = os.path.join(base_path, "SpeciesA.fasta") | |
| 120 db_a = os.path.join(base_path, "SpeciesA") | 126 db_a = os.path.join(base_path, "SpeciesA") |
| 121 db_b = os.path.join(base_path, "SpeciesB") | |
| 122 a_vs_b = os.path.join(base_path, "A_vs_B.tabular") | 127 a_vs_b = os.path.join(base_path, "A_vs_B.tabular") |
| 123 b_vs_a = os.path.join(base_path, "B_vs_A.tabular") | 128 if self_comparison: |
| 129 tmp_b = tmp_a | |
| 130 db_b = db_a | |
| 131 b_vs_a = a_vs_b | |
| 132 else: | |
| 133 tmp_b = os.path.join(base_path, "SpeciesB.fasta") | |
| 134 db_b = os.path.join(base_path, "SpeciesB") | |
| 135 b_vs_a = os.path.join(base_path, "B_vs_A.tabular") | |
| 124 log = os.path.join(base_path, "blast.log") | 136 log = os.path.join(base_path, "blast.log") |
| 125 | 137 |
| 126 cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs? | 138 cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs? |
| 127 c_query = 0 | 139 c_query = 0 |
| 128 c_match = 1 | 140 c_match = 1 |
| 200 yield current, list(best.values())[0] | 212 yield current, list(best.values())[0] |
| 201 else: | 213 else: |
| 202 #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) | 214 #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best))) |
| 203 tie_warning += 1 | 215 tie_warning += 1 |
| 204 | 216 |
| 217 def check_duplicate_ids(filename): | |
| 218 # Copied from tools/ncbi_blast_plus/check_no_duplicates.py | |
| 219 # TODO - just use Biopython's FASTA parser? | |
| 220 if not os.path.isfile(filename): | |
| 221 stop_err("Missing FASTA file %r" % filename, 2) | |
| 222 identifiers = set() | |
| 223 handle = open(filename) | |
| 224 for line in handle: | |
| 225 if line.startswith(">"): | |
| 226 # The split will also take care of the new line character, | |
| 227 # e.g. ">test\n" and ">test description here\n" both give "test" | |
| 228 seq_id = line[1:].split(None, 1)[0] | |
| 229 if seq_id in identifiers: | |
| 230 handle.close() | |
| 231 stop_err("Repeated identifiers, e.g. %r" % seq_id, 3) | |
| 232 identifiers.add(seq_id) | |
| 233 handle.close() | |
| 234 | |
| 235 def make_nr(input_fasta, output_fasta, sep=";"): | |
| 236 #TODO - seq-hash based to avoid loading everything into RAM? | |
| 237 by_seq = dict() | |
| 238 try: | |
| 239 from Bio import SeqIO | |
| 240 except KeyError: | |
| 241 stop_err("Missing Biopython") | |
| 242 for record in SeqIO.parse(input_fasta, "fasta"): | |
| 243 s = str(record.seq).upper() | |
| 244 try: | |
| 245 by_seq[s].append(record.id) | |
| 246 except KeyError: | |
| 247 by_seq[s] = [record.id] | |
| 248 unique = 0 | |
| 249 representatives = dict() | |
| 250 duplicates = set() | |
| 251 for cluster in by_seq.values(): | |
| 252 if len(cluster) > 1: | |
| 253 representatives[cluster[0]] = cluster | |
| 254 duplicates.update(cluster[1:]) | |
| 255 else: | |
| 256 unique += 1 | |
| 257 del by_seq | |
| 258 if duplicates: | |
| 259 #TODO - refactor as a generator with single SeqIO.write(...) call | |
| 260 with open(output_fasta, "w") as handle: | |
| 261 for record in SeqIO.parse(input_fasta, "fasta"): | |
| 262 if record.id in representatives: | |
| 263 cluster = representatives[record.id] | |
| 264 record.id = sep.join(cluster) | |
| 265 record.description = "representing %i records" % len(cluster) | |
| 266 elif record.id in duplicates: | |
| 267 continue | |
| 268 SeqIO.write(record, handle, "fasta") | |
| 269 print("%i unique entries; removed %i duplicates leaving %i representative records" % (unique, len(duplicates), len(representatives))) | |
| 270 else: | |
| 271 os.symlink(os.path.abspath(input_fasta), output_fasta) | |
| 272 print("No perfect duplicates in file, %i unique entries" % unique) | |
| 205 | 273 |
| 206 #print("Starting...") | 274 #print("Starting...") |
| 275 check_duplicate_ids(fasta_a) | |
| 276 if not self_comparison: | |
| 277 check_duplicate_ids(fasta_b) | |
| 278 | |
| 279 if options.nr: | |
| 280 make_nr(fasta_a, tmp_a) | |
| 281 if not self_comparison: | |
| 282 make_nr(fasta_b, tmp_b) | |
| 283 fasta_a = tmp_a | |
| 284 fasta_b = tmp_b | |
| 285 | |
| 207 #TODO - Report log in case of error? | 286 #TODO - Report log in case of error? |
| 208 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log)) | 287 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log)) |
| 209 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log)) | 288 if not self_comparison: |
| 289 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log)) | |
| 210 #print("BLAST databases prepared.") | 290 #print("BLAST databases prepared.") |
| 211 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' | 291 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' |
| 212 % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads)) | 292 % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads)) |
| 213 #print("BLAST species A vs species B done.") | 293 #print("BLAST species A vs species B done.") |
| 214 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' | 294 if not self_comparison: |
| 215 % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads)) | 295 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i' |
| 216 #print("BLAST species B vs species A done.") | 296 % (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads)) |
| 297 #print("BLAST species B vs species A done.") | |
| 217 | 298 |
| 218 | 299 |
| 219 best_b_vs_a = dict(best_hits(b_vs_a, self_comparison)) | 300 best_b_vs_a = dict(best_hits(b_vs_a, self_comparison)) |
| 220 | 301 |
| 221 | 302 |
