blast_rbh: tools/blast_rbh/blast

comparison tools/blast_rbh/blast_rbh.py @ 1:ff0b814c1320 draft

Uploaded v0.1.5, NCBI BLAST+ 2.2.30 etc

author	peterjc
date	Fri, 21 Nov 2014 06:57:14 -0500
parents	b828ca44a313
children	14b2e159b310

comparison

equal deleted inserted replaced

-:b828ca44a313
+:ff0b814c1320
 #!/usr/bin/env python
 """BLAST Reciprocal Best Hit (RBH) from two FASTA input files.
-Takes the following command line options,
+Run "blast_rbh.py -h" to see the help text, or read the associated
-1. FASTA filename of species A
+README.rst file which is also available on GitHub at:
-2. FASTA filename of species B
+https://github.com/peterjc/galaxy_blast/tree/master/tools/blast_rbh
-3. Sequence type (prot/nucl)
-4. BLAST type (e.g. blastn, or blastp) consistent with sequence type
+This requires Python and the NCBI BLAST+ tools to be installed
-5. Minimum BLAST Percentage identity
+and on the $PATH.
-6. Minimum BLAST query coverage
-7. Output filename
+You can also run this tool via Galaxy using the "blast_rbh.xml"
+definition file. This is available as a package on the Galaxy
+Tool Shed: http://toolshed.g2.bx.psu.edu/view/peterjc/blast_rbh
 """
 # TODO - Output more columns, e.g. pident, qcovs, descriptions?
 import os
 if return_code:
 stop_err("Error %i from: %s" % (return_code, cmd))
 if "--version" in sys.argv[1:]:
 #TODO - Capture version of BLAST+ binaries too?
-print "BLAST RBH v0.1.2"
+print "BLAST RBH v0.1.5"
 sys.exit(0)
 #Parse Command Line
 usage = """Use as follows:
 help="Alphabet type (nucl or prot)")
 parser.add_option("-t", "--task", dest="task",
 default=None,
 help="BLAST task (e.g. blastp, blastn, megablast)")
 parser.add_option("-i","--identity", dest="min_identity",
-default="0",
+default="70",
-help="Minimum percentage identity (optional, default 0)")
+help="Minimum percentage identity (optional, default 70)")
 parser.add_option("-c", "--coverage", dest="min_coverage",
-default="0",
+default="50",
-help="Minimum HSP coverage (optional, default 0)")
+help="Minimum HSP coverage (optional, default 50)")
# --nr is a boolean flag (off by default); when set, make_nr() is run on the
# input FASTA files before the BLAST databases are built.
parser.add_option("--nr", dest="nr", default=False, action="store_true",
                  help="Preprocess FASTA files to collapse identical "
                  "entries (make sequences non-redundant)")
 parser.add_option("-o", "--output", dest="output",
 default=None, metavar="FILE",
 help="Output filename")
 options, args = parser.parse_args()
 assert 1 <= threads, threads
 makeblastdb_exe = "makeblastdb"
 base_path = tempfile.mkdtemp()
+tmp_a = os.path.join(base_path, "SpeciesA.fasta")
 db_a = os.path.join(base_path, "SpeciesA")
-db_b = os.path.join(base_path, "SpeciesB")
 a_vs_b = os.path.join(base_path, "A_vs_B.tabular")
-b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
+if self_comparison:
+tmp_b = tmp_a
+db_b = db_a
+b_vs_a = a_vs_b
+else:
+tmp_b = os.path.join(base_path, "SpeciesB.fasta")
+db_b = os.path.join(base_path, "SpeciesB")
+b_vs_a = os.path.join(base_path, "B_vs_A.tabular")
 log = os.path.join(base_path, "blast.log")
 cols = "qseqid sseqid bitscore pident qcovhsp qlen length" #Or qcovs?
 c_query = 0
 c_match = 1
 yield current, list(best.values())[0]
 else:
 #print("%s has %i equally good hits: %s" % (a, len(best), ", ".join(best)))
 tie_warning += 1
def check_duplicate_ids(filename):
    """Check a FASTA file exists and has no repeated record identifiers.

    Only header lines (those starting with ">") are examined. The
    identifier is the first whitespace-delimited word after the ">".

    Arguments:
     - filename - path of the FASTA file to scan.

    Reports problems via stop_err (defined elsewhere in this script,
    presumably terminating the process): second argument 2 if the file
    is missing, 3 on the first repeated identifier. Returns None when
    every identifier is unique.
    """
    # Copied from tools/ncbi_blast_plus/check_no_duplicates.py
    # TODO - just use Biopython's FASTA parser?
    if not os.path.isfile(filename):
        stop_err("Missing FASTA file %r" % filename, 2)
    identifiers = set()
    # The with-statement closes the handle on every exit path, including
    # an exception while reading or stop_err raising to abort the run.
    with open(filename) as handle:
        for line in handle:
            if line.startswith(">"):
                # The split will also take care of the new line character,
                # e.g. ">test\n" and ">test description here\n" both give "test"
                seq_id = line[1:].split(None, 1)[0]
                if seq_id in identifiers:
                    stop_err("Repeated identifiers, e.g. %r" % seq_id, 3)
                identifiers.add(seq_id)
def make_nr(input_fasta, output_fasta, sep=";"):
    """Make a non-redundant copy of a FASTA file.

    Records whose sequences are identical (compared after upper-casing)
    are collapsed into one record. That record keeps the position of the
    first member of the group, its identifier becomes all the member
    identifiers joined with sep, and its description becomes
    "representing N records".

    Arguments:
     - input_fasta - path of the FASTA file to read (parsed twice).
     - output_fasta - path to create.
     - sep - string used to join the identifiers of merged records.

    If no sequence occurs more than once, output_fasta is created as a
    symlink to the absolute path of input_fasta instead of a copy.
    A one-line summary is printed in either case.

    Requires Biopython; if it cannot be imported this is reported via
    stop_err (defined elsewhere in this script).
    """
    #TODO - seq-hash based to avoid loading everything into RAM?
    try:
        from Bio import SeqIO
    except ImportError:
        # A failed import raises ImportError (not KeyError), so this is
        # the exception to catch in order to give the friendly message.
        stop_err("Missing Biopython")
    # First pass: group record identifiers by (upper case) sequence.
    by_seq = dict()
    for record in SeqIO.parse(input_fasta, "fasta"):
        s = str(record.seq).upper()
        try:
            by_seq[s].append(record.id)
        except KeyError:
            by_seq[s] = [record.id]
    unique = 0
    # Maps the first identifier of each multi-record group to the full
    # list of identifiers in that group.
    representatives = dict()
    # Identifiers of all the non-first members, dropped from the output.
    duplicates = set()
    for cluster in by_seq.values():
        if len(cluster) > 1:
            representatives[cluster[0]] = cluster
            duplicates.update(cluster[1:])
        else:
            unique += 1
    # The sequences themselves are no longer needed; free the memory.
    del by_seq
    if duplicates:
        # Second pass: write the records out, renaming representatives
        # and skipping the redundant entries.
        #TODO - refactor as a generator with single SeqIO.write(...) call
        with open(output_fasta, "w") as handle:
            for record in SeqIO.parse(input_fasta, "fasta"):
                if record.id in representatives:
                    cluster = representatives[record.id]
                    record.id = sep.join(cluster)
                    record.description = "representing %i records" % len(cluster)
                elif record.id in duplicates:
                    continue
                SeqIO.write(record, handle, "fasta")
        print("%i unique entries; removed %i duplicates leaving %i representative records" % (unique, len(duplicates), len(representatives)))
    else:
        # Nothing to collapse, so avoid copying the file.
        os.symlink(os.path.abspath(input_fasta), output_fasta)
        print("No perfect duplicates in file, %i unique entries" % unique)
 #print("Starting...")
+check_duplicate_ids(fasta_a)
+if not self_comparison:
+check_duplicate_ids(fasta_b)
+if options.nr:
+make_nr(fasta_a, tmp_a)
+if not self_comparison:
+make_nr(fasta_b, tmp_b)
+fasta_a = tmp_a
+fasta_b = tmp_b
 #TODO - Report log in case of error?
 run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_a, db_a, log))
-run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log))
+if not self_comparison:
+run('%s -dbtype %s -in "%s" -out "%s" -logfile "%s"' % (makeblastdb_exe, dbtype, fasta_b, db_b, log))
 #print("BLAST databases prepared.")
 run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
 % (blast_cmd, fasta_a, db_b, a_vs_b, cols, threads))
 #print("BLAST species A vs species B done.")
-run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
+if not self_comparison:
-% (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads))
+run('%s -query "%s" -db "%s" -out "%s" -outfmt "6 %s" -num_threads %i'
-#print("BLAST species B vs species A done.")
+% (blast_cmd, fasta_b, db_a, b_vs_a, cols, threads))
+#print("BLAST species B vs species A done.")
 best_b_vs_a = dict(best_hits(b_vs_a, self_comparison))

Mercurial > repos > peterjc > blast_rbh

comparison tools/blast_rbh/blast_rbh.py @ 1:ff0b814c1320 draft