# HG changeset patch
# User peterjc
# Date 1563662196 14400
# Node ID 2889433c7ae16dc3474e12d24455b3dc6d2720e6
# Parent e25d3acf6e682982e709a3b72fcdac488f9d545b
v0.3.3 - fixed legacy dependency definition
diff -r e25d3acf6e68 -r 2889433c7ae1 test-data/cd00003_and_cd00008.pin
Binary file test-data/cd00003_and_cd00008.pin has changed
diff -r e25d3acf6e68 -r 2889433c7ae1 test-data/rhodopsin_nucs.blastdbcmd.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rhodopsin_nucs.blastdbcmd.txt Sat Jul 20 18:36:36 2019 -0400
@@ -0,0 +1,7 @@
+gi|57163782|ref|NM_001009242.1|
+gi|2734705|gb|U59921.1|BBU59921 "1 -" +
+gi|283855845|gb|GQ290303.1| 1-4301 +
+gi|283855822|gb|GQ290312.1| "1-983"
+gi|18148870|dbj|AB062417.1| "1 -" +
+gi|12583664|dbj|AB043817.1| "1--"
+
diff -r e25d3acf6e68 -r 2889433c7ae1 tools/ncbi_blast_plus/README.rst
--- a/tools/ncbi_blast_plus/README.rst Tue Oct 23 08:48:19 2018 -0400
+++ b/tools/ncbi_blast_plus/README.rst Sat Jul 20 18:36:36 2019 -0400
@@ -264,6 +264,10 @@
output format it must be mapped to different command line arguments.
- Extend gzipped query support to all the command line tools.
- Workaround for gzipped support under Galaxy release 16.01 or older.
+v0.3.2 - Fixed incomplete ``@CLI_OPTIONS@`` macro in the help text for the
+ ``tblastn`` and ``blastdbcmd`` wrappers.
+v0.3.3 - Fixed ``tool_dependencies.xml`` to use BLAST+ 2.7.1 (useful only for
+ older Galaxy instances - we recommend conda for dependencies now).
======= ======================================================================
diff -r e25d3acf6e68 -r 2889433c7ae1 tools/ncbi_blast_plus/blastxml_to_tabular.py
--- a/tools/ncbi_blast_plus/blastxml_to_tabular.py Tue Oct 23 08:48:19 2018 -0400
+++ b/tools/ncbi_blast_plus/blastxml_to_tabular.py Sat Jul 20 18:36:36 2019 -0400
@@ -81,12 +81,14 @@
else:
from galaxy import eggs # noqa - ignore flake8 F401
import pkg_resources
+
pkg_resources.require("elementtree")
from elementtree import ElementTree
if len(sys.argv) == 4 and sys.argv[3] in ["std", "x22", "ext"]:
# False positive if user really has a BLAST XML file called 'std' or 'ext'...
- sys.exit("""ERROR: The script API has changed, sorry.
+ sys.exit(
+ """ERROR: The script API has changed, sorry.
Instead of the old style:
@@ -99,7 +101,8 @@
For more information, use:
$ python blastxml_to_tabular.py -h
-""")
+"""
+ )
usage = """usage: %prog [options] blastxml[,...]
@@ -113,16 +116,29 @@
extended column names are supported.
"""
parser = OptionParser(usage=usage)
-parser.add_option('-o', '--output', dest='output', default=None,
- help='output filename (defaults to stdout)',
- metavar="FILE")
-parser.add_option("-c", "--columns", dest="columns", default='std',
- help="[std|ext|col1,col2,...] standard 12 columns, extended 25 columns, or list of column names")
+parser.add_option(
+ "-o",
+ "--output",
+ dest="output",
+ default=None,
+ help="output filename (defaults to stdout)",
+ metavar="FILE",
+)
+parser.add_option(
+ "-c",
+ "--columns",
+ dest="columns",
+ default="std",
+ help="[std|ext|col1,col2,...] standard 12 columns, "
+ "extended 25 columns, or list of column names",
+)
(options, args) = parser.parse_args()
-colnames = ('qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,'
- 'sstart,send,evalue,bitscore,sallseqid,score,nident,positive,'
- 'gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles').split(',')
+colnames = (
+ "qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,"
+ "sstart,send,evalue,bitscore,sallseqid,score,nident,positive,"
+ "gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles"
+).split(",")
if len(args) < 1:
sys.exit("ERROR: No BLASTXML input files given; run with --help to see options.")
@@ -148,7 +164,9 @@
assert set(colnames).issuperset(cols), cols
if not cols:
sys.exit("No columns selected!")
- extended = max(colnames.index(c) for c in cols) >= 12 # Do we need any higher columns?
+ extended = (
+ max(colnames.index(c) for c in cols) >= 12
+ ) # Do we need any higher columns?
del out_fmt
for in_file in args:
@@ -156,15 +174,15 @@
sys.exit("Input BLAST XML file not found: %s" % in_file)
-re_default_query_id = re.compile("^Query_\d+$")
-assert re_default_query_id.match("Query_101")
-assert not re_default_query_id.match("Query_101a")
-assert not re_default_query_id.match("MyQuery_101")
-re_default_subject_id = re.compile("^Subject_\d+$")
-assert re_default_subject_id.match("Subject_1")
-assert not re_default_subject_id.match("Subject_")
-assert not re_default_subject_id.match("Subject_12a")
-assert not re_default_subject_id.match("TheSubject_1")
+re_default_query_id = re.compile(r"^Query_\d+$")
+assert re_default_query_id.match(r"Query_101")
+assert not re_default_query_id.match(r"Query_101a")
+assert not re_default_query_id.match(r"MyQuery_101")
+re_default_subject_id = re.compile(r"^Subject_\d+$")
+assert re_default_subject_id.match(r"Subject_1")
+assert not re_default_subject_id.match(r"Subject_")
+assert not re_default_subject_id.match(r"Subject_12a")
+assert not re_default_subject_id.match(r"TheSubject_1")
def convert(blastxml_filename, output_handle):
@@ -213,7 +231,8 @@
# P56514
# or,
# Subject_1
- # gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]
+ # gi|57163783|ref|NP_001009242.1|
+ # rhodopsin [Felis catus]
# Subject_1
#
# apparently depending on the parse_deflines switch
@@ -225,11 +244,15 @@
# 2
sseqid = hit.findtext("Hit_id").split(None, 1)[0]
hit_def = sseqid + " " + hit.findtext("Hit_def")
- if re_default_subject_id.match(sseqid) and sseqid == hit.findtext("Hit_accession"):
+ if re_default_subject_id.match(sseqid) and sseqid == hit.findtext(
+ "Hit_accession"
+ ):
# Place holder ID, take the first word of the subject definition
hit_def = hit.findtext("Hit_def")
sseqid = hit_def.split(None, 1)[0]
- if sseqid.startswith("gnl|BL_ORD_ID|") and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
+ if sseqid.startswith(
+ "gnl|BL_ORD_ID|"
+ ) and sseqid == "gnl|BL_ORD_ID|" + hit.findtext("Hit_accession"):
# Alternative place holder ID, again take the first word of hit_def
hit_def = hit.findtext("Hit_def")
sseqid = hit_def.split(None, 1)[0]
@@ -244,27 +267,61 @@
h_seq = hsp.findtext("Hsp_hseq")
m_seq = hsp.findtext("Hsp_midline")
assert len(q_seq) == len(h_seq) == len(m_seq) == int(length)
- gapopen = str(len(q_seq.replace('-', ' ').split()) - 1 +
- len(h_seq.replace('-', ' ').split()) - 1)
+ gapopen = str(
+ len(q_seq.replace("-", " ").split())
+ - 1
+ + len(h_seq.replace("-", " ").split())
+ - 1
+ )
- mismatch = m_seq.count(' ') + m_seq.count('+') - q_seq.count('-') - h_seq.count('-')
+ mismatch = (
+ m_seq.count(" ")
+ + m_seq.count("+")
+ - q_seq.count("-")
+ - h_seq.count("-")
+ )
# TODO - Remove this alternative mismatch calculation and test
# once satisifed there are no problems
- expected_mismatch = len(q_seq) - sum(1 for q, h in zip(q_seq, h_seq)
- if q == h or q == "-" or h == "-")
+ expected_mismatch = len(q_seq) - sum(
+ 1
+ for q, h in zip(q_seq, h_seq)
+ if q == h or q == "-" or h == "-"
+ )
xx = sum(1 for q, h in zip(q_seq, h_seq) if q == "X" and h == "X")
- if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):
- sys.exit("%s vs %s mismatches, expected %i <= %i <= %i"
- % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),
- int(mismatch), expected_mismatch))
+ if not (
+ expected_mismatch - q_seq.count("X")
+ <= int(mismatch)
+ <= expected_mismatch + xx
+ ):
+ sys.exit(
+ "%s vs %s mismatches, expected %i <= %i <= %i"
+ % (
+ qseqid,
+ sseqid,
+ expected_mismatch - q_seq.count("X"),
+ int(mismatch),
+ expected_mismatch,
+ )
+ )
# TODO - Remove this alternative identity calculation and test
# once satisifed there are no problems
expected_identity = sum(1 for q, h in zip(q_seq, h_seq) if q == h)
- if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")):
- sys.exit("%s vs %s identities, expected %i <= %i <= %i"
- % (qseqid, sseqid, expected_identity, int(nident),
- expected_identity + q_seq.count("X")))
+ if not (
+ expected_identity - xx
+ <= int(nident)
+ <= expected_identity + q_seq.count("X")
+ ):
+ sys.exit(
+ "%s vs %s identities, expected %i <= %i <= %i"
+ % (
+ qseqid,
+ sseqid,
+ expected_identity,
+ int(nident),
+ expected_identity + q_seq.count("X"),
+ )
+ )
evalue = hsp.findtext("Hsp_evalue")
if evalue == "0":
@@ -280,53 +337,66 @@
# Note BLAST does not round to nearest int, it truncates
bitscore = "%i" % bitscore
- values = [qseqid,
- sseqid,
- pident,
- length, # hsp.findtext("Hsp_align-len")
- str(mismatch),
- gapopen,
- hsp.findtext("Hsp_query-from"), # qstart,
- hsp.findtext("Hsp_query-to"), # qend,
- hsp.findtext("Hsp_hit-from"), # sstart,
- hsp.findtext("Hsp_hit-to"), # send,
- evalue, # hsp.findtext("Hsp_evalue") in scientific notation
- bitscore, # hsp.findtext("Hsp_bit-score") rounded
- ]
+ values = [
+ qseqid,
+ sseqid,
+ pident,
+ length, # hsp.findtext("Hsp_align-len")
+ str(mismatch),
+ gapopen,
+ hsp.findtext("Hsp_query-from"), # qstart,
+ hsp.findtext("Hsp_query-to"), # qend,
+ hsp.findtext("Hsp_hit-from"), # sstart,
+ hsp.findtext("Hsp_hit-to"), # send,
+ evalue, # hsp.findtext("Hsp_evalue") in scientific notation
+ bitscore, # hsp.findtext("Hsp_bit-score") rounded
+ ]
if extended:
try:
- sallseqid = ";".join(name.split(None, 1)[0] for name in hit_def.split(" >"))
- salltitles = "<>".join(name.split(None, 1)[1] for name in hit_def.split(" >"))
+ sallseqid = ";".join(
+ name.split(None, 1)[0] for name in hit_def.split(" >")
+ )
+ salltitles = "<>".join(
+ name.split(None, 1)[1] for name in hit_def.split(" >")
+ )
except IndexError as e:
- sys.exit("Problem splitting multuple hits?\n%r\n--> %s" % (hit_def, e))
+ sys.exit(
+ "Problem splitting multuple hits?\n%r\n--> %s"
+ % (hit_def, e)
+ )
# print(hit_def, "-->", sallseqid)
positive = hsp.findtext("Hsp_positive")
ppos = "%0.2f" % (100 * float(positive) / float(length))
qframe = hsp.findtext("Hsp_query-frame")
sframe = hsp.findtext("Hsp_hit-frame")
if blast_program == "blastp":
- # Probably a bug in BLASTP that they use 0 or 1 depending on format
+ # Probably a bug in BLASTP that they use 0 or 1
+ # depending on format
if qframe == "0":
qframe = "1"
if sframe == "0":
sframe = "1"
slen = int(hit.findtext("Hit_len"))
- values.extend([sallseqid,
- hsp.findtext("Hsp_score"), # score,
- nident,
- positive,
- hsp.findtext("Hsp_gaps"), # gaps,
- ppos,
- qframe,
- sframe,
- # NOTE - for blastp, XML shows original seq, tabular uses XXX masking
- q_seq,
- h_seq,
- str(qlen),
- str(slen),
- salltitles,
- ])
+ values.extend(
+ [
+ sallseqid,
+ hsp.findtext("Hsp_score"), # score,
+ nident,
+ positive,
+ hsp.findtext("Hsp_gaps"), # gaps,
+ ppos,
+ qframe,
+ sframe,
+ # NOTE - for blastp, XML shows original seq,
+ # tabular uses XXX masking
+ q_seq,
+ h_seq,
+ str(qlen),
+ str(slen),
+ salltitles,
+ ]
+ )
if cols:
# Only a subset of the columns are needed
values = [values[colnames.index(c)] for c in cols]
diff -r e25d3acf6e68 -r 2889433c7ae1 tools/ncbi_blast_plus/check_no_duplicates.py
--- a/tools/ncbi_blast_plus/check_no_duplicates.py Tue Oct 23 08:48:19 2018 -0400
+++ b/tools/ncbi_blast_plus/check_no_duplicates.py Sat Jul 20 18:36:36 2019 -0400
@@ -31,7 +31,7 @@
if not magic:
# Empty file, special case
continue
- elif magic == b'\x1f\x8b':
+ elif magic == b"\x1f\x8b":
# Gzipped
handle = gzip.open(filename, "rt")
elif magic[0:1] == b">":
diff -r e25d3acf6e68 -r 2889433c7ae1 tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml
--- a/tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml Tue Oct 23 08:48:19 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_blastdbcmd_wrapper.xml Sat Jul 20 18:36:36 2019 -0400
@@ -8,7 +8,7 @@
## The command is a Cheetah template which allows some Python based syntax.
## Lines starting hash hash are comments. Galaxy will turn newlines into spaces
-blastdbcmd -dbtype $db_opts.db_type -db "${db_opts.database.fields.path}"
+blastdbcmd -dbtype $db_opts.db_type -db "${db_opts.database.fields.path.replace(',',' ')}"
##TODO: What about -ctrl_a and -target_only as advanced options?
@@ -55,7 +55,7 @@
-
+
@@ -88,6 +88,15 @@
+
+
+
+
+
+
+
+
+
@@ -111,6 +120,12 @@
Extracts FASTA formatted sequences from a BLAST database
using the NCBI BLAST+ blastdbcmd command line tool.
+When giving a text file of entries, use one line per sequence.
+Optional values should be space separated - the simplest syntax
+is ``identifier start-end`` (where ``end`` can be just ``-``),
+or ``identifier start-end strand`` (where the strand is given as
+either ``+`` or ``-``).
+
.. class:: warningmark
**BLAST assigned identifiers**
@@ -131,7 +146,7 @@
-------
-@CLI_OPTIONS
+@CLI_OPTIONS@
-------
diff -r e25d3acf6e68 -r 2889433c7ae1 tools/ncbi_blast_plus/ncbi_macros.xml
--- a/tools/ncbi_blast_plus/ncbi_macros.xml Tue Oct 23 08:48:19 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_macros.xml Sat Jul 20 18:36:36 2019 -0400
@@ -1,5 +1,5 @@
- 0.3.1
+ 0.3.3
diff -r e25d3acf6e68 -r 2889433c7ae1 tools/ncbi_blast_plus/ncbi_makeprofiledb.xml
--- a/tools/ncbi_blast_plus/ncbi_makeprofiledb.xml Tue Oct 23 08:48:19 2018 -0400
+++ b/tools/ncbi_blast_plus/ncbi_makeprofiledb.xml Sat Jul 20 18:36:36 2019 -0400
@@ -90,7 +90,7 @@