# HG changeset patch
# User peterjc
# Date 1431528989 14400
# Node ID 91f55ee8fea5f99219e777cd4825cfa1980dae1b
# Parent 1a83f5ab9e95742eb387e2226e1d11c5ad67350b
v0.0.11; more tests and assorted minor changes
diff -r 1a83f5ab9e95 -r 91f55ee8fea5 test-data/k12_hypothetical_alt.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/k12_hypothetical_alt.tabular Wed May 13 10:56:29 2015 -0400
@@ -0,0 +1,2 @@
+#ID and Description Length
+gi|16127999|ref|NP_414546.1| hypothetical protein b0005 [Escherichia coli str. K-12 substr. MG1655] 98
diff -r 1a83f5ab9e95 -r 91f55ee8fea5 tools/seq_select_by_id/README.rst
--- a/tools/seq_select_by_id/README.rst Thu Nov 21 04:54:59 2013 -0500
+++ b/tools/seq_select_by_id/README.rst Wed May 13 10:56:29 2015 -0400
@@ -1,7 +1,7 @@
Galaxy tool to select FASTA, QUAL, FASTQ or SFF sequences by ID
===============================================================
-This tool is copyright 2011-2013 by Peter Cock, The James Hutton Institute
+This tool is copyright 2011-2015 by Peter Cock, The James Hutton Institute
(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
See the licence text below.
@@ -36,20 +36,20 @@
There are just two files to install to use this tool from within Galaxy:
-* seq_select_by_id.py (the Python script)
-* seq_select_by_id.xml (the Galaxy tool definition)
+* ``seq_select_by_id.py`` (the Python script)
+* ``seq_select_by_id.xml`` (the Galaxy tool definition)
-The suggested location is a dedicated tools/seq_select_by_id folder.
+The suggested location is a dedicated ``tools/seq_select_by_id`` folder.
-You will also need to modify the tools_conf.xml file to tell Galaxy to offer the
+You will also need to modify the ``tools_conf.xml`` file to tell Galaxy to offer the
tool. One suggested location is in the filters section. Simply add the line::
-If you wish to run the unit tests, also add this to tools_conf.xml.sample
-and move/copy the test-data files under Galaxy's test-data folder. Then::
+If you wish to run the unit tests, also move/copy the ``test-data/`` files
+under Galaxy's ``test-data/`` folder. Then::
- $ ./run_functional_tests.sh -id seq_select_by_id
+ $ ./run_tests.sh -id seq_select_by_id
You will also need to install Biopython 1.54 or later. That's it.
@@ -73,7 +73,18 @@
- Fixed Biopython dependency setup.
- Development moved to GitHub, https://github.com/peterjc/pico_galaxy
- Renamed folder and adopted README.rst naming.
-v0.0.8 - Corrected automated dependency definition
+v0.0.8 - Corrected automated dependency definition.
+v0.0.9 - Simplified XML to apply input format to output data.
+ - Tool definition now embeds citation information.
+ - Include input dataset name in output dataset names.
+ - If white space is found in the requested tabular field then only
+ the first word is used as the identifier (with a warning to stderr).
+v0.0.10 - Includes testing of stdout messages.
+ - Includes testing of failure modes.
+v0.0.11 - Use the ``format_source=...`` tag.
+ - Reorder XML elements (internal change only).
+ - Planemo for Tool Shed upload (``.shed.yml``, internal change only).
+ - Quote filenames in case of spaces (internal change only).
======= ======================================================================
@@ -86,21 +97,31 @@
Development has now moved to a dedicated GitHub repository:
https://github.com/peterjc/pico_galaxy/tree/master/tools
-For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
-the following command from the Galaxy root folder::
+For pushing a release to the test or main "Galaxy Tool Shed", use the following
+Planemo commands (which require that you have set your Tool Shed access details in
+``~/.planemo.yml`` and that you have access rights on the Tool Shed)::
+
+ $ planemo shed_upload --shed_target testtoolshed --check_diff ~/repositories/pico_galaxy/tools/seq_select_by_id/
+ ...
+
+or::
- $ tar -czf seq_select_by_id.tar.gz tools/seq_select_by_id/README.rst tools/seq_select_by_id/seq_select_by_id.* tools/seq_select_by_id/tool_dependencies.xml test-data/k12_ten_proteins.fasta test-data/k12_hypothetical.fasta test-data/k12_hypothetical.tabular
+ $ planemo shed_upload --shed_target toolshed --check_diff ~/repositories/pico_galaxy/tools/seq_select_by_id/
+ ...
+
+To just build and check the tarball, use::
-Check this worked::
-
- $ tar -tzf seq_select_by_id.tar.gz
+ $ planemo shed_upload --tar_only ~/repositories/pico_galaxy/tools/seq_select_by_id/
+ ...
+ $ tar -tzf shed_upload.tar.gz
+ test-data/k12_hypothetical.fasta
+ test-data/k12_hypothetical.tabular
+ test-data/k12_hypothetical_alt.tabular
+ test-data/k12_ten_proteins.fasta
tools/seq_select_by_id/README.rst
tools/seq_select_by_id/seq_select_by_id.py
tools/seq_select_by_id/seq_select_by_id.xml
tools/seq_select_by_id/tool_dependencies.xml
- test-data/k12_ten_proteins.fasta
- test-data/k12_hypothetical.fasta
- test-data/k12_hypothetical.tabular
Licence (MIT)
diff -r 1a83f5ab9e95 -r 91f55ee8fea5 tools/seq_select_by_id/seq_select_by_id.py
--- a/tools/seq_select_by_id/seq_select_by_id.py Thu Nov 21 04:54:59 2013 -0500
+++ b/tools/seq_select_by_id/seq_select_by_id.py Wed May 13 10:56:29 2015 -0400
@@ -19,34 +19,32 @@
This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute UK.
All rights reserved. See accompanying text file for licence details (MIT
license).
-
-This is version 0.0.6 of the script.
"""
import sys
-def stop_err(msg, err=1):
+def sys_exit(msg, err=1):
sys.stderr.write(msg.rstrip() + "\n")
sys.exit(err)
if "-v" in sys.argv or "--version" in sys.argv:
- print "v0.0.6"
+ print "v0.0.9"
sys.exit(0)
#Parse Command Line
try:
tabular_file, col_arg, in_file, seq_format, out_file = sys.argv[1:]
except ValueError:
- stop_err("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
+ sys_exit("Expected five arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
try:
if col_arg.startswith("c"):
column = int(col_arg[1:])-1
else:
column = int(col_arg)-1
except ValueError:
- stop_err("Expected column number, got %s" % col_arg)
+ sys_exit("Expected column number, got %s" % col_arg)
if seq_format == "fastqcssanger":
- stop_err("Colorspace FASTQ not supported.")
+ sys_exit("Colorspace FASTQ not supported.")
elif seq_format.lower() in ["sff", "fastq", "qual", "fasta"]:
seq_format = seq_format.lower()
elif seq_format.lower().startswith("fastq"):
@@ -56,22 +54,35 @@
#We don't care what the scores are
seq_format = "qual"
else:
- stop_err("Unrecognised file format %r" % seq_format)
+ sys_exit("Unrecognised file format %r" % seq_format)
try:
from Bio import SeqIO
except ImportError:
- stop_err("Biopython 1.54 or later is required")
+ sys_exit("Biopython 1.54 or later is required")
def parse_ids(tabular_file, col):
- """Read tabular file and record all specified identifiers."""
+ """Read tabular file and record all specified identifiers.
+
+ Will print a single warning to stderr if any of the fields have
+ non-trailing white space (only the first word will be used as
+ the identifier).
+ """
handle = open(tabular_file, "rU")
+ warn = False
for line in handle:
if line.strip() and not line.startswith("#"):
- yield line.rstrip("\n").split("\t")[col].strip()
+ field = line.rstrip("\n").split("\t")[col].strip()
+ parts = field.split(None, 1)
+ if len(parts) > 1 and not warn:
+ warn = "WARNING: Some of your identifiers had white space in them, " + \
+ "using first word only. e.g.:\n%s\n" % field
+ yield parts[0]
handle.close()
+ if warn:
+ sys.stderr.write(warn)
#Index the sequence file.
#If very big, could use SeqIO.index_db() to avoid memory bottleneck...
@@ -83,7 +94,7 @@
try:
from Bio.SeqIO.SffIO import SffIterator, SffWriter
except ImportError:
- stop_err("Requires Biopython 1.54 or later")
+ sys_exit("Requires Biopython 1.54 or later")
try:
from Bio.SeqIO.SffIO import ReadRocheXmlManifest
@@ -109,7 +120,7 @@
except KeyError, err:
out_handle.close()
if name not in records:
- stop_err("Identifier %r not found in sequence file" % name)
+ sys_exit("Identifier %r not found in sequence file" % name)
else:
raise err
out_handle.close()
@@ -123,7 +134,7 @@
out_handle.write(records.get_raw(name))
except KeyError:
out_handle.close()
- stop_err("Identifier %r not found in sequence file" % name)
+ sys_exit("Identifier %r not found in sequence file" % name)
count += 1
out_handle.close()
diff -r 1a83f5ab9e95 -r 91f55ee8fea5 tools/seq_select_by_id/seq_select_by_id.xml
--- a/tools/seq_select_by_id/seq_select_by_id.xml Thu Nov 21 04:54:59 2013 -0500
+++ b/tools/seq_select_by_id/seq_select_by_id.xml Wed May 13 10:56:29 2015 -0400
@@ -1,35 +1,25 @@
-
+
from a tabular file
biopython
Bio
- seq_select_by_id.py --version
-
-seq_select_by_id.py $input_tabular $column $input_file $input_file.ext $output_file
-
+ seq_select_by_id.py --version
+
+seq_select_by_id.py "$input_tabular" "$column" "$input_file" "$input_file.ext" "$output_file"
+
-
-
-
-
-
-
-
-
-
-
-
+
@@ -37,6 +27,35 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -69,4 +88,8 @@
This tool is available to install into other Galaxy Instances via the Galaxy
Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_select_by_id
+
+ 10.7717/peerj.167
+ 10.1093/bioinformatics/btp163
+
diff -r 1a83f5ab9e95 -r 91f55ee8fea5 tools/seq_select_by_id/tool_dependencies.xml
--- a/tools/seq_select_by_id/tool_dependencies.xml Thu Nov 21 04:54:59 2013 -0500
+++ b/tools/seq_select_by_id/tool_dependencies.xml Wed May 13 10:56:29 2015 -0400
@@ -1,6 +1,6 @@
-
+