changeset 2:6f29bb9960ac draft

v0.0.3 - Fixed SFF; more tests
author peterjc
date Mon, 14 May 2018 12:09:50 -0400
parents 458f987918a6
children fcdf11fb34de
files test-data/MID4_GLZRM4E04_rnd30.length.tabular test-data/MID4_GLZRM4E04_rnd30.sff tools/seq_length/README.rst tools/seq_length/seq_length.py tools/seq_length/seq_length.xml
diffstat 5 files changed, 85 insertions(+), 18 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/MID4_GLZRM4E04_rnd30.length.tabular	Mon May 14 12:09:50 2018 -0400
@@ -0,0 +1,31 @@
+#Identifier	Length
+GLZRM4E04IPVGX	97
+GLZRM4E04I9EJK	168
+GLZRM4E04JHW0A	386
+GLZRM4E04IBNFY	296
+GLZRM4E04IKCQP	447
+GLZRM4E04ITXFU	275
+GLZRM4E04JAY4Z	146
+GLZRM4E04H9M21	152
+GLZRM4E04IQDP5	350
+GLZRM4E04I0CYE	81
+GLZRM4E04I2NA7	97
+GLZRM4E04I5BWJ	248
+GLZRM4E04IK8WX	59
+GLZRM4E04I22QH	383
+GLZRM4E04IVVFA	81
+GLZRM4E04ILU3V	49
+GLZRM4E04IDVJT	320
+GLZRM4E04I3ZJ6	197
+GLZRM4E04I3UBT	288
+GLZRM4E04H59S1	362
+GLZRM4E04JFA38	345
+GLZRM4E04J4EK3	372
+GLZRM4E04IK96G	156
+GLZRM4E04JVL8Q	464
+GLZRM4E04IOQ36	389
+GLZRM4E04JBJJ1	264
+GLZRM4E04IEFNO	473
+GLZRM4E04JOT5I	186
+GLZRM4E04J4HNG	42
+GLZRM4E04JC544	331
Binary file test-data/MID4_GLZRM4E04_rnd30.sff has changed
--- a/tools/seq_length/README.rst	Tue May 08 11:16:50 2018 -0400
+++ b/tools/seq_length/README.rst	Mon May 14 12:09:50 2018 -0400
@@ -62,6 +62,9 @@
 v0.0.1  - Initial version.
 v0.0.2  - Faster for FASTA and FASTQ.
         - Fixed typo.
+v0.0.3  - Improved command line usage (outside of Galaxy).
+        - More tests (now covers SFF as well).
+        - Fix requesting SFF format.
 ======= ======================================================================
 
 
--- a/tools/seq_length/seq_length.py	Tue May 08 11:16:50 2018 -0400
+++ b/tools/seq_length/seq_length.py	Mon May 14 12:09:50 2018 -0400
@@ -20,10 +20,36 @@
 from __future__ import print_function
 
 import sys
+from optparse import OptionParser
 
-if "-v" in sys.argv or "--version" in sys.argv:
-    print("v0.0.2")
+usage = r"""Use as follows to compute all the lengths in a sequence file:
+
+$ python seq_length.py -i example.fasta -f fasta -o lengths.tsv
+"""
+
+parser = OptionParser(usage=usage)
+parser.add_option('-i', '--input', dest='input',
+                  default=None, help='Input sequence filename (FASTA, FASTQ, etc)',
+                  metavar="FILE")
+parser.add_option('-f', '--format', dest='format',
+                  default=None, help='Input sequence format (FASTA, QUAL, FASTQ, SFF)')
+parser.add_option('-o', '--output', dest='output',
+                  default=None, help='Output filename (tabular)',
+                  metavar="FILE")
+parser.add_option("-v", "--version", dest="version",
+                  default=False, action="store_true",
+                  help="Show version and quit")
+options, args = parser.parse_args()
+
+if options.version:
+    print("v0.0.3")
     sys.exit(0)
+if not options.input:
+    sys.exit("Require an input filename")
+if not options.format:
+    sys.exit("Require the input format")
+if not options.output:
+    sys.exit("Require an output filename")
 
 try:
     from Bio import SeqIO
@@ -40,31 +66,25 @@
 except ImportError:
     sys.exit("Biopython tool old?, missing Bio.SeqIO.FastaIO.SimpleFastaParser")
 
+in_file = options.input
+out_file = options.output
 
-# Parse Command Line
-try:
-    in_file, seq_format, out_file = sys.argv[1:]
-except ValueError:
-    sys.exit("Expected three arguments (input file, format, output file), "
-             "got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
-
-
-if seq_format.startswith("fastq"):
+if options.format.startswith("fastq"):
     # We don't care about the quality score encoding, just
     # need to translate Galaxy format name into something
     # Biopython will accept:
     format = "fastq"
-elif seq_format.lower() == "csfasta":
+elif options.format.lower() == "csfasta":
     # I have not tested with colour space FASTA
     format = "fasta"
-elif seq_format.lower == "sff":
+elif options.format.lower() == "sff":
     # The masked/trimmed numbers are more interesting
     format = "sff-trim"
-elif seq_format.lower() in ["fasta", "qual"]:
-    format = seq_format.lower()
+elif options.format.lower() in ["fasta", "qual"]:
+    format = options.format.lower()
 else:
     # TODO: Does Galaxy understand GenBank, EMBL, etc yet?
-    sys.exit("Unexpected format argument: %r" % seq_format)
+    sys.exit("Unexpected format argument: %r" % options.format)
 
 
 count = 0
--- a/tools/seq_length/seq_length.xml	Tue May 08 11:16:50 2018 -0400
+++ b/tools/seq_length/seq_length.xml	Mon May 14 12:09:50 2018 -0400
@@ -1,4 +1,4 @@
-<tool id="seq_length" name="Sequence lengths" version="0.0.2">
+<tool id="seq_length" name="Sequence lengths" version="0.0.3">
     <description>from FASTA, QUAL, FASTQ, or SFF file</description>
     <requirements>
         <!-- This is currently the last release of Biopython which is available via Galaxy's legacy XML packaging system -->
@@ -8,7 +8,7 @@
 python $__tool_directory__/seq_length.py --version
 </version_command>
     <command detect_errors="aggressive">
-python $__tool_directory__/seq_length.py '$input_file' '$input_file.ext' '$output_file'
+python $__tool_directory__/seq_length.py -i '$input_file' -f '$input_file.ext' -o '$output_file'
     </command>
     <inputs>
         <param name="input_file" type="data" format="fasta,qual,fastq,sff" label="Sequence file" help="FASTA, QUAL, FASTQ, or SFF format." />
@@ -20,10 +20,23 @@
         <test>
             <param name="input_file" value="four_human_proteins.fasta" ftype="fasta" />
             <output name="output_file" file="four_human_proteins.length.tabular" ftype="tabular" />
+            <assert_stdout>
+                <has_line line="4 sequences, total length 3297" />
+            </assert_stdout>
         </test>
         <test>
             <param name="input_file" value="SRR639755_sample_strict.fastq" ftype="fastq" />
             <output name="output_file" file="SRR639755_sample_strict.length.tabular" ftype="tabular" />
+            <assert_stdout>
+                <has_line line="2 sequences, total length 202" />
+            </assert_stdout>
+        </test>
+        <test>
+            <param name="input_file" value="MID4_GLZRM4E04_rnd30.sff" ftype="sff" />
+            <output name="output_file" file="MID4_GLZRM4E04_rnd30.length.tabular" ftype="tabular" />
+            <assert_stdout>
+                <has_line line="30 sequences, total length 7504" />
+            </assert_stdout>
         </test>
     </tests>
     <help>