changeset 9:5f505ed46e16 draft

Bump Biopython dependency
author peterjc
date Thu, 30 Nov 2023 09:50:46 +0000
parents 7c40a1fbc82e
children bdaefd241921
files tools/sample_seqs/README.rst tools/sample_seqs/sample_seqs.py tools/sample_seqs/sample_seqs.xml tools/sample_seqs/tool_dependencies.xml
diffstat 4 files changed, 89 insertions(+), 48 deletions(-) [+]
line wrap: on
line diff
--- a/tools/sample_seqs/README.rst	Tue May 16 09:20:25 2017 -0400
+++ b/tools/sample_seqs/README.rst	Thu Nov 30 09:50:46 2023 +0000
@@ -1,7 +1,7 @@
 Galaxy tool to sub-sample sequence files
 ========================================
 
-This tool is copyright 2014-2017 by Peter Cock, The James Hutton Institute
+This tool is copyright 2014-2023 by Peter Cock, The James Hutton Institute
 (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
 See the licence text below (MIT licence).
 
@@ -62,7 +62,7 @@
 v0.2.0  - Option to give number of sequences (or pairs) desired.
           This works by first counting all your sequences, then calculates
           the percentage required in order to sample them uniformly (evenly).
-          This makes two passes through the input and is therefore slower. 
+          This makes two passes through the input and is therefore slower.
 v0.2.1  - Was missing a file for the functional tests.
         - Included testing of stdout messages.
         - Includes testing of failure modes.
@@ -75,6 +75,7 @@
         - Style changes to Python code (internal change only).
 v0.2.5  - Use ``<command detect_errors="aggressive">`` (internal change only).
         - Single quote command line arguments (internal change only).
+v0.2.6  - Bumped Biopython dependency version for Python 3 fixes.
 ======= ======================================================================
 
 
@@ -100,7 +101,7 @@
 
     $ planemo shed_upload --tar_only tools/sample_seqs/
     ...
-    $ tar -tzf shed_upload.tar.gz 
+    $ tar -tzf shed_upload.tar.gz
     test-data/MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff
     test-data/MID4_GLZRM4E04_rnd30_frclip.sff
     test-data/MID4_GLZRM4E04_rnd30_frclip.sample_C1.sff
--- a/tools/sample_seqs/sample_seqs.py	Tue May 16 09:20:25 2017 -0400
+++ b/tools/sample_seqs/sample_seqs.py	Thu Nov 30 09:50:46 2023 +0000
@@ -7,9 +7,9 @@
 
 Cock et al 2009. Biopython: freely available Python tools for computational
 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
 
-This script is copyright 2014-2015 by Peter Cock, The James Hutton Institute
+This script is copyright 2014-2021 by Peter Cock, The James Hutton Institute
 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
 See accompanying text file for licence details (MIT license).
 
@@ -32,34 +32,61 @@
 should be reproducible.
 
 If you have interleaved paired reads, use the --interleaved switch. If
-instead you have two matched files (one for each pair), run the two
-twice with the same sampling options to make to matched smaller files.
+instead you have two matched files (one for each pair), run this tool
+on each with the same sampling options to make two matched smaller files.
 """
 parser = OptionParser(usage=usage)
-parser.add_option('-i', '--input', dest='input',
-                  default=None, help='Input sequences filename',
-                  metavar="FILE")
-parser.add_option('-f', '--format', dest='format',
-                  default=None,
-                  help='Input sequence format (e.g. fasta, fastq, sff)')
-parser.add_option('-o', '--output', dest='output',
-                  default=None, help='Output sampled sequenced filename',
-                  metavar="FILE")
-parser.add_option('-p', '--percent', dest='percent',
-                  default=None,
-                  help='Take this percent of the reads')
-parser.add_option('-n', '--everyn', dest='everyn',
-                  default=None,
-                  help='Take every N-th read')
-parser.add_option('-c', '--count', dest='count',
-                  default=None,
-                  help='Take exactly N reads')
-parser.add_option("--interleaved", dest="interleaved",
-                  default=False, action="store_true",
-                  help="Input is interleaved reads, preserve the pairings")
-parser.add_option("-v", "--version", dest="version",
-                  default=False, action="store_true",
-                  help="Show version and quit")
+parser.add_option(
+    "-i",
+    "--input",
+    dest="input",
+    default=None,
+    help="Input sequences filename",
+    metavar="FILE",
+)
+parser.add_option(
+    "-f",
+    "--format",
+    dest="format",
+    default=None,
+    help="Input sequence format (e.g. fasta, fastq, sff)",
+)
+parser.add_option(
+    "-o",
+    "--output",
+    dest="output",
+    default=None,
+    help="Output sampled sequenced filename",
+    metavar="FILE",
+)
+parser.add_option(
+    "-p",
+    "--percent",
+    dest="percent",
+    default=None,
+    help="Take this percent of the reads",
+)
+parser.add_option(
+    "-n", "--everyn", dest="everyn", default=None, help="Take every N-th read"
+)
+parser.add_option(
+    "-c", "--count", dest="count", default=None, help="Take exactly N reads"
+)
+parser.add_option(
+    "--interleaved",
+    dest="interleaved",
+    default=False,
+    action="store_true",
+    help="Input is interleaved reads, preserve the pairings",
+)
+parser.add_option(
+    "-v",
+    "--version",
+    dest="version",
+    default=False,
+    action="store_true",
+    help="Show version and quit",
+)
 options, args = parser.parse_args()
 
 if options.version:
@@ -153,12 +180,13 @@
             count += 1
             if count % N == 1:
                 yield record
+
 elif options.percent:
     try:
         percent = float(options.percent) / 100.0
     except ValueError:
         sys.exit("Bad -p percent argument %r" % options.percent)
-    if not(0.0 <= percent <= 1.0):
+    if not (0.0 <= percent <= 1.0):
         sys.exit("Bad -p percent argument %r" % options.percent)
     sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent))
 
@@ -172,6 +200,7 @@
             if percent * count > taken:
                 taken += 1
                 yield record
+
 elif options.count:
     try:
         N = int(options.count)
@@ -184,11 +213,14 @@
     if interleaved:
         # Paired
         if total % 2:
-            sys.exit("Paired mode, but input file has an odd number of sequences: %i"
-                     % total)
+            sys.exit(
+                "Paired mode, but input file has an odd number of sequences: %i" % total
+            )
         elif N > total // 2:
-            sys.exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)."
-                     % (N, total // 2, total))
+            sys.exit(
+                "Requested %i sequence pairs, "
+                "but file only has %i pairs (%i sequences)." % (N, total // 2, total)
+            )
         total = total // 2
         if N == 1:
             sys.stderr.write("Sampling just first sequence pair!\n")
@@ -207,15 +239,18 @@
         else:
             sys.stderr.write("Sampling %i sequences\n" % N)
     if N == total:
+
         def sampler(iterator):
-            """Dummy filter to filter nothing, taking everything."""
+            """No-operation dummy filter, taking everything."""
             global N
             taken = 0
             for record in iterator:
                 taken += 1
                 yield record
             assert taken == N, "Picked %i, wanted %i" % (taken, N)
+
     else:
+
         def sampler(iterator):
             """Sample given number of sequences."""
             # Mimic the percentage sampler, with double check on final count
@@ -241,6 +276,7 @@
                     taken += 1
                     yield record
             assert taken == N, "Picked %i, wanted %i" % (taken, N)
+
 else:
     sys.exit("Must use either -n, -p or -c")
 
@@ -257,7 +293,7 @@
 
 
 def raw_fasta_iterator(handle):
-    """Yields raw FASTA records as multi-line strings."""
+    """Yield raw FASTA records as multi-line strings."""
     while True:
         line = handle.readline()
         if line == "":
@@ -268,8 +304,7 @@
     no_id_warned = False
     while True:
         if line[0] != ">":
-            raise ValueError(
-                "Records in Fasta files should start with '>' character")
+            raise ValueError("Records in Fasta files should start with '>' character")
         try:
             line[1:].split(None, 1)[0]
         except IndexError:
@@ -317,7 +352,9 @@
                     pos_handle.write("@%s\n%s\n+\n%s\n" % r1)
                     pos_handle.write("@%s\n%s\n+\n%s\n" % r2)
             else:
-                for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)):
+                for title, seq, qual in iterator_filter(
+                    FastqGeneralIterator(in_handle)
+                ):
                     count += 1
                     pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
     return count
@@ -341,7 +378,10 @@
             in_handle.seek(0)  # start again after getting manifest
             if inter:
                 from itertools import chain
-                count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle)))))
+
+                count = writer.write_file(
+                    chain.from_iterable(iterator_filter(pair(SffIterator(in_handle))))
+                )
                 assert count % 2 == 0, "Odd number of records? %i" % count
                 count /= 2
             else:
--- a/tools/sample_seqs/sample_seqs.xml	Tue May 16 09:20:25 2017 -0400
+++ b/tools/sample_seqs/sample_seqs.xml	Thu Nov 30 09:50:46 2023 +0000
@@ -1,7 +1,7 @@
-<tool id="sample_seqs" name="Sub-sample sequences files" version="0.2.5">
+<tool id="sample_seqs" name="Sub-sample sequences files" version="0.2.6">
     <description>e.g. to reduce coverage</description>
     <requirements>
-        <requirement type="package" version="1.67">biopython</requirement>
+        <requirement type="package" version="1.81">biopython</requirement>
     </requirements>
     <version_command>
 python $__tool_directory__/sample_seqs.py --version
@@ -235,7 +235,7 @@
 
 Cock et al (2009). Biopython: freely available Python tools for computational
 molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
 
 This tool is available to install into other Galaxy Instances via the Galaxy
 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/sample_seqs
--- a/tools/sample_seqs/tool_dependencies.xml	Tue May 16 09:20:25 2017 -0400
+++ b/tools/sample_seqs/tool_dependencies.xml	Thu Nov 30 09:50:46 2023 +0000
@@ -1,6 +1,6 @@
-<?xml version="1.0"?>
+<?xml version="1.0" ?>
 <tool_dependency>
     <package name="biopython" version="1.67">
-        <repository changeset_revision="a42f244cce44" name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" />
+        <repository name="package_biopython_1_67" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" changeset_revision="a12f73c3b116"/>
     </package>
-</tool_dependency>
+</tool_dependency>
\ No newline at end of file