# HG changeset patch # User peterjc # Date 1701337846 0 # Node ID 5f505ed46e16fbe019943bc2ab5de78a52c06d3f # Parent 7c40a1fbc82eb876c83328915cea143d75b41d2e Bump Biopython dependency diff -r 7c40a1fbc82e -r 5f505ed46e16 tools/sample_seqs/README.rst --- a/tools/sample_seqs/README.rst Tue May 16 09:20:25 2017 -0400 +++ b/tools/sample_seqs/README.rst Thu Nov 30 09:50:46 2023 +0000 @@ -1,7 +1,7 @@ Galaxy tool to sub-sample sequence files ======================================== -This tool is copyright 2014-2017 by Peter Cock, The James Hutton Institute +This tool is copyright 2014-2023 by Peter Cock, The James Hutton Institute (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. See the licence text below (MIT licence). @@ -62,7 +62,7 @@ v0.2.0 - Option to give number of sequences (or pairs) desired. This works by first counting all your sequences, then calculates the percentage required in order to sample them uniformly (evenly). - This makes two passes through the input and is therefore slower. + This makes two passes through the input and is therefore slower. v0.2.1 - Was missing a file for the functional tests. - Included testing of stdout messages. - Includes testing of failure modes. @@ -75,6 +75,7 @@ - Style changes to Python code (internal change only). v0.2.5 - Use ```` (internal change only). - Single quote command line arguments (internal change only). +v0.2.6 - Bumped Biopython dependency version for Python 3 fixes. ======= ====================================================================== @@ -100,7 +101,7 @@ $ planemo shed_upload --tar_only tools/sample_seqs/ ... - $ tar -tzf shed_upload.tar.gz + $ tar -tzf shed_upload.tar.gz test-data/MID4_GLZRM4E04_rnd30_frclip.pair_sample_N5.sff test-data/MID4_GLZRM4E04_rnd30_frclip.sff test-data/MID4_GLZRM4E04_rnd30_frclip.sample_C1.sff diff -r 7c40a1fbc82e -r 5f505ed46e16 tools/sample_seqs/sample_seqs.py --- a/tools/sample_seqs/sample_seqs.py Tue May 16 09:20:25 2017 -0400 +++ b/tools/sample_seqs/sample_seqs.py Thu Nov 30 09:50:46 2023 +0000 @@ -7,9 +7,9 @@ Cock et al 2009. Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. +https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878. -This script is copyright 2014-2015 by Peter Cock, The James Hutton Institute +This script is copyright 2014-2021 by Peter Cock, The James Hutton Institute (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. See accompanying text file for licence details (MIT license). @@ -32,34 +32,61 @@ should be reproducible. If you have interleaved paired reads, use the --interleaved switch. If -instead you have two matched files (one for each pair), run the two -twice with the same sampling options to make to matched smaller files. +instead you have two matched files (one for each pair), run this tool +on each with the same sampling options to make two matched smaller files. """ parser = OptionParser(usage=usage) -parser.add_option('-i', '--input', dest='input', - default=None, help='Input sequences filename', - metavar="FILE") -parser.add_option('-f', '--format', dest='format', - default=None, - help='Input sequence format (e.g. fasta, fastq, sff)') -parser.add_option('-o', '--output', dest='output', - default=None, help='Output sampled sequenced filename', - metavar="FILE") -parser.add_option('-p', '--percent', dest='percent', - default=None, - help='Take this percent of the reads') -parser.add_option('-n', '--everyn', dest='everyn', - default=None, - help='Take every N-th read') -parser.add_option('-c', '--count', dest='count', - default=None, - help='Take exactly N reads') -parser.add_option("--interleaved", dest="interleaved", - default=False, action="store_true", - help="Input is interleaved reads, preserve the pairings") -parser.add_option("-v", "--version", dest="version", - default=False, action="store_true", - help="Show version and quit") +parser.add_option( + "-i", + "--input", + dest="input", + default=None, + help="Input sequences filename", + metavar="FILE", +) +parser.add_option( + "-f", + "--format", + dest="format", + default=None, + help="Input sequence format (e.g. fasta, fastq, sff)", +) +parser.add_option( + "-o", + "--output", + dest="output", + default=None, + help="Output sampled sequenced filename", + metavar="FILE", +) +parser.add_option( + "-p", + "--percent", + dest="percent", + default=None, + help="Take this percent of the reads", +) +parser.add_option( + "-n", "--everyn", dest="everyn", default=None, help="Take every N-th read" +) +parser.add_option( + "-c", "--count", dest="count", default=None, help="Take exactly N reads" +) +parser.add_option( + "--interleaved", + dest="interleaved", + default=False, + action="store_true", + help="Input is interleaved reads, preserve the pairings", +) +parser.add_option( + "-v", + "--version", + dest="version", + default=False, + action="store_true", + help="Show version and quit", +) options, args = parser.parse_args() if options.version: @@ -153,12 +180,13 @@ count += 1 if count % N == 1: yield record + elif options.percent: try: percent = float(options.percent) / 100.0 except ValueError: sys.exit("Bad -p percent argument %r" % options.percent) - if not(0.0 <= percent <= 1.0): + if not (0.0 <= percent <= 1.0): sys.exit("Bad -p percent argument %r" % options.percent) sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) @@ -172,6 +200,7 @@ if percent * count > taken: taken += 1 yield record + elif options.count: try: N = int(options.count) @@ -184,11 +213,14 @@ if interleaved: # Paired if total % 2: - sys.exit("Paired mode, but input file has an odd number of sequences: %i" - % total) + sys.exit( + "Paired mode, but input file has an odd number of sequences: %i" % total + ) elif N > total // 2: - sys.exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)." - % (N, total // 2, total)) + sys.exit( + "Requested %i sequence pairs, " + "but file only has %i pairs (%i sequences)." % (N, total // 2, total) + ) total = total // 2 if N == 1: sys.stderr.write("Sampling just first sequence pair!\n") @@ -207,15 +239,18 @@ else: sys.stderr.write("Sampling %i sequences\n" % N) if N == total: + def sampler(iterator): - """Dummy filter to filter nothing, taking everything.""" + """No-operation dummy filter, taking everything.""" global N taken = 0 for record in iterator: taken += 1 yield record assert taken == N, "Picked %i, wanted %i" % (taken, N) + else: + def sampler(iterator): """Sample given number of sequences.""" # Mimic the percentage sampler, with double check on final count @@ -241,6 +276,7 @@ taken += 1 yield record assert taken == N, "Picked %i, wanted %i" % (taken, N) + else: sys.exit("Must use either -n, -p or -c") @@ -257,7 +293,7 @@ def raw_fasta_iterator(handle): - """Yields raw FASTA records as multi-line strings.""" + """Yield raw FASTA records as multi-line strings.""" while True: line = handle.readline() if line == "": @@ -268,8 +304,7 @@ no_id_warned = False while True: if line[0] != ">": - raise ValueError( - "Records in Fasta files should start with '>' character") + raise ValueError("Records in Fasta files should start with '>' character") try: line[1:].split(None, 1)[0] except IndexError: @@ -317,7 +352,9 @@ pos_handle.write("@%s\n%s\n+\n%s\n" % r1) pos_handle.write("@%s\n%s\n+\n%s\n" % r2) else: - for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)): + for title, seq, qual in iterator_filter( + FastqGeneralIterator(in_handle) + ): count += 1 pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) return count @@ -341,7 +378,10 @@ in_handle.seek(0) # start again after getting manifest if inter: from itertools import chain - count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle))))) + + count = writer.write_file( + chain.from_iterable(iterator_filter(pair(SffIterator(in_handle)))) + ) assert count % 2 == 0, "Odd number of records? %i" % count count /= 2 else: diff -r 7c40a1fbc82e -r 5f505ed46e16 tools/sample_seqs/sample_seqs.xml --- a/tools/sample_seqs/sample_seqs.xml Tue May 16 09:20:25 2017 -0400 +++ b/tools/sample_seqs/sample_seqs.xml Thu Nov 30 09:50:46 2023 +0000 @@ -1,7 +1,7 @@ - + e.g. to reduce coverage - biopython + biopython python $__tool_directory__/sample_seqs.py --version @@ -235,7 +235,7 @@ Cock et al (2009). Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. +https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878. This tool is available to install into other Galaxy Instances via the Galaxy Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/sample_seqs diff -r 7c40a1fbc82e -r 5f505ed46e16 tools/sample_seqs/tool_dependencies.xml --- a/tools/sample_seqs/tool_dependencies.xml Tue May 16 09:20:25 2017 -0400 +++ b/tools/sample_seqs/tool_dependencies.xml Thu Nov 30 09:50:46 2023 +0000 @@ -1,6 +1,6 @@ - + - + - + \ No newline at end of file