Mercurial > repos > peterjc > sample_seqs
changeset 5:6b71ad5d43fb draft
v0.2.3 clarified help, internal cleanup of Python script
author | peterjc |
---|---|
date | Wed, 01 Feb 2017 09:39:36 -0500 |
parents | d3aa9f25c24c |
children | 31f5701cd2e9 |
files | tools/sample_seqs/README.rst tools/sample_seqs/sample_seqs.py tools/sample_seqs/sample_seqs.xml tools/sample_seqs/tool_dependencies.xml |
diffstat | 4 files changed, 63 insertions(+), 55 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/sample_seqs/README.rst Wed Aug 05 12:30:18 2015 -0400 +++ b/tools/sample_seqs/README.rst Wed Feb 01 09:39:36 2017 -0500 @@ -67,8 +67,10 @@ - Included testing of stdout messages. - Includes testing of failure modes. v0.2.2 - Reorder XML elements (internal change only). - - Use ``format_source=...``` tag. + - Use ``format_source=...`` tag. - Planemo for Tool Shed upload (``.shed.yml``, internal change only). +v0.2.3 - Do the Biopython imports at the script start (internal change only). + - Clarify paired read example in help text. ======= ====================================================================== @@ -82,12 +84,12 @@ Planemo commands (which requires you have set your Tool Shed access details in ``~/.planemo.yml`` and that you have access rights on the Tool Shed):: - $ planemo shed_update --shed_target testtoolshed --check_diff ~/repositories/pico_galaxy/tools/sample_seqs/ + $ planemo shed_update -t testtoolshed --check_diff ~/repositories/pico_galaxy/tools/sample_seqs/ ... or:: - $ planemo shed_update --shed_target toolshed --check_diff ~/repositories/pico_galaxy/tools/sample_seqs/ + $ planemo shed_update -t toolshed --check_diff ~/repositories/pico_galaxy/tools/sample_seqs/ ... To just build and check the tar ball, use::
--- a/tools/sample_seqs/sample_seqs.py Wed Aug 05 12:30:18 2015 -0400 +++ b/tools/sample_seqs/sample_seqs.py Wed Feb 01 09:39:36 2017 -0500 @@ -19,12 +19,7 @@ import sys from optparse import OptionParser - -def sys_exit(msg, err=1): - sys.stderr.write(msg.rstrip() + "\n") - sys.exit(err) - -#Parse Command Line +# Parse Command Line usage = """Use as follows: $ python sample_seqs.py [options] @@ -35,6 +30,10 @@ This samples uniformly though the file, rather than at random, and therefore should be reproducible. + +If you have interleaved paired reads, use the --interleaved switch. If +instead you have two matched files (one for each pair), run the two +twice with the same sampling options to make to matched smaller files. """ parser = OptionParser(usage=usage) parser.add_option('-i', '--input', dest='input', @@ -64,26 +63,33 @@ options, args = parser.parse_args() if options.version: - print("v0.2.1") + print("v0.2.3") sys.exit(0) +try: + from Bio import SeqIO + from Bio.SeqIO.QualityIO import FastqGeneralIterator + from Bio.SeqIO.FastaIO import SimpleFastaParser + from Bio.SeqIO.SffIO import SffIterator, SffWriter +except ImportError: + sys.exit("This script requires Biopython.") + in_file = options.input out_file = options.output interleaved = options.interleaved if not in_file: - sys_exit("Require an input filename") + sys.exit("Require an input filename") if in_file != "/dev/stdin" and not os.path.isfile(in_file): - sys_exit("Missing input file %r" % in_file) + sys.exit("Missing input file %r" % in_file) if not out_file: - sys_exit("Require an output filename") + sys.exit("Require an output filename") if not options.format: - sys_exit("Require the sequence format") + sys.exit("Require the sequence format") seq_format = options.format.lower() def count_fasta(filename): - from Bio.SeqIO.FastaIO import SimpleFastaParser count = 0 with open(filename) as handle: for title, seq in SimpleFastaParser(handle): @@ -92,7 +98,6 @@ def count_fastq(filename): - from Bio.SeqIO.QualityIO import FastqGeneralIterator count = 0 with open(filename) as handle: for title, seq, qual in FastqGeneralIterator(handle): @@ -101,7 +106,6 @@ def count_sff(filename): - from Bio import SeqIO # If the SFF file has a built in index (which is normal), # this will be parsed and is the quicker than scanning # the whole file. @@ -109,29 +113,29 @@ def count_sequences(filename, format): - if seq_format == "sff": + if format == "sff": return count_sff(filename) - elif seq_format == "fasta": + elif format == "fasta": return count_fasta(filename) - elif seq_format.startswith("fastq"): + elif format.startswith("fastq"): return count_fastq(filename) else: - sys_exit("Unsupported file type %r" % seq_format) + sys.exit("Unsupported file type %r" % format) if options.percent and options.everyn: - sys_exit("Cannot combine -p and -n options") + sys.exit("Cannot combine -p and -n options") elif options.everyn and options.count: - sys_exit("Cannot combine -p and -c options") + sys.exit("Cannot combine -p and -c options") elif options.percent and options.count: - sys_exit("Cannot combine -n and -c options") + sys.exit("Cannot combine -n and -c options") elif options.everyn: try: N = int(options.everyn) - except: - sys_exit("Bad -n argument %r" % options.everyn) + except ValueError: + sys.exit("Bad -n argument %r" % options.everyn) if N < 2: - sys_exit("Bad -n argument %r" % options.everyn) + sys.exit("Bad -n argument %r" % options.everyn) if (N % 10) == 1: sys.stderr.write("Sampling every %ist sequence\n" % N) elif (N % 10) == 2: @@ -140,6 +144,7 @@ sys.stderr.write("Sampling every %ird sequence\n" % N) else: sys.stderr.write("Sampling every %ith sequence\n" % N) + def sampler(iterator): global N count = 0 @@ -150,11 +155,12 @@ elif options.percent: try: percent = float(options.percent) / 100.0 - except: - sys_exit("Bad -p percent argument %r" % options.percent) + except ValueError: + sys.exit("Bad -p percent argument %r" % options.percent) if percent <= 0.0 or 1.0 <= percent: - sys_exit("Bad -p percent argument %r" % options.percent) + sys.exit("Bad -p percent argument %r" % options.percent) sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) + def sampler(iterator): global percent count = 0 @@ -167,19 +173,19 @@ elif options.count: try: N = int(options.count) - except: - sys_exit("Bad -c count argument %r" % options.count) + except ValueError: + sys.exit("Bad -c count argument %r" % options.count) if N < 1: - sys_exit("Bad -c count argument %r" % options.count) + sys.exit("Bad -c count argument %r" % options.count) total = count_sequences(in_file, seq_format) sys.stderr.write("Input file has %i sequences\n" % total) if interleaved: # Paired if total % 2: - sys_exit("Paired mode, but input file has an odd number of sequences: %i" + sys.exit("Paired mode, but input file has an odd number of sequences: %i" % total) elif N > total // 2: - sys_exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)." + sys.exit("Requested %i sequence pairs, but file only has %i pairs (%i sequences)." % (N, total // 2, total)) total = total // 2 if N == 1: @@ -191,7 +197,7 @@ else: # Not paired if total < N: - sys_exit("Requested %i sequences, but file only has %i." % (N, total)) + sys.exit("Requested %i sequences, but file only has %i." % (N, total)) if N == 1: sys.stderr.write("Sampling just first sequence!\n") elif N == total: @@ -215,7 +221,7 @@ # i.e. What if percentage comes out slighty too low, and # we could end up missing last few desired sequences? percentage = float(N) / float(total) - #print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f" + # print("DEBUG: Want %i out of %i sequences/pairs, as a percentage %0.2f" # % (N, total, percentage * 100.0)) count = 0 taken = 0 @@ -233,7 +239,7 @@ yield record assert taken == N, "Picked %i, wanted %i" % (taken, N) else: - sys_exit("Must use either -n, -p or -c") + sys.exit("Must use either -n, -p or -c") def pair(iterator): @@ -252,7 +258,7 @@ while True: line = handle.readline() if line == "": - return # Premature end of file, or just empty? + return # Premature end of file, or just empty? if line[0] == ">": break @@ -279,11 +285,12 @@ line = handle.readline() yield "".join(lines) if not line: - return # StopIteration + return # StopIteration + def fasta_filter(in_file, out_file, iterator_filter, inter): count = 0 - #Galaxy now requires Python 2.5+ so can use with statements, + # Galaxy now requires Python 2.5+ so can use with statements, with open(in_file) as in_handle: with open(out_file, "w") as pos_handle: if inter: @@ -298,7 +305,6 @@ return count -from Bio.SeqIO.QualityIO import FastqGeneralIterator def fastq_filter(in_file, out_file, iterator_filter, inter): count = 0 with open(in_file) as in_handle: @@ -318,13 +324,9 @@ def sff_filter(in_file, out_file, iterator_filter, inter): count = 0 try: - from Bio.SeqIO.SffIO import SffIterator, SffWriter - except ImportError: - sys_exit("SFF filtering requires Biopython 1.54 or later") - try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: - #Prior to Biopython 1.56 this was a private function + # Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest with open(in_file, "rb") as in_handle: try: @@ -334,7 +336,7 @@ in_handle.seek(0) with open(out_file, "wb") as out_handle: writer = SffWriter(out_handle, xml=manifest) - in_handle.seek(0) #start again after getting manifest + in_handle.seek(0) # start again after getting manifest if inter: from itertools import chain count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle))))) @@ -342,7 +344,6 @@ count /= 2 else: count = writer.write_file(iterator_filter(SffIterator(in_handle))) - #count = writer.write_file(SffIterator(in_handle)) return count if seq_format == "sff": @@ -352,7 +353,7 @@ elif seq_format.startswith("fastq"): count = fastq_filter(in_file, out_file, sampler, interleaved) else: - sys_exit("Unsupported file type %r" % seq_format) + sys.exit("Unsupported file type %r" % seq_format) if interleaved: sys.stderr.write("Selected %i pairs\n" % count)
--- a/tools/sample_seqs/sample_seqs.xml Wed Aug 05 12:30:18 2015 -0400 +++ b/tools/sample_seqs/sample_seqs.xml Wed Feb 01 09:39:36 2017 -0500 @@ -1,4 +1,4 @@ -<tool id="sample_seqs" name="Sub-sample sequences files" version="0.2.2"> +<tool id="sample_seqs" name="Sub-sample sequences files" version="0.2.3"> <description>e.g. to reduce coverage</description> <requirements> <requirement type="package" version="1.65">biopython</requirement> @@ -205,9 +205,13 @@ For example using 20% would take every 5th pair of records, or you could request 1000 read pairs. +If instead of interleaved paired reads you have two matched files (one +for each pair), run the tool twice with the same sampling options to +make to matched smaller files. + .. class:: warningmark -Note interleaves/pair mode does *not* actually check your read names +Note interleaved/pair mode does *not* actually check your read names match a known pair naming scheme! **Example Usage** @@ -215,8 +219,9 @@ Suppose you have some Illumina paired end data as files ``R1.fastq`` and ``R2.fastq`` which give an estimated x200 coverage, and you wish to do a *de novo* assembly with a tool like MIRA which recommends lower coverage. -Taking every 3rd read would reduce the estimated coverage to about x66, -and would preserve the pairing as well. +Running the tool twice (on ``R1.fastq`` and ``R2.fastq``) taking every +3rd read would reduce the estimated coverage to about x66, and would +preserve the pairing as well (as two smaller FASTQ files). Similarly, if you had some Illumina paired end data interleaved into one file with an estimated x200 coverage, you would run this tool in
--- a/tools/sample_seqs/tool_dependencies.xml Wed Aug 05 12:30:18 2015 -0400 +++ b/tools/sample_seqs/tool_dependencies.xml Wed Feb 01 09:39:36 2017 -0500 @@ -1,6 +1,6 @@ <?xml version="1.0"?> <tool_dependency> <package name="biopython" version="1.65"> - <repository changeset_revision="dc595937617c" name="package_biopython_1_65" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="d8185f5631ed" name="package_biopython_1_65" owner="biopython" toolshed="https://toolshed.g2.bx.psu.edu" /> </package> </tool_dependency>