# HG changeset patch # User peterjc # Date 1618611767 0 # Node ID f82868a026eac23be8680578df3b5f77a5456959 # Parent 481b0a925e66adc1602fe1a20487d304552a2b91 "v0.0.7 - long overdue upload to main Tool Shed" diff -r 481b0a925e66 -r f82868a026ea tools/seq_filter_by_mapping/README.rst --- a/tools/seq_filter_by_mapping/README.rst Wed May 17 09:24:01 2017 -0400 +++ b/tools/seq_filter_by_mapping/README.rst Fri Apr 16 22:22:47 2021 +0000 @@ -71,6 +71,7 @@ - Use ```` (internal change only). - Single quote command line arguments (internal change only). v0.0.6 - Python 3 compatible print function. +v0.0.7 - Script works on Python 2 and 3 (fixed input file mode) ======= ====================================================================== diff -r 481b0a925e66 -r f82868a026ea tools/seq_filter_by_mapping/seq_filter_by_mapping.py --- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.py Wed May 17 09:24:01 2017 -0400 +++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.py Fri Apr 16 22:22:47 2021 +0000 @@ -10,7 +10,7 @@ Cock et al 2009. Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. +https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878. This script is copyright 2014 by Peter Cock, The James Hutton Institute (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved. @@ -40,31 +40,57 @@ Multiple SAM/BAM mapping files may be given. """ parser = OptionParser(usage=usage) -parser.add_option('-i', '--input', dest='input', - default=None, help='Input sequences filename', - metavar="FILE") -parser.add_option('-f', '--format', dest='format', - default=None, - help='Input sequence format (e.g. fasta, fastq, sff)') -parser.add_option('-p', '--positive', dest='output_positive', - default=None, - help='Output filename for mapping reads', - metavar="FILE") -parser.add_option('-n', '--negative', dest='output_negative', - default=None, - help='Output filename for non-mapping reads', - metavar="FILE") -parser.add_option("-m", "--pair-mode", dest="pair_mode", - default="lax", - help="How to treat paired reads (lax or strict, default lax)") -parser.add_option("-v", "--version", dest="version", - default=False, action="store_true", - help="Show version and quit") +parser.add_option( + "-i", + "--input", + dest="input", + default=None, + help="Input sequences filename", + metavar="FILE", +) +parser.add_option( + "-f", + "--format", + dest="format", + default=None, + help="Input sequence format (e.g. fasta, fastq, sff)", +) +parser.add_option( + "-p", + "--positive", + dest="output_positive", + default=None, + help="Output filename for mapping reads", + metavar="FILE", +) +parser.add_option( + "-n", + "--negative", + dest="output_negative", + default=None, + help="Output filename for non-mapping reads", + metavar="FILE", +) +parser.add_option( + "-m", + "--pair-mode", + dest="pair_mode", + default="lax", + help="How to treat paired reads (lax or strict, default lax)", +) +parser.add_option( + "-v", + "--version", + dest="version", + default=False, + action="store_true", + help="Show version and quit", +) options, args = parser.parse_args() if options.version: - print("v0.0.6") + print("v0.0.7") sys.exit(0) in_file = options.input @@ -114,7 +140,7 @@ if match: # Use the fact this is a suffix, and regular expression will be # anchored to the end of the name: - return name[:match.start()] + return name[: match.start()] else: # Nothing to do return name @@ -128,19 +154,19 @@ assert clean_name("baz.q2") == "baz" mapped_chars = { - '>': '__gt__', - '<': '__lt__', - "'": '__sq__', - '"': '__dq__', - '[': '__ob__', - ']': '__cb__', - '{': '__oc__', - '}': '__cc__', - '@': '__at__', - '\n': '__cn__', - '\r': '__cr__', - '\t': '__tc__', - '#': '__pd__', + ">": "__gt__", + "<": "__lt__", + "'": "__sq__", + '"': "__dq__", + "[": "__ob__", + "]": "__cb__", + "{": "__oc__", + "}": "__cc__", + "@": "__at__", + "\n": "__cn__", + "\r": "__cr__", + "\t": "__tc__", + "#": "__pd__", } @@ -149,21 +175,25 @@ Parses BAM files via call out to samtools view command. """ - handle = open(filename, "rb") - magic = handle.read(4) + with open(filename, "rb") as handle: + magic = handle.read(4) + if magic == b"\x1f\x8b\x08\x04": # Presumably a BAM file then... - handle.close() # Call samtools view, don't need header so no -h added: - child = subprocess.Popen(["samtools", "view", filename], - stdin=None, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + child = subprocess.Popen( + ["samtools", "view", filename], + universal_newlines=True, + stdin=None, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) handle = child.stdout else: # Presumably a SAM file... child = None - handle.seek(0) + # Open in text mode + handle = open(filename) # Handle should now contain SAM records for line in handle: # Ignore header lines @@ -178,7 +208,7 @@ ids.add(qname) elif pair_mode == "strict": # For paired reads, require BOTH be mapped. - if (flag & 0x4): + if flag & 0x4: # This is unmapped, ignore it pass elif not (flag & 0x1): @@ -192,8 +222,11 @@ stdout, stderr = child.communicate() assert child.returncode is not None if child.returncode: - msg = "Error %i from 'samtools view %s'\n%s" % (child.returncode, - filename, stderr) + msg = "Error %i from 'samtools view %s'\n%s" % ( + child.returncode, + filename, + stderr, + ) sys.exit(msg.strip(), child.returncode) else: handle.close() @@ -211,7 +244,7 @@ def crude_fasta_iterator(handle): - """Yields tuples, record ID and the full record as a string.""" + """Yield tuples, record ID and the full record as a string.""" while True: line = handle.readline() if line == "": @@ -222,8 +255,7 @@ no_id_warned = False while True: if line[0] != ">": - raise ValueError( - "Records in Fasta files should start with '>' character") + raise ValueError("Records in Fasta files should start with '>' character") try: id = line[1:].split(None, 1)[0] except IndexError: @@ -288,6 +320,7 @@ def fastq_filter(in_file, pos_file, neg_file, wanted): """FASTQ filter.""" from Bio.SeqIO.QualityIO import FastqGeneralIterator + pos_count = neg_count = 0 handle = open(in_file, "r") if pos_file is not None and neg_file is not None: @@ -355,13 +388,17 @@ out_handle = open(pos_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) # start again after getting manifest - pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted) + pos_count = writer.write_file( + rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted + ) out_handle.close() if neg_file is not None: out_handle = open(neg_file, "wb") writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) # start again - neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted) + neg_count = writer.write_file( + rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted + ) out_handle.close() # And we're done in_handle.close() @@ -377,7 +414,9 @@ else: sys.exit("Unsupported file type %r" % seq_format) -pos_count, neg_count = sequence_filter(in_file, out_positive_file, out_negative_file, ids) +pos_count, neg_count = sequence_filter( + in_file, out_positive_file, out_negative_file, ids +) print("%i mapped and %i unmapped reads." % (pos_count, neg_count)) fraction = float(pos_count) * 100.0 / float(pos_count + neg_count) print("In total %i reads, of which %0.1f%% mapped." % (pos_count + neg_count, fraction)) diff -r 481b0a925e66 -r f82868a026ea tools/seq_filter_by_mapping/seq_filter_by_mapping.xml --- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml Wed May 17 09:24:01 2017 -0400 +++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml Fri Apr 16 22:22:47 2021 +0000 @@ -1,4 +1,4 @@ - + from SAM/BAM file biopython @@ -21,7 +21,7 @@ - + @@ -33,12 +33,12 @@ - + - - + --> @@ -50,19 +50,19 @@ - + - + - + @@ -97,7 +97,7 @@ Cock et al (2009). Biopython: freely available Python tools for computational molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. -http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. +https://doi.org/10.1093/bioinformatics/btp163 pmid:19304878. This tool is available to install into other Galaxy Instances via the Galaxy Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_mapping diff -r 481b0a925e66 -r f82868a026ea tools/seq_filter_by_mapping/tool_dependencies.xml --- a/tools/seq_filter_by_mapping/tool_dependencies.xml Wed May 17 09:24:01 2017 -0400 +++ b/tools/seq_filter_by_mapping/tool_dependencies.xml Fri Apr 16 22:22:47 2021 +0000 @@ -1,9 +1,9 @@ - + - + - + - + \ No newline at end of file