# HG changeset patch # User peterjc # Date 1494436604 14400 # Node ID 48e71dfd51b3c1363901093c72b5730c424b0e0f # Parent 8ff0ac66f1a30e477c1a12d5926038882db8a253 v0.0.5 Depend on Biopython 1.67 from Tool Shed or (Bio)conda diff -r 8ff0ac66f1a3 -r 48e71dfd51b3 tools/seq_filter_by_mapping/README.rst --- a/tools/seq_filter_by_mapping/README.rst Wed May 13 11:08:58 2015 -0400 +++ b/tools/seq_filter_by_mapping/README.rst Wed May 10 13:16:44 2017 -0400 @@ -1,7 +1,7 @@ Galaxy tool to filter FASTA, FASTQ or SFF sequences by SAM/BAM mapping ====================================================================== -This tool is copyright 2014-2015 by Peter Cock, The James Hutton Institute +This tool is copyright 2014-2017 by Peter Cock, The James Hutton Institute (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. See the licence text below. @@ -66,6 +66,10 @@ v0.0.4 - Use the ``format_source=...`` tag. - Reorder XML elements (internal change only). - Planemo for Tool Shed upload (``.shed.yml``, internal change only). +v0.0.5 - Python script cleanups (internal change only). + - Depends on Biopython 1.67 via legacy Tool Shed package or bioconda. + - Use ```` (internal change only). + - Single quote command line arguments (internal change only). ======= ====================================================================== @@ -82,17 +86,17 @@ Planemo commands (which requires you have set your Tool Shed access details in ``~/.planemo.yml`` and that you have access rights on the Tool Shed):: - $ planemo shed_upload --shed_target testtoolshed --check_diff ~/repositories/pico_galaxy/tools/seq_filter_by_mapping/ + $ planemo shed_update -t testtoolshed --check_diff tools/seq_filter_by_mapping/ ... or:: - $ planemo shed_upload --shed_target toolshed --check_diff ~/repositories/pico_galaxy/tools/seq_filter_by_mapping/ + $ planemo shed_update -t toolshed --check_diff tools/seq_filter_by_mapping/ ... To just build and check the tar ball, use:: - $ planemo shed_upload --tar_only ~/repositories/pico_galaxy/tools/seq_filter_by_mapping/ + $ planemo shed_upload --tar_only tools/seq_filter_by_mapping/ ... $ tar -tzf shed_upload.tar.gz test-data/SRR639755_mito_pairs.fastq.gz diff -r 8ff0ac66f1a3 -r 48e71dfd51b3 tools/seq_filter_by_mapping/seq_filter_by_mapping.py --- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.py Wed May 13 11:08:58 2015 -0400 +++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.py Wed May 10 13:16:44 2017 -0400 @@ -18,17 +18,15 @@ Use -v or --version to get the version, -h or --help for help. """ + import os -import sys import re import subprocess +import sys + from optparse import OptionParser -def sys_exit(msg, err=1): - sys.stderr.write(msg.rstrip() + "\n") - sys.exit(err) - -#Parse Command Line +# Parse Command Line usage = """Use as follows: $ python seq_filter_by_mapping.py [options] mapping.sam/bam [more mappings] @@ -64,7 +62,7 @@ options, args = parser.parse_args() if options.version: - print "v0.0.3" + print "v0.0.5" sys.exit(0) in_file = options.input @@ -74,27 +72,27 @@ pair_mode = options.pair_mode if in_file is None or not os.path.isfile(in_file): - sys_exit("Missing input file: %r" % in_file) + sys.exit("Missing input file: %r" % in_file) if out_positive_file is None and out_negative_file is None: - sys_exit("Neither output file requested") + sys.exit("Neither output file requested") if seq_format is None: - sys_exit("Missing sequence format") + sys.exit("Missing sequence format") if pair_mode not in ["lax", "strict"]: - sys_exit("Pair mode argument should be 'lax' or 'strict', not %r" % pair_mode) + sys.exit("Pair mode argument should be 'lax' or 'strict', not %r" % pair_mode) for mapping in args: if not os.path.isfile(mapping): - sys_exit("Mapping file %r not found" % mapping) + sys.exit("Mapping file %r not found" % mapping) if not args: - sys_exit("At least one SAM/BAM mapping file is required") + sys.exit("At least one SAM/BAM mapping file is required") -#Cope with three widely used suffix naming convensions, -#Illumina: /1 or /2 -#Forward/revered: .f or .r -#Sanger, e.g. .p1k and .q1k -#See http://staden.sourceforge.net/manual/pregap4_unix_50.html -#re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") -#re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") +# Cope with three widely used suffix naming convensions, +# Illumina: /1 or /2 +# Forward/revered: .f or .r +# Sanger, e.g. .p1k and .q1k +# See http://staden.sourceforge.net/manual/pregap4_unix_50.html +# re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$") +# re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$") re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$") assert re_suffix.search("demo.f") assert re_suffix.search("demo.s1") @@ -107,6 +105,7 @@ assert re_suffix.search("demo.q1") assert re_suffix.search("demo.q1lk") + def clean_name(name): """Remove suffix.""" match = re_suffix.search(name) @@ -117,6 +116,8 @@ else: # Nothing to do return name + + assert clean_name("foo/1") == "foo" assert clean_name("foo/2") == "foo" assert clean_name("bar.f") == "bar" @@ -124,20 +125,22 @@ assert clean_name("baz.p1") == "baz" assert clean_name("baz.q2") == "baz" -mapped_chars = { '>' :'__gt__', - '<' :'__lt__', - "'" :'__sq__', - '"' :'__dq__', - '[' :'__ob__', - ']' :'__cb__', - '{' :'__oc__', - '}' :'__cc__', - '@' : '__at__', - '\n' : '__cn__', - '\r' : '__cr__', - '\t' : '__tc__', - '#' : '__pd__' - } +mapped_chars = { + '>': '__gt__', + '<': '__lt__', + "'": '__sq__', + '"': '__dq__', + '[': '__ob__', + ']': '__cb__', + '{': '__oc__', + '}': '__cc__', + '@': '__at__', + '\n': '__cn__', + '\r': '__cr__', + '\t': '__tc__', + '#': '__pd__', +} + def load_mapping_ids(filename, pair_mode, ids): """Parse SAM/BAM file, updating given set of ids. @@ -189,7 +192,7 @@ if child.returncode: msg = "Error %i from 'samtools view %s'\n%s" % (child.returncode, filename, stderr) - sys_exit(msg.strip(), child.returncode) + sys.exit(msg.strip(), child.returncode) else: handle.close() @@ -204,12 +207,13 @@ if len(ids) < 10: print("Looking for %s" % ", ".join(sorted(ids))) + def crude_fasta_iterator(handle): """Yields tuples, record ID and the full record as a string.""" while True: line = handle.readline() if line == "": - return # Premature end of file, or just empty? + return # Premature end of file, or just empty? if line[0] == ">": break @@ -236,16 +240,16 @@ line = handle.readline() yield id, "".join(lines) if not line: - return # StopIteration + return # StopIteration def fasta_filter(in_file, pos_file, neg_file, wanted): """FASTA filter producing 60 character line wrapped outout.""" pos_count = neg_count = 0 - #Galaxy now requires Python 2.5+ so can use with statements, + # Galaxy now requires Python 2.5+ so can use with statements, with open(in_file) as in_handle: - #Doing the if statement outside the loop for speed - #(with the downside of three very similar loops). + # Doing the if statement outside the loop for speed + # (with the downside of three very similar loops). if pos_file is not None and neg_file is not None: print "Generating two FASTA files" with open(pos_file, "w") as pos_handle: @@ -284,14 +288,14 @@ from Bio.SeqIO.QualityIO import FastqGeneralIterator pos_count = neg_count = 0 handle = open(in_file, "r") - if out_positive_file is not None and out_negative_file is not None: + if pos_file is not None and neg_file is not None: print "Generating two FASTQ files" - positive_handle = open(out_positive_file, "w") - negative_handle = open(out_negative_file, "w") + positive_handle = open(pos_file, "w") + negative_handle = open(neg_file, "w") print in_file for title, seq, qual in FastqGeneralIterator(handle): # print("%s --> %s" % (title, clean_name(title.split(None, 1)[0]))) - if clean_name(title.split(None, 1)[0]) in ids: + if clean_name(title.split(None, 1)[0]) in wanted: positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) pos_count += 1 else: @@ -299,21 +303,21 @@ neg_count += 1 positive_handle.close() negative_handle.close() - elif out_positive_file is not None: + elif pos_file is not None: print "Generating matching FASTQ file" - positive_handle = open(out_positive_file, "w") + positive_handle = open(pos_file, "w") for title, seq, qual in FastqGeneralIterator(handle): - if clean_name(title.split(None, 1)[0]) in ids: + if clean_name(title.split(None, 1)[0]) in wanted: positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) pos_count += 1 else: neg_count += 1 positive_handle.close() - elif out_negative_file is not None: + elif neg_file is not None: print "Generating non-matching FASTQ file" - negative_handle = open(out_negative_file, "w") + negative_handle = open(neg_file, "w") for title, seq, qual in FastqGeneralIterator(handle): - if clean_name(title.split(None, 1)[0]) in ids: + if clean_name(title.split(None, 1)[0]) in wanted: pos_count += 1 else: negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) @@ -328,48 +332,48 @@ try: from Bio.SeqIO.SffIO import SffIterator, SffWriter except ImportError: - sys_exit("SFF filtering requires Biopython 1.54 or later") + sys.exit("SFF filtering requires Biopython 1.54 or later") try: from Bio.SeqIO.SffIO import ReadRocheXmlManifest except ImportError: - #Prior to Biopython 1.56 this was a private function + # Prior to Biopython 1.56 this was a private function from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest - in_handle = open(in_file, "rb") #must be binary mode! + in_handle = open(in_file, "rb") # must be binary mode! try: manifest = ReadRocheXmlManifest(in_handle) except ValueError: manifest = None - #This makes two passes though the SFF file with isn't so efficient, - #but this makes the code simple. + # This makes two passes though the SFF file with isn't so efficient, + # but this makes the code simple. pos_count = neg_count = 0 - if out_positive_file is not None: - out_handle = open(out_positive_file, "wb") + if pos_file is not None: + out_handle = open(pos_file, "wb") writer = SffWriter(out_handle, xml=manifest) - in_handle.seek(0) #start again after getting manifest - pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids) + in_handle.seek(0) # start again after getting manifest + pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted) out_handle.close() - if out_negative_file is not None: - out_handle = open(out_negative_file, "wb") + if neg_file is not None: + out_handle = open(neg_file, "wb") writer = SffWriter(out_handle, xml=manifest) - in_handle.seek(0) #start again - neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids) + in_handle.seek(0) # start again + neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted) out_handle.close() - #And we're done + # And we're done in_handle.close() return pos_count, neg_count -if seq_format.lower()=="sff": +if seq_format.lower() == "sff": sequence_filter = sff_filter -elif seq_format.lower()=="fasta": +elif seq_format.lower() == "fasta": sequence_filter = fasta_filter elif seq_format.lower().startswith("fastq"): sequence_filter = fastq_filter else: - sys_exit("Unsupported file type %r" % seq_format) + sys.exit("Unsupported file type %r" % seq_format) pos_count, neg_count = sequence_filter(in_file, out_positive_file, out_negative_file, ids) print("%i mapped and %i unmapped reads." % (pos_count, neg_count)) diff -r 8ff0ac66f1a3 -r 48e71dfd51b3 tools/seq_filter_by_mapping/seq_filter_by_mapping.xml --- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml Wed May 13 11:08:58 2015 -0400 +++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml Wed May 10 13:16:44 2017 -0400 @@ -1,28 +1,23 @@ - + from SAM/BAM file - biopython - Bio - samtools + biopython samtools - - - - - - seq_filter_by_mapping.py --version - -seq_filter_by_mapping.py -i "$input_file" -f "$input_file.ext" -m $pair_mode + +python $__tool_directory__/seq_filter_by_mapping.py --version + + +python $__tool_directory__/seq_filter_by_mapping.py -i '$input_file' -f '$input_file.ext' -m $pair_mode #if $output_choice_cond.output_choice=="both" - -p $output_pos -n $output_neg + -p '$output_pos' -n '$output_neg' #elif $output_choice_cond.output_choice=="pos" - -p $output_pos + -p '$output_pos' #elif $output_choice_cond.output_choice=="neg" - -n $output_neg + -n '$output_neg' #end if ## Now loop over all the mapping files -#for i in $mapping_file#${i} #end for# +#for i in $mapping_file#'${i}' #end for# diff -r 8ff0ac66f1a3 -r 48e71dfd51b3 tools/seq_filter_by_mapping/tool_dependencies.xml --- a/tools/seq_filter_by_mapping/tool_dependencies.xml Wed May 13 11:08:58 2015 -0400 +++ b/tools/seq_filter_by_mapping/tool_dependencies.xml Wed May 10 13:16:44 2017 -0400 @@ -1,9 +1,9 @@ - - + + - +