seq_filter_by_mapping: tools/seq_filter_by_mapping/seq_filter_by

comparison tools/seq_filter_by_mapping/seq_filter_by_mapping.py @ 2:48e71dfd51b3 draft

v0.0.5 Depend on Biopython 1.67 from Tool Shed or (Bio)conda

author	peterjc
date	Wed, 10 May 2017 13:16:44 -0400
parents	8ff0ac66f1a3
children	481b0a925e66

comparison

equal deleted inserted replaced

-:8ff0ac66f1a3
+:48e71dfd51b3
 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
 See accompanying text file for licence details (MIT license).
 Use -v or --version to get the version, -h or --help for help.
 """
 import os
-import sys
 import re
 import subprocess
+import sys
 from optparse import OptionParser
-def sys_exit(msg, err=1):
+# Parse Command Line
-sys.stderr.write(msg.rstrip() + "\n")
-sys.exit(err)
-#Parse Command Line
 usage = """Use as follows:
 $ python seq_filter_by_mapping.py [options] mapping.sam/bam [more mappings]
 e.g. Positive matches using column one from a single BAM file:
 help="Show version and quit")
 options, args = parser.parse_args()
 if options.version:
-print "v0.0.3"
+print "v0.0.5"
 sys.exit(0)
 in_file = options.input
 seq_format = options.format
 out_positive_file = options.output_positive
 out_negative_file = options.output_negative
 pair_mode = options.pair_mode
 if in_file is None or not os.path.isfile(in_file):
-sys_exit("Missing input file: %r" % in_file)
+sys.exit("Missing input file: %r" % in_file)
 if out_positive_file is None and out_negative_file is None:
-sys_exit("Neither output file requested")
+sys.exit("Neither output file requested")
 if seq_format is None:
-sys_exit("Missing sequence format")
+sys.exit("Missing sequence format")
 if pair_mode not in ["lax", "strict"]:
-sys_exit("Pair mode argument should be 'lax' or 'strict', not %r" % pair_mode)
+sys.exit("Pair mode argument should be 'lax' or 'strict', not %r" % pair_mode)
 for mapping in args:
 if not os.path.isfile(mapping):
-sys_exit("Mapping file %r not found" % mapping)
+sys.exit("Mapping file %r not found" % mapping)
 if not args:
-sys_exit("At least one SAM/BAM mapping file is required")
+sys.exit("At least one SAM/BAM mapping file is required")
-#Cope with three widely used suffix naming convensions,
+# Cope with three widely used suffix naming conventions,
-#Illumina: /1 or /2
+# Illumina: /1 or /2
-#Forward/revered: .f or .r
+# Forward/reverse: .f or .r
-#Sanger, e.g. .p1k and .q1k
+# Sanger, e.g. .p1k and .q1k
-#See http://staden.sourceforge.net/manual/pregap4_unix_50.html
+# See http://staden.sourceforge.net/manual/pregap4_unix_50.html
-#re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
+# re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
-#re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
+# re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
 re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$")
 assert re_suffix.search("demo.f")
 assert re_suffix.search("demo.s1")
 assert re_suffix.search("demo.f1k")
 assert re_suffix.search("demo.p1")
 assert re_suffix.search("demo/2")
 assert re_suffix.search("demo.r")
 assert re_suffix.search("demo.q1")
 assert re_suffix.search("demo.q1lk")
 def clean_name(name):
 """Return name with any trailing read suffix matched by re_suffix removed."""
 match = re_suffix.search(name)
 if match:
 # re_suffix ends with "$", so the match always sits at the very end of
 # the name; everything before match.start() is the name without suffix.
 return name[:match.start()]
 else:
 # No recognised suffix (e.g. /1, .f, .q1k), return the name unchanged
 return name
 assert clean_name("foo/1") == "foo"
 assert clean_name("foo/2") == "foo"
 assert clean_name("bar.f") == "bar"
 assert clean_name("bar.r") == "bar"
 assert clean_name("baz.p1") == "baz"
 assert clean_name("baz.q2") == "baz"
-mapped_chars = { '>' :'__gt__',
+mapped_chars = {
-'<' :'__lt__',
+'>': '__gt__',
-"'" :'__sq__',
+'<': '__lt__',
-'"' :'__dq__',
+"'": '__sq__',
-'[' :'__ob__',
+'"': '__dq__',
-']' :'__cb__',
+'[': '__ob__',
-'{' :'__oc__',
+']': '__cb__',
-'}' :'__cc__',
+'{': '__oc__',
-'@' : '__at__',
+'}': '__cc__',
-'\n' : '__cn__',
+'@': '__at__',
-'\r' : '__cr__',
+'\n': '__cn__',
-'\t' : '__tc__',
+'\r': '__cr__',
-'#' : '__pd__'
+'\t': '__tc__',
-}
+'#': '__pd__',
+}
 def load_mapping_ids(filename, pair_mode, ids):
 """Parse SAM/BAM file, updating given set of ids.
 Parses BAM files via call out to samtools view command.
 stdout, stderr = child.communicate()
 assert child.returncode is not None
 if child.returncode:
 msg = "Error %i from 'samtools view %s'\n%s" % (child.returncode,
 filename, stderr)
-sys_exit(msg.strip(), child.returncode)
+sys.exit(msg.strip(), child.returncode)
 else:
 handle.close()
 # Read mapping file(s) and record all mapped identifiers
 # more than just qname (need /1 or /2 indicator)
 print("Loaded %i mapped IDs" % (len(ids)))
 if len(ids) < 10:
 print("Looking for %s" % ", ".join(sorted(ids)))
 def crude_fasta_iterator(handle):
 """Yields tuples, record ID and the full record as a string."""
 while True:
 line = handle.readline()
 if line == "":
-return # Premature end of file, or just empty?
+return  # Premature end of file, or just empty?
 if line[0] == ">":
 break
 no_id_warned = False
 while True:
 break
 lines.append(line)
 line = handle.readline()
 yield id, "".join(lines)
 if not line:
-return # StopIteration
+return  # StopIteration
 def fasta_filter(in_file, pos_file, neg_file, wanted):
 """FASTA filter producing 60 character line wrapped outout."""
 pos_count = neg_count = 0
-#Galaxy now requires Python 2.5+ so can use with statements,
+# Galaxy now requires Python 2.5+ so can use with statements,
 with open(in_file) as in_handle:
-#Doing the if statement outside the loop for speed
+# Doing the if statement outside the loop for speed
-#(with the downside of three very similar loops).
+# (with the downside of three very similar loops).
 if pos_file is not None and neg_file is not None:
 print "Generating two FASTA files"
 with open(pos_file, "w") as pos_handle:
 with open(neg_file, "w") as neg_handle:
 for identifier, record in crude_fasta_iterator(in_handle):
 def fastq_filter(in_file, pos_file, neg_file, wanted):
 """FASTQ filter."""
 from Bio.SeqIO.QualityIO import FastqGeneralIterator
 pos_count = neg_count = 0
 handle = open(in_file, "r")
-if out_positive_file is not None and out_negative_file is not None:
+if pos_file is not None and neg_file is not None:
 print "Generating two FASTQ files"
-positive_handle = open(out_positive_file, "w")
+positive_handle = open(pos_file, "w")
-negative_handle = open(out_negative_file, "w")
+negative_handle = open(neg_file, "w")
 print in_file
 for title, seq, qual in FastqGeneralIterator(handle):
 # print("%s --> %s" % (title, clean_name(title.split(None, 1)[0])))
-if clean_name(title.split(None, 1)[0]) in ids:
+if clean_name(title.split(None, 1)[0]) in wanted:
 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
 pos_count += 1
 else:
 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
 neg_count += 1
 positive_handle.close()
 negative_handle.close()
-elif out_positive_file is not None:
+elif pos_file is not None:
 print "Generating matching FASTQ file"
-positive_handle = open(out_positive_file, "w")
+positive_handle = open(pos_file, "w")
 for title, seq, qual in FastqGeneralIterator(handle):
-if clean_name(title.split(None, 1)[0]) in ids:
+if clean_name(title.split(None, 1)[0]) in wanted:
 positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
 pos_count += 1
 else:
 neg_count += 1
 positive_handle.close()
-elif out_negative_file is not None:
+elif neg_file is not None:
 print "Generating non-matching FASTQ file"
-negative_handle = open(out_negative_file, "w")
+negative_handle = open(neg_file, "w")
 for title, seq, qual in FastqGeneralIterator(handle):
-if clean_name(title.split(None, 1)[0]) in ids:
+if clean_name(title.split(None, 1)[0]) in wanted:
 pos_count += 1
 else:
 negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
 neg_count += 1
 negative_handle.close()
 def sff_filter(in_file, pos_file, neg_file, wanted):
 """SFF filter."""
 try:
 from Bio.SeqIO.SffIO import SffIterator, SffWriter
 except ImportError:
-sys_exit("SFF filtering requires Biopython 1.54 or later")
+sys.exit("SFF filtering requires Biopython 1.54 or later")
 try:
 from Bio.SeqIO.SffIO import ReadRocheXmlManifest
 except ImportError:
-#Prior to Biopython 1.56 this was a private function
+# Prior to Biopython 1.56 this was a private function
 from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
-in_handle = open(in_file, "rb") #must be binary mode!
+in_handle = open(in_file, "rb")  # must be binary mode!
 try:
 manifest = ReadRocheXmlManifest(in_handle)
 except ValueError:
 manifest = None
-#This makes two passes though the SFF file with isn't so efficient,
+# This makes two passes through the SFF file, which isn't so efficient,
-#but this makes the code simple.
+# but this makes the code simple.
 pos_count = neg_count = 0
-if out_positive_file is not None:
+if pos_file is not None:
-out_handle = open(out_positive_file, "wb")
+out_handle = open(pos_file, "wb")
 writer = SffWriter(out_handle, xml=manifest)
-in_handle.seek(0) #start again after getting manifest
+in_handle.seek(0)  # start again after getting manifest
-pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids)
+pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted)
 out_handle.close()
-if out_negative_file is not None:
+if neg_file is not None:
-out_handle = open(out_negative_file, "wb")
+out_handle = open(neg_file, "wb")
 writer = SffWriter(out_handle, xml=manifest)
-in_handle.seek(0) #start again
+in_handle.seek(0)  # start again
-neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids)
+neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted)
 out_handle.close()
-#And we're done
+# And we're done
 in_handle.close()
 return pos_count, neg_count
-if seq_format.lower()=="sff":
+if seq_format.lower() == "sff":
 sequence_filter = sff_filter
-elif seq_format.lower()=="fasta":
+elif seq_format.lower() == "fasta":
 sequence_filter = fasta_filter
 elif seq_format.lower().startswith("fastq"):
 sequence_filter = fastq_filter
 else:
-sys_exit("Unsupported file type %r" % seq_format)
+sys.exit("Unsupported file type %r" % seq_format)
 pos_count, neg_count = sequence_filter(in_file, out_positive_file, out_negative_file, ids)
 print("%i mapped and %i unmapped reads." % (pos_count, neg_count))
 fraction = float(pos_count) * 100.0 / float(pos_count + neg_count)
 print("In total %i reads, of which %0.1f%% mapped." % (pos_count + neg_count, fraction))

Mercurial > repos > peterjc > seq_filter_by_mapping

comparison tools/seq_filter_by_mapping/seq_filter_by_mapping.py @ 2:48e71dfd51b3 draft