Mercurial > repos > nick > sequence_content_trimmer

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/getreads.py	Tue Dec 01 21:33:27 2015 -0500
@@ -0,0 +1,156 @@
+"""A simple parser for FASTA, FASTQ, SAM, etc. Create generators that just return the read name and
+sequence.
+All format parsers follow this API:
+  with open('sequence.fasta') as fasta:
+    for read in getreads.getparser(fasta, filetype='fasta'):
+      print "There is a sequence with this FASTA identifier: "+read.id
+      print "Its sequence is "+read.seq
+The properties of Read are:
+  name: The entire FASTA header line, SAM column 1, etc.
+  id:   The first whitespace-delimited part of the name.
+  seq:  The sequence.
+  qual: The quality scores (unless the format is FASTA).
+"""
+
+
+def getparser(filehandle, filetype='fasta'):
+  if filetype == 'fasta':
+    return FastaReader(filehandle)
+  elif filetype == 'fastq':
+    return FastqReader(filehandle)
+  elif filetype == 'sam':
+    return SamReader(filehandle)
+  elif filetype == 'tsv':
+    return TsvReader(filehandle)
+  else:
+    raise ValueError('Illegal argument: filetype=\''+filetype+'\'')
+
+
+class FormatError(Exception):
+  def __init__(self, message=None):
+    if message:
+      Exception.__init__(self, message)
+
+
+class Read(object):
+  def __init__(self, name='', seq='', id_='', qual=''):
+    self.name = name
+    self.seq = seq
+    self.id = id_
+    self.qual = qual
+
+
+class Reader(object):
+  """Base class for all other parsers."""
+  def __init__(self, filehandle):
+    self.filehandle = filehandle
+  def __iter__(self):
+    return self.parser()
+
+
+class TsvReader(Reader):
+  """A parser for a simple tab-delimited format.
+  Column 1: name
+  Column 2: sequence
+  Column 3: quality scores (optional)"""
+  def parser(self):
+    for line in self.filehandle:
+      fields = line.rstrip('\r\n').split('\t')
+      if len(fields) < 2:
+        continue
+      read = Read()
+      read.name = fields[0]
+      if read.name:
+        read.id = read.name.split()[0]
+      read.seq = fields[1]
+      if len(fields) >= 3:
+        read.qual = fields[2]
+      yield read
+
+
+class SamReader(Reader):
+  """A simple SAM parser.
+  Assumptions:
+  Lines starting with "@" with 3 fields are headers. All others are alignments.
+  All alignment lines have 11 or more fields. Other lines will be skipped.
+  """
+  def parser(self):
+    for line in self.filehandle:
+      fields = line.split('\t')
+      if len(fields) < 11:
+        continue
+      # Skip headers.
+      if fields[0].startswith('@') and len(fields[0]) == 3:
+        continue
+      read = Read()
+      read.name = fields[0]
+      if read.name:
+        read.id = read.name.split()[0]
+      read.seq = fields[9]
+      read.qual = fields[10].rstrip('\r\n')
+      yield read
+
+
+class FastaReader(Reader):
+  """A simple FASTA parser that reads one sequence at a time into memory."""
+  def parser(self):
+    read = Read()
+    while True:
+      line_raw = self.filehandle.readline()
+      if not line_raw:
+        if read.seq:
+          yield read
+        raise StopIteration
+      line = line_raw.strip()
+      # Allow empty lines.
+      if not line:
+        continue
+      if line.startswith('>'):
+        if read.seq:
+          yield read
+        read = Read()
+        read.name = line[1:]  # remove ">"
+        if read.name:
+          read.id = read.name.split()[0]
+        continue
+      else:
+        read.seq += line
+
+
+class FastqReader(Reader):
+  """A simple FASTQ parser. Can handle multi-line sequences, though."""
+  def parser(self):
+    read = Read()
+    state = 'header'
+    while True:
+      line_raw = self.filehandle.readline()
+      if not line_raw:
+        if read.seq:
+          yield read
+        raise StopIteration
+      line = line_raw.strip()
+      # Allow empty lines.
+      if not line:
+        continue
+      if state == 'header':
+        if not line.startswith('@'):
+          raise FormatError('line state = "header" but line does not start with "@"')
+        if read.seq:
+          yield read
+        read = Read()
+        read.name = line[1:]  # remove '@'
+        if read.name:
+          read.id = read.name.split()[0]
+        state = 'sequence'
+      elif state == 'sequence':
+        if line.startswith('+'):
+          state = 'plus'
+        else:
+          read.seq += line
+      elif state == 'plus' or state == 'quality':
+        state = 'quality'
+        togo = len(read.seq) - len(read.qual)
+        read.qual += line[:togo]
+        # The end of the quality lines is when we have a quality string as long as the sequence.
+        if len(read.qual) >= len(read.seq):
+          state = 'header'
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trimmer.py	Tue Dec 01 21:33:27 2015 -0500
@@ -0,0 +1,220 @@
+#!/usr/bin/env python
+from __future__ import division
+import sys
+import argparse
+import getreads
+
+OPT_DEFAULTS = {'win_len':1, 'thres':1.0, 'filt_bases':'N'}
+USAGE = "%(prog)s [options] [input_1.fq [input_2.fq output_1.fq output_2.fq]]"
+DESCRIPTION = """Trim the 5' ends of reads by sequence content, e.g. by GC content or presence of
+N's."""
+
+
+def main(argv):
+
+  parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE)
+  parser.set_defaults(**OPT_DEFAULTS)
+
+  parser.add_argument('infile1', metavar='reads_1.fq', nargs='?', type=argparse.FileType('r'),
+    default=sys.stdin,
+    help='Input reads (mate 1). Omit to read from stdin.')
+  parser.add_argument('infile2', metavar='reads_2.fq', nargs='?', type=argparse.FileType('r'),
+    help='Input reads (mate 2). If given, it will preserve pairs (if one read is filtered out '
+         'entirely, the other will also be lost).')
+  parser.add_argument('outfile1', metavar='reads.filt_1.fq', nargs='?', type=argparse.FileType('w'),
+    default=sys.stdout,
+    help='Output file for mate 1. WARNING: Will overwrite.')
+  parser.add_argument('outfile2', metavar='reads.filt_2.fq', nargs='?', type=argparse.FileType('w'),
+    help='Output file for mate 2. WARNING: Will overwrite.')
+  parser.add_argument('-f', '--format', dest='filetype', choices=('fasta', 'fastq'),
+    help='Input read format.')
+  parser.add_argument('-F', '--out-format', dest='out_filetype', choices=('fasta', 'fastq'),
+    help='Output read format. Default: whatever the input format is.')
+  parser.add_argument('-b', '--filt-bases',
+    help='The bases to filter on. Case-insensitive. Default: %(default)s.')
+  parser.add_argument('-t', '--thres', type=float,
+    help='The threshold. The read will be trimmed once the proportion of filter bases in the '
+         'window exceed this fraction (not a percentage). Default: %(default)s.')
+  parser.add_argument('-w', '--window', dest='win_len', type=int,
+    help='Window size for trimming. Default: %(default)s.')
+  parser.add_argument('-i', '--invert', action='store_true',
+    help='Invert the filter bases: filter on bases NOT present in the --filt-bases.')
+  parser.add_argument('-m', '--min-length', type=int,
+    help='Set a minimum read length. Reads which are trimmed below this length will be filtered '
+         'out (omitted entirely from the output). Read pairs will be preserved: both reads in a '
+         'pair must exceed this length to be kept. Set to 0 to only omit empty reads.')
+  parser.add_argument('--error',
+    help='Fail with this error message (useful for Galaxy tool).')
+  parser.add_argument('-A', '--acgt', action='store_true',
+    help='Filter on any non-ACGT base (shortcut for "--invert --filt-bases ACGT").')
+  parser.add_argument('-I', '--iupac', action='store_true',
+    help='Filter on any non-IUPAC base (shortcut for "--invert --filt-bases ACGTUWSMKRYBDHVN-").')
+
+  args = parser.parse_args(argv[1:])
+
+  if args.error:
+    fail('Error: '+args.error)
+
+  # Catch invalid argument combinations.
+  if args.infile1 and args.infile2 and not (args.outfile1 and args.outfile2):
+    fail('Error: If giving two input files (paired end), must specify both output files.')
+  # Determine filetypes, open input file parsers.
+  filetype1 = get_filetype(args.infile1, args.filetype)
+  file1_parser = iter(getreads.getparser(args.infile1, filetype=filetype1))
+  if args.infile2:
+    paired = True
+    filetype2 = get_filetype(args.infile2, args.filetype)
+    file2_parser = iter(getreads.getparser(args.infile2, filetype=filetype2))
+  else:
+    filetype2 = None
+    file2_parser = None
+    paired = False
+  # Override output filetypes if it was specified on the command line.
+  if args.out_filetype:
+    filetype1 = args.out_filetype
+    filetype2 = args.out_filetype
+
+  # Determine the filter bases and whether to invert the selection.
+  filt_bases = args.filt_bases
+  invert = args.invert
+  if args.acgt:
+    filt_bases = 'ACGT'
+    invert = True
+  elif args.iupac:
+    filt_bases = 'ACGTUWSMKRYBDHVN-'
+    invert = True
+
+  # Do the actual trimming.
+  try:
+    trim_reads(file1_parser, file2_parser, args.outfile1, args.outfile2, filetype1, filetype2,
+               paired, args.win_len, args.thres, filt_bases, invert, args.min_length)
+  finally:
+    for filehandle in (args.infile1, args.infile2, args.outfile1, args.outfile2):
+      if filehandle and filehandle is not sys.stdin and filehandle is not sys.stdout:
+        filehandle.close()
+
+
+def trim_reads(file1_parser, file2_parser, outfile1, outfile2, filetype1, filetype2, paired,
+               win_len, thres, filt_bases, invert, min_length):
+  """Trim all the reads in the input file(s), writing to the output file(s)."""
+  read1 = None
+  read2 = None
+  while True:
+    # Read in the reads.
+    try:
+      read1 = next(file1_parser)
+      if paired:
+        read2 = next(file2_parser)
+    except StopIteration:
+      break
+    # Do trimming.
+    read1.seq = trim_read(read1.seq, win_len, thres, filt_bases, invert)
+    if filetype1 == 'fastq':
+      # If the output filetype is FASTQ, trim the quality scores too.
+      # If there are no input quality scores (i.e. the input is FASTA), use dummy scores instead.
+      # "z" is the highest alphanumeric score (PHRED 89), higher than any expected real score.
+      qual1 = read1.qual or 'z' * len(read1.seq)
+      read1.qual = qual1[:len(read1.seq)]
+    if paired:
+      read2.seq = trim_read(read2.seq, win_len, thres, filt_bases, invert)
+      if filetype2 == 'fastq':
+        qual2 = read2.qual or 'z' * len(read2.seq)
+        read2.qual = qual2[:len(read2.seq)]
+      # Output reads if they both pass the minimum length threshold (if any was given).
+      if min_length is None or (len(read1.seq) >= min_length and len(read2.seq) >= min_length):
+        write_read(outfile1, read1, filetype1)
+        write_read(outfile2, read2, filetype2)
+    else:
+      # Output read if it passes the minimum length threshold (if any was given).
+      if min_length is None or len(read1.seq) >= min_length:
+        write_read(outfile1, read1, filetype1)
+
+
+def get_filetype(infile, filetype_arg):
+  if infile is sys.stdin:
+    if filetype_arg:
+      filetype = filetype_arg
+    else:
+      fail('Error: You must specify the --format if reading from stdin.')
+  elif infile:
+    if filetype_arg:
+      filetype = filetype_arg
+    else:
+      if infile.name.endswith('.fa') or infile.name.endswith('.fasta'):
+        filetype = 'fasta'
+      elif infile.name.endswith('.fq') or infile.name.endswith('.fastq'):
+        filetype = 'fastq'
+      else:
+        fail('Error: Unrecognized file ending on "{}". Please specify the --format.'.format(infile))
+  else:
+    fail('Error: infile is {}'.format(infile))
+  return filetype
+
+
+def write_read(filehandle, read, filetype):
+  if filetype == 'fasta':
+    filehandle.write('>{name}\n{seq}\n'.format(**vars(read)))
+  elif filetype == 'fastq':
+    filehandle.write('@{name}\n{seq}\n+\n{qual}\n'.format(**vars(read)))
+
+
+def trim_read(seq, win_len, thres, filt_bases, invert):
+  """Trim an individual read and return its trimmed sequence.
+  This will track the frequency of bad bases in a window of length win_len, and trim once the
+  frequency goes below thres. The trim point will be just before the first (leftmost) bad base in
+  the window (the first window with a frequency below thres). The "bad" bases are the ones in
+  filt_bases if invert is False, or any base NOT in filt_bases if invert is True."""
+  # Algorithm:
+  # The window is a list which acts as a FIFO. As we scan from the left (3') end to the right (5')
+  # end, we append new bases to the right end of the window and pop them from the left end.
+  # Each base is only examined twice: when it enters the window and when it leaves it.
+  # We keep a running total of the number of bad bases in bad_bases_count, incrementing it when bad
+  # bases enter the window and decrementing it when they leave.
+  # We also track the location of bad bases in the window with bad_bases_coords so we can figure out
+  # where to cut if we have to trim.
+  max_bad_bases = win_len * thres
+  window = []
+  bad_bases_count = 0
+  bad_bases_coords = []
+  for coord, base in enumerate(seq.upper()):
+    # Shift window, adjust bad_bases_count and bad_bases_coords list.
+    window.append(base)
+    # Is the new base we're adding to the window a bad base?
+    if invert:
+      bad_base = base not in filt_bases
+    else:
+      bad_base = base in filt_bases
+    # If so, increment the total and add its coordinate to the window.
+    if bad_base:
+      bad_bases_count += 1
+      bad_bases_coords.append(coord)
+    if len(window) > win_len:
+      first_base = window.pop(0)
+      # Is the base we're removing (the first base in the window) a bad base?
+      if invert:
+        bad_base = first_base not in filt_bases
+      else:
+        bad_base = first_base in filt_bases
+      # If so, decrement the total and remove its coordinate from the window.
+      if bad_base:
+        bad_bases_count -= 1
+        bad_bases_coords.pop(0)
+    # print bad_bases_coords
+    # Are we over the threshold?
+    if bad_bases_count > max_bad_bases:
+      break
+  # If we exceeded the threshold, trim the sequence at the first (leftmost) bad base in the window.
+  if bad_bases_count > max_bad_bases:
+    first_bad_base = bad_bases_coords[0]
+    return seq[0:first_bad_base]
+  else:
+    return seq
+
+
+def fail(message):
+  sys.stderr.write(message+"\n")
+  sys.exit(1)
+
+
+if __name__ == '__main__':
+  sys.exit(main(sys.argv))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trimmer.xml	Tue Dec 01 21:33:27 2015 -0500
@@ -0,0 +1,84 @@
+<tool id="sequence_content_trimmer" version="0.1" name="Sequence Content Trimmer">
+  <description>trim reads based on certain bases</description>
+  <command interpreter="python">
+  trimmer.py $input1
+  #if $paired.is_paired:
+    $input2 $output1 $output2
+    #if ('fasta' in $input1.extension and 'fastq' in $input2.extension) or ('fastq' in $input1.extension and 'fasta' in $input2.extension)
+      --error 'Both input files must be either fastq or fasta (no mixing the two).'
+    #end if
+  #end if
+  #if $input1.extension == 'fastq' or $input1.extension == 'fastqsanger' or $input1.extension == 'fastqillumina' or $input1.extension == 'fastqsolexa'
+    -f fastq
+  #elif $input1.extension == 'fasta'
+    -f fasta
+  #else
+    -f $input1.extension
+  #end if
+  -b $bases -t $thres -w $win_len $invert
+  #if $min_len.has_min_len:
+    -m $min_len.value
+  #end if
+  #if not $paired.is_paired:
+    &gt; $output1
+  #end if
+  </command>
+  <inputs>
+    <conditional name="paired">
+      <param name="is_paired" type="select" label="Paired reads?">
+        <option value="" selected="True">Unpaired</option>
+        <option value="true">Paired</option>
+      </param>
+      <when value="true">
+        <param name="input1" type="data" format="fasta,fastq" label="Input reads (mate 1)"/>
+        <param name="input2" type="data" format="fasta,fastq" label="Input reads (mate 2)"/>
+      </when>
+      <when value="">
+        <param name="input1" type="data" format="fasta,fastq" label="Input reads"/>
+      </when>
+    </conditional>
+    <param name="bases" type="text" value="N" label="Bases to filter on"/>
+    <param name="thres" type="float" value="0.5" min="0" max="1" label="Frequency threshold" help="Trim when the frequency of filter bases (or non-filter bases, if inverting) exceeds this value."/>
+    <param name="win_len" type="integer" value="10" min="1" label="Size of the window"/>
+    <param name="invert" type="boolean" truevalue="--invert" falsevalue="" checked="False" label="Invert filter bases" help="Trim when the frequency of bases NOT in the &quot;filter bases&quot; list exceeds the threshold."/>
+    <conditional name="min_len">
+      <param name="has_min_len" type="boolean" truevalue="true" falsevalue="" checked="False" label="Set a minimum read length"/>
+      <when value="true">
+        <param name="value" type="integer" value="10" min="0" label="Minimum read length" help="Reads trimmed to less than this length will be omitted from the output. Pairs will be preserved: both must exceed this threshold to be kept."/>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data name="output1" format_source="input1"/>
+    <data name="output2" format_source="input2">
+      <filter>paired['is_paired']</filter>
+    </data>
+  </outputs>
+
+  <help>
+
+.. class:: infomark
+
+**What it does**
+
+This tool trims the 3' ends of reads based on the presence of the given bases. For instance, trim when N's are encountered or when the GC content exceeds a certain frequency.
+
+
+.. class:: infomark
+
+**How it works**
+
+This will slide along the read with a window, and trim once the frequency of filter bases exceeds the frequency threshold (unless "Invert filter bases" is enabled, when it will trim once non-filter bases exceed the threshold).
+
+The trim point will be just before the first (leftmost) filter base in the final window (the one where the frequency exceeded the threshold).
+
+
+.. class:: infomark
+
+**Input**
+
+The inputs can be in the following formats: fasta, fastq, fastqsanger, fastqillumina, and fastqsolexa. Both must be either a fasta or fastq type (no mixing fastq and fasta).
+
+  </help>
+
+</tool>