Repository: nick / sequence_content_trimmer
changeset 1:464aee13e2df draft default tip
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
author:    nick
date:      Fri, 27 May 2022 23:29:45 +0000
parents:   7f170cb06e2e
children:  (none)
files:     getreads.py trimmer.py trimmer.xml
diffstat:  3 files changed, 456 insertions(+), 153 deletions(-)
--- a/getreads.py  Tue Dec 01 21:33:27 2015 -0500
+++ b/getreads.py  Fri May 27 23:29:45 2022 +0000
@@ -1,3 +1,9 @@
+#!/usr/bin/env python3
+import argparse
+import logging
+import os
+import sys
+import types
 """A simple parser for FASTA, FASTQ, SAM, etc. Create generators that just return the read name and
 sequence.
 All format parsers follow this API:
@@ -12,18 +18,42 @@
   qual: The quality scores (unless the format is FASTA).
 """

+# Available formats.
+FORMATS = ('fasta', 'fastq', 'sam', 'tsv', 'lines')

-def getparser(filehandle, filetype='fasta'):
+QUAL_OFFSETS = {'sanger':33, 'solexa':64}
+
+
+def getparser(input, filetype, qual_format='sanger', name_col=1, seq_col=2, qual_col=3):
+  # Detect whether the input is an open file or a path.
+  # Return the appropriate reader.
   if filetype == 'fasta':
-    return FastaReader(filehandle)
+    return FastaReader(input)
   elif filetype == 'fastq':
-    return FastqReader(filehandle)
+    return FastqReader(input, qual_format=qual_format)
   elif filetype == 'sam':
-    return SamReader(filehandle)
+    return SamReader(input, qual_format=qual_format)
   elif filetype == 'tsv':
-    return TsvReader(filehandle)
+    return TsvReader(input, qual_format=qual_format,
+                     name_col=name_col, seq_col=seq_col, qual_col=qual_col)
+  elif filetype == 'lines':
+    return LineReader(input)
   else:
-    raise ValueError('Illegal argument: filetype=\''+filetype+'\'')
+    raise ValueError('Unrecognized format: {!r}'.format(filetype))
+
+
+def detect_input_type(obj):
+  """Is this an open filehandle, or is it a file path (string)?"""
+  try:
+    os.path.isfile(obj)
+    return 'path'
+  except TypeError:
+    if isinstance(obj, types.GeneratorType):
+      return 'generator'
+    elif hasattr(obj, 'read') and hasattr(obj, 'close'):
+      return 'file'
+    else:
+      return None


 class FormatError(Exception):
@@ -33,19 +63,79 @@

 class Read(object):
-  def __init__(self, name='', seq='', id_='', qual=''):
+  def __init__(self, name='', seq='', id_='', qual='', qual_format='sanger'):
     self.name = name
     self.seq = seq
-    self.id = id_
     self.qual = qual
+    if id_ or not self.name:
+      self.id = id_
+    elif self.name:
+      self.id = self.name.split()[0]
+    self.offset = QUAL_OFFSETS[qual_format]
+  @property
+  def scores(self):
+    if self.qual is None:
+      return None
+    scores = []
+    for qual_char in self.qual:
+      scores.append(ord(qual_char) - self.offset)
+    return scores
+  def to_fasta(self):
+    return f'>{self.name}\n{self.seq}'
+  def to_fastq(self):
+    return f'@{self.name}\n{self.seq}\n+\n{self.qual}'
+  def __str__(self):
+    if self.qual:
+      return self.to_fastq()
+    else:
+      return self.to_fasta()
+  def __repr__(self):
+    kwarg_strs = []
+    for kwarg in 'name', 'seq', 'id_', 'qual':
+      attr = kwarg.rstrip('_')
+      raw_value = getattr(self, attr)
+      if raw_value is not None and len(raw_value) >= 200:
+        value = raw_value[:199]+'…'
+      else:
+        value = raw_value
+      if value:
+        kwarg_strs.append(f'{kwarg}={value!r}')
+    return type(self).__name__+'('+', '.join(kwarg_strs)+')'


 class Reader(object):
   """Base class for all other parsers."""
-  def __init__(self, filehandle):
-    self.filehandle = filehandle
+  def __init__(self, input, **kwargs):
+    self.input = input
+    self.input_type = detect_input_type(input)
+    if self.input_type not in ('path', 'file', 'generator'):
+      raise ValueError('Input object {!r} not a file, string, or generator.'.format(input))
+    for key, value in kwargs.items():
+      setattr(self, key, value)
   def __iter__(self):
     return self.parser()
+  def bases(self):
+    for read in self.parser():
+      for base in read.seq:
+        yield base
+  def get_input_iterator(self):
+    if self.input_type == 'path':
+      return open(self.input)
+    else:
+      return self.input
+
+
+class LineReader(Reader):
+  """A parser for the simplest format: Only the sequence, one line per read."""
+  def parser(self):
+    input_iterator = self.get_input_iterator()
+    try:
+      for line in input_iterator:
+        read = Read(seq=line.rstrip('\r\n'))
+        yield read
+    finally:
+      if self.input_type == 'path':
+        input_iterator.close()


 class TsvReader(Reader):
@@ -54,18 +144,26 @@
   Column 2: sequence
   Column 3: quality scores (optional)"""
   def parser(self):
-    for line in self.filehandle:
-      fields = line.rstrip('\r\n').split('\t')
-      if len(fields) < 2:
-        continue
-      read = Read()
-      read.name = fields[0]
-      if read.name:
-        read.id = read.name.split()[0]
-      read.seq = fields[1]
-      if len(fields) >= 3:
-        read.qual = fields[2]
-      yield read
+    min_fields = max(self.name_col, self.seq_col)
+    input_iterator = self.get_input_iterator()
+    try:
+      for line in input_iterator:
+        fields = line.rstrip('\r\n').split('\t')
+        if len(fields) < min_fields:
+          continue
+        read = Read(qual_format=self.qual_format)
+        read.name = fields[self.name_col-1]
+        if read.name:
+          read.id = read.name.split()[0]
+        read.seq = fields[self.seq_col-1]
+        try:
+          read.qual = fields[self.qual_col-1]
+        except (TypeError, IndexError):
+          pass
+        yield read
+    finally:
+      if self.input_type == 'path':
+        input_iterator.close()


 class SamReader(Reader):
@@ -75,82 +173,148 @@
   All alignment lines have 11 or more fields. Other lines will be skipped.
   """
   def parser(self):
-    for line in self.filehandle:
-      fields = line.split('\t')
-      if len(fields) < 11:
-        continue
-      # Skip headers.
-      if fields[0].startswith('@') and len(fields[0]) == 3:
-        continue
-      read = Read()
-      read.name = fields[0]
-      if read.name:
-        read.id = read.name.split()[0]
-      read.seq = fields[9]
-      read.qual = fields[10].rstrip('\r\n')
-      yield read
+    input_iterator = self.get_input_iterator()
+    try:
+      for line in input_iterator:
+        fields = line.split('\t')
+        if len(fields) < 11:
+          continue
+        # Skip headers.
+        if fields[0].startswith('@') and len(fields[0]) == 3:
+          continue
+        read = Read(qual_format=self.qual_format)
+        read.name = fields[0]
+        if read.name:
+          read.id = read.name.split()[0]
+        read.seq = fields[9]
+        read.qual = fields[10].rstrip('\r\n')
+        yield read
+    finally:
+      if self.input_type == 'path':
+        input_iterator.close()


 class FastaReader(Reader):
   """A simple FASTA parser that reads one sequence at a time into memory."""
   def parser(self):
-    read = Read()
-    while True:
-      line_raw = self.filehandle.readline()
-      if not line_raw:
-        if read.seq:
-          yield read
-        raise StopIteration
-      line = line_raw.strip()
-      # Allow empty lines.
-      if not line:
-        continue
-      if line.startswith('>'):
-        if read.seq:
-          yield read
-        read = Read()
-        read.name = line[1:]  # remove ">"
-        if read.name:
-          read.id = read.name.split()[0]
-        continue
-      else:
-        read.seq += line
+    input_iterator = self.get_input_iterator()
+    try:
+      read = None
+      while True:
+        try:
+          line_raw = next(input_iterator)
+        except StopIteration:
+          if read is not None:
+            yield read
+          return
+        line = line_raw.rstrip('\r\n')
+        if line.startswith('>'):
+          if read is not None:
+            yield read
+          read = Read()
+          read.name = line[1:]  # remove ">"
+          if read.name:
+            read.id = read.name.split()[0]
+          continue
+        else:
+          read.seq += line
+    finally:
+      if self.input_type == 'path':
+        input_iterator.close()


 class FastqReader(Reader):
   """A simple FASTQ parser. Can handle multi-line sequences, though."""
   def parser(self):
-    read = Read()
-    state = 'header'
-    while True:
-      line_raw = self.filehandle.readline()
-      if not line_raw:
-        if read.seq:
-          yield read
-        raise StopIteration
-      line = line_raw.strip()
-      # Allow empty lines.
-      if not line:
-        continue
-      if state == 'header':
-        if not line.startswith('@'):
-          raise FormatError('line state = "header" but line does not start with "@"')
-        if read.seq:
-          yield read
-        read = Read()
-        read.name = line[1:]  # remove '@'
-        if read.name:
-          read.id = read.name.split()[0]
-        state = 'sequence'
-      elif state == 'sequence':
-        if line.startswith('+'):
-          state = 'plus'
-        else:
-          read.seq += line
-      elif state == 'plus' or state == 'quality':
-        state = 'quality'
-        togo = len(read.seq) - len(read.qual)
-        read.qual += line[:togo]
-        # The end of the quality lines is when we have a quality string as long as the sequence.
-        if len(read.qual) >= len(read.seq):
-          state = 'header'
+    input_iterator = self.get_input_iterator()
+    try:
+      read = None
+      line_num = 0
+      state = 'header'
+      while True:
+        try:
+          line_raw = next(input_iterator)
+        except StopIteration:
+          if read is not None:
+            yield read
+          return
+        line_num += 1
+        line = line_raw.rstrip('\r\n')
+        if state == 'header':
+          if not line.startswith('@'):
+            if line:
+              raise FormatError('line state = "header" but line does not start with "@":\n'+line)
+            else:
+              # Allow empty lines.
+              continue
+          if read is not None:
+            yield read
+          read = Read(qual_format=self.qual_format)
+          read.name = line[1:]  # remove '@'
+          if read.name:
+            read.id = read.name.split()[0]
+          state = 'sequence'
+        elif state == 'sequence':
+          if line.startswith('+'):
+            state = 'plus'
+          else:
+            read.seq += line
+        elif state == 'plus' or state == 'quality':
+          if line.startswith('@') and state == 'quality':
+            logging.warning('Looking for more quality scores but line starts with "@". This might '
+                            'be a header line and there were fewer quality scores than bases: {}'
+                            .format(line[:69]))
+          state = 'quality'
+          togo = len(read.seq) - len(read.qual)
+          read.qual += line[:togo]
+          # The end of the quality lines is when we have a quality string as long as the sequence.
+          if len(read.qual) >= len(read.seq):
+            state = 'header'
+    finally:
+      if self.input_type == 'path':
+        input_iterator.close()
+
+
+DESCRIPTION = 'Test parser by parsing an input file and printing its contents.'
+
+
+def make_argparser():
+  parser = argparse.ArgumentParser(description=DESCRIPTION)
+  parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin,
+    help='Input reads.')
+  parser.add_argument('-f', '--format', choices=('fasta', 'fastq', 'sam', 'tsv', 'lines'),
+    help='Input read format. Will be detected from the filename, if given.')
+  return parser
+
+
+def main(argv):
+  parser = make_argparser()
+  args = parser.parse_args(argv[1:])
+  if args.format:
+    format = args.format
+  elif args.infile is sys.stdin:
+    fail('Error: Must give a --format if reading from stdin.')
+  else:
+    ext = os.path.splitext(args.infile.name)[1]
+    if ext == '.fq':
+      format = 'fastq'
+    elif ext == '.fa':
+      format = 'fasta'
    elif ext == '.txt':
+      format = 'lines'
+    else:
+      format = ext[1:]
+  print('Reading input as format {!r}.'.format(format))
+  for i, read in enumerate(getparser(args.infile, filetype=format)):
+    print('Read {} id/name: {!r}/{!r}'.format(i+1, read.id, read.name))
+    print('Read {} seq: {!r}'.format(i+1, read.seq))
+    print('Read {} qual: {!r}'.format(i+1, read.qual))
+
+
+def fail(message):
+  sys.stderr.write(message+"\n")
+  sys.exit(1)
+
+
+if __name__ == '__main__':
+  sys.exit(main(sys.argv))
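
For context, the reworked getreads API can be driven either with an open filehandle or with a plain file path, since the Reader base class now detects the input type. A minimal usage sketch (the file name is a placeholder, and this assumes getreads.py is on the Python path):

    #!/usr/bin/env python3
    # Sketch: iterate over reads with the revised getreads API.
    import getreads

    # getparser() accepts a path or an open filehandle; here a hypothetical FASTQ path.
    for read in getreads.getparser('reads.fq', filetype='fastq', qual_format='sanger'):
        print(read.id, len(read.seq))
        if read.qual:
            # Read.scores converts the quality string to numeric scores using the
            # 'sanger' offset (33) from QUAL_OFFSETS.
            print(read.scores)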
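
Because detect_input_type() also recognizes generators, a parser can be fed an in-memory stream of lines instead of a file, which is handy for quick tests. A sketch with made-up records (note it must be a real generator, not just any iterator, for the type check to pass):

    import getreads

    # Generator of FASTA-formatted lines standing in for a filehandle; records are invented.
    lines = (line for line in ['>read1\n', 'ACGTACGT\n', '>read2\n', 'NNNNACGT\n'])
    reader = getreads.getparser(lines, filetype='fasta')
    print([read.name for read in reader])  # expected: ['read1', 'read2']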
--- a/trimmer.py  Tue Dec 01 21:33:27 2015 -0500
+++ b/trimmer.py  Fri May 27 23:29:45 2022 +0000
@@ -1,59 +1,60 @@
-#!/usr/bin/env python
-from __future__ import division
+#!/usr/bin/env python3
 import sys
 import argparse
+import collections
 import getreads

-OPT_DEFAULTS = {'win_len':1, 'thres':1.0, 'filt_bases':'N'}
+QUANT_ORDER = 5
 USAGE = "%(prog)s [options] [input_1.fq [input_2.fq output_1.fq output_2.fq]]"
 DESCRIPTION = """Trim the 5' ends of reads by sequence content, e.g. by GC content or presence of
 N's."""


-def main(argv):
-
+def make_argparser():
   parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE)
-  parser.set_defaults(**OPT_DEFAULTS)
-
   parser.add_argument('infile1', metavar='reads_1.fq', nargs='?', type=argparse.FileType('r'),
     default=sys.stdin,
     help='Input reads (mate 1). Omit to read from stdin.')
   parser.add_argument('infile2', metavar='reads_2.fq', nargs='?', type=argparse.FileType('r'),
     help='Input reads (mate 2). If given, it will preserve pairs (if one read is filtered out '
          'entirely, the other will also be lost).')
-  parser.add_argument('outfile1', metavar='reads.filt_1.fq', nargs='?', type=argparse.FileType('w'),
+  parser.add_argument('outfile1', metavar='output_1.fq', nargs='?', type=argparse.FileType('w'),
     default=sys.stdout,
     help='Output file for mate 1. WARNING: Will overwrite.')
-  parser.add_argument('outfile2', metavar='reads.filt_2.fq', nargs='?', type=argparse.FileType('w'),
+  parser.add_argument('outfile2', metavar='output_2.fq', nargs='?', type=argparse.FileType('w'),
     help='Output file for mate 2. WARNING: Will overwrite.')
   parser.add_argument('-f', '--format', dest='filetype', choices=('fasta', 'fastq'),
     help='Input read format.')
   parser.add_argument('-F', '--out-format', dest='out_filetype', choices=('fasta', 'fastq'),
     help='Output read format. Default: whatever the input format is.')
-  parser.add_argument('-b', '--filt-bases',
+  parser.add_argument('-b', '--filt-bases', default='N',
     help='The bases to filter on. Case-insensitive. Default: %(default)s.')
-  parser.add_argument('-t', '--thres', type=float,
+  parser.add_argument('-t', '--thres', type=float, default=0.5,
    help='The threshold. The read will be trimmed once the proportion of filter bases in the '
         'window exceed this fraction (not a percentage). Default: %(default)s.')
-  parser.add_argument('-w', '--window', dest='win_len', type=int,
+  parser.add_argument('-w', '--window', dest='win_len', type=int, default=1,
    help='Window size for trimming. Default: %(default)s.')
   parser.add_argument('-i', '--invert', action='store_true',
    help='Invert the filter bases: filter on bases NOT present in the --filt-bases.')
-  parser.add_argument('-m', '--min-length', type=int,
+  parser.add_argument('-m', '--min-length', type=int, default=1,
    help='Set a minimum read length. Reads which are trimmed below this length will be filtered '
         'out (omitted entirely from the output). Read pairs will be preserved: both reads in a '
-         'pair must exceed this length to be kept. Set to 0 to only omit empty reads.')
-  parser.add_argument('--error',
-    help='Fail with this error message (useful for Galaxy tool).')
+         'pair must exceed this length to be kept. Set to 1 to only omit empty reads. '
+         'Default: %(default)s.')
   parser.add_argument('-A', '--acgt', action='store_true',
    help='Filter on any non-ACGT base (shortcut for "--invert --filt-bases ACGT").')
   parser.add_argument('-I', '--iupac', action='store_true',
    help='Filter on any non-IUPAC base (shortcut for "--invert --filt-bases ACGTUWSMKRYBDHVN-").')
-
-  args = parser.parse_args(argv[1:])
+  parser.add_argument('-q', '--quiet', action='store_true',
+    help='Don\'t print trimming stats on completion.')
+  parser.add_argument('-T', '--tsv', dest='stats_format', default='human',
+    action='store_const', const='tsv')
+  return parser

-  if args.error:
-    fail('Error: '+args.error)
+
+def main(argv):
+  parser = make_argparser()
+  args = parser.parse_args(argv[1:])

   # Catch invalid argument combinations.
   if args.infile1 and args.infile2 and not (args.outfile1 and args.outfile2):
@@ -85,18 +86,28 @@
     invert = True

   # Do the actual trimming.
+  filters = {'win_len':args.win_len, 'thres':args.thres, 'filt_bases':filt_bases, 'invert':invert,
+             'min_len':args.min_length}
   try:
-    trim_reads(file1_parser, file2_parser, args.outfile1, args.outfile2, filetype1, filetype2,
-               paired, args.win_len, args.thres, filt_bases, invert, args.min_length)
+    stats = trim_reads(file1_parser, file2_parser, args.outfile1, args.outfile2,
+                       filetype1, filetype2, paired, filters)
   finally:
     for filehandle in (args.infile1, args.infile2, args.outfile1, args.outfile2):
       if filehandle and filehandle is not sys.stdin and filehandle is not sys.stdout:
         filehandle.close()
+  if not args.quiet:
+    print_stats(stats, args.stats_format)
+

 def trim_reads(file1_parser, file2_parser, outfile1, outfile2, filetype1, filetype2, paired,
-               win_len, thres, filt_bases, invert, min_length):
+               filters):
   """Trim all the reads in the input file(s), writing to the output file(s)."""
+  min_len = filters['min_len']
+  trims1 = collections.Counter()
+  trims2 = collections.Counter()
+  omitted1 = collections.Counter()
+  omitted2 = collections.Counter()
   read1 = None
   read2 = None
   while True:
@@ -108,26 +119,49 @@
       except StopIteration:
         break
     # Do trimming.
-    read1.seq = trim_read(read1.seq, win_len, thres, filt_bases, invert)
-    if filetype1 == 'fastq':
-      # If the output filetype is FASTQ, trim the quality scores too.
-      # If there are no input quality scores (i.e. the input is FASTA), use dummy scores instead.
-      # "z" is the highest alphanumeric score (PHRED 89), higher than any expected real score.
-      qual1 = read1.qual or 'z' * len(read1.seq)
-      read1.qual = qual1[:len(read1.seq)]
+    read1, trim_len1 = trim_read(read1, filters, filetype1)
+    trims1[trim_len1] += 1
     if paired:
-      read2.seq = trim_read(read2.seq, win_len, thres, filt_bases, invert)
-      if filetype2 == 'fastq':
-        qual2 = read2.qual or 'z' * len(read2.seq)
-        read2.qual = qual2[:len(read2.seq)]
+      read2, trim_len2 = trim_read(read2, filters, filetype2)
+      trims2[trim_len2] += 1
       # Output reads if they both pass the minimum length threshold (if any was given).
-      if min_length is None or (len(read1.seq) >= min_length and len(read2.seq) >= min_length):
+      if min_len is None or (len(read1.seq) >= min_len and len(read2.seq) >= min_len):
         write_read(outfile1, read1, filetype1)
         write_read(outfile2, read2, filetype2)
+      else:
+        if len(read1.seq) < min_len:
+          omitted1[trim_len1] += 1
+        if len(read2.seq) < min_len:
+          omitted2[trim_len2] += 1
     else:
       # Output read if it passes the minimum length threshold (if any was given).
-      if min_length is None or len(read1.seq) >= min_length:
+      if min_len is None or len(read1.seq) >= min_len:
         write_read(outfile1, read1, filetype1)
+      else:
+        omitted1[trim_len1] += 1
+  # Compile stats.
+  stats = {}
+  stats['reads'] = sum(trims1.values()) + sum(trims2.values())
+  stats['trimmed'] = stats['reads'] - trims1[0] - trims2[0]
+  stats['omitted'] = sum(omitted1.values()) + sum(omitted2.values())
+  if paired:
+    stats['trimmed1'] = stats['reads']//2 - trims1[0]
+    stats['trimmed2'] = stats['reads']//2 - trims2[0]
+    stats['omitted1'] = sum(omitted1.values())
+    stats['omitted2'] = sum(omitted2.values())
+  # Quintiles for trim lengths.
+  stats['quants'] = {'order':QUANT_ORDER}
+  if paired:
+    stats['quants']['trim1'] = get_counter_quantiles(trims1, order=QUANT_ORDER)
+    stats['quants']['trim2'] = get_counter_quantiles(trims2, order=QUANT_ORDER)
+    stats['quants']['trim'] = get_counter_quantiles(trims1 + trims2, order=QUANT_ORDER)
+    stats['quants']['omitted_trim1'] = get_counter_quantiles(omitted1, order=QUANT_ORDER)
+    stats['quants']['omitted_trim2'] = get_counter_quantiles(omitted2, order=QUANT_ORDER)
+    stats['quants']['omitted_trim'] = get_counter_quantiles(omitted1 + omitted2, order=QUANT_ORDER)
+  else:
+    stats['quants']['trim'] = get_counter_quantiles(trims1)
+    stats['quants']['omitted_trim'] = get_counter_quantiles(omitted1)
+  return stats


 def get_filetype(infile, filetype_arg):
@@ -158,7 +192,20 @@
     filehandle.write('@{name}\n{seq}\n+\n{qual}\n'.format(**vars(read)))


-def trim_read(seq, win_len, thres, filt_bases, invert):
+def trim_read(read, filters, filetype):
+  trimmed_seq = trim_seq(read.seq, **filters)
+  trim_len = len(read.seq) - len(trimmed_seq)
+  read.seq = trimmed_seq
+  if filetype == 'fastq':
+    # If the output filetype is FASTQ, trim the quality scores too.
+    # If there are no input quality scores (i.e. the input is FASTA), use dummy scores instead.
+    # "z" is the highest alphanumeric score (PHRED 89), higher than any expected real score.
+    qual = read.qual or 'z' * len(read.seq)
+    read.qual = qual[:len(read.seq)]
+  return read, trim_len
+
+
+def trim_seq(seq, win_len=1, thres=1.0, filt_bases='N', invert=False, **kwargs):
   """Trim an individual read and return its trimmed sequence.
   This will track the frequency of bad bases in a window of length win_len, and trim once the
   frequency goes below thres. The trim point will be just before the first (leftmost) bad base in
@@ -199,7 +246,6 @@
       if bad_base:
         bad_bases_count -= 1
         bad_bases_coords.pop(0)
-      # print bad_bases_coords
     # Are we over the threshold?
     if bad_bases_count > max_bad_bases:
       break
@@ -211,6 +257,95 @@
   return seq


+def get_counter_quantiles(counter, order=5):
+  """Return an arbitrary set of quantiles (including min and max values).
+  `counter` is a collections.Counter.
+  `order` is which quantile to perform (4 = quartiles, 5 = quintiles).
+  Warning: This expects a counter which has counted at least `order` elements.
+  If it receives a counter with fewer elements, it will simply return `list(counter.elements())`.
+  This will have fewer than the usual order+1 elements, and may not fit normal expectations of
+  what "quantiles" should be."""
+  quantiles = []
+  total = sum(counter.values())
+  if total <= order:
+    return list(counter.elements())
+  span_size = total / order
+  # Sort the items and go through them, looking for the one at the break points.
+  items = list(sorted(counter.items(), key=lambda i: i[0]))
+  quantiles.append(items[0][0])
+  total_seen = 0
+  current_span = 1
+  cut_point = int(round(current_span*span_size))
+  for item, count in items:
+    total_seen += count
+    if total_seen >= cut_point:
+      quantiles.append(item)
+      current_span += 1
+      cut_point = int(round(current_span*span_size))
+  return quantiles
+
+
+def print_stats(stats, format='human'):
+  if format == 'human':
+    lines = get_stats_lines_human(stats)
+  elif format == 'tsv':
+    lines = get_stats_lines_tsv(stats)
+  else:
+    fail('Error: Unrecognized format {!r}'.format(format))
+  sys.stderr.write('\n'.join(lines).format(**stats)+'\n')
+
+
+def get_stats_lines_human(stats):
+  # Single-stat lines:
+  lines = [
+    'Total reads in input:\t{reads}',
+    'Reads trimmed:\t{trimmed}'
+  ]
+  if 'trimmed1' in stats and 'trimmed2' in stats:
+    lines.append(' For mate 1:\t{trimmed1}')
+    lines.append(' For mate 2:\t{trimmed2}')
+  lines.append('Reads filtered out:\t{omitted}')
+  if 'omitted1' in stats and 'omitted2' in stats:
+    lines.append(' For mate 1:\t{omitted1}')
+    lines.append(' For mate 2:\t{omitted2}')
+  # Quantile lines:
+  quantile_lines = [
+    ('Bases trimmed quintiles', 'trim'),
+    (' For mate 1', 'trim1'),
+    (' For mate 2', 'trim2'),
+    ('Bases trimmed quintiles from filtered reads', 'omitted_trim'),
+    (' For mate 1', 'omitted_trim1'),
+    (' For mate 2', 'omitted_trim2')
+  ]
+  for desc, stat_name in quantile_lines:
+    if stat_name in stats['quants']:
+      quants_values = stats['quants'][stat_name]
+      if quants_values:
+        quants_str = ', '.join(map(str, quants_values))
+      else:
+        quants_str = 'N/A'
+      line = desc+':\t'+quants_str
+      lines.append(line)
+  return lines
+
+
+def get_stats_lines_tsv(stats):
+  lines = ['{reads}']
+  if 'trimmed1' in stats and 'trimmed2' in stats:
+    lines.append('{trimmed}\t{trimmed1}\t{trimmed2}')
+  else:
+    lines.append('{trimmed}')
+  if 'omitted1' in stats and 'omitted2' in stats:
+    lines.append('{omitted}\t{omitted1}\t{omitted2}')
+  else:
+    lines.append('{omitted}')
+  for stat_name in ('trim', 'trim1', 'trim2', 'omitted_trim', 'omitted_trim1', 'omitted_trim2'):
+    if stat_name in stats['quants']:
+      quants_values = stats['quants'][stat_name]
+      lines.append('\t'.join(map(str, quants_values)))
+  return lines
+
+
 def fail(message):
   sys.stderr.write(message+"\n")
   sys.exit(1)
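
To make the refactored trimming flow concrete: trim_read() now takes a whole getreads.Read plus the filters dict and returns the trimmed read together with the number of bases removed, which is what feeds the new Counter-based stats. A hedged sketch with an invented read (assumes trimmer.py and getreads.py are both importable):

    import collections
    import getreads
    import trimmer

    # The same keys trim_reads() builds from the command-line options; values here are made up.
    filters = {'win_len': 2, 'thres': 0.5, 'filt_bases': 'N', 'invert': False, 'min_len': 1}
    trims = collections.Counter()

    # Hypothetical read; quality string is the same length as the sequence.
    read = getreads.Read(name='example', seq='ACGTACGTNNNN', qual='IIIIIIIIIIII')
    read, trim_len = trimmer.trim_read(read, filters, 'fastq')
    trims[trim_len] += 1  # per-read trim lengths are what trim_reads() tallies
    print(read.seq, read.qual, trim_len)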
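
The quintile summary can be sanity-checked on its own: get_counter_quantiles() walks the sorted (value, count) pairs and records the value at each break point. A small sketch on a hand-built Counter (the counts are invented):

    import collections
    import trimmer

    # Ten reads, each trimmed by a different number of bases (0 through 9).
    trim_lengths = collections.Counter({n: 1 for n in range(10)})
    print(trimmer.get_counter_quantiles(trim_lengths, order=5))
    # Expected, per the code above: [0, 1, 3, 5, 7, 9] — the minimum plus five quintile points.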
--- a/trimmer.xml  Tue Dec 01 21:33:27 2015 -0500
+++ b/trimmer.xml  Fri May 27 23:29:45 2022 +0000
@@ -1,27 +1,31 @@
-<tool id="sequence_content_trimmer" version="0.1" name="Sequence Content Trimmer">
+<?xml version="1.0"?>
+<tool id="sequence_content_trimmer" version="0.2.3" name="Sequence Content Trimmer">
   <description>trim reads based on certain bases</description>
-  <command interpreter="python">
-    trimmer.py $input1
-    #if $paired.is_paired:
-      $input2 $output1 $output2
-      #if ('fasta' in $input1.extension and 'fastq' in $input2.extension) or ('fastq' in $input1.extension and 'fasta' in $input2.extension)
-        --error 'Both input files must be either fastq or fasta (no mixing the two).'
+  <command detect_errors="exit_code"><![CDATA[
+    #if $paired.is_paired and (('fasta' in $input1.extension and 'fastq' in $input2.extension) or \
+        ('fastq' in $input1.extension and 'fasta' in $input2.extension))
+      echo 'Both input files must be either fastq or fasta (no mixing the two).' >&2
+    #else
+      python '$__tool_directory__/trimmer.py' '$input1'
+      #if $paired.is_paired:
+        '$input2' '$output1' '$output2'
+      #end if
+      #if $input1.extension in ('fastq', 'fastqsanger', 'fastqillumina', 'fastqsolexa')
+        -f fastq
+      #elif $input1.extension == 'fasta'
+        -f fasta
+      #else
+        -f '$input1.extension'
+      #end if
+      -b '$bases' -t '$thres' -w '$win_len' $invert
+      #if $min_len.has_min_len:
+        -m '$min_len.value'
+      #end if
+      #if not $paired.is_paired:
+        > '$output1'
       #end if
     #end if
-    #if $input1.extension == 'fastq' or $input1.extension == 'fastqsanger' or $input1.extension == 'fastqillumina' or $input1.extension == 'fastqsolexa'
-      -f fastq
-    #elif $input1.extension == 'fasta'
-      -f fasta
-    #else
-      -f $input1.extension
-    #end if
-    -b $bases -t $thres -w $win_len $invert
-    #if $min_len.has_min_len:
-      -m $min_len.value
-    #end if
-    #if not $paired.is_paired:
-      > $output1
-    #end if
+  ]]>
   </command>
   <inputs>
     <conditional name="paired">
@@ -49,8 +53,8 @@
     </conditional>
   </inputs>
   <outputs>
-    <data name="output1" format_source="input1"/>
-    <data name="output2" format_source="input2">
+    <data name="output1" format_source="input1" label="$tool.name on $on_string"/>
+    <data name="output2" format_source="input2" label="$tool.name on $on_string (mate 2)">
       <filter>paired['is_paired']</filter>
     </data>
   </outputs>
@@ -68,7 +72,7 @@

 **How it works**

-This will slide along the read with a window, and trim once the frequency of filter bases exceeds the frequency threshold (unless "Invert filter bases" is enabled, when it will trim once non-filter bases exceed the threshold).
+This will slide along the read with a window, and trim once the frequency of filter bases exceeds the frequency threshold (unless "Invert filter bases" is enabled, in which case it will trim once non-filter bases exceed the threshold).
 The trim point will be just before the first (leftmost) filter base in the final window (the one where the frequency exceeded the threshold).
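
For orientation, once Galaxy fills in the Cheetah template, the new <command> block reduces to an ordinary command line. For a hypothetical single-end FASTQ dataset it would look roughly like the following (paths and parameter values are placeholders; $invert expands to either --invert or nothing, and -m is only added when a minimum length is set):

    python '/path/to/tool/trimmer.py' 'input_1.fq' -f fastq -b 'N' -t '0.2' -w '10' > 'output_1.fq'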