Repository: nick / sequence_content_trimmer
changeset 1:464aee13e2df draft default tip
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
author:    nick
date:      Fri, 27 May 2022 23:29:45 +0000
parents:   7f170cb06e2e
children:  (none)
files:     getreads.py trimmer.py trimmer.xml
diffstat:  3 files changed, 456 insertions(+), 153 deletions(-)
--- a/getreads.py  Tue Dec 01 21:33:27 2015 -0500
+++ b/getreads.py  Fri May 27 23:29:45 2022 +0000
@@ -1,3 +1,9 @@
+#!/usr/bin/env python3
+import argparse
+import logging
+import os
+import sys
+import types
 """A simple parser for FASTA, FASTQ, SAM, etc. Create generators that just return the read name and
 sequence.
 All format parsers follow this API:
@@ -12,18 +18,42 @@
   qual: The quality scores (unless the format is FASTA).
 """

+# Available formats.
+FORMATS = ('fasta', 'fastq', 'sam', 'tsv', 'lines')

-def getparser(filehandle, filetype='fasta'):
+QUAL_OFFSETS = {'sanger':33, 'solexa':64}
+
+
+def getparser(input, filetype, qual_format='sanger', name_col=1, seq_col=2, qual_col=3):
+  # Detect whether the input is an open file or a path.
+  # Return the appropriate reader.
   if filetype == 'fasta':
-    return FastaReader(filehandle)
+    return FastaReader(input)
   elif filetype == 'fastq':
-    return FastqReader(filehandle)
+    return FastqReader(input, qual_format=qual_format)
   elif filetype == 'sam':
-    return SamReader(filehandle)
+    return SamReader(input, qual_format=qual_format)
   elif filetype == 'tsv':
-    return TsvReader(filehandle)
+    return TsvReader(input, qual_format=qual_format,
+                     name_col=name_col, seq_col=seq_col, qual_col=qual_col)
+  elif filetype == 'lines':
+    return LineReader(input)
   else:
-    raise ValueError('Illegal argument: filetype=\''+filetype+'\'')
+    raise ValueError('Unrecognized format: {!r}'.format(filetype))
+
+
+def detect_input_type(obj):
+  """Is this an open filehandle, or is it a file path (string)?"""
+  try:
+    os.path.isfile(obj)
+    return 'path'
+  except TypeError:
+    if isinstance(obj, types.GeneratorType):
+      return 'generator'
+    elif hasattr(obj, 'read') and hasattr(obj, 'close'):
+      return 'file'
+    else:
+      return None


 class FormatError(Exception):
@@ -33,19 +63,79 @@

 class Read(object):
-  def __init__(self, name='', seq='', id_='', qual=''):
+  def __init__(self, name='', seq='', id_='', qual='', qual_format='sanger'):
     self.name = name
     self.seq = seq
-    self.id = id_
     self.qual = qual
+    if id_ or not self.name:
+      self.id = id_
+    elif self.name:
+      self.id = self.name.split()[0]
+    self.offset = QUAL_OFFSETS[qual_format]
+  @property
+  def scores(self):
+    if self.qual is None:
+      return None
+    scores = []
+    for qual_char in self.qual:
+      scores.append(ord(qual_char) - self.offset)
+    return scores
+  def to_fasta(self):
+    return f'>{self.name}\n{self.seq}'
+  def to_fastq(self):
+    return f'@{self.name}\n{self.seq}\n+\n{self.qual}'
+  def __str__(self):
+    if self.qual:
+      return self.to_fastq()
+    else:
+      return self.to_fasta()
+  def __repr__(self):
+    kwarg_strs = []
+    for kwarg in 'name', 'seq', 'id_', 'qual':
+      attr = kwarg.rstrip('_')
+      raw_value = getattr(self, attr)
+      if raw_value is not None and len(raw_value) >= 200:
+        value = raw_value[:199]+'…'
+      else:
+        value = raw_value
+      if value:
+        kwarg_strs.append(f'{kwarg}={value!r}')
+    return type(self).__name__+'('+', '.join(kwarg_strs)+')'


 class Reader(object):
   """Base class for all other parsers."""
-  def __init__(self, filehandle):
-    self.filehandle = filehandle
+  def __init__(self, input, **kwargs):
+    self.input = input
+    self.input_type = detect_input_type(input)
+    if self.input_type not in ('path', 'file', 'generator'):
+      raise ValueError('Input object {!r} not a file, string, or generator.'.format(input))
+    for key, value in kwargs.items():
+      setattr(self, key, value)
   def __iter__(self):
     return self.parser()
+  def bases(self):
+    for read in self.parser():
+      for base in read.seq:
+        yield base
+  def get_input_iterator(self):
+    if self.input_type == 'path':
+      return open(self.input)
+    else:
+      return self.input
+
+
+class LineReader(Reader):
+  """A parser for the simplest format: Only the sequence, one line per read."""
+  def parser(self):
+    input_iterator = self.get_input_iterator()
+    try:
+      for line in input_iterator:
+        read = Read(seq=line.rstrip('\r\n'))
+        yield read
+    finally:
+      if self.input_type == 'path':
+        input_iterator.close()


 class TsvReader(Reader):
@@ -54,18 +144,26 @@
   Column 2: sequence
   Column 3: quality scores (optional)"""
   def parser(self):
-    for line in self.filehandle:
-      fields = line.rstrip('\r\n').split('\t')
-      if len(fields) < 2:
-        continue
-      read = Read()
-      read.name = fields[0]
-      if read.name:
-        read.id = read.name.split()[0]
-      read.seq = fields[1]
-      if len(fields) >= 3:
-        read.qual = fields[2]
-      yield read
+    min_fields = max(self.name_col, self.seq_col)
+    input_iterator = self.get_input_iterator()
+    try:
+      for line in input_iterator:
+        fields = line.rstrip('\r\n').split('\t')
+        if len(fields) < min_fields:
+          continue
+        read = Read(qual_format=self.qual_format)
+        read.name = fields[self.name_col-1]
+        if read.name:
+          read.id = read.name.split()[0]
+        read.seq = fields[self.seq_col-1]
+        try:
+          read.qual = fields[self.qual_col-1]
+        except (TypeError, IndexError):
+          pass
+        yield read
+    finally:
+      if self.input_type == 'path':
+        input_iterator.close()


 class SamReader(Reader):
@@ -75,82 +173,148 @@
   All alignment lines have 11 or more fields. Other lines will be skipped.
   """
   def parser(self):
-    for line in self.filehandle:
-      fields = line.split('\t')
-      if len(fields) < 11:
-        continue
-      # Skip headers.
-      if fields[0].startswith('@') and len(fields[0]) == 3:
-        continue
-      read = Read()
-      read.name = fields[0]
-      if read.name:
-        read.id = read.name.split()[0]
-      read.seq = fields[9]
-      read.qual = fields[10].rstrip('\r\n')
-      yield read
+    input_iterator = self.get_input_iterator()
+    try:
+      for line in input_iterator:
+        fields = line.split('\t')
+        if len(fields) < 11:
+          continue
+        # Skip headers.
+        if fields[0].startswith('@') and len(fields[0]) == 3:
+          continue
+        read = Read(qual_format=self.qual_format)
+        read.name = fields[0]
+        if read.name:
+          read.id = read.name.split()[0]
+        read.seq = fields[9]
+        read.qual = fields[10].rstrip('\r\n')
+        yield read
+    finally:
+      if self.input_type == 'path':
+        input_iterator.close()


 class FastaReader(Reader):
   """A simple FASTA parser that reads one sequence at a time into memory."""
   def parser(self):
-    read = Read()
-    while True:
-      line_raw = self.filehandle.readline()
-      if not line_raw:
-        if read.seq:
-          yield read
-        raise StopIteration
-      line = line_raw.strip()
-      # Allow empty lines.
-      if not line:
-        continue
-      if line.startswith('>'):
-        if read.seq:
-          yield read
-        read = Read()
-        read.name = line[1:]  # remove ">"
-        if read.name:
-          read.id = read.name.split()[0]
-        continue
-      else:
-        read.seq += line
+    input_iterator = self.get_input_iterator()
+    try:
+      read = None
+      while True:
+        try:
+          line_raw = next(input_iterator)
+        except StopIteration:
+          if read is not None:
+            yield read
+          return
+        line = line_raw.rstrip('\r\n')
+        if line.startswith('>'):
+          if read is not None:
+            yield read
+          read = Read()
+          read.name = line[1:]  # remove ">"
+          if read.name:
+            read.id = read.name.split()[0]
+          continue
+        else:
+          read.seq += line
+    finally:
+      if self.input_type == 'path':
+        input_iterator.close()


 class FastqReader(Reader):
   """A simple FASTQ parser. Can handle multi-line sequences, though."""
   def parser(self):
-    read = Read()
-    state = 'header'
-    while True:
-      line_raw = self.filehandle.readline()
-      if not line_raw:
-        if read.seq:
-          yield read
-        raise StopIteration
-      line = line_raw.strip()
-      # Allow empty lines.
-      if not line:
-        continue
-      if state == 'header':
-        if not line.startswith('@'):
-          raise FormatError('line state = "header" but line does not start with "@"')
-        if read.seq:
-          yield read
-        read = Read()
-        read.name = line[1:]  # remove '@'
-        if read.name:
-          read.id = read.name.split()[0]
-        state = 'sequence'
-      elif state == 'sequence':
-        if line.startswith('+'):
-          state = 'plus'
-        else:
-          read.seq += line
-      elif state == 'plus' or state == 'quality':
-        state = 'quality'
-        togo = len(read.seq) - len(read.qual)
-        read.qual += line[:togo]
-        # The end of the quality lines is when we have a quality string as long as the sequence.
-        if len(read.qual) >= len(read.seq):
-          state = 'header'
+    input_iterator = self.get_input_iterator()
+    try:
+      read = None
+      line_num = 0
+      state = 'header'
+      while True:
+        try:
+          line_raw = next(input_iterator)
+        except StopIteration:
+          if read is not None:
+            yield read
+          return
+        line_num += 1
+        line = line_raw.rstrip('\r\n')
+        if state == 'header':
+          if not line.startswith('@'):
+            if line:
+              raise FormatError('line state = "header" but line does not start with "@":\n'+line)
+            else:
+              # Allow empty lines.
+              continue
+          if read is not None:
+            yield read
+          read = Read(qual_format=self.qual_format)
+          read.name = line[1:]  # remove '@'
+          if read.name:
+            read.id = read.name.split()[0]
+          state = 'sequence'
+        elif state == 'sequence':
+          if line.startswith('+'):
+            state = 'plus'
+          else:
+            read.seq += line
+        elif state == 'plus' or state == 'quality':
+          if line.startswith('@') and state == 'quality':
+            logging.warning('Looking for more quality scores but line starts with "@". This might '
+                            'be a header line and there were fewer quality scores than bases: {}'
+                            .format(line[:69]))
+          state = 'quality'
+          togo = len(read.seq) - len(read.qual)
+          read.qual += line[:togo]
+          # The end of the quality lines is when we have a quality string as long as the sequence.
+          if len(read.qual) >= len(read.seq):
+            state = 'header'
+    finally:
+      if self.input_type == 'path':
+        input_iterator.close()
+
+
+DESCRIPTION = 'Test parser by parsing an input file and printing its contents.'
+
+
+def make_argparser():
+  parser = argparse.ArgumentParser(description=DESCRIPTION)
+  parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin,
+    help='Input reads.')
+  parser.add_argument('-f', '--format', choices=('fasta', 'fastq', 'sam', 'tsv', 'lines'),
+    help='Input read format. Will be detected from the filename, if given.')
+  return parser
+
+
+def main(argv):
+  parser = make_argparser()
+  args = parser.parse_args(argv[1:])
+  if args.format:
+    format = args.format
+  elif args.infile is sys.stdin:
+    fail('Error: Must give a --format if reading from stdin.')
+  else:
+    ext = os.path.splitext(args.infile.name)[1]
+    if ext == '.fq':
+      format = 'fastq'
+    elif ext == '.fa':
+      format = 'fasta'
    elif ext == '.txt':
+      format = 'lines'
+    else:
+      format = ext[1:]
+  print('Reading input as format {!r}.'.format(format))
+  for i, read in enumerate(getparser(args.infile, filetype=format)):
+    print('Read {} id/name: {!r}/{!r}'.format(i+1, read.id, read.name))
+    print('Read {} seq: {!r}'.format(i+1, read.seq))
+    print('Read {} qual: {!r}'.format(i+1, read.qual))
+
+
+def fail(message):
+  sys.stderr.write(message+"\n")
+  sys.exit(1)
+
+
+if __name__ == '__main__':
+  sys.exit(main(sys.argv))
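
For context, the reworked getreads API can be driven either with an open filehandle or with a plain file path, since the Reader base class now detects the input type. A minimal usage sketch (the file name is a placeholder, and this assumes getreads.py is on the Python path):

    #!/usr/bin/env python3
    # Sketch: iterate over reads with the revised getreads API.
    import getreads

    # getparser() accepts a path or an open filehandle; here a hypothetical FASTQ path.
    for read in getreads.getparser('reads.fq', filetype='fastq', qual_format='sanger'):
        print(read.id, len(read.seq))
        if read.qual:
            # Read.scores converts the quality string to numeric scores using the
            # 'sanger' offset (33) from QUAL_OFFSETS.
            print(read.scores)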
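
Because detect_input_type() also recognizes generators, a parser can be fed an in-memory stream of lines instead of a file, which is handy for quick tests. A sketch with made-up records (note it must be a real generator, not just any iterator, for the type check to pass):

    import getreads

    # Generator of FASTA-formatted lines standing in for a filehandle; records are invented.
    lines = (line for line in ['>read1\n', 'ACGTACGT\n', '>read2\n', 'NNNNACGT\n'])
    reader = getreads.getparser(lines, filetype='fasta')
    print([read.name for read in reader])  # expected: ['read1', 'read2']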
--- a/trimmer.py  Tue Dec 01 21:33:27 2015 -0500
+++ b/trimmer.py  Fri May 27 23:29:45 2022 +0000
@@ -1,59 +1,60 @@
-#!/usr/bin/env python
-from __future__ import division
+#!/usr/bin/env python3
 import sys
 import argparse
+import collections
 import getreads

-OPT_DEFAULTS = {'win_len':1, 'thres':1.0, 'filt_bases':'N'}
+QUANT_ORDER = 5
 USAGE = "%(prog)s [options] [input_1.fq [input_2.fq output_1.fq output_2.fq]]"
 DESCRIPTION = """Trim the 5' ends of reads by sequence content, e.g. by GC content or presence of
 N's."""


-def main(argv):
-
+def make_argparser():
   parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE)
-  parser.set_defaults(**OPT_DEFAULTS)
-
   parser.add_argument('infile1', metavar='reads_1.fq', nargs='?', type=argparse.FileType('r'),
     default=sys.stdin,
     help='Input reads (mate 1). Omit to read from stdin.')
   parser.add_argument('infile2', metavar='reads_2.fq', nargs='?', type=argparse.FileType('r'),
     help='Input reads (mate 2). If given, it will preserve pairs (if one read is filtered out '
          'entirely, the other will also be lost).')
-  parser.add_argument('outfile1', metavar='reads.filt_1.fq', nargs='?', type=argparse.FileType('w'),
+  parser.add_argument('outfile1', metavar='output_1.fq', nargs='?', type=argparse.FileType('w'),
     default=sys.stdout,
     help='Output file for mate 1. WARNING: Will overwrite.')
-  parser.add_argument('outfile2', metavar='reads.filt_2.fq', nargs='?', type=argparse.FileType('w'),
+  parser.add_argument('outfile2', metavar='output_2.fq', nargs='?', type=argparse.FileType('w'),
     help='Output file for mate 2. WARNING: Will overwrite.')
   parser.add_argument('-f', '--format', dest='filetype', choices=('fasta', 'fastq'),
     help='Input read format.')
   parser.add_argument('-F', '--out-format', dest='out_filetype', choices=('fasta', 'fastq'),
     help='Output read format. Default: whatever the input format is.')
-  parser.add_argument('-b', '--filt-bases',
+  parser.add_argument('-b', '--filt-bases', default='N',
     help='The bases to filter on. Case-insensitive. Default: %(default)s.')
-  parser.add_argument('-t', '--thres', type=float,
+  parser.add_argument('-t', '--thres', type=float, default=0.5,
    help='The threshold. The read will be trimmed once the proportion of filter bases in the '
         'window exceed this fraction (not a percentage). Default: %(default)s.')
-  parser.add_argument('-w', '--window', dest='win_len', type=int,
+  parser.add_argument('-w', '--window', dest='win_len', type=int, default=1,
    help='Window size for trimming. Default: %(default)s.')
   parser.add_argument('-i', '--invert', action='store_true',
    help='Invert the filter bases: filter on bases NOT present in the --filt-bases.')
-  parser.add_argument('-m', '--min-length', type=int,
+  parser.add_argument('-m', '--min-length', type=int, default=1,
    help='Set a minimum read length. Reads which are trimmed below this length will be filtered '
         'out (omitted entirely from the output). Read pairs will be preserved: both reads in a '
-         'pair must exceed this length to be kept. Set to 0 to only omit empty reads.')
-  parser.add_argument('--error',
-    help='Fail with this error message (useful for Galaxy tool).')
+         'pair must exceed this length to be kept. Set to 1 to only omit empty reads. '
+         'Default: %(default)s.')
   parser.add_argument('-A', '--acgt', action='store_true',
    help='Filter on any non-ACGT base (shortcut for "--invert --filt-bases ACGT").')
   parser.add_argument('-I', '--iupac', action='store_true',
    help='Filter on any non-IUPAC base (shortcut for "--invert --filt-bases ACGTUWSMKRYBDHVN-").')
-
-  args = parser.parse_args(argv[1:])
+  parser.add_argument('-q', '--quiet', action='store_true',
+    help='Don\'t print trimming stats on completion.')
+  parser.add_argument('-T', '--tsv', dest='stats_format', default='human',
+    action='store_const', const='tsv')
+  return parser

-  if args.error:
-    fail('Error: '+args.error)
+
+def main(argv):
+  parser = make_argparser()
+  args = parser.parse_args(argv[1:])

   # Catch invalid argument combinations.
   if args.infile1 and args.infile2 and not (args.outfile1 and args.outfile2):
@@ -85,18 +86,28 @@
     invert = True

   # Do the actual trimming.
+  filters = {'win_len':args.win_len, 'thres':args.thres, 'filt_bases':filt_bases, 'invert':invert,
+             'min_len':args.min_length}
   try:
-    trim_reads(file1_parser, file2_parser, args.outfile1, args.outfile2, filetype1, filetype2,
-               paired, args.win_len, args.thres, filt_bases, invert, args.min_length)
+    stats = trim_reads(file1_parser, file2_parser, args.outfile1, args.outfile2,
+                       filetype1, filetype2, paired, filters)
   finally:
     for filehandle in (args.infile1, args.infile2, args.outfile1, args.outfile2):
       if filehandle and filehandle is not sys.stdin and filehandle is not sys.stdout:
         filehandle.close()
+  if not args.quiet:
+    print_stats(stats, args.stats_format)
+

 def trim_reads(file1_parser, file2_parser, outfile1, outfile2, filetype1, filetype2, paired,
-               win_len, thres, filt_bases, invert, min_length):
+               filters):
   """Trim all the reads in the input file(s), writing to the output file(s)."""
+  min_len = filters['min_len']
+  trims1 = collections.Counter()
+  trims2 = collections.Counter()
+  omitted1 = collections.Counter()
+  omitted2 = collections.Counter()
   read1 = None
   read2 = None
   while True:
@@ -108,26 +119,49 @@
       except StopIteration:
         break
     # Do trimming.
-    read1.seq = trim_read(read1.seq, win_len, thres, filt_bases, invert)
-    if filetype1 == 'fastq':
-      # If the output filetype is FASTQ, trim the quality scores too.
-      # If there are no input quality scores (i.e. the input is FASTA), use dummy scores instead.
-      # "z" is the highest alphanumeric score (PHRED 89), higher than any expected real score.
-      qual1 = read1.qual or 'z' * len(read1.seq)
-      read1.qual = qual1[:len(read1.seq)]
+    read1, trim_len1 = trim_read(read1, filters, filetype1)
+    trims1[trim_len1] += 1
     if paired:
-      read2.seq = trim_read(read2.seq, win_len, thres, filt_bases, invert)
-      if filetype2 == 'fastq':
-        qual2 = read2.qual or 'z' * len(read2.seq)
-        read2.qual = qual2[:len(read2.seq)]
+      read2, trim_len2 = trim_read(read2, filters, filetype2)
+      trims2[trim_len2] += 1
       # Output reads if they both pass the minimum length threshold (if any was given).
-      if min_length is None or (len(read1.seq) >= min_length and len(read2.seq) >= min_length):
+      if min_len is None or (len(read1.seq) >= min_len and len(read2.seq) >= min_len):
         write_read(outfile1, read1, filetype1)
         write_read(outfile2, read2, filetype2)
+      else:
+        if len(read1.seq) < min_len:
+          omitted1[trim_len1] += 1
+        if len(read2.seq) < min_len:
+          omitted2[trim_len2] += 1
     else:
       # Output read if it passes the minimum length threshold (if any was given).
-      if min_length is None or len(read1.seq) >= min_length:
+      if min_len is None or len(read1.seq) >= min_len:
         write_read(outfile1, read1, filetype1)
+      else:
+        omitted1[trim_len1] += 1
+  # Compile stats.
+  stats = {}
+  stats['reads'] = sum(trims1.values()) + sum(trims2.values())
+  stats['trimmed'] = stats['reads'] - trims1[0] - trims2[0]
+  stats['omitted'] = sum(omitted1.values()) + sum(omitted2.values())
+  if paired:
+    stats['trimmed1'] = stats['reads']//2 - trims1[0]
+    stats['trimmed2'] = stats['reads']//2 - trims2[0]
+    stats['omitted1'] = sum(omitted1.values())
+    stats['omitted2'] = sum(omitted2.values())
+  # Quintiles for trim lengths.
+  stats['quants'] = {'order':QUANT_ORDER}
+  if paired:
+    stats['quants']['trim1'] = get_counter_quantiles(trims1, order=QUANT_ORDER)
+    stats['quants']['trim2'] = get_counter_quantiles(trims2, order=QUANT_ORDER)
+    stats['quants']['trim'] = get_counter_quantiles(trims1 + trims2, order=QUANT_ORDER)
+    stats['quants']['omitted_trim1'] = get_counter_quantiles(omitted1, order=QUANT_ORDER)
+    stats['quants']['omitted_trim2'] = get_counter_quantiles(omitted2, order=QUANT_ORDER)
+    stats['quants']['omitted_trim'] = get_counter_quantiles(omitted1 + omitted2, order=QUANT_ORDER)
+  else:
+    stats['quants']['trim'] = get_counter_quantiles(trims1)
+    stats['quants']['omitted_trim'] = get_counter_quantiles(omitted1)
+  return stats


 def get_filetype(infile, filetype_arg):
@@ -158,7 +192,20 @@
     filehandle.write('@{name}\n{seq}\n+\n{qual}\n'.format(**vars(read)))


-def trim_read(seq, win_len, thres, filt_bases, invert):
+def trim_read(read, filters, filetype):
+  trimmed_seq = trim_seq(read.seq, **filters)
+  trim_len = len(read.seq) - len(trimmed_seq)
+  read.seq = trimmed_seq
+  if filetype == 'fastq':
+    # If the output filetype is FASTQ, trim the quality scores too.
+    # If there are no input quality scores (i.e. the input is FASTA), use dummy scores instead.
+    # "z" is the highest alphanumeric score (PHRED 89), higher than any expected real score.
+    qual = read.qual or 'z' * len(read.seq)
+    read.qual = qual[:len(read.seq)]
+  return read, trim_len
+
+
+def trim_seq(seq, win_len=1, thres=1.0, filt_bases='N', invert=False, **kwargs):
   """Trim an individual read and return its trimmed sequence.
   This will track the frequency of bad bases in a window of length win_len, and trim once the
   frequency goes below thres. The trim point will be just before the first (leftmost) bad base in
@@ -199,7 +246,6 @@
       if bad_base:
         bad_bases_count -= 1
         bad_bases_coords.pop(0)
-      # print bad_bases_coords
     # Are we over the threshold?
     if bad_bases_count > max_bad_bases:
       break
@@ -211,6 +257,95 @@
   return seq


+def get_counter_quantiles(counter, order=5):
+  """Return an arbitrary set of quantiles (including min and max values).
+  `counter` is a collections.Counter.
+  `order` is which quantile to perform (4 = quartiles, 5 = quintiles).
+  Warning: This expects a counter which has counted at least `order` elements.
+  If it receives a counter with fewer elements, it will simply return `list(counter.elements())`.
+  This will have fewer than the usual order+1 elements, and may not fit normal expectations of
+  what "quantiles" should be."""
+  quantiles = []
+  total = sum(counter.values())
+  if total <= order:
+    return list(counter.elements())
+  span_size = total / order
+  # Sort the items and go through them, looking for the one at the break points.
+  items = list(sorted(counter.items(), key=lambda i: i[0]))
+  quantiles.append(items[0][0])
+  total_seen = 0
+  current_span = 1
+  cut_point = int(round(current_span*span_size))
+  for item, count in items:
+    total_seen += count
+    if total_seen >= cut_point:
+      quantiles.append(item)
+      current_span += 1
+      cut_point = int(round(current_span*span_size))
+  return quantiles
+
+
+def print_stats(stats, format='human'):
+  if format == 'human':
+    lines = get_stats_lines_human(stats)
+  elif format == 'tsv':
+    lines = get_stats_lines_tsv(stats)
+  else:
+    fail('Error: Unrecognized format {!r}'.format(format))
+  sys.stderr.write('\n'.join(lines).format(**stats)+'\n')
+
+
+def get_stats_lines_human(stats):
+  # Single-stat lines:
+  lines = [
+    'Total reads in input:\t{reads}',
+    'Reads trimmed:\t{trimmed}'
+  ]
+  if 'trimmed1' in stats and 'trimmed2' in stats:
+    lines.append(' For mate 1:\t{trimmed1}')
+    lines.append(' For mate 2:\t{trimmed2}')
+  lines.append('Reads filtered out:\t{omitted}')
+  if 'omitted1' in stats and 'omitted2' in stats:
+    lines.append(' For mate 1:\t{omitted1}')
+    lines.append(' For mate 2:\t{omitted2}')
+  # Quantile lines:
+  quantile_lines = [
+    ('Bases trimmed quintiles', 'trim'),
+    (' For mate 1', 'trim1'),
+    (' For mate 2', 'trim2'),
+    ('Bases trimmed quintiles from filtered reads', 'omitted_trim'),
+    (' For mate 1', 'omitted_trim1'),
+    (' For mate 2', 'omitted_trim2')
+  ]
+  for desc, stat_name in quantile_lines:
+    if stat_name in stats['quants']:
+      quants_values = stats['quants'][stat_name]
+      if quants_values:
+        quants_str = ', '.join(map(str, quants_values))
+      else:
+        quants_str = 'N/A'
+      line = desc+':\t'+quants_str
+      lines.append(line)
+  return lines
+
+
+def get_stats_lines_tsv(stats):
+  lines = ['{reads}']
+  if 'trimmed1' in stats and 'trimmed2' in stats:
+    lines.append('{trimmed}\t{trimmed1}\t{trimmed2}')
+  else:
+    lines.append('{trimmed}')
+  if 'omitted1' in stats and 'omitted2' in stats:
+    lines.append('{omitted}\t{omitted1}\t{omitted2}')
+  else:
+    lines.append('{omitted}')
+  for stat_name in ('trim', 'trim1', 'trim2', 'omitted_trim', 'omitted_trim1', 'omitted_trim2'):
+    if stat_name in stats['quants']:
+      quants_values = stats['quants'][stat_name]
+      lines.append('\t'.join(map(str, quants_values)))
+  return lines
+
+
 def fail(message):
   sys.stderr.write(message+"\n")
   sys.exit(1)
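
To make the refactored trimming flow concrete: trim_read() now takes a whole getreads.Read plus the filters dict and returns the trimmed read together with the number of bases removed, which is what feeds the new Counter-based stats. A hedged sketch with an invented read (assumes trimmer.py and getreads.py are both importable):

    import collections
    import getreads
    import trimmer

    # The same keys trim_reads() builds from the command-line options; values here are made up.
    filters = {'win_len': 2, 'thres': 0.5, 'filt_bases': 'N', 'invert': False, 'min_len': 1}
    trims = collections.Counter()

    # Hypothetical read; quality string is the same length as the sequence.
    read = getreads.Read(name='example', seq='ACGTACGTNNNN', qual='IIIIIIIIIIII')
    read, trim_len = trimmer.trim_read(read, filters, 'fastq')
    trims[trim_len] += 1  # per-read trim lengths are what trim_reads() tallies
    print(read.seq, read.qual, trim_len)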
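
The quintile summary can be sanity-checked on its own: get_counter_quantiles() walks the sorted (value, count) pairs and records the value at each break point. A small sketch on a hand-built Counter (the counts are invented):

    import collections
    import trimmer

    # Ten reads, each trimmed by a different number of bases (0 through 9).
    trim_lengths = collections.Counter({n: 1 for n in range(10)})
    print(trimmer.get_counter_quantiles(trim_lengths, order=5))
    # Expected, per the code above: [0, 1, 3, 5, 7, 9] — the minimum plus five quintile points.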
--- a/trimmer.xml  Tue Dec 01 21:33:27 2015 -0500
+++ b/trimmer.xml  Fri May 27 23:29:45 2022 +0000
@@ -1,27 +1,31 @@
-<tool id="sequence_content_trimmer" version="0.1" name="Sequence Content Trimmer">
+<?xml version="1.0"?>
+<tool id="sequence_content_trimmer" version="0.2.3" name="Sequence Content Trimmer">
   <description>trim reads based on certain bases</description>
-  <command interpreter="python">
-    trimmer.py $input1
-    #if $paired.is_paired:
-      $input2 $output1 $output2
-      #if ('fasta' in $input1.extension and 'fastq' in $input2.extension) or ('fastq' in $input1.extension and 'fasta' in $input2.extension)
-        --error 'Both input files must be either fastq or fasta (no mixing the two).'
+  <command detect_errors="exit_code"><![CDATA[
+    #if $paired.is_paired and (('fasta' in $input1.extension and 'fastq' in $input2.extension) or \
+        ('fastq' in $input1.extension and 'fasta' in $input2.extension))
+      echo 'Both input files must be either fastq or fasta (no mixing the two).' >&2
+    #else
+      python '$__tool_directory__/trimmer.py' '$input1'
+      #if $paired.is_paired:
+        '$input2' '$output1' '$output2'
+      #end if
+      #if $input1.extension in ('fastq', 'fastqsanger', 'fastqillumina', 'fastqsolexa')
+        -f fastq
+      #elif $input1.extension == 'fasta'
+        -f fasta
+      #else
+        -f '$input1.extension'
+      #end if
+      -b '$bases' -t '$thres' -w '$win_len' $invert
+      #if $min_len.has_min_len:
+        -m '$min_len.value'
+      #end if
+      #if not $paired.is_paired:
+        > '$output1'
       #end if
     #end if
-    #if $input1.extension == 'fastq' or $input1.extension == 'fastqsanger' or $input1.extension == 'fastqillumina' or $input1.extension == 'fastqsolexa'
-      -f fastq
-    #elif $input1.extension == 'fasta'
-      -f fasta
-    #else
-      -f $input1.extension
-    #end if
-    -b $bases -t $thres -w $win_len $invert
-    #if $min_len.has_min_len:
-      -m $min_len.value
-    #end if
-    #if not $paired.is_paired:
-      > $output1
-    #end if
+  ]]>
   </command>
   <inputs>
     <conditional name="paired">
@@ -49,8 +53,8 @@
     </conditional>
   </inputs>
   <outputs>
-    <data name="output1" format_source="input1"/>
-    <data name="output2" format_source="input2">
+    <data name="output1" format_source="input1" label="$tool.name on $on_string"/>
+    <data name="output2" format_source="input2" label="$tool.name on $on_string (mate 2)">
       <filter>paired['is_paired']</filter>
     </data>
   </outputs>
@@ -68,7 +72,7 @@

 **How it works**

-This will slide along the read with a window, and trim once the frequency of filter bases exceeds the frequency threshold (unless "Invert filter bases" is enabled, when it will trim once non-filter bases exceed the threshold).
+This will slide along the read with a window, and trim once the frequency of filter bases exceeds the frequency threshold (unless "Invert filter bases" is enabled, in which case it will trim once non-filter bases exceed the threshold).
 The trim point will be just before the first (leftmost) filter base in the final window (the one where the frequency exceeded the threshold).
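
For orientation, once Galaxy fills in the Cheetah template, the new <command> block reduces to an ordinary command line. For a hypothetical single-end FASTQ dataset it would look roughly like the following (paths and parameter values are placeholders; $invert expands to either --invert or nothing, and -m is only added when a minimum length is set):

    python '/path/to/tool/trimmer.py' 'input_1.fq' -f fastq -b 'N' -t '0.2' -w '10' > 'output_1.fq'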