Mercurial > repos > nick > sequence_content_trimmer
changeset 0:7f170cb06e2e draft
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
author | nick |
---|---|
date | Tue, 01 Dec 2015 21:33:27 -0500 |
parents | |
children | 464aee13e2df |
files | getreads.py trimmer.py trimmer.xml |
diffstat | 3 files changed, 460 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getreads.py Tue Dec 01 21:33:27 2015 -0500 @@ -0,0 +1,156 @@ +"""A simple parser for FASTA, FASTQ, SAM, etc. Create generators that just return the read name and +sequence. +All format parsers follow this API: + with open('sequence.fasta') as fasta: + for read in getreads.getparser(fasta, filetype='fasta'): + print "There is a sequence with this FASTA identifier: "+read.id + print "Its sequence is "+read.seq +The properties of Read are: + name: The entire FASTA header line, SAM column 1, etc. + id: The first whitespace-delimited part of the name. + seq: The sequence. + qual: The quality scores (unless the format is FASTA). +""" + + +def getparser(filehandle, filetype='fasta'): + if filetype == 'fasta': + return FastaReader(filehandle) + elif filetype == 'fastq': + return FastqReader(filehandle) + elif filetype == 'sam': + return SamReader(filehandle) + elif filetype == 'tsv': + return TsvReader(filehandle) + else: + raise ValueError('Illegal argument: filetype=\''+filetype+'\'') + + +class FormatError(Exception): + def __init__(self, message=None): + if message: + Exception.__init__(self, message) + + +class Read(object): + def __init__(self, name='', seq='', id_='', qual=''): + self.name = name + self.seq = seq + self.id = id_ + self.qual = qual + + +class Reader(object): + """Base class for all other parsers.""" + def __init__(self, filehandle): + self.filehandle = filehandle + def __iter__(self): + return self.parser() + + +class TsvReader(Reader): + """A parser for a simple tab-delimited format. + Column 1: name + Column 2: sequence + Column 3: quality scores (optional)""" + def parser(self): + for line in self.filehandle: + fields = line.rstrip('\r\n').split('\t') + if len(fields) < 2: + continue + read = Read() + read.name = fields[0] + if read.name: + read.id = read.name.split()[0] + read.seq = fields[1] + if len(fields) >= 3: + read.qual = fields[2] + yield read + + +class SamReader(Reader): + """A simple SAM parser. + Assumptions: + Lines starting with "@" with 3 fields are headers. All others are alignments. + All alignment lines have 11 or more fields. Other lines will be skipped. + """ + def parser(self): + for line in self.filehandle: + fields = line.split('\t') + if len(fields) < 11: + continue + # Skip headers. + if fields[0].startswith('@') and len(fields[0]) == 3: + continue + read = Read() + read.name = fields[0] + if read.name: + read.id = read.name.split()[0] + read.seq = fields[9] + read.qual = fields[10].rstrip('\r\n') + yield read + + +class FastaReader(Reader): + """A simple FASTA parser that reads one sequence at a time into memory.""" + def parser(self): + read = Read() + while True: + line_raw = self.filehandle.readline() + if not line_raw: + if read.seq: + yield read + raise StopIteration + line = line_raw.strip() + # Allow empty lines. + if not line: + continue + if line.startswith('>'): + if read.seq: + yield read + read = Read() + read.name = line[1:] # remove ">" + if read.name: + read.id = read.name.split()[0] + continue + else: + read.seq += line + + +class FastqReader(Reader): + """A simple FASTQ parser. Can handle multi-line sequences, though.""" + def parser(self): + read = Read() + state = 'header' + while True: + line_raw = self.filehandle.readline() + if not line_raw: + if read.seq: + yield read + raise StopIteration + line = line_raw.strip() + # Allow empty lines. + if not line: + continue + if state == 'header': + if not line.startswith('@'): + raise FormatError('line state = "header" but line does not start with "@"') + if read.seq: + yield read + read = Read() + read.name = line[1:] # remove '@' + if read.name: + read.id = read.name.split()[0] + state = 'sequence' + elif state == 'sequence': + if line.startswith('+'): + state = 'plus' + else: + read.seq += line + elif state == 'plus' or state == 'quality': + state = 'quality' + togo = len(read.seq) - len(read.qual) + read.qual += line[:togo] + # The end of the quality lines is when we have a quality string as long as the sequence. + if len(read.qual) >= len(read.seq): + state = 'header'
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trimmer.py Tue Dec 01 21:33:27 2015 -0500 @@ -0,0 +1,220 @@ +#!/usr/bin/env python +from __future__ import division +import sys +import argparse +import getreads + +OPT_DEFAULTS = {'win_len':1, 'thres':1.0, 'filt_bases':'N'} +USAGE = "%(prog)s [options] [input_1.fq [input_2.fq output_1.fq output_2.fq]]" +DESCRIPTION = """Trim the 5' ends of reads by sequence content, e.g. by GC content or presence of +N's.""" + + +def main(argv): + + parser = argparse.ArgumentParser(description=DESCRIPTION, usage=USAGE) + parser.set_defaults(**OPT_DEFAULTS) + + parser.add_argument('infile1', metavar='reads_1.fq', nargs='?', type=argparse.FileType('r'), + default=sys.stdin, + help='Input reads (mate 1). Omit to read from stdin.') + parser.add_argument('infile2', metavar='reads_2.fq', nargs='?', type=argparse.FileType('r'), + help='Input reads (mate 2). If given, it will preserve pairs (if one read is filtered out ' + 'entirely, the other will also be lost).') + parser.add_argument('outfile1', metavar='reads.filt_1.fq', nargs='?', type=argparse.FileType('w'), + default=sys.stdout, + help='Output file for mate 1. WARNING: Will overwrite.') + parser.add_argument('outfile2', metavar='reads.filt_2.fq', nargs='?', type=argparse.FileType('w'), + help='Output file for mate 2. WARNING: Will overwrite.') + parser.add_argument('-f', '--format', dest='filetype', choices=('fasta', 'fastq'), + help='Input read format.') + parser.add_argument('-F', '--out-format', dest='out_filetype', choices=('fasta', 'fastq'), + help='Output read format. Default: whatever the input format is.') + parser.add_argument('-b', '--filt-bases', + help='The bases to filter on. Case-insensitive. Default: %(default)s.') + parser.add_argument('-t', '--thres', type=float, + help='The threshold. The read will be trimmed once the proportion of filter bases in the ' + 'window exceed this fraction (not a percentage). Default: %(default)s.') + parser.add_argument('-w', '--window', dest='win_len', type=int, + help='Window size for trimming. Default: %(default)s.') + parser.add_argument('-i', '--invert', action='store_true', + help='Invert the filter bases: filter on bases NOT present in the --filt-bases.') + parser.add_argument('-m', '--min-length', type=int, + help='Set a minimum read length. Reads which are trimmed below this length will be filtered ' + 'out (omitted entirely from the output). Read pairs will be preserved: both reads in a ' + 'pair must exceed this length to be kept. Set to 0 to only omit empty reads.') + parser.add_argument('--error', + help='Fail with this error message (useful for Galaxy tool).') + parser.add_argument('-A', '--acgt', action='store_true', + help='Filter on any non-ACGT base (shortcut for "--invert --filt-bases ACGT").') + parser.add_argument('-I', '--iupac', action='store_true', + help='Filter on any non-IUPAC base (shortcut for "--invert --filt-bases ACGTUWSMKRYBDHVN-").') + + args = parser.parse_args(argv[1:]) + + if args.error: + fail('Error: '+args.error) + + # Catch invalid argument combinations. + if args.infile1 and args.infile2 and not (args.outfile1 and args.outfile2): + fail('Error: If giving two input files (paired end), must specify both output files.') + # Determine filetypes, open input file parsers. + filetype1 = get_filetype(args.infile1, args.filetype) + file1_parser = iter(getreads.getparser(args.infile1, filetype=filetype1)) + if args.infile2: + paired = True + filetype2 = get_filetype(args.infile2, args.filetype) + file2_parser = iter(getreads.getparser(args.infile2, filetype=filetype2)) + else: + filetype2 = None + file2_parser = None + paired = False + # Override output filetypes if it was specified on the command line. + if args.out_filetype: + filetype1 = args.out_filetype + filetype2 = args.out_filetype + + # Determine the filter bases and whether to invert the selection. + filt_bases = args.filt_bases + invert = args.invert + if args.acgt: + filt_bases = 'ACGT' + invert = True + elif args.iupac: + filt_bases = 'ACGTUWSMKRYBDHVN-' + invert = True + + # Do the actual trimming. + try: + trim_reads(file1_parser, file2_parser, args.outfile1, args.outfile2, filetype1, filetype2, + paired, args.win_len, args.thres, filt_bases, invert, args.min_length) + finally: + for filehandle in (args.infile1, args.infile2, args.outfile1, args.outfile2): + if filehandle and filehandle is not sys.stdin and filehandle is not sys.stdout: + filehandle.close() + + +def trim_reads(file1_parser, file2_parser, outfile1, outfile2, filetype1, filetype2, paired, + win_len, thres, filt_bases, invert, min_length): + """Trim all the reads in the input file(s), writing to the output file(s).""" + read1 = None + read2 = None + while True: + # Read in the reads. + try: + read1 = next(file1_parser) + if paired: + read2 = next(file2_parser) + except StopIteration: + break + # Do trimming. + read1.seq = trim_read(read1.seq, win_len, thres, filt_bases, invert) + if filetype1 == 'fastq': + # If the output filetype is FASTQ, trim the quality scores too. + # If there are no input quality scores (i.e. the input is FASTA), use dummy scores instead. + # "z" is the highest alphanumeric score (PHRED 89), higher than any expected real score. + qual1 = read1.qual or 'z' * len(read1.seq) + read1.qual = qual1[:len(read1.seq)] + if paired: + read2.seq = trim_read(read2.seq, win_len, thres, filt_bases, invert) + if filetype2 == 'fastq': + qual2 = read2.qual or 'z' * len(read2.seq) + read2.qual = qual2[:len(read2.seq)] + # Output reads if they both pass the minimum length threshold (if any was given). + if min_length is None or (len(read1.seq) >= min_length and len(read2.seq) >= min_length): + write_read(outfile1, read1, filetype1) + write_read(outfile2, read2, filetype2) + else: + # Output read if it passes the minimum length threshold (if any was given). + if min_length is None or len(read1.seq) >= min_length: + write_read(outfile1, read1, filetype1) + + +def get_filetype(infile, filetype_arg): + if infile is sys.stdin: + if filetype_arg: + filetype = filetype_arg + else: + fail('Error: You must specify the --format if reading from stdin.') + elif infile: + if filetype_arg: + filetype = filetype_arg + else: + if infile.name.endswith('.fa') or infile.name.endswith('.fasta'): + filetype = 'fasta' + elif infile.name.endswith('.fq') or infile.name.endswith('.fastq'): + filetype = 'fastq' + else: + fail('Error: Unrecognized file ending on "{}". Please specify the --format.'.format(infile)) + else: + fail('Error: infile is {}'.format(infile)) + return filetype + + +def write_read(filehandle, read, filetype): + if filetype == 'fasta': + filehandle.write('>{name}\n{seq}\n'.format(**vars(read))) + elif filetype == 'fastq': + filehandle.write('@{name}\n{seq}\n+\n{qual}\n'.format(**vars(read))) + + +def trim_read(seq, win_len, thres, filt_bases, invert): + """Trim an individual read and return its trimmed sequence. + This will track the frequency of bad bases in a window of length win_len, and trim once the + frequency goes below thres. The trim point will be just before the first (leftmost) bad base in + the window (the first window with a frequency below thres). The "bad" bases are the ones in + filt_bases if invert is False, or any base NOT in filt_bases if invert is True.""" + # Algorithm: + # The window is a list which acts as a FIFO. As we scan from the left (3') end to the right (5') + # end, we append new bases to the right end of the window and pop them from the left end. + # Each base is only examined twice: when it enters the window and when it leaves it. + # We keep a running total of the number of bad bases in bad_bases_count, incrementing it when bad + # bases enter the window and decrementing it when they leave. + # We also track the location of bad bases in the window with bad_bases_coords so we can figure out + # where to cut if we have to trim. + max_bad_bases = win_len * thres + window = [] + bad_bases_count = 0 + bad_bases_coords = [] + for coord, base in enumerate(seq.upper()): + # Shift window, adjust bad_bases_count and bad_bases_coords list. + window.append(base) + # Is the new base we're adding to the window a bad base? + if invert: + bad_base = base not in filt_bases + else: + bad_base = base in filt_bases + # If so, increment the total and add its coordinate to the window. + if bad_base: + bad_bases_count += 1 + bad_bases_coords.append(coord) + if len(window) > win_len: + first_base = window.pop(0) + # Is the base we're removing (the first base in the window) a bad base? + if invert: + bad_base = first_base not in filt_bases + else: + bad_base = first_base in filt_bases + # If so, decrement the total and remove its coordinate from the window. + if bad_base: + bad_bases_count -= 1 + bad_bases_coords.pop(0) + # print bad_bases_coords + # Are we over the threshold? + if bad_bases_count > max_bad_bases: + break + # If we exceeded the threshold, trim the sequence at the first (leftmost) bad base in the window. + if bad_bases_count > max_bad_bases: + first_bad_base = bad_bases_coords[0] + return seq[0:first_bad_base] + else: + return seq + + +def fail(message): + sys.stderr.write(message+"\n") + sys.exit(1) + + +if __name__ == '__main__': + sys.exit(main(sys.argv))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/trimmer.xml Tue Dec 01 21:33:27 2015 -0500 @@ -0,0 +1,84 @@ +<tool id="sequence_content_trimmer" version="0.1" name="Sequence Content Trimmer"> + <description>trim reads based on certain bases</description> + <command interpreter="python"> + trimmer.py $input1 + #if $paired.is_paired: + $input2 $output1 $output2 + #if ('fasta' in $input1.extension and 'fastq' in $input2.extension) or ('fastq' in $input1.extension and 'fasta' in $input2.extension) + --error 'Both input files must be either fastq or fasta (no mixing the two).' + #end if + #end if + #if $input1.extension == 'fastq' or $input1.extension == 'fastqsanger' or $input1.extension == 'fastqillumina' or $input1.extension == 'fastqsolexa' + -f fastq + #elif $input1.extension == 'fasta' + -f fasta + #else + -f $input1.extension + #end if + -b $bases -t $thres -w $win_len $invert + #if $min_len.has_min_len: + -m $min_len.value + #end if + #if not $paired.is_paired: + > $output1 + #end if + </command> + <inputs> + <conditional name="paired"> + <param name="is_paired" type="select" label="Paired reads?"> + <option value="" selected="True">Unpaired</option> + <option value="true">Paired</option> + </param> + <when value="true"> + <param name="input1" type="data" format="fasta,fastq" label="Input reads (mate 1)"/> + <param name="input2" type="data" format="fasta,fastq" label="Input reads (mate 2)"/> + </when> + <when value=""> + <param name="input1" type="data" format="fasta,fastq" label="Input reads"/> + </when> + </conditional> + <param name="bases" type="text" value="N" label="Bases to filter on"/> + <param name="thres" type="float" value="0.5" min="0" max="1" label="Frequency threshold" help="Trim when the frequency of filter bases (or non-filter bases, if inverting) exceeds this value."/> + <param name="win_len" type="integer" value="10" min="1" label="Size of the window"/> + <param name="invert" type="boolean" truevalue="--invert" falsevalue="" checked="False" label="Invert filter bases" help="Trim when the frequency of bases NOT in the "filter bases" list exceeds the threshold."/> + <conditional name="min_len"> + <param name="has_min_len" type="boolean" truevalue="true" falsevalue="" checked="False" label="Set a minimum read length"/> + <when value="true"> + <param name="value" type="integer" value="10" min="0" label="Minimum read length" help="Reads trimmed to less than this length will be omitted from the output. Pairs will be preserved: both must exceed this threshold to be kept."/> + </when> + </conditional> + </inputs> + <outputs> + <data name="output1" format_source="input1"/> + <data name="output2" format_source="input2"> + <filter>paired['is_paired']</filter> + </data> + </outputs> + + <help> + +.. class:: infomark + +**What it does** + +This tool trims the 3' ends of reads based on the presence of the given bases. For instance, trim when N's are encountered or when the GC content exceeds a certain frequency. + + +.. class:: infomark + +**How it works** + +This will slide along the read with a window, and trim once the frequency of filter bases exceeds the frequency threshold (unless "Invert filter bases" is enabled, when it will trim once non-filter bases exceed the threshold). + +The trim point will be just before the first (leftmost) filter base in the final window (the one where the frequency exceeded the threshold). + + +.. class:: infomark + +**Input** + +The inputs can be in the following formats: fasta, fastq, fastqsanger, fastqillumina, and fastqsolexa. Both must be either a fasta or fastq type (no mixing fastq and fasta). + + </help> + +</tool>