Mercurial > repos > nick > sequence_content_trimmer
diff getreads.py @ 0:7f170cb06e2e draft
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
author | nick |
---|---|
date | Tue, 01 Dec 2015 21:33:27 -0500 |
parents | |
children | 464aee13e2df |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/getreads.py Tue Dec 01 21:33:27 2015 -0500 @@ -0,0 +1,156 @@ +"""A simple parser for FASTA, FASTQ, SAM, etc. Create generators that just return the read name and +sequence. +All format parsers follow this API: + with open('sequence.fasta') as fasta: + for read in getreads.getparser(fasta, filetype='fasta'): + print "There is a sequence with this FASTA identifier: "+read.id + print "Its sequence is "+read.seq +The properties of Read are: + name: The entire FASTA header line, SAM column 1, etc. + id: The first whitespace-delimited part of the name. + seq: The sequence. + qual: The quality scores (unless the format is FASTA). +""" + + +def getparser(filehandle, filetype='fasta'): + if filetype == 'fasta': + return FastaReader(filehandle) + elif filetype == 'fastq': + return FastqReader(filehandle) + elif filetype == 'sam': + return SamReader(filehandle) + elif filetype == 'tsv': + return TsvReader(filehandle) + else: + raise ValueError('Illegal argument: filetype=\''+filetype+'\'') + + +class FormatError(Exception): + def __init__(self, message=None): + if message: + Exception.__init__(self, message) + + +class Read(object): + def __init__(self, name='', seq='', id_='', qual=''): + self.name = name + self.seq = seq + self.id = id_ + self.qual = qual + + +class Reader(object): + """Base class for all other parsers.""" + def __init__(self, filehandle): + self.filehandle = filehandle + def __iter__(self): + return self.parser() + + +class TsvReader(Reader): + """A parser for a simple tab-delimited format. + Column 1: name + Column 2: sequence + Column 3: quality scores (optional)""" + def parser(self): + for line in self.filehandle: + fields = line.rstrip('\r\n').split('\t') + if len(fields) < 2: + continue + read = Read() + read.name = fields[0] + if read.name: + read.id = read.name.split()[0] + read.seq = fields[1] + if len(fields) >= 3: + read.qual = fields[2] + yield read + + +class SamReader(Reader): + """A simple SAM parser. + Assumptions: + Lines starting with "@" with 3 fields are headers. All others are alignments. + All alignment lines have 11 or more fields. Other lines will be skipped. + """ + def parser(self): + for line in self.filehandle: + fields = line.split('\t') + if len(fields) < 11: + continue + # Skip headers. + if fields[0].startswith('@') and len(fields[0]) == 3: + continue + read = Read() + read.name = fields[0] + if read.name: + read.id = read.name.split()[0] + read.seq = fields[9] + read.qual = fields[10].rstrip('\r\n') + yield read + + +class FastaReader(Reader): + """A simple FASTA parser that reads one sequence at a time into memory.""" + def parser(self): + read = Read() + while True: + line_raw = self.filehandle.readline() + if not line_raw: + if read.seq: + yield read + raise StopIteration + line = line_raw.strip() + # Allow empty lines. + if not line: + continue + if line.startswith('>'): + if read.seq: + yield read + read = Read() + read.name = line[1:] # remove ">" + if read.name: + read.id = read.name.split()[0] + continue + else: + read.seq += line + + +class FastqReader(Reader): + """A simple FASTQ parser. Can handle multi-line sequences, though.""" + def parser(self): + read = Read() + state = 'header' + while True: + line_raw = self.filehandle.readline() + if not line_raw: + if read.seq: + yield read + raise StopIteration + line = line_raw.strip() + # Allow empty lines. + if not line: + continue + if state == 'header': + if not line.startswith('@'): + raise FormatError('line state = "header" but line does not start with "@"') + if read.seq: + yield read + read = Read() + read.name = line[1:] # remove '@' + if read.name: + read.id = read.name.split()[0] + state = 'sequence' + elif state == 'sequence': + if line.startswith('+'): + state = 'plus' + else: + read.seq += line + elif state == 'plus' or state == 'quality': + state = 'quality' + togo = len(read.seq) - len(read.qual) + read.qual += line[:togo] + # The end of the quality lines is when we have a quality string as long as the sequence. + if len(read.qual) >= len(read.seq): + state = 'header'