comparison getreads.py @ 0:7f170cb06e2e draft

planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
author nick
date Tue, 01 Dec 2015 21:33:27 -0500
parents
children 464aee13e2df
comparison
equal deleted inserted replaced
-1:000000000000 0:7f170cb06e2e
1 """A simple parser for FASTA, FASTQ, SAM, etc. Create generators that just return the read name and
2 sequence.
3 All format parsers follow this API:
4 with open('sequence.fasta') as fasta:
5 for read in getreads.getparser(fasta, filetype='fasta'):
6 print "There is a sequence with this FASTA identifier: "+read.id
7 print "Its sequence is "+read.seq
8 The properties of Read are:
9 name: The entire FASTA header line, SAM column 1, etc.
10 id: The first whitespace-delimited part of the name.
11 seq: The sequence.
12 qual: The quality scores (unless the format is FASTA).
13 """
14
15
16 def getparser(filehandle, filetype='fasta'):
17 if filetype == 'fasta':
18 return FastaReader(filehandle)
19 elif filetype == 'fastq':
20 return FastqReader(filehandle)
21 elif filetype == 'sam':
22 return SamReader(filehandle)
23 elif filetype == 'tsv':
24 return TsvReader(filehandle)
25 else:
26 raise ValueError('Illegal argument: filetype=\''+filetype+'\'')
27
28
29 class FormatError(Exception):
30 def __init__(self, message=None):
31 if message:
32 Exception.__init__(self, message)
33
34
35 class Read(object):
36 def __init__(self, name='', seq='', id_='', qual=''):
37 self.name = name
38 self.seq = seq
39 self.id = id_
40 self.qual = qual
41
42
43 class Reader(object):
44 """Base class for all other parsers."""
45 def __init__(self, filehandle):
46 self.filehandle = filehandle
47 def __iter__(self):
48 return self.parser()
49
50
51 class TsvReader(Reader):
52 """A parser for a simple tab-delimited format.
53 Column 1: name
54 Column 2: sequence
55 Column 3: quality scores (optional)"""
56 def parser(self):
57 for line in self.filehandle:
58 fields = line.rstrip('\r\n').split('\t')
59 if len(fields) < 2:
60 continue
61 read = Read()
62 read.name = fields[0]
63 if read.name:
64 read.id = read.name.split()[0]
65 read.seq = fields[1]
66 if len(fields) >= 3:
67 read.qual = fields[2]
68 yield read
69
70
71 class SamReader(Reader):
72 """A simple SAM parser.
73 Assumptions:
74 Lines starting with "@" with 3 fields are headers. All others are alignments.
75 All alignment lines have 11 or more fields. Other lines will be skipped.
76 """
77 def parser(self):
78 for line in self.filehandle:
79 fields = line.split('\t')
80 if len(fields) < 11:
81 continue
82 # Skip headers.
83 if fields[0].startswith('@') and len(fields[0]) == 3:
84 continue
85 read = Read()
86 read.name = fields[0]
87 if read.name:
88 read.id = read.name.split()[0]
89 read.seq = fields[9]
90 read.qual = fields[10].rstrip('\r\n')
91 yield read
92
93
94 class FastaReader(Reader):
95 """A simple FASTA parser that reads one sequence at a time into memory."""
96 def parser(self):
97 read = Read()
98 while True:
99 line_raw = self.filehandle.readline()
100 if not line_raw:
101 if read.seq:
102 yield read
103 raise StopIteration
104 line = line_raw.strip()
105 # Allow empty lines.
106 if not line:
107 continue
108 if line.startswith('>'):
109 if read.seq:
110 yield read
111 read = Read()
112 read.name = line[1:] # remove ">"
113 if read.name:
114 read.id = read.name.split()[0]
115 continue
116 else:
117 read.seq += line
118
119
120 class FastqReader(Reader):
121 """A simple FASTQ parser. Can handle multi-line sequences, though."""
122 def parser(self):
123 read = Read()
124 state = 'header'
125 while True:
126 line_raw = self.filehandle.readline()
127 if not line_raw:
128 if read.seq:
129 yield read
130 raise StopIteration
131 line = line_raw.strip()
132 # Allow empty lines.
133 if not line:
134 continue
135 if state == 'header':
136 if not line.startswith('@'):
137 raise FormatError('line state = "header" but line does not start with "@"')
138 if read.seq:
139 yield read
140 read = Read()
141 read.name = line[1:] # remove '@'
142 if read.name:
143 read.id = read.name.split()[0]
144 state = 'sequence'
145 elif state == 'sequence':
146 if line.startswith('+'):
147 state = 'plus'
148 else:
149 read.seq += line
150 elif state == 'plus' or state == 'quality':
151 state = 'quality'
152 togo = len(read.seq) - len(read.qual)
153 read.qual += line[:togo]
154 # The end of the quality lines is when we have a quality string as long as the sequence.
155 if len(read.qual) >= len(read.seq):
156 state = 'header'