comparison getreads.py @ 1:464aee13e2df draft default tip

"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
author nick
date Fri, 27 May 2022 23:29:45 +0000
parents 7f170cb06e2e
children
comparison
equal deleted inserted replaced
0:7f170cb06e2e 1:464aee13e2df
1 #!/usr/bin/env python3
2 import argparse
3 import logging
4 import os
5 import sys
6 import types
1 """A simple parser for FASTA, FASTQ, SAM, etc. Create generators that just return the read name and 7 """A simple parser for FASTA, FASTQ, SAM, etc. Create generators that just return the read name and
2 sequence. 8 sequence.
3 All format parsers follow this API: 9 All format parsers follow this API:
4 with open('sequence.fasta') as fasta: 10 with open('sequence.fasta') as fasta:
5 for read in getreads.getparser(fasta, filetype='fasta'): 11 for read in getreads.getparser(fasta, filetype='fasta'):
10 id: The first whitespace-delimited part of the name. 16 id: The first whitespace-delimited part of the name.
11 seq: The sequence. 17 seq: The sequence.
12 qual: The quality scores (unless the format is FASTA). 18 qual: The quality scores (unless the format is FASTA).
13 """ 19 """
14 20
15 21 # Available formats.
16 def getparser(filehandle, filetype='fasta'): 22 FORMATS = ('fasta', 'fastq', 'sam', 'tsv', 'lines')
23
24 QUAL_OFFSETS = {'sanger':33, 'solexa':64}
25
26
27 def getparser(input, filetype, qual_format='sanger', name_col=1, seq_col=2, qual_col=3):
28 # Detect whether the input is an open file or a path.
29 # Return the appropriate reader.
17 if filetype == 'fasta': 30 if filetype == 'fasta':
18 return FastaReader(filehandle) 31 return FastaReader(input)
19 elif filetype == 'fastq': 32 elif filetype == 'fastq':
20 return FastqReader(filehandle) 33 return FastqReader(input, qual_format=qual_format)
21 elif filetype == 'sam': 34 elif filetype == 'sam':
22 return SamReader(filehandle) 35 return SamReader(input, qual_format=qual_format)
23 elif filetype == 'tsv': 36 elif filetype == 'tsv':
24 return TsvReader(filehandle) 37 return TsvReader(input, qual_format=qual_format,
38 name_col=name_col, seq_col=seq_col, qual_col=qual_col)
39 elif filetype == 'lines':
40 return LineReader(input)
25 else: 41 else:
26 raise ValueError('Illegal argument: filetype=\''+filetype+'\'') 42 raise ValueError('Unrecognized format: {!r}'.format(filetype))
43
44
45 def detect_input_type(obj):
46 """Is this an open filehandle, or is it a file path (string)?"""
47 try:
48 os.path.isfile(obj)
49 return 'path'
50 except TypeError:
51 if isinstance(obj, types.GeneratorType):
52 return 'generator'
53 elif hasattr(obj, 'read') and hasattr(obj, 'close'):
54 return 'file'
55 else:
56 return None
27 57
28 58
29 class FormatError(Exception): 59 class FormatError(Exception):
30 def __init__(self, message=None): 60 def __init__(self, message=None):
31 if message: 61 if message:
32 Exception.__init__(self, message) 62 Exception.__init__(self, message)
33 63
34 64
35 class Read(object): 65 class Read(object):
36 def __init__(self, name='', seq='', id_='', qual=''): 66 def __init__(self, name='', seq='', id_='', qual='', qual_format='sanger'):
37 self.name = name 67 self.name = name
38 self.seq = seq 68 self.seq = seq
39 self.id = id_
40 self.qual = qual 69 self.qual = qual
70 if id_ or not self.name:
71 self.id = id_
72 elif self.name:
73 self.id = self.name.split()[0]
74 self.offset = QUAL_OFFSETS[qual_format]
75 @property
76 def scores(self):
77 if self.qual is None:
78 return None
79 scores = []
80 for qual_char in self.qual:
81 scores.append(ord(qual_char) - self.offset)
82 return scores
83 def to_fasta(self):
84 return f'>{self.name}\n{self.seq}'
85 def to_fastq(self):
86 return f'@{self.name}\n{self.seq}\n+\n{self.qual}'
87 def __str__(self):
88 if self.qual:
89 return self.to_fastq()
90 else:
91 return self.to_fasta()
92 def __repr__(self):
93 kwarg_strs = []
94 for kwarg in 'name', 'seq', 'id_', 'qual':
95 attr = kwarg.rstrip('_')
96 raw_value = getattr(self, attr)
97 if raw_value is not None and len(raw_value) >= 200:
98 value = raw_value[:199]+'…'
99 else:
100 value = raw_value
101 if value:
102 kwarg_strs.append(f'{kwarg}={value!r}')
103 return type(self).__name__+'('+', '.join(kwarg_strs)+')'
41 104
42 105
43 class Reader(object): 106 class Reader(object):
44 """Base class for all other parsers.""" 107 """Base class for all other parsers."""
45 def __init__(self, filehandle): 108 def __init__(self, input, **kwargs):
46 self.filehandle = filehandle 109 self.input = input
110 self.input_type = detect_input_type(input)
111 if self.input_type not in ('path', 'file', 'generator'):
112 raise ValueError('Input object {!r} not a file, string, or generator.'.format(input))
113 for key, value in kwargs.items():
114 setattr(self, key, value)
47 def __iter__(self): 115 def __iter__(self):
48 return self.parser() 116 return self.parser()
117 def bases(self):
118 for read in self.parser():
119 for base in read.seq:
120 yield base
121 def get_input_iterator(self):
122 if self.input_type == 'path':
123 return open(self.input)
124 else:
125 return self.input
126
127
128 class LineReader(Reader):
129 """A parser for the simplest format: Only the sequence, one line per read."""
130 def parser(self):
131 input_iterator = self.get_input_iterator()
132 try:
133 for line in input_iterator:
134 read = Read(seq=line.rstrip('\r\n'))
135 yield read
136 finally:
137 if self.input_type == 'path':
138 input_iterator.close()
49 139
50 140
51 class TsvReader(Reader): 141 class TsvReader(Reader):
52 """A parser for a simple tab-delimited format. 142 """A parser for a simple tab-delimited format.
53 Column 1: name 143 Column 1: name
54 Column 2: sequence 144 Column 2: sequence
55 Column 3: quality scores (optional)""" 145 Column 3: quality scores (optional)"""
56 def parser(self): 146 def parser(self):
57 for line in self.filehandle: 147 min_fields = max(self.name_col, self.seq_col)
58 fields = line.rstrip('\r\n').split('\t') 148 input_iterator = self.get_input_iterator()
59 if len(fields) < 2: 149 try:
60 continue 150 for line in input_iterator:
61 read = Read() 151 fields = line.rstrip('\r\n').split('\t')
62 read.name = fields[0] 152 if len(fields) < min_fields:
63 if read.name: 153 continue
64 read.id = read.name.split()[0] 154 read = Read(qual_format=self.qual_format)
65 read.seq = fields[1] 155 read.name = fields[self.name_col-1]
66 if len(fields) >= 3: 156 if read.name:
67 read.qual = fields[2] 157 read.id = read.name.split()[0]
68 yield read 158 read.seq = fields[self.seq_col-1]
159 try:
160 read.qual = fields[self.qual_col-1]
161 except (TypeError, IndexError):
162 pass
163 yield read
164 finally:
165 if self.input_type == 'path':
166 input_iterator.close()
69 167
70 168
71 class SamReader(Reader): 169 class SamReader(Reader):
72 """A simple SAM parser. 170 """A simple SAM parser.
73 Assumptions: 171 Assumptions:
74 Lines starting with "@" with 3 fields are headers. All others are alignments. 172 Lines starting with "@" with 3 fields are headers. All others are alignments.
75 All alignment lines have 11 or more fields. Other lines will be skipped. 173 All alignment lines have 11 or more fields. Other lines will be skipped.
76 """ 174 """
77 def parser(self): 175 def parser(self):
78 for line in self.filehandle: 176 input_iterator = self.get_input_iterator()
79 fields = line.split('\t') 177 try:
80 if len(fields) < 11: 178 for line in input_iterator:
81 continue 179 fields = line.split('\t')
82 # Skip headers. 180 if len(fields) < 11:
83 if fields[0].startswith('@') and len(fields[0]) == 3: 181 continue
84 continue 182 # Skip headers.
85 read = Read() 183 if fields[0].startswith('@') and len(fields[0]) == 3:
86 read.name = fields[0] 184 continue
87 if read.name: 185 read = Read(qual_format=self.qual_format)
88 read.id = read.name.split()[0] 186 read.name = fields[0]
89 read.seq = fields[9] 187 if read.name:
90 read.qual = fields[10].rstrip('\r\n') 188 read.id = read.name.split()[0]
91 yield read 189 read.seq = fields[9]
190 read.qual = fields[10].rstrip('\r\n')
191 yield read
192 finally:
193 if self.input_type == 'path':
194 input_iterator.close()
92 195
93 196
94 class FastaReader(Reader): 197 class FastaReader(Reader):
95 """A simple FASTA parser that reads one sequence at a time into memory.""" 198 """A simple FASTA parser that reads one sequence at a time into memory."""
96 def parser(self): 199 def parser(self):
97 read = Read() 200 input_iterator = self.get_input_iterator()
98 while True: 201 try:
99 line_raw = self.filehandle.readline() 202 read = None
100 if not line_raw: 203 while True:
101 if read.seq: 204 try:
102 yield read 205 line_raw = next(input_iterator)
103 raise StopIteration 206 except StopIteration:
104 line = line_raw.strip() 207 if read is not None:
105 # Allow empty lines. 208 yield read
106 if not line: 209 return
107 continue 210 line = line_raw.rstrip('\r\n')
108 if line.startswith('>'): 211 if line.startswith('>'):
109 if read.seq: 212 if read is not None:
110 yield read 213 yield read
111 read = Read() 214 read = Read()
112 read.name = line[1:] # remove ">" 215 read.name = line[1:] # remove ">"
113 if read.name: 216 if read.name:
114 read.id = read.name.split()[0] 217 read.id = read.name.split()[0]
115 continue 218 continue
116 else: 219 else:
117 read.seq += line 220 read.seq += line
221 finally:
222 if self.input_type == 'path':
223 input_iterator.close()
118 224
119 225
120 class FastqReader(Reader): 226 class FastqReader(Reader):
121 """A simple FASTQ parser. Can handle multi-line sequences, though.""" 227 """A simple FASTQ parser. Can handle multi-line sequences, though."""
122 def parser(self): 228 def parser(self):
123 read = Read() 229 input_iterator = self.get_input_iterator()
124 state = 'header' 230 try:
125 while True: 231 read = None
126 line_raw = self.filehandle.readline() 232 line_num = 0
127 if not line_raw: 233 state = 'header'
128 if read.seq: 234 while True:
129 yield read 235 try:
130 raise StopIteration 236 line_raw = next(input_iterator)
131 line = line_raw.strip() 237 except StopIteration:
132 # Allow empty lines. 238 if read is not None:
133 if not line: 239 yield read
134 continue 240 return
135 if state == 'header': 241 line_num += 1
136 if not line.startswith('@'): 242 line = line_raw.rstrip('\r\n')
137 raise FormatError('line state = "header" but line does not start with "@"') 243 if state == 'header':
138 if read.seq: 244 if not line.startswith('@'):
139 yield read 245 if line:
140 read = Read() 246 raise FormatError('line state = "header" but line does not start with "@":\n'+line)
141 read.name = line[1:] # remove '@' 247 else:
142 if read.name: 248 # Allow empty lines.
143 read.id = read.name.split()[0] 249 continue
144 state = 'sequence' 250 if read is not None:
145 elif state == 'sequence': 251 yield read
146 if line.startswith('+'): 252 read = Read(qual_format=self.qual_format)
147 state = 'plus' 253 read.name = line[1:] # remove '@'
148 else: 254 if read.name:
149 read.seq += line 255 read.id = read.name.split()[0]
150 elif state == 'plus' or state == 'quality': 256 state = 'sequence'
151 state = 'quality' 257 elif state == 'sequence':
152 togo = len(read.seq) - len(read.qual) 258 if line.startswith('+'):
153 read.qual += line[:togo] 259 state = 'plus'
154 # The end of the quality lines is when we have a quality string as long as the sequence. 260 else:
155 if len(read.qual) >= len(read.seq): 261 read.seq += line
156 state = 'header' 262 elif state == 'plus' or state == 'quality':
263 if line.startswith('@') and state == 'quality':
264 logging.warning('Looking for more quality scores but line starts with "@". This might '
265 'be a header line and there were fewer quality scores than bases: {}'
266 .format(line[:69]))
267 state = 'quality'
268 togo = len(read.seq) - len(read.qual)
269 read.qual += line[:togo]
270 # The end of the quality lines is when we have a quality string as long as the sequence.
271 if len(read.qual) >= len(read.seq):
272 state = 'header'
273 finally:
274 if self.input_type == 'path':
275 input_iterator.close()
276
277
278 DESCRIPTION = 'Test parser by parsing an input file and printing its contents.'
279
280
281 def make_argparser():
282 parser = argparse.ArgumentParser(description=DESCRIPTION)
283 parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin,
284 help='Input reads.')
285 parser.add_argument('-f', '--format', choices=('fasta', 'fastq', 'sam', 'tsv', 'lines'),
286 help='Input read format. Will be detected from the filename, if given.')
287 return parser
288
289
290 def main(argv):
291 parser = make_argparser()
292 args = parser.parse_args(argv[1:])
293 if args.format:
294 format = args.format
295 elif args.infile is sys.stdin:
296 fail('Error: Must give a --format if reading from stdin.')
297 else:
298 ext = os.path.splitext(args.infile.name)[1]
299 if ext == '.fq':
300 format = 'fastq'
301 elif ext == '.fa':
302 format = 'fasta'
303 elif ext == '.txt':
304 format = 'lines'
305 else:
306 format = ext[1:]
307 print('Reading input as format {!r}.'.format(format))
308 for i, read in enumerate(getparser(args.infile, filetype=format)):
309 print('Read {} id/name: {!r}/{!r}'.format(i+1, read.id, read.name))
310 print('Read {} seq: {!r}'.format(i+1, read.seq))
311 print('Read {} qual: {!r}'.format(i+1, read.qual))
312
313
314 def fail(message):
315 sys.stderr.write(message+"\n")
316 sys.exit(1)
317
318
319 if __name__ == '__main__':
320 sys.exit(main(sys.argv))