Mercurial > repos > nick > sequence_content_trimmer
comparison getreads.py @ 1:464aee13e2df draft default tip
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
author | nick |
---|---|
date | Fri, 27 May 2022 23:29:45 +0000 |
parents | 7f170cb06e2e |
children |
comparison
equal
deleted
inserted
replaced
0:7f170cb06e2e | 1:464aee13e2df |
---|---|
1 #!/usr/bin/env python3 | |
2 import argparse | |
3 import logging | |
4 import os | |
5 import sys | |
6 import types | |
1 """A simple parser for FASTA, FASTQ, SAM, etc. Create generators that just return the read name and | 7 """A simple parser for FASTA, FASTQ, SAM, etc. Create generators that just return the read name and |
2 sequence. | 8 sequence. |
3 All format parsers follow this API: | 9 All format parsers follow this API: |
4 with open('sequence.fasta') as fasta: | 10 with open('sequence.fasta') as fasta: |
5 for read in getreads.getparser(fasta, filetype='fasta'): | 11 for read in getreads.getparser(fasta, filetype='fasta'): |
10 id: The first whitespace-delimited part of the name. | 16 id: The first whitespace-delimited part of the name. |
11 seq: The sequence. | 17 seq: The sequence. |
12 qual: The quality scores (unless the format is FASTA). | 18 qual: The quality scores (unless the format is FASTA). |
13 """ | 19 """ |
14 | 20 |
15 | 21 # Available formats. |
16 def getparser(filehandle, filetype='fasta'): | 22 FORMATS = ('fasta', 'fastq', 'sam', 'tsv', 'lines') |
23 | |
24 QUAL_OFFSETS = {'sanger':33, 'solexa':64} | |
25 | |
26 | |
27 def getparser(input, filetype, qual_format='sanger', name_col=1, seq_col=2, qual_col=3): | |
28 # Detect whether the input is an open file or a path. | |
29 # Return the appropriate reader. | |
17 if filetype == 'fasta': | 30 if filetype == 'fasta': |
18 return FastaReader(filehandle) | 31 return FastaReader(input) |
19 elif filetype == 'fastq': | 32 elif filetype == 'fastq': |
20 return FastqReader(filehandle) | 33 return FastqReader(input, qual_format=qual_format) |
21 elif filetype == 'sam': | 34 elif filetype == 'sam': |
22 return SamReader(filehandle) | 35 return SamReader(input, qual_format=qual_format) |
23 elif filetype == 'tsv': | 36 elif filetype == 'tsv': |
24 return TsvReader(filehandle) | 37 return TsvReader(input, qual_format=qual_format, |
38 name_col=name_col, seq_col=seq_col, qual_col=qual_col) | |
39 elif filetype == 'lines': | |
40 return LineReader(input) | |
25 else: | 41 else: |
26 raise ValueError('Illegal argument: filetype=\''+filetype+'\'') | 42 raise ValueError('Unrecognized format: {!r}'.format(filetype)) |
43 | |
44 | |
45 def detect_input_type(obj): | |
46 """Is this an open filehandle, or is it a file path (string)?""" | |
47 try: | |
48 os.path.isfile(obj) | |
49 return 'path' | |
50 except TypeError: | |
51 if isinstance(obj, types.GeneratorType): | |
52 return 'generator' | |
53 elif hasattr(obj, 'read') and hasattr(obj, 'close'): | |
54 return 'file' | |
55 else: | |
56 return None | |
27 | 57 |
28 | 58 |
29 class FormatError(Exception): | 59 class FormatError(Exception): |
30 def __init__(self, message=None): | 60 def __init__(self, message=None): |
31 if message: | 61 if message: |
32 Exception.__init__(self, message) | 62 Exception.__init__(self, message) |
33 | 63 |
34 | 64 |
35 class Read(object): | 65 class Read(object): |
36 def __init__(self, name='', seq='', id_='', qual=''): | 66 def __init__(self, name='', seq='', id_='', qual='', qual_format='sanger'): |
37 self.name = name | 67 self.name = name |
38 self.seq = seq | 68 self.seq = seq |
39 self.id = id_ | |
40 self.qual = qual | 69 self.qual = qual |
70 if id_ or not self.name: | |
71 self.id = id_ | |
72 elif self.name: | |
73 self.id = self.name.split()[0] | |
74 self.offset = QUAL_OFFSETS[qual_format] | |
75 @property | |
76 def scores(self): | |
77 if self.qual is None: | |
78 return None | |
79 scores = [] | |
80 for qual_char in self.qual: | |
81 scores.append(ord(qual_char) - self.offset) | |
82 return scores | |
83 def to_fasta(self): | |
84 return f'>{self.name}\n{self.seq}' | |
85 def to_fastq(self): | |
86 return f'@{self.name}\n{self.seq}\n+\n{self.qual}' | |
87 def __str__(self): | |
88 if self.qual: | |
89 return self.to_fastq() | |
90 else: | |
91 return self.to_fasta() | |
92 def __repr__(self): | |
93 kwarg_strs = [] | |
94 for kwarg in 'name', 'seq', 'id_', 'qual': | |
95 attr = kwarg.rstrip('_') | |
96 raw_value = getattr(self, attr) | |
97 if raw_value is not None and len(raw_value) >= 200: | |
98 value = raw_value[:199]+'…' | |
99 else: | |
100 value = raw_value | |
101 if value: | |
102 kwarg_strs.append(f'{kwarg}={value!r}') | |
103 return type(self).__name__+'('+', '.join(kwarg_strs)+')' | |
41 | 104 |
42 | 105 |
43 class Reader(object): | 106 class Reader(object): |
44 """Base class for all other parsers.""" | 107 """Base class for all other parsers.""" |
45 def __init__(self, filehandle): | 108 def __init__(self, input, **kwargs): |
46 self.filehandle = filehandle | 109 self.input = input |
110 self.input_type = detect_input_type(input) | |
111 if self.input_type not in ('path', 'file', 'generator'): | |
112 raise ValueError('Input object {!r} not a file, string, or generator.'.format(input)) | |
113 for key, value in kwargs.items(): | |
114 setattr(self, key, value) | |
47 def __iter__(self): | 115 def __iter__(self): |
48 return self.parser() | 116 return self.parser() |
117 def bases(self): | |
118 for read in self.parser(): | |
119 for base in read.seq: | |
120 yield base | |
121 def get_input_iterator(self): | |
122 if self.input_type == 'path': | |
123 return open(self.input) | |
124 else: | |
125 return self.input | |
126 | |
127 | |
128 class LineReader(Reader): | |
129 """A parser for the simplest format: Only the sequence, one line per read.""" | |
130 def parser(self): | |
131 input_iterator = self.get_input_iterator() | |
132 try: | |
133 for line in input_iterator: | |
134 read = Read(seq=line.rstrip('\r\n')) | |
135 yield read | |
136 finally: | |
137 if self.input_type == 'path': | |
138 input_iterator.close() | |
49 | 139 |
50 | 140 |
51 class TsvReader(Reader): | 141 class TsvReader(Reader): |
52 """A parser for a simple tab-delimited format. | 142 """A parser for a simple tab-delimited format. |
53 Column 1: name | 143 Column 1: name |
54 Column 2: sequence | 144 Column 2: sequence |
55 Column 3: quality scores (optional)""" | 145 Column 3: quality scores (optional)""" |
56 def parser(self): | 146 def parser(self): |
57 for line in self.filehandle: | 147 min_fields = max(self.name_col, self.seq_col) |
58 fields = line.rstrip('\r\n').split('\t') | 148 input_iterator = self.get_input_iterator() |
59 if len(fields) < 2: | 149 try: |
60 continue | 150 for line in input_iterator: |
61 read = Read() | 151 fields = line.rstrip('\r\n').split('\t') |
62 read.name = fields[0] | 152 if len(fields) < min_fields: |
63 if read.name: | 153 continue |
64 read.id = read.name.split()[0] | 154 read = Read(qual_format=self.qual_format) |
65 read.seq = fields[1] | 155 read.name = fields[self.name_col-1] |
66 if len(fields) >= 3: | 156 if read.name: |
67 read.qual = fields[2] | 157 read.id = read.name.split()[0] |
68 yield read | 158 read.seq = fields[self.seq_col-1] |
159 try: | |
160 read.qual = fields[self.qual_col-1] | |
161 except (TypeError, IndexError): | |
162 pass | |
163 yield read | |
164 finally: | |
165 if self.input_type == 'path': | |
166 input_iterator.close() | |
69 | 167 |
70 | 168 |
71 class SamReader(Reader): | 169 class SamReader(Reader): |
72 """A simple SAM parser. | 170 """A simple SAM parser. |
73 Assumptions: | 171 Assumptions: |
74 Lines starting with "@" with 3 fields are headers. All others are alignments. | 172 Lines starting with "@" with 3 fields are headers. All others are alignments. |
75 All alignment lines have 11 or more fields. Other lines will be skipped. | 173 All alignment lines have 11 or more fields. Other lines will be skipped. |
76 """ | 174 """ |
77 def parser(self): | 175 def parser(self): |
78 for line in self.filehandle: | 176 input_iterator = self.get_input_iterator() |
79 fields = line.split('\t') | 177 try: |
80 if len(fields) < 11: | 178 for line in input_iterator: |
81 continue | 179 fields = line.split('\t') |
82 # Skip headers. | 180 if len(fields) < 11: |
83 if fields[0].startswith('@') and len(fields[0]) == 3: | 181 continue |
84 continue | 182 # Skip headers. |
85 read = Read() | 183 if fields[0].startswith('@') and len(fields[0]) == 3: |
86 read.name = fields[0] | 184 continue |
87 if read.name: | 185 read = Read(qual_format=self.qual_format) |
88 read.id = read.name.split()[0] | 186 read.name = fields[0] |
89 read.seq = fields[9] | 187 if read.name: |
90 read.qual = fields[10].rstrip('\r\n') | 188 read.id = read.name.split()[0] |
91 yield read | 189 read.seq = fields[9] |
190 read.qual = fields[10].rstrip('\r\n') | |
191 yield read | |
192 finally: | |
193 if self.input_type == 'path': | |
194 input_iterator.close() | |
92 | 195 |
93 | 196 |
94 class FastaReader(Reader): | 197 class FastaReader(Reader): |
95 """A simple FASTA parser that reads one sequence at a time into memory.""" | 198 """A simple FASTA parser that reads one sequence at a time into memory.""" |
96 def parser(self): | 199 def parser(self): |
97 read = Read() | 200 input_iterator = self.get_input_iterator() |
98 while True: | 201 try: |
99 line_raw = self.filehandle.readline() | 202 read = None |
100 if not line_raw: | 203 while True: |
101 if read.seq: | 204 try: |
102 yield read | 205 line_raw = next(input_iterator) |
103 raise StopIteration | 206 except StopIteration: |
104 line = line_raw.strip() | 207 if read is not None: |
105 # Allow empty lines. | 208 yield read |
106 if not line: | 209 return |
107 continue | 210 line = line_raw.rstrip('\r\n') |
108 if line.startswith('>'): | 211 if line.startswith('>'): |
109 if read.seq: | 212 if read is not None: |
110 yield read | 213 yield read |
111 read = Read() | 214 read = Read() |
112 read.name = line[1:] # remove ">" | 215 read.name = line[1:] # remove ">" |
113 if read.name: | 216 if read.name: |
114 read.id = read.name.split()[0] | 217 read.id = read.name.split()[0] |
115 continue | 218 continue |
116 else: | 219 else: |
117 read.seq += line | 220 read.seq += line |
221 finally: | |
222 if self.input_type == 'path': | |
223 input_iterator.close() | |
118 | 224 |
119 | 225 |
120 class FastqReader(Reader): | 226 class FastqReader(Reader): |
121 """A simple FASTQ parser. Can handle multi-line sequences, though.""" | 227 """A simple FASTQ parser. Can handle multi-line sequences, though.""" |
122 def parser(self): | 228 def parser(self): |
123 read = Read() | 229 input_iterator = self.get_input_iterator() |
124 state = 'header' | 230 try: |
125 while True: | 231 read = None |
126 line_raw = self.filehandle.readline() | 232 line_num = 0 |
127 if not line_raw: | 233 state = 'header' |
128 if read.seq: | 234 while True: |
129 yield read | 235 try: |
130 raise StopIteration | 236 line_raw = next(input_iterator) |
131 line = line_raw.strip() | 237 except StopIteration: |
132 # Allow empty lines. | 238 if read is not None: |
133 if not line: | 239 yield read |
134 continue | 240 return |
135 if state == 'header': | 241 line_num += 1 |
136 if not line.startswith('@'): | 242 line = line_raw.rstrip('\r\n') |
137 raise FormatError('line state = "header" but line does not start with "@"') | 243 if state == 'header': |
138 if read.seq: | 244 if not line.startswith('@'): |
139 yield read | 245 if line: |
140 read = Read() | 246 raise FormatError('line state = "header" but line does not start with "@":\n'+line) |
141 read.name = line[1:] # remove '@' | 247 else: |
142 if read.name: | 248 # Allow empty lines. |
143 read.id = read.name.split()[0] | 249 continue |
144 state = 'sequence' | 250 if read is not None: |
145 elif state == 'sequence': | 251 yield read |
146 if line.startswith('+'): | 252 read = Read(qual_format=self.qual_format) |
147 state = 'plus' | 253 read.name = line[1:] # remove '@' |
148 else: | 254 if read.name: |
149 read.seq += line | 255 read.id = read.name.split()[0] |
150 elif state == 'plus' or state == 'quality': | 256 state = 'sequence' |
151 state = 'quality' | 257 elif state == 'sequence': |
152 togo = len(read.seq) - len(read.qual) | 258 if line.startswith('+'): |
153 read.qual += line[:togo] | 259 state = 'plus' |
154 # The end of the quality lines is when we have a quality string as long as the sequence. | 260 else: |
155 if len(read.qual) >= len(read.seq): | 261 read.seq += line |
156 state = 'header' | 262 elif state == 'plus' or state == 'quality': |
263 if line.startswith('@') and state == 'quality': | |
264 logging.warning('Looking for more quality scores but line starts with "@". This might ' | |
265 'be a header line and there were fewer quality scores than bases: {}' | |
266 .format(line[:69])) | |
267 state = 'quality' | |
268 togo = len(read.seq) - len(read.qual) | |
269 read.qual += line[:togo] | |
270 # The end of the quality lines is when we have a quality string as long as the sequence. | |
271 if len(read.qual) >= len(read.seq): | |
272 state = 'header' | |
273 finally: | |
274 if self.input_type == 'path': | |
275 input_iterator.close() | |
276 | |
277 | |
278 DESCRIPTION = 'Test parser by parsing an input file and printing its contents.' | |
279 | |
280 | |
281 def make_argparser(): | |
282 parser = argparse.ArgumentParser(description=DESCRIPTION) | |
283 parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin, | |
284 help='Input reads.') | |
285 parser.add_argument('-f', '--format', choices=('fasta', 'fastq', 'sam', 'tsv', 'lines'), | |
286 help='Input read format. Will be detected from the filename, if given.') | |
287 return parser | |
288 | |
289 | |
290 def main(argv): | |
291 parser = make_argparser() | |
292 args = parser.parse_args(argv[1:]) | |
293 if args.format: | |
294 format = args.format | |
295 elif args.infile is sys.stdin: | |
296 fail('Error: Must give a --format if reading from stdin.') | |
297 else: | |
298 ext = os.path.splitext(args.infile.name)[1] | |
299 if ext == '.fq': | |
300 format = 'fastq' | |
301 elif ext == '.fa': | |
302 format = 'fasta' | |
303 elif ext == '.txt': | |
304 format = 'lines' | |
305 else: | |
306 format = ext[1:] | |
307 print('Reading input as format {!r}.'.format(format)) | |
308 for i, read in enumerate(getparser(args.infile, filetype=format)): | |
309 print('Read {} id/name: {!r}/{!r}'.format(i+1, read.id, read.name)) | |
310 print('Read {} seq: {!r}'.format(i+1, read.seq)) | |
311 print('Read {} qual: {!r}'.format(i+1, read.qual)) | |
312 | |
313 | |
314 def fail(message): | |
315 sys.stderr.write(message+"\n") | |
316 sys.exit(1) | |
317 | |
318 | |
319 if __name__ == '__main__': | |
320 sys.exit(main(sys.argv)) |