Mercurial > repos > nick > sequence_content_trimmer
annotate getreads.py @ 1:464aee13e2df draft default tip
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
author | nick |
---|---|
date | Fri, 27 May 2022 23:29:45 +0000 |
parents | 7f170cb06e2e |
children |
rev | line source |
---|---|
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
1 #!/usr/bin/env python3 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
2 import argparse |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
3 import logging |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
4 import os |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
5 import sys |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
6 import types |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
7 """A simple parser for FASTA, FASTQ, SAM, etc. Create generators that just return the read name and |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
8 sequence. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
9 All format parsers follow this API: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
10 with open('sequence.fasta') as fasta: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
11 for read in getreads.getparser(fasta, filetype='fasta'): |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
12 print("There is a sequence with this FASTA identifier: "+read.id) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
13 print("Its sequence is "+read.seq) |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
14 The properties of Read are: |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
15 name: The entire FASTA header line, SAM column 1, etc. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
16 id: The first whitespace-delimited part of the name. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
17 seq: The sequence. |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
18 qual: The quality scores (unless the format is FASTA). |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
19 """ |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
20 |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
21 # Available formats. |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
22 FORMATS = ('fasta', 'fastq', 'sam', 'tsv', 'lines') |
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
23 |
1
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
24 QUAL_OFFSETS = {'sanger':33, 'solexa':64} |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
25 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
26 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
def getparser(input, filetype, qual_format='sanger', name_col=1, seq_col=2, qual_col=3):
    """Return the reader object that matches `filetype`.

    `input` is handed to the reader unchanged. `qual_format` is forwarded to the
    FASTQ, SAM, and TSV readers; `name_col`, `seq_col`, and `qual_col` are
    forwarded to the TSV reader only.

    Raises ValueError if `filetype` is not one of the recognized format names.
    """
    # Ordered (format name, factory) pairs. The factories are closures so that
    # only the reader class that is actually selected gets looked up and called.
    builders = (
        ('fasta', lambda: FastaReader(input)),
        ('fastq', lambda: FastqReader(input, qual_format=qual_format)),
        ('sam', lambda: SamReader(input, qual_format=qual_format)),
        ('tsv', lambda: TsvReader(input, qual_format=qual_format,
                                  name_col=name_col, seq_col=seq_col, qual_col=qual_col)),
        ('lines', lambda: LineReader(input)),
    )
    for known_type, build in builders:
        if filetype == known_type:
            return build()
    raise ValueError('Unrecognized format: {!r}'.format(filetype))
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
43 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
44 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
def detect_input_type(obj):
    """Classify `obj` as 'path', 'generator', or 'file'; return None otherwise.

    Anything `os.path.isfile()` accepts without raising TypeError counts as a
    path. The result of that call is ignored, so the file does not have to exist.
    """
    try:
        # Used purely as a type probe: only the TypeError matters, not the answer.
        os.path.isfile(obj)
    except TypeError:
        pass
    else:
        return 'path'
    if isinstance(obj, types.GeneratorType):
        return 'generator'
    if hasattr(obj, 'read') and hasattr(obj, 'close'):
        return 'file'
    return None
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
57 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
58 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
class FormatError(Exception):
    """Exception type this module provides for malformed input."""

    def __init__(self, message=None):
        # With no (or an empty) message, skip the base initializer entirely.
        if not message:
            return
        super().__init__(message)
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
63 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
64 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
class Read(object):
    """A single read: its name, id, sequence, and (optionally) quality string.

    Attributes:
      name: The full read name.
      id: `id_` if one was given; otherwise the first whitespace-delimited
        token of `name`.
      seq: The sequence.
      qual: The quality string ('' unless provided).
      offset: QUAL_OFFSETS[qual_format]; subtracted from each quality
        character's code point by `scores`.
    """

    def __init__(self, name='', seq='', id_='', qual='', qual_format='sanger'):
        """Store the fields and derive `id` from `name` when `id_` is not given.

        Raises KeyError if `qual_format` is not a key of QUAL_OFFSETS.
        """
        self.name = name
        self.seq = seq
        self.qual = qual
        if id_ or not self.name:
            self.id = id_
        else:
            # A whitespace-only name splits into an empty list, so indexing [0]
            # directly would raise IndexError. Fall back to the (falsy) id_.
            name_parts = self.name.split()
            self.id = name_parts[0] if name_parts else id_
        self.offset = QUAL_OFFSETS[qual_format]

    @property
    def scores(self):
        """The quality string as a list of ints, or None if `qual` is None."""
        if self.qual is None:
            return None
        return [ord(qual_char) - self.offset for qual_char in self.qual]

    def to_fasta(self):
        """Format the read as a two-line FASTA record (no trailing newline)."""
        return f'>{self.name}\n{self.seq}'

    def to_fastq(self):
        """Format the read as a four-line FASTQ record (no trailing newline)."""
        return f'@{self.name}\n{self.seq}\n+\n{self.qual}'

    def __str__(self):
        # FASTQ when there is a non-empty quality string, FASTA otherwise.
        return self.to_fastq() if self.qual else self.to_fasta()

    def __repr__(self):
        kwarg_strs = []
        for kwarg in 'name', 'seq', 'id_', 'qual':
            # The constructor argument is `id_` but the attribute is `id`.
            attr = kwarg.rstrip('_')
            raw_value = getattr(self, attr)
            if raw_value is not None and len(raw_value) >= 200:
                # Keep the repr readable: truncate long values and mark the cut.
                value = raw_value[:199]+'…'
            else:
                value = raw_value
            # Empty/None fields are left out of the repr.
            if value:
                kwarg_strs.append(f'{kwarg}={value!r}')
        return type(self).__name__+'('+', '.join(kwarg_strs)+')'
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
104 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
105 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
class Reader(object):
    """Base class for all other parsers.

    Subclasses supply a `parser()` generator method; iterating over a Reader
    iterates over whatever `parser()` yields.
    """

    def __init__(self, input, **kwargs):
        """Remember the input, classify it, and store every extra keyword
        argument as an attribute of the same name.

        Raises ValueError if `input` is not classified as a path, file, or
        generator.
        """
        self.input = input
        self.input_type = detect_input_type(input)
        recognized_types = ('path', 'file', 'generator')
        if not (self.input_type in recognized_types):
            raise ValueError('Input object {!r} not a file, string, or generator.'.format(input))
        for option in kwargs:
            setattr(self, option, kwargs[option])

    def __iter__(self):
        return self.parser()

    def bases(self):
        """Yield the elements of every read's `seq`, one at a time, in order."""
        for read in self.parser():
            yield from read.seq

    def get_input_iterator(self):
        """Return the object to read lines from.

        A path is opened here (the caller is responsible for closing it);
        anything else is returned as-is.
        """
        return open(self.input) if self.input_type == 'path' else self.input
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
126 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
127 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
class LineReader(Reader):
    """A parser for the simplest format: Only the sequence, one line per read."""

    def parser(self):
        """Yield one Read per input line, with the line (minus its line ending)
        as the sequence."""
        lines = self.get_input_iterator()
        try:
            for raw_line in lines:
                yield Read(seq=raw_line.rstrip('\r\n'))
        finally:
            # Only close what this reader opened itself.
            if self.input_type == 'path':
                lines.close()
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
139 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
140 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
class TsvReader(Reader):
    """A parser for a simple tab-delimited format.

    The columns are chosen by the 1-based attributes `name_col`, `seq_col`, and
    `qual_col`. With getparser()'s defaults these are:
    Column 1: name
    Column 2: sequence
    Column 3: quality scores (optional)

    `name_col`, `seq_col`, `qual_col`, and `qual_format` must be set on the
    instance (Reader.__init__ stores keyword arguments as attributes).
    """

    def parser(self):
        """Yield a Read for every line with enough columns for name and sequence.

        Lines with fewer columns than that are skipped silently.
        """
        min_fields = max(self.name_col, self.seq_col)
        input_iterator = self.get_input_iterator()
        try:
            for line in input_iterator:
                fields = line.rstrip('\r\n').split('\t')
                if len(fields) < min_fields:
                    continue
                read = Read(qual_format=self.qual_format)
                read.name = fields[self.name_col-1]
                # A whitespace-only name splits into an empty list; indexing it
                # would raise IndexError, so keep the default id in that case.
                name_parts = read.name.split()
                if name_parts:
                    read.id = name_parts[0]
                read.seq = fields[self.seq_col-1]
                try:
                    read.qual = fields[self.qual_col-1]
                except (TypeError, IndexError):
                    # Quality is optional: qual_col may not be a number (e.g.
                    # None) or the line may not have that column. Keep the
                    # Read's default quality string.
                    pass
                yield read
        finally:
            # Only close what this reader opened itself.
            if self.input_type == 'path':
                input_iterator.close()
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
167 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
168 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
class SamReader(Reader):
    """A simple SAM parser.

    Assumptions:
    All alignment lines have 11 or more tab-delimited fields. Other lines will
    be skipped.
    Lines with 11 or more fields whose first field starts with "@" and is
    exactly 3 characters long are treated as headers and skipped.
    """

    def parser(self):
        """Yield a Read (name, id, seq, qual) for every alignment line."""
        input_iterator = self.get_input_iterator()
        try:
            for line in input_iterator:
                fields = line.split('\t')
                if len(fields) < 11:
                    continue
                # Skip headers.
                if fields[0].startswith('@') and len(fields[0]) == 3:
                    continue
                read = Read(qual_format=self.qual_format)
                read.name = fields[0]
                # A whitespace-only name splits into an empty list; indexing it
                # would raise IndexError, so keep the default id in that case.
                name_parts = read.name.split()
                if name_parts:
                    read.id = name_parts[0]
                read.seq = fields[9]
                # The line was not stripped, so when the quality column is the
                # last field it still carries the line ending.
                read.qual = fields[10].rstrip('\r\n')
                yield read
        finally:
            # Only close what this reader opened itself.
            if self.input_type == 'path':
                input_iterator.close()
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
195 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
196 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
class FastaReader(Reader):
    """A simple FASTA parser that reads one sequence at a time into memory."""

    def parser(self):
        """Yield one Read per ">" record, joining multi-line sequences.

        Blank lines before the first header are ignored.

        Raises FormatError if non-blank data appears before the first ">"
        header line.
        """
        input_iterator = self.get_input_iterator()
        try:
            read = None
            for line_raw in input_iterator:
                line = line_raw.rstrip('\r\n')
                if line.startswith('>'):
                    # A new header ends the previous record, if there was one.
                    if read is not None:
                        yield read
                    read = Read()
                    read.name = line[1:]  # remove ">"
                    # A whitespace-only name splits into an empty list; indexing
                    # it would raise IndexError, so keep the default id then.
                    name_parts = read.name.split()
                    if name_parts:
                        read.id = name_parts[0]
                elif read is not None:
                    read.seq += line
                elif line:
                    # Sequence data with no record to attach it to. Previously
                    # this surfaced as an AttributeError on None.
                    raise FormatError(
                        'Data found before the first ">" header line: {!r}'.format(line)
                    )
            # End of input: emit the final record, if any.
            if read is not None:
                yield read
        finally:
            # Only close what this reader opened itself.
            if self.input_type == 'path':
                input_iterator.close()
0
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
224 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
225 |
7f170cb06e2e
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff
changeset
|
class FastqReader(Reader):
    """A simple FASTQ parser. Can handle multi-line sequences, though."""

    def parser(self):
        """Yield one Read for each FASTQ record in the input.

        Lines are consumed through a four-state machine:
        'header' -> 'sequence' -> 'plus' -> 'quality' -> 'header'.
        Sequence lines are accumulated until a line starting with "+" is seen. Quality lines are
        then accumulated until the quality string is as long as the sequence, which is what lets
        records span multiple lines. Quality characters beyond the sequence length are dropped.

        A read is yielded when the next header is reached or when the input ends, so a record that
        is truncated by the end of the input is still yielded with whatever was collected.

        Raises:
            FormatError: if a non-empty line is found where a header is expected and it does not
                start with "@".
        """
        input_iterator = self.get_input_iterator()
        try:
            read = None
            state = 'header'
            for line_raw in input_iterator:
                line = line_raw.rstrip('\r\n')
                if state == 'header':
                    if not line.startswith('@'):
                        if line:
                            raise FormatError('line state = "header" but line does not start with "@":\n'+line)
                        # Allow empty lines.
                        continue
                    if read is not None:
                        yield read
                    read = Read(qual_format=self.qual_format)
                    read.name = line[1:]  # remove '@'
                    # A name made only of whitespace is truthy but splits into an empty list, so
                    # check the split result rather than the raw name before indexing.
                    name_fields = read.name.split()
                    if name_fields:
                        read.id = name_fields[0]
                    state = 'sequence'
                elif state == 'sequence':
                    if line.startswith('+'):
                        state = 'plus'
                    else:
                        read.seq += line
                elif state in ('plus', 'quality'):
                    # Only warn once at least one quality line has been read: the first quality
                    # line may legitimately start with "@", since it is a valid quality character.
                    if line.startswith('@') and state == 'quality':
                        logging.warning('Looking for more quality scores but line starts with "@". This might '
                                        'be a header line and there were fewer quality scores than bases: {}'
                                        .format(line[:69]))
                    state = 'quality'
                    togo = len(read.seq) - len(read.qual)
                    read.qual += line[:togo]
                    # The end of the quality lines is when we have a quality string as long as the sequence.
                    if len(read.qual) >= len(read.seq):
                        state = 'header'
            # End of input: emit the record that was still being assembled, if any.
            if read is not None:
                yield read
        finally:
            # Only close the handle when this reader opened it from a path itself.
            if self.input_type == 'path':
                input_iterator.close()
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
276 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
277 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
# Shown as the description in the command-line help.
DESCRIPTION = 'Test parser by parsing an input file and printing its contents.'


def make_argparser():
    """Build the argument parser used when this module is run as a script.

    The parser accepts an optional positional input file (defaulting to stdin) and an optional
    -f/--format flag restricted to the supported format names.
    """
    supported_formats = ('fasta', 'fastq', 'sam', 'tsv', 'lines')
    argparser = argparse.ArgumentParser(description=DESCRIPTION)
    argparser.add_argument(
        'infile',
        nargs='?',
        type=argparse.FileType('r'),
        default=sys.stdin,
        help='Input reads.',
    )
    argparser.add_argument(
        '-f', '--format',
        choices=supported_formats,
        help='Input read format. Will be detected from the filename, if given.',
    )
    return argparser
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
288 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
289 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
def main(argv):
    """Parse the input named on the command line and print every read found in it.

    The format comes from --format when given. Otherwise it is derived from the input file's
    extension: ".fq", ".fa" and ".txt" map to 'fastq', 'fasta' and 'lines', and any other
    extension is used as the format name with its leading dot removed. Reading from stdin
    without --format is reported through fail().

    argv is the full argument vector, including the program name at index 0.
    """
    args = make_argparser().parse_args(argv[1:])
    if args.format:
        filetype = args.format
    elif args.infile is sys.stdin:
        fail('Error: Must give a --format if reading from stdin.')
    else:
        extension = os.path.splitext(args.infile.name)[1]
        # Short extensions that do not already spell out the format name.
        format_by_extension = {'.fq': 'fastq', '.fa': 'fasta', '.txt': 'lines'}
        filetype = format_by_extension.get(extension, extension[1:])
    print('Reading input as format {!r}.'.format(filetype))
    for number, read in enumerate(getparser(args.infile, filetype=filetype), start=1):
        print('Read {} id/name: {!r}/{!r}'.format(number, read.id, read.name))
        print('Read {} seq: {!r}'.format(number, read.seq))
        print('Read {} qual: {!r}'.format(number, read.qual))
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
312 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
313 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
def fail(message):
    """Write message and a newline to stderr, then exit the process with status 1."""
    error_line = message+"\n"
    sys.stderr.write(error_line)
    raise SystemExit(1)
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
317 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
318 |
464aee13e2df
"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents:
0
diff
changeset
|
if __name__ == '__main__':
    # Script entry point: whatever main() returns is handed to sys.exit() as the exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)