annotate getreads.py @ 1:464aee13e2df draft default tip

"planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
author nick
date Fri, 27 May 2022 23:29:45 +0000
parents 7f170cb06e2e
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
1 #!/usr/bin/env python3
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
2 import argparse
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
3 import logging
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
4 import os
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
5 import sys
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
6 import types
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
7 """A simple parser for FASTA, FASTQ, SAM, etc. Create generators that just return the read name and
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
8 sequence.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
9 All format parsers follow this API:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
10 with open('sequence.fasta') as fasta:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
11 for read in getreads.getparser(fasta, filetype='fasta'):
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
12 print("There is a sequence with this FASTA identifier: "+read.id)
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
13 print("Its sequence is "+read.seq)
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
14 The properties of Read are:
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
15 name: The entire FASTA header line, SAM column 1, etc.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
16 id: The first whitespace-delimited part of the name.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
17 seq: The sequence.
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
18 qual: The quality scores (unless the format is FASTA).
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
19 """
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
20
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
21 # Available formats.
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
22 FORMATS = ('fasta', 'fastq', 'sam', 'tsv', 'lines')
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
23
1
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
24 QUAL_OFFSETS = {'sanger':33, 'solexa':64}
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
25
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
26
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
def getparser(input, filetype, qual_format='sanger', name_col=1, seq_col=2, qual_col=3):
    """Construct the reader object that parses `input` as `filetype`.

    input: passed unchanged to the selected reader class.
    filetype: one of 'fasta', 'fastq', 'sam', 'tsv', or 'lines'.
    qual_format: forwarded to the fastq, sam, and tsv readers.
    name_col, seq_col, qual_col: forwarded to the tsv reader only.

    Raises ValueError when `filetype` is none of the names above.
    """
    # Formats that carry no quality information take only the input.
    if filetype == 'fasta':
        return FastaReader(input)
    if filetype == 'lines':
        return LineReader(input)
    # Formats with quality strings also need to know how they are encoded.
    if filetype == 'fastq':
        return FastqReader(input, qual_format=qual_format)
    if filetype == 'sam':
        return SamReader(input, qual_format=qual_format)
    # The tab-delimited reader additionally needs the column layout.
    if filetype == 'tsv':
        column_options = dict(name_col=name_col, seq_col=seq_col, qual_col=qual_col)
        return TsvReader(input, qual_format=qual_format, **column_options)
    raise ValueError(f'Unrecognized format: {filetype!r}')
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
43
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
44
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
def detect_input_type(obj):
    """Classify `obj` as a file path, a generator, or an open file-like object.

    Returns:
      'path'      if `obj` is a str, bytes, os.PathLike, or int (a file
                  descriptor) -- the types open() accepts as its first argument.
      'generator' if `obj` is a generator object.
      'file'      if `obj` has both `read` and `close` attributes.
      None        for anything else.

    The check is made on the type alone: the path is never looked up on disk,
    so a path to a file that does not exist is still reported as 'path'.
    """
    # An explicit type test instead of probing with os.path.isfile() and
    # waiting for a TypeError: no stat() call, and no unrelated exception
    # from the filesystem layer can leak out of a pure type check.
    if isinstance(obj, (str, bytes, int, os.PathLike)):
        return 'path'
    if isinstance(obj, types.GeneratorType):
        return 'generator'
    if hasattr(obj, 'read') and hasattr(obj, 'close'):
        return 'file'
    return None
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
57
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
58
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
class FormatError(Exception):
    """Module-specific exception type.

    NOTE(review): presumably raised by the parsers on malformed input -- no
    raise site is visible in this part of the file; confirm against callers.
    """
    def __init__(self, message=None):
        # An absent or empty message is not forwarded to the base class.
        if not message:
            return
        super().__init__(message)
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
63
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
64
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
class Read(object):
    """One named sequence, optionally with quality scores.

    Attributes:
      name:   The full name of the read.
      id:     The first whitespace-delimited word of `name`, unless an explicit
              `id_` was given to the constructor.
      seq:    The sequence.
      qual:   The quality string ('' when there is none).
      offset: The number subtracted from each quality character's code point
              to obtain its score; looked up from QUAL_OFFSETS by `qual_format`.
    """

    def __init__(self, name='', seq='', id_='', qual='', qual_format='sanger'):
        """Create a read.

        If `id_` is empty and `name` is not, the id is derived from `name`.
        Raises KeyError if `qual_format` is not a key of QUAL_OFFSETS.
        """
        self.name = name
        self.seq = seq
        self.qual = qual
        if id_ or not self.name:
            self.id = id_
        else:
            self.id = self._first_word(self.name)
        self.offset = QUAL_OFFSETS[qual_format]

    @staticmethod
    def _first_word(name):
        """Return the first whitespace-delimited word of `name`, or '' if none."""
        # A name made only of whitespace splits to an empty list; indexing [0]
        # directly would raise IndexError.
        words = name.split()
        return words[0] if words else ''

    @property
    def scores(self):
        """The quality scores as a list of ints, or None if `qual` is None."""
        if self.qual is None:
            return None
        return [ord(qual_char) - self.offset for qual_char in self.qual]

    def to_fasta(self):
        """Return the read as a FASTA record (no trailing newline)."""
        return f'>{self.name}\n{self.seq}'

    def to_fastq(self):
        """Return the read as a 4-line FASTQ record (no trailing newline)."""
        return f'@{self.name}\n{self.seq}\n+\n{self.qual}'

    def __str__(self):
        # FASTQ when there are quality scores to show, FASTA otherwise.
        if self.qual:
            return self.to_fastq()
        else:
            return self.to_fasta()

    def __repr__(self):
        kwarg_strs = []
        for kwarg in 'name', 'seq', 'id_', 'qual':
            # The constructor argument is `id_` but the attribute is `id`.
            attr = kwarg.rstrip('_')
            raw_value = getattr(self, attr)
            # Keep the repr readable: long values are cut to 199 characters
            # plus an ellipsis.
            if raw_value is not None and len(raw_value) >= 200:
                value = raw_value[:199]+'…'
            else:
                value = raw_value
            # Empty and None values are left out entirely.
            if value:
                kwarg_strs.append(f'{kwarg}={value!r}')
        return type(self).__name__+'('+', '.join(kwarg_strs)+')'
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
104
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
105
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
class Reader(object):
    """Base class for all other parsers.

    Subclasses implement `parser()` as a generator method yielding the reads
    found in the input. Iterating over a reader iterates over `parser()`.
    """

    def __init__(self, input, **kwargs):
        """Store the input and any parser options.

        input:  A file path, an open file-like object, or a generator, as
                classified by detect_input_type().
        kwargs: Every keyword argument is stored as an attribute of the same
                name on the instance, for the subclass's parser to read.

        Raises ValueError if `input` is none of the supported kinds.
        """
        self.input = input
        self.input_type = detect_input_type(input)
        if self.input_type not in ('path', 'file', 'generator'):
            raise ValueError('Input object {!r} not a file, string, or generator.'.format(input))
        for key, value in kwargs.items():
            setattr(self, key, value)

    def __iter__(self):
        return self.parser()

    def parser(self):
        """Yield the reads in the input. Must be overridden by subclasses."""
        # Without this, using the base class directly fails with an opaque
        # AttributeError about a missing `parser` attribute.
        raise NotImplementedError(
            '{} does not implement parser().'.format(type(self).__name__)
        )

    def bases(self):
        """Yield every base of every read, one character at a time."""
        for read in self.parser():
            yield from read.seq

    def get_input_iterator(self):
        """Return an iterator over the input's lines.

        A path is opened here (the caller is responsible for closing it);
        any other input is returned as-is.
        """
        if self.input_type == 'path':
            return open(self.input)
        return self.input
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
126
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
127
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
class LineReader(Reader):
    """Reader for the most minimal format: each input line is one read's sequence."""
    def parser(self):
        """Yield one Read per input line, with only its `seq` filled in."""
        source = self.get_input_iterator()
        try:
            for raw_line in source:
                # Drop the line ending but keep any other whitespace.
                sequence = raw_line.rstrip('\r\n')
                yield Read(seq=sequence)
        finally:
            # Only a file this reader opened itself (from a path) is closed
            # here; handles and generators passed in are left alone.
            if self.input_type == 'path':
                source.close()
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
139
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
140
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
class TsvReader(Reader):
    """A parser for a simple tab-delimited format.

    The 1-based column numbers are read from the `name_col`, `seq_col`, and
    `qual_col` attributes (normally set through keyword arguments to the
    constructor). By default:
      Column 1: name
      Column 2: sequence
      Column 3: quality scores (optional)
    Lines with too few fields to hold both the name and sequence columns are
    skipped.
    """
    # Class-level defaults, mirroring getparser()'s, so a TsvReader constructed
    # directly without keyword arguments does not fail with AttributeError.
    # Keyword arguments to the constructor override these per instance.
    qual_format = 'sanger'
    name_col = 1
    seq_col = 2
    qual_col = 3

    def parser(self):
        """Yield one Read per sufficiently long input line."""
        min_fields = max(self.name_col, self.seq_col)
        input_iterator = self.get_input_iterator()
        try:
            for line in input_iterator:
                fields = line.rstrip('\r\n').split('\t')
                if len(fields) < min_fields:
                    continue
                read = Read(qual_format=self.qual_format)
                read.name = fields[self.name_col-1]
                # A name that is empty or all whitespace has no first word;
                # leave the id at its default instead of raising IndexError.
                name_parts = read.name.split()
                if name_parts:
                    read.id = name_parts[0]
                read.seq = fields[self.seq_col-1]
                try:
                    read.qual = fields[self.qual_col-1]
                except (TypeError, IndexError):
                    # Quality is optional: `qual_col` may be None (TypeError)
                    # or the line may not have that column (IndexError).
                    pass
                yield read
        finally:
            # Only close the file if this reader opened it from a path.
            if self.input_type == 'path':
                input_iterator.close()
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
167
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
168
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
class SamReader(Reader):
    """A simple SAM parser.
    Assumptions:
    All alignment lines have 11 or more tab-delimited fields. Lines with fewer
    are skipped (this is what drops most header lines).
    A remaining line whose first field is "@" plus two characters (e.g. "@SQ")
    is treated as a header and skipped. All others are alignments.
    """
    # Default so a SamReader constructed directly, without the `qual_format`
    # keyword argument, does not fail with AttributeError. A keyword argument
    # to the constructor overrides this per instance.
    qual_format = 'sanger'

    def parser(self):
        """Yield one Read per alignment line (name, sequence, and quality)."""
        input_iterator = self.get_input_iterator()
        try:
            for line in input_iterator:
                fields = line.split('\t')
                if len(fields) < 11:
                    continue
                # Skip headers.
                if fields[0].startswith('@') and len(fields[0]) == 3:
                    continue
                read = Read(qual_format=self.qual_format)
                read.name = fields[0]
                # A name that is empty or all whitespace has no first word;
                # leave the id at its default instead of raising IndexError.
                name_parts = read.name.split()
                if name_parts:
                    read.id = name_parts[0]
                read.seq = fields[9]
                # The line was not stripped before splitting, so when the
                # quality is the last field it still carries the line ending.
                read.qual = fields[10].rstrip('\r\n')
                yield read
        finally:
            # Only close the file if this reader opened it from a path.
            if self.input_type == 'path':
                input_iterator.close()
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
195
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
196
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
class FastaReader(Reader):
    """A simple FASTA parser that reads one sequence at a time into memory.

    Multi-line sequences are joined into a single string. Blank lines before
    the first header are ignored; any other content before the first header
    raises FormatError.
    """
    def parser(self):
        """Yield one Read (name, id, and seq) per FASTA record."""
        input_iterator = self.get_input_iterator()
        try:
            read = None
            # Sequence lines of the current record; joined once per record
            # rather than growing a string line by line.
            seq_lines = []
            for line_raw in input_iterator:
                line = line_raw.rstrip('\r\n')
                if line.startswith('>'):
                    # A new header ends the previous record, if there was one.
                    if read is not None:
                        read.seq = ''.join(seq_lines)
                        yield read
                    read = Read()
                    seq_lines = []
                    read.name = line[1:]  # remove ">"
                    # A header that is empty or all whitespace has no first
                    # word; leave the id at its default instead of raising
                    # IndexError.
                    name_parts = read.name.split()
                    if name_parts:
                        read.id = name_parts[0]
                elif read is not None:
                    seq_lines.append(line)
                elif line.strip():
                    # Sequence data with no record to attach it to.
                    raise FormatError(
                        'FASTA data before the first ">" header line: {!r}'.format(line)
                    )
            # The input ended: emit the final record.
            if read is not None:
                read.seq = ''.join(seq_lines)
                yield read
        finally:
            # Only close the file if this reader opened it from a path.
            if self.input_type == 'path':
                input_iterator.close()
0
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
224
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
225
7f170cb06e2e planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
nick
parents:
diff changeset
class FastqReader(Reader):
    """A simple FASTQ parser that also handles multi-line sequences and qualities.

    Records are recognized with a small state machine (header -> sequence ->
    plus -> quality -> header) rather than by assuming four lines per record.
    """

    def parser(self):
        """Yield one Read per FASTQ record in the input.

        Each Read is created with this reader's ``qual_format`` and gets its
        ``name`` (the header line without the leading "@"), ``seq`` and
        ``qual`` filled in. ``id`` is set to the first whitespace-delimited
        token of the name, and is left untouched when the name has no such
        token (empty or whitespace-only header).

        A record that is still being built when the input ends is yielded
        as-is.

        Raises:
            FormatError: if a non-empty line appears where a header is
                expected but does not start with "@".
        """
        input_iterator = self.get_input_iterator()
        try:
            read = None
            state = 'header'
            for line_raw in input_iterator:
                line = line_raw.rstrip('\r\n')
                if state == 'header':
                    if not line.startswith('@'):
                        if line:
                            raise FormatError(
                                'line state = "header" but line does not start with "@":\n'+line
                            )
                        # Allow empty lines.
                        continue
                    # A new header means the previous record is complete.
                    if read is not None:
                        yield read
                    read = Read(qual_format=self.qual_format)
                    read.name = line[1:]  # remove '@'
                    # str.split() returns [] for an empty or whitespace-only name, so check the
                    # result instead of the raw string to avoid an IndexError on headers like "@ ".
                    name_fields = read.name.split()
                    if name_fields:
                        read.id = name_fields[0]
                    state = 'sequence'
                elif state == 'sequence':
                    # Sequence lines accumulate until the "+" separator line.
                    if line.startswith('+'):
                        state = 'plus'
                    else:
                        read.seq += line
                elif state in ('plus', 'quality'):
                    # "@" is a valid quality character, so this can only be a warning.
                    if line.startswith('@') and state == 'quality':
                        logging.warning('Looking for more quality scores but line starts with "@". This might '
                                        'be a header line and there were fewer quality scores than bases: {}'
                                        .format(line[:69]))
                    state = 'quality'
                    # Never take more quality characters than there are bases.
                    togo = len(read.seq) - len(read.qual)
                    read.qual += line[:togo]
                    # The end of the quality lines is when we have a quality string as long as the sequence.
                    if len(read.qual) >= len(read.seq):
                        state = 'header'
            # End of input: emit whatever record was in progress.
            if read is not None:
                yield read
        finally:
            # Only close the iterator when the input type is 'path'.
            if self.input_type == 'path':
                input_iterator.close()
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
276
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
277
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
DESCRIPTION = 'Test parser by parsing an input file and printing its contents.'


def make_argparser():
    """Build and return the command-line argument parser for the test driver.

    The parser accepts an optional positional ``infile`` (opened for reading,
    defaulting to stdin) and an optional ``-f``/``--format`` restricted to the
    supported format names.
    """
    supported_formats = ('fasta', 'fastq', 'sam', 'tsv', 'lines')
    argparser = argparse.ArgumentParser(description=DESCRIPTION)
    argparser.add_argument(
        'infile',
        nargs='?',
        type=argparse.FileType('r'),
        default=sys.stdin,
        help='Input reads.',
    )
    argparser.add_argument(
        '-f', '--format',
        choices=supported_formats,
        help='Input read format. Will be detected from the filename, if given.',
    )
    return argparser
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
288
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
289
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
def main(argv):
    """Parse the input named on the command line and print every read found.

    ``argv`` is the full argument vector; its first element is skipped and the
    rest is handed to the argument parser. The input format is taken from
    --format when given; otherwise it is derived from the input file's
    extension. Reading from stdin without --format is reported through fail().
    """
    args = make_argparser().parse_args(argv[1:])
    if args.format:
        filetype = args.format
    elif args.infile is sys.stdin:
        fail('Error: Must give a --format if reading from stdin.')
    else:
        extension = os.path.splitext(args.infile.name)[1]
        extension_formats = {'.fq': 'fastq', '.fa': 'fasta', '.txt': 'lines'}
        # Any other extension is used as the format name, minus its leading dot.
        filetype = extension_formats.get(extension, extension[1:])
    print('Reading input as format {!r}.'.format(filetype))
    for number, read in enumerate(getparser(args.infile, filetype=filetype), 1):
        print('Read {} id/name: {!r}/{!r}'.format(number, read.id, read.name))
        print('Read {} seq: {!r}'.format(number, read.seq))
        print('Read {} qual: {!r}'.format(number, read.qual))
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
312
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
313
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
def fail(message):
    """Write ``message`` plus a newline to stderr, then exit with status 1."""
    error_line = message + "\n"
    sys.stderr.write(error_line)
    raise SystemExit(1)
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
317
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
318
464aee13e2df "planemo upload commit 8e52aac4afce4ab7c4d244e2b70f205f70c16749-dirty"
nick
parents: 0
diff changeset
if __name__ == '__main__':
    # Run the command-line driver and use its return value as the exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)