Mercurial > repos > nick > sequence_content_trimmer
comparison getreads.py @ 0:7f170cb06e2e draft
planemo upload commit d76a1cf04f3e4bc735d320ccccbf7aecbc193395
author | nick |
---|---|
date | Tue, 01 Dec 2015 21:33:27 -0500 |
parents | |
children | 464aee13e2df |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7f170cb06e2e |
---|---|
1 """A simple parser for FASTA, FASTQ, SAM, etc. Create generators that just return the read name and | |
2 sequence. | |
3 All format parsers follow this API: | |
4 with open('sequence.fasta') as fasta: | |
5 for read in getreads.getparser(fasta, filetype='fasta'): | |
6 print "There is a sequence with this FASTA identifier: "+read.id | |
7 print "Its sequence is "+read.seq | |
8 The properties of Read are: | |
9 name: The entire FASTA header line, SAM column 1, etc. | |
10 id: The first whitespace-delimited part of the name. | |
11 seq: The sequence. | |
12 qual: The quality scores (unless the format is FASTA). | |
13 """ | |
14 | |
15 | |
16 def getparser(filehandle, filetype='fasta'): | |
17 if filetype == 'fasta': | |
18 return FastaReader(filehandle) | |
19 elif filetype == 'fastq': | |
20 return FastqReader(filehandle) | |
21 elif filetype == 'sam': | |
22 return SamReader(filehandle) | |
23 elif filetype == 'tsv': | |
24 return TsvReader(filehandle) | |
25 else: | |
26 raise ValueError('Illegal argument: filetype=\''+filetype+'\'') | |
27 | |
28 | |
29 class FormatError(Exception): | |
30 def __init__(self, message=None): | |
31 if message: | |
32 Exception.__init__(self, message) | |
33 | |
34 | |
35 class Read(object): | |
36 def __init__(self, name='', seq='', id_='', qual=''): | |
37 self.name = name | |
38 self.seq = seq | |
39 self.id = id_ | |
40 self.qual = qual | |
41 | |
42 | |
43 class Reader(object): | |
44 """Base class for all other parsers.""" | |
45 def __init__(self, filehandle): | |
46 self.filehandle = filehandle | |
47 def __iter__(self): | |
48 return self.parser() | |
49 | |
50 | |
51 class TsvReader(Reader): | |
52 """A parser for a simple tab-delimited format. | |
53 Column 1: name | |
54 Column 2: sequence | |
55 Column 3: quality scores (optional)""" | |
56 def parser(self): | |
57 for line in self.filehandle: | |
58 fields = line.rstrip('\r\n').split('\t') | |
59 if len(fields) < 2: | |
60 continue | |
61 read = Read() | |
62 read.name = fields[0] | |
63 if read.name: | |
64 read.id = read.name.split()[0] | |
65 read.seq = fields[1] | |
66 if len(fields) >= 3: | |
67 read.qual = fields[2] | |
68 yield read | |
69 | |
70 | |
71 class SamReader(Reader): | |
72 """A simple SAM parser. | |
73 Assumptions: | |
74 Lines starting with "@" with 3 fields are headers. All others are alignments. | |
75 All alignment lines have 11 or more fields. Other lines will be skipped. | |
76 """ | |
77 def parser(self): | |
78 for line in self.filehandle: | |
79 fields = line.split('\t') | |
80 if len(fields) < 11: | |
81 continue | |
82 # Skip headers. | |
83 if fields[0].startswith('@') and len(fields[0]) == 3: | |
84 continue | |
85 read = Read() | |
86 read.name = fields[0] | |
87 if read.name: | |
88 read.id = read.name.split()[0] | |
89 read.seq = fields[9] | |
90 read.qual = fields[10].rstrip('\r\n') | |
91 yield read | |
92 | |
93 | |
94 class FastaReader(Reader): | |
95 """A simple FASTA parser that reads one sequence at a time into memory.""" | |
96 def parser(self): | |
97 read = Read() | |
98 while True: | |
99 line_raw = self.filehandle.readline() | |
100 if not line_raw: | |
101 if read.seq: | |
102 yield read | |
103 raise StopIteration | |
104 line = line_raw.strip() | |
105 # Allow empty lines. | |
106 if not line: | |
107 continue | |
108 if line.startswith('>'): | |
109 if read.seq: | |
110 yield read | |
111 read = Read() | |
112 read.name = line[1:] # remove ">" | |
113 if read.name: | |
114 read.id = read.name.split()[0] | |
115 continue | |
116 else: | |
117 read.seq += line | |
118 | |
119 | |
120 class FastqReader(Reader): | |
121 """A simple FASTQ parser. Can handle multi-line sequences, though.""" | |
122 def parser(self): | |
123 read = Read() | |
124 state = 'header' | |
125 while True: | |
126 line_raw = self.filehandle.readline() | |
127 if not line_raw: | |
128 if read.seq: | |
129 yield read | |
130 raise StopIteration | |
131 line = line_raw.strip() | |
132 # Allow empty lines. | |
133 if not line: | |
134 continue | |
135 if state == 'header': | |
136 if not line.startswith('@'): | |
137 raise FormatError('line state = "header" but line does not start with "@"') | |
138 if read.seq: | |
139 yield read | |
140 read = Read() | |
141 read.name = line[1:] # remove '@' | |
142 if read.name: | |
143 read.id = read.name.split()[0] | |
144 state = 'sequence' | |
145 elif state == 'sequence': | |
146 if line.startswith('+'): | |
147 state = 'plus' | |
148 else: | |
149 read.seq += line | |
150 elif state == 'plus' or state == 'quality': | |
151 state = 'quality' | |
152 togo = len(read.seq) - len(read.qual) | |
153 read.qual += line[:togo] | |
154 # The end of the quality lines is when we have a quality string as long as the sequence. | |
155 if len(read.qual) >= len(read.seq): | |
156 state = 'header' |