5
|
1 #!/usr/bin/python
|
|
2 ''' Script to parse the reference genome sequence'''
|
|
3
|
|
4 # Features and assumptions:
|
|
5 # - filenames do not matter; gzip and bzip2 streams are autodetected, and
|
|
6 # the input format (FASTA or FASTQ) is autodetected
|
|
7
|
|
8
|
|
9 import sys, getopt, gzip, bz2, StringIO
|
|
10
|
|
11 class SequenceInputError(Exception):
|
|
12 """Used to raise error conditions."""
|
|
13
|
|
14
|
|
15 class Sequence:
|
|
16 """
|
|
17 Represents a protein, DNA, or RNA sequence.
|
|
18
|
|
19 Contains 3 members:
|
|
20 name: string
|
|
21 seq: string containing the full, unbroken sequence
|
|
22 qual: array of ints, one per character in seq (FASTQ only)
|
|
23 """
|
|
24
|
|
25 def __init__(self, name, seq, qual=None):
|
|
26 self.name = name
|
|
27 self.seq = seq
|
|
28 self.qual = qual
|
|
29
|
|
30 def __repr__(self):
|
|
31 out = []
|
|
32 out.append("Name: %s\n" % self.name)
|
|
33 out.append("Seq: %s" % self.seq)
|
|
34 if self.qual:
|
|
35 out.append("\nQual:\n")
|
|
36 for q in self.qual:
|
|
37 out.append("%d " % q)
|
|
38 return ''.join(out)
|
|
39
|
|
40
|
|
41 def supportCompression(stream):
|
|
42 """
|
|
43 Detect if a stream is compressed and modify to handle, if possible.
|
|
44 """
|
|
45 no_filename = False
|
|
46
|
|
47 try:
|
|
48 filename = stream.name
|
|
49 except AttributeError:
|
|
50 # probably a StringIO object
|
|
51 no_filename = True
|
|
52
|
|
53 if filename == '<stdin>':
|
|
54 stream = StringIO.StringIO(sys.stdin.read())
|
|
55
|
|
56 pos = stream.tell()
|
|
57 magic = stream.read(2)
|
|
58 stream.seek(pos)
|
|
59 if magic == '\037\213':
|
|
60 stream = gzip.GzipFile(None, 'rb', None, stream)
|
|
61 elif magic == 'BZ':
|
|
62 if no_filename:
|
|
63 raise SequenceInputError('Cannot handle bzip2 compressed ' +
|
|
64 'StringIO data.')
|
|
65 if filename == '<stdin>':
|
|
66 raise SequenceInputError('Cannot handle bzip2 compressed data ' +
|
|
67 'over stdin.')
|
|
68 stream = bz2.BZ2File(filename, 'rb')
|
|
69
|
|
70 return stream
|
|
71
|
|
72
|
|
73 def SequenceFile(stream):
|
|
74 """
|
|
75 Parses FASTA or FASTQ data, returning Sequence objects.
|
|
76
|
|
77 This is a generator, so that there is no need to store a full list of
|
|
78 sequences in memory if the user does not want to.
|
|
79 """
|
|
80
|
|
81 f = supportCompression(stream)
|
|
82
|
|
83 l = f.readline().rstrip()
|
|
84
|
|
85 # autodetect file format based on first character of line
|
|
86 if l[0] == '>':
|
|
87 format_type = 'fasta'
|
|
88 elif l[0] == '@':
|
|
89 format_type = 'fastq'
|
|
90 else:
|
|
91 raise SequenceInputError('Could not determine data type ' +
|
|
92 '(fasta or fastq)')
|
|
93
|
|
94 # l == '' indicates end-of-file
|
|
95 while l != '':
|
|
96 name = l[1:]
|
|
97 seq_list = []
|
|
98 qual_list = None
|
|
99
|
|
100 # Begin parsing
|
|
101 l = f.readline().rstrip()
|
|
102
|
|
103 if format_type == 'fasta':
|
|
104 while l != '' and l[0] != '>':
|
|
105 seq_list.append(l)
|
|
106 l = f.readline().rstrip()
|
|
107 s = ''.join(seq_list)
|
|
108
|
|
109 elif format_type == 'fastq':
|
|
110 # read in the sequence
|
|
111 while l != '' and l[0] != '+':
|
|
112 seq_list.append(l)
|
|
113 l = f.readline().rstrip()
|
|
114 s = ''.join(seq_list)
|
|
115
|
|
116 # check if the second sequence name (if existing) matches the first
|
|
117 if len(l) > 1:
|
|
118 if name != l[1:]:
|
|
119 raise SequenceInputError('Sequence and quality score ' +
|
|
120 'identifiers do not match ' +
|
|
121 '(\"%s\" != \"%s\")' % (name, l))
|
|
122 l = f.readline().rstrip()
|
|
123
|
|
124 # read in the quality scores
|
|
125 q = ''
|
|
126 while l != '' and len(q) < len(s):
|
|
127 q += l
|
|
128 l = f.readline().rstrip()
|
|
129
|
|
130 # ensure that the quality and sequence strings are the same length
|
|
131 if len(q) != len(s):
|
|
132 raise SequenceInputError('Sequence and quality string ' +
|
|
133 'lengths do not match (\"%s\")' % name)
|
|
134 qual_list = []
|
|
135 for c in q:
|
|
136 qual_list.append(ord(c) - 33)
|
|
137
|
|
138 yield Sequence(name, s, qual_list)
|
|
139
|
|
140 f.close()
|
|
141
|
|
142
|
|
143 if __name__ == '__main__':
|
|
144 seqs = 0
|
|
145 seqlen = 0
|
|
146
|
|
147
|
|
148 opts, args = getopt.getopt(sys.argv[1:], 'hp')
|
|
149
|
|
150 for o, a in opts:
|
|
151 if o == '-h':
|
|
152 print 'Usage:'
|
|
153 print ' %s <filename>' % sys.argv[0]
|
|
154 print ' %s < <filename>' % sys.argv[0]
|
|
155 print ' cat <filename> | %s [-p]\n' % sys.argv[0]
|
|
156 print ' <filename> must be in fasta or fastq format, and can be'
|
|
157 print ' gzipped, or (with the first usage style) bzip2\'d.'
|
|
158 sys.exit(0)
|
|
159
|
|
160 else:
|
|
161 print 'unhandled option'
|
|
162 sys.exit(1)
|
|
163
|
|
164 if len(args) > 0:
|
|
165 f = open(args[0], 'rb')
|
|
166 else:
|
|
167 f = sys.stdin
|
|
168
|
|
169
|
|
170 try:
|
|
171 for s in SequenceFile(f):
|
|
172 #print s # DEBUG
|
|
173 seqs += 1
|
|
174 seqlen += len(s.seq)
|
|
175
|
|
176 print 'Total sequences found: %d' % seqs
|
|
177 print 'Total residues found: %d' % seqlen
|
|
178 except SequenceInputError as e:
|
|
179 print >> sys.stderr, '\nFaulty input (record %d): %s' % \
|
|
180 (seqs+1, e.args[0])
|
|
181
|
|
182 f.close()
|