comparison refseq_parser.py @ 5:70e76f6bfcfb draft

Uploaded
author schang
date Tue, 02 Dec 2014 19:54:12 -0500
parents
children
comparison
equal deleted inserted replaced
4:398c3753c358 5:70e76f6bfcfb
1 #!/usr/bin/python
2 ''' Script to parse the reference genome sequence'''
3
4 # Features and assumptions:
5 # - filenames do not matter; gzip and bzip2 streams are autodetected, and
6 # the input format (FASTA or FASTQ) is autodetected
7
8
9 import sys, getopt, gzip, bz2, StringIO
10
11 class SequenceInputError(Exception):
12 """Used to raise error conditions."""
13
14
15 class Sequence:
16 """
17 Represents a protein, DNA, or RNA sequence.
18
19 Contains 3 members:
20 name: string
21 seq: string containing the full, unbroken sequence
22 qual: array of ints, one per character in seq (FASTQ only)
23 """
24
25 def __init__(self, name, seq, qual=None):
26 self.name = name
27 self.seq = seq
28 self.qual = qual
29
30 def __repr__(self):
31 out = []
32 out.append("Name: %s\n" % self.name)
33 out.append("Seq: %s" % self.seq)
34 if self.qual:
35 out.append("\nQual:\n")
36 for q in self.qual:
37 out.append("%d " % q)
38 return ''.join(out)
39
40
41 def supportCompression(stream):
42 """
43 Detect if a stream is compressed and modify to handle, if possible.
44 """
45 no_filename = False
46
47 try:
48 filename = stream.name
49 except AttributeError:
50 # probably a StringIO object
51 no_filename = True
52
53 if filename == '<stdin>':
54 stream = StringIO.StringIO(sys.stdin.read())
55
56 pos = stream.tell()
57 magic = stream.read(2)
58 stream.seek(pos)
59 if magic == '\037\213':
60 stream = gzip.GzipFile(None, 'rb', None, stream)
61 elif magic == 'BZ':
62 if no_filename:
63 raise SequenceInputError('Cannot handle bzip2 compressed ' +
64 'StringIO data.')
65 if filename == '<stdin>':
66 raise SequenceInputError('Cannot handle bzip2 compressed data ' +
67 'over stdin.')
68 stream = bz2.BZ2File(filename, 'rb')
69
70 return stream
71
72
73 def SequenceFile(stream):
74 """
75 Parses FASTA or FASTQ data, returning Sequence objects.
76
77 This is a generator, so that there is no need to store a full list of
78 sequences in memory if the user does not want to.
79 """
80
81 f = supportCompression(stream)
82
83 l = f.readline().rstrip()
84
85 # autodetect file format based on first character of line
86 if l[0] == '>':
87 format_type = 'fasta'
88 elif l[0] == '@':
89 format_type = 'fastq'
90 else:
91 raise SequenceInputError('Could not determine data type ' +
92 '(fasta or fastq)')
93
94 # l == '' indicates end-of-file
95 while l != '':
96 name = l[1:]
97 seq_list = []
98 qual_list = None
99
100 # Begin parsing
101 l = f.readline().rstrip()
102
103 if format_type == 'fasta':
104 while l != '' and l[0] != '>':
105 seq_list.append(l)
106 l = f.readline().rstrip()
107 s = ''.join(seq_list)
108
109 elif format_type == 'fastq':
110 # read in the sequence
111 while l != '' and l[0] != '+':
112 seq_list.append(l)
113 l = f.readline().rstrip()
114 s = ''.join(seq_list)
115
116 # check if the second sequence name (if existing) matches the first
117 if len(l) > 1:
118 if name != l[1:]:
119 raise SequenceInputError('Sequence and quality score ' +
120 'identifiers do not match ' +
121 '(\"%s\" != \"%s\")' % (name, l))
122 l = f.readline().rstrip()
123
124 # read in the quality scores
125 q = ''
126 while l != '' and len(q) < len(s):
127 q += l
128 l = f.readline().rstrip()
129
130 # ensure that the quality and sequence strings are the same length
131 if len(q) != len(s):
132 raise SequenceInputError('Sequence and quality string ' +
133 'lengths do not match (\"%s\")' % name)
134 qual_list = []
135 for c in q:
136 qual_list.append(ord(c) - 33)
137
138 yield Sequence(name, s, qual_list)
139
140 f.close()
141
142
143 if __name__ == '__main__':
144 seqs = 0
145 seqlen = 0
146
147
148 opts, args = getopt.getopt(sys.argv[1:], 'hp')
149
150 for o, a in opts:
151 if o == '-h':
152 print 'Usage:'
153 print ' %s <filename>' % sys.argv[0]
154 print ' %s < <filename>' % sys.argv[0]
155 print ' cat <filename> | %s [-p]\n' % sys.argv[0]
156 print ' <filename> must be in fasta or fastq format, and can be'
157 print ' gzipped, or (with the first usage style) bzip2\'d.'
158 sys.exit(0)
159
160 else:
161 print 'unhandled option'
162 sys.exit(1)
163
164 if len(args) > 0:
165 f = open(args[0], 'rb')
166 else:
167 f = sys.stdin
168
169
170 try:
171 for s in SequenceFile(f):
172 #print s # DEBUG
173 seqs += 1
174 seqlen += len(s.seq)
175
176 print 'Total sequences found: %d' % seqs
177 print 'Total residues found: %d' % seqlen
178 except SequenceInputError as e:
179 print >> sys.stderr, '\nFaulty input (record %d): %s' % \
180 (seqs+1, e.args[0])
181
182 f.close()