annotate refseq_parser.py @ 7:6d94241370cc draft default tip

Uploaded
author schang
date Tue, 02 Dec 2014 21:06:35 -0500
parents 70e76f6bfcfb
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
1 #!/usr/bin/python
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
2 ''' Script to parse the reference genome sequence'''
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
3
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
4 # Features and assumptions:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
5 # - filenames do not matter; gzip and bzip2 streams are autodetected, and
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
6 # the input format (FASTA or FASTQ) is autodetected
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
7
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
8
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
9 import sys, getopt, gzip, bz2, StringIO
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
10
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
11 class SequenceInputError(Exception):
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
12 """Used to raise error conditions."""
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
13
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
14
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
15 class Sequence:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
16 """
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
17 Represents a protein, DNA, or RNA sequence.
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
18
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
19 Contains 3 members:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
20 name: string
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
21 seq: string containing the full, unbroken sequence
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
22 qual: array of ints, one per character in seq (FASTQ only)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
23 """
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
24
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
25 def __init__(self, name, seq, qual=None):
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
26 self.name = name
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
27 self.seq = seq
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
28 self.qual = qual
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
29
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
30 def __repr__(self):
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
31 out = []
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
32 out.append("Name: %s\n" % self.name)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
33 out.append("Seq: %s" % self.seq)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
34 if self.qual:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
35 out.append("\nQual:\n")
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
36 for q in self.qual:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
37 out.append("%d " % q)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
38 return ''.join(out)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
39
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
40
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
41 def supportCompression(stream):
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
42 """
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
43 Detect if a stream is compressed and modify to handle, if possible.
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
44 """
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
45 no_filename = False
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
46
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
47 try:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
48 filename = stream.name
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
49 except AttributeError:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
50 # probably a StringIO object
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
51 no_filename = True
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
52
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
53 if filename == '<stdin>':
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
54 stream = StringIO.StringIO(sys.stdin.read())
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
55
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
56 pos = stream.tell()
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
57 magic = stream.read(2)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
58 stream.seek(pos)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
59 if magic == '\037\213':
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
60 stream = gzip.GzipFile(None, 'rb', None, stream)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
61 elif magic == 'BZ':
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
62 if no_filename:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
63 raise SequenceInputError('Cannot handle bzip2 compressed ' +
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
64 'StringIO data.')
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
65 if filename == '<stdin>':
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
66 raise SequenceInputError('Cannot handle bzip2 compressed data ' +
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
67 'over stdin.')
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
68 stream = bz2.BZ2File(filename, 'rb')
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
69
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
70 return stream
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
71
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
72
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
73 def SequenceFile(stream):
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
74 """
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
75 Parses FASTA or FASTQ data, returning Sequence objects.
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
76
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
77 This is a generator, so that there is no need to store a full list of
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
78 sequences in memory if the user does not want to.
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
79 """
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
80
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
81 f = supportCompression(stream)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
82
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
83 l = f.readline().rstrip()
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
84
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
85 # autodetect file format based on first character of line
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
86 if l[0] == '>':
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
87 format_type = 'fasta'
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
88 elif l[0] == '@':
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
89 format_type = 'fastq'
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
90 else:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
91 raise SequenceInputError('Could not determine data type ' +
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
92 '(fasta or fastq)')
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
93
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
94 # l == '' indicates end-of-file
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
95 while l != '':
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
96 name = l[1:]
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
97 seq_list = []
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
98 qual_list = None
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
99
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
100 # Begin parsing
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
101 l = f.readline().rstrip()
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
102
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
103 if format_type == 'fasta':
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
104 while l != '' and l[0] != '>':
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
105 seq_list.append(l)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
106 l = f.readline().rstrip()
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
107 s = ''.join(seq_list)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
108
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
109 elif format_type == 'fastq':
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
110 # read in the sequence
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
111 while l != '' and l[0] != '+':
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
112 seq_list.append(l)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
113 l = f.readline().rstrip()
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
114 s = ''.join(seq_list)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
115
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
116 # check if the second sequence name (if existing) matches the first
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
117 if len(l) > 1:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
118 if name != l[1:]:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
119 raise SequenceInputError('Sequence and quality score ' +
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
120 'identifiers do not match ' +
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
121 '(\"%s\" != \"%s\")' % (name, l))
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
122 l = f.readline().rstrip()
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
123
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
124 # read in the quality scores
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
125 q = ''
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
126 while l != '' and len(q) < len(s):
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
127 q += l
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
128 l = f.readline().rstrip()
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
129
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
130 # ensure that the quality and sequence strings are the same length
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
131 if len(q) != len(s):
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
132 raise SequenceInputError('Sequence and quality string ' +
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
133 'lengths do not match (\"%s\")' % name)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
134 qual_list = []
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
135 for c in q:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
136 qual_list.append(ord(c) - 33)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
137
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
138 yield Sequence(name, s, qual_list)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
139
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
140 f.close()
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
141
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
142
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
143 if __name__ == '__main__':
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
144 seqs = 0
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
145 seqlen = 0
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
146
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
147
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
148 opts, args = getopt.getopt(sys.argv[1:], 'hp')
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
149
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
150 for o, a in opts:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
151 if o == '-h':
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
152 print 'Usage:'
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
153 print ' %s <filename>' % sys.argv[0]
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
154 print ' %s < <filename>' % sys.argv[0]
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
155 print ' cat <filename> | %s [-p]\n' % sys.argv[0]
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
156 print ' <filename> must be in fasta or fastq format, and can be'
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
157 print ' gzipped, or (with the first usage style) bzip2\'d.'
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
158 sys.exit(0)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
159
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
160 else:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
161 print 'unhandled option'
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
162 sys.exit(1)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
163
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
164 if len(args) > 0:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
165 f = open(args[0], 'rb')
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
166 else:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
167 f = sys.stdin
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
168
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
169
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
170 try:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
171 for s in SequenceFile(f):
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
172 #print s # DEBUG
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
173 seqs += 1
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
174 seqlen += len(s.seq)
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
175
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
176 print 'Total sequences found: %d' % seqs
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
177 print 'Total residues found: %d' % seqlen
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
178 except SequenceInputError as e:
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
179 print >> sys.stderr, '\nFaulty input (record %d): %s' % \
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
180 (seqs+1, e.args[0])
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
181
70e76f6bfcfb Uploaded
schang
parents:
diff changeset
182 f.close()