Mercurial > repos > schang > frp_tool
comparison refseq_parser.py @ 5:70e76f6bfcfb draft
Uploaded
author | schang |
---|---|
date | Tue, 02 Dec 2014 19:54:12 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
4:398c3753c358 | 5:70e76f6bfcfb |
---|---|
1 #!/usr/bin/python | |
2 ''' Script to parse the reference genome sequence''' | |
3 | |
4 # Features and assumptions: | |
5 # - filenames do not matter; gzip and bzip2 streams are autodetected, and | |
6 # the input format (FASTA or FASTQ) is autodetected | |
7 | |
8 | |
9 import sys, getopt, gzip, bz2, StringIO | |
10 | |
11 class SequenceInputError(Exception): | |
12 """Used to raise error conditions.""" | |
13 | |
14 | |
15 class Sequence: | |
16 """ | |
17 Represents a protein, DNA, or RNA sequence. | |
18 | |
19 Contains 3 members: | |
20 name: string | |
21 seq: string containing the full, unbroken sequence | |
22 qual: array of ints, one per character in seq (FASTQ only) | |
23 """ | |
24 | |
25 def __init__(self, name, seq, qual=None): | |
26 self.name = name | |
27 self.seq = seq | |
28 self.qual = qual | |
29 | |
30 def __repr__(self): | |
31 out = [] | |
32 out.append("Name: %s\n" % self.name) | |
33 out.append("Seq: %s" % self.seq) | |
34 if self.qual: | |
35 out.append("\nQual:\n") | |
36 for q in self.qual: | |
37 out.append("%d " % q) | |
38 return ''.join(out) | |
39 | |
40 | |
41 def supportCompression(stream): | |
42 """ | |
43 Detect if a stream is compressed and modify to handle, if possible. | |
44 """ | |
45 no_filename = False | |
46 | |
47 try: | |
48 filename = stream.name | |
49 except AttributeError: | |
50 # probably a StringIO object | |
51 no_filename = True | |
52 | |
53 if filename == '<stdin>': | |
54 stream = StringIO.StringIO(sys.stdin.read()) | |
55 | |
56 pos = stream.tell() | |
57 magic = stream.read(2) | |
58 stream.seek(pos) | |
59 if magic == '\037\213': | |
60 stream = gzip.GzipFile(None, 'rb', None, stream) | |
61 elif magic == 'BZ': | |
62 if no_filename: | |
63 raise SequenceInputError('Cannot handle bzip2 compressed ' + | |
64 'StringIO data.') | |
65 if filename == '<stdin>': | |
66 raise SequenceInputError('Cannot handle bzip2 compressed data ' + | |
67 'over stdin.') | |
68 stream = bz2.BZ2File(filename, 'rb') | |
69 | |
70 return stream | |
71 | |
72 | |
73 def SequenceFile(stream): | |
74 """ | |
75 Parses FASTA or FASTQ data, returning Sequence objects. | |
76 | |
77 This is a generator, so that there is no need to store a full list of | |
78 sequences in memory if the user does not want to. | |
79 """ | |
80 | |
81 f = supportCompression(stream) | |
82 | |
83 l = f.readline().rstrip() | |
84 | |
85 # autodetect file format based on first character of line | |
86 if l[0] == '>': | |
87 format_type = 'fasta' | |
88 elif l[0] == '@': | |
89 format_type = 'fastq' | |
90 else: | |
91 raise SequenceInputError('Could not determine data type ' + | |
92 '(fasta or fastq)') | |
93 | |
94 # l == '' indicates end-of-file | |
95 while l != '': | |
96 name = l[1:] | |
97 seq_list = [] | |
98 qual_list = None | |
99 | |
100 # Begin parsing | |
101 l = f.readline().rstrip() | |
102 | |
103 if format_type == 'fasta': | |
104 while l != '' and l[0] != '>': | |
105 seq_list.append(l) | |
106 l = f.readline().rstrip() | |
107 s = ''.join(seq_list) | |
108 | |
109 elif format_type == 'fastq': | |
110 # read in the sequence | |
111 while l != '' and l[0] != '+': | |
112 seq_list.append(l) | |
113 l = f.readline().rstrip() | |
114 s = ''.join(seq_list) | |
115 | |
116 # check if the second sequence name (if existing) matches the first | |
117 if len(l) > 1: | |
118 if name != l[1:]: | |
119 raise SequenceInputError('Sequence and quality score ' + | |
120 'identifiers do not match ' + | |
121 '(\"%s\" != \"%s\")' % (name, l)) | |
122 l = f.readline().rstrip() | |
123 | |
124 # read in the quality scores | |
125 q = '' | |
126 while l != '' and len(q) < len(s): | |
127 q += l | |
128 l = f.readline().rstrip() | |
129 | |
130 # ensure that the quality and sequence strings are the same length | |
131 if len(q) != len(s): | |
132 raise SequenceInputError('Sequence and quality string ' + | |
133 'lengths do not match (\"%s\")' % name) | |
134 qual_list = [] | |
135 for c in q: | |
136 qual_list.append(ord(c) - 33) | |
137 | |
138 yield Sequence(name, s, qual_list) | |
139 | |
140 f.close() | |
141 | |
142 | |
143 if __name__ == '__main__': | |
144 seqs = 0 | |
145 seqlen = 0 | |
146 | |
147 | |
148 opts, args = getopt.getopt(sys.argv[1:], 'hp') | |
149 | |
150 for o, a in opts: | |
151 if o == '-h': | |
152 print 'Usage:' | |
153 print ' %s <filename>' % sys.argv[0] | |
154 print ' %s < <filename>' % sys.argv[0] | |
155 print ' cat <filename> | %s [-p]\n' % sys.argv[0] | |
156 print ' <filename> must be in fasta or fastq format, and can be' | |
157 print ' gzipped, or (with the first usage style) bzip2\'d.' | |
158 sys.exit(0) | |
159 | |
160 else: | |
161 print 'unhandled option' | |
162 sys.exit(1) | |
163 | |
164 if len(args) > 0: | |
165 f = open(args[0], 'rb') | |
166 else: | |
167 f = sys.stdin | |
168 | |
169 | |
170 try: | |
171 for s in SequenceFile(f): | |
172 #print s # DEBUG | |
173 seqs += 1 | |
174 seqlen += len(s.seq) | |
175 | |
176 print 'Total sequences found: %d' % seqs | |
177 print 'Total residues found: %d' % seqlen | |
178 except SequenceInputError as e: | |
179 print >> sys.stderr, '\nFaulty input (record %d): %s' % \ | |
180 (seqs+1, e.args[0]) | |
181 | |
182 f.close() |