Mercurial > repos > schang > frp_tool
comparison refseq_parser.py @ 5:70e76f6bfcfb draft
Uploaded
| author | schang |
|---|---|
| date | Tue, 02 Dec 2014 19:54:12 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 4:398c3753c358 | 5:70e76f6bfcfb |
|---|---|
| 1 #!/usr/bin/python | |
| 2 ''' Script to parse the reference genome sequence''' | |
| 3 | |
| 4 # Features and assumptions: | |
| 5 # - filenames do not matter; gzip and bzip2 streams are autodetected, and | |
| 6 # the input format (FASTA or FASTQ) is autodetected | |
| 7 | |
| 8 | |
| 9 import sys, getopt, gzip, bz2, StringIO | |
| 10 | |
| 11 class SequenceInputError(Exception): | |
| 12 """Used to raise error conditions.""" | |
| 13 | |
| 14 | |
| 15 class Sequence: | |
| 16 """ | |
| 17 Represents a protein, DNA, or RNA sequence. | |
| 18 | |
| 19 Contains 3 members: | |
| 20 name: string | |
| 21 seq: string containing the full, unbroken sequence | |
| 22 qual: array of ints, one per character in seq (FASTQ only) | |
| 23 """ | |
| 24 | |
| 25 def __init__(self, name, seq, qual=None): | |
| 26 self.name = name | |
| 27 self.seq = seq | |
| 28 self.qual = qual | |
| 29 | |
| 30 def __repr__(self): | |
| 31 out = [] | |
| 32 out.append("Name: %s\n" % self.name) | |
| 33 out.append("Seq: %s" % self.seq) | |
| 34 if self.qual: | |
| 35 out.append("\nQual:\n") | |
| 36 for q in self.qual: | |
| 37 out.append("%d " % q) | |
| 38 return ''.join(out) | |
| 39 | |
| 40 | |
| 41 def supportCompression(stream): | |
| 42 """ | |
| 43 Detect if a stream is compressed and modify to handle, if possible. | |
| 44 """ | |
| 45 no_filename = False | |
| 46 | |
| 47 try: | |
| 48 filename = stream.name | |
| 49 except AttributeError: | |
| 50 # probably a StringIO object | |
| 51 no_filename = True | |
| 52 | |
| 53 if filename == '<stdin>': | |
| 54 stream = StringIO.StringIO(sys.stdin.read()) | |
| 55 | |
| 56 pos = stream.tell() | |
| 57 magic = stream.read(2) | |
| 58 stream.seek(pos) | |
| 59 if magic == '\037\213': | |
| 60 stream = gzip.GzipFile(None, 'rb', None, stream) | |
| 61 elif magic == 'BZ': | |
| 62 if no_filename: | |
| 63 raise SequenceInputError('Cannot handle bzip2 compressed ' + | |
| 64 'StringIO data.') | |
| 65 if filename == '<stdin>': | |
| 66 raise SequenceInputError('Cannot handle bzip2 compressed data ' + | |
| 67 'over stdin.') | |
| 68 stream = bz2.BZ2File(filename, 'rb') | |
| 69 | |
| 70 return stream | |
| 71 | |
| 72 | |
| 73 def SequenceFile(stream): | |
| 74 """ | |
| 75 Parses FASTA or FASTQ data, returning Sequence objects. | |
| 76 | |
| 77 This is a generator, so that there is no need to store a full list of | |
| 78 sequences in memory if the user does not want to. | |
| 79 """ | |
| 80 | |
| 81 f = supportCompression(stream) | |
| 82 | |
| 83 l = f.readline().rstrip() | |
| 84 | |
| 85 # autodetect file format based on first character of line | |
| 86 if l[0] == '>': | |
| 87 format_type = 'fasta' | |
| 88 elif l[0] == '@': | |
| 89 format_type = 'fastq' | |
| 90 else: | |
| 91 raise SequenceInputError('Could not determine data type ' + | |
| 92 '(fasta or fastq)') | |
| 93 | |
| 94 # l == '' indicates end-of-file | |
| 95 while l != '': | |
| 96 name = l[1:] | |
| 97 seq_list = [] | |
| 98 qual_list = None | |
| 99 | |
| 100 # Begin parsing | |
| 101 l = f.readline().rstrip() | |
| 102 | |
| 103 if format_type == 'fasta': | |
| 104 while l != '' and l[0] != '>': | |
| 105 seq_list.append(l) | |
| 106 l = f.readline().rstrip() | |
| 107 s = ''.join(seq_list) | |
| 108 | |
| 109 elif format_type == 'fastq': | |
| 110 # read in the sequence | |
| 111 while l != '' and l[0] != '+': | |
| 112 seq_list.append(l) | |
| 113 l = f.readline().rstrip() | |
| 114 s = ''.join(seq_list) | |
| 115 | |
| 116 # check if the second sequence name (if existing) matches the first | |
| 117 if len(l) > 1: | |
| 118 if name != l[1:]: | |
| 119 raise SequenceInputError('Sequence and quality score ' + | |
| 120 'identifiers do not match ' + | |
| 121 '(\"%s\" != \"%s\")' % (name, l)) | |
| 122 l = f.readline().rstrip() | |
| 123 | |
| 124 # read in the quality scores | |
| 125 q = '' | |
| 126 while l != '' and len(q) < len(s): | |
| 127 q += l | |
| 128 l = f.readline().rstrip() | |
| 129 | |
| 130 # ensure that the quality and sequence strings are the same length | |
| 131 if len(q) != len(s): | |
| 132 raise SequenceInputError('Sequence and quality string ' + | |
| 133 'lengths do not match (\"%s\")' % name) | |
| 134 qual_list = [] | |
| 135 for c in q: | |
| 136 qual_list.append(ord(c) - 33) | |
| 137 | |
| 138 yield Sequence(name, s, qual_list) | |
| 139 | |
| 140 f.close() | |
| 141 | |
| 142 | |
| 143 if __name__ == '__main__': | |
| 144 seqs = 0 | |
| 145 seqlen = 0 | |
| 146 | |
| 147 | |
| 148 opts, args = getopt.getopt(sys.argv[1:], 'hp') | |
| 149 | |
| 150 for o, a in opts: | |
| 151 if o == '-h': | |
| 152 print 'Usage:' | |
| 153 print ' %s <filename>' % sys.argv[0] | |
| 154 print ' %s < <filename>' % sys.argv[0] | |
| 155 print ' cat <filename> | %s [-p]\n' % sys.argv[0] | |
| 156 print ' <filename> must be in fasta or fastq format, and can be' | |
| 157 print ' gzipped, or (with the first usage style) bzip2\'d.' | |
| 158 sys.exit(0) | |
| 159 | |
| 160 else: | |
| 161 print 'unhandled option' | |
| 162 sys.exit(1) | |
| 163 | |
| 164 if len(args) > 0: | |
| 165 f = open(args[0], 'rb') | |
| 166 else: | |
| 167 f = sys.stdin | |
| 168 | |
| 169 | |
| 170 try: | |
| 171 for s in SequenceFile(f): | |
| 172 #print s # DEBUG | |
| 173 seqs += 1 | |
| 174 seqlen += len(s.seq) | |
| 175 | |
| 176 print 'Total sequences found: %d' % seqs | |
| 177 print 'Total residues found: %d' % seqlen | |
| 178 except SequenceInputError as e: | |
| 179 print >> sys.stderr, '\nFaulty input (record %d): %s' % \ | |
| 180 (seqs+1, e.args[0]) | |
| 181 | |
| 182 f.close() |
