| 5 | 1 #!/usr/bin/python | 
|  | 2 ''' Script to parse the reference genome sequence''' | 
|  | 3 | 
|  | 4 # Features and assumptions: | 
|  | 5 #  - filenames do not matter; gzip and bzip2 streams are autodetected, and | 
|  | 6 #    the input format (FASTA or FASTQ) is autodetected | 
|  | 7 | 
|  | 8 | 
|  | 9 import sys, getopt, gzip, bz2, StringIO | 
|  | 10 | 
|  | 11 class SequenceInputError(Exception): | 
|  | 12     """Used to raise error conditions.""" | 
|  | 13 | 
|  | 14 | 
|  | 15 class Sequence: | 
|  | 16     """ | 
|  | 17     Represents a protein, DNA, or RNA sequence. | 
|  | 18 | 
|  | 19     Contains 3 members: | 
|  | 20         name: string | 
|  | 21         seq: string containing the full, unbroken sequence | 
|  | 22         qual: array of ints, one per character in seq (FASTQ only) | 
|  | 23     """ | 
|  | 24 | 
|  | 25     def __init__(self, name, seq, qual=None): | 
|  | 26         self.name = name | 
|  | 27         self.seq = seq | 
|  | 28         self.qual = qual | 
|  | 29 | 
|  | 30     def __repr__(self): | 
|  | 31         out = [] | 
|  | 32         out.append("Name: %s\n" % self.name) | 
|  | 33         out.append("Seq:  %s" % self.seq) | 
|  | 34         if self.qual: | 
|  | 35             out.append("\nQual:\n") | 
|  | 36             for q in self.qual: | 
|  | 37                 out.append("%d " % q) | 
|  | 38         return ''.join(out) | 
|  | 39 | 
|  | 40 | 
|  | 41 def supportCompression(stream): | 
|  | 42     """ | 
|  | 43     Detect if a stream is compressed and modify to handle, if possible. | 
|  | 44     """ | 
|  | 45     no_filename = False | 
|  | 46 | 
|  | 47     try: | 
|  | 48         filename = stream.name | 
|  | 49     except AttributeError: | 
|  | 50         # probably a StringIO object | 
|  | 51         no_filename = True | 
|  | 52 | 
|  | 53     if filename == '<stdin>': | 
|  | 54         stream = StringIO.StringIO(sys.stdin.read()) | 
|  | 55 | 
|  | 56     pos = stream.tell() | 
|  | 57     magic = stream.read(2) | 
|  | 58     stream.seek(pos) | 
|  | 59     if magic == '\037\213': | 
|  | 60         stream = gzip.GzipFile(None, 'rb', None, stream) | 
|  | 61     elif magic == 'BZ': | 
|  | 62         if no_filename: | 
|  | 63             raise SequenceInputError('Cannot handle bzip2 compressed ' + | 
|  | 64                                      'StringIO data.') | 
|  | 65         if filename == '<stdin>': | 
|  | 66             raise SequenceInputError('Cannot handle bzip2 compressed data ' + | 
|  | 67                                      'over stdin.') | 
|  | 68         stream = bz2.BZ2File(filename, 'rb') | 
|  | 69 | 
|  | 70     return stream | 
|  | 71 | 
|  | 72 | 
|  | 73 def SequenceFile(stream): | 
|  | 74     """ | 
|  | 75     Parses FASTA or FASTQ data, returning Sequence objects. | 
|  | 76 | 
|  | 77     This is a generator, so that there is no need to store a full list of | 
|  | 78     sequences in memory if the user does not want to. | 
|  | 79     """ | 
|  | 80 | 
|  | 81     f = supportCompression(stream) | 
|  | 82 | 
|  | 83     l = f.readline().rstrip() | 
|  | 84 | 
|  | 85     # autodetect file format based on first character of line | 
|  | 86     if l[0] == '>': | 
|  | 87         format_type = 'fasta' | 
|  | 88     elif l[0] == '@': | 
|  | 89         format_type = 'fastq' | 
|  | 90     else: | 
|  | 91         raise SequenceInputError('Could not determine data type ' + | 
|  | 92                                  '(fasta or fastq)') | 
|  | 93 | 
|  | 94     # l == '' indicates end-of-file | 
|  | 95     while l != '': | 
|  | 96         name = l[1:] | 
|  | 97         seq_list = [] | 
|  | 98         qual_list = None | 
|  | 99 | 
|  | 100         # Begin parsing | 
|  | 101         l = f.readline().rstrip() | 
|  | 102 | 
|  | 103         if format_type == 'fasta': | 
|  | 104             while l != '' and l[0] != '>': | 
|  | 105                 seq_list.append(l) | 
|  | 106                 l = f.readline().rstrip() | 
|  | 107             s = ''.join(seq_list) | 
|  | 108 | 
|  | 109         elif format_type == 'fastq': | 
|  | 110             # read in the sequence | 
|  | 111             while l != '' and l[0] != '+': | 
|  | 112                 seq_list.append(l) | 
|  | 113                 l = f.readline().rstrip() | 
|  | 114             s = ''.join(seq_list) | 
|  | 115 | 
|  | 116             # check if the second sequence name (if existing) matches the first | 
|  | 117             if len(l) > 1: | 
|  | 118                 if name != l[1:]: | 
|  | 119                     raise SequenceInputError('Sequence and quality score ' + | 
|  | 120                                              'identifiers do not match ' + | 
|  | 121                                              '(\"%s\" != \"%s\")' % (name, l)) | 
|  | 122             l = f.readline().rstrip() | 
|  | 123 | 
|  | 124             # read in the quality scores | 
|  | 125             q = '' | 
|  | 126             while l != '' and len(q) < len(s): | 
|  | 127                 q += l | 
|  | 128                 l = f.readline().rstrip() | 
|  | 129 | 
|  | 130             # ensure that the quality and sequence strings are the same length | 
|  | 131             if len(q) != len(s): | 
|  | 132                 raise SequenceInputError('Sequence and quality string ' + | 
|  | 133                                          'lengths do not match (\"%s\")' % name) | 
|  | 134             qual_list = [] | 
|  | 135             for c in q: | 
|  | 136                 qual_list.append(ord(c) - 33) | 
|  | 137 | 
|  | 138         yield Sequence(name, s, qual_list) | 
|  | 139 | 
|  | 140     f.close() | 
|  | 141 | 
|  | 142 | 
|  | 143 if __name__ == '__main__': | 
|  | 144     seqs = 0 | 
|  | 145     seqlen = 0 | 
|  | 146 | 
|  | 147 | 
|  | 148     opts, args = getopt.getopt(sys.argv[1:], 'hp') | 
|  | 149 | 
|  | 150     for o, a in opts: | 
|  | 151         if o == '-h': | 
|  | 152             print 'Usage:' | 
|  | 153             print '  %s  <filename>' % sys.argv[0] | 
|  | 154             print '  %s  < <filename>' % sys.argv[0] | 
|  | 155             print '  cat <filename> | %s [-p]\n' % sys.argv[0] | 
|  | 156             print '  <filename> must be in fasta or fastq format, and can be' | 
|  | 157             print '  gzipped, or (with the first usage style) bzip2\'d.' | 
|  | 158             sys.exit(0) | 
|  | 159 | 
|  | 160         else: | 
|  | 161             print 'unhandled option' | 
|  | 162             sys.exit(1) | 
|  | 163 | 
|  | 164     if len(args) > 0: | 
|  | 165         f = open(args[0], 'rb') | 
|  | 166     else: | 
|  | 167         f = sys.stdin | 
|  | 168 | 
|  | 169 | 
|  | 170     try: | 
|  | 171         for s in SequenceFile(f): | 
|  | 172             #print s  # DEBUG | 
|  | 173             seqs += 1 | 
|  | 174             seqlen += len(s.seq) | 
|  | 175 | 
|  | 176         print 'Total sequences found: %d' % seqs | 
|  | 177         print 'Total residues found:  %d' % seqlen | 
|  | 178     except SequenceInputError as e: | 
|  | 179         print >> sys.stderr, '\nFaulty input (record %d): %s' % \ | 
|  | 180                              (seqs+1, e.args[0]) | 
|  | 181 | 
|  | 182     f.close() |