changeset 6:d4ac3899145b draft

Uploaded
author schang
date Tue, 02 Dec 2014 19:54:26 -0500
parents 70e76f6bfcfb
children 6d94241370cc
files SAM_parser.py
diffstat 1 files changed, 123 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SAM_parser.py	Tue Dec 02 19:54:26 2014 -0500
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+"""
+Parse SAM file format from Bowtie2.
+
+Features:
+ - remove header lines using bowtie option --no-hd, and keep only aligned reads --no-unal
+ - the refseq field is assumed to look like:
+     gi|228472346|ref|NZ_ACLQ01000011.1| 
+   which is common in HMP accessions to SRA.  In this case, the string
+   starting with "NZ_" is treated as the refseq
+"""
+
+from __future__ import division
+import sys, getopt
+
+class SAMFragment:
+    """
+    Represents a fragment match to a reference genome.
+    """
+
+    def __init__(self, sam_string):
+        '''Look at SAM file format documentation for more details'''
+        fields = sam_string.rstrip().split()
+
+        self.name = fields[0] #sample run name (QNAME)
+        self.refseq = fields[2].split('|')[3] # Reference genome name (Accession#) or RNAME
+#         self.refseq = fields[2]
+        self.location = int(fields[3]) # POS
+        self.cigar = fields[5] # CIGAR
+        self.sequence = fields[9] # SEQ
+        self.length = len(self.sequence) # length of sequence
+
+        for field in fields[11:]:
+            # Parses the OPT field that contains the MD tag for identity info. 
+            if field[0:5] == "MD:Z:":
+                md = field[5:]
+                # warning: finite state machine ahead!
+                # state = 0: nothing to remember
+                # state = 1: read a number, looking to see if next is a number
+                # state = 2: read a "^", looking for A, T, C, and G for deletion
+                state = 0
+                tempstr = ''
+                matches = 0
+                mismatches = 0
+                deletions = 0
+                for char in md:
+                    if char in ('0', '1', '2', '3', '4', '5', '6', '7',
+                                '8', '9'):
+                        if state == 2:
+                            deletions += len(tempstr)
+                            tempstr = ''
+                        state = 1
+                        tempstr += char
+                    elif char == '^':
+                        if state == 1:
+                            matches += int(tempstr)
+                            tempstr = ''
+                        state = 2
+                    elif char in ('A', 'T', 'C', 'G'):
+                        if state == 1:
+                            matches += int(tempstr)
+                            tempstr = ''
+                            state = 0
+                            mismatches += 1
+                        elif state == 2:
+                            tempstr += char
+                        else:
+                            mismatches += 1
+                if state == 1:
+                    matches += int(tempstr)
+                elif state == 2:
+                    deletions += len(tempstr)
+
+        self.matches = matches
+        self.mismatches = mismatches
+        self.deletions = deletions
+        self.identity = 100 * (matches / (matches + mismatches))
+
+    def __repr__(self):
+        out = []
+        out.append("Name:      %s\n" % self.name)
+        out.append("Location:  %d\n" % self.location)
+        out.append("Identity:  %4.1f\n" % self.identity)
+        return ''.join(out)
+
+
+def SAMFile(filename):
+    """
+    Parses a SAM format file, returning SAMFragment objects.
+
+    This is a generator, so that there is no need to store a full list of
+    fragments in memory if it's not necessary to do so.
+    """
+
+    f = open(filename, 'rb')
+
+    for line in f:
+        if line.startswith('@'):
+            continue
+        if not line.startswith("Your job"):  # preprocessing error on my part
+            yield SAMFragment(line)
+
+    f.close()
+
+
+if __name__ == '__main__':
+    opts, args = getopt.getopt(sys.argv[1:], 'h')
+
+    for o, a in opts:
+        if o == '-h':
+            print ('Usage:')
+            print ('  %s <sam_file>' % sys.argv[0])
+            sys.exit(0)
+        else:
+            print ('unhandled option')
+            sys.exit(1)
+
+    if len(args) == 0:
+        print ('Specify a SAM file as an argument')
+        sys.exit(1)
+
+    for frag in SAMFile(args[0]):
+        print (frag)