Mercurial > repos > schang > frp_tool
changeset 6:d4ac3899145b draft
Uploaded
author | schang |
---|---|
date | Tue, 02 Dec 2014 19:54:26 -0500 |
parents | 70e76f6bfcfb |
children | 6d94241370cc |
files | SAM_parser.py |
diffstat | 1 files changed, 123 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SAM_parser.py Tue Dec 02 19:54:26 2014 -0500 @@ -0,0 +1,123 @@ +#!/usr/bin/env python +""" +Parse SAM file format from Bowtie2. + +Features: + - remove header lines using bowtie option --no-hd, and keep only aligned reads --no-unal + - the refseq field is assumed to look like: + gi|228472346|ref|NZ_ACLQ01000011.1| + which is common in HMP accessions to SRA. In this case, the string + starting with "NZ_" is treated as the refseq +""" + +from __future__ import division +import sys, getopt + +class SAMFragment: + """ + Represents a fragment match to a reference genome. + """ + + def __init__(self, sam_string): + '''Look at SAM file format documentation for more details''' + fields = sam_string.rstrip().split() + + self.name = fields[0] #sample run name (QNAME) + self.refseq = fields[2].split('|')[3] # Reference genome name (Accession#) or RNAME +# self.refseq = fields[2] + self.location = int(fields[3]) # POS + self.cigar = fields[5] # CIGAR + self.sequence = fields[9] # SEQ + self.length = len(self.sequence) # length of sequence + + for field in fields[11:]: + # Parses the OPT field that contains the MD tag for identity info. + if field[0:5] == "MD:Z:": + md = field[5:] + # warning: finite state machine ahead! + # state = 0: nothing to remember + # state = 1: read a number, looking to see if next is a number + # state = 2: read a "^", looking for A, T, C, and G for deletion + state = 0 + tempstr = '' + matches = 0 + mismatches = 0 + deletions = 0 + for char in md: + if char in ('0', '1', '2', '3', '4', '5', '6', '7', + '8', '9'): + if state == 2: + deletions += len(tempstr) + tempstr = '' + state = 1 + tempstr += char + elif char == '^': + if state == 1: + matches += int(tempstr) + tempstr = '' + state = 2 + elif char in ('A', 'T', 'C', 'G'): + if state == 1: + matches += int(tempstr) + tempstr = '' + state = 0 + mismatches += 1 + elif state == 2: + tempstr += char + else: + mismatches += 1 + if state == 1: + matches += int(tempstr) + elif state == 2: + deletions += len(tempstr) + + self.matches = matches + self.mismatches = mismatches + self.deletions = deletions + self.identity = 100 * (matches / (matches + mismatches)) + + def __repr__(self): + out = [] + out.append("Name: %s\n" % self.name) + out.append("Location: %d\n" % self.location) + out.append("Identity: %4.1f\n" % self.identity) + return ''.join(out) + + +def SAMFile(filename): + """ + Parses a SAM format file, returning SAMFragment objects. + + This is a generator, so that there is no need to store a full list of + fragments in memory if it's not necessary to do so. + """ + + f = open(filename, 'rb') + + for line in f: + if line.startswith('@'): + continue + if not line.startswith("Your job"): # preprocessing error on my part + yield SAMFragment(line) + + f.close() + + +if __name__ == '__main__': + opts, args = getopt.getopt(sys.argv[1:], 'h') + + for o, a in opts: + if o == '-h': + print ('Usage:') + print (' %s <sam_file>' % sys.argv[0]) + sys.exit(0) + else: + print ('unhandled option') + sys.exit(1) + + if len(args) == 0: + print ('Specify a SAM file as an argument') + sys.exit(1) + + for frag in SAMFile(args[0]): + print (frag)