s_mart: commons/core/parsing/SamParser.py comparison

comparison commons/core/parsing/SamParser.py @ 36:44d5973c188c

Uploaded

author	m-zytnicki
date	Tue, 30 Apr 2013 15:02:29 -0400
parents	769e306b7933
children

comparison

equal deleted inserted replaced

-:d94018ca4ada
+:44d5973c188c
+#
+# Copyright INRA-URGI 2009-2010
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+import re
+import sys
+from commons.core.parsing.MapperParser import MapperParser
+from SMART.Java.Python.structure.Mapping import Mapping
+from SMART.Java.Python.structure.SubMapping import SubMapping
+from SMART.Java.Python.structure.Interval import Interval
+class SamParser(MapperParser):
+"""A class that parses SAM format (as given by BWA)"""
+def __init__(self, fileName, verbosity = 0):
+super(SamParser, self).__init__(fileName, verbosity)
+def __del__(self):
+super(SamParser, self).__del__()
+def getFileFormats():
+return ["sam"]
+getFileFormats = staticmethod(getFileFormats)
+def skipFirstLines(self):
+pass
+def getInfos(self):
+self.chromosomes = set()
+self.nbMappings  = 0
+self.size        = 0
+self.reset()
+if self.verbosity >= 10:
+print "Getting information on SAM file"
+self.reset()
+for line in self.handle:
+line = line.strip()
+if line == "" or line[0] == "@":
+continue
+parts      = line.split("\t")
+chromosome = parts[2]
+if chromosome != "*":
+self.chromosomes.add(chromosome)
+self.nbMappings += 1
+self.size += len(parts[8])
+if self.verbosity >= 10 and self.nbMappings % 100000 == 0:
+sys.stdout.write("    %d mappings read\r" % (self.nbMappings))
+sys.stdout.flush()
+self.reset()
+if self.verbosity >= 10:
+print "    %d mappings read" % (self.nbMappings)
+print "Done."
+def parseLine(self, line):
+line = line.strip()
+if line[0] == "@":
+return
+fields = line.split("\t")
+if len(fields) < 11:
+raise Exception("Line %d '%s' does not look like a SAM line (number of fields is %d instead of 11)" % (self.currentLineNb, line, len(fields)))
+name = fields[0]
+flag = int(fields[1])
+if (flag & 0x4) == 0x4:
+return None
+direction       = 1 if (flag & 0x10) == 0x0 else -1
+chromosome      = fields[2]
+genomeStart     = int(fields[3])
+quality         = fields[4]
+cigar           = fields[5]
+mate            = fields[6]
+mateGenomeStart = fields[7]
+gapSize         = fields[8]
+sequence        = fields[9]
+quality         = fields[10]
+tags            = fields[11:]
+if mateGenomeStart != "*":
+mateGenomeStart = int(mateGenomeStart)
+mapping       = Mapping()
+nbOccurrences = 1
+nbMismatches  = 0
+nbMatches     = 0
+nbGaps        = 0
+subMapping    = None
+queryOffset   = 0
+targetOffset  = 0
+currentNumber = 0
+readStart     = None
+for tag in tags:
+key = tag[:2]
+if key == "X0":
+nbOccurrences = int(tag[5:])
+elif key == "X1":
+nbOccurrences += int(tag[5:])
+elif key == "XM":
+nbMismatches = int(tag[5:])
+mapping.setTagValue("nbOccurrences", nbOccurrences)
+mapping.setTagValue("quality", int(fields[4]))
+for char in cigar:
+m = re.match(r"[0-9]", char)
+if m != None:
+currentNumber = currentNumber * 10 + (ord(char) - ord("0"))
+continue
+# match
+m = re.match(r"[M]", char)
+if m != None:
+if readStart == None:
+readStart = queryOffset
+if subMapping == None:
+subMapping = SubMapping()
+subMapping.setSize(currentNumber)
+subMapping.setDirection(direction)
+subMapping.queryInterval.setName(name)
+subMapping.queryInterval.setStart(queryOffset)
+subMapping.queryInterval.setDirection(direction)
+subMapping.targetInterval.setChromosome(chromosome)
+subMapping.targetInterval.setStart(genomeStart + targetOffset)
+subMapping.targetInterval.setDirection(1)
+nbMatches    += currentNumber
+targetOffset += currentNumber
+queryOffset  += currentNumber
+currentNumber = 0
+continue
+# insertion on the read
+m = re.match(r"[I]", char)
+if m != None:
+nbGaps       += 1
+queryOffset  += currentNumber
+currentNumber = 0
+continue
+# insertion on the genome
+m = re.match(r"[D]", char)
+if m != None:
+if subMapping != None:
+subMapping.queryInterval.setEnd(queryOffset - 1)
+subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
+mapping.addSubMapping(subMapping)
+subMapping    = None
+nbGaps       += 1
+targetOffset += currentNumber
+currentNumber = 0
+continue
+# intron
+m = re.match(r"[N]", char)
+if m != None:
+if subMapping != None:
+subMapping.queryInterval.setEnd(queryOffset - 1)
+subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
+mapping.addSubMapping(subMapping)
+subMapping    = None
+targetOffset += currentNumber
+currentNumber = 0
+continue
+# soft clipping (substitution)
+m = re.match(r"[S]", char)
+if m != None:
+nbMismatches += currentNumber
+targetOffset += currentNumber
+queryOffset  += currentNumber
+currentNumber = 0
+continue
+# hard clipping
+m = re.match(r"[H]", char)
+if m != None:
+targetOffset += currentNumber
+queryOffset  += currentNumber
+currentNumber = 0
+continue
+# padding
+m = re.match(r"[P]", char)
+if m != None:
+continue
+raise Exception("Do not understand paramer '%s' in line %s" % (char, line))
+if subMapping != None:
+subMapping.queryInterval.setEnd(queryOffset - 1)
+subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
+mapping.addSubMapping(subMapping)
+mapping.queryInterval.setStart(readStart)
+mapping.queryInterval.setEnd(queryOffset - 1)
+mapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
+mapping.setNbMismatches(nbMismatches)
+mapping.setNbGaps(nbGaps)
+mapping.queryInterval.setName(name)
+mapping.queryInterval.setDirection(direction)
+mapping.targetInterval.setChromosome(chromosome)
+mapping.targetInterval.setStart(genomeStart)
+mapping.targetInterval.setDirection(direction)
+mapping.setSize(len(sequence))
+mapping.setDirection(direction)
+return mapping

Mercurial > repos > yufei-luo > s_mart

comparison commons/core/parsing/SamParser.py @ 36:44d5973c188c