Mercurial > repos > yufei-luo > s_mart
comparison commons/core/parsing/SamParser.py @ 6:769e306b7933
Change the repository level.
| author | yufei-luo | 
|---|---|
| date | Fri, 18 Jan 2013 04:54:14 -0500 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 5:ea3082881bf8 | 6:769e306b7933 | 
|---|---|
| 1 # | |
| 2 # Copyright INRA-URGI 2009-2010 | |
| 3 # | |
| 4 # This software is governed by the CeCILL license under French law and | |
| 5 # abiding by the rules of distribution of free software. You can use, | |
| 6 # modify and/ or redistribute the software under the terms of the CeCILL | |
| 7 # license as circulated by CEA, CNRS and INRIA at the following URL | |
| 8 # "http://www.cecill.info". | |
| 9 # | |
| 10 # As a counterpart to the access to the source code and rights to copy, | |
| 11 # modify and redistribute granted by the license, users are provided only | |
| 12 # with a limited warranty and the software's author, the holder of the | |
| 13 # economic rights, and the successive licensors have only limited | |
| 14 # liability. | |
| 15 # | |
| 16 # In this respect, the user's attention is drawn to the risks associated | |
| 17 # with loading, using, modifying and/or developing or reproducing the | |
| 18 # software by the user in light of its specific status of free software, | |
| 19 # that may mean that it is complicated to manipulate, and that also | |
| 20 # therefore means that it is reserved for developers and experienced | |
| 21 # professionals having in-depth computer knowledge. Users are therefore | |
| 22 # encouraged to load and test the software's suitability as regards their | |
| 23 # requirements in conditions enabling the security of their systems and/or | |
| 24 # data to be ensured and, more generally, to use and operate it in the | |
| 25 # same conditions as regards security. | |
| 26 # | |
| 27 # The fact that you are presently reading this means that you have had | |
| 28 # knowledge of the CeCILL license and that you accept its terms. | |
| 29 # | |
| 30 import re | |
| 31 import sys | |
| 32 from commons.core.parsing.MapperParser import MapperParser | |
| 33 from SMART.Java.Python.structure.Mapping import Mapping | |
| 34 from SMART.Java.Python.structure.SubMapping import SubMapping | |
| 35 from SMART.Java.Python.structure.Interval import Interval | |
| 36 | |
| 37 class SamParser(MapperParser): | |
| 38 """A class that parses SAM format (as given by BWA)""" | |
| 39 | |
| 40 def __init__(self, fileName, verbosity = 0): | |
| 41 super(SamParser, self).__init__(fileName, verbosity) | |
| 42 | |
| 43 | |
| 44 def __del__(self): | |
| 45 super(SamParser, self).__del__() | |
| 46 | |
| 47 | |
| 48 def getFileFormats(): | |
| 49 return ["sam"] | |
| 50 getFileFormats = staticmethod(getFileFormats) | |
| 51 | |
| 52 | |
| 53 def skipFirstLines(self): | |
| 54 pass | |
| 55 | |
| 56 | |
| 57 def getInfos(self): | |
| 58 self.chromosomes = set() | |
| 59 self.nbMappings = 0 | |
| 60 self.size = 0 | |
| 61 self.reset() | |
| 62 if self.verbosity >= 10: | |
| 63 print "Getting information on SAM file" | |
| 64 self.reset() | |
| 65 for line in self.handle: | |
| 66 line = line.strip() | |
| 67 if line == "" or line[0] == "@": | |
| 68 continue | |
| 69 parts = line.split("\t") | |
| 70 chromosome = parts[2] | |
| 71 if chromosome != "*": | |
| 72 self.chromosomes.add(chromosome) | |
| 73 self.nbMappings += 1 | |
| 74 self.size += len(parts[8]) | |
| 75 if self.verbosity >= 10 and self.nbMappings % 100000 == 0: | |
| 76 sys.stdout.write(" %d mappings read\r" % (self.nbMappings)) | |
| 77 sys.stdout.flush() | |
| 78 self.reset() | |
| 79 if self.verbosity >= 10: | |
| 80 print " %d mappings read" % (self.nbMappings) | |
| 81 print "Done." | |
| 82 | |
| 83 | |
| 84 def parseLine(self, line): | |
| 85 | |
| 86 line = line.strip() | |
| 87 if line[0] == "@": | |
| 88 return | |
| 89 | |
| 90 fields = line.split("\t") | |
| 91 if len(fields) < 11: | |
| 92 raise Exception("Line %d '%s' does not look like a SAM line (number of fields is %d instead of 11)" % (self.currentLineNb, line, len(fields))) | |
| 93 | |
| 94 name = fields[0] | |
| 95 flag = int(fields[1]) | |
| 96 | |
| 97 if (flag & 0x4) == 0x4: | |
| 98 return None | |
| 99 | |
| 100 direction = 1 if (flag & 0x10) == 0x0 else -1 | |
| 101 chromosome = fields[2] | |
| 102 genomeStart = int(fields[3]) | |
| 103 quality = fields[4] | |
| 104 cigar = fields[5] | |
| 105 mate = fields[6] | |
| 106 mateGenomeStart = fields[7] | |
| 107 gapSize = fields[8] | |
| 108 sequence = fields[9] | |
| 109 quality = fields[10] | |
| 110 tags = fields[11:] | |
| 111 | |
| 112 if mateGenomeStart != "*": | |
| 113 mateGenomeStart = int(mateGenomeStart) | |
| 114 | |
| 115 mapping = Mapping() | |
| 116 nbOccurrences = 1 | |
| 117 nbMismatches = 0 | |
| 118 nbMatches = 0 | |
| 119 nbGaps = 0 | |
| 120 subMapping = None | |
| 121 queryOffset = 0 | |
| 122 targetOffset = 0 | |
| 123 currentNumber = 0 | |
| 124 readStart = None | |
| 125 | |
| 126 for tag in tags: | |
| 127 key = tag[:2] | |
| 128 if key == "X0": | |
| 129 nbOccurrences = int(tag[5:]) | |
| 130 elif key == "X1": | |
| 131 nbOccurrences += int(tag[5:]) | |
| 132 elif key == "XM": | |
| 133 nbMismatches = int(tag[5:]) | |
| 134 mapping.setTagValue("nbOccurrences", nbOccurrences) | |
| 135 mapping.setTagValue("quality", int(fields[4])) | |
| 136 | |
| 137 for char in cigar: | |
| 138 m = re.match(r"[0-9]", char) | |
| 139 if m != None: | |
| 140 currentNumber = currentNumber * 10 + (ord(char) - ord("0")) | |
| 141 continue | |
| 142 # match | |
| 143 m = re.match(r"[M]", char) | |
| 144 if m != None: | |
| 145 if readStart == None: | |
| 146 readStart = queryOffset | |
| 147 if subMapping == None: | |
| 148 subMapping = SubMapping() | |
| 149 subMapping.setSize(currentNumber) | |
| 150 subMapping.setDirection(direction) | |
| 151 subMapping.queryInterval.setName(name) | |
| 152 subMapping.queryInterval.setStart(queryOffset) | |
| 153 subMapping.queryInterval.setDirection(direction) | |
| 154 subMapping.targetInterval.setChromosome(chromosome) | |
| 155 subMapping.targetInterval.setStart(genomeStart + targetOffset) | |
| 156 subMapping.targetInterval.setDirection(1) | |
| 157 nbMatches += currentNumber | |
| 158 targetOffset += currentNumber | |
| 159 queryOffset += currentNumber | |
| 160 currentNumber = 0 | |
| 161 continue | |
| 162 # insertion on the read | |
| 163 m = re.match(r"[I]", char) | |
| 164 if m != None: | |
| 165 nbGaps += 1 | |
| 166 queryOffset += currentNumber | |
| 167 currentNumber = 0 | |
| 168 continue | |
| 169 # insertion on the genome | |
| 170 m = re.match(r"[D]", char) | |
| 171 if m != None: | |
| 172 if subMapping != None: | |
| 173 subMapping.queryInterval.setEnd(queryOffset - 1) | |
| 174 subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1) | |
| 175 mapping.addSubMapping(subMapping) | |
| 176 subMapping = None | |
| 177 nbGaps += 1 | |
| 178 targetOffset += currentNumber | |
| 179 currentNumber = 0 | |
| 180 continue | |
| 181 # intron | |
| 182 m = re.match(r"[N]", char) | |
| 183 if m != None: | |
| 184 if subMapping != None: | |
| 185 subMapping.queryInterval.setEnd(queryOffset - 1) | |
| 186 subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1) | |
| 187 mapping.addSubMapping(subMapping) | |
| 188 subMapping = None | |
| 189 targetOffset += currentNumber | |
| 190 currentNumber = 0 | |
| 191 continue | |
| 192 # soft clipping (substitution) | |
| 193 m = re.match(r"[S]", char) | |
| 194 if m != None: | |
| 195 nbMismatches += currentNumber | |
| 196 targetOffset += currentNumber | |
| 197 queryOffset += currentNumber | |
| 198 currentNumber = 0 | |
| 199 continue | |
| 200 # hard clipping | |
| 201 m = re.match(r"[H]", char) | |
| 202 if m != None: | |
| 203 targetOffset += currentNumber | |
| 204 queryOffset += currentNumber | |
| 205 currentNumber = 0 | |
| 206 continue | |
| 207 # padding | |
| 208 m = re.match(r"[P]", char) | |
| 209 if m != None: | |
| 210 continue | |
| 211 raise Exception("Do not understand paramer '%s' in line %s" % (char, line)) | |
| 212 | |
| 213 if subMapping != None: | |
| 214 subMapping.queryInterval.setEnd(queryOffset - 1) | |
| 215 subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1) | |
| 216 mapping.addSubMapping(subMapping) | |
| 217 | |
| 218 mapping.queryInterval.setStart(readStart) | |
| 219 mapping.queryInterval.setEnd(queryOffset - 1) | |
| 220 mapping.targetInterval.setEnd(genomeStart + targetOffset - 1) | |
| 221 mapping.setNbMismatches(nbMismatches) | |
| 222 mapping.setNbGaps(nbGaps) | |
| 223 | |
| 224 mapping.queryInterval.setName(name) | |
| 225 mapping.queryInterval.setDirection(direction) | |
| 226 mapping.targetInterval.setChromosome(chromosome) | |
| 227 mapping.targetInterval.setStart(genomeStart) | |
| 228 mapping.targetInterval.setDirection(direction) | |
| 229 mapping.setSize(len(sequence)) | |
| 230 mapping.setDirection(direction) | |
| 231 | |
| 232 return mapping | |
| 233 | |
| 234 | 
