annotate commons/core/parsing/AxtParser.py @ 68:85e80c21b1f7 draft

Uploaded
author m-zytnicki
date Mon, 16 Nov 2015 12:00:32 -0500
parents 44d5973c188c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
36
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
1 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
2 # Copyright INRA-URGI 2009-2010
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
3 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
4 # This software is governed by the CeCILL license under French law and
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
5 # abiding by the rules of distribution of free software. You can use,
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
6 # modify and/ or redistribute the software under the terms of the CeCILL
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
7 # license as circulated by CEA, CNRS and INRIA at the following URL
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
8 # "http://www.cecill.info".
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
9 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
10 # As a counterpart to the access to the source code and rights to copy,
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
11 # modify and redistribute granted by the license, users are provided only
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
12 # with a limited warranty and the software's author, the holder of the
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
13 # economic rights, and the successive licensors have only limited
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
14 # liability.
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
15 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
16 # In this respect, the user's attention is drawn to the risks associated
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
17 # with loading, using, modifying and/or developing or reproducing the
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
18 # software by the user in light of its specific status of free software,
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
19 # that may mean that it is complicated to manipulate, and that also
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
20 # therefore means that it is reserved for developers and experienced
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
21 # professionals having in-depth computer knowledge. Users are therefore
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
22 # encouraged to load and test the software's suitability as regards their
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
23 # requirements in conditions enabling the security of their systems and/or
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
24 # data to be ensured and, more generally, to use and operate it in the
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
25 # same conditions as regards security.
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
26 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
27 # The fact that you are presently reading this means that you have had
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
28 # knowledge of the CeCILL license and that you accept its terms.
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
29 #
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
30 import re
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
31 import sys
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
32 from SMART.Java.Python.structure.Mapping import Mapping
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
33 from SMART.Java.Python.structure.SubMapping import SubMapping
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
34 from commons.core.parsing.MapperParser import MapperParser
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
35 from SMART.Java.Python.misc import Utils
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
36 from SMART.Java.Python.misc.Utils import getHammingDistance
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
37
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
38
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
class AxtParser(MapperParser):
    """A class that parses AXT (as given by Mosaik)

    An alignment is read as a group of three non-empty lines: one header
    line followed by two sequence lines.  The header is recognised by
    ``_HEADER_PATTERN``; the two following lines are compared with
    ``getHammingDistance`` to fill in the number of mismatches.
    """

    # Header line: a number, the target name with its two coordinates, the
    # query name with its two coordinates, the strand (+ or -) and a final
    # number.  Only names, coordinates and strand are captured.
    _HEADER_PATTERN = re.compile(r"^\s*\d+\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s+\d+\s*$")

    def __init__(self, fileName, verbosity = 0):
        """Open `fileName` through the parent parser and reset the line buffers."""
        super(AxtParser, self).__init__(fileName, verbosity)
        # First and second sequence lines of the alignment being read.
        self.queryLine = None
        self.subjectLine = None

    def __del__(self):
        super(AxtParser, self).__del__()

    @staticmethod
    def getFileFormats():
        """Return the list of format names handled by this parser."""
        return ["axt"]

    def skipFirstLines(self):
        """Nothing to skip: the parser starts reading at the first line."""
        pass

    def getInfos(self):
        """Scan the whole file and collect global statistics.

        Sets ``self.chromosomes`` (set of the second field of every header),
        ``self.nbMappings`` (number of headers) and ``self.size`` (sum of
        the seventh field of every header), then rewinds with ``reset()``.
        Progress is printed when ``self.verbosity`` is at least 10.
        """
        self.chromosomes = set()
        self.nbMappings = 0
        self.size = 0
        nbLinesSeen = 0
        self.reset()
        for line in self.handle:
            line = line.strip()
            if not line:
                continue
            isHeader = (nbLinesSeen % 3 == 0)
            nbLinesSeen += 1
            if not isHeader:
                continue
            # Split on any whitespace, like the header pattern of parseLine.
            fields = line.split()
            self.chromosomes.add(fields[1])
            # NOTE(review): field 6 is the second query coordinate in the
            # header pattern, not a length -- confirm this is the intended
            # meaning of "size" before relying on it.
            self.size += int(fields[6])
            self.nbMappings += 1
            if self.verbosity >= 10 and self.nbMappings % 100000 == 0:
                sys.stdout.write(" %d mappings read\r" % (self.nbMappings))
                sys.stdout.flush()
        self.reset()
        if self.verbosity >= 10:
            # Single-argument print calls behave the same on Python 2 and 3.
            print(" %d mappings read" % (self.nbMappings))
            print("Done.")

    def _buildMapping(self, match):
        """Build a Mapping holding one SubMapping from a header match."""
        targetName = match.group(1)
        queryName = match.group(4)
        strand = match.group(7)
        # Coordinates are normalised so that start <= end.
        targetBounds = (int(match.group(2)), int(match.group(3)))
        queryBounds = (int(match.group(5)), int(match.group(6)))
        targetStart, targetEnd = min(targetBounds), max(targetBounds)
        queryStart, queryEnd = min(queryBounds), max(queryBounds)

        mapping = Mapping()
        subMapping = SubMapping()

        subMapping.queryInterval.setName(queryName)
        subMapping.queryInterval.setStart(queryStart)
        subMapping.queryInterval.setEnd(queryEnd)
        subMapping.queryInterval.setDirection(strand)

        subMapping.targetInterval.setChromosome(targetName)
        subMapping.targetInterval.setStart(targetStart)
        subMapping.targetInterval.setEnd(targetEnd)
        subMapping.targetInterval.setDirection(1)

        subMapping.setSize(min(subMapping.targetInterval.getSize(), subMapping.queryInterval.getSize()))
        subMapping.setDirection(strand)

        mapping.addSubMapping(subMapping)

        mapping.setDirection(strand)
        mapping.targetInterval.setChromosome(targetName)
        mapping.targetInterval.setStart(targetStart)
        mapping.targetInterval.setEnd(targetEnd)

        mapping.queryInterval.setName(queryName)
        mapping.queryInterval.setStart(queryStart)
        mapping.queryInterval.setEnd(queryEnd)

        mapping.setSize(min(mapping.targetInterval.getSize(), mapping.queryInterval.getSize()))
        return mapping

    def parseLine(self, line):
        """Consume one line of the file.

        Returns None while an alignment is incomplete (blank line, header
        line, or first sequence line) and the finished mapping once the
        second sequence line has been read.
        """
        if line.strip() == "":
            # Blank separator: pull at most one more line from the handle.
            for line in self.handle:
                self.currentLineNb += 1
                break
            if line.strip() == "":
                return None

        match = self._HEADER_PATTERN.search(line)
        if match is not None:
            self.currentMapping = self._buildMapping(match)
            return None

        if self.queryLine is None:
            # First sequence line: wait for its partner.
            self.queryLine = line
            return None

        self.subjectLine = line
        nbMismatches = getHammingDistance(self.queryLine, self.subjectLine)
        self.currentMapping.setNbMismatches(nbMismatches)
        # NOTE(review): the gap count is always reported as 0, whatever the
        # sequence lines contain -- confirm this is intended.
        self.currentMapping.setNbGaps(0)
        self.queryLine = None
        self.subjectLine = None
        return self.currentMapping
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
138
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
139
44d5973c188c Uploaded
m-zytnicki
parents:
diff changeset
140