comparison smart_toolShed/SMART/Java/Python/CompareOverlappingSmallQuery.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2011
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 from optparse import OptionParser
32 from commons.core.parsing.ParserChooser import ParserChooser
33 from commons.core.writer.TranscriptWriter import TranscriptWriter
34 from SMART.Java.Python.structure.Interval import Interval
35 from SMART.Java.Python.structure.Transcript import Transcript
36 from SMART.Java.Python.structure.Mapping import Mapping
37 from SMART.Java.Python.misc.Progress import Progress
38 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
39
40 MINBIN = 3
41 MAXBIN = 7
42 REFERENCE = 0
43 QUERY = 1
44
45 def getBin(start, end):
46 for i in range(MINBIN, MAXBIN + 1):
47 binLevel = 10 ** i
48 if int(start / binLevel) == int(end / binLevel):
49 return int(i * 10 ** (MAXBIN + 1) + int(start / binLevel))
50 return int((MAXBIN + 1) * 10 ** (MAXBIN + 1))
51
52 def getOverlappingBins(start, end):
53 array = []
54 bigBin = int((MAXBIN + 1) * 10 ** (MAXBIN + 1))
55 for i in range(MINBIN, MAXBIN + 1):
56 binLevel = 10 ** i
57 array.append((int(i * 10 ** (MAXBIN + 1) + int(start / binLevel)), int(i * 10 ** (MAXBIN + 1) + int(end / binLevel))))
58 array.append((bigBin, bigBin))
59 return array
60
61
62 class CompareOverlappingSmallQuery(object):
63
64 def __init__(self, verbosity):
65 self.verbosity = verbosity
66 self.tableNames = {}
67 self.nbQueries = 0
68 self.nbRefs = 0
69 self.nbWritten = 0
70 self.nbOverlaps = 0
71 self.distance = None
72 self.invert = False
73 self.antisense = False
74 self.collinear = False
75 self.bins = {}
76 self.overlaps = {}
77 self.notOverlapping = False
78
79 def setReferenceFile(self, fileName, format):
80 chooser = ParserChooser(self.verbosity)
81 chooser.findFormat(format)
82 self.refParser = chooser.getParser(fileName)
83
84 def setQueryFile(self, fileName, format):
85 chooser = ParserChooser(self.verbosity)
86 chooser.findFormat(format)
87 self.queryParser = chooser.getParser(fileName)
88
89 def setOutputFile(self, fileName):
90 self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
91
92 def setDistance(self, distance):
93 self.distance = distance
94
95 def setInvert(self, boolean):
96 self.invert = boolean
97
98 def setCollinear(self, boolean):
99 self.collinear = boolean
100
101 def setAntisense(self, boolean):
102 self.antisense = boolean
103
104 def includeNotOverlapping(self, boolean):
105 self.notOverlapping = boolean
106
107 def loadQuery(self):
108 progress = UnlimitedProgress(10000, "Reading queries", self.verbosity)
109 for transcript in self.queryParser.getIterator():
110 if transcript.__class__.__name__ == "Mapping":
111 transcript = transcript.getTranscript()
112 chromosome = transcript.getChromosome()
113 bin = getBin(transcript.getStart(), transcript.getEnd())
114 if chromosome not in self.bins:
115 self.bins[chromosome] = {}
116 if bin not in self.bins[chromosome]:
117 self.bins[chromosome][bin] = []
118 self.bins[chromosome][bin].append(transcript)
119 if self.notOverlapping or self.invert:
120 self.overlaps[transcript] = {}
121 self.nbQueries += 1
122 progress.inc()
123 progress.done()
124
125 def _compareTwoTranscripts(self, queryTranscript, refTranscript):
126 if not queryTranscript.overlapWithExon(refTranscript):
127 return False
128 if self.collinear and queryTranscript.getDirection() != refTranscript.getDirection():
129 return False
130 if self.antisense and queryTranscript.getDirection() == refTranscript.getDirection():
131 return False
132 return True
133
134 def _alterTranscript(self, transcript, type):
135 if type == REFERENCE:
136 if self.distance != None:
137 transcript.extendExons(self.distance)
138 return transcript
139
140 def _compareTranscript(self, refTranscript):
141 refChromosome = refTranscript.getChromosome()
142 if refChromosome not in self.bins:
143 return []
144 refStart = refTranscript.getStart()
145 refEnd = refTranscript.getEnd()
146 bins = getOverlappingBins(refStart, refEnd)
147 for binRange in bins:
148 for bin in range(binRange[0], binRange[1]+1):
149 if bin not in self.bins[refChromosome]:
150 continue
151 for queryTranscript in self.bins[refChromosome][bin]:
152 if self._compareTwoTranscripts(queryTranscript, refTranscript):
153 if queryTranscript not in self.overlaps:
154 self.overlaps[queryTranscript] = {}
155 nbElements = int(float(refTranscript.getTagValue("nbElements"))) if "nbElements" in refTranscript.getTagNames() else 1
156 self.overlaps[queryTranscript][refTranscript.getName()] = int(float(refTranscript.getTagValue("nbElements"))) if "nbElements" in refTranscript.getTagNames() else 1
157 self.nbOverlaps += nbElements
158
159 def _updateTranscript(self, queryTranscript):
160 overlaps = self.overlaps[queryTranscript]
161 queryTranscript.setTagValue("nbOverlaps", sum(overlaps.values()))
162 if overlaps:
163 queryTranscript.setTagValue("overlapsWith", "--".join(overlaps.keys())[:100])
164 return queryTranscript
165
166 def compare(self):
167 progress = UnlimitedProgress(10000, "Comparing references", self.verbosity)
168 for refTranscript in self.refParser.getIterator():
169 if refTranscript.__class__.__name__ == "Mapping":
170 refTranscript = refTranscript.getTranscript()
171 refTranscript = self._alterTranscript(refTranscript, REFERENCE)
172 self._compareTranscript(refTranscript)
173 self.nbRefs += 1
174 progress.inc()
175 progress.done()
176
177 def printResults(self):
178 for transcript in self.overlaps:
179 if not self.invert or not self.overlaps[transcript]:
180 if not self.invert:
181 transcript = self._updateTranscript(transcript)
182 self.writer.addTranscript(transcript)
183 self.nbWritten += 1
184 self.writer.close()
185
186 def displayResults(self):
187 print "# queries: %d" % (self.nbQueries)
188 print "# refs: %d" % (self.nbRefs)
189 print "# written: %d (%d overlaps)" % (self.nbWritten, self.nbOverlaps)
190
191 def run(self):
192 self.loadQuery()
193 self.compare()
194 self.printResults()
195 self.displayResults()
196
197 if __name__ == "__main__":
198
199 description = "Compare Overlapping Small Query v1.0.1: Provide the queries that overlap with a reference, when the query is small. [Category: Data Comparison]"
200
201 parser = OptionParser(description = description)
202 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
203 parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
204 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
205 parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
206 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
207 parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]")
208 parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="accept some distance between query and reference [format: int]")
209 parser.add_option("-c", "--collinear", dest="collinear", action="store_true", default=False, help="provide collinear features [format: bool] [default: false]")
210 parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="provide antisense features [format: bool] [default: false]")
211 parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]")
212 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
213 (options, args) = parser.parse_args()
214
215 cosq = CompareOverlappingSmallQuery(options.verbosity)
216 cosq.setQueryFile(options.inputFileName1, options.format1)
217 cosq.setReferenceFile(options.inputFileName2, options.format2)
218 cosq.setOutputFile(options.outputFileName)
219 cosq.includeNotOverlapping(options.notOverlapping)
220 cosq.setDistance(options.distance)
221 cosq.setCollinear(options.collinear)
222 cosq.setAntisense(options.antisense)
223 cosq.setInvert(options.exclude)
224 cosq.run()
225
226