comparison SMART/Java/Python/CompareOverlappingSmallQuery.py @ 36:44d5973c188c

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 15:02:29 -0400
parents
children 169d364ddd91
comparison
equal deleted inserted replaced
35:d94018ca4ada 36:44d5973c188c
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2011
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 from optparse import OptionParser
32 from commons.core.parsing.ParserChooser import ParserChooser
33 from commons.core.writer.TranscriptWriter import TranscriptWriter
34 from SMART.Java.Python.structure.Interval import Interval
35 from SMART.Java.Python.structure.Transcript import Transcript
36 from SMART.Java.Python.structure.Mapping import Mapping
37 from SMART.Java.Python.misc.Progress import Progress
38 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
39
40 MINBIN = 3
41 MAXBIN = 7
42 REFERENCE = 0
43 QUERY = 1
44
45 def getBin(start, end):
46 for i in range(MINBIN, MAXBIN + 1):
47 binLevel = 10 ** i
48 if int(start / binLevel) == int(end / binLevel):
49 return int(i * 10 ** (MAXBIN + 1) + int(start / binLevel))
50 return int((MAXBIN + 1) * 10 ** (MAXBIN + 1))
51
52 def getOverlappingBins(start, end):
53 array = []
54 bigBin = int((MAXBIN + 1) * 10 ** (MAXBIN + 1))
55 for i in range(MINBIN, MAXBIN + 1):
56 binLevel = 10 ** i
57 array.append((int(i * 10 ** (MAXBIN + 1) + int(start / binLevel)), int(i * 10 ** (MAXBIN + 1) + int(end / binLevel))))
58 array.append((bigBin, bigBin))
59 return array
60
61
62 class CompareOverlappingSmallQuery(object):
63
64 def __init__(self, verbosity):
65 self.verbosity = verbosity
66 self.tableNames = {}
67 self.nbQueries = 0
68 self.nbRefs = 0
69 self.nbWritten = 0
70 self.nbOverlaps = 0
71 self.distance = None
72 self.invert = False
73 self.antisense = False
74 self.collinear = False
75 self.pcOverlapQuery = False
76 self.pcOverlapRef = False
77 self.minOverlap = False
78 self.included = False
79 self.including = False
80 self.bins = {}
81 self.overlaps = {}
82 self.notOverlapping = False
83
84 def setReferenceFile(self, fileName, format):
85 chooser = ParserChooser(self.verbosity)
86 chooser.findFormat(format)
87 self.refParser = chooser.getParser(fileName)
88
89 def setQueryFile(self, fileName, format):
90 chooser = ParserChooser(self.verbosity)
91 chooser.findFormat(format)
92 self.queryParser = chooser.getParser(fileName)
93
94 def setOutputFile(self, fileName):
95 self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)
96
97 def setDistance(self, distance):
98 self.distance = distance
99
100 def setInvert(self, boolean):
101 self.invert = boolean
102
103 def setCollinear(self, boolean):
104 self.collinear = boolean
105
106 def setAntisense(self, boolean):
107 self.antisense = boolean
108
109 def setMinPercentOverlap(self, pcOverlapQuery, pcOverlapRef):
110 self.pcOverlapQuery = pcOverlapQuery
111 self.pcOverlapRef = pcOverlapRef
112
113 def setMinOverlap(self, minOverlap):
114 self.minOverlap = minOverlap
115
116 def setInclude(self, included, including):
117 self.included = included
118 self.including = including
119
120 def includeNotOverlapping(self, boolean):
121 self.notOverlapping = boolean
122
123 def loadQuery(self):
124 progress = UnlimitedProgress(10000, "Reading queries", self.verbosity)
125 for transcript in self.queryParser.getIterator():
126 if transcript.__class__.__name__ == "Mapping":
127 transcript = transcript.getTranscript()
128 chromosome = transcript.getChromosome()
129 bin = getBin(transcript.getStart(), transcript.getEnd())
130 if chromosome not in self.bins:
131 self.bins[chromosome] = {}
132 if bin not in self.bins[chromosome]:
133 self.bins[chromosome][bin] = []
134 self.bins[chromosome][bin].append(transcript)
135 if self.notOverlapping or self.invert:
136 self.overlaps[transcript] = {}
137 self.nbQueries += 1
138 progress.inc()
139 progress.done()
140
141 def _compareTwoTranscripts(self, queryTranscript, refTranscript):
142 if not queryTranscript.overlapWithExon(refTranscript):
143 return False
144 if self.collinear and queryTranscript.getDirection() != refTranscript.getDirection():
145 return False
146 if self.antisense and queryTranscript.getDirection() == refTranscript.getDirection():
147 return False
148 if self.included and not refTranscript.include(queryTranscript):
149 return False
150 if self.including and not queryTranscript.include(refTranscript):
151 return False
152 querySize = queryTranscript.getSize()
153 if self.pcOverlapQuery and not queryTranscript.overlapWithExon(refTranscript, int(querySize * self.pcOverlapQuery / 100.0)):
154 return False
155 refSize = refTranscript.getSize()
156 if self.pcOverlapRef and not queryTranscript.overlapWithExon(refTranscript, int(refSize * self.pcOverlapRef / 100.0)):
157 return False
158 if self.minOverlap and not queryTranscript.overlapWithExon(refTranscript, self.minOverlap):
159 return False
160 return True
161
162 def _alterTranscript(self, transcript, type):
163 if type == REFERENCE:
164 if self.distance != None:
165 transcript.extendExons(self.distance)
166 return transcript
167
168 def _compareTranscript(self, refTranscript):
169 refChromosome = refTranscript.getChromosome()
170 if refChromosome not in self.bins:
171 return []
172 refStart = refTranscript.getStart()
173 refEnd = refTranscript.getEnd()
174 bins = getOverlappingBins(refStart, refEnd)
175 for binRange in bins:
176 for bin in range(binRange[0], binRange[1]+1):
177 if bin not in self.bins[refChromosome]:
178 continue
179 for queryTranscript in self.bins[refChromosome][bin]:
180 if self._compareTwoTranscripts(queryTranscript, refTranscript):
181 if queryTranscript not in self.overlaps:
182 self.overlaps[queryTranscript] = {}
183 nbElements = int(float(refTranscript.getTagValue("nbElements"))) if "nbElements" in refTranscript.getTagNames() else 1
184 self.overlaps[queryTranscript][refTranscript.getName()] = int(float(refTranscript.getTagValue("nbElements"))) if "nbElements" in refTranscript.getTagNames() else 1
185 self.nbOverlaps += nbElements
186
187 def _updateTranscript(self, queryTranscript):
188 overlaps = self.overlaps[queryTranscript]
189 queryTranscript.setTagValue("nbOverlaps", sum(overlaps.values()))
190 if overlaps:
191 queryTranscript.setTagValue("overlapsWith", "--".join(overlaps.keys())[:100])
192 return queryTranscript
193
194 def compare(self):
195 progress = UnlimitedProgress(10000, "Comparing references", self.verbosity)
196 for refTranscript in self.refParser.getIterator():
197 if refTranscript.__class__.__name__ == "Mapping":
198 refTranscript = refTranscript.getTranscript()
199 refTranscript = self._alterTranscript(refTranscript, REFERENCE)
200 self._compareTranscript(refTranscript)
201 self.nbRefs += 1
202 progress.inc()
203 progress.done()
204
205 def printResults(self):
206 for transcript in self.overlaps:
207 if not self.invert or not self.overlaps[transcript]:
208 if not self.invert:
209 transcript = self._updateTranscript(transcript)
210 self.writer.addTranscript(transcript)
211 self.nbWritten += 1
212 self.writer.close()
213
214 def displayResults(self):
215 if self.verbosity:
216 print "# queries: %d" % (self.nbQueries)
217 print "# refs: %d" % (self.nbRefs)
218 print "# written: %d (%d overlaps)" % (self.nbWritten, self.nbOverlaps)
219
220 def run(self):
221 self.loadQuery()
222 self.compare()
223 self.printResults()
224 self.displayResults()
225
226 if __name__ == "__main__":
227
228 description = "Compare Overlapping Small Query v1.0.1: Provide the queries that overlap with a reference, when the query is small. [Category: Data Comparison]"
229
230 parser = OptionParser(description = description)
231 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
232 parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
233 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
234 parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
235 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
236 parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]")
237 parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="accept some distance between query and reference [format: int]")
238 parser.add_option("-c", "--collinear", dest="collinear", action="store_true", default=False, help="provide collinear features [format: bool] [default: false]")
239 parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="provide antisense features [format: bool] [default: false]")
240 parser.add_option("-m", "--minOverlap", dest="minOverlap", action="store", default=False, type="int", help="min. #nt overlap [format: bool] [default: false]")
241 parser.add_option("-p", "--pcOverlapQuery", dest="pcOverlapQuery", action="store", default=False, type="int", help="min. % overlap of the query [format: bool] [default: false]")
242 parser.add_option("-P", "--pcOverlapRef", dest="pcOverlapRef", action="store", default=False, type="int", help="min. % overlap of the reference [format: bool] [default: false]")
243 parser.add_option("-k", "--included", dest="included", action="store_true", default=False, help="provide query elements which are nested in reference elements [format: bool] [default: false]")
244 parser.add_option("-K", "--including", dest="including", action="store_true", default=False, help="provide query elements in which reference elements are nested [format: bool] [default: false]")
245 parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]")
246 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
247 (options, args) = parser.parse_args()
248
249 cosq = CompareOverlappingSmallQuery(options.verbosity)
250 cosq.setQueryFile(options.inputFileName1, options.format1)
251 cosq.setReferenceFile(options.inputFileName2, options.format2)
252 cosq.setOutputFile(options.outputFileName)
253 cosq.includeNotOverlapping(options.notOverlapping)
254 cosq.setDistance(options.distance)
255 cosq.setCollinear(options.collinear)
256 cosq.setAntisense(options.antisense)
257 cosq.setMinPercentOverlap(options.pcOverlapQuery, options.pcOverlapRef)
258 cosq.setMinOverlap(options.minOverlap)
259 cosq.setInclude(options.included, options.including)
260 cosq.setInvert(options.exclude)
261 cosq.run()