comparison SMART/Java/Python/RestrictFromCoverage.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children
comparison
equal deleted inserted replaced
5:ea3082881bf8 6:769e306b7933
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2012
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 import os, struct, time, random
32 from optparse import OptionParser
33 from commons.core.parsing.ParserChooser import ParserChooser
34 from commons.core.writer.Gff3Writer import Gff3Writer
35 from SMART.Java.Python.structure.Transcript import Transcript
36 from SMART.Java.Python.structure.Interval import Interval
37 from SMART.Java.Python.ncList.NCList import NCList
38 from SMART.Java.Python.ncList.NCListCursor import NCListCursor
39 from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
40 from SMART.Java.Python.ncList.FileSorter import FileSorter
41 from SMART.Java.Python.misc.Progress import Progress
42 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
43 from SMART.Java.Python.misc import Utils
44 try:
45 import cPickle as pickle
46 except:
47 import pickle
48
49 REFERENCE = 0
50 QUERY = 1
51 TYPES = (REFERENCE, QUERY)
52 TYPETOSTRING = {0: "reference", 1: "query"}
53
54 class RestrictFromCoverage(object):
55
56 def __init__(self, verbosity = 1):
57 self._verbosity = verbosity
58 self._randomNumber = random.randint(0, 100000)
59 self._nbWritten = 0
60 self._nbLines = dict([type, 0] for type in TYPES)
61 self._splittedFileNames = dict([type, {}] for type in TYPES)
62 self._nbElementsPerChromosome = dict([type, {}] for type in TYPES)
63 self._nbElements = dict([type, 0] for type in TYPES)
64
65 def __del__(self):
66 pass
67
68 def _close(self):
69 self._writer.close()
70
71 def setInputFileName(self, fileName, format, type):
72 chooser = ParserChooser(self._verbosity)
73 chooser.findFormat(format)
74 parser = chooser.getParser(fileName)
75 sortedFileName = "%s_%d_%d_sorted.pkl" % (os.path.splitext(fileName)[0], self._randomNumber, type)
76 if self._verbosity > 2:
77 print "Preparing %s file..." % (TYPETOSTRING[type])
78 startTime = time.time()
79 fs = FileSorter(parser, self._verbosity-1)
80 fs.perChromosome(True)
81 fs.setOutputFileName(sortedFileName)
82 fs.sort()
83 self._nbLines[type] = fs.getNbElements()
84 self._splittedFileNames[type] = fs.getOutputFileNames()
85 self._nbElementsPerChromosome[type] = fs.getNbElementsPerChromosome()
86 self._nbElements[type] = fs.getNbElements()
87 endTime = time.time()
88 if self._verbosity > 2:
89 print " ...done (%ds)" % (endTime - startTime)
90
91 def setOutputFileName(self, outputFileName):
92 self._writer = Gff3Writer(outputFileName)
93
94 def setPercent(self, minPercent, maxPercent):
95 self._minPercent = minPercent
96 self._maxPercent = maxPercent
97
98 def setNbNucleotides(self, minNb, maxNb):
99 self._minNucleotides = minNb
100 self._maxNucleotides = maxNb
101
102 def setOverlap(self, minOverlap, maxOverlap):
103 self._minOverlap = minOverlap
104 self._maxOverlap = maxOverlap
105
106 def setStrands(self, boolean):
107 self._twoStrands = boolean
108
109 def _compareChromosome(self, chromosome):
110 firstOverlap = 0
111 parser1 = NCListFileUnpickle(self._splittedFileNames[QUERY][chromosome], self._verbosity)
112 parser2 = NCListFileUnpickle(self._splittedFileNames[REFERENCE][chromosome], self._verbosity)
113 progress = Progress(self._nbElementsPerChromosome[QUERY][chromosome], "Analyzing %s" % (chromosome), self._verbosity)
114 for transcript1 in parser1.getIterator():
115 firstOverlap = self._compareList(transcript1, parser2)
116 parser2.setInitAddress(firstOverlap)
117 progress.inc()
118 progress.done()
119
120 def _compareList(self, transcript1, parser2):
121 values = []
122 for exon in transcript1.getExons():
123 values.append([0.0] * exon.getSize())
124 firstOverlap = None
125 for transcript2 in parser2.getIterator():
126 address = parser2.getCurrentTranscriptAddress()
127 nbElements = float(transcript2.getTagValue("nbElements")) if "nbElements" in transcript2.getTagNames() else 1.0
128 nbOccurrences = float(transcript2.getTagValue("nbOccurrences")) if "nbOccurrences" in transcript2.getTagNames() else 1.0
129 nbElements /= nbOccurrences
130 if transcript2.getStart() > transcript1.getEnd():
131 if firstOverlap == None:
132 firstOverlap = address
133 if self._checkValues(values):
134 self._printTranscript(transcript1)
135 return firstOverlap
136 elif transcript1.overlapWith(transcript2):
137 if firstOverlap == None:
138 firstOverlap = address
139 values = self._compareTranscript(transcript1, transcript2, values, nbElements)
140 if self._checkValues(values):
141 self._printTranscript(transcript1)
142 return firstOverlap
143
144 def _compareTranscript(self, transcript1, transcript2, values, nbElements):
145 if not transcript1.overlapWith(transcript2) or ((self._twoStrands) and transcript1.getDirection() != transcript2.getDirection()):
146 return values
147 for id1, exon1 in enumerate(transcript1.getExons()):
148 for exon2 in transcript2.getExons():
149 values[id1] = map(sum, zip(values[id1], self._compareExon(exon1, exon2, nbElements)))
150 return values
151
152 def _compareExon(self, exon1, exon2, nbElements):
153 array = [0.0] * exon1.getSize()
154 if not exon1.overlapWith(exon2) or ((self._twoStrands) and exon1.getDirection() != exon2.getDirection()):
155 return array
156 for pos in range(max(exon1.getStart(), exon2.getStart()) - exon1.getStart(), min(exon1.getEnd(), exon2.getEnd()) - exon1.getStart()+1):
157 array[pos] += nbElements
158 return array
159
160 def _filter(self, value):
161 if self._minOverlap and self._maxOverlap:
162 return self._minOverlap <= value <= self._maxOverlap
163 if self._minOverlap:
164 return self._minOverlap <= value
165 if self._maxOverlap:
166 return value <= self._maxOverlap
167 return True
168
169 def _checkValues(self, values):
170 nbValues = sum(map(len, values))
171 nbPosValues = sum(map(len, [filter(self._filter, valuePart) for valuePart in values]))
172 ratio = float(nbPosValues) / nbValues * 100
173 if self._minNucleotides and nbPosValues < self._minNucleotides:
174 return False
175 if self._maxNucleotides and nbPosValues > self._maxNucleotides:
176 return False
177 if self._minPercent and ratio < self._minPercent:
178 return False
179 if self._maxPercent and ratio > self._maxPercent:
180 return False
181 return True
182
183 def _printTranscript(self, transcript):
184 self._writer.addTranscript(transcript)
185 self._nbWritten += 1
186
187 def run(self):
188 for chromosome in sorted(self._splittedFileNames[QUERY].keys()):
189 self._compareChromosome(chromosome)
190 self._close()
191 if self._verbosity > 0:
192 print "# queries: %d" % (self._nbElements[QUERY])
193 print "# refs: %d" % (self._nbElements[REFERENCE])
194 print "# written: %d (%d%%)" % (self._nbWritten, 0 if self._nbElements[QUERY] == 0 else round(float(self._nbWritten) / self._nbElements[QUERY] * 100))
195
196
197 if __name__ == "__main__":
198 description = "Restrict From Coverage v1.0.0: Select the elements from the first set which have a given coverage. [Category: Data Comparison]"
199
200 parser = OptionParser(description = description)
201 parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
202 parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]")
203 parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
204 parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]")
205 parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]")
206 parser.add_option("-n", "--minNucleotides", dest="minNucleotides", action="store", default=None, type="int", help="minimum number of nucleotides overlapping to declare an overlap [format: int]")
207 parser.add_option("-N", "--maxNucleotides", dest="maxNucleotides", action="store", default=None, type="int", help="maximum number of nucleotides overlapping to declare an overlap [format: int]")
208 parser.add_option("-p", "--minPercent", dest="minPercent", action="store", default=None, type="int", help="minimum percentage of nucleotides overlapping to declare an overlap [format: int]")
209 parser.add_option("-P", "--maxPercent", dest="maxPercent", action="store", default=None, type="int", help="maximum percentage of nucleotides overlapping to declare an overlap [format: int]")
210 parser.add_option("-e", "--minOverlap", dest="minOverlap", action="store", default=None, type="int", help="minimum number of elements from 2nd file to declare an overlap [format: int]")
211 parser.add_option("-E", "--maxOverlap", dest="maxOverlap", action="store", default=None, type="int", help="maximum number of elements from 2nd file to declare an overlap [format: int]")
212 parser.add_option("-s", "--strands", dest="strands", action="store_true", default=False, help="consider the two strands separately [format: bool] [default: false]")
213 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
214 (options, args) = parser.parse_args()
215
216 rfc = RestrictFromCoverage(options.verbosity)
217 rfc.setInputFileName(options.inputFileName1, options.format1, QUERY)
218 rfc.setInputFileName(options.inputFileName2, options.format2, REFERENCE)
219 rfc.setOutputFileName(options.output)
220 rfc.setNbNucleotides(options.minNucleotides, options.maxNucleotides)
221 rfc.setPercent(options.minPercent, options.maxPercent)
222 rfc.setOverlap(options.minOverlap, options.maxOverlap)
223 rfc.setStrands(options.strands)
224 rfc.run()