Mercurial > repos > yufei-luo > s_mart
comparison commons/tools/AnnotationStats.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
17:b0e8584489e6 | 18:94ab73e8a190 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Copyright INRA (Institut National de la Recherche Agronomique) | |
4 # http://www.inra.fr | |
5 # http://urgi.versailles.inra.fr | |
6 # | |
7 # This software is governed by the CeCILL license under French law and | |
8 # abiding by the rules of distribution of free software. You can use, | |
9 # modify and/ or redistribute the software under the terms of the CeCILL | |
10 # license as circulated by CEA, CNRS and INRIA at the following URL | |
11 # "http://www.cecill.info". | |
12 # | |
13 # As a counterpart to the access to the source code and rights to copy, | |
14 # modify and redistribute granted by the license, users are provided only | |
15 # with a limited warranty and the software's author, the holder of the | |
16 # economic rights, and the successive licensors have only limited | |
17 # liability. | |
18 # | |
19 # In this respect, the user's attention is drawn to the risks associated | |
20 # with loading, using, modifying and/or developing or reproducing the | |
21 # software by the user in light of its specific status of free software, | |
22 # that may mean that it is complicated to manipulate, and that also | |
23 # therefore means that it is reserved for developers and experienced | |
24 # professionals having in-depth computer knowledge. Users are therefore | |
25 # encouraged to load and test the software's suitability as regards their | |
26 # requirements in conditions enabling the security of their systems and/or | |
27 # data to be ensured and, more generally, to use and operate it in the | |
28 # same conditions as regards security. | |
29 # | |
30 # The fact that you are presently reading this means that you have had | |
31 # knowledge of the CeCILL license and that you accept its terms. | |
32 | |
33 ##@file | |
34 # Give summary information on a TE annotation table. | |
35 # options: | |
36 # -h: this help | |
37 # -t: analysis type (default = 1, 1: per transposable element (TE), 2: per cluster, 3: per classification, 4: with map input file) | |
38 # -p: name of the table (_path) or file (.path) with the annotated TE copies | |
39 # -s: name of the table (_seq) or file (.fasta or .fa) with the TE reference sequences | |
40 # -g: length of the genome (in bp) | |
41 # -m: name of the file with the group and the corresponding TE names (format = 'map') | |
42 # -o: name of the output file (default = pathTableName + '_stats.txt') | |
43 # -C: name of the configuration file to access MySQL (e.g. 'TEannot.cfg') | |
44 # -c: remove map files and blastclust file (if analysis type is 2 or 3) | |
45 # -I: identity coverage threshold (default = 0) | |
46 # -L: length coverage threshold (default=0.8) | |
47 # -v: verbosity level (default = 0) | |
48 | |
49 from commons.core.LoggerFactory import LoggerFactory | |
50 from commons.core.stat.Stat import Stat | |
51 from commons.core.sql.DbFactory import DbFactory | |
52 from commons.core.sql.TablePathAdaptator import TablePathAdaptator | |
53 from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator | |
54 from commons.tools.getCumulLengthFromTEannot import getCumulLengthFromTEannot | |
55 | |
# Dotted prefix used to namespace this module's logger; the class name is
# appended to it when the logger is created (see AnnotationStats.__init__).
LOG_DEPTH = "repet.tools"

#TODO: use templating engine instead of raw strings for AnnotationStatsWriter
class AnnotationStats( object ):
    """Compute summary statistics on a table of annotated TE copies.

    Depending on the analysis name, statistics are computed either per TE
    family ("TE") or per cluster of TE families ("Cluster"), and written to
    tab-separated output files.
    """

    def __init__(self, analysisName="TE", clusterFileName="",seqTableName="", pathTableName="", genomeLength=0, statsFileName="", globalStatsFileName="", verbosity=3):
        """Constructor.

        @param analysisName: "TE" (per family) or "Cluster" (per cluster)
        @param clusterFileName: tab-separated file, one cluster of consensus names per line
        @param seqTableName: table (_seq) with the TE reference sequences
        @param pathTableName: table (_path) with the annotated TE copies
        @param genomeLength: length of the genome (in bp)
        @param statsFileName: name of the per-TE / per-cluster output file
        @param globalStatsFileName: name of the global statistics output file
        @param verbosity: verbosity level
        """
        self._analysisName = analysisName
        self._clusterFileName = clusterFileName
        self._seqTableName = seqTableName
        self._pathTableName = pathTableName
        self._genomeLength = genomeLength
        self._statsFileName = statsFileName
        self._globalStatsFileName = globalStatsFileName
        self._iDb = None
        self._iTablePathAdaptator = None
        self._iTableSeqAdaptator = None
        self._save = False
        self._clean = False
        # These three attributes were previously created only by their setters,
        # so they were missing unless the setters were called first.  Defaults
        # follow the command-line help (-I 0, -L 0.8, -t 1).
        self._identityThreshold = 0
        self._coverageThreshold = 0.8
        self._analyseType = "1"
        self._verbosity = verbosity
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)

    def _logAndRaise(self, errorMsg):
        # Log the message at error level, then abort with an exception.
        self._log.error(errorMsg)
        raise Exception(errorMsg)

    def setCoverageThreshold( self, lengthThresh ):
        self._coverageThreshold = float(lengthThresh)

    def setIdentityThreshold( self, identityThresh ):
        self._identityThreshold = int(identityThresh)

    def setAnalyseType(self, analyseType):
        self._analyseType = str(analyseType)

    def setPathTableName(self, pathTableName):
        self._pathTableName = pathTableName

    def setDBInstance(self, iDb):
        self._iDb = iDb

    def setTablePathAdaptator(self, iTablePathAdaptator):
        self._iTablePathAdaptator = iTablePathAdaptator

    def setTableSeqAdaptator(self, iTableSeqAdaptator):
        self._iTableSeqAdaptator = iTableSeqAdaptator

    ## Get the coverage of TE copies for a given family (using 'mapOp')
    #
    # @param consensus string name of a TE family ('subject_name' in the 'path' table; "" means all families)
    # @return cumulCoverage integer cumulative coverage
    #
    def getCumulCoverage( self, consensus = "" ):
        gclft = getCumulLengthFromTEannot()
        gclft.setInputTable( self._pathTableName )
        gclft.setTErefseq( consensus )
        gclft.setClean()
        # NOTE(review): reaches into getCumulLengthFromTEannot's private
        # attributes to reuse the already-open DB connection and adaptator.
        gclft._db = self._iDb
        gclft._tpA = self._iTablePathAdaptator
        mapFileName = gclft.getAllSubjectsAsMapOfQueries()
        mergeFileName = gclft.mergeRanges( mapFileName )
        cumulCoverage = gclft.getCumulLength( mergeFileName )
        return cumulCoverage

    ## Get the number of full-lengths (95% <= L <= 105%)
    #
    # @param consensusLength integer
    # @param lLengths list of integers
    # @return fullLengthConsensusNb integer
    #
    def getNbFullLengths( self, consensusLength, lLengths ):
        # Compute each length ratio once and count those within [0.95, 1.05].
        return sum(1 for length in lLengths
                   if 0.95 <= length / float(consensusLength) <= 1.05)

    ## Compute the statistics of one TE family.
    #
    # @param consensusName string name of the TE family
    # @return dict with length, coverage, fragment/copy counts and Stat objects
    #
    def getStatPerTE(self, consensusName):
        dConsensusStats = {}
        lLengthPerFragment = self._iTablePathAdaptator.getPathLengthListFromSubject(consensusName)
        lLengthPerCopy = self._iTablePathAdaptator.getChainLengthListFromSubject(consensusName)
        lIdentityPerCopy = self._iTablePathAdaptator.getChainIdentityListFromSubject(consensusName)
        dConsensusStats["length"] = self._iTableSeqAdaptator.getSeqLengthFromAccession(consensusName)
        dConsensusStats["cumulCoverage"] = self.getCumulCoverage(consensusName)
        dConsensusStats["nbFragments"] = len(lLengthPerFragment)
        dConsensusStats["nbFullLengthFragments"] = self.getNbFullLengths(dConsensusStats["length"], lLengthPerFragment)
        dConsensusStats["nbCopies"] = len(lLengthPerCopy)
        dConsensusStats["nbFullLengthCopies"] = self.getNbFullLengths(dConsensusStats["length"], lLengthPerCopy)
        dConsensusStats["statsIdentityPerChain"] = Stat()
        dConsensusStats["statsLengthPerChain"] = Stat()
        dConsensusStats["statsLengthPerChainPerc"] = Stat()
        self._statsForIdentityAndLength(dConsensusStats, lLengthPerCopy, lIdentityPerCopy)
        return dConsensusStats

    ## Compute the aggregated statistics of one cluster of TE families.
    #
    # @param lConsensusNames list of TE family names belonging to the cluster
    # @return dict with cumulative coverage and fragment/copy counts
    #
    def getStatPerCluster(self, lConsensusNames):
        dConsensusClusterStats = {}
        lLengthPerFragment = []
        lLengthPerCopy = []
        cumulCoverageLength = 0
        for consensusName in lConsensusNames:
            cumulCoverageLength += self.getCumulCoverage(consensusName)
            lLengthPerFragment.extend(self._iTablePathAdaptator.getPathLengthListFromSubject(consensusName))
            lLengthPerCopy.extend(self._iTablePathAdaptator.getChainLengthListFromSubject(consensusName))
        dConsensusClusterStats["cumulCoverage"] = cumulCoverageLength
        dConsensusClusterStats["nbFragments"] = len(lLengthPerFragment)
        dConsensusClusterStats["nbCopies"] = len(lLengthPerCopy)
        return dConsensusClusterStats

    ## Parse the cluster file: one cluster per line, tab-separated names.
    #
    # @return list of lists of TE family names
    #
    def getClusterListFromFile(self):
        with open(self._clusterFileName) as fCluster:
            return [line.rstrip().split("\t") for line in fCluster]

    ## Run the whole analysis and write the output file(s).
    #
    def run(self):
        LoggerFactory.setLevel(self._log, self._verbosity)
        self._iDb = DbFactory.createInstance()
        self._iTablePathAdaptator = TablePathAdaptator(self._iDb, self._pathTableName)
        self._iTableSeqAdaptator = TableSeqAdaptator(self._iDb, self._seqTableName)

        iASW = AnnotationStatsWriter()
        if self._analysisName == "TE":
            with open(self._statsFileName, "w") as fStats:
                string = "%s\tlength\tcovg\tfrags\tfullLgthFrags\tcopies\tfullLgthCopies\tmeanId\tmeanLgth\tmeanLgthPerc\n" % self._analysisName
                fStats.write(string)

                lNamesTErefseq = self._iTableSeqAdaptator.getAccessionsList()
                lDistinctSubjects = self._iTablePathAdaptator.getSubjectList()
                # No consensus given: cumulative coverage over all families.
                totalCumulCoverage = self.getCumulCoverage()

                with open(self._globalStatsFileName, "w") as fG:
                    fG.write("%s\n" % iASW.printResume(lNamesTErefseq, lDistinctSubjects, totalCumulCoverage, self._genomeLength))
                    for consensusName in lNamesTErefseq:
                        self._log.debug("processing '%s'..." % consensusName)
                        dStatForOneConsensus = self.getStatPerTE(consensusName)
                        iASW.addCalculsOfOneTE(dStatForOneConsensus)
                        fStats.write("%s\n" % iASW.getStatAsString(consensusName, dStatForOneConsensus))
                    fG.write(iASW.printStatsForAllTEs(len(lNamesTErefseq)))

        elif self._analysisName == "Cluster":
            lClusters = self.getClusterListFromFile()
            # Largest clusters first.
            lClusters.sort(key=len, reverse=True)
            with open(self._statsFileName, "w") as fStats:
                string = "%s\tcovg\tfrags\tcopies\n" % self._analysisName
                #TODO: add fullLgthFrags and fullLgthCopies ? Is addition of previous results significant ?
                fStats.write(string)
                for index, lConsensus in enumerate(lClusters):
                    self._log.debug("processing '%s'..." % lConsensus)
                    dStatForOneCluster = self.getStatPerCluster(lConsensus)
                    fStats.write("%s\n" % iASW.getStatAsStringForCluster(str(index + 1), dStatForOneCluster))

        if self._save:
            outTableName = "%s_statsPer%s" % (self._pathTableName, self._analysisName)
            self._iDb.createTable(outTableName, "pathstat", self._statsFileName)

        self._iDb.close()
        self._log.info("END %s" % type(self).__name__)

    ## Fill the three Stat objects of dStat from the per-copy values.
    #
    # @param dStat dict returned by getStatPerTE (mutated in place)
    # @param lLengthPerCopy list of integers
    # @param lIdentityPerCopy list of floats
    #
    def _statsForIdentityAndLength(self, dStat, lLengthPerCopy, lIdentityPerCopy):
        for i in lIdentityPerCopy:
            dStat["statsIdentityPerChain"].add(i)
        lLengthPercPerCopy = []
        for i in lLengthPerCopy:
            dStat["statsLengthPerChain"].add(i)
            # Copy length as a percentage of the consensus length.
            lperc = 100 * i / float(dStat["length"])
            lLengthPercPerCopy.append(lperc)
            dStat["statsLengthPerChainPerc"].add(lperc)
224 | |
class AnnotationStatsWriter(object):
    """Aggregate per-family statistics and format them as report strings."""

    def __init__(self):
        # Running totals accumulated over successive addCalculsOfOneTE() calls.
        self._dAllTErefseqs = { "sumCumulCoverage": 0,
                                "totalNbFragments": 0,
                                "totalNbFullLengthFragments": 0,
                                "totalNbCopies": 0,
                                "totalNbFullLengthCopies": 0,
                                "nbFamWithFullLengthFragments": 0,
                                "nbFamWithOneFullLengthFragment": 0,
                                "nbFamWithTwoFullLengthFragments": 0,
                                "nbFamWithThreeFullLengthFragments": 0,
                                "nbFamWithMoreThanThreeFullLengthFragments": 0,
                                "nbFamWithFullLengthCopies": 0,
                                "nbFamWithOneFullLengthCopy": 0,
                                "nbFamWithTwoFullLengthCopies": 0,
                                "nbFamWithThreeFullLengthCopies": 0,
                                "nbFamWithMoreThanThreeFullLengthCopies": 0,
                                "statsAllCopiesMedIdentity": Stat(),
                                "statsAllCopiesMedLengthPerc": Stat()
                                }

    def getAllTEsRefSeqDict(self):
        return self._dAllTErefseqs

    def getStatAsString( self, name, d ):
        """
        Return one tab-separated line of statistics for a single TE family.
        """
        lFields = [ "%s" % name ]
        for countKey in ("length", "cumulCoverage", "nbFragments",
                         "nbFullLengthFragments", "nbCopies",
                         "nbFullLengthCopies"):
            lFields.append("%i" % d[countKey])
        # Means are only defined when at least one value was recorded.
        for statKey in ("statsIdentityPerChain", "statsLengthPerChain",
                        "statsLengthPerChainPerc"):
            iStat = d[statKey]
            if iStat.getValuesNumber() != 0:
                lFields.append("%.2f" % iStat.mean())
            else:
                lFields.append("NA")
        return "\t".join(lFields)

    def getStatAsStringForCluster( self, name, d ):
        """
        Return one tab-separated line of statistics for a single cluster.
        """
        lFields = [ "%s" % name,
                    "%i" % d["cumulCoverage"],
                    "%i" % d["nbFragments"],
                    "%i" % d["nbCopies"] ]
        return "\t".join(lFields)

    def _countFullLengthFamilies(self, nb, anyKey, oneKey, twoKey, threeKey, moreKey):
        # Bump the "families with 1 / 2 / 3 / >3 full-length ..." counters
        # according to the count observed for one family.
        if nb > 0:
            self._dAllTErefseqs[anyKey] += 1
        if nb == 1:
            self._dAllTErefseqs[oneKey] += 1
        elif nb == 2:
            self._dAllTErefseqs[twoKey] += 1
        elif nb == 3:
            self._dAllTErefseqs[threeKey] += 1
        elif nb > 3:
            self._dAllTErefseqs[moreKey] += 1

    def addCalculsOfOneTE(self, dOneTErefseq):
        """
        Fold the statistics of one TE family into the running totals.
        """
        dTotals = self._dAllTErefseqs
        dTotals["sumCumulCoverage"] += dOneTErefseq["cumulCoverage"]

        dTotals["totalNbFragments"] += dOneTErefseq["nbFragments"]
        dTotals["totalNbFullLengthFragments"] += dOneTErefseq["nbFullLengthFragments"]
        self._countFullLengthFamilies(dOneTErefseq["nbFullLengthFragments"],
                                      "nbFamWithFullLengthFragments",
                                      "nbFamWithOneFullLengthFragment",
                                      "nbFamWithTwoFullLengthFragments",
                                      "nbFamWithThreeFullLengthFragments",
                                      "nbFamWithMoreThanThreeFullLengthFragments")

        dTotals["totalNbCopies"] += dOneTErefseq["nbCopies"]
        dTotals["totalNbFullLengthCopies"] += dOneTErefseq["nbFullLengthCopies"]
        self._countFullLengthFamilies(dOneTErefseq["nbFullLengthCopies"],
                                      "nbFamWithFullLengthCopies",
                                      "nbFamWithOneFullLengthCopy",
                                      "nbFamWithTwoFullLengthCopies",
                                      "nbFamWithThreeFullLengthCopies",
                                      "nbFamWithMoreThanThreeFullLengthCopies")

        # Record the medians only when the family actually had copies.
        if dOneTErefseq["statsIdentityPerChain"].getValuesNumber() != 0:
            dTotals["statsAllCopiesMedIdentity"].add(dOneTErefseq["statsIdentityPerChain"].median())

        if dOneTErefseq["statsLengthPerChainPerc"].getValuesNumber() != 0:
            dTotals["statsAllCopiesMedLengthPerc"].add(dOneTErefseq["statsLengthPerChainPerc"].median())

    def printStatsForAllTEs(self, TEnb):
        """
        Return the multi-line global report built from the running totals.
        """
        dTotals = self._dAllTErefseqs
        lLines = ["total nb of TE fragments: %i\n" % dTotals["totalNbFragments"]]

        # Percentages are meaningless (and would divide by zero) without fragments.
        if dTotals["totalNbFragments"] != 0:
            lLines.append("total nb full-length fragments: %i (%.2f%%)\n"
                          % (dTotals["totalNbFullLengthFragments"],
                             100 * dTotals["totalNbFullLengthFragments"] / float(dTotals["totalNbFragments"])))
            lLines.append("total nb of TE copies: %i\n" % dTotals["totalNbCopies"])
            lLines.append("total nb full-length copies: %i (%.2f%%)\n"
                          % (dTotals["totalNbFullLengthCopies"],
                             100 * dTotals["totalNbFullLengthCopies"] / float(dTotals["totalNbCopies"])))

            lLines.append("families with full-length fragments: %i (%.2f%%)\n"
                          % (dTotals["nbFamWithFullLengthFragments"],
                             100 * dTotals["nbFamWithFullLengthFragments"] / float(TEnb)))
            lLines.append(" with only one full-length fragment: %i\n" % dTotals["nbFamWithOneFullLengthFragment"])
            lLines.append(" with only two full-length fragments: %i\n" % dTotals["nbFamWithTwoFullLengthFragments"])
            lLines.append(" with only three full-length fragments: %i\n" % dTotals["nbFamWithThreeFullLengthFragments"])
            lLines.append(" with more than three full-length fragments: %i\n" % dTotals["nbFamWithMoreThanThreeFullLengthFragments"])

            lLines.append("families with full-length copies: %i (%.2f%%)\n"
                          % (dTotals["nbFamWithFullLengthCopies"],
                             100 * dTotals["nbFamWithFullLengthCopies"] / float(TEnb)))
            lLines.append(" with only one full-length copy: %i\n" % dTotals["nbFamWithOneFullLengthCopy"])
            lLines.append(" with only two full-length copies: %i\n" % dTotals["nbFamWithTwoFullLengthCopies"])
            lLines.append(" with only three full-length copies: %i\n" % dTotals["nbFamWithThreeFullLengthCopies"])
            lLines.append(" with more than three full-length copies: %i\n" % dTotals["nbFamWithMoreThanThreeFullLengthCopies"])

            lLines.append("mean of median identity of all families: %.2f +- %.2f\n"
                          % (dTotals["statsAllCopiesMedIdentity"].mean(),
                             dTotals["statsAllCopiesMedIdentity"].sd()))
            lLines.append("mean of median length percentage of all families: %.2f +- %.2f\n"
                          % (dTotals["statsAllCopiesMedLengthPerc"].mean(),
                             dTotals["statsAllCopiesMedLengthPerc"].sd()))
        return "".join(lLines)

    def printResume(self, lNamesTErefseq, lDistinctSubjects, totalCumulCoverage, genomeLength):
        """
        Return the short header report written before the per-family lines.
        """
        lLines = ["nb of sequences: %i\n" % len(lNamesTErefseq),
                  "nb of matched sequences: %i\n" % len(lDistinctSubjects),
                  "cumulative coverage: %i bp\n" % totalCumulCoverage,
                  "coverage percentage: %.2f%%\n" % (100 * totalCumulCoverage / float(genomeLength))]
        return "".join(lLines)