18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 # Copyright INRA (Institut National de la Recherche Agronomique)
|
|
4 # http://www.inra.fr
|
|
5 # http://urgi.versailles.inra.fr
|
|
6 #
|
|
7 # This software is governed by the CeCILL license under French law and
|
|
8 # abiding by the rules of distribution of free software. You can use,
|
|
9 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
10 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
11 # "http://www.cecill.info".
|
|
12 #
|
|
13 # As a counterpart to the access to the source code and rights to copy,
|
|
14 # modify and redistribute granted by the license, users are provided only
|
|
15 # with a limited warranty and the software's author, the holder of the
|
|
16 # economic rights, and the successive licensors have only limited
|
|
17 # liability.
|
|
18 #
|
|
19 # In this respect, the user's attention is drawn to the risks associated
|
|
20 # with loading, using, modifying and/or developing or reproducing the
|
|
21 # software by the user in light of its specific status of free software,
|
|
22 # that may mean that it is complicated to manipulate, and that also
|
|
23 # therefore means that it is reserved for developers and experienced
|
|
24 # professionals having in-depth computer knowledge. Users are therefore
|
|
25 # encouraged to load and test the software's suitability as regards their
|
|
26 # requirements in conditions enabling the security of their systems and/or
|
|
27 # data to be ensured and, more generally, to use and operate it in the
|
|
28 # same conditions as regards security.
|
|
29 #
|
|
30 # The fact that you are presently reading this means that you have had
|
|
31 # knowledge of the CeCILL license and that you accept its terms.
|
|
32
|
|
33 ##@file
|
|
34 # Give summary information on a TE annotation table.
|
|
35 # options:
|
|
36 # -h: this help
|
|
37 # -t: analysis type (default = 1, 1: per transposable element (TE), 2: per cluster, 3: per classification, 4: with map input file)
|
|
38 # -p: name of the table (_path) or file (.path) with the annotated TE copies
|
|
39 # -s: name of the table (_seq) or file (.fasta or .fa) with the TE reference sequences
|
|
40 # -g: length of the genome (in bp)
|
|
41 # -m: name of the file with the group and the corresponding TE names (format = 'map')
|
|
42 # -o: name of the output file (default = pathTableName + '_stats.txt')
|
|
43 # -C: name of the configuration file to access MySQL (e.g. 'TEannot.cfg')
|
|
44 # -c: remove map files and blastclust file (if analysis type is 2 or 3)
|
|
45 # -I: identity coverage threshold (default = 0)
|
|
46 # -L: length coverage threshold (default=0.8)
|
|
47 # -v: verbosity level (default = 0)
|
|
48
|
|
49 from commons.core.LoggerFactory import LoggerFactory
|
|
50 from commons.core.stat.Stat import Stat
|
|
51 from commons.core.sql.DbFactory import DbFactory
|
|
52 from commons.core.sql.TablePathAdaptator import TablePathAdaptator
|
|
53 from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator
|
|
54 from commons.tools.getCumulLengthFromTEannot import getCumulLengthFromTEannot
|
|
55
|
|
56 LOG_DEPTH = "repet.tools"
|
|
57
|
|
58 #TODO: use templating engine instead of raw strings for AnnotationStatsWriter
|
|
59 class AnnotationStats( object ):
|
|
60
|
|
61 def __init__(self, analysisName="TE", clusterFileName="",seqTableName="", pathTableName="", genomeLength=0, statsFileName="", globalStatsFileName="", verbosity=3):
|
|
62 self._analysisName = analysisName
|
|
63 self._clusterFileName = clusterFileName
|
|
64 self._seqTableName = seqTableName
|
|
65 self._pathTableName = pathTableName
|
|
66 self._genomeLength = genomeLength
|
|
67 self._statsFileName = statsFileName
|
|
68 self._globalStatsFileName = globalStatsFileName
|
|
69 self._iDb = None
|
|
70 self._iTablePathAdaptator = None
|
|
71 self._iTableSeqAdaptator = None
|
|
72 self._save = False
|
|
73 self._clean = False
|
|
74 self._verbosity = verbosity
|
|
75 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
|
|
76
|
|
77 def _logAndRaise(self, errorMsg):
|
|
78 self._log.error(errorMsg)
|
|
79 raise Exception(errorMsg)
|
|
80
|
|
81 def setCoverageThreshold( self, lengthThresh ):
|
|
82 self._coverageThreshold = float(lengthThresh)
|
|
83
|
|
84 def setIdentityThreshold( self, identityThresh ):
|
|
85 self._identityThreshold = int(identityThresh)
|
|
86
|
|
87 def setAnalyseType(self, analyseType):
|
|
88 self._analyseType = str(analyseType)
|
|
89
|
|
90 def setPathTableName(self, pathTableName):
|
|
91 self._pathTableName = pathTableName
|
|
92
|
|
93 def setDBInstance(self, iDb):
|
|
94 self._iDb = iDb
|
|
95
|
|
96 def setTablePathAdaptator(self, iTablePathAdaptator):
|
|
97 self._iTablePathAdaptator = iTablePathAdaptator
|
|
98
|
|
99 def setTableSeqAdaptator(self, iTableSeqAdaptator):
|
|
100 self._iTableSeqAdaptator = iTableSeqAdaptator
|
|
101
|
|
102 ## Get the coverage of TE copies for a given family (using 'mapOp')
|
|
103 #
|
|
104 # @param consensus string name of a TE family ('subject_name' in the 'path' table)
|
|
105 # @return cumulCoverage integer cumulative coverage
|
|
106 #
|
|
107 def getCumulCoverage( self, consensus = "" ):
|
|
108 gclft = getCumulLengthFromTEannot()
|
|
109 gclft.setInputTable( self._pathTableName )
|
|
110 gclft.setTErefseq( consensus )
|
|
111 gclft.setClean()
|
|
112 gclft._db = self._iDb
|
|
113 gclft._tpA = self._iTablePathAdaptator
|
|
114 mapFileName = gclft.getAllSubjectsAsMapOfQueries()
|
|
115 mergeFileName = gclft.mergeRanges( mapFileName )
|
|
116 cumulCoverage = gclft.getCumulLength( mergeFileName ) #self._iTablePathAdaptator.getCumulPathLength_from_subject( consensus )
|
|
117 return cumulCoverage
|
|
118
|
|
119 ## Get the number of full-lengths (95% <= L =< 105%)
|
|
120 #
|
|
121 # @param consensusLength integer
|
|
122 # @param lLengths list of integers
|
|
123 # @return fullLengthConsensusNb integer
|
|
124 #
|
|
125 def getNbFullLengths( self, consensusLength, lLengths ):
|
|
126 fullLengthConsensusNb = 0
|
|
127 for i in lLengths:
|
|
128 if i / float(consensusLength ) >= 0.95 and i / float(consensusLength ) <= 1.05:
|
|
129 fullLengthConsensusNb += 1
|
|
130 return fullLengthConsensusNb
|
|
131
|
|
132 def getStatPerTE(self, consensusName):
|
|
133 dConsensusStats = {}
|
|
134 lLengthPerFragment = self._iTablePathAdaptator.getPathLengthListFromSubject(consensusName)
|
|
135 lLengthPerCopy = self._iTablePathAdaptator.getChainLengthListFromSubject(consensusName)
|
|
136 lIdentityPerCopy = self._iTablePathAdaptator.getChainIdentityListFromSubject(consensusName)
|
|
137 dConsensusStats["length"] = self._iTableSeqAdaptator.getSeqLengthFromAccession(consensusName)
|
|
138 dConsensusStats["cumulCoverage"] = self.getCumulCoverage(consensusName)
|
|
139 dConsensusStats["nbFragments"] = len(lLengthPerFragment)
|
|
140 dConsensusStats["nbFullLengthFragments"] = self.getNbFullLengths(dConsensusStats["length"], lLengthPerFragment)
|
|
141 dConsensusStats["nbCopies"] = len(lLengthPerCopy)
|
|
142 dConsensusStats["nbFullLengthCopies"] = self.getNbFullLengths(dConsensusStats["length"], lLengthPerCopy)
|
|
143 dConsensusStats["statsIdentityPerChain"] = Stat()
|
|
144 dConsensusStats["statsLengthPerChain"] = Stat()
|
|
145 dConsensusStats["statsLengthPerChainPerc"] = Stat()
|
|
146 self._statsForIdentityAndLength(dConsensusStats, lLengthPerCopy, lIdentityPerCopy)
|
|
147 return dConsensusStats
|
|
148
|
|
149 def getStatPerCluster(self, lConsensusNames):
|
|
150 dConsensusClusterStats = {}
|
|
151 lLengthPerFragment = []
|
|
152 lLengthPerCopy = []
|
|
153 cumulCoverageLength = 0
|
|
154 for consensusName in lConsensusNames:
|
|
155 cumulCoverageLength += self.getCumulCoverage(consensusName)
|
|
156 lLengthPerFragment.extend(self._iTablePathAdaptator.getPathLengthListFromSubject(consensusName))
|
|
157 lLengthPerCopy.extend(self._iTablePathAdaptator.getChainLengthListFromSubject(consensusName))
|
|
158 dConsensusClusterStats["cumulCoverage"] = cumulCoverageLength
|
|
159 dConsensusClusterStats["nbFragments"] = len(lLengthPerFragment)
|
|
160 dConsensusClusterStats["nbCopies"] = len(lLengthPerCopy)
|
|
161 return dConsensusClusterStats
|
|
162
|
|
163 def getClusterListFromFile(self):
|
|
164 lClusters = []
|
|
165 with open(self._clusterFileName) as fCluster:
|
|
166 for line in fCluster:
|
|
167 lConsensusNames = line.rstrip().split("\t")
|
|
168 lClusters.append(lConsensusNames)
|
|
169 return lClusters
|
|
170
|
|
171 def run(self):
|
|
172 LoggerFactory.setLevel(self._log, self._verbosity)
|
|
173 self._iDb = DbFactory.createInstance()
|
|
174 self._iTablePathAdaptator = TablePathAdaptator(self._iDb, self._pathTableName)
|
|
175 self._iTableSeqAdaptator = TableSeqAdaptator(self._iDb, self._seqTableName)
|
|
176
|
|
177 iASW = AnnotationStatsWriter()
|
|
178 if self._analysisName == "TE":
|
|
179 with open(self._statsFileName, "w") as fStats:
|
|
180 string = "%s\tlength\tcovg\tfrags\tfullLgthFrags\tcopies\tfullLgthCopies\tmeanId\tmeanLgth\tmeanLgthPerc\n" % self._analysisName
|
|
181 fStats.write(string)
|
|
182
|
|
183 lNamesTErefseq = self._iTableSeqAdaptator.getAccessionsList()
|
|
184 lDistinctSubjects = self._iTablePathAdaptator.getSubjectList()
|
|
185 totalCumulCoverage = self.getCumulCoverage()
|
|
186
|
|
187 with open(self._globalStatsFileName, "w") as fG:
|
|
188 fG.write("%s\n" % iASW.printResume(lNamesTErefseq, lDistinctSubjects, totalCumulCoverage, self._genomeLength))
|
|
189 for consensusName in lNamesTErefseq:
|
|
190 self._log.debug("processing '%s'..." % consensusName)
|
|
191 dStatForOneConsensus = self.getStatPerTE(consensusName)
|
|
192 iASW.addCalculsOfOneTE(dStatForOneConsensus)
|
|
193 fStats.write("%s\n" % iASW.getStatAsString(consensusName, dStatForOneConsensus))
|
|
194 fG.write(iASW.printStatsForAllTEs(len(lNamesTErefseq)))
|
|
195
|
|
196 elif self._analysisName == "Cluster":
|
|
197 lClusters = self.getClusterListFromFile()
|
|
198 lClusters.sort(key=lambda k: len(k), reverse=True)
|
|
199 with open(self._statsFileName, "w") as fStats:
|
|
200 string = "%s\tcovg\tfrags\tcopies\n" % self._analysisName
|
|
201 #TODO: add fullLgthFrags and fullLgthCopies ? Is addition of previous results significant ?
|
|
202 fStats.write(string)
|
|
203 for index, lConsensus in enumerate(lClusters):
|
|
204 self._log.debug("processing '%s'..." % lConsensus)
|
|
205 dStatForOneCluster = self.getStatPerCluster(lConsensus)
|
|
206 fStats.write("%s\n" % iASW.getStatAsStringForCluster(str(index + 1), dStatForOneCluster))
|
|
207
|
|
208 if self._save:
|
|
209 outTableName = "%s_statsPer%s" % (self._pathTableName, self._analysisName)
|
|
210 self._iDb.createTable(outTableName, "pathstat", self._statsFileName)
|
|
211
|
|
212 self._iDb.close()
|
|
213 self._log.info("END %s" % type(self).__name__)
|
|
214
|
|
215 def _statsForIdentityAndLength(self, dStat, lLengthPerCopy, lIdentityPerCopy):
|
|
216 for i in lIdentityPerCopy:
|
|
217 dStat["statsIdentityPerChain"].add(i)
|
|
218 lLengthPercPerCopy = []
|
|
219 for i in lLengthPerCopy:
|
|
220 dStat["statsLengthPerChain"].add(i)
|
|
221 lperc = 100 * i / float(dStat["length"])
|
|
222 lLengthPercPerCopy.append(lperc)
|
|
223 dStat["statsLengthPerChainPerc"].add(lperc)
|
|
224
|
|
225 class AnnotationStatsWriter(object):
|
|
226
|
|
227 def __init__(self):
|
|
228 self._dAllTErefseqs = { "sumCumulCoverage": 0,
|
|
229 "totalNbFragments": 0,
|
|
230 "totalNbFullLengthFragments": 0,
|
|
231 "totalNbCopies": 0,
|
|
232 "totalNbFullLengthCopies": 0,
|
|
233 "nbFamWithFullLengthFragments": 0,
|
|
234 "nbFamWithOneFullLengthFragment": 0,
|
|
235 "nbFamWithTwoFullLengthFragments": 0,
|
|
236 "nbFamWithThreeFullLengthFragments": 0,
|
|
237 "nbFamWithMoreThanThreeFullLengthFragments": 0,
|
|
238 "nbFamWithFullLengthCopies": 0,
|
|
239 "nbFamWithOneFullLengthCopy": 0,
|
|
240 "nbFamWithTwoFullLengthCopies": 0,
|
|
241 "nbFamWithThreeFullLengthCopies": 0,
|
|
242 "nbFamWithMoreThanThreeFullLengthCopies": 0,
|
|
243 "statsAllCopiesMedIdentity": Stat(),
|
|
244 "statsAllCopiesMedLengthPerc": Stat()
|
|
245 }
|
|
246
|
|
247 def getAllTEsRefSeqDict(self):
|
|
248 return self._dAllTErefseqs
|
|
249
|
|
250 def getStatAsString( self, name, d ):
|
|
251 """
|
|
252 Return a string with all data properly formatted.
|
|
253 """
|
|
254 string = ""
|
|
255 string += "%s" % name
|
|
256 string += "\t%i" % d["length"]
|
|
257 string += "\t%i" % d["cumulCoverage"]
|
|
258 string += "\t%i" % d["nbFragments"]
|
|
259 string += "\t%i" % d["nbFullLengthFragments"]
|
|
260 string += "\t%i" % d["nbCopies"]
|
|
261 string += "\t%i" % d["nbFullLengthCopies"]
|
|
262
|
|
263 if d["statsIdentityPerChain"].getValuesNumber() != 0:
|
|
264 string += "\t%.2f" % d["statsIdentityPerChain"].mean()
|
|
265 else:
|
|
266 string += "\tNA"
|
|
267
|
|
268 if d["statsLengthPerChain"].getValuesNumber() != 0:
|
|
269 string += "\t%.2f" % d["statsLengthPerChain"].mean()
|
|
270 else:
|
|
271 string += "\tNA"
|
|
272
|
|
273 if d["statsLengthPerChainPerc"].getValuesNumber() != 0:
|
|
274 string += "\t%.2f" % d["statsLengthPerChainPerc"].mean()
|
|
275 else:
|
|
276 string += "\tNA"
|
|
277
|
|
278 return string
|
|
279
|
|
280 def getStatAsStringForCluster( self, name, d ):
|
|
281 """
|
|
282 Return a string with all data properly formatted.
|
|
283 """
|
|
284 string = ""
|
|
285 string += "%s" % name
|
|
286 string += "\t%i" % d["cumulCoverage"]
|
|
287 string += "\t%i" % d["nbFragments"]
|
|
288 string += "\t%i" % d["nbCopies"]
|
|
289
|
|
290 return string
|
|
291
|
|
292 def addCalculsOfOneTE(self, dOneTErefseq):
|
|
293 self._dAllTErefseqs[ "sumCumulCoverage" ] += dOneTErefseq[ "cumulCoverage" ]
|
|
294
|
|
295 self._dAllTErefseqs[ "totalNbFragments" ] += dOneTErefseq[ "nbFragments" ]
|
|
296 self._dAllTErefseqs[ "totalNbFullLengthFragments" ] += dOneTErefseq[ "nbFullLengthFragments" ]
|
|
297 if dOneTErefseq[ "nbFullLengthFragments" ] > 0:
|
|
298 self._dAllTErefseqs[ "nbFamWithFullLengthFragments" ] += 1
|
|
299 if dOneTErefseq[ "nbFullLengthFragments" ] == 1:
|
|
300 self._dAllTErefseqs[ "nbFamWithOneFullLengthFragment" ] += 1
|
|
301 elif dOneTErefseq[ "nbFullLengthFragments" ] == 2:
|
|
302 self._dAllTErefseqs[ "nbFamWithTwoFullLengthFragments" ] += 1
|
|
303 elif dOneTErefseq[ "nbFullLengthFragments" ] == 3:
|
|
304 self._dAllTErefseqs[ "nbFamWithThreeFullLengthFragments" ] += 1
|
|
305 elif dOneTErefseq[ "nbFullLengthFragments" ] > 3:
|
|
306 self._dAllTErefseqs[ "nbFamWithMoreThanThreeFullLengthFragments" ] += 1
|
|
307
|
|
308 self._dAllTErefseqs[ "totalNbCopies" ] += dOneTErefseq[ "nbCopies" ]
|
|
309 self._dAllTErefseqs[ "totalNbFullLengthCopies" ] += dOneTErefseq[ "nbFullLengthCopies" ]
|
|
310 if dOneTErefseq[ "nbFullLengthCopies" ] > 0:
|
|
311 self._dAllTErefseqs[ "nbFamWithFullLengthCopies" ] += 1
|
|
312 if dOneTErefseq[ "nbFullLengthCopies" ] == 1:
|
|
313 self._dAllTErefseqs[ "nbFamWithOneFullLengthCopy" ] += 1
|
|
314 elif dOneTErefseq[ "nbFullLengthCopies" ] == 2:
|
|
315 self._dAllTErefseqs[ "nbFamWithTwoFullLengthCopies" ] += 1
|
|
316 elif dOneTErefseq[ "nbFullLengthCopies" ] == 3:
|
|
317 self._dAllTErefseqs[ "nbFamWithThreeFullLengthCopies" ] += 1
|
|
318 elif dOneTErefseq[ "nbFullLengthCopies" ] > 3:
|
|
319 self._dAllTErefseqs[ "nbFamWithMoreThanThreeFullLengthCopies" ] += 1
|
|
320
|
|
321 if dOneTErefseq[ "statsIdentityPerChain" ].getValuesNumber() != 0:
|
|
322 self._dAllTErefseqs[ "statsAllCopiesMedIdentity" ].add( dOneTErefseq[ "statsIdentityPerChain" ].median() )
|
|
323
|
|
324 if dOneTErefseq[ "statsLengthPerChainPerc" ].getValuesNumber() != 0:
|
|
325 self._dAllTErefseqs[ "statsAllCopiesMedLengthPerc" ].add( dOneTErefseq[ "statsLengthPerChainPerc" ].median() )
|
|
326
|
|
327 def printStatsForAllTEs(self, TEnb):
|
|
328 # statString += "(sum of cumulative coverages: %i bp)" % ( self._dAllTErefseqs[ "sumCumulCoverage" ] )
|
|
329 statString = "total nb of TE fragments: %i\n" % ( self._dAllTErefseqs[ "totalNbFragments" ] )
|
|
330
|
|
331 if self._dAllTErefseqs[ "totalNbFragments" ] != 0:
|
|
332
|
|
333 statString += "total nb full-length fragments: %i (%.2f%%)\n" % \
|
|
334 ( self._dAllTErefseqs[ "totalNbFullLengthFragments" ], \
|
|
335 100*self._dAllTErefseqs[ "totalNbFullLengthFragments" ] / float(self._dAllTErefseqs[ "totalNbFragments" ]) )
|
|
336
|
|
337 statString += "total nb of TE copies: %i\n" % ( self._dAllTErefseqs[ "totalNbCopies" ] )
|
|
338
|
|
339 statString += "total nb full-length copies: %i (%.2f%%)\n" % \
|
|
340 ( self._dAllTErefseqs[ "totalNbFullLengthCopies" ], \
|
|
341 100*self._dAllTErefseqs[ "totalNbFullLengthCopies" ] / float(self._dAllTErefseqs[ "totalNbCopies" ]) )
|
|
342
|
|
343 statString += "families with full-length fragments: %i (%.2f%%)\n" % \
|
|
344 ( self._dAllTErefseqs[ "nbFamWithFullLengthFragments" ], \
|
|
345 100*self._dAllTErefseqs[ "nbFamWithFullLengthFragments" ] / float(TEnb) )
|
|
346 statString += " with only one full-length fragment: %i\n" % ( self._dAllTErefseqs[ "nbFamWithOneFullLengthFragment" ] )
|
|
347 statString += " with only two full-length fragments: %i\n" % ( self._dAllTErefseqs[ "nbFamWithTwoFullLengthFragments" ] )
|
|
348 statString += " with only three full-length fragments: %i\n" % ( self._dAllTErefseqs[ "nbFamWithThreeFullLengthFragments" ] )
|
|
349 statString += " with more than three full-length fragments: %i\n" % ( self._dAllTErefseqs[ "nbFamWithMoreThanThreeFullLengthFragments" ] )
|
|
350
|
|
351 statString += "families with full-length copies: %i (%.2f%%)\n" % \
|
|
352 ( self._dAllTErefseqs[ "nbFamWithFullLengthCopies" ], \
|
|
353 100*self._dAllTErefseqs[ "nbFamWithFullLengthCopies" ] / float(TEnb) )
|
|
354 statString += " with only one full-length copy: %i\n" % ( self._dAllTErefseqs[ "nbFamWithOneFullLengthCopy" ] )
|
|
355 statString += " with only two full-length copies: %i\n" % ( self._dAllTErefseqs[ "nbFamWithTwoFullLengthCopies" ] )
|
|
356 statString += " with only three full-length copies: %i\n" % ( self._dAllTErefseqs[ "nbFamWithThreeFullLengthCopies" ] )
|
|
357 statString += " with more than three full-length copies: %i\n" % ( self._dAllTErefseqs[ "nbFamWithMoreThanThreeFullLengthCopies" ] )
|
|
358
|
|
359 statString += "mean of median identity of all families: %.2f +- %.2f\n" % \
|
|
360 ( self._dAllTErefseqs[ "statsAllCopiesMedIdentity" ].mean(), \
|
|
361 self._dAllTErefseqs[ "statsAllCopiesMedIdentity" ].sd() )
|
|
362
|
|
363 statString += "mean of median length percentage of all families: %.2f +- %.2f\n" % \
|
|
364 ( self._dAllTErefseqs[ "statsAllCopiesMedLengthPerc" ].mean(), \
|
|
365 self._dAllTErefseqs[ "statsAllCopiesMedLengthPerc" ].sd() )
|
|
366 return statString
|
|
367
|
|
368 def printResume(self, lNamesTErefseq, lDistinctSubjects, totalCumulCoverage, genomeLength):
|
|
369 statString = "nb of sequences: %i\n" % len(lNamesTErefseq)
|
|
370 statString += "nb of matched sequences: %i\n" % len(lDistinctSubjects)
|
|
371 statString += "cumulative coverage: %i bp\n" % totalCumulCoverage
|
|
372 statString += "coverage percentage: %.2f%%\n" % ( 100 * totalCumulCoverage / float(genomeLength) )
|
|
373 # statString += "processing the %i TE families..." % len(lNamesTErefseq)
|
|
374 return statString
|