#!/usr/bin/env python

# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

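"""
Post-analysis of a transposable element (TE) library: clustering of the
consensus sequences, classification statistics per cluster, and annotation
statistics per TE or per cluster.
"""
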
|
|
from commons.core.LoggerFactory import LoggerFactory
from commons.core.utils.RepetOptionParser import RepetOptionParser
from commons.core.utils.FileUtils import FileUtils
from commons.core.stat.Stat import Stat
from commons.core.seq.BioseqDB import BioseqDB
from commons.launcher.LaunchBlastclust import LaunchBlastclust
from commons.tools.AnnotationStats import AnnotationStats
import os

CONSENSUS = "TE"
CLUSTER = "Cluster"
LOG_DEPTH = "repet.tools"
LOG_FORMAT = "%(message)s"

class PostAnalyzeTELib(object):
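    """
    Post-analyze a TE library. Four analyses are available:
    1) clustering of the library with BLASTclust, e.g. to detect redundancy;
    2) classification statistics per cluster;
    3) annotation statistics per TE, from a 'path' table of annotated copies
       and a 'seq' table of reference sequences;
    4) annotation statistics per cluster.
    """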

    def __init__(self, analysis = 1, fastaFileName = "", clusterFileName = "", pathTableName="", seqTableName="", genomeSize=0, configFileName = "", doClean = False, verbosity = 3):
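        """
        Constructor. The clustering parameters (identity and coverage
        thresholds, coverage required on both sequences) are only initialized
        here when analysis 1 is selected; they can be overridden through the
        command-line options -S, -L and -b.
        """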
        self._analysis = analysis
        self._fastaFileName = fastaFileName
        self._pathTableName = pathTableName
        self._seqTableName = seqTableName
        self._genomeSize = genomeSize
        if self._analysis == 1:
            self.setBioseqDB()
            self._identity = 0
            self._coverage = 80
            self._applyCovThresholdOnBothSeq = False
        self.setClusterFileName(clusterFileName)
        self.setStatPerClusterFileName()
        self.setClassifStatPerClusterFileName()
        self.setAnnotationStatsPerTEFileName()
        self.setAnnotationStatsPerClusterFileName()
        self._doClean = doClean
        self._verbosity = verbosity
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity, LOG_FORMAT)

    def setAttributesFromCmdLine(self):
        description = "Tool to post-analyze a TE library: clustering, statistics per cluster, annotation statistics, ...\n"
        epilog = "\nExample 1: clustering (e.g. to detect redundancy)\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 1 -i TElib.fa -L 98 -S 95 -b\n"
        epilog += "Example 2: classification stats per cluster\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 2 -t TElib.tab\n"
        epilog += "Example 3: annotation stats per consensus\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 3 -p project_chr_allTEs_nr_noSSR_join_path -s project_refTEs_seq -g 129919500\n"
        epilog += "Example 4: annotation stats per cluster\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 4 -t TElib.tab -p project_chr_allTEs_nr_noSSR_join_path -s project_refTEs_seq -g 129919500\n"
        parser = RepetOptionParser(description = description, epilog = epilog)
        parser.add_option("-a", "--analysis", dest = "analysis", action = "store", type = "int", help = "analysis number (1, 2, 3 or 4) [default: 1]", default = 1)
        parser.add_option("-i", "--fasta", dest = "fastaFileName", action = "store", type = "string", help = "fasta file name [for analysis 1] [format: fasta]", default = "")
        parser.add_option("-L", "--coverage", dest = "coverage", action = "store", type = "int", help = "length coverage threshold [for analysis 1] [format: %] [default: 80]", default = 80)
        parser.add_option("-S", "--identity", dest = "identity", action = "store", type = "int", help = "identity threshold [for analysis 1 with BlastClust] [format: %] [default: 0 => no threshold]", default = 0)
        parser.add_option("-b", "--both", dest = "bothSeq", action = "store_true", help = "require coverage on both neighbours [for analysis 1] [default: False]", default = False)
        parser.add_option("-t", "--cluster", dest = "clusterFileName", action = "store", type = "string", help = "cluster file name [for analysis 2 and 4] [default: <input>.tab]", default = "")
        parser.add_option("-p", "--path", dest = "pathTableName", action = "store", type = "string", help = "name of the table (_path) with the annotated TE copies [for analysis 3 and 4]", default = "")
        parser.add_option("-s", "--seq", dest = "seqTableName", action = "store", type = "string", help = "name of the table (_seq) with the TE reference sequences [for analysis 3 and 4]", default = "")
        parser.add_option("-g", "--genome", dest = "genomeSize", action = "store", type = "int", help = "genome size [for analysis 3 and 4]", default = None)
        parser.add_option("-c", "--clean", dest = "doClean", action = "store_true", help = "clean temporary files [optional] [default: False]", default = False)
        parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 1]", default = 1)
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        self.setAnalysis(options.analysis)
        if self._analysis == 1:
            self.setFastaFileName(options.fastaFileName)
            self.setBioseqDB()
            self.setIdentity(options.identity)
            self.setCoverage(options.coverage)
            self.setApplyCovThresholdOnBothSeq(options.bothSeq)
        self.setClusterFileName(options.clusterFileName)
        self.setPathTableName(options.pathTableName)
        self.setSeqTableName(options.seqTableName)
        self.setStatPerClusterFileName()
        self.setClassifStatPerClusterFileName()
        self.setAnnotationStatsPerTEFileName()
        self.setAnnotationStatsPerClusterFileName()
        self.setGenomeSize(options.genomeSize)
        self.setDoClean(options.doClean)
        self.setVerbosity(options.verbosity)

    def setAnalysis(self, analysis):
        self._analysis = analysis

    def setFastaFileName(self, fastaFileName):
        self._fastaFileName = fastaFileName

    def setBioseqDB(self):
        self._iBioseqDB = BioseqDB(name = self._fastaFileName)

    def setIdentity(self, identity):
        self._identity = identity

    def setCoverage(self, coverage):
        self._coverage = coverage

    def setApplyCovThresholdOnBothSeq(self, bothSeq):
        self._applyCovThresholdOnBothSeq = bothSeq

    def setClusterFileName(self, clusterFileName):
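        """
        If no cluster file name is provided, default to the base name of the
        input fasta file with a '.tab' extension (cf. option -t).
        """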
        if clusterFileName == "":
            self._clusterFileName = "%s.tab" % os.path.splitext(os.path.basename(self._fastaFileName))[0]
        else:
            self._clusterFileName = clusterFileName

    def setStatPerClusterFileName(self):
        self._statPerClusterFileName = "%s.statsPerCluster.tab" % os.path.splitext(self._clusterFileName)[0]
        self._globalStatPerClusterFileName = "%s.globalStatsPerCluster.txt" % os.path.splitext(self._clusterFileName)[0]

    def setClassifStatPerClusterFileName(self):
        self._classifStatPerClusterFileName = "%s.classifStatsPerCluster.tab" % os.path.splitext(self._clusterFileName)[0]

    def setAnnotationStatsPerTEFileName(self):
        self._annotStatsPerTEFileName = "%s.annotStatsPerTE.tab" % self._pathTableName
        self._globalAnnotStatsPerTEFileName = "%s.globalAnnotStatsPerTE.txt" % self._pathTableName

    def setAnnotationStatsPerClusterFileName(self):
        self._annotStatsPerClusterFileName = "%s.annotStatsPerCluster.tab" % self._pathTableName

    def setPathTableName(self, pathTableName):
        self._pathTableName = pathTableName

    def setSeqTableName(self, seqTableName):
        self._seqTableName = seqTableName

    def setGenomeSize(self, genomeSize):
        self._genomeSize = genomeSize

    def setDoClean(self, doClean):
        self._doClean = doClean

    def setVerbosity(self, verbosity):
        self._verbosity = verbosity

    def _checkOptions(self):
        if (self._fastaFileName == "" or not FileUtils.isRessourceExists(self._fastaFileName)) and self._analysis == 1:
            self._logAndRaise("ERROR: Missing fasta file")

    def _logAndRaise(self, errorMsg):
        self._log.error(errorMsg)
        raise Exception(errorMsg)

    def run(self):
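        """
        Run the selected analysis: 1) cluster the TE library with BLASTclust
        and compute statistics per cluster, 2) compute classification
        statistics per cluster, 3) compute annotation statistics per TE,
        4) compute annotation statistics per cluster.
        """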
        LoggerFactory.setLevel(self._log, self._verbosity)
        self._checkOptions()
        self._log.info("START PostAnalyzeTELib analysis %d" % self._analysis)
        if self._analysis == 1:
            #TODO: option to choose the clustering program (blastclust, MCL, uclust, ...)
            self._log.debug("TE lib: %s" % self._fastaFileName)
            self._log.info("Launch blastclust...")
            iLaunchBlastclust = LaunchBlastclust(clean = self._doClean)
            iLaunchBlastclust.setTmpFileName(self._clusterFileName)
            iLaunchBlastclust.setIdentityThreshold(self._identity)
            iLaunchBlastclust.setCoverageThreshold(self._coverage / 100.0)
            if self._applyCovThresholdOnBothSeq:
                iLaunchBlastclust.setBothSequences("T")
            else:
                iLaunchBlastclust.setBothSequences("F")
            iLaunchBlastclust.setVerbosityLevel(self._verbosity - 3)
            iLaunchBlastclust.launchBlastclust(self._fastaFileName)
            self._log.info("Compute stats...")
            self._giveStatsOnTEClusters()
        if self._analysis == 2:
            #TODO: add global stats (e.g. nb of clusters without PotentialChimeric, nb of clusters with LTR, ...)?
            self._log.debug("Consensus cluster file name: %s" % self._clusterFileName)
            self._log.info("Compute classification stats...")
            self._giveStatsOnConsensusClusters()
        if self._analysis == 3:
            #TODO: in option, save stats in DB
            self._log.info("Compute annotation stats per TE...")
            iAnnotationStats = AnnotationStats(analysisName=CONSENSUS, seqTableName=self._seqTableName, pathTableName=self._pathTableName,
                                               genomeLength=self._genomeSize, statsFileName=self._annotStatsPerTEFileName,
                                               globalStatsFileName=self._globalAnnotStatsPerTEFileName, verbosity=self._verbosity - 1)
            iAnnotationStats.run()
        if self._analysis == 4:
            self._log.info("Compute annotation stats per cluster...")
            iAnnotationStats = AnnotationStats(analysisName=CLUSTER, clusterFileName=self._clusterFileName, seqTableName=self._seqTableName,
                                               pathTableName=self._pathTableName, genomeLength=self._genomeSize,
                                               statsFileName=self._annotStatsPerClusterFileName, verbosity=self._verbosity - 1)
            iAnnotationStats.run()
        self._log.info("END PostAnalyzeTELib analysis %d" % self._analysis)

    def _giveStatsOnConsensusClusters(self):
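        """
        For each cluster (one line of the cluster file), count the 'noCat',
        chimeric ('-chim'), complete ('-comp') and incomplete ('-incomp')
        consensus, as well as the number of consensus per classification,
        the classification being read from the prefix of each consensus name
        (before the first '_').
        """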
        with open(self._classifStatPerClusterFileName, "w") as f:
            f.write("cluster\tnoCat\tPotentialChimeric\tcomp\tincomp\tclassifs (nbTEs)\n")
            with open(self._clusterFileName) as fCluster:
                for clusterId, line in enumerate(fCluster):
                    dClassifNb = {}
                    nbIncomp = 0
                    nbComp = 0
                    nbChim = 0
                    nbNoCat = 0
                    lConsensus = line.split()
                    for consensus in lConsensus:
                        classifInfos = consensus.split("_")[0]

                        if "-incomp" in classifInfos:
                            nbIncomp += 1
                            classifInfos = classifInfos.replace("-incomp", "")
                        if "-comp" in classifInfos:
                            nbComp += 1
                            classifInfos = classifInfos.replace("-comp", "")
                        if "-chim" in classifInfos:
                            nbChim += 1
                            classifInfos = classifInfos.replace("-chim", "")
                        if "noCat" in classifInfos:
                            nbNoCat += 1
                            classifInfos = classifInfos.replace("noCat", "")

                        classif = classifInfos.split("-")[-1]
                        if classif != "":
                            if dClassifNb.get(classif, None) is None:
                                dClassifNb[classif] = 0
                            dClassifNb[classif] += 1

                    occurrences = []
                    for classif, occs in dClassifNb.items():
                        occurrences.append("%s (%d)" % (classif, occs))

                    f.write("%d\t%d\t%d\t%d\t%d\t%s\n" % (clusterId + 1, nbNoCat, nbChim, nbComp, nbIncomp, "\t".join(occurrences)))

    def _giveStatsOnTEClusters(self):
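        """
        Parse the cluster file produced by BLASTclust (one cluster per line,
        sequence names separated by whitespace) and write per-cluster size
        statistics as well as global statistics on clusters and sequences.
        """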
        with open(self._clusterFileName) as fCluster:
            with open(self._statPerClusterFileName, 'w') as fStatPerCluster:
                fStatPerCluster.write("cluster\tsequencesNb\tsizeOfSmallestSeq\tsizeOfLargestSeq\taverageSize\tmedSize\n")
                line = fCluster.readline()
                clusterNb = 0
                clusterSeqList = line.split()
                minClusterSize = len(clusterSeqList)
                maxClusterSize = 0
                totalSeqNb = 0
                seqNbInBigClusters = 0
                dClusterSize2ClusterNb = {1: 0, 2: 0, 3: 0}
                while line:
                    clusterSeqList = line.split()
                    seqNb = len(clusterSeqList)
                    totalSeqNb += seqNb
                    if seqNb > 2:
                        seqNbInBigClusters += seqNb
                        dClusterSize2ClusterNb[3] += 1
                    else:
                        dClusterSize2ClusterNb[seqNb] += 1
                    if seqNb > maxClusterSize:
                        maxClusterSize = seqNb
                    if seqNb < minClusterSize:
                        minClusterSize = seqNb
                    line = fCluster.readline()
                    clusterNb += 1
                    clusterSeqLengths = self._iBioseqDB.getSeqLengthByListOfName(clusterSeqList)
                    iStatSeqLengths = Stat(clusterSeqLengths)
                    fStatPerCluster.write("%d\t%d\t%d\t%d\t%d\t%d\n" % (clusterNb, seqNb, min(clusterSeqLengths), max(clusterSeqLengths), iStatSeqLengths.mean(), iStatSeqLengths.median()))

        with open(self._globalStatPerClusterFileName, 'w') as fG:
            fG.write("nb of clusters: %d\n" % clusterNb)
            fG.write("nb of clusters with 1 sequence: %d\n" % dClusterSize2ClusterNb[1])
            fG.write("nb of clusters with 2 sequences: %d\n" % dClusterSize2ClusterNb[2])
            fG.write("nb of clusters with >2 sequences: %d (%d sequences)\n" % (dClusterSize2ClusterNb[3], seqNbInBigClusters))
            fG.write("nb of sequences: %d\n" % totalSeqNb)
            fG.write("nb of sequences in the largest cluster: %d\n" % maxClusterSize)
            fG.write("nb of sequences in the smallest cluster: %d\n" % minClusterSize)
            lSeqSizes = self._iBioseqDB.getListOfSequencesLength()
            iStat = Stat(lSeqSizes)
            fG.write("size of the smallest sequence: %d\n" % min(lSeqSizes))
            fG.write("size of the largest sequence: %d\n" % max(lSeqSizes))
            fG.write("average sequences size: %d\n" % iStat.mean())
            fG.write("median sequences size: %d\n" % iStat.median())

if __name__ == "__main__":
    iLaunch = PostAnalyzeTELib()
    iLaunch.setAttributesFromCmdLine()
    iLaunch.run()