commons/tools/PostAnalyzeTELib.py @ changeset 18:94ab73e8a190
author: m-zytnicki
date: Mon, 29 Apr 2013 03:20:15 -0400
#!/usr/bin/env python

# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

from commons.core.LoggerFactory import LoggerFactory
from commons.core.utils.RepetOptionParser import RepetOptionParser
from commons.core.utils.FileUtils import FileUtils
from commons.core.stat.Stat import Stat
from commons.core.seq.BioseqDB import BioseqDB
from commons.launcher.LaunchBlastclust import LaunchBlastclust
from commons.tools.AnnotationStats import AnnotationStats
import os

CONSENSUS = "TE"
CLUSTER = "Cluster"
LOG_DEPTH = "repet.tools"
LOG_FORMAT = "%(message)s"

class PostAnalyzeTELib(object):
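    """Post-analysis of a TE library. Four analyses are available:
    1) cluster the library with blastclust and compute statistics per cluster,
    2) compute classification statistics per cluster from a cluster file,
    3) compute annotation statistics per TE (copies recorded in a 'path' table),
    4) compute annotation statistics per cluster."""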

    def __init__(self, analysis = 1, fastaFileName = "", clusterFileName = "", pathTableName = "", seqTableName = "", genomeSize = 0, configFileName = "", doClean = False, verbosity = 3):
        self._analysis = analysis
        self._fastaFileName = fastaFileName
        self._pathTableName = pathTableName
        self._seqTableName = seqTableName
        self._genomeSize = genomeSize
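        # the TE library itself is only loaded (as a BioseqDB) for analysis 1, the clustering step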
        if self._analysis == 1:
            self.setBioseqDB()
        self._identity = 0
        self._coverage = 80
        self._applyCovThresholdOnBothSeq = False
        self.setClusterFileName(clusterFileName)
        self.setStatPerClusterFileName()
        self.setClassifStatPerClusterFileName()
        self.setAnnotationStatsPerTEFileName()
        self.setAnnotationStatsPerClusterFileName()
        self._doClean = doClean
        self._verbosity = verbosity
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity, LOG_FORMAT)

    def setAttributesFromCmdLine(self):
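        """Parse the command-line options and set the corresponding attributes."""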
        description = "Tool to post-analyze a TE library: clustering, stats per cluster, annotation stats, ...\n"
        epilog = "\nExample 1: clustering (e.g. to detect redundancy)\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 1 -i TElib.fa -L 98 -S 95 -b\n"
        epilog += "Example 2: classification stats per cluster\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 2 -t TElib.tab\n"
        epilog += "Example 3: annotation stats per consensus\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 3 -p project_chr_allTEs_nr_noSSR_join_path -s project_refTEs_seq -g 129919500\n"
        epilog += "Example 4: annotation stats per cluster\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 4 -t TElib.tab -p project_chr_allTEs_nr_noSSR_join_path -s project_refTEs_seq -g 129919500\n"
        parser = RepetOptionParser(description = description, epilog = epilog)
        parser.add_option("-a", "--analysis", dest = "analysis", action = "store", type = "int", help = "analysis number (1, 2, 3 or 4) [default: 1]", default = 1)
        parser.add_option("-i", "--fasta", dest = "fastaFileName", action = "store", type = "string", help = "fasta file name [for analysis 1] [format: fasta]", default = "")
        parser.add_option("-L", "--coverage", dest = "coverage", action = "store", type = "int", help = "length coverage threshold [for analysis 1] [format: %] [default: 80]", default = 80)
        parser.add_option("-S", "--identity", dest = "identity", action = "store", type = "int", help = "identity threshold [for analysis 1 with BlastClust] [format: %] [default: 0 => no threshold]", default = 0)
        parser.add_option("-b", "--both", dest = "bothSeq", action = "store_true", help = "require coverage on both neighbours [for analysis 1] [default: False]", default = False)
        parser.add_option("-t", "--cluster", dest = "clusterFileName", action = "store", type = "string", help = "cluster file name [for analyses 2 and 4] [default: <input>.tab]", default = "")
        parser.add_option("-p", "--path", dest = "pathTableName", action = "store", type = "string", help = "name of the table (_path) with the annotated TE copies [for analyses 3 and 4]", default = "")
        parser.add_option("-s", "--seq", dest = "seqTableName", action = "store", type = "string", help = "name of the table (_seq) with the TE reference sequences [for analyses 3 and 4]", default = "")
        parser.add_option("-g", "--genome", dest = "genomeSize", action = "store", type = "int", help = "genome size [for analyses 3 and 4]", default = None)
        parser.add_option("-c", "--clean", dest = "doClean", action = "store_true", help = "clean temporary files [optional] [default: False]", default = False)
        parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 1]", default = 1)
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        self.setAnalysis(options.analysis)
        if self._analysis == 1:
            self.setFastaFileName(options.fastaFileName)
            self.setBioseqDB()
            self.setIdentity(options.identity)
            self.setCoverage(options.coverage)
            self.setApplyCovThresholdOnBothSeq(options.bothSeq)
        self.setClusterFileName(options.clusterFileName)
        self.setPathTableName(options.pathTableName)
        self.setSeqTableName(options.seqTableName)
        self.setStatPerClusterFileName()
        self.setClassifStatPerClusterFileName()
        self.setAnnotationStatsPerTEFileName()
        self.setAnnotationStatsPerClusterFileName()
        self.setGenomeSize(options.genomeSize)
        self.setDoClean(options.doClean)
        self.setVerbosity(options.verbosity)

    def setAnalysis(self, analysis):
        self._analysis = analysis

    def setFastaFileName(self, fastaFileName):
        self._fastaFileName = fastaFileName

    def setBioseqDB(self):
        self._iBioseqDB = BioseqDB(name = self._fastaFileName)

    def setIdentity(self, identity):
        self._identity = identity

    def setCoverage(self, coverage):
        self._coverage = coverage

    def setApplyCovThresholdOnBothSeq(self, bothSeq):
        self._applyCovThresholdOnBothSeq = bothSeq

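    # By default the cluster file name is derived from the input fasta ("<basename>.tab").
    # The expected format is the blastclust output: one cluster per line, whitespace-separated sequence names.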
    def setClusterFileName(self, clusterFileName):
        if clusterFileName == "":
            self._clusterFileName = "%s.tab" % os.path.splitext(os.path.basename(self._fastaFileName))[0]
        else:
            self._clusterFileName = clusterFileName

    def setStatPerClusterFileName(self):
        self._statPerClusterFileName = "%s.statsPerCluster.tab" % os.path.splitext(self._clusterFileName)[0]
        self._globalStatPerClusterFileName = "%s.globalStatsPerCluster.txt" % os.path.splitext(self._clusterFileName)[0]

    def setClassifStatPerClusterFileName(self):
        self._classifStatPerClusterFileName = "%s.classifStatsPerCluster.tab" % os.path.splitext(self._clusterFileName)[0]

    def setAnnotationStatsPerTEFileName(self):
        self._annotStatsPerTEFileName = "%s.annotStatsPerTE.tab" % self._pathTableName
        self._globalAnnotStatsPerTEFileName = "%s.globalAnnotStatsPerTE.txt" % self._pathTableName

    def setAnnotationStatsPerClusterFileName(self):
        self._annotStatsPerClusterFileName = "%s.annotStatsPerCluster.tab" % self._pathTableName

    def setPathTableName(self, pathTableName):
        self._pathTableName = pathTableName

    def setSeqTableName(self, seqTableName):
        self._seqTableName = seqTableName

    def setGenomeSize(self, genomeSize):
        self._genomeSize = genomeSize

    def setDoClean(self, doClean):
        self._doClean = doClean

    def setVerbosity(self, verbosity):
        self._verbosity = verbosity

    def _checkOptions(self):
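        # analysis 1 is the only one that reads the TE library directly, so it requires an existing fasta file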
        if (self._fastaFileName == "" or not FileUtils.isRessourceExists(self._fastaFileName)) and self._analysis == 1:
            self._logAndRaise("ERROR: Missing fasta file")

    def _logAndRaise(self, errorMsg):
        self._log.error(errorMsg)
        raise Exception(errorMsg)

    def run(self):
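        """Run the analysis selected by self._analysis: 1 = clustering of the TE library,
        2 = classification stats per cluster, 3 = annotation stats per TE,
        4 = annotation stats per cluster."""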
        LoggerFactory.setLevel(self._log, self._verbosity)
        self._checkOptions()
        self._log.info("START PostAnalyzeTELib analysis %d" % self._analysis)
        if self._analysis == 1:
            #TODO: option to choose the clustering program (blastclust, MCL, uclust, ...)
            self._log.debug("TE lib: %s" % self._fastaFileName)
            self._log.info("Launch blastclust...")
            iLaunchBlastclust = LaunchBlastclust(clean = self._doClean)
            iLaunchBlastclust.setTmpFileName(self._clusterFileName)
            iLaunchBlastclust.setIdentityThreshold(self._identity)
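            # the -L command-line option is a percentage, but setCoverageThreshold() expects a fraction (e.g. 0.8)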
            iLaunchBlastclust.setCoverageThreshold(self._coverage / 100.0)
            if self._applyCovThresholdOnBothSeq:
                iLaunchBlastclust.setBothSequences("T")
            else:
                iLaunchBlastclust.setBothSequences("F")
            iLaunchBlastclust.setVerbosityLevel(self._verbosity - 3)
            iLaunchBlastclust.launchBlastclust(self._fastaFileName)
            self._log.info("Compute stats...")
            self._giveStatsOnTEClusters()
        if self._analysis == 2:
            #TODO: add global stats (e.g. nb of clusters without PotentialChimeric, nb of clusters with LTR, ...)?
            self._log.debug("Consensus cluster file name: %s" % self._clusterFileName)
            self._log.info("Compute classification stats...")
            self._giveStatsOnConsensusClusters()
        if self._analysis == 3:
            #TODO: optionally, save the stats in the DB
            self._log.info("Compute annotation stats per TE...")
            iAnnotationStats = AnnotationStats(analysisName = CONSENSUS, seqTableName = self._seqTableName, pathTableName = self._pathTableName,
                                               genomeLength = self._genomeSize, statsFileName = self._annotStatsPerTEFileName,
                                               globalStatsFileName = self._globalAnnotStatsPerTEFileName, verbosity = self._verbosity - 1)
            iAnnotationStats.run()
        if self._analysis == 4:
            self._log.info("Compute annotation stats per cluster...")
            iAnnotationStats = AnnotationStats(analysisName = CLUSTER, clusterFileName = self._clusterFileName, seqTableName = self._seqTableName,
                                               pathTableName = self._pathTableName, genomeLength = self._genomeSize,
                                               statsFileName = self._annotStatsPerClusterFileName, verbosity = self._verbosity - 1)
            iAnnotationStats.run()
        self._log.info("END PostAnalyzeTELib analysis %d" % self._analysis)

    def _giveStatsOnConsensusClusters(self):
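        """Write one line per cluster in the classification stats file: cluster id, numbers of
        'noCat', 'PotentialChimeric' ('-chim'), complete ('-comp') and incomplete ('-incomp')
        consensus, then the count of each remaining classification. The classification is parsed
        from the part of each sequence name located before the first '_' (names are assumed to
        look like, e.g., 'RLX-incomp_myConsensus')."""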
        with open(self._classifStatPerClusterFileName, "w") as f:
            f.write("cluster\tnoCat\tPotentialChimeric\tcomp\tincomp\tclassifs (nbTEs)\n")
            with open(self._clusterFileName) as fCluster:
                for clusterId, line in enumerate(fCluster):
                    dClassifNb = {}
                    nbIncomp = 0
                    nbComp = 0
                    nbChim = 0
                    nbNoCat = 0
                    lConsensus = line.split()
                    for consensus in lConsensus:
                        classifInfos = consensus.split("_")[0]

                        if "-incomp" in classifInfos:
                            nbIncomp += 1
                            classifInfos = classifInfos.replace("-incomp", "")
                        if "-comp" in classifInfos:
                            nbComp += 1
                            classifInfos = classifInfos.replace("-comp", "")
                        if "-chim" in classifInfos:
                            nbChim += 1
                            classifInfos = classifInfos.replace("-chim", "")
                        if "noCat" in classifInfos:
                            nbNoCat += 1
                            classifInfos = classifInfos.replace("noCat", "")

                        classif = classifInfos.split("-")[-1]
                        if classif != "":
                            if dClassifNb.get(classif, None) is None:
                                dClassifNb[classif] = 0
                            dClassifNb[classif] += 1

                    lOccurrences = []
                    for classif, occs in dClassifNb.items():
                        lOccurrences.append("%s (%d)" % (classif, occs))

                    f.write("%d\t%d\t%d\t%d\t%d\t%s\n" % (clusterId + 1, nbNoCat, nbChim, nbComp, nbIncomp, "\t".join(lOccurrences)))

    def _giveStatsOnTEClusters(self):
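        """Compute, from the cluster file and the TE library, per-cluster statistics (number of
        sequences, min/max/mean/median sequence size) and global statistics (cluster size
        distribution, sizes of the smallest/largest clusters and sequences)."""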
        with open(self._clusterFileName) as fCluster:
            with open(self._statPerClusterFileName, 'w') as fStatPerCluster:
                fStatPerCluster.write("cluster\tsequencesNb\tsizeOfSmallestSeq\tsizeOfLargestSeq\taverageSize\tmedSize\n")
                line = fCluster.readline()
                clusterNb = 0
                clusterSeqList = line.split()
                minClusterSize = len(clusterSeqList)
                maxClusterSize = 0
                totalSeqNb = 0
                seqNbInBigClusters = 0
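                # distribution of cluster sizes: key 1 = singletons, key 2 = pairs, key 3 = clusters with more than 2 sequences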
                dClusterSize2ClusterNb = {1: 0, 2: 0, 3: 0}
                while line:
                    clusterSeqList = line.split()
                    seqNb = len(clusterSeqList)
                    totalSeqNb += seqNb
                    if seqNb > 2:
                        seqNbInBigClusters += seqNb
                        dClusterSize2ClusterNb[3] += 1
                    else:
                        dClusterSize2ClusterNb[seqNb] += 1
                    if seqNb > maxClusterSize:
                        maxClusterSize = seqNb
                    if seqNb < minClusterSize:
                        minClusterSize = seqNb
                    line = fCluster.readline()
                    clusterNb += 1
                    clusterSeqLengths = self._iBioseqDB.getSeqLengthByListOfName(clusterSeqList)
                    iStatSeqLengths = Stat(clusterSeqLengths)
                    fStatPerCluster.write("%d\t%d\t%d\t%d\t%d\t%d\n" % (clusterNb, seqNb, min(clusterSeqLengths), max(clusterSeqLengths), iStatSeqLengths.mean(), iStatSeqLengths.median()))

        with open(self._globalStatPerClusterFileName, 'w') as fG:
            fG.write("nb of clusters: %d\n" % clusterNb)
            fG.write("nb of clusters with 1 sequence: %d\n" % dClusterSize2ClusterNb[1])
            fG.write("nb of clusters with 2 sequences: %d\n" % dClusterSize2ClusterNb[2])
            fG.write("nb of clusters with >2 sequences: %d (%d sequences)\n" % (dClusterSize2ClusterNb[3], seqNbInBigClusters))
            fG.write("nb of sequences: %d\n" % totalSeqNb)
            fG.write("nb of sequences in the largest cluster: %d\n" % maxClusterSize)
            fG.write("nb of sequences in the smallest cluster: %d\n" % minClusterSize)
            lSeqSizes = self._iBioseqDB.getListOfSequencesLength()
            iStat = Stat(lSeqSizes)
            fG.write("size of the smallest sequence: %d\n" % min(lSeqSizes))
            fG.write("size of the largest sequence: %d\n" % max(lSeqSizes))
            fG.write("average sequence size: %d\n" % iStat.mean())
            fG.write("median sequence size: %d\n" % iStat.median())

if __name__ == "__main__":
    iLaunch = PostAnalyzeTELib()
    iLaunch.setAttributesFromCmdLine()
    iLaunch.run()