Mercurial > repos > yufei-luo > s_mart
comparison commons/tools/PostAnalyzeTELib.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
17:b0e8584489e6 | 18:94ab73e8a190 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Copyright INRA (Institut National de la Recherche Agronomique) | |
4 # http://www.inra.fr | |
5 # http://urgi.versailles.inra.fr | |
6 # | |
7 # This software is governed by the CeCILL license under French law and | |
8 # abiding by the rules of distribution of free software. You can use, | |
9 # modify and/ or redistribute the software under the terms of the CeCILL | |
10 # license as circulated by CEA, CNRS and INRIA at the following URL | |
11 # "http://www.cecill.info". | |
12 # | |
13 # As a counterpart to the access to the source code and rights to copy, | |
14 # modify and redistribute granted by the license, users are provided only | |
15 # with a limited warranty and the software's author, the holder of the | |
16 # economic rights, and the successive licensors have only limited | |
17 # liability. | |
18 # | |
19 # In this respect, the user's attention is drawn to the risks associated | |
20 # with loading, using, modifying and/or developing or reproducing the | |
21 # software by the user in light of its specific status of free software, | |
22 # that may mean that it is complicated to manipulate, and that also | |
23 # therefore means that it is reserved for developers and experienced | |
24 # professionals having in-depth computer knowledge. Users are therefore | |
25 # encouraged to load and test the software's suitability as regards their | |
26 # requirements in conditions enabling the security of their systems and/or | |
27 # data to be ensured and, more generally, to use and operate it in the | |
28 # same conditions as regards security. | |
29 # | |
30 # The fact that you are presently reading this means that you have had | |
31 # knowledge of the CeCILL license and that you accept its terms. | |
32 | |
33 from commons.core.LoggerFactory import LoggerFactory | |
34 from commons.core.utils.RepetOptionParser import RepetOptionParser | |
35 from commons.core.utils.FileUtils import FileUtils | |
36 from commons.core.stat.Stat import Stat | |
37 from commons.core.seq.BioseqDB import BioseqDB | |
38 from commons.launcher.LaunchBlastclust import LaunchBlastclust | |
39 from commons.tools.AnnotationStats import AnnotationStats | |
40 import os | |
41 | |
# Analysis-name labels handed to AnnotationStats: per-TE ("TE") vs
# per-cluster ("Cluster") statistics.
CONSENSUS = "TE"
CLUSTER = "Cluster"
# Logger hierarchy prefix and message format used by LoggerFactory below.
LOG_DEPTH = "repet.tools"
LOG_FORMAT = "%(message)s"
46 | |
class PostAnalyzeTELib(object):
    """Post-process a transposable-element (TE) library.

    Four analyses are available, selected by the 'analysis' number:
      1. clusterize the library with BlastClust (e.g. to detect redundancy)
         and compute per-cluster and global size statistics;
      2. compute classification statistics per consensus cluster;
      3. compute annotation statistics per TE from a '_path' table;
      4. compute annotation statistics per cluster.
    """

    def __init__(self, analysis = 1, fastaFileName = "", clusterFileName = "", pathTableName="", seqTableName="", genomeSize=0, configFileName = "", doClean = False, verbosity = 3):
        # NOTE(review): 'configFileName' is accepted but never used in this class.
        self._analysis = analysis
        self._fastaFileName = fastaFileName
        self._pathTableName = pathTableName
        self._seqTableName = seqTableName
        self._genomeSize = genomeSize
        if self._analysis == 1:
            # Clustering-specific defaults; overridden from the command line
            # in _setAttributesFromOptions().
            self.setBioseqDB()
            self._identity = 0
            self._coverage = 80
            self._applyCovThresholdOnBothSeq = False
        # Output file names are derived from the cluster file / path table
        # names, so set them for every analysis.
        self.setClusterFileName(clusterFileName)
        self.setStatPerClusterFileName()
        self.setClassifStatPerClusterFileName()
        self.setAnnotationStatsPerTEFileName()
        self.setAnnotationStatsPerClusterFileName()
        self._doClean = doClean
        self._verbosity = verbosity
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity, LOG_FORMAT)

    def setAttributesFromCmdLine(self):
        """Parse the command line and configure this instance accordingly."""
        description = "Tool to post-analyze a TE library : clusterize, give stats on cluster, on annotation,...\n"
        epilog = "\nExample 1: clustering (e.g. to detect redundancy)\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 1 -i TElib.fa -L 98 -S 95 -b\n"
        epilog += "Example 2: classification stats per cluster\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 2 -t TElib.tab\n"
        epilog += "Example 3: annotation stats per consensus\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 3 -p project_chr_allTEs_nr_noSSR_join_path -s project_refTEs_seq -g 129919500\n"
        epilog += "Example 4: annotation stats per cluster\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 4 -t TElib.tab -p project_chr_allTEs_nr_noSSR_join_path -s project_refTEs_seq -g 129919500\n"
        parser = RepetOptionParser(description = description, epilog = epilog)
        parser.add_option("-a", "--analysis", dest = "analysis", action = "store", type = "int", help = "analysis number [default: 1, 2, 3, 4]", default = 1)
        parser.add_option("-i", "--fasta", dest = "fastaFileName", action = "store", type = "string", help = "fasta file name [for analysis 1] [format: fasta]", default = "")
        parser.add_option("-L", "--coverage", dest = "coverage", action = "store", type = "int", help = "length coverage threshold [for analysis 1] [format: %] [default: 80]", default = 80)
        parser.add_option("-S", "--identity", dest = "identity", action = "store", type = "int", help = "identity threshold [for analysis 1 with BlastClust] [format: %] [default: 0 => no threshold]", default = 0)
        parser.add_option("-b", "--both", dest = "bothSeq", action = "store_true", help = "require coverage on both neighbours [for analysis 1] [default: False]", default = False)
        parser.add_option("-t", "--cluster", dest = "clusterFileName", action = "store", type = "string", help = "cluster file name [for analysis 2 and 4] [default: <input>.tab]", default = "")
        parser.add_option("-p", "--path", dest = "pathTableName", action = "store", type = "string", help = "name of the table (_path) with the annotated TE copies [for analysis 3 and 4]", default = "")
        parser.add_option("-s", "--seq", dest = "seqTableName", action = "store", type = "string", help = "name of the table (_seq) with the TE reference sequences [for analysis 3 and 4]", default = "")
        # NOTE(review): default None here differs from the __init__ default (0);
        # AnnotationStats presumably handles both — confirm before changing.
        parser.add_option("-g", "--genome", dest = "genomeSize", action = "store", type = "int", help = "genome size [for analysis 3 and 4]", default = None)
        parser.add_option("-c", "--clean", dest = "doClean", action = "store_true", help = "clean temporary files [optional] [default: False]", default = False)
        parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 1]", default = 1)
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        """Copy parsed option values onto this instance (clustering options
        only when analysis 1 is requested)."""
        self.setAnalysis(options.analysis)
        if self._analysis == 1:
            self.setFastaFileName(options.fastaFileName)
            self.setBioseqDB()
            self.setIdentity(options.identity)
            self.setCoverage(options.coverage)
            self.setApplyCovThresholdOnBothSeq(options.bothSeq)
        self.setClusterFileName(options.clusterFileName)
        self.setPathTableName(options.pathTableName)
        self.setSeqTableName(options.seqTableName)
        # Derived output names must be refreshed after the inputs above.
        self.setStatPerClusterFileName()
        self.setClassifStatPerClusterFileName()
        self.setAnnotationStatsPerTEFileName()
        self.setAnnotationStatsPerClusterFileName()
        self.setGenomeSize(options.genomeSize)
        self.setDoClean(options.doClean)
        self.setVerbosity(options.verbosity)

    def setAnalysis(self, analysis):
        self._analysis = analysis

    def setFastaFileName(self, fastaFileName):
        self._fastaFileName = fastaFileName

    def setBioseqDB(self):
        # Wraps the input fasta file; used for sequence-length statistics.
        self._iBioseqDB = BioseqDB(name = self._fastaFileName)

    def setIdentity(self, identity):
        self._identity = identity

    def setCoverage(self, coverage):
        self._coverage = coverage

    def setApplyCovThresholdOnBothSeq(self, bothSeq):
        self._applyCovThresholdOnBothSeq = bothSeq

    def setClusterFileName(self, clusterFileName):
        # Default cluster file name: '<fasta basename without extension>.tab'.
        if clusterFileName == "":
            self._clusterFileName = "%s.tab" % os.path.splitext(os.path.basename(self._fastaFileName))[0]
        else:
            self._clusterFileName = clusterFileName

    def setStatPerClusterFileName(self):
        self._statPerClusterFileName = "%s.statsPerCluster.tab" % os.path.splitext(self._clusterFileName)[0]
        self._globalStatPerClusterFileName = "%s.globalStatsPerCluster.txt" % os.path.splitext(self._clusterFileName)[0]

    def setClassifStatPerClusterFileName(self):
        self._classifStatPerClusterFileName = "%s.classifStatsPerCluster.tab" % os.path.splitext(self._clusterFileName)[0]

    def setAnnotationStatsPerTEFileName(self):
        self._annotStatsPerTEFileName = "%s.annotStatsPerTE.tab" % self._pathTableName
        self._globalAnnotStatsPerTEFileName = "%s.globalAnnotStatsPerTE.txt" % self._pathTableName

    def setAnnotationStatsPerClusterFileName(self):
        self._annotStatsPerClusterFileName = "%s.annotStatsPerCluster.tab" % self._pathTableName

    def setPathTableName(self, pathTableName):
        self._pathTableName = pathTableName

    def setSeqTableName(self, seqTableName):
        self._seqTableName = seqTableName

    def setGenomeSize(self, genomeSize):
        self._genomeSize = genomeSize

    def setDoClean(self, doClean):
        self._doClean = doClean

    def setVerbosity(self, verbosity):
        self._verbosity = verbosity

    def _checkOptions(self):
        # NOTE(review): only analysis 1 inputs are validated here; analyses
        # 2-4 rely on downstream code to fail on missing files/tables.
        if (self._fastaFileName == "" or not FileUtils.isRessourceExists(self._fastaFileName)) and self._analysis == 1:
            self._logAndRaise("ERROR: Missing fasta file" )

    def _logAndRaise(self, errorMsg):
        """Log 'errorMsg' at error level, then raise it as an Exception."""
        self._log.error(errorMsg)
        raise Exception(errorMsg)

    def run(self):
        """Run the analysis selected by self._analysis (see class docstring)."""
        LoggerFactory.setLevel(self._log, self._verbosity)
        self._checkOptions()
        self._log.info("START PostAnalyzeTELib analysis %d" % self._analysis)
        if self._analysis == 1:
            #TODO: option to choose clustering program (blastclust, MCL, uclust,...)
            self._log.debug("TE lib: %s" % self._fastaFileName)
            self._log.info("Launch blastclust...")
            iLaunchBlastclust = LaunchBlastclust(clean = self._doClean)
            iLaunchBlastclust.setTmpFileName(self._clusterFileName)
            iLaunchBlastclust.setIdentityThreshold(self._identity)
            # BlastClust expects the coverage threshold as a fraction.
            iLaunchBlastclust.setCoverageThreshold(self._coverage/100.0)
            if self._applyCovThresholdOnBothSeq:
                iLaunchBlastclust.setBothSequences("T")
            else:
                iLaunchBlastclust.setBothSequences("F")
            iLaunchBlastclust.setVerbosityLevel(self._verbosity - 3)
            iLaunchBlastclust.launchBlastclust(self._fastaFileName)
            self._log.info("Compute stats...")
            self._giveStatsOnTEClusters()
        if self._analysis == 2:
            #TODO add global stats (e.g.: nb of cluster without PotentialChimeric, nb of clusters with LTR...) ?
            self._log.debug("Consensus cluster file name: %s" % self._clusterFileName)
            self._log.info("Compute classification stats...")
            self._giveStatsOnConsensusClusters()
        if self._analysis == 3:
            #TODO: in option, save stats in DB
            self._log.info("Compute annotation stats per TE...")
            # Use the module-level label constants instead of duplicating the
            # "TE"/"Cluster" literals (they were defined but unused before).
            iAnnotationStats = AnnotationStats(analysisName=CONSENSUS, seqTableName=self._seqTableName, pathTableName=self._pathTableName,\
                                               genomeLength=self._genomeSize, statsFileName=self._annotStatsPerTEFileName, globalStatsFileName=self._globalAnnotStatsPerTEFileName, verbosity=self._verbosity-1)
            iAnnotationStats.run()
        if self._analysis == 4:
            self._log.info("Compute annotation stats per cluster...")
            iAnnotationStats = AnnotationStats(analysisName=CLUSTER, clusterFileName=self._clusterFileName, seqTableName=self._seqTableName, pathTableName=self._pathTableName,\
                                               genomeLength=self._genomeSize, statsFileName=self._annotStatsPerClusterFileName, verbosity=self._verbosity-1)
            iAnnotationStats.run()
        self._log.info("END PostAnalyzeTELib analysis %d" % self._analysis)

    def _giveStatsOnConsensusClusters(self):
        """Write per-cluster classification counts to the classif-stats file.

        Each input line is one cluster: whitespace-separated consensus names
        whose first '_'-delimited token carries classification tags such as
        '-comp', '-incomp', '-chim' and 'noCat'.
        """
        with open(self._classifStatPerClusterFileName, "w") as f:
            f.write("cluster\tnoCat\tPotentialChimeric\tcomp\tincomp\tclassifs (nbTEs)\n")
            with open(self._clusterFileName) as fCluster:
                for clusterId, line in enumerate(fCluster):
                    dClassifNb = {}
                    nbIncomp =0
                    nbComp=0
                    nbChim=0
                    nbNoCat=0
                    lConsensus = line.split()
                    for consensus in lConsensus:
                        classifInfos = consensus.split("_")[0]

                        # Strip and count the tags; '-incomp' must be tested
                        # before '-comp' would otherwise never be reached.
                        if "-incomp" in classifInfos:
                            nbIncomp += 1
                            classifInfos = classifInfos.replace("-incomp","")
                        if "-comp" in classifInfos:
                            nbComp += 1
                            classifInfos = classifInfos.replace("-comp","")
                        if "-chim" in classifInfos:
                            nbChim += 1
                            classifInfos = classifInfos.replace("-chim","")
                        if "noCat" in classifInfos:
                            nbNoCat += 1
                            classifInfos = classifInfos.replace("noCat","")

                        # Remaining classification label, e.g. 'LTR' in 'TE-LTR'.
                        classif = classifInfos.split("-")[-1]
                        if classif != "":
                            if dClassifNb.get(classif, None) is None:
                                dClassifNb[classif] = 0
                            dClassifNb[classif] +=1

                    lOccurrences = []
                    for classif, occs in dClassifNb.items():
                        lOccurrences.append("%s (%d)" % (classif, occs))

                    f.write("%d\t%d\t%d\t%d\t%d\t%s\n" % (clusterId+1, nbNoCat, nbChim\
                                                          , nbComp, nbIncomp,"\t".join(lOccurrences)))

    def _giveStatsOnTEClusters(self):
        """Write per-cluster sequence-length stats and global cluster stats.

        Reads the cluster file (one whitespace-separated cluster per line)
        and the fasta DB set up by setBioseqDB().
        """
        with open(self._clusterFileName) as fCluster:
            with open(self._statPerClusterFileName, 'w') as fStatPerCluster:
                fStatPerCluster.write("cluster\tsequencesNb\tsizeOfSmallestSeq\tsizeOfLargestSeq\taverageSize\tmedSize\n")
                # Seed min/max with the first cluster's size.
                # NOTE(review): an empty cluster file leaves minClusterSize at 0.
                line = fCluster.readline()
                clusterNb = 0
                clusterSeqList= line.split()
                minClusterSize = len(clusterSeqList)
                maxClusterSize = 0
                totalSeqNb = 0
                seqNbInBigClusters = 0
                # Histogram of cluster sizes: 1, 2, and >2 members (key 3).
                dClusterSize2ClusterNb = {1:0, 2:0, 3:0}
                while line:
                    clusterSeqList= line.split()
                    seqNb = len(clusterSeqList)
                    totalSeqNb += seqNb
                    if seqNb > 2:
                        seqNbInBigClusters += seqNb
                        dClusterSize2ClusterNb[3] += 1
                    else:
                        dClusterSize2ClusterNb[seqNb] += 1
                    if seqNb > maxClusterSize:
                        maxClusterSize = seqNb
                    if seqNb < minClusterSize:
                        minClusterSize = seqNb
                    line = fCluster.readline()
                    clusterNb += 1
                    clusterSeqLengths = self._iBioseqDB.getSeqLengthByListOfName(clusterSeqList)
                    iStatSeqLengths = Stat(clusterSeqLengths)
                    fStatPerCluster.write("%d\t%d\t%d\t%d\t%d\t%d\n" %(clusterNb, seqNb, min(clusterSeqLengths), max(clusterSeqLengths), iStatSeqLengths.mean(), iStatSeqLengths.median()))

        with open(self._globalStatPerClusterFileName, 'w') as fG:
            fG.write("nb of clusters: %d\n" % clusterNb)
            fG.write("nb of clusters with 1 sequence: %d\n" % dClusterSize2ClusterNb[1])
            fG.write("nb of clusters with 2 sequences: %d\n" % dClusterSize2ClusterNb[2])
            fG.write("nb of clusters with >2 sequences: %d (%d sequences)\n" % (dClusterSize2ClusterNb[3], seqNbInBigClusters))
            fG.write("nb of sequences: %d\n" % totalSeqNb)
            fG.write("nb of sequences in the largest cluster: %d\n" % maxClusterSize)
            fG.write("nb of sequences in the smallest cluster: %d\n" % minClusterSize)
            lSeqSizes = self._iBioseqDB.getListOfSequencesLength()
            iStat = Stat(lSeqSizes)
            fG.write("size of the smallest sequence: %d\n" % min(lSeqSizes))
            fG.write("size of the largest sequence: %d\n" % max(lSeqSizes))
            fG.write("average sequences size: %d\n" % iStat.mean())
            fG.write("median sequences size: %d\n" % iStat.median())
297 | |
if __name__ == "__main__":
    # Build the tool, read its settings from the command line, then run
    # the requested analysis.
    app = PostAnalyzeTELib()
    app.setAttributesFromCmdLine()
    app.run()