Mercurial > repos > yufei-luo > s_mart
view commons/tools/PostAnalyzeTELib.py @ 19:9bcfa7936eec
Deleted selected files
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:23:29 -0400 |
parents | 94ab73e8a190 |
children |
line wrap: on
line source
#!/usr/bin/env python # Copyright INRA (Institut National de la Recherche Agronomique) # http://www.inra.fr # http://urgi.versailles.inra.fr # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. 
from commons.core.LoggerFactory import LoggerFactory
from commons.core.utils.RepetOptionParser import RepetOptionParser
from commons.core.utils.FileUtils import FileUtils
from commons.core.stat.Stat import Stat
from commons.core.seq.BioseqDB import BioseqDB
from commons.launcher.LaunchBlastclust import LaunchBlastclust
from commons.tools.AnnotationStats import AnnotationStats
import os

CONSENSUS = "TE"
CLUSTER = "Cluster"
LOG_DEPTH = "repet.tools"
LOG_FORMAT = "%(message)s"


class PostAnalyzeTELib(object):
    """Post-analyze a TE (transposable element) library.

    Four analyses are supported, selected by ``analysis``:
      1. clusterize the library with BlastClust (e.g. to detect redundancy)
         and write per-cluster / global size statistics;
      2. classification statistics per cluster, from a cluster .tab file;
      3. annotation statistics per TE consensus (needs _path/_seq tables);
      4. annotation statistics per cluster (needs cluster file + tables).
    """

    def __init__(self, analysis = 1, fastaFileName = "", clusterFileName = "", pathTableName="", seqTableName="", genomeSize=0, configFileName = "", doClean = False, verbosity = 3):
        """Constructor.

        @param analysis: analysis number (1, 2, 3 or 4)
        @param fastaFileName: TE library fasta file (analysis 1)
        @param clusterFileName: cluster file (analyses 2 and 4); defaults to
               '<fasta basename>.tab' when empty
        @param pathTableName: _path table with annotated TE copies (3 and 4)
        @param seqTableName: _seq table with TE reference sequences (3 and 4)
        @param genomeSize: genome size in bp (analyses 3 and 4)
        @param configFileName: kept for backward compatibility; currently unused
        @param doClean: remove temporary files
        @param verbosity: logging verbosity level
        """
        self._analysis = analysis
        self._fastaFileName = fastaFileName
        self._pathTableName = pathTableName
        self._seqTableName = seqTableName
        self._genomeSize = genomeSize
        if self._analysis == 1:
            # clustering parameters only make sense for analysis 1
            self.setBioseqDB()
            self._identity = 0
            self._coverage = 80
            self._applyCovThresholdOnBothSeq = False
        self.setClusterFileName(clusterFileName)
        self.setStatPerClusterFileName()
        self.setClassifStatPerClusterFileName()
        self.setAnnotationStatsPerTEFileName()
        self.setAnnotationStatsPerClusterFileName()
        self._doClean = doClean
        self._verbosity = verbosity
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity, LOG_FORMAT)

    def setAttributesFromCmdLine(self):
        """Parse the command line and set the instance attributes."""
        description = "Tool to post-analyze a TE library : clusterize, give stats on cluster, on annotation,...\n"
        epilog = "\nExample 1: clustering (e.g. to detect redundancy)\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 1 -i TElib.fa -L 98 -S 95 -b\n"
        epilog += "Example 2: classification stats per cluster\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 2 -t TElib.tab\n"
        epilog += "Example 3: annotation stats per consensus\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 3 -p project_chr_allTEs_nr_noSSR_join_path -s project_refTEs_seq -g 129919500\n"
        epilog += "Example 4: annotation stats per cluster\n"
        epilog += "\t$ python PostAnalyzeTELib.py -a 4 -t TElib.tab -p project_chr_allTEs_nr_noSSR_join_path -s project_refTEs_seq -g 129919500\n"
        parser = RepetOptionParser(description = description, epilog = epilog)
        parser.add_option("-a", "--analysis", dest = "analysis", action = "store", type = "int", help = "analysis number [default: 1, 2, 3, 4]", default = 1)
        parser.add_option("-i", "--fasta", dest = "fastaFileName", action = "store", type = "string", help = "fasta file name [for analysis 1] [format: fasta]", default = "")
        parser.add_option("-L", "--coverage", dest = "coverage", action = "store", type = "int", help = "length coverage threshold [for analysis 1] [format: %] [default: 80]", default = 80)
        parser.add_option("-S", "--identity", dest = "identity", action = "store", type = "int", help = "identity threshold [for analysis 1 with BlastClust] [format: %] [default: 0 => no threshold]", default = 0)
        parser.add_option("-b", "--both", dest = "bothSeq", action = "store_true", help = "require coverage on both neighbours [for analysis 1] [default: False]", default = False)
        parser.add_option("-t", "--cluster", dest = "clusterFileName", action = "store", type = "string", help = "cluster file name [for analysis 2 and 4] [default: <input>.tab]", default = "")
        parser.add_option("-p", "--path", dest = "pathTableName", action = "store", type = "string", help = "name of the table (_path) with the annotated TE copies [for analysis 3 and 4]", default = "")
        parser.add_option("-s", "--seq", dest = "seqTableName", action = "store", type = "string", help = "name of the table (_seq) with the TE reference sequences [for analysis 3 and 4]", default = "")
        parser.add_option("-g", "--genome", dest = "genomeSize", action = "store", type = "int", help = "genome size [for analysis 3 and 4]", default = None)
        parser.add_option("-c", "--clean", dest = "doClean", action = "store_true", help = "clean temporary files [optional] [default: False]", default = False)
        parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 1]", default = 1)
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        """Copy parsed option values into the instance attributes."""
        self.setAnalysis(options.analysis)
        if self._analysis == 1:
            self.setFastaFileName(options.fastaFileName)
            self.setBioseqDB()
            self.setIdentity(options.identity)
            self.setCoverage(options.coverage)
            self.setApplyCovThresholdOnBothSeq(options.bothSeq)
        self.setClusterFileName(options.clusterFileName)
        self.setPathTableName(options.pathTableName)
        self.setSeqTableName(options.seqTableName)
        # derived output file names depend on the cluster/path names set above
        self.setStatPerClusterFileName()
        self.setClassifStatPerClusterFileName()
        self.setAnnotationStatsPerTEFileName()
        self.setAnnotationStatsPerClusterFileName()
        self.setGenomeSize(options.genomeSize)
        self.setDoClean(options.doClean)
        self.setVerbosity(options.verbosity)

    def setAnalysis(self, analysis):
        self._analysis = analysis

    def setFastaFileName(self, fastaFileName):
        self._fastaFileName = fastaFileName

    def setBioseqDB(self):
        # wraps the input fasta so sequence lengths can be queried later
        self._iBioseqDB = BioseqDB(name = self._fastaFileName)

    def setIdentity(self, identity):
        self._identity = identity

    def setCoverage(self, coverage):
        self._coverage = coverage

    def setApplyCovThresholdOnBothSeq(self, bothSeq):
        self._applyCovThresholdOnBothSeq = bothSeq

    def setClusterFileName(self, clusterFileName):
        # default cluster file name is derived from the fasta basename
        if clusterFileName == "":
            self._clusterFileName = "%s.tab" % os.path.splitext(os.path.basename(self._fastaFileName))[0]
        else:
            self._clusterFileName = clusterFileName

    def setStatPerClusterFileName(self):
        self._statPerClusterFileName = "%s.statsPerCluster.tab" % os.path.splitext(self._clusterFileName)[0]
        self._globalStatPerClusterFileName = "%s.globalStatsPerCluster.txt" % os.path.splitext(self._clusterFileName)[0]

    def setClassifStatPerClusterFileName(self):
        self._classifStatPerClusterFileName = "%s.classifStatsPerCluster.tab" % os.path.splitext(self._clusterFileName)[0]

    def setAnnotationStatsPerTEFileName(self):
        self._annotStatsPerTEFileName = "%s.annotStatsPerTE.tab" % self._pathTableName
        self._globalAnnotStatsPerTEFileName = "%s.globalAnnotStatsPerTE.txt" % self._pathTableName

    def setAnnotationStatsPerClusterFileName(self):
        self._annotStatsPerClusterFileName = "%s.annotStatsPerCluster.tab" % self._pathTableName

    def setPathTableName(self, pathTableName):
        self._pathTableName = pathTableName

    def setSeqTableName(self, seqTableName):
        self._seqTableName = seqTableName

    def setGenomeSize(self, genomeSize):
        self._genomeSize = genomeSize

    def setDoClean(self, doClean):
        self._doClean = doClean

    def setVerbosity(self, verbosity):
        self._verbosity = verbosity

    def _checkOptions(self):
        """Validate mandatory inputs; raise (via _logAndRaise) when missing."""
        if (self._fastaFileName == "" or not FileUtils.isRessourceExists(self._fastaFileName)) and self._analysis == 1:
            self._logAndRaise("ERROR: Missing fasta file" )

    def _logAndRaise(self, errorMsg):
        """Log errorMsg at ERROR level, then raise it as an Exception."""
        self._log.error(errorMsg)
        raise Exception(errorMsg)

    def run(self):
        """Run the selected analysis."""
        LoggerFactory.setLevel(self._log, self._verbosity)
        self._checkOptions()
        self._log.info("START PostAnalyzeTELib analysis %d" % self._analysis)
        if self._analysis == 1:
            #TODO: option to choose clustering program (blastclust, MCL, uclust,...)
            self._log.debug("TE lib: %s" % self._fastaFileName)
            self._log.info("Launch blastclust...")
            iLaunchBlastclust = LaunchBlastclust(clean = self._doClean)
            iLaunchBlastclust.setTmpFileName(self._clusterFileName)
            iLaunchBlastclust.setIdentityThreshold(self._identity)
            # blastclust expects the coverage threshold as a fraction, not a percentage
            iLaunchBlastclust.setCoverageThreshold(self._coverage/100.0)
            if self._applyCovThresholdOnBothSeq:
                iLaunchBlastclust.setBothSequences("T")
            else:
                iLaunchBlastclust.setBothSequences("F")
            iLaunchBlastclust.setVerbosityLevel(self._verbosity - 3)
            iLaunchBlastclust.launchBlastclust(self._fastaFileName)
            self._log.info("Compute stats...")
            self._giveStatsOnTEClusters()
        if self._analysis == 2:
            #TODO add global stats (e.g.: nb of cluster without PotentialChimeric, nb of clusters with LTR...) ?
            self._log.debug("Consensus cluster file name: %s" % self._clusterFileName)
            self._log.info("Compute classification stats...")
            self._giveStatsOnConsensusClusters()
        if self._analysis == 3:
            #TODO: in option, save stats in DB
            self._log.info("Compute annotation stats per TE...")
            # CONSENSUS == "TE": use the module constant instead of a duplicate literal
            iAnnotationStats = AnnotationStats(analysisName=CONSENSUS, seqTableName=self._seqTableName, pathTableName=self._pathTableName,\
                                               genomeLength=self._genomeSize, statsFileName=self._annotStatsPerTEFileName, globalStatsFileName=self._globalAnnotStatsPerTEFileName, verbosity=self._verbosity-1)
            iAnnotationStats.run()
        if self._analysis == 4:
            self._log.info("Compute annotation stats per cluster...")
            # CLUSTER == "Cluster": use the module constant instead of a duplicate literal
            iAnnotationStats = AnnotationStats(analysisName=CLUSTER, clusterFileName=self._clusterFileName, seqTableName=self._seqTableName, pathTableName=self._pathTableName,\
                                               genomeLength=self._genomeSize, statsFileName=self._annotStatsPerClusterFileName, verbosity=self._verbosity-1)
            iAnnotationStats.run()
        self._log.info("END PostAnalyzeTELib analysis %d" % self._analysis)

    def _giveStatsOnConsensusClusters(self):
        """Write classification counts per cluster (analysis 2).

        Each line of the cluster file lists the consensus names of one
        cluster; classification information is parsed from the first
        '_'-separated field of each consensus name.
        """
        with open(self._classifStatPerClusterFileName, "w") as f:
            f.write("cluster\tnoCat\tPotentialChimeric\tcomp\tincomp\tclassifs (nbTEs)\n")
            with open(self._clusterFileName) as fCluster:
                for clusterId, line in enumerate(fCluster):
                    dClassifNb = {}
                    nbIncomp = 0
                    nbComp = 0
                    nbChim = 0
                    nbNoCat = 0
                    lConsensus = line.split()
                    for consensus in lConsensus:
                        classifInfos = consensus.split("_")[0]
                        # strip the completeness / chimera / noCat markers, counting each
                        if "-incomp" in classifInfos:
                            nbIncomp += 1
                            classifInfos = classifInfos.replace("-incomp","")
                        if "-comp" in classifInfos:
                            nbComp += 1
                            classifInfos = classifInfos.replace("-comp","")
                        if "-chim" in classifInfos:
                            nbChim += 1
                            classifInfos = classifInfos.replace("-chim","")
                        if "noCat" in classifInfos:
                            nbNoCat += 1
                            classifInfos = classifInfos.replace("noCat","")
                        # the classification is the last '-'-separated token left
                        classif = classifInfos.split("-")[-1]
                        if classif != "":
                            if dClassifNb.get(classif, None) is None:
                                dClassifNb[classif] = 0
                            dClassifNb[classif] += 1
                    occurrences = []
                    for classif, occs in dClassifNb.items():
                        occurrences.append("%s (%d)" % (classif, occs))
                    f.write("%d\t%d\t%d\t%d\t%d\t%s\n" % (clusterId+1, nbNoCat, nbChim\
                            , nbComp, nbIncomp,"\t".join(occurrences)))

    def _giveStatsOnTEClusters(self):
        """Write per-cluster and global size statistics (analysis 1).

        Reads the blastclust output (one cluster per line, sequence names
        separated by whitespace) and the input fasta (via self._iBioseqDB)
        to compute sequence-length statistics.
        """
        clusterNb = 0
        totalSeqNb = 0
        seqNbInBigClusters = 0
        minClusterSize = None
        maxClusterSize = 0
        # cluster-size histogram: 1 sequence, 2 sequences, >2 sequences
        dClusterSize2ClusterNb = {1: 0, 2: 0, 3: 0}
        with open(self._clusterFileName) as fCluster:
            with open(self._statPerClusterFileName, 'w') as fStatPerCluster:
                fStatPerCluster.write("cluster\tsequencesNb\tsizeOfSmallestSeq\tsizeOfLargestSeq\taverageSize\tmedSize\n")
                for line in fCluster:
                    clusterSeqList = line.split()
                    # BUGFIX: a blank line (e.g. trailing newline) used to raise
                    # KeyError on dClusterSize2ClusterNb[0] and ValueError on min([])
                    if not clusterSeqList:
                        continue
                    seqNb = len(clusterSeqList)
                    totalSeqNb += seqNb
                    if seqNb > 2:
                        seqNbInBigClusters += seqNb
                        dClusterSize2ClusterNb[3] += 1
                    else:
                        dClusterSize2ClusterNb[seqNb] += 1
                    if seqNb > maxClusterSize:
                        maxClusterSize = seqNb
                    if minClusterSize is None or seqNb < minClusterSize:
                        minClusterSize = seqNb
                    clusterNb += 1
                    clusterSeqLengths = self._iBioseqDB.getSeqLengthByListOfName(clusterSeqList)
                    iStatSeqLengths = Stat(clusterSeqLengths)
                    fStatPerCluster.write("%d\t%d\t%d\t%d\t%d\t%d\n" % (clusterNb, seqNb, min(clusterSeqLengths), max(clusterSeqLengths), iStatSeqLengths.mean(), iStatSeqLengths.median()))
        if minClusterSize is None:
            # empty cluster file: mirror the original behaviour (min size 0)
            minClusterSize = 0
        with open(self._globalStatPerClusterFileName, 'w') as fG:
            fG.write("nb of clusters: %d\n" % clusterNb)
            fG.write("nb of clusters with 1 sequence: %d\n" % dClusterSize2ClusterNb[1])
            fG.write("nb of clusters with 2 sequences: %d\n" % dClusterSize2ClusterNb[2])
            fG.write("nb of clusters with >2 sequences: %d (%d sequences)\n" % (dClusterSize2ClusterNb[3], seqNbInBigClusters))
            fG.write("nb of sequences: %d\n" % totalSeqNb)
            fG.write("nb of sequences in the largest cluster: %d\n" % maxClusterSize)
            fG.write("nb of sequences in the smallest cluster: %d\n" % minClusterSize)
            lSeqSizes = self._iBioseqDB.getListOfSequencesLength()
            iStat = Stat(lSeqSizes)
            fG.write("size of the smallest sequence: %d\n" % min(lSeqSizes))
            fG.write("size of the largest sequence: %d\n" % max(lSeqSizes))
            fG.write("average sequences size: %d\n" % iStat.mean())
            fG.write("median sequences size: %d\n" % iStat.median())


if __name__ == "__main__":
    iLaunch = PostAnalyzeTELib()
    iLaunch.setAttributesFromCmdLine()
    iLaunch.run()