Mercurial > repos > yufei-luo > s_mart
diff commons/tools/SplicerFromAnnotation.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/tools/SplicerFromAnnotation.py Mon Apr 29 03:20:15 2013 -0400 @@ -0,0 +1,218 @@ +#!/usr/bin/env python + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
import os
import sys
import ConfigParser

from commons.core.sql.DbMySql import DbMySql
from commons.core.utils.RepetOptionParser import RepetOptionParser
from commons.core.utils.FileUtils import FileUtils
from commons.core.parsing.FastaParser import FastaParser
from ConfigParser import MissingSectionHeaderError
from commons.core.sql.DbFactory import DbFactory
from commons.core.sql.TablePathAdaptator import TablePathAdaptator
from commons.core.LoggerFactory import LoggerFactory

#TODO: use configuration file

LOG_DEPTH = "repet.tools"

## Get 3 annotation files, using output from TEannot:
#- consensus with one or more full length copy,
#- consensus with one or more full length fragment,
#- consensus without copy

class SplicerFromAnnotation(object):
    """Splice selected TE annotations out of a genome fasta file.

    The tool chains three external scripts (PostAnalyzeTELib.py,
    GetSpecificTELibAccordingToAnnotation.py, SpliceTEsFromGenome.py) and,
    in between, builds a "<project>_annotationToSplice_path" table holding the
    paths of the consensus listed for the requested copy type whose identity
    is at least the requested threshold.

    The project name is the input fasta base name up to its first dot.
    """

    def __init__(self, inInfoFileName = "", tableName = "", verbose = 0):
        """Store the constructor arguments and create the logger.

        Attributes normally filled in by the setters get neutral defaults so
        that checkOptions() can report a missing option with its own message
        instead of failing with an AttributeError.
        """
        self._inInfoFileName = inInfoFileName
        self._tableName = tableName
        self._verbosity = verbose
        self._inputFastaFileName = ""
        self._projectName = ""
        self._outputFileName = ""
        self._configFileName = ""
        self._copyType = 1
        self._identity = 80
        self.genomeSize = 0
        self.nbSeqGenome = 0
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)

    def _logAndRaise(self, errorMsg):
        """Log errorMsg at error level, then raise it as an Exception."""
        self._log.error(errorMsg)
        raise Exception(errorMsg)

    def setAttributesFromCmdLine(self):
        """Parse the command line and initialise the instance from it."""
        desc = "Splice annotations from genome. These annotations are Full Length Copy or Full Length Fragment according to consensus."
        desc += "A TEs library and annotation are necessary. Connection to the database parameters are retrieved from the environment"

        examples = "\nExample : with a project called \"MyTEannotAnalysis\":\n"
        # "-t" is declared as an integer option (copy type), so the example
        # must pass a number; a table name here could not be parsed.
        examples += "\t$ python SplicerFromAnnotation.py -i inputFastaFileName -C configFileName -t 1 "
        examples += "\n\t"
        examples += "\n\n"

        parser = RepetOptionParser(description = desc, epilog = examples)
        parser.add_option("-i", "--file", dest = "inputFastaFileName", action = "store", type = "string", help = "input file (mandatory) = output file with .splice)")
        parser.add_option("-C", "--config", dest = "configFileName", action = "store", type = "string", help = "config file name to set database connection", default = "")
        parser.add_option("-t", "--copyType", dest = "copyType" , action = "store", type = "int", help = "type number [default: 1, 2] 1 is Full Length Copy", default = 1 )
        parser.add_option("-I", "--identity", dest = "identity", action= "store", type = "float", help = "identity between 0 and 100 [default: 80]", default = 80)
        parser.add_option("-o", "--outputFile",dest = "outputFile", action= "store", type = "string", help = "output fasta file (default=input File + '.splice')", default = "")
        parser.add_option("-v", "--verbose", dest = "verbose", action = "store", type = "int", help = "verbosity level (default=0)", default = 0)
        (options, args) = parser.parse_args()
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        """Copy every parsed option onto the instance through its setter."""
        self.setConfigFileName(options.configFileName)
        self.setInputFileName(options.inputFastaFileName)
        self.setOutputFileName(options.outputFile)
        self.setIdentity(options.identity)
        self.setCopyType(options.copyType)
        self.setVerbose(options.verbose)

    def setCopyType(self, copyType):
        """Set the copy type; checkOptions() only accepts 1 or 2."""
        self._copyType = copyType

    def setIdentity(self,identity):
        """Set the minimum identity used to filter the path table."""
        self._identity=identity

    def setInputFileName(self, inputFastaFileName):
        """Set the genome fasta file and derive project name and genome stats.

        An empty or missing name is stored as "" without touching the file,
        so that checkOptions() can raise its "-i is mandatory" error instead
        of this method failing on os.path.basename().
        """
        if not inputFastaFileName:
            self._inputFastaFileName = ""
            self._projectName = ""
            return
        self._inputFastaFileName = inputFastaFileName
        # Project name = file base name up to the first dot.
        self._projectName = os.path.basename(self._inputFastaFileName).split('.')[0]
        self._fF=FastaParser(self._inputFastaFileName)
        self._fF.getInfos()
        self.genomeSize=self._fF.size
        self.nbSeqGenome=self._fF.nbSequences

    def setOutputFileName(self,outputFile):
        """Set the output fasta file name ("" means: derive it later)."""
        self._outputFileName = outputFile

    def setConfigFileName(self, configFileName):
        """Read the configuration file and export its REPET_* settings.

        An empty name is only recorded; checkOptions() reports it later.
        Raises an Exception (after logging) when the file has no section
        header.
        """
        self._configFileName = configFileName or ""
        if not self._configFileName:
            return

        config = ConfigParser.ConfigParser()
        configFileHandle = open(self._configFileName)
        try:
            config.readfp( configFileHandle )
        except MissingSectionHeaderError:
            self._logAndRaise("Config file " + self._configFileName + " must begin with a section header ")
        finally:
            # The handle used to be left open for the life of the process.
            configFileHandle.close()

        self.setup_env( config )

    def setVerbose(self, verbose):
        """Set the verbosity level applied to the logger in run()."""
        self._verbosity = verbose

    def setup_env(self, config):
        """Export the "repet_env" section of config as REPET_* environment variables."""
        os.environ["REPET_HOST"] = config.get("repet_env", "repet_host")
        os.environ["REPET_USER"] = config.get("repet_env", "repet_user")
        os.environ["REPET_PW"] = config.get("repet_env", "repet_pw")
        os.environ["REPET_DB"] = config.get("repet_env", "repet_db")
        os.environ["REPET_PORT"] = config.get("repet_env", "repet_port")
        os.environ["REPET_JOB_MANAGER"] = config.get("repet_env", "repet_job_manager")

    def checkOptions(self):
        """Validate the attributes; log and raise an Exception on the first problem.

        Also fills in the default output file name (input base name +
        '.splice') and opens/closes a DbMySql connection built from the
        config file.
        """
        if self._inputFastaFileName != "":
            if not FileUtils.isRessourceExists(self._inputFastaFileName):
                self._logAndRaise("Input fasta file does not exist!")
        else:
            self._logAndRaise("No specified -i option! It is mandatory")

        if self._outputFileName =="":
            self._outputFileName = os.path.basename(self._inputFastaFileName)+'.splice'

        # The previous test "!=1 or !=2" was true for every value, so every
        # run was rejected; a membership test expresses the intended rule.
        if self._copyType not in (1, 2):
            self._logAndRaise("Copy type must be only 1 or 2!")
        if self._configFileName != "":
            iDb = DbMySql(cfgFileName = self._configFileName)
            iDb.close()
        else:
            self._logAndRaise("No specified config file name!")

    def _buildAnnotationToSpliceTable(self, lConsensusNames):
        """Create the "<project>_annotationToSplice_path" table for the given consensus names."""
        # NOTE(review): table names are built from the input file name; this
        # assumes a trusted, SQL-safe project name - confirm against callers.
        joinPathTable = "%s_chr_allTEs_nr_noSSR_join_path" % self._projectName
        identityTable = "%s_annotationIdentitySup%d_path" % (self._projectName, int(self._identity))

        db = DbFactory.createInstance()
        try:
            sql_cmd = "CREATE TABLE %s_annotationIdentitySup%d_path SELECT * FROM %s_chr_allTEs_nr_noSSR_join_path where identity >=%f" % ( self._projectName,int(self._identity),self._projectName,self._identity)
            db.execute( sql_cmd )

            iTPA = TablePathAdaptator(db, identityTable)
            lAllDistinctPath = []
            for consensusName in lConsensusNames:
                # extend() avoids rebuilding the whole list for each consensus.
                lAllDistinctPath.extend(iTPA.getIdListFromSubject(consensusName))

            # NOTE(review): this adaptator is never used afterwards; it is kept
            # in case its constructor has side effects - confirm and remove.
            iTPA = TablePathAdaptator(db, joinPathTable)
            sql_cmd = "CREATE TABLE %s_annotationToSplice_path LIKE %s_chr_allTEs_nr_noSSR_join_path" % ( self._projectName, self._projectName )
            db.execute( sql_cmd )

            for pathId in lAllDistinctPath:
                sql_cmd = "INSERT INTO %s_annotationToSplice_path SELECT * FROM %s_chr_allTEs_nr_noSSR_join_path where path =%d" % ( self._projectName, self._projectName, pathId )
                db.execute( sql_cmd )
        finally:
            # Close the connection even when one of the statements fails.
            db.close()

    def run(self):
        """Run the whole pipeline and return 0.

        Raises an Exception when checkOptions() rejects the settings.
        """
        LoggerFactory.setLevel(self._log, self._verbosity)
        self.checkOptions()

        msg = "START SplicerFromAnnotation"
        msg += "\n input info file: %s" % self._inputFastaFileName
        msg += "\n Copy type is: %s" % self._copyType
        msg += "\n identity is: %s" % self._identity
        msg += "\n host is: %s" % os.environ["REPET_HOST"]
        msg += "\n user is: %s" % os.environ["REPET_USER"]
        msg += "\n DB is: %s" % os.environ["REPET_DB"]
        msg += "\n port is: %s" % os.environ["REPET_PORT"]
        self._log.debug("%s\n" % msg)

        cmd="PostAnalyzeTELib.py -a 3 -p %s_chr_allTEs_nr_noSSR_join_path -s %s_refTEs_seq -g %s" % (self._projectName,self._projectName,self.genomeSize)
        os.system(cmd)

        cmd="GetSpecificTELibAccordingToAnnotation.py -i %s_chr_allTEs_nr_noSSR_join_path.annotStatsPerTE.tab -t %s_refTEs_seq -v 2" % (self._projectName,self._projectName)
        os.system(cmd)

        # NOTE(review): these names hard-code the "Splicer_inputFile" prefix
        # while the command above is given a "<project>_..." input; presumably
        # the prefix should be the project name - verify before changing.
        if self._copyType == 1 :
            statsFileName = "Splicer_inputFile_chr_allTEs_nr_noSSR_join_path.annotStatsPerTE_FullLengthCopy.txt"
        else :
            statsFileName = "Splicer_inputFile_chr_allTEs_nr_noSSR_join_path.annotStatsPerTE_FullLengthFrag.txt"

        # Read everything up front and release the handle immediately, so it
        # is not leaked if the database or splicing steps fail.
        f = open(statsFileName, "r")
        try:
            lines = f.readlines()[1:]   # skip the first line
        finally:
            f.close()

        if lines:
            # Consensus name = first tab-separated field of each line.
            lConsensusHeader_copyType = [line.split('\t',1)[0] for line in lines]
            self._buildAnnotationToSpliceTable(lConsensusHeader_copyType)

            cmd="SpliceTEsFromGenome.py -i %s_annotationToSplice_path -f path -g %s -o %s -C %s -v 2" % (self._projectName, self._inputFastaFileName, self._outputFileName, self._configFileName)
            os.system(cmd)
        else :
            msg = "There is no consensus in this copy type.\n"
            self._log.info(msg)

        self._log.info("END SplicerFromAnnotation")
        return 0

if __name__ == '__main__':
    iGetTELib = SplicerFromAnnotation()
    iGetTELib.setAttributesFromCmdLine()
    iGetTELib.run()