Mercurial > repos > yufei-luo > s_mart
diff commons/tools/GFF3Maker.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/tools/GFF3Maker.py Mon Apr 29 03:20:15 2013 -0400 @@ -0,0 +1,498 @@ +#!/usr/bin/env python + +##@file GFF3Maker.py + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + +from commons.core.utils.RepetOptionParser import RepetOptionParser +from commons.core.utils.FileUtils import FileUtils +from commons.core.sql.DbFactory import DbFactory +from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator +from commons.core.sql.TablePathAdaptator import TablePathAdaptator +import sys +import os + +## GFF3Maker exports annotations from a 'path' table into a GFF3 file. +# +class GFF3Maker(object): + + def __init__(self, inFastaName = "", tablesFileName = "", classifTableName = "", isChado = False, isGFF3WithoutAnnotation = False, isWithSequence = False, areMatchPartsCompulsory = False, configFileName = "", verbose = 0, doMergeIdenticalMatches = False, doSplit = False): + self._inFastaName = inFastaName + self._tablesFileName = tablesFileName + self._classifTableName = classifTableName + self._isChado = isChado + self._isGFF3WithoutAnnotation = isGFF3WithoutAnnotation + self._isWithSequence = isWithSequence + self._areMatchPartsCompulsory = areMatchPartsCompulsory + self._configFileName = configFileName + self._doMergeIdenticalMatches = doMergeIdenticalMatches + self._doSplit = doSplit + self._iDB = None + self._verbose = verbose + + def setAttributesFromCmdLine(self): + description = "GFF3Maker exports annotations from 'path', 'set' and/or 'classif' tables into a GFF3 file\n" + parser = RepetOptionParser(description = description) + parser.add_option("-f", "--inseq", dest = "inFastaName", action = "store", type = "string", help = "'seq' table recording the input sequences", default = "") + parser.add_option("-t", "--tablesfile", dest = "tablesFileName", action = "store", type = "string", help = "tabulated file of table name to use to create the gff3 files (fields: tier name, format, table name)", default = "") + parser.add_option("-w", "--withSequence", dest = "isWithSequence", action = "store_true", help = "write the sequence at the end of GFF3 file", default = False) + parser.add_option("-a", "--withoutAnnotation", dest = "isGFF3WithoutAnnotation", action = "store_true", help = "write GFF3 files even if no annotation", default = False) + parser.add_option("-p", "--matchPart", dest = "areMatchPartsCompulsory", action = "store_true", help = "always associate a match_part to a match", default = False) + parser.add_option("-i", "--classifTable", dest = "classifTable", action = "store", type = "string", help = "name of the TE library classification table [optional]", default = "") + parser.add_option("-c", "--chado", dest = "isChado", action = "store_true", help = "Chado compliance", default = False) + parser.add_option("-m", "--doMergeIdenticalMatches", dest = "doMergeIdenticalMatches", action = "store_true", help = "merge identical matches based on query start, query end, score", default = False) + parser.add_option("-s", "--doSplit", dest = "doSplit", action = "store_true", help = "split each GFF3 per annotation type", default = False) + parser.add_option("-C", "--config", dest = "configFileName", action = "store", type = "string", help = "configuration file for database connection", default = "") + parser.add_option("-v", "--verbose", dest = "verbose", action = "store", type = "int", help = "verbosity level (default=0, else 1 or 2)", default = 0) + options = parser.parse_args()[0] + self._setAttributesFromOptions(options) + + #TODO: write a "setAttributesFromConfig" + def _setAttributesFromOptions(self, options): + self.setInFastaName(options.inFastaName) + self.setTablesFileName(options.tablesFileName) + self.setClassifTable(options.classifTable) + self.setIsChado(options.isChado) + self.setDoMergeIdenticalMatches(options.doMergeIdenticalMatches) + self.setIsWithSequence(options.isWithSequence) + self.setIsGFF3WithoutAnnotation(options.isGFF3WithoutAnnotation) + self.setAreMatchPartCompulsory(options.areMatchPartsCompulsory) + self.setDoSplit(options.doSplit) + self.setConfigFileName(options.configFileName) + self.setVerbose(options.verbose) + + def setInFastaName(self, inFastaName): + self._inFastaName = inFastaName + + def setTablesFileName(self, tablesFileName): + self._tablesFileName = tablesFileName + + def setIsWithSequence(self, isWithSequence): + self._isWithSequence = isWithSequence + + def setIsGFF3WithoutAnnotation(self, isGFF3WithoutAnnotation): + self._isGFF3WithoutAnnotation = isGFF3WithoutAnnotation + + def setAreMatchPartCompulsory(self, areMatchPartsCompulsory): + self._areMatchPartsCompulsory = areMatchPartsCompulsory + + def setClassifTable(self, classifTable): + self._classifTableName = classifTable + + def setIsChado(self, isChado): + self._isChado = isChado + + def setDoMergeIdenticalMatches(self, doMergeIdenticalMatches): + self._doMergeIdenticalMatches = doMergeIdenticalMatches + + def setDoSplit(self, doSplit): + self._doSplit = doSplit + + def setConfigFileName(self, configFileName): + self._configFileName = configFileName + + def setVerbose(self, verbose): + self._verbose = verbose + + def checkOptions(self): + if self._inFastaName == "": + raise Exception("ERROR: options -f required") + + if self._configFileName != "": + if not FileUtils.isRessourceExists(self._configFileName): + raise Exception("ERROR: configuration file does not exist!") + + if self._classifTableName and not self._iDB.doesTableExist(self._classifTableName): + raise Exception("ERROR: classification table '%s' does not exist!" % self._classifTableName) + + ## Retrieve the features to write in the GFF3 file. + # + # @param pathTable string name of the table recording the annotations (i.e. the features) + # @param seqName string name of the sequence (the source feature) on which we want to visualize the matches (the features) + # @param source string the program that generated the feature (i.e. REPET) + # @param frame string "." by default (or "+", "-") + # @return pathString string which will be printed in path file + # + def _getPathFeatures(self, pathTable, seqTable, seqName, source, feature, frame): + pathString = "" + iTPA = TablePathAdaptator(self._iDB, pathTable) + lPaths = iTPA.getPathListSortedByQueryCoordAndScoreFromQuery(seqName) + # organise them into 'match' and 'match_part' + if lPaths: + dPathID2Data = self._gatherSamePathFeatures(lPaths) + # build the output string + for pathID in dPathID2Data: + pathString += self._organizeEachPathFeature(pathID, dPathID2Data[pathID], seqName, source, frame, seqTable) + return pathString + + ## Gather matches with the same path ID. + # + # @param data list of string lists results of a SQL request + # @return dPathID2Matchs dict whose keys are path IDs and values are matches data + # + def _gatherSamePathFeatures(self, lPaths): + dPathID2Matchs = {} + iPreviousPath = lPaths[0] + iPreviousPath.otherTargets = [] + for iPath in lPaths[1:]: + if self._doMergeIdenticalMatches and iPreviousPath.getQueryStart() == iPath.getQueryStart() and iPreviousPath.getQueryEnd() == iPath.getQueryEnd() and iPreviousPath.getScore() == iPath.getScore() and iPreviousPath.getSubjectName() != iPath.getSubjectName(): + iPreviousPath.otherTargets.append("%s %d %d" % (iPath.getSubjectName(), iPath.getSubjectStart(), iPath.getSubjectEnd())) + else: + self._addPathToMatchDict(dPathID2Matchs, iPreviousPath) + iPreviousPath = iPath + iPreviousPath.otherTargets = [] + self._addPathToMatchDict(dPathID2Matchs, iPreviousPath) + return dPathID2Matchs + + def _getClassifEvidenceBySeqName(self, seqName): + lseqName = seqName.split('_') + seqNameStructure = '%s_%%' % lseqName[0] + if lseqName[-1] == 'reversed': + seqNameStructure += '%s_%s' % (lseqName[-2],lseqName[-1]) + else: + seqNameStructure += lseqName[-1] + qry = "SELECT evidence FROM %s WHERE seq_name like \"%s\"" % (self._classifTableName, seqNameStructure) + self._iDB.execute(qry) + result = "" + try: + result = "".join(self._iDB.fetchall()[0]) + except: pass + return result + + def _addPathToMatchDict(self, dPathID2Matchs, iPreviousPath): + pathId = iPreviousPath.getIdentifier() + subjectStart = iPreviousPath.getSubjectStart() + subjectEnd = iPreviousPath.getSubjectEnd() + strand = iPreviousPath.getSubjectStrand() + if subjectStart > subjectEnd: + tmp = subjectStart + subjectStart = subjectEnd + subjectEnd = tmp + queryStart = iPreviousPath.getQueryStart() + queryEnd = iPreviousPath.getQueryEnd() + subjectName = iPreviousPath.getSubjectName() + eValue = iPreviousPath.getEvalue() + identity = iPreviousPath.getIdentity() + otherTargets = iPreviousPath.otherTargets + if dPathID2Matchs.has_key(pathId): + dPathID2Matchs[pathId].append([queryStart, queryEnd, strand, subjectName, subjectStart, subjectEnd, eValue, identity, otherTargets]) + else: + dPathID2Matchs[pathId] = [[queryStart, queryEnd, strand, subjectName, subjectStart, subjectEnd, eValue, identity, otherTargets]] + + def _getConsensusLengthByTargetName(self, targetName, seqTableName): + iTableSeqAdaptator = TableSeqAdaptator(self._iDB, seqTableName) + return iTableSeqAdaptator.getSeqLengthFromDescription(targetName) + + ## For a specific path ID, organize match data according to the GFF3 format. + # + # @param pathID string path ID + # @param lMatches match list + # @param seqName string name of the source feature + # @param source string 'source' field for GFF3 format + # @param frame string 'frame' field for GFF3 format + # @return lines string to write in the GFF3 file + # + def _organizeEachPathFeature(self, pathID, lMatches, seqName, source, frame, seqTable = ""): + lines = "" + minStart = lMatches[0][0] + maxEnd = lMatches[0][1] + minStartSubject = lMatches[0][4] + maxEndSubject = lMatches[0][5] + strand = lMatches[0][2] + + # for each match + for i in lMatches: + if i[0] < minStart: + minStart = i[0] + if i[1] > maxEnd: + maxEnd = i[1] + if i[4] < minStartSubject: + minStartSubject = i[4] + if i[5] > maxEndSubject: + maxEndSubject = i[5] + + target = lMatches[0][3] + + targetDescTag = "" + if self._classifTableName != "": + targetDescTag = self._getClassifEvidenceBySeqName(target) + if targetDescTag != "": + targetDescTag = ";TargetDescription=%s" % targetDescTag.replace('=', ':').replace(';', '').replace(',', ' |') + + targetLengthTag = "" + if seqTable != "": + targetLengthTag = self._getConsensusLengthByTargetName(target, seqTable) + targetLengthTag = ";TargetLength=%i" % targetLengthTag + + attributes = "ID=ms%s_%s_%s" % (pathID, seqName, target) + if self._isChado: + attributes += ";Target=%s+%s+%s" % (target, minStartSubject, maxEndSubject) + else: + attributes += ";Target=%s %s %s" % (target, minStartSubject, maxEndSubject) + + if lMatches[0][8]: + otherTargets = ", ".join(lMatches[0][8]) + attributes += ";OtherTargets=%s" % otherTargets + else: + attributes += targetLengthTag + attributes += targetDescTag + if len(lMatches) == 1: + attributes += ";Identity=%s" % lMatches[0][7] + + + lines += "%s\t%s\tmatch\t%s\t%s\t0.0\t%s\t%s\t%s\n" % (seqName, source, minStart, maxEnd, strand, frame, attributes) + + if len(lMatches) > 1 or self._areMatchPartsCompulsory: + count = 1 + for i in lMatches: + attributes = "ID=mp%s-%i_%s_%s" % (pathID, count, seqName, target) + attributes += ";Parent=ms%s%s%s%s%s" % (pathID, "_", seqName, "_", target) + if self._isChado: + attributes += ";Target=%s+%s+%s" % (target, i[4], i[5]) + else: + attributes += ";Target=%s %s %s" % (target, i[4], i[5]) + + if not i[8]: + attributes += ";Identity=%s" % i[7] + lines += "%s\t%s\tmatch_part\t%s\t%s\t%s\t%s\t%s\t%s\n" % (seqName, source, i[0], i[1], i[6], i[2], frame, attributes) + count += 1 + + return lines + + ## Retrieve the features to write in the GFF3 file. + # + # @param table string name of the table recording the annotations (i.e. the features) + # @param key string name of the sequence (the source feature) on which we want to visualize the matches (the features) + # @param source string the program that generated the feature (i.e. REPET) + # @param frame string "." by default (or "+", "-") + # @return setString string which will be printed in set file + # + def _getSetFeatures(self, table, key, source, feature, frame): + setString = "" + # retrieve all the data about the matches + qry = "SELECT DISTINCT path,name,start,end FROM %s WHERE chr=\"%s\"" % (table, key) + self._iDB.execute(qry) + data = self._iDB.fetchall() + # organise them into 'match' and 'match_part' + dPathID2Data = self._gatherSameSetFeatures(data) + # build the output string + for pathID in dPathID2Data: + setString += self._organizeEachSetFeature(pathID, dPathID2Data[pathID], key, source, frame) + return setString + + ## Gather matches with the same path ID. + # + # @param data list of string lists results of a SQL request + # @return dSetID2Matchs dict whose keys are set IDs and values are matches data + # + def _gatherSameSetFeatures(self, data): + dSetID2Matchs = {} + + for i in data: + setID = i[0] + name = i[1] + start = i[2] + end = i[3] + + matchStart = min(start, end) + matchEnd = max(start, end) + strand = self._getStrand(start, end) + + if dSetID2Matchs.has_key(setID): + dSetID2Matchs[setID].append([name, matchStart, matchEnd, strand]) + + else: + dSetID2Matchs[setID] = [[name, matchStart, matchEnd, strand]] + + return dSetID2Matchs + + ## For a specific set ID, organize match data according to the GFF3 format. + # + # @param setID string path ID + # @param lMatches match list + # @param seqName string name of the source feature + # @param source string 'source' field for GFF3 format + # @param frame string 'frame' field for GFF3 format + # @return lines string to write in the GFF3 file + # + def _organizeEachSetFeature(self, setID, lMatches, seqName, source, frame): + lines = "" + minStart = min(lMatches[0][1], lMatches[0][2]) + maxEnd = max(lMatches[0][1], lMatches[0][2]) + strand = lMatches[0][3] + + # for each match + for i in lMatches: + start = min(i[1],i[2]) + if start < minStart: + minStart = start + end = max(i[1],i[2]) + if end > maxEnd: + maxEnd = end + + target = lMatches[0][0].replace("(","").replace(")","").replace("#","") + + attributes = "ID=ms%s_%s_%s" % (setID, seqName, target) + if self._isChado: + attributes += ";Target=%s+%s+%s" % (target, "1", abs(minStart-maxEnd)+1) + else: + attributes += ";Target=%s %s %s" % (target, "1", abs(minStart-maxEnd)+1) + lines += "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (seqName, source, "match", minStart, maxEnd, "0.0", strand, frame, attributes) + + if len(lMatches) > 1 or self._areMatchPartsCompulsory: + count = 1 + for i in lMatches: + attributes = "ID=mp%s-%i_%s_%s" % (setID, count, seqName, target) + attributes += ";Parent=ms%s%s%s%s%s" % (setID, "_", seqName, "_", target) + if self._isChado: + attributes += ";Target=%s+%s+%s" % (target, "1", abs(i[1]-i[2])+1) + else: + attributes += ";Target=%s %s %s" % (target, "1", abs(i[1]-i[2])+1) + lines += "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (seqName, source, "match_part", i[1], i[2], "0.0", i[3], frame, attributes) + count += 1 + + return lines + + ## Return the strand ('+' if start < end, '-' otherwise). + # + # @param start integer start coordinate + # @param end integer end coordinate + # @return strand string "+" or "-" + # + def _getStrand(self, start, end): + if start < end: + return "+" + else: + return "-" + + def run(self): + #TODO: cat all gff in one gff file in option + if self._configFileName == "": + self._iDB = DbFactory.createInstance() + else: + self._iDB = DbFactory.createInstance(self._configFileName) + + self.checkOptions() + if self._verbose > 0: + print "START GFF3Maker" + sys.stdout.flush() + + tablesFile = open(self._tablesFileName, "r") + linesFromAnnotationTablesFile = tablesFile.readlines() + tablesFile.close() + feature = "region" + frame = "." + + iTSA = TableSeqAdaptator(self._iDB, self._inFastaName) + lTuples = iTSA.getAccessionAndLengthList() + for seqName, length in lTuples : + if not self._doSplit: + fileName = "%s.gff3" % seqName + outFile = open(fileName, "w") + outFile.write("##gff-version 3\n") + outFile.write("##sequence-region %s 1 %s\n" % (seqName, length)) + for line in linesFromAnnotationTablesFile: + if line[0] == "#": + continue + tok = line.split() + if len(tok) == 0: + break + source = tok[0] + format = tok[1] + table = tok[2] + tableseq = "" + if len(tok) == 4: + tableseq = tok[3] + if format == 'path' : + annotations = self._getPathFeatures(table, tableseq, seqName, source, feature, frame) + elif format == 'set' : + annotations = self._getSetFeatures(table, seqName, source, feature, frame) + else: + raise Exception("Wrong format : %s" % format) + outFile.write(annotations) + outFile.close() + #TODO: check getNbLinesInSingleFile() to handle big files + if not self._isGFF3WithoutAnnotation and FileUtils.getNbLinesInSingleFile(fileName) == 2: + os.remove(fileName) + elif self._isWithSequence: + outFile = open(fileName, "a") + outFile.write("##FASTA\n") + iBioseq = iTSA.getBioseqFromHeader(seqName) + iBioseq.write(outFile) + outFile.close() + else: + count = 1 + for line in linesFromAnnotationTablesFile: + if line[0] == "#": + continue + tok = line.split() + if len(tok) == 0: + break + source = tok[0] + format = tok[1] + table = tok[2] + tableseq = "" + if len(tok) == 4: + tableseq = tok[3] + fileName = "%s_Annot%i.gff3" % (seqName, count) + outFile = open(fileName, "w") + outFile.write("##gff-version 3\n") + outFile.write("##sequence-region %s 1 %s\n" % (seqName, length)) + if format == 'path' : + annotations = self._getPathFeatures(table, tableseq, seqName, source, feature, frame) + elif format == 'set' : + annotations = self._getSetFeatures(table, seqName, source, feature, frame) + else: + raise Exception("Wrong format : %s" % format) + outFile.write(annotations) + outFile.close() + #TODO: check getNbLinesInSingleFile() to handle big files + if not self._isGFF3WithoutAnnotation and FileUtils.getNbLinesInSingleFile(fileName) == 2: + os.remove(fileName) + elif self._isWithSequence: + outFile = open(fileName, "a") + outFile.write("##FASTA\n") + iBioseq = iTSA.getBioseqFromHeader(seqName) + iBioseq.write(outFile) + outFile.close() + count += 1 + + self._iDB.close() + + if self._verbose > 0: + print "END GFF3Maker" + sys.stdout.flush() + +if __name__ == "__main__": + iGFF3Maker = GFF3Maker() + iGFF3Maker.setAttributesFromCmdLine() + iGFF3Maker.run() + \ No newline at end of file