annotate commons/tools/GFF3Maker.py @ 19:9bcfa7936eec

Deleted selected files
author m-zytnicki
date Mon, 29 Apr 2013 03:23:29 -0400
parents 94ab73e8a190
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
1 #!/usr/bin/env python
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
2
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
3 ##@file GFF3Maker.py
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
4
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
5 # Copyright INRA (Institut National de la Recherche Agronomique)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
6 # http://www.inra.fr
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
7 # http://urgi.versailles.inra.fr
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
8 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
9 # This software is governed by the CeCILL license under French law and
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
10 # abiding by the rules of distribution of free software. You can use,
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
11 # modify and/ or redistribute the software under the terms of the CeCILL
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
12 # license as circulated by CEA, CNRS and INRIA at the following URL
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
13 # "http://www.cecill.info".
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
14 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
15 # As a counterpart to the access to the source code and rights to copy,
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
16 # modify and redistribute granted by the license, users are provided only
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
17 # with a limited warranty and the software's author, the holder of the
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
18 # economic rights, and the successive licensors have only limited
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
19 # liability.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
20 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
21 # In this respect, the user's attention is drawn to the risks associated
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
22 # with loading, using, modifying and/or developing or reproducing the
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
23 # software by the user in light of its specific status of free software,
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
24 # that may mean that it is complicated to manipulate, and that also
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
25 # therefore means that it is reserved for developers and experienced
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
26 # professionals having in-depth computer knowledge. Users are therefore
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
27 # encouraged to load and test the software's suitability as regards their
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
28 # requirements in conditions enabling the security of their systems and/or
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
29 # data to be ensured and, more generally, to use and operate it in the
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
30 # same conditions as regards security.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
31 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
32 # The fact that you are presently reading this means that you have had
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
33 # knowledge of the CeCILL license and that you accept its terms.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
34
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
35 from commons.core.utils.RepetOptionParser import RepetOptionParser
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
36 from commons.core.utils.FileUtils import FileUtils
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
37 from commons.core.sql.DbFactory import DbFactory
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
38 from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
39 from commons.core.sql.TablePathAdaptator import TablePathAdaptator
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
40 import sys
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
41 import os
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
42
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
43 ## GFF3Maker exports annotations from a 'path' table into a GFF3 file.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
44 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
45 class GFF3Maker(object):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
46
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def __init__(self,
             inFastaName = "",
             tablesFileName = "",
             classifTableName = "",
             isChado = False,
             isGFF3WithoutAnnotation = False,
             isWithSequence = False,
             areMatchPartsCompulsory = False,
             configFileName = "",
             verbose = 0,
             doMergeIdenticalMatches = False,
             doSplit = False):
    """Store the export settings on the instance.

    Every argument is copied verbatim into the private attribute of the
    same name (prefixed with an underscore). The database handle
    ``_iDB`` starts as None; this method does not open any connection.
    """
    # Inputs: sequences, list of tables, optional classification table
    # and optional database configuration file.
    self._inFastaName = inFastaName
    self._tablesFileName = tablesFileName
    self._classifTableName = classifTableName
    self._configFileName = configFileName

    # Flags controlling the content and layout of the GFF3 output.
    self._isChado = isChado
    self._isGFF3WithoutAnnotation = isGFF3WithoutAnnotation
    self._isWithSequence = isWithSequence
    self._areMatchPartsCompulsory = areMatchPartsCompulsory
    self._doMergeIdenticalMatches = doMergeIdenticalMatches
    self._doSplit = doSplit

    # Runtime state.
    self._verbose = verbose
    self._iDB = None
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
60
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def setAttributesFromCmdLine(self):
    """Declare the command-line options, parse them and store the values.

    The parsed option object is handed to _setAttributesFromOptions;
    positional arguments returned by the parser are ignored.
    """
    usageText = "GFF3Maker exports annotations from 'path', 'set' and/or 'classif' tables into a GFF3 file\n"
    optParser = RepetOptionParser(description = usageText)

    # Input data.
    optParser.add_option(
        "-f", "--inseq",
        dest = "inFastaName",
        action = "store",
        type = "string",
        help = "'seq' table recording the input sequences",
        default = "")
    optParser.add_option(
        "-t", "--tablesfile",
        dest = "tablesFileName",
        action = "store",
        type = "string",
        help = "tabulated file of table name to use to create the gff3 files (fields: tier name, format, table name)",
        default = "")

    # Output content switches.
    optParser.add_option(
        "-w", "--withSequence",
        dest = "isWithSequence",
        action = "store_true",
        help = "write the sequence at the end of GFF3 file",
        default = False)
    optParser.add_option(
        "-a", "--withoutAnnotation",
        dest = "isGFF3WithoutAnnotation",
        action = "store_true",
        help = "write GFF3 files even if no annotation",
        default = False)
    optParser.add_option(
        "-p", "--matchPart",
        dest = "areMatchPartsCompulsory",
        action = "store_true",
        help = "always associate a match_part to a match",
        default = False)
    optParser.add_option(
        "-i", "--classifTable",
        dest = "classifTable",
        action = "store",
        type = "string",
        help = "name of the TE library classification table [optional]",
        default = "")
    optParser.add_option(
        "-c", "--chado",
        dest = "isChado",
        action = "store_true",
        help = "Chado compliance",
        default = False)
    optParser.add_option(
        "-m", "--doMergeIdenticalMatches",
        dest = "doMergeIdenticalMatches",
        action = "store_true",
        help = "merge identical matches based on query start, query end, score",
        default = False)
    optParser.add_option(
        "-s", "--doSplit",
        dest = "doSplit",
        action = "store_true",
        help = "split each GFF3 per annotation type",
        default = False)

    # Environment.
    optParser.add_option(
        "-C", "--config",
        dest = "configFileName",
        action = "store",
        type = "string",
        help = "configuration file for database connection",
        default = "")
    optParser.add_option(
        "-v", "--verbose",
        dest = "verbose",
        action = "store",
        type = "int",
        help = "verbosity level (default=0, else 1 or 2)",
        default = 0)

    # parse_args() returns a sequence; only its first item (the option
    # values) is used here.
    parsedOptions = optParser.parse_args()[0]
    self._setAttributesFromOptions(parsedOptions)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
77
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
78 #TODO: write a "setAttributesFromConfig"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def _setAttributesFromOptions(self, options):
    """Copy each parsed command-line option into the instance via its setter.

    ``options`` must expose the attributes declared as ``dest`` in
    setAttributesFromCmdLine.
    """
    # Input data.
    self.setInFastaName(options.inFastaName)
    self.setTablesFileName(options.tablesFileName)
    self.setClassifTable(options.classifTable)

    # Output content switches.
    self.setIsWithSequence(options.isWithSequence)
    self.setIsGFF3WithoutAnnotation(options.isGFF3WithoutAnnotation)
    self.setAreMatchPartCompulsory(options.areMatchPartsCompulsory)
    self.setIsChado(options.isChado)
    self.setDoMergeIdenticalMatches(options.doMergeIdenticalMatches)
    self.setDoSplit(options.doSplit)

    # Environment.
    self.setConfigFileName(options.configFileName)
    self.setVerbose(options.verbose)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
91
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def setInFastaName(self, inFastaName):
    """Set the value stored in ``_inFastaName`` (the -f / --inseq option)."""
    self._inFastaName = inFastaName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
94
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def setTablesFileName(self, tablesFileName):
    """Set the value stored in ``_tablesFileName`` (the -t / --tablesfile option)."""
    self._tablesFileName = tablesFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
97
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def setIsWithSequence(self, isWithSequence):
    """Set the flag stored in ``_isWithSequence`` (the -w / --withSequence option)."""
    self._isWithSequence = isWithSequence
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
100
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def setIsGFF3WithoutAnnotation(self, isGFF3WithoutAnnotation):
    """Set the flag stored in ``_isGFF3WithoutAnnotation`` (the -a / --withoutAnnotation option)."""
    self._isGFF3WithoutAnnotation = isGFF3WithoutAnnotation
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
103
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def setAreMatchPartCompulsory(self, areMatchPartsCompulsory):
    """Set the flag stored in ``_areMatchPartsCompulsory`` (the -p / --matchPart option)."""
    self._areMatchPartsCompulsory = areMatchPartsCompulsory
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
106
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def setClassifTable(self, classifTable):
    """Set the classification table name, stored in ``_classifTableName``."""
    self._classifTableName = classifTable
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
109
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def setIsChado(self, isChado):
    """Set the flag stored in ``_isChado`` (the -c / --chado option)."""
    self._isChado = isChado
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
112
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def setDoMergeIdenticalMatches(self, doMergeIdenticalMatches):
    """Set the flag stored in ``_doMergeIdenticalMatches`` (the -m option)."""
    self._doMergeIdenticalMatches = doMergeIdenticalMatches
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
115
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def setDoSplit(self, doSplit):
    """Set the flag stored in ``_doSplit`` (the -s / --doSplit option)."""
    self._doSplit = doSplit
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
118
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def setConfigFileName(self, configFileName):
    """Set the value stored in ``_configFileName`` (the -C / --config option)."""
    self._configFileName = configFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
121
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def setVerbose(self, verbose):
    """Set the verbosity level stored in ``_verbose`` (the -v / --verbose option)."""
    self._verbose = verbose
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
124
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def checkOptions(self):
    """Validate the stored settings, raising Exception on the first problem.

    Checks, in order: an input sequence name was given; the configuration
    file (when one was given) exists; the classification table (when one
    was given) exists in the database behind ``self._iDB``.
    """
    if self._inFastaName == "":
        raise Exception("ERROR: options -f required")

    hasConfigFile = self._configFileName != ""
    if hasConfigFile and not FileUtils.isRessourceExists(self._configFileName):
        raise Exception("ERROR: configuration file does not exist!")

    # NOTE(review): the indentation of the original was lost; this check is
    # assumed to be independent of the configuration-file check above.
    # It dereferences self._iDB, so that attribute must have been set to a
    # database object before this method is called with a classification table.
    if self._classifTableName:
        isTableMissing = not self._iDB.doesTableExist(self._classifTableName)
        if isTableMissing:
            raise Exception("ERROR: classification table '%s' does not exist!" % self._classifTableName)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
135
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
## Build the GFF3 lines for all the paths annotated on one sequence.
#
# @param pathTable string name of the 'path' table recording the annotations (i.e. the features)
# @param seqTable string passed on to _organizeEachPathFeature as its 'seqTable' argument
# @param seqName string name of the sequence (the source feature) on which we want to visualize the matches (the features)
# @param source string 'source' field written in each GFF3 line
# @param feature not used by this method
# @param frame string 'frame' field written in each GFF3 line
# @return pathString string which will be printed in path file ("" when the sequence has no path)
#
def _getPathFeatures(self, pathTable, seqTable, seqName, source, feature, frame):
    iPathAdaptator = TablePathAdaptator(self._iDB, pathTable)
    lPaths = iPathAdaptator.getPathListSortedByQueryCoordAndScoreFromQuery(seqName)
    if not lPaths:
        return ""

    # group the paths by identifier, then format one group at a time
    dPathID2Data = self._gatherSamePathFeatures(lPaths)
    lChunks = []
    for pathID, lMatches in dPathID2Data.items():
        lChunks.append(self._organizeEachPathFeature(pathID, lMatches, seqName, source, frame, seqTable))
    return "".join(lChunks)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
155
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
## Gather matches with the same path ID.
#
# When merging of identical matches is enabled, a path that has the same
# query start, query end and score as the path kept just before it, but a
# different subject name, is not stored on its own: it is recorded as a
# "name start end" string in the 'otherTargets' list of the kept path.
#
# @param lPaths non-empty list of path objects (an 'otherTargets' attribute is set on the kept ones)
# @return dPathID2Matchs dict filled by _addPathToMatchDict, one call per kept path
#
def _gatherSamePathFeatures(self, lPaths):
    dPathID2Matchs = {}
    iKeptPath = lPaths[0]
    iKeptPath.otherTargets = []
    for iCandidate in lPaths[1:]:
        isSameHitOnOtherSubject = (
            self._doMergeIdenticalMatches
            and iKeptPath.getQueryStart() == iCandidate.getQueryStart()
            and iKeptPath.getQueryEnd() == iCandidate.getQueryEnd()
            and iKeptPath.getScore() == iCandidate.getScore()
            and iKeptPath.getSubjectName() != iCandidate.getSubjectName()
        )
        if isSameHitOnOtherSubject:
            iKeptPath.otherTargets.append("%s %d %d" % (iCandidate.getSubjectName(), iCandidate.getSubjectStart(), iCandidate.getSubjectEnd()))
            continue
        # the candidate starts a new group: flush the kept path first
        self._addPathToMatchDict(dPathID2Matchs, iKeptPath)
        iKeptPath = iCandidate
        iKeptPath.otherTargets = []
    # the last kept path has not been flushed by the loop
    self._addPathToMatchDict(dPathID2Matchs, iKeptPath)
    return dPathID2Matchs
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
174
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def _getClassifEvidenceBySeqName(self, seqName):
    """Return the 'evidence' of the first classification row matching seqName.

    The lookup uses a SQL LIKE pattern built from the name split on '_':
    the first field, then a '%' wildcard, then the last field (or the last
    two fields when the name ends with 'reversed').

    @param seqName string sequence name to look up in the classification table
    @return string the fields of the first returned row joined together,
            or "" when the query returns no row or the row cannot be joined
    """
    lNameFields = seqName.split('_')
    seqNameStructure = '%s_%%' % lNameFields[0]
    if lNameFields[-1] == 'reversed':
        seqNameStructure += '%s_%s' % (lNameFields[-2], lNameFields[-1])
    else:
        seqNameStructure += lNameFields[-1]
    # NOTE(review): the query is built by string interpolation; both values
    # are assumed to come from trusted table content, not from user input.
    qry = "SELECT evidence FROM %s WHERE seq_name like \"%s\"" % (self._classifTableName, seqNameStructure)
    self._iDB.execute(qry)
    lRows = self._iDB.fetchall()
    if not lRows:
        # no classification recorded for this sequence
        return ""
    try:
        return "".join(lRows[0])
    except TypeError:
        # the row holds a non-string value (e.g. None): treat it as no evidence
        return ""
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
189
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def _addPathToMatchDict(self, dPathID2Matchs, iPreviousPath):
    """Append the data of one path to the list stored under its identifier.

    The stored record is the list
    [queryStart, queryEnd, strand, subjectName, subjectStart, subjectEnd,
    eValue, identity, otherTargets], where the subject coordinates are
    ordered so that subjectStart <= subjectEnd. The strand is read from the
    path itself and is not affected by that reordering.

    @param dPathID2Matchs dict updated in place; keys are path identifiers
    @param iPreviousPath path object; must carry an 'otherTargets' attribute
    """
    subjectStart = iPreviousPath.getSubjectStart()
    subjectEnd = iPreviousPath.getSubjectEnd()
    if subjectStart > subjectEnd:
        subjectStart, subjectEnd = subjectEnd, subjectStart

    lMatchData = [
        iPreviousPath.getQueryStart(),
        iPreviousPath.getQueryEnd(),
        iPreviousPath.getSubjectStrand(),
        iPreviousPath.getSubjectName(),
        subjectStart,
        subjectEnd,
        iPreviousPath.getEvalue(),
        iPreviousPath.getIdentity(),
        iPreviousPath.otherTargets,
    ]
    # setdefault replaces dict.has_key(), which no longer exists in Python 3
    dPathID2Matchs.setdefault(iPreviousPath.getIdentifier(), []).append(lMatchData)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
209
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def _getConsensusLengthByTargetName(self, targetName, seqTableName):
    """Return what TableSeqAdaptator.getSeqLengthFromDescription gives for targetName.

    @param targetName string passed to getSeqLengthFromDescription
    @param seqTableName string name of the table wrapped by the adaptator
    """
    iSeqAdaptator = TableSeqAdaptator(self._iDB, seqTableName)
    targetLength = iSeqAdaptator.getSeqLengthFromDescription(targetName)
    return targetLength
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
213
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
## For a specific path ID, organize match data according to the GFF3 format.
#
# One 'match' line spanning all the matches is always produced. One
# 'match_part' line per match follows when there are several matches or
# when match parts are compulsory.
#
# @param pathID path ID, used in the ID and Parent attributes
# @param lMatches non-empty list of match records as built by _addPathToMatchDict
# @param seqName string name of the source feature
# @param source string 'source' field for GFF3 format
# @param frame string 'frame' field for GFF3 format
# @param seqTable string table used to look up the target length ("" to skip the lookup)
# @return lines string to write in the GFF3 file
#
def _organizeEachPathFeature(self, pathID, lMatches, seqName, source, frame, seqTable = ""):
    # record layout: 0 query start, 1 query end, 2 strand, 3 subject name,
    # 4 subject start, 5 subject end, 6 e-value, 7 identity, 8 other targets
    firstMatch = lMatches[0]
    strand = firstMatch[2]
    target = firstMatch[3]

    # envelope of all the matches, on the query and on the subject
    minStart = min(match[0] for match in lMatches)
    maxEnd = max(match[1] for match in lMatches)
    minStartSubject = min(match[4] for match in lMatches)
    maxEndSubject = max(match[5] for match in lMatches)

    # optional description of the target, taken from the classification table
    targetDescTag = ""
    if self._classifTableName != "":
        evidence = self._getClassifEvidenceBySeqName(target)
        if evidence != "":
            # '=', ';' and ',' are replaced or removed from the evidence text
            targetDescTag = ";TargetDescription=%s" % evidence.replace('=', ':').replace(';', '').replace(',', ' |')

    # optional length of the target
    targetLengthTag = ""
    if seqTable != "":
        targetLength = self._getConsensusLengthByTargetName(target, seqTable)
        targetLengthTag = ";TargetLength=%i" % targetLength

    lAttributes = ["ID=ms%s_%s_%s" % (pathID, seqName, target)]
    if self._isChado:
        lAttributes.append(";Target=%s+%s+%s" % (target, minStartSubject, maxEndSubject))
    else:
        lAttributes.append(";Target=%s %s %s" % (target, minStartSubject, maxEndSubject))

    # NOTE(review): the indentation of the original was lost; the length,
    # description and identity tags are assumed to all belong to the branch
    # without other targets, mirroring the match_part handling below.
    if firstMatch[8]:
        lAttributes.append(";OtherTargets=%s" % ", ".join(firstMatch[8]))
    else:
        lAttributes.append(targetLengthTag)
        lAttributes.append(targetDescTag)
        if len(lMatches) == 1:
            lAttributes.append(";Identity=%s" % firstMatch[7])

    lLines = ["%s\t%s\tmatch\t%s\t%s\t0.0\t%s\t%s\t%s\n" % (seqName, source, minStart, maxEnd, strand, frame, "".join(lAttributes))]

    if len(lMatches) > 1 or self._areMatchPartsCompulsory:
        for count, match in enumerate(lMatches, 1):
            partAttributes = "ID=mp%s-%i_%s_%s" % (pathID, count, seqName, target)
            partAttributes += ";Parent=ms%s%s%s%s%s" % (pathID, "_", seqName, "_", target)
            if self._isChado:
                partAttributes += ";Target=%s+%s+%s" % (target, match[4], match[5])
            else:
                partAttributes += ";Target=%s %s %s" % (target, match[4], match[5])
            if not match[8]:
                partAttributes += ";Identity=%s" % match[7]
            lLines.append("%s\t%s\tmatch_part\t%s\t%s\t%s\t%s\t%s\t%s\n" % (seqName, source, match[0], match[1], match[6], match[2], frame, partAttributes))

    return "".join(lLines)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
289
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
## Build the GFF3 lines for all the sets annotated on one sequence.
#
# @param table string name of the table recording the annotations (i.e. the features)
# @param key string name of the sequence (the source feature) on which we want to visualize the matches (the features)
# @param source string 'source' field written in each GFF3 line
# @param feature not used by this method
# @param frame string 'frame' field written in each GFF3 line
# @return setString string which will be printed in set file
#
def _getSetFeatures(self, table, key, source, feature, frame):
    # retrieve all the data about the matches located on that sequence
    qry = "SELECT DISTINCT path,name,start,end FROM %s WHERE chr=\"%s\"" % (table, key)
    self._iDB.execute(qry)
    lRows = self._iDB.fetchall()

    # organise them into 'match' and 'match_part', then format each group
    dSetID2Data = self._gatherSameSetFeatures(lRows)
    lChunks = []
    for setID, lMatches in dSetID2Data.items():
        lChunks.append(self._organizeEachSetFeature(setID, lMatches, key, source, frame))
    return "".join(lChunks)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
310
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
311 ## Gather matches with the same path ID.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
312 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
313 # @param data list of string lists results of a SQL request
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
314 # @return dSetID2Matchs dict whose keys are set IDs and values are matches data
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
315 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
316 def _gatherSameSetFeatures(self, data):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
317 dSetID2Matchs = {}
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
318
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
319 for i in data:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
320 setID = i[0]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
321 name = i[1]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
322 start = i[2]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
323 end = i[3]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
324
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
325 matchStart = min(start, end)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
326 matchEnd = max(start, end)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
327 strand = self._getStrand(start, end)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
328
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
329 if dSetID2Matchs.has_key(setID):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
330 dSetID2Matchs[setID].append([name, matchStart, matchEnd, strand])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
331
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
332 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
333 dSetID2Matchs[setID] = [[name, matchStart, matchEnd, strand]]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
334
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
335 return dSetID2Matchs
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
336
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
337 ## For a specific set ID, organize match data according to the GFF3 format.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
338 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
339 # @param setID string path ID
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
340 # @param lMatches match list
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
341 # @param seqName string name of the source feature
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
342 # @param source string 'source' field for GFF3 format
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
343 # @param frame string 'frame' field for GFF3 format
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
344 # @return lines string to write in the GFF3 file
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
345 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
346 def _organizeEachSetFeature(self, setID, lMatches, seqName, source, frame):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
347 lines = ""
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
348 minStart = min(lMatches[0][1], lMatches[0][2])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
349 maxEnd = max(lMatches[0][1], lMatches[0][2])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
350 strand = lMatches[0][3]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
351
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
352 # for each match
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
353 for i in lMatches:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
354 start = min(i[1],i[2])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
355 if start < minStart:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
356 minStart = start
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
357 end = max(i[1],i[2])
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
358 if end > maxEnd:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
359 maxEnd = end
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
360
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
361 target = lMatches[0][0].replace("(","").replace(")","").replace("#","")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
362
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
363 attributes = "ID=ms%s_%s_%s" % (setID, seqName, target)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
364 if self._isChado:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
365 attributes += ";Target=%s+%s+%s" % (target, "1", abs(minStart-maxEnd)+1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
366 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
367 attributes += ";Target=%s %s %s" % (target, "1", abs(minStart-maxEnd)+1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
368 lines += "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (seqName, source, "match", minStart, maxEnd, "0.0", strand, frame, attributes)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
369
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
370 if len(lMatches) > 1 or self._areMatchPartsCompulsory:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
371 count = 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
372 for i in lMatches:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
373 attributes = "ID=mp%s-%i_%s_%s" % (setID, count, seqName, target)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
374 attributes += ";Parent=ms%s%s%s%s%s" % (setID, "_", seqName, "_", target)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
375 if self._isChado:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
376 attributes += ";Target=%s+%s+%s" % (target, "1", abs(i[1]-i[2])+1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
377 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
378 attributes += ";Target=%s %s %s" % (target, "1", abs(i[1]-i[2])+1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
379 lines += "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (seqName, source, "match_part", i[1], i[2], "0.0", i[3], frame, attributes)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
380 count += 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
381
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
382 return lines
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
383
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
384 ## Return the strand ('+' if start < end, '-' otherwise).
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
385 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
386 # @param start integer start coordinate
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
387 # @param end integer end coordinate
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
388 # @return strand string "+" or "-"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
389 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
390 def _getStrand(self, start, end):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
391 if start < end:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
392 return "+"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
393 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
394 return "-"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
395
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
def run(self):
    """Write the GFF3 file(s) of every sequence listed by the sequence table.

    Opens the database (with self._configFileName when it is not empty),
    checks the options, reads the annotation-table list from
    self._tablesFileName and then, for each (name, length) pair returned by
    TableSeqAdaptator.getAccessionAndLengthList():
      - writes a single "<seqName>.gff3" gathering all annotation tables
        when self._doSplit is false;
      - writes one "<seqName>_Annot<i>.gff3" per annotation table otherwise.
    The database connection is closed even when an error interrupts the run.

    @raise Exception "Wrong format : ..." when an annotation table is
           declared with a format other than 'path' or 'set'
    """
    #TODO: cat all gff in one gff file in option
    if self._configFileName == "":
        self._iDB = DbFactory.createInstance()
    else:
        self._iDB = DbFactory.createInstance(self._configFileName)

    try:
        self.checkOptions()
        if self._verbose > 0:
            # A parenthesised single string prints the same text with the
            # Python 2 print statement and the Python 3 print function.
            print("START GFF3Maker")
            sys.stdout.flush()

        with open(self._tablesFileName, "r") as tablesFile:
            linesFromAnnotationTablesFile = tablesFile.readlines()
        feature = "region"
        frame = "."

        iTSA = TableSeqAdaptator(self._iDB, self._inFastaName)
        for seqName, length in iTSA.getAccessionAndLengthList():
            if not self._doSplit:
                # One file per sequence, holding every annotation table.
                fileName = "%s.gff3" % seqName
                self._writeGFF3File(iTSA, fileName, seqName, length, self._iterAnnotationTables(linesFromAnnotationTablesFile), feature, frame)
            else:
                # One file per (sequence, annotation table) pair, numbered from 1.
                for count, tableInfo in enumerate(self._iterAnnotationTables(linesFromAnnotationTablesFile), 1):
                    fileName = "%s_Annot%i.gff3" % (seqName, count)
                    self._writeGFF3File(iTSA, fileName, seqName, length, [tableInfo], feature, frame)
    finally:
        # Release the connection whether the run succeeded or not.
        self._iDB.close()

    if self._verbose > 0:
        print("END GFF3Maker")
        sys.stdout.flush()

def _iterAnnotationTables(self, lLines):
    """Yield (source, format, table, tableseq) for each line of the table list.

    Lines starting with '#' are skipped. A line without any token (a blank
    line) ends the iteration: entries located after it are ignored.
    `tableseq` is the 4th token when the line has exactly four tokens, an
    empty string otherwise.

    @param lLines list of strings, the lines of the annotation-table file
    """
    for line in lLines:
        if line.startswith("#"):
            continue
        tok = line.split()
        if len(tok) == 0:
            break
        tableseq = ""
        if len(tok) == 4:
            tableseq = tok[3]
        yield tok[0], tok[1], tok[2], tableseq

def _writeGFF3File(self, iTSA, fileName, seqName, length, lTables, feature, frame):
    """Write one GFF3 file: two header lines, then the annotations of `lTables`.

    Afterwards the file is removed when self._isGFF3WithoutAnnotation is
    false and FileUtils.getNbLinesInSingleFile() reports 2 lines (presumably
    the two header lines only); otherwise, when self._isWithSequence is
    true, a "##FASTA" section with the sequence is appended.

    @param iTSA     TableSeqAdaptator used to fetch the sequence
    @param fileName string path of the file to create
    @param seqName  string name of the sequence
    @param length   length written in the '##sequence-region' line
    @param lTables  iterable of (source, format, table, tableseq)
    @param feature  string forwarded to the feature getters
    @param frame    string forwarded to the feature getters
    @raise Exception when a format is neither 'path' nor 'set'
    """
    # 'with' closes the file even when a getter or the format check raises.
    with open(fileName, "w") as outFile:
        outFile.write("##gff-version 3\n")
        outFile.write("##sequence-region %s 1 %s\n" % (seqName, length))
        for source, tableFormat, table, tableseq in lTables:
            if tableFormat == 'path':
                annotations = self._getPathFeatures(table, tableseq, seqName, source, feature, frame)
            elif tableFormat == 'set':
                annotations = self._getSetFeatures(table, seqName, source, feature, frame)
            else:
                raise Exception("Wrong format : %s" % tableFormat)
            outFile.write(annotations)

    #TODO: check getNbLinesInSingleFile() to handle big files
    if not self._isGFF3WithoutAnnotation and FileUtils.getNbLinesInSingleFile(fileName) == 2:
        os.remove(fileName)
    elif self._isWithSequence:
        with open(fileName, "a") as outFile:
            outFile.write("##FASTA\n")
            iBioseq = iTSA.getBioseqFromHeader(seqName)
            iBioseq.write(outFile)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
493
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
if __name__ == "__main__":
    # Script entry point: configure a GFF3Maker from the command line, then run it.
    gff3Maker = GFF3Maker()
    gff3Maker.setAttributesFromCmdLine()
    gff3Maker.run()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
498