18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 ##@file GFF3Maker.py
|
|
4
|
|
5 # Copyright INRA (Institut National de la Recherche Agronomique)
|
|
6 # http://www.inra.fr
|
|
7 # http://urgi.versailles.inra.fr
|
|
8 #
|
|
9 # This software is governed by the CeCILL license under French law and
|
|
10 # abiding by the rules of distribution of free software. You can use,
|
|
11 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
12 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
13 # "http://www.cecill.info".
|
|
14 #
|
|
15 # As a counterpart to the access to the source code and rights to copy,
|
|
16 # modify and redistribute granted by the license, users are provided only
|
|
17 # with a limited warranty and the software's author, the holder of the
|
|
18 # economic rights, and the successive licensors have only limited
|
|
19 # liability.
|
|
20 #
|
|
21 # In this respect, the user's attention is drawn to the risks associated
|
|
22 # with loading, using, modifying and/or developing or reproducing the
|
|
23 # software by the user in light of its specific status of free software,
|
|
24 # that may mean that it is complicated to manipulate, and that also
|
|
25 # therefore means that it is reserved for developers and experienced
|
|
26 # professionals having in-depth computer knowledge. Users are therefore
|
|
27 # encouraged to load and test the software's suitability as regards their
|
|
28 # requirements in conditions enabling the security of their systems and/or
|
|
29 # data to be ensured and, more generally, to use and operate it in the
|
|
30 # same conditions as regards security.
|
|
31 #
|
|
32 # The fact that you are presently reading this means that you have had
|
|
33 # knowledge of the CeCILL license and that you accept its terms.
|
|
34
|
|
35 from commons.core.utils.RepetOptionParser import RepetOptionParser
|
|
36 from commons.core.utils.FileUtils import FileUtils
|
|
37 from commons.core.sql.DbFactory import DbFactory
|
|
38 from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator
|
|
39 from commons.core.sql.TablePathAdaptator import TablePathAdaptator
|
|
40 import sys
|
|
41 import os
|
|
42
|
|
## GFF3Maker exports annotations from a 'path' table into a GFF3 file.
#
class GFF3Maker(object):

    ## Constructor: record the export configuration. The database connection
    #  itself is only opened later, in run().
    #
    def __init__(self, inFastaName = "", tablesFileName = "", classifTableName = "", isChado = False, isGFF3WithoutAnnotation = False, isWithSequence = False, areMatchPartsCompulsory = False, configFileName = "", verbose = 0, doMergeIdenticalMatches = False, doSplit = False):
        # input data: 'seq' table, annotation-tables file, optional classif table
        self._inFastaName = inFastaName
        self._tablesFileName = tablesFileName
        self._classifTableName = classifTableName
        # output flavour flags
        self._isChado = isChado
        self._isGFF3WithoutAnnotation = isGFF3WithoutAnnotation
        self._isWithSequence = isWithSequence
        self._areMatchPartsCompulsory = areMatchPartsCompulsory
        self._doMergeIdenticalMatches = doMergeIdenticalMatches
        self._doSplit = doSplit
        # connection / misc
        self._configFileName = configFileName
        self._verbose = verbose
        self._iDB = None    # database adaptor, created in run()
|
|
60
|
|
    ## Define the command-line interface, parse sys.argv and copy the parsed
    #  options onto the instance (see _setAttributesFromOptions).
    #
    def setAttributesFromCmdLine(self):
        description = "GFF3Maker exports annotations from 'path', 'set' and/or 'classif' tables into a GFF3 file\n"
        parser = RepetOptionParser(description = description)
        parser.add_option("-f", "--inseq", dest = "inFastaName", action = "store", type = "string", help = "'seq' table recording the input sequences", default = "")
        parser.add_option("-t", "--tablesfile", dest = "tablesFileName", action = "store", type = "string", help = "tabulated file of table name to use to create the gff3 files (fields: tier name, format, table name)", default = "")
        parser.add_option("-w", "--withSequence", dest = "isWithSequence", action = "store_true", help = "write the sequence at the end of GFF3 file", default = False)
        parser.add_option("-a", "--withoutAnnotation", dest = "isGFF3WithoutAnnotation", action = "store_true", help = "write GFF3 files even if no annotation", default = False)
        parser.add_option("-p", "--matchPart", dest = "areMatchPartsCompulsory", action = "store_true", help = "always associate a match_part to a match", default = False)
        parser.add_option("-i", "--classifTable", dest = "classifTable", action = "store", type = "string", help = "name of the TE library classification table [optional]", default = "")
        parser.add_option("-c", "--chado", dest = "isChado", action = "store_true", help = "Chado compliance", default = False)
        parser.add_option("-m", "--doMergeIdenticalMatches", dest = "doMergeIdenticalMatches", action = "store_true", help = "merge identical matches based on query start, query end, score", default = False)
        parser.add_option("-s", "--doSplit", dest = "doSplit", action = "store_true", help = "split each GFF3 per annotation type", default = False)
        parser.add_option("-C", "--config", dest = "configFileName", action = "store", type = "string", help = "configuration file for database connection", default = "")
        parser.add_option("-v", "--verbose", dest = "verbose", action = "store", type = "int", help = "verbosity level (default=0, else 1 or 2)", default = 0)
        # parse_args() returns (options, args); only the options object is needed
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)
|
|
77
|
|
    #TODO: write a "setAttributesFromConfig"
    ## Copy every parsed command-line option onto the instance via its setter.
    #
    # @param options Values object returned by RepetOptionParser.parse_args()
    #
    def _setAttributesFromOptions(self, options):
        self.setInFastaName(options.inFastaName)
        self.setTablesFileName(options.tablesFileName)
        self.setClassifTable(options.classifTable)
        self.setIsChado(options.isChado)
        self.setDoMergeIdenticalMatches(options.doMergeIdenticalMatches)
        self.setIsWithSequence(options.isWithSequence)
        self.setIsGFF3WithoutAnnotation(options.isGFF3WithoutAnnotation)
        self.setAreMatchPartCompulsory(options.areMatchPartsCompulsory)
        self.setDoSplit(options.doSplit)
        self.setConfigFileName(options.configFileName)
        self.setVerbose(options.verbose)
|
|
91
|
|
    ## Set the name of the 'seq' table recording the input sequences.
    #
    def setInFastaName(self, inFastaName):
        self._inFastaName = inFastaName

    ## Set the tabulated file listing the annotation tables (tier name, format, table name).
    #
    def setTablesFileName(self, tablesFileName):
        self._tablesFileName = tablesFileName

    ## Set whether the sequence is appended (##FASTA section) at the end of each GFF3 file.
    #
    def setIsWithSequence(self, isWithSequence):
        self._isWithSequence = isWithSequence

    ## Set whether GFF3 files are kept even when they contain no annotation.
    #
    def setIsGFF3WithoutAnnotation(self, isGFF3WithoutAnnotation):
        self._isGFF3WithoutAnnotation = isGFF3WithoutAnnotation

    ## Set whether a match_part is always written alongside each match.
    #
    def setAreMatchPartCompulsory(self, areMatchPartsCompulsory):
        self._areMatchPartsCompulsory = areMatchPartsCompulsory

    ## Set the name of the TE library classification table (optional).
    #
    def setClassifTable(self, classifTable):
        self._classifTableName = classifTable

    ## Set Chado compliance ('+'-separated Target attributes).
    #
    def setIsChado(self, isChado):
        self._isChado = isChado

    ## Set whether identical matches (same query start/end and score) are merged.
    #
    def setDoMergeIdenticalMatches(self, doMergeIdenticalMatches):
        self._doMergeIdenticalMatches = doMergeIdenticalMatches

    ## Set whether one GFF3 file is written per annotation table line.
    #
    def setDoSplit(self, doSplit):
        self._doSplit = doSplit

    ## Set the configuration file used for the database connection.
    #
    def setConfigFileName(self, configFileName):
        self._configFileName = configFileName

    ## Set the verbosity level (0, 1 or 2).
    #
    def setVerbose(self, verbose):
        self._verbose = verbose
|
|
124
|
|
125 def checkOptions(self):
|
|
126 if self._inFastaName == "":
|
|
127 raise Exception("ERROR: options -f required")
|
|
128
|
|
129 if self._configFileName != "":
|
|
130 if not FileUtils.isRessourceExists(self._configFileName):
|
|
131 raise Exception("ERROR: configuration file does not exist!")
|
|
132
|
|
133 if self._classifTableName and not self._iDB.doesTableExist(self._classifTableName):
|
|
134 raise Exception("ERROR: classification table '%s' does not exist!" % self._classifTableName)
|
|
135
|
|
136 ## Retrieve the features to write in the GFF3 file.
|
|
137 #
|
|
138 # @param pathTable string name of the table recording the annotations (i.e. the features)
|
|
139 # @param seqName string name of the sequence (the source feature) on which we want to visualize the matches (the features)
|
|
140 # @param source string the program that generated the feature (i.e. REPET)
|
|
141 # @param frame string "." by default (or "+", "-")
|
|
142 # @return pathString string which will be printed in path file
|
|
143 #
|
|
144 def _getPathFeatures(self, pathTable, seqTable, seqName, source, feature, frame):
|
|
145 pathString = ""
|
|
146 iTPA = TablePathAdaptator(self._iDB, pathTable)
|
|
147 lPaths = iTPA.getPathListSortedByQueryCoordAndScoreFromQuery(seqName)
|
|
148 # organise them into 'match' and 'match_part'
|
|
149 if lPaths:
|
|
150 dPathID2Data = self._gatherSamePathFeatures(lPaths)
|
|
151 # build the output string
|
|
152 for pathID in dPathID2Data:
|
|
153 pathString += self._organizeEachPathFeature(pathID, dPathID2Data[pathID], seqName, source, frame, seqTable)
|
|
154 return pathString
|
|
155
|
|
156 ## Gather matches with the same path ID.
|
|
157 #
|
|
158 # @param data list of string lists results of a SQL request
|
|
159 # @return dPathID2Matchs dict whose keys are path IDs and values are matches data
|
|
160 #
|
|
161 def _gatherSamePathFeatures(self, lPaths):
|
|
162 dPathID2Matchs = {}
|
|
163 iPreviousPath = lPaths[0]
|
|
164 iPreviousPath.otherTargets = []
|
|
165 for iPath in lPaths[1:]:
|
|
166 if self._doMergeIdenticalMatches and iPreviousPath.getQueryStart() == iPath.getQueryStart() and iPreviousPath.getQueryEnd() == iPath.getQueryEnd() and iPreviousPath.getScore() == iPath.getScore() and iPreviousPath.getSubjectName() != iPath.getSubjectName():
|
|
167 iPreviousPath.otherTargets.append("%s %d %d" % (iPath.getSubjectName(), iPath.getSubjectStart(), iPath.getSubjectEnd()))
|
|
168 else:
|
|
169 self._addPathToMatchDict(dPathID2Matchs, iPreviousPath)
|
|
170 iPreviousPath = iPath
|
|
171 iPreviousPath.otherTargets = []
|
|
172 self._addPathToMatchDict(dPathID2Matchs, iPreviousPath)
|
|
173 return dPathID2Matchs
|
|
174
|
|
175 def _getClassifEvidenceBySeqName(self, seqName):
|
|
176 lseqName = seqName.split('_')
|
|
177 seqNameStructure = '%s_%%' % lseqName[0]
|
|
178 if lseqName[-1] == 'reversed':
|
|
179 seqNameStructure += '%s_%s' % (lseqName[-2],lseqName[-1])
|
|
180 else:
|
|
181 seqNameStructure += lseqName[-1]
|
|
182 qry = "SELECT evidence FROM %s WHERE seq_name like \"%s\"" % (self._classifTableName, seqNameStructure)
|
|
183 self._iDB.execute(qry)
|
|
184 result = ""
|
|
185 try:
|
|
186 result = "".join(self._iDB.fetchall()[0])
|
|
187 except: pass
|
|
188 return result
|
|
189
|
|
190 def _addPathToMatchDict(self, dPathID2Matchs, iPreviousPath):
|
|
191 pathId = iPreviousPath.getIdentifier()
|
|
192 subjectStart = iPreviousPath.getSubjectStart()
|
|
193 subjectEnd = iPreviousPath.getSubjectEnd()
|
|
194 strand = iPreviousPath.getSubjectStrand()
|
|
195 if subjectStart > subjectEnd:
|
|
196 tmp = subjectStart
|
|
197 subjectStart = subjectEnd
|
|
198 subjectEnd = tmp
|
|
199 queryStart = iPreviousPath.getQueryStart()
|
|
200 queryEnd = iPreviousPath.getQueryEnd()
|
|
201 subjectName = iPreviousPath.getSubjectName()
|
|
202 eValue = iPreviousPath.getEvalue()
|
|
203 identity = iPreviousPath.getIdentity()
|
|
204 otherTargets = iPreviousPath.otherTargets
|
|
205 if dPathID2Matchs.has_key(pathId):
|
|
206 dPathID2Matchs[pathId].append([queryStart, queryEnd, strand, subjectName, subjectStart, subjectEnd, eValue, identity, otherTargets])
|
|
207 else:
|
|
208 dPathID2Matchs[pathId] = [[queryStart, queryEnd, strand, subjectName, subjectStart, subjectEnd, eValue, identity, otherTargets]]
|
|
209
|
|
210 def _getConsensusLengthByTargetName(self, targetName, seqTableName):
|
|
211 iTableSeqAdaptator = TableSeqAdaptator(self._iDB, seqTableName)
|
|
212 return iTableSeqAdaptator.getSeqLengthFromDescription(targetName)
|
|
213
|
|
214 ## For a specific path ID, organize match data according to the GFF3 format.
|
|
215 #
|
|
216 # @param pathID string path ID
|
|
217 # @param lMatches match list
|
|
218 # @param seqName string name of the source feature
|
|
219 # @param source string 'source' field for GFF3 format
|
|
220 # @param frame string 'frame' field for GFF3 format
|
|
221 # @return lines string to write in the GFF3 file
|
|
222 #
|
|
223 def _organizeEachPathFeature(self, pathID, lMatches, seqName, source, frame, seqTable = ""):
|
|
224 lines = ""
|
|
225 minStart = lMatches[0][0]
|
|
226 maxEnd = lMatches[0][1]
|
|
227 minStartSubject = lMatches[0][4]
|
|
228 maxEndSubject = lMatches[0][5]
|
|
229 strand = lMatches[0][2]
|
|
230
|
|
231 # for each match
|
|
232 for i in lMatches:
|
|
233 if i[0] < minStart:
|
|
234 minStart = i[0]
|
|
235 if i[1] > maxEnd:
|
|
236 maxEnd = i[1]
|
|
237 if i[4] < minStartSubject:
|
|
238 minStartSubject = i[4]
|
|
239 if i[5] > maxEndSubject:
|
|
240 maxEndSubject = i[5]
|
|
241
|
|
242 target = lMatches[0][3]
|
|
243
|
|
244 targetDescTag = ""
|
|
245 if self._classifTableName != "":
|
|
246 targetDescTag = self._getClassifEvidenceBySeqName(target)
|
|
247 if targetDescTag != "":
|
|
248 targetDescTag = ";TargetDescription=%s" % targetDescTag.replace('=', ':').replace(';', '').replace(',', ' |')
|
|
249
|
|
250 targetLengthTag = ""
|
|
251 if seqTable != "":
|
|
252 targetLengthTag = self._getConsensusLengthByTargetName(target, seqTable)
|
|
253 targetLengthTag = ";TargetLength=%i" % targetLengthTag
|
|
254
|
|
255 attributes = "ID=ms%s_%s_%s" % (pathID, seqName, target)
|
|
256 if self._isChado:
|
|
257 attributes += ";Target=%s+%s+%s" % (target, minStartSubject, maxEndSubject)
|
|
258 else:
|
|
259 attributes += ";Target=%s %s %s" % (target, minStartSubject, maxEndSubject)
|
|
260
|
|
261 if lMatches[0][8]:
|
|
262 otherTargets = ", ".join(lMatches[0][8])
|
|
263 attributes += ";OtherTargets=%s" % otherTargets
|
|
264 else:
|
|
265 attributes += targetLengthTag
|
|
266 attributes += targetDescTag
|
|
267 if len(lMatches) == 1:
|
|
268 attributes += ";Identity=%s" % lMatches[0][7]
|
|
269
|
|
270
|
|
271 lines += "%s\t%s\tmatch\t%s\t%s\t0.0\t%s\t%s\t%s\n" % (seqName, source, minStart, maxEnd, strand, frame, attributes)
|
|
272
|
|
273 if len(lMatches) > 1 or self._areMatchPartsCompulsory:
|
|
274 count = 1
|
|
275 for i in lMatches:
|
|
276 attributes = "ID=mp%s-%i_%s_%s" % (pathID, count, seqName, target)
|
|
277 attributes += ";Parent=ms%s%s%s%s%s" % (pathID, "_", seqName, "_", target)
|
|
278 if self._isChado:
|
|
279 attributes += ";Target=%s+%s+%s" % (target, i[4], i[5])
|
|
280 else:
|
|
281 attributes += ";Target=%s %s %s" % (target, i[4], i[5])
|
|
282
|
|
283 if not i[8]:
|
|
284 attributes += ";Identity=%s" % i[7]
|
|
285 lines += "%s\t%s\tmatch_part\t%s\t%s\t%s\t%s\t%s\t%s\n" % (seqName, source, i[0], i[1], i[6], i[2], frame, attributes)
|
|
286 count += 1
|
|
287
|
|
288 return lines
|
|
289
|
|
290 ## Retrieve the features to write in the GFF3 file.
|
|
291 #
|
|
292 # @param table string name of the table recording the annotations (i.e. the features)
|
|
293 # @param key string name of the sequence (the source feature) on which we want to visualize the matches (the features)
|
|
294 # @param source string the program that generated the feature (i.e. REPET)
|
|
295 # @param frame string "." by default (or "+", "-")
|
|
296 # @return setString string which will be printed in set file
|
|
297 #
|
|
298 def _getSetFeatures(self, table, key, source, feature, frame):
|
|
299 setString = ""
|
|
300 # retrieve all the data about the matches
|
|
301 qry = "SELECT DISTINCT path,name,start,end FROM %s WHERE chr=\"%s\"" % (table, key)
|
|
302 self._iDB.execute(qry)
|
|
303 data = self._iDB.fetchall()
|
|
304 # organise them into 'match' and 'match_part'
|
|
305 dPathID2Data = self._gatherSameSetFeatures(data)
|
|
306 # build the output string
|
|
307 for pathID in dPathID2Data:
|
|
308 setString += self._organizeEachSetFeature(pathID, dPathID2Data[pathID], key, source, frame)
|
|
309 return setString
|
|
310
|
|
311 ## Gather matches with the same path ID.
|
|
312 #
|
|
313 # @param data list of string lists results of a SQL request
|
|
314 # @return dSetID2Matchs dict whose keys are set IDs and values are matches data
|
|
315 #
|
|
316 def _gatherSameSetFeatures(self, data):
|
|
317 dSetID2Matchs = {}
|
|
318
|
|
319 for i in data:
|
|
320 setID = i[0]
|
|
321 name = i[1]
|
|
322 start = i[2]
|
|
323 end = i[3]
|
|
324
|
|
325 matchStart = min(start, end)
|
|
326 matchEnd = max(start, end)
|
|
327 strand = self._getStrand(start, end)
|
|
328
|
|
329 if dSetID2Matchs.has_key(setID):
|
|
330 dSetID2Matchs[setID].append([name, matchStart, matchEnd, strand])
|
|
331
|
|
332 else:
|
|
333 dSetID2Matchs[setID] = [[name, matchStart, matchEnd, strand]]
|
|
334
|
|
335 return dSetID2Matchs
|
|
336
|
|
337 ## For a specific set ID, organize match data according to the GFF3 format.
|
|
338 #
|
|
339 # @param setID string path ID
|
|
340 # @param lMatches match list
|
|
341 # @param seqName string name of the source feature
|
|
342 # @param source string 'source' field for GFF3 format
|
|
343 # @param frame string 'frame' field for GFF3 format
|
|
344 # @return lines string to write in the GFF3 file
|
|
345 #
|
|
346 def _organizeEachSetFeature(self, setID, lMatches, seqName, source, frame):
|
|
347 lines = ""
|
|
348 minStart = min(lMatches[0][1], lMatches[0][2])
|
|
349 maxEnd = max(lMatches[0][1], lMatches[0][2])
|
|
350 strand = lMatches[0][3]
|
|
351
|
|
352 # for each match
|
|
353 for i in lMatches:
|
|
354 start = min(i[1],i[2])
|
|
355 if start < minStart:
|
|
356 minStart = start
|
|
357 end = max(i[1],i[2])
|
|
358 if end > maxEnd:
|
|
359 maxEnd = end
|
|
360
|
|
361 target = lMatches[0][0].replace("(","").replace(")","").replace("#","")
|
|
362
|
|
363 attributes = "ID=ms%s_%s_%s" % (setID, seqName, target)
|
|
364 if self._isChado:
|
|
365 attributes += ";Target=%s+%s+%s" % (target, "1", abs(minStart-maxEnd)+1)
|
|
366 else:
|
|
367 attributes += ";Target=%s %s %s" % (target, "1", abs(minStart-maxEnd)+1)
|
|
368 lines += "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (seqName, source, "match", minStart, maxEnd, "0.0", strand, frame, attributes)
|
|
369
|
|
370 if len(lMatches) > 1 or self._areMatchPartsCompulsory:
|
|
371 count = 1
|
|
372 for i in lMatches:
|
|
373 attributes = "ID=mp%s-%i_%s_%s" % (setID, count, seqName, target)
|
|
374 attributes += ";Parent=ms%s%s%s%s%s" % (setID, "_", seqName, "_", target)
|
|
375 if self._isChado:
|
|
376 attributes += ";Target=%s+%s+%s" % (target, "1", abs(i[1]-i[2])+1)
|
|
377 else:
|
|
378 attributes += ";Target=%s %s %s" % (target, "1", abs(i[1]-i[2])+1)
|
|
379 lines += "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (seqName, source, "match_part", i[1], i[2], "0.0", i[3], frame, attributes)
|
|
380 count += 1
|
|
381
|
|
382 return lines
|
|
383
|
|
384 ## Return the strand ('+' if start < end, '-' otherwise).
|
|
385 #
|
|
386 # @param start integer start coordinate
|
|
387 # @param end integer end coordinate
|
|
388 # @return strand string "+" or "-"
|
|
389 #
|
|
390 def _getStrand(self, start, end):
|
|
391 if start < end:
|
|
392 return "+"
|
|
393 else:
|
|
394 return "-"
|
|
395
|
|
396 def run(self):
|
|
397 #TODO: cat all gff in one gff file in option
|
|
398 if self._configFileName == "":
|
|
399 self._iDB = DbFactory.createInstance()
|
|
400 else:
|
|
401 self._iDB = DbFactory.createInstance(self._configFileName)
|
|
402
|
|
403 self.checkOptions()
|
|
404 if self._verbose > 0:
|
|
405 print "START GFF3Maker"
|
|
406 sys.stdout.flush()
|
|
407
|
|
408 tablesFile = open(self._tablesFileName, "r")
|
|
409 linesFromAnnotationTablesFile = tablesFile.readlines()
|
|
410 tablesFile.close()
|
|
411 feature = "region"
|
|
412 frame = "."
|
|
413
|
|
414 iTSA = TableSeqAdaptator(self._iDB, self._inFastaName)
|
|
415 lTuples = iTSA.getAccessionAndLengthList()
|
|
416 for seqName, length in lTuples :
|
|
417 if not self._doSplit:
|
|
418 fileName = "%s.gff3" % seqName
|
|
419 outFile = open(fileName, "w")
|
|
420 outFile.write("##gff-version 3\n")
|
|
421 outFile.write("##sequence-region %s 1 %s\n" % (seqName, length))
|
|
422 for line in linesFromAnnotationTablesFile:
|
|
423 if line[0] == "#":
|
|
424 continue
|
|
425 tok = line.split()
|
|
426 if len(tok) == 0:
|
|
427 break
|
|
428 source = tok[0]
|
|
429 format = tok[1]
|
|
430 table = tok[2]
|
|
431 tableseq = ""
|
|
432 if len(tok) == 4:
|
|
433 tableseq = tok[3]
|
|
434 if format == 'path' :
|
|
435 annotations = self._getPathFeatures(table, tableseq, seqName, source, feature, frame)
|
|
436 elif format == 'set' :
|
|
437 annotations = self._getSetFeatures(table, seqName, source, feature, frame)
|
|
438 else:
|
|
439 raise Exception("Wrong format : %s" % format)
|
|
440 outFile.write(annotations)
|
|
441 outFile.close()
|
|
442 #TODO: check getNbLinesInSingleFile() to handle big files
|
|
443 if not self._isGFF3WithoutAnnotation and FileUtils.getNbLinesInSingleFile(fileName) == 2:
|
|
444 os.remove(fileName)
|
|
445 elif self._isWithSequence:
|
|
446 outFile = open(fileName, "a")
|
|
447 outFile.write("##FASTA\n")
|
|
448 iBioseq = iTSA.getBioseqFromHeader(seqName)
|
|
449 iBioseq.write(outFile)
|
|
450 outFile.close()
|
|
451 else:
|
|
452 count = 1
|
|
453 for line in linesFromAnnotationTablesFile:
|
|
454 if line[0] == "#":
|
|
455 continue
|
|
456 tok = line.split()
|
|
457 if len(tok) == 0:
|
|
458 break
|
|
459 source = tok[0]
|
|
460 format = tok[1]
|
|
461 table = tok[2]
|
|
462 tableseq = ""
|
|
463 if len(tok) == 4:
|
|
464 tableseq = tok[3]
|
|
465 fileName = "%s_Annot%i.gff3" % (seqName, count)
|
|
466 outFile = open(fileName, "w")
|
|
467 outFile.write("##gff-version 3\n")
|
|
468 outFile.write("##sequence-region %s 1 %s\n" % (seqName, length))
|
|
469 if format == 'path' :
|
|
470 annotations = self._getPathFeatures(table, tableseq, seqName, source, feature, frame)
|
|
471 elif format == 'set' :
|
|
472 annotations = self._getSetFeatures(table, seqName, source, feature, frame)
|
|
473 else:
|
|
474 raise Exception("Wrong format : %s" % format)
|
|
475 outFile.write(annotations)
|
|
476 outFile.close()
|
|
477 #TODO: check getNbLinesInSingleFile() to handle big files
|
|
478 if not self._isGFF3WithoutAnnotation and FileUtils.getNbLinesInSingleFile(fileName) == 2:
|
|
479 os.remove(fileName)
|
|
480 elif self._isWithSequence:
|
|
481 outFile = open(fileName, "a")
|
|
482 outFile.write("##FASTA\n")
|
|
483 iBioseq = iTSA.getBioseqFromHeader(seqName)
|
|
484 iBioseq.write(outFile)
|
|
485 outFile.close()
|
|
486 count += 1
|
|
487
|
|
488 self._iDB.close()
|
|
489
|
|
490 if self._verbose > 0:
|
|
491 print "END GFF3Maker"
|
|
492 sys.stdout.flush()
|
|
493
|
|
if __name__ == "__main__":
    # command-line entry point: configure from sys.argv, then export
    gff3Maker = GFF3Maker()
    gff3Maker.setAttributesFromCmdLine()
    gff3Maker.run()
|
|
498 |