comparison commons/tools/GFF3Maker.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
comparison
equal deleted inserted replaced
17:b0e8584489e6 18:94ab73e8a190
1 #!/usr/bin/env python
2
3 ##@file GFF3Maker.py
4
5 # Copyright INRA (Institut National de la Recherche Agronomique)
6 # http://www.inra.fr
7 # http://urgi.versailles.inra.fr
8 #
9 # This software is governed by the CeCILL license under French law and
10 # abiding by the rules of distribution of free software. You can use,
11 # modify and/ or redistribute the software under the terms of the CeCILL
12 # license as circulated by CEA, CNRS and INRIA at the following URL
13 # "http://www.cecill.info".
14 #
15 # As a counterpart to the access to the source code and rights to copy,
16 # modify and redistribute granted by the license, users are provided only
17 # with a limited warranty and the software's author, the holder of the
18 # economic rights, and the successive licensors have only limited
19 # liability.
20 #
21 # In this respect, the user's attention is drawn to the risks associated
22 # with loading, using, modifying and/or developing or reproducing the
23 # software by the user in light of its specific status of free software,
24 # that may mean that it is complicated to manipulate, and that also
25 # therefore means that it is reserved for developers and experienced
26 # professionals having in-depth computer knowledge. Users are therefore
27 # encouraged to load and test the software's suitability as regards their
28 # requirements in conditions enabling the security of their systems and/or
29 # data to be ensured and, more generally, to use and operate it in the
30 # same conditions as regards security.
31 #
32 # The fact that you are presently reading this means that you have had
33 # knowledge of the CeCILL license and that you accept its terms.
34
35 from commons.core.utils.RepetOptionParser import RepetOptionParser
36 from commons.core.utils.FileUtils import FileUtils
37 from commons.core.sql.DbFactory import DbFactory
38 from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator
39 from commons.core.sql.TablePathAdaptator import TablePathAdaptator
40 import sys
41 import os
42
43 ## GFF3Maker exports annotations from a 'path' table into a GFF3 file.
44 #
45 class GFF3Maker(object):
46
47 def __init__(self, inFastaName = "", tablesFileName = "", classifTableName = "", isChado = False, isGFF3WithoutAnnotation = False, isWithSequence = False, areMatchPartsCompulsory = False, configFileName = "", verbose = 0, doMergeIdenticalMatches = False, doSplit = False):
48 self._inFastaName = inFastaName
49 self._tablesFileName = tablesFileName
50 self._classifTableName = classifTableName
51 self._isChado = isChado
52 self._isGFF3WithoutAnnotation = isGFF3WithoutAnnotation
53 self._isWithSequence = isWithSequence
54 self._areMatchPartsCompulsory = areMatchPartsCompulsory
55 self._configFileName = configFileName
56 self._doMergeIdenticalMatches = doMergeIdenticalMatches
57 self._doSplit = doSplit
58 self._iDB = None
59 self._verbose = verbose
60
61 def setAttributesFromCmdLine(self):
62 description = "GFF3Maker exports annotations from 'path', 'set' and/or 'classif' tables into a GFF3 file\n"
63 parser = RepetOptionParser(description = description)
64 parser.add_option("-f", "--inseq", dest = "inFastaName", action = "store", type = "string", help = "'seq' table recording the input sequences", default = "")
65 parser.add_option("-t", "--tablesfile", dest = "tablesFileName", action = "store", type = "string", help = "tabulated file of table name to use to create the gff3 files (fields: tier name, format, table name)", default = "")
66 parser.add_option("-w", "--withSequence", dest = "isWithSequence", action = "store_true", help = "write the sequence at the end of GFF3 file", default = False)
67 parser.add_option("-a", "--withoutAnnotation", dest = "isGFF3WithoutAnnotation", action = "store_true", help = "write GFF3 files even if no annotation", default = False)
68 parser.add_option("-p", "--matchPart", dest = "areMatchPartsCompulsory", action = "store_true", help = "always associate a match_part to a match", default = False)
69 parser.add_option("-i", "--classifTable", dest = "classifTable", action = "store", type = "string", help = "name of the TE library classification table [optional]", default = "")
70 parser.add_option("-c", "--chado", dest = "isChado", action = "store_true", help = "Chado compliance", default = False)
71 parser.add_option("-m", "--doMergeIdenticalMatches", dest = "doMergeIdenticalMatches", action = "store_true", help = "merge identical matches based on query start, query end, score", default = False)
72 parser.add_option("-s", "--doSplit", dest = "doSplit", action = "store_true", help = "split each GFF3 per annotation type", default = False)
73 parser.add_option("-C", "--config", dest = "configFileName", action = "store", type = "string", help = "configuration file for database connection", default = "")
74 parser.add_option("-v", "--verbose", dest = "verbose", action = "store", type = "int", help = "verbosity level (default=0, else 1 or 2)", default = 0)
75 options = parser.parse_args()[0]
76 self._setAttributesFromOptions(options)
77
78 #TODO: write a "setAttributesFromConfig"
79 def _setAttributesFromOptions(self, options):
80 self.setInFastaName(options.inFastaName)
81 self.setTablesFileName(options.tablesFileName)
82 self.setClassifTable(options.classifTable)
83 self.setIsChado(options.isChado)
84 self.setDoMergeIdenticalMatches(options.doMergeIdenticalMatches)
85 self.setIsWithSequence(options.isWithSequence)
86 self.setIsGFF3WithoutAnnotation(options.isGFF3WithoutAnnotation)
87 self.setAreMatchPartCompulsory(options.areMatchPartsCompulsory)
88 self.setDoSplit(options.doSplit)
89 self.setConfigFileName(options.configFileName)
90 self.setVerbose(options.verbose)
91
92 def setInFastaName(self, inFastaName):
93 self._inFastaName = inFastaName
94
95 def setTablesFileName(self, tablesFileName):
96 self._tablesFileName = tablesFileName
97
98 def setIsWithSequence(self, isWithSequence):
99 self._isWithSequence = isWithSequence
100
101 def setIsGFF3WithoutAnnotation(self, isGFF3WithoutAnnotation):
102 self._isGFF3WithoutAnnotation = isGFF3WithoutAnnotation
103
104 def setAreMatchPartCompulsory(self, areMatchPartsCompulsory):
105 self._areMatchPartsCompulsory = areMatchPartsCompulsory
106
107 def setClassifTable(self, classifTable):
108 self._classifTableName = classifTable
109
110 def setIsChado(self, isChado):
111 self._isChado = isChado
112
113 def setDoMergeIdenticalMatches(self, doMergeIdenticalMatches):
114 self._doMergeIdenticalMatches = doMergeIdenticalMatches
115
116 def setDoSplit(self, doSplit):
117 self._doSplit = doSplit
118
119 def setConfigFileName(self, configFileName):
120 self._configFileName = configFileName
121
122 def setVerbose(self, verbose):
123 self._verbose = verbose
124
125 def checkOptions(self):
126 if self._inFastaName == "":
127 raise Exception("ERROR: options -f required")
128
129 if self._configFileName != "":
130 if not FileUtils.isRessourceExists(self._configFileName):
131 raise Exception("ERROR: configuration file does not exist!")
132
133 if self._classifTableName and not self._iDB.doesTableExist(self._classifTableName):
134 raise Exception("ERROR: classification table '%s' does not exist!" % self._classifTableName)
135
136 ## Retrieve the features to write in the GFF3 file.
137 #
138 # @param pathTable string name of the table recording the annotations (i.e. the features)
139 # @param seqName string name of the sequence (the source feature) on which we want to visualize the matches (the features)
140 # @param source string the program that generated the feature (i.e. REPET)
141 # @param frame string "." by default (or "+", "-")
142 # @return pathString string which will be printed in path file
143 #
144 def _getPathFeatures(self, pathTable, seqTable, seqName, source, feature, frame):
145 pathString = ""
146 iTPA = TablePathAdaptator(self._iDB, pathTable)
147 lPaths = iTPA.getPathListSortedByQueryCoordAndScoreFromQuery(seqName)
148 # organise them into 'match' and 'match_part'
149 if lPaths:
150 dPathID2Data = self._gatherSamePathFeatures(lPaths)
151 # build the output string
152 for pathID in dPathID2Data:
153 pathString += self._organizeEachPathFeature(pathID, dPathID2Data[pathID], seqName, source, frame, seqTable)
154 return pathString
155
156 ## Gather matches with the same path ID.
157 #
158 # @param data list of string lists results of a SQL request
159 # @return dPathID2Matchs dict whose keys are path IDs and values are matches data
160 #
161 def _gatherSamePathFeatures(self, lPaths):
162 dPathID2Matchs = {}
163 iPreviousPath = lPaths[0]
164 iPreviousPath.otherTargets = []
165 for iPath in lPaths[1:]:
166 if self._doMergeIdenticalMatches and iPreviousPath.getQueryStart() == iPath.getQueryStart() and iPreviousPath.getQueryEnd() == iPath.getQueryEnd() and iPreviousPath.getScore() == iPath.getScore() and iPreviousPath.getSubjectName() != iPath.getSubjectName():
167 iPreviousPath.otherTargets.append("%s %d %d" % (iPath.getSubjectName(), iPath.getSubjectStart(), iPath.getSubjectEnd()))
168 else:
169 self._addPathToMatchDict(dPathID2Matchs, iPreviousPath)
170 iPreviousPath = iPath
171 iPreviousPath.otherTargets = []
172 self._addPathToMatchDict(dPathID2Matchs, iPreviousPath)
173 return dPathID2Matchs
174
175 def _getClassifEvidenceBySeqName(self, seqName):
176 lseqName = seqName.split('_')
177 seqNameStructure = '%s_%%' % lseqName[0]
178 if lseqName[-1] == 'reversed':
179 seqNameStructure += '%s_%s' % (lseqName[-2],lseqName[-1])
180 else:
181 seqNameStructure += lseqName[-1]
182 qry = "SELECT evidence FROM %s WHERE seq_name like \"%s\"" % (self._classifTableName, seqNameStructure)
183 self._iDB.execute(qry)
184 result = ""
185 try:
186 result = "".join(self._iDB.fetchall()[0])
187 except: pass
188 return result
189
190 def _addPathToMatchDict(self, dPathID2Matchs, iPreviousPath):
191 pathId = iPreviousPath.getIdentifier()
192 subjectStart = iPreviousPath.getSubjectStart()
193 subjectEnd = iPreviousPath.getSubjectEnd()
194 strand = iPreviousPath.getSubjectStrand()
195 if subjectStart > subjectEnd:
196 tmp = subjectStart
197 subjectStart = subjectEnd
198 subjectEnd = tmp
199 queryStart = iPreviousPath.getQueryStart()
200 queryEnd = iPreviousPath.getQueryEnd()
201 subjectName = iPreviousPath.getSubjectName()
202 eValue = iPreviousPath.getEvalue()
203 identity = iPreviousPath.getIdentity()
204 otherTargets = iPreviousPath.otherTargets
205 if dPathID2Matchs.has_key(pathId):
206 dPathID2Matchs[pathId].append([queryStart, queryEnd, strand, subjectName, subjectStart, subjectEnd, eValue, identity, otherTargets])
207 else:
208 dPathID2Matchs[pathId] = [[queryStart, queryEnd, strand, subjectName, subjectStart, subjectEnd, eValue, identity, otherTargets]]
209
210 def _getConsensusLengthByTargetName(self, targetName, seqTableName):
211 iTableSeqAdaptator = TableSeqAdaptator(self._iDB, seqTableName)
212 return iTableSeqAdaptator.getSeqLengthFromDescription(targetName)
213
214 ## For a specific path ID, organize match data according to the GFF3 format.
215 #
216 # @param pathID string path ID
217 # @param lMatches match list
218 # @param seqName string name of the source feature
219 # @param source string 'source' field for GFF3 format
220 # @param frame string 'frame' field for GFF3 format
221 # @return lines string to write in the GFF3 file
222 #
223 def _organizeEachPathFeature(self, pathID, lMatches, seqName, source, frame, seqTable = ""):
224 lines = ""
225 minStart = lMatches[0][0]
226 maxEnd = lMatches[0][1]
227 minStartSubject = lMatches[0][4]
228 maxEndSubject = lMatches[0][5]
229 strand = lMatches[0][2]
230
231 # for each match
232 for i in lMatches:
233 if i[0] < minStart:
234 minStart = i[0]
235 if i[1] > maxEnd:
236 maxEnd = i[1]
237 if i[4] < minStartSubject:
238 minStartSubject = i[4]
239 if i[5] > maxEndSubject:
240 maxEndSubject = i[5]
241
242 target = lMatches[0][3]
243
244 targetDescTag = ""
245 if self._classifTableName != "":
246 targetDescTag = self._getClassifEvidenceBySeqName(target)
247 if targetDescTag != "":
248 targetDescTag = ";TargetDescription=%s" % targetDescTag.replace('=', ':').replace(';', '').replace(',', ' |')
249
250 targetLengthTag = ""
251 if seqTable != "":
252 targetLengthTag = self._getConsensusLengthByTargetName(target, seqTable)
253 targetLengthTag = ";TargetLength=%i" % targetLengthTag
254
255 attributes = "ID=ms%s_%s_%s" % (pathID, seqName, target)
256 if self._isChado:
257 attributes += ";Target=%s+%s+%s" % (target, minStartSubject, maxEndSubject)
258 else:
259 attributes += ";Target=%s %s %s" % (target, minStartSubject, maxEndSubject)
260
261 if lMatches[0][8]:
262 otherTargets = ", ".join(lMatches[0][8])
263 attributes += ";OtherTargets=%s" % otherTargets
264 else:
265 attributes += targetLengthTag
266 attributes += targetDescTag
267 if len(lMatches) == 1:
268 attributes += ";Identity=%s" % lMatches[0][7]
269
270
271 lines += "%s\t%s\tmatch\t%s\t%s\t0.0\t%s\t%s\t%s\n" % (seqName, source, minStart, maxEnd, strand, frame, attributes)
272
273 if len(lMatches) > 1 or self._areMatchPartsCompulsory:
274 count = 1
275 for i in lMatches:
276 attributes = "ID=mp%s-%i_%s_%s" % (pathID, count, seqName, target)
277 attributes += ";Parent=ms%s%s%s%s%s" % (pathID, "_", seqName, "_", target)
278 if self._isChado:
279 attributes += ";Target=%s+%s+%s" % (target, i[4], i[5])
280 else:
281 attributes += ";Target=%s %s %s" % (target, i[4], i[5])
282
283 if not i[8]:
284 attributes += ";Identity=%s" % i[7]
285 lines += "%s\t%s\tmatch_part\t%s\t%s\t%s\t%s\t%s\t%s\n" % (seqName, source, i[0], i[1], i[6], i[2], frame, attributes)
286 count += 1
287
288 return lines
289
290 ## Retrieve the features to write in the GFF3 file.
291 #
292 # @param table string name of the table recording the annotations (i.e. the features)
293 # @param key string name of the sequence (the source feature) on which we want to visualize the matches (the features)
294 # @param source string the program that generated the feature (i.e. REPET)
295 # @param frame string "." by default (or "+", "-")
296 # @return setString string which will be printed in set file
297 #
298 def _getSetFeatures(self, table, key, source, feature, frame):
299 setString = ""
300 # retrieve all the data about the matches
301 qry = "SELECT DISTINCT path,name,start,end FROM %s WHERE chr=\"%s\"" % (table, key)
302 self._iDB.execute(qry)
303 data = self._iDB.fetchall()
304 # organise them into 'match' and 'match_part'
305 dPathID2Data = self._gatherSameSetFeatures(data)
306 # build the output string
307 for pathID in dPathID2Data:
308 setString += self._organizeEachSetFeature(pathID, dPathID2Data[pathID], key, source, frame)
309 return setString
310
311 ## Gather matches with the same path ID.
312 #
313 # @param data list of string lists results of a SQL request
314 # @return dSetID2Matchs dict whose keys are set IDs and values are matches data
315 #
316 def _gatherSameSetFeatures(self, data):
317 dSetID2Matchs = {}
318
319 for i in data:
320 setID = i[0]
321 name = i[1]
322 start = i[2]
323 end = i[3]
324
325 matchStart = min(start, end)
326 matchEnd = max(start, end)
327 strand = self._getStrand(start, end)
328
329 if dSetID2Matchs.has_key(setID):
330 dSetID2Matchs[setID].append([name, matchStart, matchEnd, strand])
331
332 else:
333 dSetID2Matchs[setID] = [[name, matchStart, matchEnd, strand]]
334
335 return dSetID2Matchs
336
337 ## For a specific set ID, organize match data according to the GFF3 format.
338 #
339 # @param setID string path ID
340 # @param lMatches match list
341 # @param seqName string name of the source feature
342 # @param source string 'source' field for GFF3 format
343 # @param frame string 'frame' field for GFF3 format
344 # @return lines string to write in the GFF3 file
345 #
346 def _organizeEachSetFeature(self, setID, lMatches, seqName, source, frame):
347 lines = ""
348 minStart = min(lMatches[0][1], lMatches[0][2])
349 maxEnd = max(lMatches[0][1], lMatches[0][2])
350 strand = lMatches[0][3]
351
352 # for each match
353 for i in lMatches:
354 start = min(i[1],i[2])
355 if start < minStart:
356 minStart = start
357 end = max(i[1],i[2])
358 if end > maxEnd:
359 maxEnd = end
360
361 target = lMatches[0][0].replace("(","").replace(")","").replace("#","")
362
363 attributes = "ID=ms%s_%s_%s" % (setID, seqName, target)
364 if self._isChado:
365 attributes += ";Target=%s+%s+%s" % (target, "1", abs(minStart-maxEnd)+1)
366 else:
367 attributes += ";Target=%s %s %s" % (target, "1", abs(minStart-maxEnd)+1)
368 lines += "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (seqName, source, "match", minStart, maxEnd, "0.0", strand, frame, attributes)
369
370 if len(lMatches) > 1 or self._areMatchPartsCompulsory:
371 count = 1
372 for i in lMatches:
373 attributes = "ID=mp%s-%i_%s_%s" % (setID, count, seqName, target)
374 attributes += ";Parent=ms%s%s%s%s%s" % (setID, "_", seqName, "_", target)
375 if self._isChado:
376 attributes += ";Target=%s+%s+%s" % (target, "1", abs(i[1]-i[2])+1)
377 else:
378 attributes += ";Target=%s %s %s" % (target, "1", abs(i[1]-i[2])+1)
379 lines += "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (seqName, source, "match_part", i[1], i[2], "0.0", i[3], frame, attributes)
380 count += 1
381
382 return lines
383
384 ## Return the strand ('+' if start < end, '-' otherwise).
385 #
386 # @param start integer start coordinate
387 # @param end integer end coordinate
388 # @return strand string "+" or "-"
389 #
390 def _getStrand(self, start, end):
391 if start < end:
392 return "+"
393 else:
394 return "-"
395
396 def run(self):
397 #TODO: cat all gff in one gff file in option
398 if self._configFileName == "":
399 self._iDB = DbFactory.createInstance()
400 else:
401 self._iDB = DbFactory.createInstance(self._configFileName)
402
403 self.checkOptions()
404 if self._verbose > 0:
405 print "START GFF3Maker"
406 sys.stdout.flush()
407
408 tablesFile = open(self._tablesFileName, "r")
409 linesFromAnnotationTablesFile = tablesFile.readlines()
410 tablesFile.close()
411 feature = "region"
412 frame = "."
413
414 iTSA = TableSeqAdaptator(self._iDB, self._inFastaName)
415 lTuples = iTSA.getAccessionAndLengthList()
416 for seqName, length in lTuples :
417 if not self._doSplit:
418 fileName = "%s.gff3" % seqName
419 outFile = open(fileName, "w")
420 outFile.write("##gff-version 3\n")
421 outFile.write("##sequence-region %s 1 %s\n" % (seqName, length))
422 for line in linesFromAnnotationTablesFile:
423 if line[0] == "#":
424 continue
425 tok = line.split()
426 if len(tok) == 0:
427 break
428 source = tok[0]
429 format = tok[1]
430 table = tok[2]
431 tableseq = ""
432 if len(tok) == 4:
433 tableseq = tok[3]
434 if format == 'path' :
435 annotations = self._getPathFeatures(table, tableseq, seqName, source, feature, frame)
436 elif format == 'set' :
437 annotations = self._getSetFeatures(table, seqName, source, feature, frame)
438 else:
439 raise Exception("Wrong format : %s" % format)
440 outFile.write(annotations)
441 outFile.close()
442 #TODO: check getNbLinesInSingleFile() to handle big files
443 if not self._isGFF3WithoutAnnotation and FileUtils.getNbLinesInSingleFile(fileName) == 2:
444 os.remove(fileName)
445 elif self._isWithSequence:
446 outFile = open(fileName, "a")
447 outFile.write("##FASTA\n")
448 iBioseq = iTSA.getBioseqFromHeader(seqName)
449 iBioseq.write(outFile)
450 outFile.close()
451 else:
452 count = 1
453 for line in linesFromAnnotationTablesFile:
454 if line[0] == "#":
455 continue
456 tok = line.split()
457 if len(tok) == 0:
458 break
459 source = tok[0]
460 format = tok[1]
461 table = tok[2]
462 tableseq = ""
463 if len(tok) == 4:
464 tableseq = tok[3]
465 fileName = "%s_Annot%i.gff3" % (seqName, count)
466 outFile = open(fileName, "w")
467 outFile.write("##gff-version 3\n")
468 outFile.write("##sequence-region %s 1 %s\n" % (seqName, length))
469 if format == 'path' :
470 annotations = self._getPathFeatures(table, tableseq, seqName, source, feature, frame)
471 elif format == 'set' :
472 annotations = self._getSetFeatures(table, seqName, source, feature, frame)
473 else:
474 raise Exception("Wrong format : %s" % format)
475 outFile.write(annotations)
476 outFile.close()
477 #TODO: check getNbLinesInSingleFile() to handle big files
478 if not self._isGFF3WithoutAnnotation and FileUtils.getNbLinesInSingleFile(fileName) == 2:
479 os.remove(fileName)
480 elif self._isWithSequence:
481 outFile = open(fileName, "a")
482 outFile.write("##FASTA\n")
483 iBioseq = iTSA.getBioseqFromHeader(seqName)
484 iBioseq.write(outFile)
485 outFile.close()
486 count += 1
487
488 self._iDB.close()
489
490 if self._verbose > 0:
491 print "END GFF3Maker"
492 sys.stdout.flush()
493
494 if __name__ == "__main__":
495 iGFF3Maker = GFF3Maker()
496 iGFF3Maker.setAttributesFromCmdLine()
497 iGFF3Maker.run()
498