18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import sys
|
|
4 import os
|
|
5 import getopt
|
|
6 import math
|
|
7 from commons.core.sql.DbMySql import DbMySql
|
|
8 from commons.core.sql.TablePathAdaptator import TablePathAdaptator
|
|
9 from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator
|
|
10 from commons.core.coord.PathUtils import PathUtils
|
|
11 from commons.core.coord.SetUtils import SetUtils
|
|
12 from commons.core.seq.BioseqUtils import BioseqUtils
|
|
13
|
|
14
|
|
15 class CorrelateTEageWithGCcontent( object ):
|
|
16
|
|
17 def __init__( self ):
|
|
18 self._inputCoord = ""
|
|
19 self._inputGenome = ""
|
|
20 self._inputTErefseq = ""
|
|
21 self._configFile = ""
|
|
22 self._outFile = ""
|
|
23 self._verbose = 0
|
|
24 self._db = None
|
|
25 self._tableCoord = ""
|
|
26 self._pathA = TablePathAdaptator()
|
|
27 self._tableGenome = ""
|
|
28 self._seqA = TableSeqAdaptator()
|
|
29
|
|
30
|
|
31 def help( self ):
|
|
32 print
|
|
33 print "usage: CorrelateTEageWithGCcontent.py [ options ]"
|
|
34 print "options:"
|
|
35 print " -h: this help"
|
|
36 print " -i: input TE coordinates (can be file or table)"
|
|
37 print " TEs as subjects in 'path' format"
|
|
38 print " -g: input genome sequences (can be fasta file or table)"
|
|
39 print " -r: input TE reference sequences (can be fasta file or table)"
|
|
40 print " -C: configuration file (if table as input)"
|
|
41 print " -o: output fasta file (default=inputCoord+'.gc')"
|
|
42 print " -v: verbosity level (default=0/1)"
|
|
43 print
|
|
44
|
|
45
|
|
46 def setAttributesFromCmdLine( self ):
|
|
47 try:
|
|
48 opts, args = getopt.getopt(sys.argv[1:],"hi:g:r:C:o:v:")
|
|
49 except getopt.GetoptError, err:
|
|
50 msg = "%s" % str(err)
|
|
51 sys.stderr.write( "%s\n" % msg )
|
|
52 self.help(); sys.exit(1)
|
|
53 for o,a in opts:
|
|
54 if o == "-h":
|
|
55 self.help(); sys.exit(0)
|
|
56 elif o == "-i":
|
|
57 self._inputCoord = a
|
|
58 elif o == "-g":
|
|
59 self._inputGenome = a
|
|
60 elif o == "-r":
|
|
61 self._inputTErefseq = a
|
|
62 elif o == "-C":
|
|
63 self._configFile = a
|
|
64 elif o =="-o":
|
|
65 self._outFile = a
|
|
66 elif o == "-v":
|
|
67 self._verbose = int(a)
|
|
68
|
|
69
|
|
70 def checkAttributes( self ):
|
|
71 if self._inputCoord == "":
|
|
72 msg = "ERROR: missing input TE coordinates (-i)"
|
|
73 sys.stderr.write( "%s\n" % msg )
|
|
74 self.help()
|
|
75 sys.exit(1)
|
|
76 if not os.path.exists( self._inputCoord ):
|
|
77 if not os.path.exists( self._configFile ):
|
|
78 msg = "ERROR: neither input file '%s' nor configuration file '%s'" % ( self._inputCoord, self._configFile )
|
|
79 sys.stderr.write( "%s\n" % msg )
|
|
80 self.help()
|
|
81 sys.exit(1)
|
|
82 if not os.path.exists( self._configFile ):
|
|
83 msg = "ERROR: can't find configuration file '%s'" % ( self._configFile )
|
|
84 sys.stderr.write( "%s\n" % msg )
|
|
85 sys.exit(1)
|
|
86 self._db = DbMySql( cfgFileName=self._configFile )
|
|
87 if not self._db.exist( self._inputCoord ):
|
|
88 msg = "ERROR: can't find table '%s'" % ( self._inputCoord )
|
|
89 sys.stderr.write( "%s\n" % msg )
|
|
90 self.help()
|
|
91 sys.exit(1)
|
|
92 self._tableCoord = self._inputCoord
|
|
93 else:
|
|
94 self._tableCoord = self._inputCoord.replace(".","_")
|
|
95 if self._inputGenome == "":
|
|
96 msg = "ERROR: missing input genome sequences (-g)"
|
|
97 sys.stderr.write( "%s\n" % msg )
|
|
98 self.help()
|
|
99 sys.exit(1)
|
|
100 if not os.path.exists( self._inputGenome ):
|
|
101 if not self._db.doesTableExist( self._inputGenome ):
|
|
102 msg = "ERROR: can't find table '%s'" % ( self._inputGenome )
|
|
103 sys.stderr.write( "%s\n" % msg )
|
|
104 self.help()
|
|
105 sys.exit(1)
|
|
106 self._tableGenome = self._inputGenome
|
|
107 else:
|
|
108 self._tableGenome = self._inputGenome.replace(".","_")
|
|
109 if self._inputTErefseq == "":
|
|
110 msg = "ERROR: missing input TE reference sequences (-r)"
|
|
111 sys.stderr.write( "%s\n" % msg )
|
|
112 self.help()
|
|
113 sys.exit(1)
|
|
114 if not os.path.exists( self._inputTErefseq ):
|
|
115 if not self._db.doesTableExist( self._inputTErefseq ):
|
|
116 msg = "ERROR: can't find table '%s'" % ( self._inputTErefseq )
|
|
117 sys.stderr.write( "%s\n" % msg )
|
|
118 self.help()
|
|
119 sys.exit(1)
|
|
120 if self._outFile == "":
|
|
121 self._outFile = "%s.gc" % ( self._inputCoord )
|
|
122
|
|
123
|
|
124 def getLengthOfTErefseqs( self ):
|
|
125 if os.path.exists( self._inputTErefseq ):
|
|
126 return BioseqUtils.getLengthPerSeqFromFile( self._inputTErefseq )
|
|
127 else:
|
|
128 dTErefseq2Length = {}
|
|
129 refseqA = TableSeqAdaptator( self._db, self._inputTErefseq )
|
|
130 lAccessions = refseqA.getAccessionsList()
|
|
131 for acc in lAccessions:
|
|
132 dTErefseq2Length[ acc ] = refseqA.getSeqLengthFromAccession( acc )
|
|
133 return dTErefseq2Length
|
|
134
|
|
135
|
|
136 def start( self ):
|
|
137 self.checkAttributes()
|
|
138 if self._verbose > 0:
|
|
139 print "START CorrelateTEageWithGCcontent.py"
|
|
140 sys.stdout.flush()
|
|
141 if os.path.exists( self._inputCoord ):
|
|
142 self._db = DbMySql( cfgFileName=self._configFile )
|
|
143 self._db.createTable( self._tableCoord, "path", self._inputCoord, True )
|
|
144 self._pathA = TablePathAdaptator( self._db, self._tableCoord )
|
|
145 if os.path.exists( self._inputGenome ):
|
|
146 self._db.createTable( self._tableGenome, "seq", self._inputGenome, True )
|
|
147 self._seqA = TableSeqAdaptator( self._db, self._tableGenome )
|
|
148 if self._verbose > 0:
|
|
149 print "output fasta file: %s" % self._outFile
|
|
150
|
|
151
|
|
152 def end( self ):
|
|
153 if os.path.exists( self._inputCoord ):
|
|
154 self._db.dropTable( self._tableCoord )
|
|
155 if os.path.exists( self._inputGenome ):
|
|
156 self._db.dropTable( self._tableGenome )
|
|
157 self._db.close()
|
|
158 if self._verbose > 0:
|
|
159 print "END CorrelateTEageWithGCcontent.py"
|
|
160 sys.stdout.flush()
|
|
161
|
|
162
|
|
163 def run( self ):
|
|
164 self.start()
|
|
165
|
|
166 dTErefseq2Length = self.getLengthOfTErefseqs()
|
|
167
|
|
168 outFileHandler = open( self._outFile, "w" )
|
|
169 outFileHandler.write( "copy\tTE\tchr\tlength\tid\tGC\tlengthPerc\n" )
|
|
170
|
|
171 lIdentifiers = self._pathA.getIdList()
|
|
172 nbTEcopies = len(lIdentifiers)
|
|
173 if self._verbose > 0:
|
|
174 print "nb of TE copies: %i" % ( nbTEcopies )
|
|
175 sys.stdout.flush()
|
|
176 count = 0
|
|
177 power10 = int( math.floor( math.log10( nbTEcopies ) ) ) - 1
|
|
178 for id in lIdentifiers:
|
|
179 count += 1
|
|
180 if self._verbose > 0 and power10 > 0 and count % math.pow(10,power10) == 0:
|
|
181 print "%s / %s" % ( str(count).zfill(power10+2), str(nbTEcopies).zfill(power10+2) )
|
|
182 sys.stdout.flush()
|
|
183 lPaths = self._pathA.getPathListFromId( id )
|
|
184 lSets = PathUtils.getSetListFromQueries( lPaths )
|
|
185 lMergedSets = SetUtils.mergeSetsInList( lSets )
|
|
186 bs = self._seqA.getBioseqFromSetList( lMergedSets )
|
|
187 data = "%i" % id
|
|
188 data += "\t%s" % ( bs.header.split("::")[0] )
|
|
189 data += "\t%s" % ( lPaths[0].getQueryName() )
|
|
190 data += "\t%i" % ( bs.getLength() )
|
|
191 data += "\t%.2f" % ( PathUtils.getIdentityFromPathList( lPaths ) )
|
|
192 data += "\t%.2f" % ( bs.getGCpercentage() )
|
|
193 data += "\t%.2f" % ( 100 * bs.getLength() / float( dTErefseq2Length[ bs.header.split("::")[0] ] ) )
|
|
194 outFileHandler.write( "%s\n" % data )
|
|
195
|
|
196 outFileHandler.close()
|
|
197
|
|
198 self.end()
|
|
199
|
|
200
|
|
201 if __name__ == "__main__":
|
|
202 i = CorrelateTEageWithGCcontent()
|
|
203 i.setAttributesFromCmdLine()
|
|
204 i.run()
|