comparison commons/tools/CorrelateTEageWithGCcontent.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
comparison
equal deleted inserted replaced
17:b0e8584489e6 18:94ab73e8a190
1 #!/usr/bin/env python
2
3 import sys
4 import os
5 import getopt
6 import math
7 from commons.core.sql.DbMySql import DbMySql
8 from commons.core.sql.TablePathAdaptator import TablePathAdaptator
9 from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator
10 from commons.core.coord.PathUtils import PathUtils
11 from commons.core.coord.SetUtils import SetUtils
12 from commons.core.seq.BioseqUtils import BioseqUtils
13
14
15 class CorrelateTEageWithGCcontent( object ):
16
17 def __init__( self ):
18 self._inputCoord = ""
19 self._inputGenome = ""
20 self._inputTErefseq = ""
21 self._configFile = ""
22 self._outFile = ""
23 self._verbose = 0
24 self._db = None
25 self._tableCoord = ""
26 self._pathA = TablePathAdaptator()
27 self._tableGenome = ""
28 self._seqA = TableSeqAdaptator()
29
30
31 def help( self ):
32 print
33 print "usage: CorrelateTEageWithGCcontent.py [ options ]"
34 print "options:"
35 print " -h: this help"
36 print " -i: input TE coordinates (can be file or table)"
37 print " TEs as subjects in 'path' format"
38 print " -g: input genome sequences (can be fasta file or table)"
39 print " -r: input TE reference sequences (can be fasta file or table)"
40 print " -C: configuration file (if table as input)"
41 print " -o: output fasta file (default=inputCoord+'.gc')"
42 print " -v: verbosity level (default=0/1)"
43 print
44
45
46 def setAttributesFromCmdLine( self ):
47 try:
48 opts, args = getopt.getopt(sys.argv[1:],"hi:g:r:C:o:v:")
49 except getopt.GetoptError, err:
50 msg = "%s" % str(err)
51 sys.stderr.write( "%s\n" % msg )
52 self.help(); sys.exit(1)
53 for o,a in opts:
54 if o == "-h":
55 self.help(); sys.exit(0)
56 elif o == "-i":
57 self._inputCoord = a
58 elif o == "-g":
59 self._inputGenome = a
60 elif o == "-r":
61 self._inputTErefseq = a
62 elif o == "-C":
63 self._configFile = a
64 elif o =="-o":
65 self._outFile = a
66 elif o == "-v":
67 self._verbose = int(a)
68
69
70 def checkAttributes( self ):
71 if self._inputCoord == "":
72 msg = "ERROR: missing input TE coordinates (-i)"
73 sys.stderr.write( "%s\n" % msg )
74 self.help()
75 sys.exit(1)
76 if not os.path.exists( self._inputCoord ):
77 if not os.path.exists( self._configFile ):
78 msg = "ERROR: neither input file '%s' nor configuration file '%s'" % ( self._inputCoord, self._configFile )
79 sys.stderr.write( "%s\n" % msg )
80 self.help()
81 sys.exit(1)
82 if not os.path.exists( self._configFile ):
83 msg = "ERROR: can't find configuration file '%s'" % ( self._configFile )
84 sys.stderr.write( "%s\n" % msg )
85 sys.exit(1)
86 self._db = DbMySql( cfgFileName=self._configFile )
87 if not self._db.exist( self._inputCoord ):
88 msg = "ERROR: can't find table '%s'" % ( self._inputCoord )
89 sys.stderr.write( "%s\n" % msg )
90 self.help()
91 sys.exit(1)
92 self._tableCoord = self._inputCoord
93 else:
94 self._tableCoord = self._inputCoord.replace(".","_")
95 if self._inputGenome == "":
96 msg = "ERROR: missing input genome sequences (-g)"
97 sys.stderr.write( "%s\n" % msg )
98 self.help()
99 sys.exit(1)
100 if not os.path.exists( self._inputGenome ):
101 if not self._db.doesTableExist( self._inputGenome ):
102 msg = "ERROR: can't find table '%s'" % ( self._inputGenome )
103 sys.stderr.write( "%s\n" % msg )
104 self.help()
105 sys.exit(1)
106 self._tableGenome = self._inputGenome
107 else:
108 self._tableGenome = self._inputGenome.replace(".","_")
109 if self._inputTErefseq == "":
110 msg = "ERROR: missing input TE reference sequences (-r)"
111 sys.stderr.write( "%s\n" % msg )
112 self.help()
113 sys.exit(1)
114 if not os.path.exists( self._inputTErefseq ):
115 if not self._db.doesTableExist( self._inputTErefseq ):
116 msg = "ERROR: can't find table '%s'" % ( self._inputTErefseq )
117 sys.stderr.write( "%s\n" % msg )
118 self.help()
119 sys.exit(1)
120 if self._outFile == "":
121 self._outFile = "%s.gc" % ( self._inputCoord )
122
123
124 def getLengthOfTErefseqs( self ):
125 if os.path.exists( self._inputTErefseq ):
126 return BioseqUtils.getLengthPerSeqFromFile( self._inputTErefseq )
127 else:
128 dTErefseq2Length = {}
129 refseqA = TableSeqAdaptator( self._db, self._inputTErefseq )
130 lAccessions = refseqA.getAccessionsList()
131 for acc in lAccessions:
132 dTErefseq2Length[ acc ] = refseqA.getSeqLengthFromAccession( acc )
133 return dTErefseq2Length
134
135
136 def start( self ):
137 self.checkAttributes()
138 if self._verbose > 0:
139 print "START CorrelateTEageWithGCcontent.py"
140 sys.stdout.flush()
141 if os.path.exists( self._inputCoord ):
142 self._db = DbMySql( cfgFileName=self._configFile )
143 self._db.createTable( self._tableCoord, "path", self._inputCoord, True )
144 self._pathA = TablePathAdaptator( self._db, self._tableCoord )
145 if os.path.exists( self._inputGenome ):
146 self._db.createTable( self._tableGenome, "seq", self._inputGenome, True )
147 self._seqA = TableSeqAdaptator( self._db, self._tableGenome )
148 if self._verbose > 0:
149 print "output fasta file: %s" % self._outFile
150
151
152 def end( self ):
153 if os.path.exists( self._inputCoord ):
154 self._db.dropTable( self._tableCoord )
155 if os.path.exists( self._inputGenome ):
156 self._db.dropTable( self._tableGenome )
157 self._db.close()
158 if self._verbose > 0:
159 print "END CorrelateTEageWithGCcontent.py"
160 sys.stdout.flush()
161
162
163 def run( self ):
164 self.start()
165
166 dTErefseq2Length = self.getLengthOfTErefseqs()
167
168 outFileHandler = open( self._outFile, "w" )
169 outFileHandler.write( "copy\tTE\tchr\tlength\tid\tGC\tlengthPerc\n" )
170
171 lIdentifiers = self._pathA.getIdList()
172 nbTEcopies = len(lIdentifiers)
173 if self._verbose > 0:
174 print "nb of TE copies: %i" % ( nbTEcopies )
175 sys.stdout.flush()
176 count = 0
177 power10 = int( math.floor( math.log10( nbTEcopies ) ) ) - 1
178 for id in lIdentifiers:
179 count += 1
180 if self._verbose > 0 and power10 > 0 and count % math.pow(10,power10) == 0:
181 print "%s / %s" % ( str(count).zfill(power10+2), str(nbTEcopies).zfill(power10+2) )
182 sys.stdout.flush()
183 lPaths = self._pathA.getPathListFromId( id )
184 lSets = PathUtils.getSetListFromQueries( lPaths )
185 lMergedSets = SetUtils.mergeSetsInList( lSets )
186 bs = self._seqA.getBioseqFromSetList( lMergedSets )
187 data = "%i" % id
188 data += "\t%s" % ( bs.header.split("::")[0] )
189 data += "\t%s" % ( lPaths[0].getQueryName() )
190 data += "\t%i" % ( bs.getLength() )
191 data += "\t%.2f" % ( PathUtils.getIdentityFromPathList( lPaths ) )
192 data += "\t%.2f" % ( bs.getGCpercentage() )
193 data += "\t%.2f" % ( 100 * bs.getLength() / float( dTErefseq2Length[ bs.header.split("::")[0] ] ) )
194 outFileHandler.write( "%s\n" % data )
195
196 outFileHandler.close()
197
198 self.end()
199
200
201 if __name__ == "__main__":
202 i = CorrelateTEageWithGCcontent()
203 i.setAttributesFromCmdLine()
204 i.run()