Mercurial > repos > yufei-luo > s_mart
comparison commons/tools/CorrelateTEageWithGCcontent.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
17:b0e8584489e6 | 18:94ab73e8a190 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import sys | |
4 import os | |
5 import getopt | |
6 import math | |
7 from commons.core.sql.DbMySql import DbMySql | |
8 from commons.core.sql.TablePathAdaptator import TablePathAdaptator | |
9 from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator | |
10 from commons.core.coord.PathUtils import PathUtils | |
11 from commons.core.coord.SetUtils import SetUtils | |
12 from commons.core.seq.BioseqUtils import BioseqUtils | |
13 | |
14 | |
15 class CorrelateTEageWithGCcontent( object ): | |
16 | |
17 def __init__( self ): | |
18 self._inputCoord = "" | |
19 self._inputGenome = "" | |
20 self._inputTErefseq = "" | |
21 self._configFile = "" | |
22 self._outFile = "" | |
23 self._verbose = 0 | |
24 self._db = None | |
25 self._tableCoord = "" | |
26 self._pathA = TablePathAdaptator() | |
27 self._tableGenome = "" | |
28 self._seqA = TableSeqAdaptator() | |
29 | |
30 | |
31 def help( self ): | |
32 print | |
33 print "usage: CorrelateTEageWithGCcontent.py [ options ]" | |
34 print "options:" | |
35 print " -h: this help" | |
36 print " -i: input TE coordinates (can be file or table)" | |
37 print " TEs as subjects in 'path' format" | |
38 print " -g: input genome sequences (can be fasta file or table)" | |
39 print " -r: input TE reference sequences (can be fasta file or table)" | |
40 print " -C: configuration file (if table as input)" | |
41 print " -o: output fasta file (default=inputCoord+'.gc')" | |
42 print " -v: verbosity level (default=0/1)" | |
43 print | |
44 | |
45 | |
46 def setAttributesFromCmdLine( self ): | |
47 try: | |
48 opts, args = getopt.getopt(sys.argv[1:],"hi:g:r:C:o:v:") | |
49 except getopt.GetoptError, err: | |
50 msg = "%s" % str(err) | |
51 sys.stderr.write( "%s\n" % msg ) | |
52 self.help(); sys.exit(1) | |
53 for o,a in opts: | |
54 if o == "-h": | |
55 self.help(); sys.exit(0) | |
56 elif o == "-i": | |
57 self._inputCoord = a | |
58 elif o == "-g": | |
59 self._inputGenome = a | |
60 elif o == "-r": | |
61 self._inputTErefseq = a | |
62 elif o == "-C": | |
63 self._configFile = a | |
64 elif o =="-o": | |
65 self._outFile = a | |
66 elif o == "-v": | |
67 self._verbose = int(a) | |
68 | |
69 | |
70 def checkAttributes( self ): | |
71 if self._inputCoord == "": | |
72 msg = "ERROR: missing input TE coordinates (-i)" | |
73 sys.stderr.write( "%s\n" % msg ) | |
74 self.help() | |
75 sys.exit(1) | |
76 if not os.path.exists( self._inputCoord ): | |
77 if not os.path.exists( self._configFile ): | |
78 msg = "ERROR: neither input file '%s' nor configuration file '%s'" % ( self._inputCoord, self._configFile ) | |
79 sys.stderr.write( "%s\n" % msg ) | |
80 self.help() | |
81 sys.exit(1) | |
82 if not os.path.exists( self._configFile ): | |
83 msg = "ERROR: can't find configuration file '%s'" % ( self._configFile ) | |
84 sys.stderr.write( "%s\n" % msg ) | |
85 sys.exit(1) | |
86 self._db = DbMySql( cfgFileName=self._configFile ) | |
87 if not self._db.exist( self._inputCoord ): | |
88 msg = "ERROR: can't find table '%s'" % ( self._inputCoord ) | |
89 sys.stderr.write( "%s\n" % msg ) | |
90 self.help() | |
91 sys.exit(1) | |
92 self._tableCoord = self._inputCoord | |
93 else: | |
94 self._tableCoord = self._inputCoord.replace(".","_") | |
95 if self._inputGenome == "": | |
96 msg = "ERROR: missing input genome sequences (-g)" | |
97 sys.stderr.write( "%s\n" % msg ) | |
98 self.help() | |
99 sys.exit(1) | |
100 if not os.path.exists( self._inputGenome ): | |
101 if not self._db.doesTableExist( self._inputGenome ): | |
102 msg = "ERROR: can't find table '%s'" % ( self._inputGenome ) | |
103 sys.stderr.write( "%s\n" % msg ) | |
104 self.help() | |
105 sys.exit(1) | |
106 self._tableGenome = self._inputGenome | |
107 else: | |
108 self._tableGenome = self._inputGenome.replace(".","_") | |
109 if self._inputTErefseq == "": | |
110 msg = "ERROR: missing input TE reference sequences (-r)" | |
111 sys.stderr.write( "%s\n" % msg ) | |
112 self.help() | |
113 sys.exit(1) | |
114 if not os.path.exists( self._inputTErefseq ): | |
115 if not self._db.doesTableExist( self._inputTErefseq ): | |
116 msg = "ERROR: can't find table '%s'" % ( self._inputTErefseq ) | |
117 sys.stderr.write( "%s\n" % msg ) | |
118 self.help() | |
119 sys.exit(1) | |
120 if self._outFile == "": | |
121 self._outFile = "%s.gc" % ( self._inputCoord ) | |
122 | |
123 | |
124 def getLengthOfTErefseqs( self ): | |
125 if os.path.exists( self._inputTErefseq ): | |
126 return BioseqUtils.getLengthPerSeqFromFile( self._inputTErefseq ) | |
127 else: | |
128 dTErefseq2Length = {} | |
129 refseqA = TableSeqAdaptator( self._db, self._inputTErefseq ) | |
130 lAccessions = refseqA.getAccessionsList() | |
131 for acc in lAccessions: | |
132 dTErefseq2Length[ acc ] = refseqA.getSeqLengthFromAccession( acc ) | |
133 return dTErefseq2Length | |
134 | |
135 | |
136 def start( self ): | |
137 self.checkAttributes() | |
138 if self._verbose > 0: | |
139 print "START CorrelateTEageWithGCcontent.py" | |
140 sys.stdout.flush() | |
141 if os.path.exists( self._inputCoord ): | |
142 self._db = DbMySql( cfgFileName=self._configFile ) | |
143 self._db.createTable( self._tableCoord, "path", self._inputCoord, True ) | |
144 self._pathA = TablePathAdaptator( self._db, self._tableCoord ) | |
145 if os.path.exists( self._inputGenome ): | |
146 self._db.createTable( self._tableGenome, "seq", self._inputGenome, True ) | |
147 self._seqA = TableSeqAdaptator( self._db, self._tableGenome ) | |
148 if self._verbose > 0: | |
149 print "output fasta file: %s" % self._outFile | |
150 | |
151 | |
152 def end( self ): | |
153 if os.path.exists( self._inputCoord ): | |
154 self._db.dropTable( self._tableCoord ) | |
155 if os.path.exists( self._inputGenome ): | |
156 self._db.dropTable( self._tableGenome ) | |
157 self._db.close() | |
158 if self._verbose > 0: | |
159 print "END CorrelateTEageWithGCcontent.py" | |
160 sys.stdout.flush() | |
161 | |
162 | |
163 def run( self ): | |
164 self.start() | |
165 | |
166 dTErefseq2Length = self.getLengthOfTErefseqs() | |
167 | |
168 outFileHandler = open( self._outFile, "w" ) | |
169 outFileHandler.write( "copy\tTE\tchr\tlength\tid\tGC\tlengthPerc\n" ) | |
170 | |
171 lIdentifiers = self._pathA.getIdList() | |
172 nbTEcopies = len(lIdentifiers) | |
173 if self._verbose > 0: | |
174 print "nb of TE copies: %i" % ( nbTEcopies ) | |
175 sys.stdout.flush() | |
176 count = 0 | |
177 power10 = int( math.floor( math.log10( nbTEcopies ) ) ) - 1 | |
178 for id in lIdentifiers: | |
179 count += 1 | |
180 if self._verbose > 0 and power10 > 0 and count % math.pow(10,power10) == 0: | |
181 print "%s / %s" % ( str(count).zfill(power10+2), str(nbTEcopies).zfill(power10+2) ) | |
182 sys.stdout.flush() | |
183 lPaths = self._pathA.getPathListFromId( id ) | |
184 lSets = PathUtils.getSetListFromQueries( lPaths ) | |
185 lMergedSets = SetUtils.mergeSetsInList( lSets ) | |
186 bs = self._seqA.getBioseqFromSetList( lMergedSets ) | |
187 data = "%i" % id | |
188 data += "\t%s" % ( bs.header.split("::")[0] ) | |
189 data += "\t%s" % ( lPaths[0].getQueryName() ) | |
190 data += "\t%i" % ( bs.getLength() ) | |
191 data += "\t%.2f" % ( PathUtils.getIdentityFromPathList( lPaths ) ) | |
192 data += "\t%.2f" % ( bs.getGCpercentage() ) | |
193 data += "\t%.2f" % ( 100 * bs.getLength() / float( dTErefseq2Length[ bs.header.split("::")[0] ] ) ) | |
194 outFileHandler.write( "%s\n" % data ) | |
195 | |
196 outFileHandler.close() | |
197 | |
198 self.end() | |
199 | |
200 | |
201 if __name__ == "__main__": | |
202 i = CorrelateTEageWithGCcontent() | |
203 i.setAttributesFromCmdLine() | |
204 i.run() |