#!/usr/bin/env python

import getopt
import os
import subprocess
import sys

from commons.core.coord.AlignUtils import AlignUtils
from commons.core.coord.MapUtils import MapUtils
from commons.core.coord.PathUtils import PathUtils
from commons.core.seq.FastaUtils import FastaUtils
from commons.core.sql.DbMySql import DbMySql


class SpliceTEsFromGenome(object):
    """Command-line program splicing TE copies out of a genome fasta file.

    TE coordinates are given either as a file or as a database table, in
    'map', 'align' or 'path' format. They are converted to a 'map' file if
    needed, merged, and then handed to FastaUtils.spliceFromCoords together
    with the genome file to produce the output fasta file.
    """

    def __init__(self):
        """Initialize every option to its 'not set' value."""
        self._inputData = ""    # -i: file name or table name
        self._formatData = ""   # -f: 'map', 'align' or 'path'
        self._genomeFile = ""   # -g: genome fasta file
        self._configFile = ""   # -C: configuration file (table input only)
        self._outFile = ""      # -o: output fasta file
        self._verbose = 0       # -v: verbosity level
        self._db = None         # set by checkAttributes() when input is a table

    def help(self):
        """Print the usage message on the standard output."""
        # print("") rather than a bare print so that the blank line is
        # emitted under both Python 2 and Python 3.
        print("")
        print("usage: SpliceTEsFromGenome.py [ options ]")
        print("options:")
        print(" -h: this help")
        print(" -i: input TE coordinates (can be file or table)")
        print(" TEs as subjects if align or path format")
        print(" -f: format of the data (map/align/path)")
        print(" -g: genome file (format=fasta)")
        print(" -C: configuration file (if table as input)")
        print(" -o: output fasta file (default=genomeFile+'.splice')")
        print(" -v: verbosity level (default=0/1)")
        print("")

    def _exitWithError(self, msg, showHelp=True):
        """Write 'msg' on stderr, optionally print the help, exit with status 1."""
        sys.stderr.write("%s\n" % msg)
        if showHelp:
            self.help()
        sys.exit(1)

    def setAttributesFromCmdLine(self):
        """Fill the attributes from the options found in sys.argv.

        Exits with status 1 (after printing the help) on an unknown option
        or on a non-integer verbosity level, and with status 0 on '-h'.
        """
        try:
            opts, args = getopt.getopt(sys.argv[1:], "hi:f:g:C:o:v:")
        except getopt.GetoptError as err:
            self._exitWithError("%s" % str(err))
        for o, a in opts:
            if o == "-h":
                self.help()
                sys.exit(0)
            elif o == "-i":
                self._inputData = a
            elif o == "-f":
                self._formatData = a
            elif o == "-g":
                self._genomeFile = a
            elif o == "-C":
                self._configFile = a
            elif o == "-o":
                self._outFile = a
            elif o == "-v":
                try:
                    self._verbose = int(a)
                except ValueError:
                    # Report a usage error instead of an uncaught traceback.
                    self._exitWithError(
                        "ERROR: verbosity level (-v) should be an integer, not '%s'" % a)

    def checkAttributes(self):
        """Validate the attributes; exit with status 1 on the first problem.

        When the input data is not an existing file, it is looked up as a
        table through the configuration file and 'self._db' is set. When no
        output file was given, it defaults to '<genomeFile>.splice'.
        """
        if self._inputData == "":
            self._exitWithError("ERROR: missing input data (-i)")

        if not os.path.exists(self._inputData):
            # Not a file: the input has to be a table, which requires a
            # configuration file to reach the database.
            if not os.path.exists(self._configFile):
                self._exitWithError(
                    "ERROR: neither input file '%s' nor configuration file '%s'"
                    % (self._inputData, self._configFile))
            self._db = DbMySql(cfgFileName=self._configFile)
            if not self._db.doesTableExist(self._inputData):
                self._exitWithError(
                    "ERROR: can't find table '%s'" % (self._inputData))

        if self._formatData == "":
            self._exitWithError("ERROR: need to precise format (-f)")
        if self._formatData not in ("map", "align", "path"):
            self._exitWithError(
                "ERROR: format '%s' not yet supported" % (self._formatData))

        if self._genomeFile == "":
            self._exitWithError("ERROR: missing genome file (-g)")
        if not os.path.exists(self._genomeFile):
            self._exitWithError(
                "ERROR: can't find genome file '%s'" % (self._genomeFile))

        if self._outFile == "":
            self._outFile = "%s.splice" % (self._genomeFile)
            if self._verbose > 0:
                print("output fasta file: %s" % self._outFile)

    def getCoordsAsMapFile(self):
        """Return the name of a 'map' file holding the TE coordinates.

        When the input is a table (self._db is set), it is first exported by
        'srptExportTable.py' to '<inputData>.<formatData>', and
        'self._inputData' is updated to that file name. 'align' and 'path'
        data are then converted into '<inputData>.map'; 'map' data are
        returned as they are.
        """
        if self._verbose > 0:
            print("get TE coordinates as 'Map' file")
            sys.stdout.flush()

        if self._db is not None:
            exportedFile = "%s.%s" % (self._inputData, self._formatData)
            # Argument list instead of a shell string: table and file names
            # come from the command line and must not be interpreted by a
            # shell (spaces, quotes, ';' ...).
            cmd = ["srptExportTable.py",
                   "-i", self._inputData,
                   "-C", self._configFile,
                   "-o", exportedFile]
            try:
                returnStatus = subprocess.call(cmd)
            except OSError:
                # The export script could not be launched at all.
                returnStatus = 1
            if returnStatus != 0:
                self._exitWithError("ERROR while exporting data from table",
                                    showHelp=False)
            self._inputData = exportedFile

        if self._formatData == "map":
            return self._inputData
        if self._formatData == "align":
            mapFile = "%s.map" % (self._inputData)
            AlignUtils.convertAlignFileIntoMapFileWithSubjectsOnQueries(
                self._inputData, mapFile)
            return mapFile
        if self._formatData == "path":
            mapFile = "%s.map" % (self._inputData)
            PathUtils.convertPathFileIntoMapFileWithSubjectsOnQueries(
                self._inputData, mapFile)
            return mapFile
        # Any other format yields None, as before; checkAttributes() rejects
        # such formats beforehand.
        return None

    def mergeCoordsInMapFile(self, mapFile):
        """Merge the coordinates of 'mapFile' into '<mapFile>.merge'.

        'mapFile' is removed afterwards unless it is the user's own input
        file, i.e. unless the format is 'map' and the data did not come from
        a table. Returns the name of the merged file.
        """
        if self._verbose > 0:
            print("merge TE coordinates")
            sys.stdout.flush()
        mergeFile = "%s.merge" % (mapFile)
        MapUtils.mergeCoordsInFile(mapFile, mergeFile)
        if self._formatData != "map" or self._db is not None:
            os.remove(mapFile)
        return mergeFile

    def spliceFastaFromCoords(self, mergeFile):
        """Splice the genome according to 'mergeFile', then remove 'mergeFile'.

        The result is written by FastaUtils.spliceFromCoords into
        'self._outFile'.
        """
        if self._verbose > 0:
            print("splice TE copies from the genome")
            sys.stdout.flush()
        FastaUtils.spliceFromCoords(self._genomeFile,
                                    mergeFile,
                                    self._outFile)
        os.remove(mergeFile)

    def start(self):
        """Check the attributes and announce the start when verbose."""
        self.checkAttributes()
        if self._verbose > 0:
            print("START SpliceTEsFromGenome.py")
            sys.stdout.flush()

    def end(self):
        """Close the database connection, if any, and announce the end when verbose."""
        if self._db is not None:
            self._db.close()
        if self._verbose > 0:
            print("END SpliceTEsFromGenome.py")
            sys.stdout.flush()

    def run(self):
        """Run the whole program: check, get coordinates, merge, splice."""
        self.start()

        mapFile = self.getCoordsAsMapFile()

        mergeFile = self.mergeCoordsInMapFile(mapFile)

        self.spliceFastaFromCoords(mergeFile)

        self.end()


if __name__ == "__main__":
    # Script entry point: read the options from the command line, then run.
    program = SpliceTEsFromGenome()
    program.setAttributesFromCmdLine()
    program.run()