comparison commons/tools/SpliceTEsFromGenome.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
comparison
equal deleted inserted replaced
17:b0e8584489e6 18:94ab73e8a190
1 #!/usr/bin/env python
2
3 import sys
4 import os
5 import getopt
6
7 from commons.core.sql.DbMySql import DbMySql
8 from commons.core.seq.FastaUtils import FastaUtils
9 from commons.core.coord.MapUtils import MapUtils
10 from commons.core.coord.AlignUtils import AlignUtils
11 from commons.core.coord.PathUtils import PathUtils
12
13
class SpliceTEsFromGenome(object):
    """Command-line driver that splices TE copies out of a genome fasta file.

    The TE coordinates are read either from a file or, when no file of that
    name exists, from a database table reached through the configuration
    file. Coordinates in 'align' or 'path' format are first converted to
    'map' format, then merged, and finally handed to
    ``FastaUtils.spliceFromCoords`` together with the genome file.

    The code is written so that it runs on Python 2.6+ as well as Python 3.
    """

    def __init__(self):
        """Initialise every option to its "not set" value."""
        self._inputData = ""    # -i: coordinates file name or table name
        self._formatData = ""   # -f: 'map', 'align' or 'path'
        self._genomeFile = ""   # -g: genome fasta file
        self._configFile = ""   # -C: configuration file (table input)
        self._outFile = ""      # -o: output fasta file
        self._verbose = 0       # -v: verbosity level
        self._db = None         # DbMySql handle, created only for table input

    def help(self):
        """Print the usage message on standard output."""
        usage = [
            "",
            "usage: SpliceTEsFromGenome.py [ options ]",
            "options:",
            " -h: this help",
            " -i: input TE coordinates (can be file or table)",
            " TEs as subjects if align or path format",
            " -f: format of the data (map/align/path)",
            " -g: genome file (format=fasta)",
            " -C: configuration file (if table as input)",
            " -o: output fasta file (default=genomeFile+'.splice')",
            " -v: verbosity level (default=0/1)",
            "",
        ]
        # A single-argument print() behaves the same on Python 2 and 3.
        print("\n".join(usage))

    def _exitWithError(self, msg, showHelp=True):
        """Write *msg* to stderr, optionally print the usage, exit with status 1."""
        sys.stderr.write("%s\n" % msg)
        if showHelp:
            self.help()
        sys.exit(1)

    def _closeDb(self):
        """Close the database handle if one was opened."""
        if self._db is not None:
            self._db.close()

    def setAttributesFromCmdLine(self):
        """Fill the attributes from ``sys.argv``.

        Exits with status 1 (after printing the usage) on an unknown option,
        a missing option argument or a non-integer verbosity level, and with
        status 0 after printing the usage when ``-h`` is given.
        """
        try:
            opts, _ = getopt.getopt(sys.argv[1:], "hi:f:g:C:o:v:")
        except getopt.GetoptError as err:
            self._exitWithError(str(err))
        for opt, value in opts:
            if opt == "-h":
                self.help()
                sys.exit(0)
            elif opt == "-i":
                self._inputData = value
            elif opt == "-f":
                self._formatData = value
            elif opt == "-g":
                self._genomeFile = value
            elif opt == "-C":
                self._configFile = value
            elif opt == "-o":
                self._outFile = value
            elif opt == "-v":
                # Report a bad level like any other option error instead of
                # letting int() abort with a traceback.
                try:
                    self._verbose = int(value)
                except ValueError:
                    self._exitWithError(
                        "ERROR: verbosity level (-v) must be an integer, got '%s'" % value)

    def checkAttributes(self):
        """Validate the options and derive the defaults.

        When the input data is not an existing file it is looked up as a
        table name, which requires an existing configuration file; the
        database handle is then kept in ``self._db``. Every failed check
        writes a message to stderr, prints the usage and exits with status 1.
        """
        if self._inputData == "":
            self._exitWithError("ERROR: missing input data (-i)")
        if not os.path.exists(self._inputData):
            # Not a file: the input can only be a table, which needs the
            # configuration file to reach the database.
            if not os.path.exists(self._configFile):
                self._exitWithError(
                    "ERROR: neither input file '%s' nor configuration file '%s'"
                    % (self._inputData, self._configFile))
            self._db = DbMySql(cfgFileName=self._configFile)
            if not self._db.doesTableExist(self._inputData):
                # Do not leave the connection open behind the exit.
                self._closeDb()
                self._exitWithError(
                    "ERROR: can't find table '%s'" % (self._inputData))
        if self._formatData == "":
            self._exitWithError("ERROR: need to precise format (-f)")
        if self._formatData not in ("map", "align", "path"):
            self._exitWithError(
                "ERROR: format '%s' not yet supported" % (self._formatData))
        if self._genomeFile == "":
            self._exitWithError("ERROR: missing genome file (-g)")
        if not os.path.exists(self._genomeFile):
            self._exitWithError(
                "ERROR: can't find genome file '%s'" % (self._genomeFile))
        if self._outFile == "":
            self._outFile = "%s.splice" % (self._genomeFile)
        if self._verbose > 0:
            print("output fasta file: %s" % self._outFile)

    def getCoordsAsMapFile(self):
        """Return the name of a 'map' file holding the TE coordinates.

        For table input, the table is first exported with the external
        ``srptExportTable.py`` command to ``<table>.<format>`` and
        ``self._inputData`` is updated to that file name; a failed export
        exits with status 1. 'align' and 'path' data are converted into
        ``<input>.map``; 'map' data is returned as is.
        """
        if self._verbose > 0:
            print("get TE coordinates as 'Map' file")
            sys.stdout.flush()
        if self._db is not None:
            # Imported here so the block does not depend on a file-level import.
            import subprocess
            exportFile = "%s.%s" % (self._inputData, self._formatData)
            # An argument list (no shell) keeps names containing spaces or
            # shell metacharacters intact.
            cmd = ["srptExportTable.py",
                   "-i", self._inputData,
                   "-C", self._configFile,
                   "-o", exportFile]
            try:
                returnStatus = subprocess.call(cmd)
            except OSError:
                # Command not found / not executable: the shell used to
                # report this as a non-zero status.
                returnStatus = 1
            if returnStatus != 0:
                self._exitWithError("ERROR while exporting data from table",
                                    showHelp=False)
            self._inputData = exportFile

        if self._formatData == "map":
            return self._inputData
        elif self._formatData == "align":
            mapFile = "%s.map" % (self._inputData)
            AlignUtils.convertAlignFileIntoMapFileWithSubjectsOnQueries(
                self._inputData, mapFile)
            return mapFile
        elif self._formatData == "path":
            mapFile = "%s.map" % (self._inputData)
            PathUtils.convertPathFileIntoMapFileWithSubjectsOnQueries(
                self._inputData, mapFile)
            return mapFile

    def mergeCoordsInMapFile(self, mapFile):
        """Merge the coordinates of *mapFile* into ``<mapFile>.merge``.

        *mapFile* is deleted afterwards unless it is the user's own input
        file (format 'map' read from a file rather than from a table).
        Returns the name of the merged file.
        """
        if self._verbose > 0:
            print("merge TE coordinates")
            sys.stdout.flush()
        mergeFile = "%s.merge" % (mapFile)
        MapUtils.mergeCoordsInFile(mapFile, mergeFile)
        # NOTE(review): for table input in 'align'/'path' format the exported
        # '<table>.<format>' file is not removed here -- confirm whether that
        # is intended.
        if self._formatData != "map" or self._db is not None:
            os.remove(mapFile)
        return mergeFile

    def spliceFastaFromCoords(self, mergeFile):
        """Splice the genome using the coordinates in *mergeFile*.

        Calls ``FastaUtils.spliceFromCoords`` with the genome file, the
        merged coordinates and the output file name, then deletes
        *mergeFile*.
        """
        if self._verbose > 0:
            print("splice TE copies from the genome")
            sys.stdout.flush()
        FastaUtils.spliceFromCoords(self._genomeFile,
                                    mergeFile,
                                    self._outFile)
        os.remove(mergeFile)

    def start(self):
        """Check the options and announce the start when verbose."""
        self.checkAttributes()
        if self._verbose > 0:
            print("START SpliceTEsFromGenome.py")
            sys.stdout.flush()

    def end(self):
        """Close the database handle, if any, and announce the end when verbose."""
        self._closeDb()
        if self._verbose > 0:
            print("END SpliceTEsFromGenome.py")
            sys.stdout.flush()

    def run(self):
        """Run the whole pipeline: check, get 'map' file, merge, splice."""
        self.start()
        try:
            mapFile = self.getCoordsAsMapFile()
            mergeFile = self.mergeCoordsInMapFile(mapFile)
            self.spliceFastaFromCoords(mergeFile)
        except BaseException:
            # Release the database connection on any failure (including
            # sys.exit) without printing the normal END message, then let
            # the original error propagate.
            self._closeDb()
            raise
        self.end()
188
189
if __name__ == "__main__":
    # Script entry point: read the options from the command line, then run
    # the splicing pipeline.
    splicer = SpliceTEsFromGenome()
    splicer.setAttributesFromCmdLine()
    splicer.run()