annotate commons/pyRepetUnit/blastnForClassifierStep1/RepbaseBLRnForClassifierStep1.py @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
1 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
2 Launch Blaster and then Matcher to compare the input sequences with known TEs via blastn and record the results into a MySQL table.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
3 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
4
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
5 import os
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
6 import ConfigParser
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
7 from commons.core.utils.FileUtils import FileUtils
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
8 from commons.core.LoggerFactory import LoggerFactory
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
9
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
10 LOG_DEPTH = "repet.tools"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
11
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
class RepbaseBLRnForClassifierStep1( object ):

    """
    Launch Blaster and then Matcher to compare the input sequences with known TEs via blastn and record the results into a MySQL table.

    The TE bank name is read from option "TE_nucl_bank" of section
    "detect_features" of the configuration file, and a logger is created
    internally from LOG_DEPTH and the class name.

    @param inFileName: name of the input fasta file
    @type inFileName: string

    @param launch_1: generic command at the beginning of a specific command
    @type launch_1: string

    @param launch_2: generic command at the end of a specific command
    @type launch_2: string

    @param cDir: current directory (where to retrieve the result files)
    @type cDir: string

    @param tmpDir: temporary directory (where the job will run)
    @type tmpDir: string

    @param configFileName: configuration file name
    @type configFileName: string

    @param verbose: verbose(0/1/2)
    @type verbose: int

    @param pL: program launcher
    @type pL: programLauncher Instance

    @param project: project name
    @type project: string

    """

    def __init__(self, inFileName, launch_1, launch_2, cDir, tmpDir, configFileName, verbose, pL, project):
        """
        Constructor

        Stores the arguments, reads the configuration file and creates the logger.
        """
        self._inFileName = inFileName
        self._launch_1 = launch_1
        self._launch_2 = launch_2
        self._cDir = cDir
        self._tmpDir = tmpDir
        self._verbose = verbose
        self._pL = pL
        self._project = project
        self._fileUtils = FileUtils()
        self._config = ConfigParser.ConfigParser()
        self._configFileName = configFileName
        # Close the configuration file as soon as it has been parsed
        # instead of leaving the handle to the garbage collector.
        with open(self._configFileName) as configFile:
            self._config.readfp( configFile )
        self._bank = self._config.get("detect_features","TE_nucl_bank")
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbose)

    def _blastnOptions( self ):
        """
        Return the blaster options shared by the bank preparation and the search.

        " -W" is inserted only when option "wublast" of section
        "detect_features" equals "yes".

        @return: option string starting with a space
        @rtype: string
        """
        options = " -n blastn"
        if self._config.get("detect_features","wublast") == "yes":
            options += " -W"
        options += " -r"
        return options

    @staticmethod
    def _moveIfAbsentSnippet( fileName, destDir ):
        """
        Return Python statements moving 'fileName' into 'destDir' unless it is already there.
        """
        snippet = "if not os.path.exists( \"%s/%s\" ):\n" % ( destDir, fileName )
        snippet += "\tos.system( \"mv %s %s\" )\n" % ( fileName, destDir )
        return snippet

    @staticmethod
    def _removeIfPresentSnippet( fileName ):
        """
        Return Python statements removing 'fileName' if it exists.
        """
        snippet = "if os.path.exists( \"%s\" ):\n" % ( fileName, )
        snippet += "\tos.remove( \"%s\" )\n" % ( fileName, )
        return snippet

    def _projectPathFile( self, bank ):
        """
        Return the name of the project-wide 'path' file for the given bank.
        """
        return "%s_BLRn_%s.align.clean_match.path" % ( self._project, bank )

    def formatRepbase_ntIfNecessary( self ):
        """
        Format Repbase (make 'cut' files).

        Nothing is done when "<bank>_cut" already exists. Otherwise blaster
        is launched on the bank through the program launcher, then the
        "<bank>-blastn-*.param" files are deleted.
        """
        if os.path.exists( "%s_cut" % ( self._bank ) ):
            return
        self._log.debug("prepare bank '%s'..." % ( self._bank ))
        prg = os.environ["REPET_PATH"] + "/bin/blaster"
        cmd = prg
        cmd += " -s %s" % ( self._bank )
        cmd += self._blastnOptions()
        cmd += " -P"
        self._pL.launch( prg, cmd )
        os.system( "rm -f %s-blastn-*.param" % ( self._bank ) )

    def createCmdToLaunch( self ):
        """
        Build the text of the job comparing the input file with the bank.

        The text is made of, in this order: the blaster command line
        (wrapped between launch_1 and launch_2), Python statements moving or
        deleting the blaster output files, the matcher command line (wrapped
        the same way), Python statements moving or deleting the matcher
        output files and, when the temporary directory differs from the
        current one, statements deleting the local copy of the bank.

        @return: all the commands to run the job
        @rtype: string
        """
        repetBin = os.environ["REPET_PATH"] + "/bin/"
        subject = "%s/%s" % ( self._cDir, self._bank )
        blasterPrefix = "%s_BLRn_%s" % ( self._inFileName, self._bank )
        alignFile = "%s.align" % ( blasterPrefix, )
        matchPrefix = "%s.clean_match" % ( alignFile, )

        # --- blaster ---
        cmd = self._launch_1 + repetBin + "blaster"
        cmd += " -q %s" % ( self._inFileName )
        cmd += " -s %s" % ( subject, )
        cmd += " -B %s" % ( blasterPrefix, )
        cmd += self._blastnOptions()
        cmd += " -v 1"
        cmd += self._launch_2

        cmd += self._moveIfAbsentSnippet( "%s.param" % ( blasterPrefix, ), self._cDir )
        # The 'cut' files are several (hence the shell wildcard), so they
        # are deleted via 'rm -f' rather than os.remove.
        cmd += "if os.path.exists( \"%s_cut\" ):\n" % ( self._inFileName )
        cmd += "\tos.system( \"rm -f %s_cut*\" )\n" % ( self._inFileName )
        cmd += self._removeIfPresentSnippet( "%s.Nstretch.map" % ( self._inFileName ) )
        for suffix in ( "raw", "seq_treated" ):
            cmd += self._removeIfPresentSnippet( "%s.%s" % ( blasterPrefix, suffix ) )

        # --- matcher ---
        cmd += self._launch_1
        cmd += repetBin + "matcher"
        cmd += " -m %s" % ( alignFile, )
        cmd += " -q %s" % ( self._inFileName )
        cmd += " -s %s" % ( subject, )
        cmd += " -j"
        cmd += " -v 1"
        cmd += self._launch_2

        for suffix in ( "path", "param" ):
            cmd += self._moveIfAbsentSnippet( "%s.%s" % ( matchPrefix, suffix ), self._cDir )
        cmd += self._removeIfPresentSnippet( alignFile )
        for suffix in ( "fa", "map", "tab" ):
            cmd += self._removeIfPresentSnippet( "%s.%s" % ( matchPrefix, suffix ) )

        if self._tmpDir != self._cDir:
            cmd += self._removeIfPresentSnippet( self._bank )

        return cmd

    def collectRepbaseBLRn( self ):
        """
        Concatenate the outputs of blastn, adapt the ID and load the results into a table.
        """
        # Only the file name of the bank appears in the result file names.
        bank = os.path.basename( self._bank )
        self._concatPathFile(bank)
        self._adaptIDInPathFile(bank)
        self._loadPathFileInTable(bank)
        self._findAndRemoveUselessFiles(bank)

    def _concatPathFile(self, bank):
        """
        Concatenate the per-batch 'path' files of the parent directory into "<project path file>.tmp".
        """
        FileUtils.catFilesByPattern("../batch_*.fa_BLRn_%s.align.clean_match.path" % bank,
                                    "%s.tmp" % self._projectPathFile(bank))

    def _adaptIDInPathFile(self, bank):
        """
        Launch pathnum2id to turn "<project path file>.tmp" into the project path file.

        The program "pathnum2id" is used when present in $REPET_PATH/bin
        (with a verbosity of 'verbose - 1'), otherwise "pathnum2id.py" is
        used without any verbosity option.
        """
        binary = os.environ["REPET_PATH"] + "/bin/pathnum2id"
        hasBinary = os.path.exists(binary)
        if hasBinary:
            prg = binary
        else:
            prg = binary + ".py"
        pathFile = self._projectPathFile(bank)
        cmd = prg
        cmd += " -i %s.tmp" % (pathFile,)
        cmd += " -o %s" % (pathFile,)
        if hasBinary:
            cmd += " -v %i" % (self._verbose - 1)
        self._pL.launch(prg, cmd)

    def _loadPathFileInTable(self, bank):
        """
        Launch srptCreateTable.py to load the project path file into table "<project>_TE_BLRn_path".
        """
        prg = os.environ["REPET_PATH"] + "/bin/srptCreateTable.py"
        cmd = prg
        cmd += " -f %s" % (self._projectPathFile(bank),)
        cmd += " -n %s_TE_BLRn_path" % (self._project)
        cmd += " -t path"
        cmd += " -c ../%s" % (self._configFileName)
        self._pL.launch(prg, cmd)

    def _findAndRemoveUselessFiles(self, bank):
        """
        Delete the per-batch result files of the parent directory and the temporary concatenated path file.
        """
        prg = "find"
        cmd = prg
        # "\\;" yields the backslash-semicolon expected by 'find -exec'
        # without relying on an invalid string escape sequence.
        cmd += " .. -name \"batch_*.fa_BLRn_%s.*\" -exec rm {} \\;" % (bank)
        self._pL.launch(prg, cmd)
        prg = "rm"
        cmd = prg
        cmd += " %s.tmp" % (self._projectPathFile(bank),)
        self._pL.launch(prg, cmd)