18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import re
|
|
4 import getopt
|
|
5 import sys
|
|
6 from commons.pyRepetUnit.profilesDB.Profiles import Profiles
|
|
7 from commons.core.LoggerFactory import LoggerFactory
|
|
8
|
|
9 LOG_DEPTH = "commons.pyRepetUnit.profiles"
|
|
10
|
|
11 ## Format a profiles DB for pipelines in REPET
|
|
12 #
|
|
13 class ProfilesDB4Repet( object ):
|
|
14
|
|
15 def __init__(self):
|
|
16 self.profile = Profiles()
|
|
17 self._inputFile = ""
|
|
18 self._outputFile = ""
|
|
19 self._verbosity = 2
|
|
20 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
|
|
21
|
|
22
|
|
23 def _help( self ):
|
|
24 print
|
|
25 print "usage: %s.py [ options ]" % ( type(self).__name__ )
|
|
26 print "options:"
|
|
27 print " -h: this help"
|
|
28 print " -i: name of the profiles DB to format for Repet"
|
|
29 print " -o: name of the output profiles DB for Repet"
|
|
30 print
|
|
31
|
|
32
|
|
33 def _setAttributesFromCmdLine( self ):
|
|
34 try:
|
|
35 opts, args = getopt.getopt(sys.argv[1:],"hi:o:")
|
|
36 except getopt.GetoptError, err:
|
|
37 print str(err); self._help(); sys.exit(1)
|
|
38 for o,a in opts:
|
|
39 if o == "-h":
|
|
40 self._help(); sys.exit(0)
|
|
41 elif o == "-i":
|
|
42 self.setInputFile( a )
|
|
43 elif o == "-o":
|
|
44 self.setOutputFile( a )
|
|
45
|
|
46 #TDOD: add nb of each domain in log file, verbose...
|
|
47 def _searchCurrentDomain(self, profile):
|
|
48 currentDomain = ""
|
|
49 #TODO: pattern GAGA and GAGE should be excluded !
|
|
50 #TODO: add new tag like "ORF1_LTR" for ATHILA as key word in Pfam
|
|
51 #TODO: add new tags from GypsyDB (MOV etc...)
|
|
52 if (re.search("[gG][aA][Gg]", profile[1]) or re.search("[gG][aA][Gg]", profile[3])):
|
|
53 currentDomain = "GAG"
|
|
54 elif (re.search("Zinc knuckle", profile[1]) or re.search("Zinc knuckle", profile[3])):
|
|
55 currentDomain = "GAG"
|
|
56 elif (re.search("PF02813", profile[2]) or re.search("PF01021", profile[2])):
|
|
57 currentDomain = "GAG"
|
|
58 elif (re.search("GAG_", profile[1])):
|
|
59 currentDomain = "GAG"
|
|
60 elif (re.search("GAGCOAT_", profile[1])):
|
|
61 currentDomain = "GAG"
|
|
62
|
|
63 elif ((re.search("[aA]spartic", profile[1]) or re.search("[aA]aspartic", profile[3])) and (re.search("[pP]roteinase", profile[1]) or re.search("[pP]roteinase", profile[3]))):
|
|
64 currentDomain = "AP"
|
|
65 elif ((re.search("[aA]spartic", profile[1]) or re.search("[aA]spartic", profile[3])) and (re.search("[pP]rotease", profile[1]) or re.search("[pP]rotease", profile[3]))):
|
|
66 currentDomain = "AP"
|
|
67 elif ((re.search("[rR]etrotransposon", profile[1]) or re.search("[rR]etrotransposon", profile[3])) and (re.search("[pP]eptidase", profile[1]) or re.search("[pP]eptidase", profile[3]))):
|
|
68 currentDomain = "AP"
|
|
69 elif ((re.search("[aA]spartic", profile[1]) or re.search("[aA]spartic", profile[3])) and (re.search("[pP]eptidase", profile[1]) or re.search("[pP]eptidase", profile[3]))):
|
|
70 currentDomain = "AP"
|
|
71 elif ((re.search("[aA]spartic", profile[1]) or re.search("[aA]spartic", profile[3])) and (re.search("[eE]ndopeptidase", profile[1]) or re.search("[eE]ndopeptidase", profile[3]))):
|
|
72 currentDomain = "AP"
|
|
73 elif ((re.search("[aA]spartyl", profile[1]) or re.search("[aA]spartyl", profile[3])) and (re.search("[pP]roteinase", profile[1]) or re.search("[pP]roteinase", profile[3]))):
|
|
74 currentDomain = "AP"
|
|
75 elif ((re.search("[aA]spartyl", profile[1]) or re.search("[aA]spartyl", profile[3])) and (re.search("[pP]rotease", profile[1]) or re.search("[pP]rotease", profile[3]))):
|
|
76 currentDomain = "AP"
|
|
77 elif ((re.search("[aA]spartyl", profile[1]) or re.search("[aA]spartyl", profile[3])) and (re.search("[pP]eptidase", profile[1]) or re.search("[pP]eptidase", profile[3]))):
|
|
78 currentDomain = "AP"
|
|
79 elif ((re.search("[aA]spartyl", profile[1]) or re.search("[aA]spartyl", profile[3])) and (re.search("[eE]ndopeptidase", profile[1]) or re.search("[eE]ndopeptidase", profile[3]))):
|
|
80 currentDomain = "AP"
|
|
81 elif ((re.search("AP_", profile[1])) and not (re.search("[eE]ndonuclease", profile[3]))):
|
|
82 currentDomain = "AP"
|
|
83
|
|
84 elif ((re.search("[iI]ntegrase", profile[1]) or re.search("[iI]ntegrase", profile[3])) and not (re.search(" C ", profile[1])) and not (re.search(" C ", profile[3]))):
|
|
85 currentDomain = "INT"
|
|
86 elif (re.search(".*[Cc]hromo.*", profile[1]) or re.search(".*[Cc]hromo.*", profile[3])):
|
|
87 currentDomain = "INT"
|
|
88 elif (re.search("INT_", profile[1])):
|
|
89 currentDomain = "INT"
|
|
90
|
|
91
|
|
92 elif ((re.search("[rR]everse", profile[1]) or re.search("[Rr]everse", profile[3])) and (re.search("[tT]ranscriptase", profile[1]) or re.search("[tT]ranscriptase", profile[3]))):
|
|
93 currentDomain = "RT"
|
|
94 elif (re.search("RT_", profile[1])):
|
|
95 currentDomain = "RT"
|
|
96
|
|
97
|
|
98 elif ((re.search("R[nN]ase", profile[1]) or re.search("R[nN]ase", profile[3])) and (re.search("H", profile[1]) or re.search("H", profile[3]))):
|
|
99 currentDomain = "RH"
|
|
100 elif ((re.search("R[nN]ase", profile[1]) or re.search("R[nN]ase", profile[3])) and (re.search("H ", profile[1]) or re.search("H ", profile[3]))):
|
|
101 currentDomain = "RH"
|
|
102 elif (re.search("RNaseH_", profile[1])):
|
|
103 currentDomain = "RH"
|
|
104
|
|
105
|
|
106 elif ((re.search("[eE]nvelope", profile[1]) or re.search("[eE]nvelope", profile[3])) and (re.search("[pP]rotein", profile[1]) or re.search("[pP]rotein", profile[3]))):
|
|
107 currentDomain = "ENV"
|
|
108 elif ((re.search("ENV", profile[1]) or re.search("ENV", profile[3])) and (re.search("[pP]olyprotein", profile[1]) or re.search("[pP]olyprotein", profile[3]))):
|
|
109 currentDomain = "ENV"
|
|
110 elif ((re.search("[eE]nvelope", profile[1]) or re.search("[eE]nvelope", profile[3])) and (re.search("[pP]olyprotein", profile[1]) or re.search("[pP]olyprotein", profile[3]))):
|
|
111 currentDomain = "ENV"
|
|
112 elif ((re.search("[eE]nvelope", profile[1]) or re.search("[eE]nvelope", profile[3])) and (re.search("[gG]lycoprotein", profile[1]) or re.search("[gG]lycoprotein", profile[3]))):
|
|
113 currentDomain = "ENV"
|
|
114 elif (re.search("PF07253", profile[2]) or re.search("PF03811", profile[2])):
|
|
115 currentDomain = "ENV"
|
|
116 elif (re.search("ENV_", profile[1])):
|
|
117 currentDomain = "ENV"
|
|
118
|
|
119 elif ((re.search("[tT]yrosine", profile[1]) or re.search("[tT]yrosine", profile[3])) and (re.search("[rR]ecombinase", profile[1]) or re.search("[rR]ecombinase", profile[3]))):
|
|
120 currentDomain = "YR"
|
|
121
|
|
122 elif ((re.search("[eE]ndonuclease", profile[1]) or re.search("[eE]ndonuclease", profile[3])) and not (re.search("AP ", profile[1]) or re.search("AP ", profile[3])) and not (re.search("[aA]purinic", profile[1]) or re.search("[aA]purinic", profile[3]))):
|
|
123 currentDomain = "EN"
|
|
124
|
|
125 elif ((re.search("[eE]ndonuclease", profile[1]) or re.search("[eE]ndonuclease", profile[3])) and (re.search("AP ", profile[1]) or re.search("AP ", profile[3]))):
|
|
126 currentDomain = "APE"
|
|
127 elif ((re.search("[eE]ndonuclease", profile[1]) or re.search("[eE]ndonuclease", profile[3])) and (re.search("[aA]purinic", profile[1]) or re.search("[aA]purinic", profile[3]))):
|
|
128 currentDomain = "APE"
|
|
129
|
|
130 elif ((re.search("[tT]ransposase", profile[1]) or re.search("[tT]ransposase", profile[3])) and not (re.search("DDE ", profile[1]) or re.search("DDE ", profile[3]))):
|
|
131 currentDomain = "Tase"
|
|
132 elif (re.search("DUF659", profile[1])):
|
|
133 currentDomain = "Tase"
|
|
134 elif (re.search("PF01695", profile[2])) or (re.search("PF02316", profile[2])) or (re.search("PF09039", profile[2])) or (re.search("PF04827", profile[2])) or (re.search("PF05699", profile[2])):
|
|
135 currentDomain = "Tase"
|
|
136
|
|
137 elif ((re.search("[tT]ransposase", profile[1]) or re.search("[tT]ransposase", profile[3])) and (re.search("DDE ", profile[1]) or re.search("DDE ", profile[3]))):
|
|
138 currentDomain = "Tase*"
|
|
139
|
|
140 elif ((re.search("[rR]eplication", profile[1]) or re.search("[rR]eplication", profile[3])) and (re.search("[pP]rotein", profile[1]) or re.search("[pP]rotein", profile[3])) and (re.search("A ", profile[1]) or re.search("A ", profile[3]))):
|
|
141 currentDomain = "RPA"
|
|
142 elif (re.search("[rR]epA ", profile[1]) or re.search("[rR]epA ", profile[3])):
|
|
143 currentDomain = "RPA"
|
|
144 elif (re.search("RPA", profile[1]) or re.search("RPA", profile[3])):
|
|
145 currentDomain = "RPA"
|
|
146
|
|
147 elif (re.search("[cC]-integrase", profile[1]) or re.search("[cC]-integrase", profile[3])):
|
|
148 currentDomain = "C-INT"
|
|
149
|
|
150 elif ((re.search("[pP]ackaging", profile[1]) or re.search("[pP]ackaging", profile[3])) and (re.search("ATPase", profile[1]) or re.search("ATPase", profile[3]))):
|
|
151 currentDomain = "ATP"
|
|
152
|
|
153 elif ((re.search("[cC]ysteine", profile[1]) or re.search("[cC]ysteine", profile[3])) and (re.search("[pP]rotease", profile[1]) or re.search("[pP]rotease", profile[3]))):
|
|
154 currentDomain = "CYP"
|
|
155 elif ((re.search("[cC]ysteine", profile[1]) or re.search("[cC]ysteine", profile[3])) and (re.search("[pP]eptidase", profile[1]) or re.search("[pP]eptidase", profile[3]))):
|
|
156 currentDomain = "CYP"
|
|
157 elif (re.search("[pP]eptidase_C", profile[1]) or re.search("[pP]eptidase_C", profile[3])):
|
|
158 currentDomain = "CYP"
|
|
159 elif (re.search("PF00559", profile[2])):
|
|
160 currentDomain = "CYP"
|
|
161
|
|
162 elif (re.search("[pP]ol\S*_*B", profile[1]) or re.search("[pP]ol\S*_*B", profile[3])):
|
|
163 currentDomain = "POLB"
|
|
164 elif ((re.search("[pP]olymerase", profile[1]) or re.search("[pP]olymerase", profile[3])) and (re.search("B ", profile[1]) or re.search("B ", profile[3]))):
|
|
165 currentDomain = "POLB"
|
|
166
|
|
167 elif (re.search("[hH]elicase", profile[1]) or re.search("[hH]elicase", profile[3])):
|
|
168 currentDomain = "HEL"
|
|
169
|
|
170 else :
|
|
171 currentDomain = "OTHER"
|
|
172 return currentDomain
|
|
173
|
|
174
|
|
175 ## Replace the old profile name by accession number, name, domain and gather cut off
|
|
176 #
|
|
177 # @param fout file handle
|
|
178 # @param profile Profiles instance
|
|
179 # @param currentDomain string
|
|
180 #
|
|
181 def _writeModifiedProfile(self, fout, profile, currentDomain):
|
|
182 for i in xrange(0, len(profile), 1):
|
|
183 if i != 1:
|
|
184 fout.write(profile[i])
|
|
185 else:
|
|
186 fout.write("NAME " + self.profile.accNumber + "_"\
|
|
187 + self.profile.name + "_"\
|
|
188 + currentDomain + "_"\
|
|
189 + str(self.profile.GA_cut_off) + "\n")
|
|
190
|
|
191
|
|
192 ## Set input file name
|
|
193 #
|
|
194 # @param inputFileName string
|
|
195 #
|
|
196 def setInputFile(self, inputFileName):
|
|
197 self._inputFile = inputFileName
|
|
198
|
|
199
|
|
200 ## Set output file name
|
|
201 #
|
|
202 # @param outputFileName string
|
|
203 #
|
|
204 def setOutputFile(self, outputFileName):
|
|
205 self._outputFile = outputFileName
|
|
206
|
|
207
|
|
208 ## Read a profiles DB file, parse it and, write a new profiles DB with TE domain information and GA score cut_off placed side by side of the name
|
|
209 #
|
|
210 def run( self ):
|
|
211 LoggerFactory.setLevel(self._log, self._verbosity)
|
|
212 fileIn = open( self._inputFile )
|
|
213 fout = open( self._outputFile, "w" )
|
|
214 profile = self.profile.readAndRetrieve( fileIn )
|
|
215 while profile != None:
|
|
216 currentDomain = self._searchCurrentDomain(profile)
|
|
217 if currentDomain == "OTHER":
|
|
218 self._log.warning(self.profile.accNumber + " " + self.profile.name + " has no associated domain")
|
|
219 self._writeModifiedProfile(fout, profile, currentDomain)
|
|
220 profile = self.profile.read( fileIn )
|
|
221
|
|
222
|
|
223 if __name__ == "__main__":
|
|
224 i = ProfilesDB4Repet()
|
|
225 i._setAttributesFromCmdLine()
|
|
226 i.run()
|