comparison cheminfolib.py @ 15:81836d586797 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/openbabel commit d9c51279c061a1da948a2582d5b502ca7573adbf
author bgruening
date Thu, 15 Aug 2024 11:01:36 +0000
parents e94b2920d4e4
children
comparison
equal deleted inserted replaced
14:d7f0ca1f2669 15:81836d586797
9 import subprocess 9 import subprocess
10 import sys 10 import sys
11 import tempfile 11 import tempfile
12 from multiprocessing import Pool 12 from multiprocessing import Pool
13 13
14
15 try: 14 try:
16 from galaxy import eggs 15 from galaxy import eggs
17 eggs.require('psycopg2') 16
17 eggs.require("psycopg2")
18 except ImportError: 18 except ImportError:
19 psycopg2 = None 19 psycopg2 = None
20 print('psycopg2 is not available. It is currently used in the pgchem wrappers, that are not shipped with default CTB') 20 print(
21 "psycopg2 is not available. It is currently used in the pgchem wrappers, that are not shipped with default CTB"
22 )
21 23
22 try: 24 try:
23 from openbabel import openbabel, pybel 25 from openbabel import openbabel, pybel
26
24 openbabel.obErrorLog.StopLogging() 27 openbabel.obErrorLog.StopLogging()
25 except ImportError: 28 except ImportError:
26 openbabel, pybel = None, None 29 openbabel, pybel = None, None
27 print('OpenBabel could not be found. A few functions are not available without OpenBabel.') 30 print(
31 "OpenBabel could not be found. A few functions are not available without OpenBabel."
32 )
28 33
29 34
30 def CountLines(path): 35 def CountLines(path):
31 out = subprocess.Popen(['wc', '-l', path], 36 out = subprocess.Popen(
32 stdout=subprocess.PIPE, 37 ["wc", "-l", path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT
33 stderr=subprocess.STDOUT 38 ).communicate()[0]
34 ).communicate()[0] 39 return int(out.partition(b" ")[0])
35 return int(out.partition(b' ')[0])
36 40
37 41
38 def grep(pattern, file_obj): 42 def grep(pattern, file_obj):
39 grepper = re.compile(pattern) 43 grepper = re.compile(pattern)
40 for line in file_obj: 44 for line in file_obj:
47 mol = False 51 mol = False
48 possible_inchi = True 52 possible_inchi = True
49 for line_counter, line in enumerate(open(filepath)): 53 for line_counter, line in enumerate(open(filepath)):
50 if line_counter > 10000: 54 if line_counter > 10000:
51 break 55 break
52 if line.find('$$$$') != -1: 56 if line.find("$$$$") != -1:
53 return 'sdf' 57 return "sdf"
54 elif line.find('@<TRIPOS>MOLECULE') != -1: 58 elif line.find("@<TRIPOS>MOLECULE") != -1:
55 return 'mol2' 59 return "mol2"
56 elif line.find('ligand id') != -1: 60 elif line.find("ligand id") != -1:
57 return 'drf' 61 return "drf"
58 elif possible_inchi and re.findall('^InChI=', line): 62 elif possible_inchi and re.findall("^InChI=", line):
59 return 'inchi' 63 return "inchi"
60 elif re.findall(r'^M\s+END', line): 64 elif re.findall(r"^M\s+END", line):
61 mol = True 65 mol = True
62 # first line is not an InChI, so it can't be an InChI file 66 # first line is not an InChI, so it can't be an InChI file
63 possible_inchi = False 67 possible_inchi = False
64 68
65 if mol: 69 if mol:
66 # END can occures before $$$$, so and SDF file will 70 # END can occures before $$$$, so and SDF file will
67 # be recognised as mol, if you not using this hack' 71 # be recognised as mol, if you not using this hack'
68 return 'mol' 72 return "mol"
69 return 'smi' 73 return "smi"
70 74
71 75
72 def db_connect(args): 76 def db_connect(args):
73 try: 77 try:
74 db_conn = psycopg2.connect("dbname=%s user=%s host=%s password=%s" % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd)) 78 db_conn = psycopg2.connect(
79 "dbname=%s user=%s host=%s password=%s"
80 % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd)
81 )
75 return db_conn 82 return db_conn
76 except psycopg2.Error: 83 except psycopg2.Error:
77 sys.exit('Unable to connect to the db') 84 sys.exit("Unable to connect to the db")
78 85
79 86
80 ColumnNames = { 87 ColumnNames = {
81 'can_smiles': 'Canonical SMILES', 88 "can_smiles": "Canonical SMILES",
82 'can': 'Canonical SMILES', 89 "can": "Canonical SMILES",
83 'inchi': 'InChI', 90 "inchi": "InChI",
84 'inchi_key': 'InChI key', 91 "inchi_key": "InChI key",
85 'inchi_key_first': 'InChI key first', 92 "inchi_key_first": "InChI key first",
86 'inchi_key_last': 'InChI key last', 93 "inchi_key_last": "InChI key last",
87 'molwt': 'Molecular weight', 94 "molwt": "Molecular weight",
88 'hbd': 'Hydrogen-bond donors', 95 "hbd": "Hydrogen-bond donors",
89 'donors': 'Hydrogen-bond donors', 96 "donors": "Hydrogen-bond donors",
90 'hba': 'Hydrogen-bond acceptors', 97 "hba": "Hydrogen-bond acceptors",
91 'acceptors': 'Hydrogen-bond acceptors', 98 "acceptors": "Hydrogen-bond acceptors",
92 'rotbonds': 'Rotatable bonds', 99 "rotbonds": "Rotatable bonds",
93 'logp': 'logP', 100 "logp": "logP",
94 'psa': 'Polar surface area', 101 "psa": "Polar surface area",
95 'mr': 'Molecular refractivity', 102 "mr": "Molecular refractivity",
96 'atoms': 'Number of heavy atoms', 103 "atoms": "Number of heavy atoms",
97 'rings': 'Number of rings', 104 "rings": "Number of rings",
98 'set_bits': 'FP2 bits', 105 "set_bits": "FP2 bits",
99 'id': 'Internal identifier', 106 "id": "Internal identifier",
100 'tani': 'Tanimoto coefficient', 107 "tani": "Tanimoto coefficient",
101 'spectrophore': 'Spectrophores(TM)', 108 "spectrophore": "Spectrophores(TM)",
102 'dist_spectrophore': 'Spectrophores(TM) distance to target', 109 "dist_spectrophore": "Spectrophores(TM) distance to target",
103 'synonym': 'Entry id', 110 "synonym": "Entry id",
104 } 111 }
105 112
106 OBDescriptor = { 113 OBDescriptor = {
107 'atoms': ["atoms", "Number of atoms"], 114 "atoms": ["atoms", "Number of atoms"],
108 'hatoms': ["hatoms", "Number of heavy atoms"], # self defined tag hatoms in plugindefines.txt 115 "hatoms": [
109 'can_smiles': ["cansmi", "Canonical SMILES"], 116 "hatoms",
110 'can_smilesNS': ["cansmiNS", "Canonical SMILES without isotopes or stereo"], 117 "Number of heavy atoms",
118 ], # self defined tag hatoms in plugindefines.txt
119 "can_smiles": ["cansmi", "Canonical SMILES"],
120 "can_smilesNS": ["cansmiNS", "Canonical SMILES without isotopes or stereo"],
111 # ["abonds", "Number of aromatic bonds"], 121 # ["abonds", "Number of aromatic bonds"],
112 # ["bonds", "Number of bonds"], 122 # ["bonds", "Number of bonds"],
113 # ["dbonds", "Number of double bonds"], 123 # ["dbonds", "Number of double bonds"],
114 # ["formula", "Chemical formula"], 124 # ["formula", "Chemical formula"],
115 'hba': ["HBA1", "Number of Hydrogen Bond Acceptors 1 (JoelLib)"], 125 "hba": ["HBA1", "Number of Hydrogen Bond Acceptors 1 (JoelLib)"],
116 'hba2': ["HBA2", "Number of Hydrogen Bond Acceptors 2 (JoelLib)"], 126 "hba2": ["HBA2", "Number of Hydrogen Bond Acceptors 2 (JoelLib)"],
117 'hbd': ["HBD", "Number of Hydrogen Bond Donors (JoelLib)"], 127 "hbd": ["HBD", "Number of Hydrogen Bond Donors (JoelLib)"],
118 'inchi': ["InChI", "IUPAC InChI identifier"], 128 "inchi": ["InChI", "IUPAC InChI identifier"],
119 'inchi_key': ["InChIKey", "InChIKey"], 129 "inchi_key": ["InChIKey", "InChIKey"],
120 # ["L5", "Lipinski Rule of Five"], 130 # ["L5", "Lipinski Rule of Five"],
121 'logp': ["logP", "octanol/water partition coefficient"], 131 "logp": ["logP", "octanol/water partition coefficient"],
122 'mr': ["MR", "molar refractivity"], 132 "mr": ["MR", "molar refractivity"],
123 'molwt': ["MW", "Molecular Weight filter"], 133 "molwt": ["MW", "Molecular Weight filter"],
124 # ["nF", "Number of Fluorine Atoms"], 134 # ["nF", "Number of Fluorine Atoms"],
125 # ["s", "SMARTS filter"], 135 # ["s", "SMARTS filter"],
126 # ["sbonds", "Number of single bonds"], 136 # ["sbonds", "Number of single bonds"],
127 # ["smarts", "SMARTS filter"], 137 # ["smarts", "SMARTS filter"],
128 # ["tbonds", "Number of triple bonds"], 138 # ["tbonds", "Number of triple bonds"],
129 # ["title", "For comparing a molecule's title"], 139 # ["title", "For comparing a molecule's title"],
130 'psa': ["TPSA", "topological polar surface area"], 140 "psa": ["TPSA", "topological polar surface area"],
131 'rotbonds': ['ROTATABLE_BOND', 'rotatable bonds'], 141 "rotbonds": ["ROTATABLE_BOND", "rotatable bonds"],
132 } 142 }
133 143
134 144
135 def print_output(args, rows): 145 def print_output(args, rows):
136 if args.oformat == 'table': 146 if args.oformat == "table":
137 outfile = open(args.output, 'w') 147 outfile = open(args.output, "w")
138 requested_fields = (filter(lambda x: x not in ["[", "]", "'"], args.fetch)).split(', ') 148 requested_fields = (
149 filter(lambda x: x not in ["[", "]", "'"], args.fetch)
150 ).split(", ")
139 if args.header: 151 if args.header:
140 outfile.write('Identifier\t' + '\t'.join([ColumnNames[key] for key in requested_fields]) + '\n') 152 outfile.write(
153 "Identifier\t"
154 + "\t".join([ColumnNames[key] for key in requested_fields])
155 + "\n"
156 )
141 for row in rows: 157 for row in rows:
142 outfile.write(row['synonym'] + '\t' + '\t'.join([str(row[key]) for key in requested_fields]) + '\n') 158 outfile.write(
143 159 row["synonym"]
144 elif args.oformat in ['sdf', 'mol2']: 160 + "\t"
161 + "\t".join([str(row[key]) for key in requested_fields])
162 + "\n"
163 )
164
165 elif args.oformat in ["sdf", "mol2"]:
145 outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True) 166 outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True)
146 for row in rows: 167 for row in rows:
147 try: 168 try:
148 mol = pybel.readstring('sdf', row['mol']) 169 mol = pybel.readstring("sdf", row["mol"])
149 if args.oformat == 'sdf': 170 if args.oformat == "sdf":
150 keys = filter(lambda x: x not in ["[", "]", "'"], args.fetch).split(', ') 171 keys = filter(lambda x: x not in ["[", "]", "'"], args.fetch).split(
151 mol.data.update({ColumnNames['synonym']: row['synonym']}) 172 ", "
152 if 'inchi_key' in keys: 173 )
153 keys = (', '.join(keys).replace("inchi_key", "inchi_key_first, inchi_key_last")).split(', ') 174 mol.data.update({ColumnNames["synonym"]: row["synonym"]})
154 [mol.data.update({ColumnNames[key]: row[key]}) for key in keys if key] 175 if "inchi_key" in keys:
176 keys = (
177 ", ".join(keys).replace(
178 "inchi_key", "inchi_key_first, inchi_key_last"
179 )
180 ).split(", ")
181 [
182 mol.data.update({ColumnNames[key]: row[key]})
183 for key in keys
184 if key
185 ]
155 outfile.write(mol) 186 outfile.write(mol)
156 except OSError: 187 except OSError:
157 pass 188 pass
158 else: 189 else:
159 outfile = open(args.output, 'w') 190 outfile = open(args.output, "w")
160 outfile.write('\n'.join(['%s\t%s' % (row[args.oformat], row['synonym']) for row in rows])) 191 outfile.write(
192 "\n".join(["%s\t%s" % (row[args.oformat], row["synonym"]) for row in rows])
193 )
161 outfile.close() 194 outfile.close()
162 195
163 196
164 def pybel_stop_logging(): 197 def pybel_stop_logging():
165 openbabel.obErrorLog.StopLogging() 198 openbabel.obErrorLog.StopLogging()
166 199
167 200
168 def get_properties_ext(mol): 201 def get_properties_ext(mol):
169 HBD = pybel.Smarts("[!#6;!H0]") 202 HBD = pybel.Smarts("[!#6;!H0]")
170 HBA = pybel.Smarts(("[$([$([#8,#16]);!$(*=N~O);" 203 HBA = pybel.Smarts(
171 "!$(*~N=O);X1,X2]),$([#7;v3;" 204 (
172 "!$([nH]);!$(*(-a)-a)])]" 205 "[$([$([#8,#16]);!$(*=N~O);"
173 )) 206 "!$(*~N=O);X1,X2]),$([#7;v3;"
207 "!$([nH]);!$(*(-a)-a)])]"
208 )
209 )
174 calc_desc_dict = mol.calcdesc() 210 calc_desc_dict = mol.calcdesc()
175 211
176 try: 212 try:
177 logp = calc_desc_dict['logP'] 213 logp = calc_desc_dict["logP"]
178 except KeyError: 214 except KeyError:
179 logp = calc_desc_dict['LogP'] 215 logp = calc_desc_dict["LogP"]
180 216
181 return {"molwt": mol.molwt, 217 return {
182 "logp": logp, 218 "molwt": mol.molwt,
183 "donors": len(HBD.findall(mol)), 219 "logp": logp,
184 "acceptors": len(HBA.findall(mol)), 220 "donors": len(HBD.findall(mol)),
185 "psa": calc_desc_dict['TPSA'], 221 "acceptors": len(HBA.findall(mol)),
186 "mr": calc_desc_dict['MR'], 222 "psa": calc_desc_dict["TPSA"],
187 "rotbonds": mol.OBMol.NumRotors(), 223 "mr": calc_desc_dict["MR"],
188 "can": mol.write("can").split()[0].strip(), # tthis one works fine for both zinc and chembl (no ZINC code added after can descriptor string) 224 "rotbonds": mol.OBMol.NumRotors(),
189 "inchi": mol.write("inchi").strip(), 225 "can": mol.write("can")
190 "inchi_key": get_inchikey(mol).strip(), 226 .split()[0]
191 "rings": len(mol.sssr), 227 .strip(), # tthis one works fine for both zinc and chembl (no ZINC code added after can descriptor string)
192 "atoms": mol.OBMol.NumHvyAtoms(), 228 "inchi": mol.write("inchi").strip(),
193 "spectrophore": OBspectrophore(mol), 229 "inchi_key": get_inchikey(mol).strip(),
194 } 230 "rings": len(mol.sssr),
231 "atoms": mol.OBMol.NumHvyAtoms(),
232 "spectrophore": OBspectrophore(mol),
233 }
195 234
196 235
197 def get_inchikey(mol): 236 def get_inchikey(mol):
198 conv = openbabel.OBConversion() 237 conv = openbabel.OBConversion()
199 conv.SetInAndOutFormats("mol", "inchi") 238 conv.SetInAndOutFormats("mol", "inchi")
204 243
205 def OBspectrophore(mol): 244 def OBspectrophore(mol):
206 spectrophore = pybel.ob.OBSpectrophore() 245 spectrophore = pybel.ob.OBSpectrophore()
207 # Parameters: rotation angle = 20, normalization for mean and sd, accuracy = 3.0 A and non-stereospecific cages. 246 # Parameters: rotation angle = 20, normalization for mean and sd, accuracy = 3.0 A and non-stereospecific cages.
208 spectrophore.SetNormalization(spectrophore.NormalizationTowardsZeroMeanAndUnitStd) 247 spectrophore.SetNormalization(spectrophore.NormalizationTowardsZeroMeanAndUnitStd)
209 return ', '.join(["%.3f" % value for value in spectrophore.GetSpectrophore(mol.OBMol)]) 248 return ", ".join(
210 249 ["%.3f" % value for value in spectrophore.GetSpectrophore(mol.OBMol)]
211 250 )
212 def split_library(lib_path, lib_format='sdf', package_size=None): 251
252
253 def split_library(lib_path, lib_format="sdf", package_size=None):
213 """ 254 """
214 Split a library of compounds. Usage: split_library(lib_path, lib_format, package_size) 255 Split a library of compounds. Usage: split_library(lib_path, lib_format, package_size)
215 IT currently ONLY WORKS FOR SD-Files 256 IT currently ONLY WORKS FOR SD-Files
216 """ 257 """
217 pack = 1 258 pack = 1
218 mol_counter = 0 259 mol_counter = 0
219 260
220 outfile = open('/%s/%s_pack_%i.%s' % ('/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w') 261 outfile = open(
221 262 "/%s/%s_pack_%i.%s"
222 for line in open(lib_path, 'r'): 263 % (
264 "/".join(lib_path.split("/")[:-1]),
265 lib_path.split("/")[-1].split(".")[0],
266 pack,
267 "sdf",
268 ),
269 "w",
270 )
271
272 for line in open(lib_path, "r"):
223 outfile.write(line) 273 outfile.write(line)
224 if line.strip() == '$$$$': 274 if line.strip() == "$$$$":
225 mol_counter += 1 275 mol_counter += 1
226 if mol_counter % package_size == 0: 276 if mol_counter % package_size == 0:
227 outfile.close() 277 outfile.close()
228 pack += 1 278 pack += 1
229 outfile = open('/%s/%s_pack_%i.%s' % ('/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w') 279 outfile = open(
280 "/%s/%s_pack_%i.%s"
281 % (
282 "/".join(lib_path.split("/")[:-1]),
283 lib_path.split("/")[-1].split(".")[0],
284 pack,
285 "sdf",
286 ),
287 "w",
288 )
230 if mol_counter * 10 % package_size == 0: 289 if mol_counter * 10 % package_size == 0:
231 print('%i molecules parsed, starting pack nr. %i' % (mol_counter, pack - 1)) 290 print(
291 "%i molecules parsed, starting pack nr. %i"
292 % (mol_counter, pack - 1)
293 )
232 outfile.close() 294 outfile.close()
233 295
234 return True 296 return True
235 297
236 298
240 Usage: split_smi_library(smiles_file, 10) 302 Usage: split_smi_library(smiles_file, 10)
241 """ 303 """
242 output_files = [] 304 output_files = []
243 tfile = tempfile.NamedTemporaryFile(delete=False) 305 tfile = tempfile.NamedTemporaryFile(delete=False)
244 306
245 smiles_handle = open(smiles_file, 'r') 307 smiles_handle = open(smiles_file, "r")
246 for count, line in enumerate(smiles_handle): 308 for count, line in enumerate(smiles_handle):
247 if count % structures_in_one_file == 0 and count != 0: 309 if count % structures_in_one_file == 0 and count != 0:
248 tfile.close() 310 tfile.close()
249 output_files.append(tfile.name) 311 output_files.append(tfile.name)
250 tfile = tempfile.NamedTemporaryFile(delete=False) 312 tfile = tempfile.NamedTemporaryFile(delete=False)
255 return output_files 317 return output_files
256 318
257 319
258 def mp_run(input_path, regex, PROCESSES, function_to_call): 320 def mp_run(input_path, regex, PROCESSES, function_to_call):
259 paths = [] 321 paths = []
260 [paths.append(compound_file) for compound_file in glob.glob(str(input_path) + str(regex))] 322 [
323 paths.append(compound_file)
324 for compound_file in glob.glob(str(input_path) + str(regex))
325 ]
261 paths.sort() 326 paths.sort()
262 327
263 pool = Pool(processes=PROCESSES) 328 pool = Pool(processes=PROCESSES)
264 print('Process initialized with', PROCESSES, 'processors') 329 print("Process initialized with", PROCESSES, "processors")
265 result = pool.map_async(function_to_call, paths) 330 result = pool.map_async(function_to_call, paths)
266 result.get() 331 result.get()
267 332
268 return paths 333 return paths
269 334
270 335
271 if __name__ == '__main__': 336 if __name__ == "__main__":
272 print(check_filetype(sys.argv[1])) 337 print(check_filetype(sys.argv[1]))