comparison cheminfolib.py @ 5:a5f4b80e6769 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/openbabel commit 1fe240ef0064a1a4a66d9be1ccace53824280b75"
author bgruening
date Mon, 19 Oct 2020 14:48:46 +0000
parents de4c80d17527
children fac2c28b4c55
comparison
equal deleted inserted replaced
4:de4c80d17527 5:a5f4b80e6769
2 """ 2 """
3 Small library with cheminformatic functions based on openbabel and pgchem. 3 Small library with cheminformatic functions based on openbabel and pgchem.
4 Copyright 2012, Bjoern Gruening and Xavier Lucas 4 Copyright 2012, Bjoern Gruening and Xavier Lucas
5 """ 5 """
6 6
7 import os, sys 7 import glob
8 import re
9 import subprocess
10 import sys
11 import tempfile
12 from multiprocessing import Pool
13
8 14
9 try: 15 try:
10 from galaxy import eggs 16 from galaxy import eggs
11 eggs.require('psycopg2') 17 eggs.require('psycopg2')
12 except: 18 except ImportError:
19 psycopg2 = None
13 print('psycopg2 is not available. It is currently used in the pgchem wrappers, that are not shipped with default CTB') 20 print('psycopg2 is not available. It is currently used in the pgchem wrappers, that are not shipped with default CTB')
14 21
15 try: 22 try:
16 from openbabel import openbabel, pybel 23 from openbabel import openbabel, pybel
17 openbabel.obErrorLog.StopLogging() 24 openbabel.obErrorLog.StopLogging()
18 except: 25 except ImportError:
26 openbabel, pybel = None, None
19 print('OpenBabel could not be found. A few functions are not available without OpenBabel.') 27 print('OpenBabel could not be found. A few functions are not available without OpenBabel.')
20 28
21 from multiprocessing import Pool 29
22 import glob, tempfile, re 30 def CountLines(path):
23 import subprocess
24
25 def CountLines( path ):
26 out = subprocess.Popen(['wc', '-l', path], 31 out = subprocess.Popen(['wc', '-l', path],
27 stdout=subprocess.PIPE, 32 stdout=subprocess.PIPE,
28 stderr=subprocess.STDOUT 33 stderr=subprocess.STDOUT
29 ).communicate()[0] 34 ).communicate()[0]
30 return int(out.partition(b' ')[0]) 35 return int(out.partition(b' ')[0])
36
31 37
32 def grep(pattern, file_obj): 38 def grep(pattern, file_obj):
33 grepper = re.compile(pattern) 39 grepper = re.compile(pattern)
34 for line in file_obj: 40 for line in file_obj:
35 if grepper.search(line): 41 if grepper.search(line):
36 return True 42 return True
37 return False 43 return False
44
38 45
39 def check_filetype(filepath): 46 def check_filetype(filepath):
40 mol = False 47 mol = False
41 possible_inchi = True 48 possible_inchi = True
42 for line_counter, line in enumerate(open(filepath)): 49 for line_counter, line in enumerate(open(filepath)):
48 return 'mol2' 55 return 'mol2'
49 elif line.find('ligand id') != -1: 56 elif line.find('ligand id') != -1:
50 return 'drf' 57 return 'drf'
51 elif possible_inchi and re.findall('^InChI=', line): 58 elif possible_inchi and re.findall('^InChI=', line):
52 return 'inchi' 59 return 'inchi'
53 elif re.findall('^M\s+END', line): 60 elif re.findall(r'^M\s+END', line):
54 mol = True 61 mol = True
55 # first line is not an InChI, so it can't be an InChI file 62 # first line is not an InChI, so it can't be an InChI file
56 possible_inchi = False 63 possible_inchi = False
57 64
58 if mol: 65 if mol:
59 # END can occures before $$$$, so and SDF file will 66 # END can occures before $$$$, so and SDF file will
60 # be recognised as mol, if you not using this hack' 67 # be recognised as mol, if you not using this hack'
61 return 'mol' 68 return 'mol'
62 return 'smi' 69 return 'smi'
63 70
71
64 def db_connect(args): 72 def db_connect(args):
65 try: 73 try:
66 db_conn = psycopg2.connect("dbname=%s user=%s host=%s password=%s" % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd)); 74 db_conn = psycopg2.connect("dbname=%s user=%s host=%s password=%s" % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd))
67 return db_conn 75 return db_conn
68 except: 76 except psycopg2.Error:
69 sys.exit('Unable to connect to the db') 77 sys.exit('Unable to connect to the db')
70 78
79
71 ColumnNames = { 80 ColumnNames = {
72 'can_smiles' : 'Canonical SMILES', 81 'can_smiles': 'Canonical SMILES',
73 'can' : 'Canonical SMILES', 82 'can': 'Canonical SMILES',
74 'inchi' : 'InChI', 83 'inchi': 'InChI',
75 'inchi_key' : 'InChI key', 84 'inchi_key': 'InChI key',
76 'inchi_key_first' : 'InChI key first', 85 'inchi_key_first': 'InChI key first',
77 'inchi_key_last' : 'InChI key last', 86 'inchi_key_last': 'InChI key last',
78 'molwt' : 'Molecular weight', 87 'molwt': 'Molecular weight',
79 'hbd' : 'Hydrogen-bond donors', 88 'hbd': 'Hydrogen-bond donors',
80 'donors' : 'Hydrogen-bond donors', 89 'donors': 'Hydrogen-bond donors',
81 'hba' : 'Hydrogen-bond acceptors', 90 'hba': 'Hydrogen-bond acceptors',
82 'acceptors' : 'Hydrogen-bond acceptors', 91 'acceptors': 'Hydrogen-bond acceptors',
83 'rotbonds' : 'Rotatable bonds', 92 'rotbonds': 'Rotatable bonds',
84 'logp' : 'logP', 93 'logp': 'logP',
85 'psa' : 'Polar surface area', 94 'psa': 'Polar surface area',
86 'mr' : 'Molecular refractivity', 95 'mr': 'Molecular refractivity',
87 'atoms' : 'Number of heavy atoms', 96 'atoms': 'Number of heavy atoms',
88 'rings' : 'Number of rings', 97 'rings': 'Number of rings',
89 'set_bits' : 'FP2 bits', 98 'set_bits': 'FP2 bits',
90 'id' : 'Internal identifier', 99 'id': 'Internal identifier',
91 'tani' : 'Tanimoto coefficient', 100 'tani': 'Tanimoto coefficient',
92 'spectrophore' : 'Spectrophores(TM)', 101 'spectrophore': 'Spectrophores(TM)',
93 'dist_spectrophore' : 'Spectrophores(TM) distance to target', 102 'dist_spectrophore': 'Spectrophores(TM) distance to target',
94 'synonym' : 'Entry id', 103 'synonym': 'Entry id',
95 } 104 }
96 105
97 OBDescriptor = { 106 OBDescriptor = {
98 'atoms': ["atoms","Number of atoms"], 107 'atoms': ["atoms", "Number of atoms"],
99 'hatoms': ["hatoms","Number of heavy atoms"], # self defined tag hatoms in plugindefines.txt 108 'hatoms': ["hatoms", "Number of heavy atoms"], # self defined tag hatoms in plugindefines.txt
100 'can_smiles' : ["cansmi","Canonical SMILES"], 109 'can_smiles': ["cansmi", "Canonical SMILES"],
101 'can_smilesNS' : ["cansmiNS","Canonical SMILES without isotopes or stereo"], 110 'can_smilesNS': ["cansmiNS", "Canonical SMILES without isotopes or stereo"],
102 #["abonds","Number of aromatic bonds"], 111 # ["abonds", "Number of aromatic bonds"],
103 #["bonds","Number of bonds"], 112 # ["bonds", "Number of bonds"],
104 #["dbonds","Number of double bonds"], 113 # ["dbonds", "Number of double bonds"],
105 #["formula","Chemical formula"], 114 # ["formula", "Chemical formula"],
106 'hba': ["HBA1","Number of Hydrogen Bond Acceptors 1 (JoelLib)"], 115 'hba': ["HBA1", "Number of Hydrogen Bond Acceptors 1 (JoelLib)"],
107 'hba2': ["HBA2","Number of Hydrogen Bond Acceptors 2 (JoelLib)"], 116 'hba2': ["HBA2", "Number of Hydrogen Bond Acceptors 2 (JoelLib)"],
108 'hbd': ["HBD","Number of Hydrogen Bond Donors (JoelLib)"], 117 'hbd': ["HBD", "Number of Hydrogen Bond Donors (JoelLib)"],
109 'inchi': ["InChI","IUPAC InChI identifier"], 118 'inchi': ["InChI", "IUPAC InChI identifier"],
110 'inchi_key': ["InChIKey","InChIKey"], 119 'inchi_key': ["InChIKey", "InChIKey"],
111 #["L5","Lipinski Rule of Five"], 120 # ["L5", "Lipinski Rule of Five"],
112 'logp': ["logP","octanol/water partition coefficient"], 121 'logp': ["logP", "octanol/water partition coefficient"],
113 'mr': ["MR","molar refractivity"], 122 'mr': ["MR", "molar refractivity"],
114 'molwt': ["MW","Molecular Weight filter"], 123 'molwt': ["MW", "Molecular Weight filter"],
115 #["nF","Number of Fluorine Atoms"], 124 # ["nF", "Number of Fluorine Atoms"],
116 #["s","SMARTS filter"], 125 # ["s", "SMARTS filter"],
117 #["sbonds","Number of single bonds"], 126 # ["sbonds", "Number of single bonds"],
118 #["smarts","SMARTS filter"], 127 # ["smarts", "SMARTS filter"],
119 #["tbonds","Number of triple bonds"], 128 # ["tbonds", "Number of triple bonds"],
120 #["title","For comparing a molecule's title"], 129 # ["title", "For comparing a molecule's title"],
121 'psa': ["TPSA","topological polar surface area"], 130 'psa': ["TPSA", "topological polar surface area"],
122 'rotbonds' : ['ROTATABLE_BOND', 'rotatable bonds'], 131 'rotbonds': ['ROTATABLE_BOND', 'rotatable bonds'],
123 } 132 }
124 133
125 134
126 def print_output(args, rows): 135 def print_output(args, rows):
127 if args.oformat == 'table': 136 if args.oformat == 'table':
128 outfile = open(args.output, 'w') 137 outfile = open(args.output, 'w')
129 requested_fields = (filter(lambda x: x not in ["[", "]", "'"], args.fetch)).split(', ') 138 requested_fields = (filter(lambda x: x not in ["[", "]", "'"], args.fetch)).split(', ')
130 if args.header: 139 if args.header:
131 outfile.write( 'Identifier\t' + '\t'.join( [ColumnNames[key] for key in requested_fields] ) + '\n' ) 140 outfile.write('Identifier\t' + '\t'.join([ColumnNames[key] for key in requested_fields]) + '\n')
132 for row in rows: 141 for row in rows:
133 outfile.write( row['synonym'] + '\t' + '\t'.join( [str(row[key]) for key in requested_fields] ) + '\n' ) 142 outfile.write(row['synonym'] + '\t' + '\t'.join([str(row[key]) for key in requested_fields]) + '\n')
134 143
135 elif args.oformat in ['sdf', 'mol2']: 144 elif args.oformat in ['sdf', 'mol2']:
136 outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True) 145 outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True)
137 for row in rows: 146 for row in rows:
138 try: 147 try:
139 mol = pybel.readstring('sdf', row['mol']) 148 mol = pybel.readstring('sdf', row['mol'])
140 if args.oformat == 'sdf': 149 if args.oformat == 'sdf':
141 keys = filter(lambda x: x not in ["[", "]", "'"], args.fetch).split(', ') 150 keys = filter(lambda x: x not in ["[", "]", "'"], args.fetch).split(', ')
142 mol.data.update( { ColumnNames['synonym'] : row['synonym'] } ) 151 mol.data.update({ColumnNames['synonym']: row['synonym']})
143 if 'inchi_key' in keys: 152 if 'inchi_key' in keys:
144 keys = (', '.join(keys).replace( "inchi_key", "inchi_key_first, inchi_key_last" )).split(', ') 153 keys = (', '.join(keys).replace("inchi_key", "inchi_key_first, inchi_key_last")).split(', ')
145 [ mol.data.update( { ColumnNames[key] : row[key] } ) for key in keys if key] 154 [mol.data.update({ColumnNames[key]: row[key]}) for key in keys if key]
146 outfile.write(mol) 155 outfile.write(mol)
147 except: 156 except OSError:
148 pass 157 pass
149 else: 158 else:
150 outfile = open(args.output, 'w') 159 outfile = open(args.output, 'w')
151 outfile.write( '\n'.join( [ '%s\t%s' % (row[args.oformat], row['synonym'] ) for row in rows ] ) ) 160 outfile.write('\n'.join(['%s\t%s' % (row[args.oformat], row['synonym']) for row in rows]))
152 outfile.close() 161 outfile.close()
162
153 163
154 def pybel_stop_logging(): 164 def pybel_stop_logging():
155 openbabel.obErrorLog.StopLogging() 165 openbabel.obErrorLog.StopLogging()
156 166
167
157 def get_properties_ext(mol): 168 def get_properties_ext(mol):
158
159 HBD = pybel.Smarts("[!#6;!H0]") 169 HBD = pybel.Smarts("[!#6;!H0]")
160 HBA = pybel.Smarts("[$([$([#8,#16]);!$(*=N~O);" + 170 HBA = pybel.Smarts(("[$([$([#8,#16]);!$(*=N~O);"
161 "!$(*~N=O);X1,X2]),$([#7;v3;" + 171 "!$(*~N=O);X1,X2]),$([#7;v3;"
162 "!$([nH]);!$(*(-a)-a)])]" 172 "!$([nH]);!$(*(-a)-a)])]"
163 ) 173 ))
164 calc_desc_dict = mol.calcdesc() 174 calc_desc_dict = mol.calcdesc()
165 175
166 try: 176 try:
167 logp = calc_desc_dict['logP'] 177 logp = calc_desc_dict['logP']
168 except: 178 except KeyError:
169 logp = calc_desc_dict['LogP'] 179 logp = calc_desc_dict['LogP']
170 180
171 return {"molwt": mol.molwt, 181 return {"molwt": mol.molwt,
172 "logp": logp, 182 "logp": logp,
173 "donors": len(HBD.findall(mol)), 183 "donors": len(HBD.findall(mol)),
174 "acceptors": len(HBA.findall(mol)), 184 "acceptors": len(HBA.findall(mol)),
175 "psa": calc_desc_dict['TPSA'], 185 "psa": calc_desc_dict['TPSA'],
176 "mr": calc_desc_dict['MR'], 186 "mr": calc_desc_dict['MR'],
177 "rotbonds": mol.OBMol.NumRotors(), 187 "rotbonds": mol.OBMol.NumRotors(),
178 "can": mol.write("can").split()[0].strip(), ### tthis one works fine for both zinc and chembl (no ZINC code added after can descriptor string) 188 "can": mol.write("can").split()[0].strip(), # tthis one works fine for both zinc and chembl (no ZINC code added after can descriptor string)
179 "inchi": mol.write("inchi").strip(), 189 "inchi": mol.write("inchi").strip(),
180 "inchi_key": get_inchikey(mol).strip(), 190 "inchi_key": get_inchikey(mol).strip(),
181 "rings": len(mol.sssr), 191 "rings": len(mol.sssr),
182 "atoms": mol.OBMol.NumHvyAtoms(), 192 "atoms": mol.OBMol.NumHvyAtoms(),
183 "spectrophore" : OBspectrophore(mol), 193 "spectrophore": OBspectrophore(mol),
184 } 194 }
195
185 196
186 def get_inchikey(mol): 197 def get_inchikey(mol):
187 conv = openbabel.OBConversion() 198 conv = openbabel.OBConversion()
188 conv.SetInAndOutFormats("mol", "inchi") 199 conv.SetInAndOutFormats("mol", "inchi")
189 conv.SetOptions("K", conv.OUTOPTIONS) 200 conv.SetOptions("K", conv.OUTOPTIONS)
190 inchikey = conv.WriteString( mol.OBMol ) 201 inchikey = conv.WriteString(mol.OBMol)
191 return inchikey 202 return inchikey
203
192 204
193 def OBspectrophore(mol): 205 def OBspectrophore(mol):
194 spectrophore = pybel.ob.OBSpectrophore() 206 spectrophore = pybel.ob.OBSpectrophore()
195 # Parameters: rotation angle = 20, normalization for mean and sd, accuracy = 3.0 A and non-stereospecific cages. 207 # Parameters: rotation angle = 20, normalization for mean and sd, accuracy = 3.0 A and non-stereospecific cages.
196 spectrophore.SetNormalization( spectrophore.NormalizationTowardsZeroMeanAndUnitStd ) 208 spectrophore.SetNormalization(spectrophore.NormalizationTowardsZeroMeanAndUnitStd)
197 return ', '.join( [ "%.3f" % value for value in spectrophore.GetSpectrophore( mol.OBMol ) ] ) 209 return ', '.join(["%.3f" % value for value in spectrophore.GetSpectrophore(mol.OBMol)])
198 210
199 def squared_euclidean_distance(a, b): 211
200 try: 212 def split_library(lib_path, lib_format='sdf', package_size=None):
201 return ((np.asarray( a ) - np.asarray( b ))**2).sum() 213 """
202 except ValueError: 214 Split a library of compounds. Usage: split_library(lib_path, lib_format, package_size)
203 return 0 215 IT currently ONLY WORKS FOR SD-Files
204
205 def split_library( lib_path, lib_format = 'sdf', package_size = None ):
206 """
207 Split a library of compounds. Usage: split_library( lib_path, lib_format, package_size )
208 IT currently ONLY WORKS FOR SD-Files
209 """ 216 """
210 pack = 1 217 pack = 1
211 mol_counter = 0 218 mol_counter = 0
212 219
213 outfile = open('/%s/%s_pack_%i.%s' % ( '/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w' ) 220 outfile = open('/%s/%s_pack_%i.%s' % ('/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w')
214 221
215 for line in open(lib_path, 'r'): 222 for line in open(lib_path, 'r'):
216 outfile.write( line ) 223 outfile.write(line)
217 if line.strip() == '$$$$': 224 if line.strip() == '$$$$':
218 mol_counter += 1 225 mol_counter += 1
219 if mol_counter % package_size == 0: 226 if mol_counter % package_size == 0:
220 outfile.close() 227 outfile.close()
221 pack += 1 228 pack += 1
222 outfile = open('/%s/%s_pack_%i.%s' % ( '/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w' ) 229 outfile = open('/%s/%s_pack_%i.%s' % ('/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w')
223 if mol_counter*10 % package_size == 0: 230 if mol_counter * 10 % package_size == 0:
224 print('%i molecules parsed, starting pack nr. %i' % ( mol_counter, pack - 1 )) 231 print('%i molecules parsed, starting pack nr. %i' % (mol_counter, pack - 1))
225 outfile.close() 232 outfile.close()
226 233
227 return True 234 return True
228 235
229 def split_smi_library( smiles_file, structures_in_one_file ): 236
230 """ 237 def split_smi_library(smiles_file, structures_in_one_file):
231 Split a file with SMILES to several files for multiprocessing usage. 238 """
232 Usage: split_smi_library( smiles_file, 10 ) 239 Split a file with SMILES to several files for multiprocessing usage.
240 Usage: split_smi_library(smiles_file, 10)
233 """ 241 """
234 output_files = [] 242 output_files = []
235 tfile = tempfile.NamedTemporaryFile(delete=False) 243 tfile = tempfile.NamedTemporaryFile(delete=False)
236 244
237 smiles_handle = open(smiles_file, 'r') 245 smiles_handle = open(smiles_file, 'r')
238 for count, line in enumerate( smiles_handle ): 246 for count, line in enumerate(smiles_handle):
239 if count % structures_in_one_file == 0 and count != 0: 247 if count % structures_in_one_file == 0 and count != 0:
240 tfile.close() 248 tfile.close()
241 output_files.append(tfile.name) 249 output_files.append(tfile.name)
242 tfile = tempfile.NamedTemporaryFile(delete=False) 250 tfile = tempfile.NamedTemporaryFile(delete=False)
243 tfile.write(line) 251 tfile.write(line)
245 output_files.append(tfile.name) 253 output_files.append(tfile.name)
246 smiles_handle.close() 254 smiles_handle.close()
247 return output_files 255 return output_files
248 256
249 257
250 def mp_run(input_path, regex, PROCESSES, function_to_call ): 258 def mp_run(input_path, regex, PROCESSES, function_to_call):
251 paths = [] 259 paths = []
252 [ paths.append(compound_file) for compound_file in glob.glob(str(input_path) + str(regex)) ] 260 [paths.append(compound_file) for compound_file in glob.glob(str(input_path) + str(regex))]
253 paths.sort() 261 paths.sort()
254 262
255 pool = Pool(processes=PROCESSES) 263 pool = Pool(processes=PROCESSES)
256 print('Process initialized with', PROCESSES, 'processors') 264 print('Process initialized with', PROCESSES, 'processors')
257 result = pool.map_async(function_to_call, paths) 265 result = pool.map_async(function_to_call, paths)
258 result.get() 266 result.get()
259 267
260 return paths 268 return paths
261 269
270
262 if __name__ == '__main__': 271 if __name__ == '__main__':
263 print(check_filetype(sys.argv[1])) 272 print(check_filetype(sys.argv[1]))
264