Mercurial > repos > bgruening > openbabel_remduplicates
comparison cheminfolib.py @ 13:12aca74f07d7 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/openbabel commit 1fe240ef0064a1a4a66d9be1ccace53824280b75"
author | bgruening |
---|---|
date | Mon, 19 Oct 2020 14:47:33 +0000 |
parents | 50ca8845e7f5 |
children | c5de6c19eb06 |
comparison
equal
deleted
inserted
replaced
12:50ca8845e7f5 | 13:12aca74f07d7 |
---|---|
2 """ | 2 """ |
3 Small library with cheminformatic functions based on openbabel and pgchem. | 3 Small library with cheminformatic functions based on openbabel and pgchem. |
4 Copyright 2012, Bjoern Gruening and Xavier Lucas | 4 Copyright 2012, Bjoern Gruening and Xavier Lucas |
5 """ | 5 """ |
6 | 6 |
7 import os, sys | 7 import glob |
8 import re | |
9 import subprocess | |
10 import sys | |
11 import tempfile | |
12 from multiprocessing import Pool | |
13 | |
8 | 14 |
9 try: | 15 try: |
10 from galaxy import eggs | 16 from galaxy import eggs |
11 eggs.require('psycopg2') | 17 eggs.require('psycopg2') |
12 except: | 18 except ImportError: |
19 psycopg2 = None | |
13 print('psycopg2 is not available. It is currently used in the pgchem wrappers, that are not shipped with default CTB') | 20 print('psycopg2 is not available. It is currently used in the pgchem wrappers, that are not shipped with default CTB') |
14 | 21 |
15 try: | 22 try: |
16 from openbabel import openbabel, pybel | 23 from openbabel import openbabel, pybel |
17 openbabel.obErrorLog.StopLogging() | 24 openbabel.obErrorLog.StopLogging() |
18 except: | 25 except ImportError: |
26 openbabel, pybel = None, None | |
19 print('OpenBabel could not be found. A few functions are not available without OpenBabel.') | 27 print('OpenBabel could not be found. A few functions are not available without OpenBabel.') |
20 | 28 |
21 from multiprocessing import Pool | 29 |
22 import glob, tempfile, re | 30 def CountLines(path): |
23 import subprocess | |
24 | |
25 def CountLines( path ): | |
26 out = subprocess.Popen(['wc', '-l', path], | 31 out = subprocess.Popen(['wc', '-l', path], |
27 stdout=subprocess.PIPE, | 32 stdout=subprocess.PIPE, |
28 stderr=subprocess.STDOUT | 33 stderr=subprocess.STDOUT |
29 ).communicate()[0] | 34 ).communicate()[0] |
30 return int(out.partition(b' ')[0]) | 35 return int(out.partition(b' ')[0]) |
36 | |
31 | 37 |
32 def grep(pattern, file_obj): | 38 def grep(pattern, file_obj): |
33 grepper = re.compile(pattern) | 39 grepper = re.compile(pattern) |
34 for line in file_obj: | 40 for line in file_obj: |
35 if grepper.search(line): | 41 if grepper.search(line): |
36 return True | 42 return True |
37 return False | 43 return False |
44 | |
38 | 45 |
39 def check_filetype(filepath): | 46 def check_filetype(filepath): |
40 mol = False | 47 mol = False |
41 possible_inchi = True | 48 possible_inchi = True |
42 for line_counter, line in enumerate(open(filepath)): | 49 for line_counter, line in enumerate(open(filepath)): |
48 return 'mol2' | 55 return 'mol2' |
49 elif line.find('ligand id') != -1: | 56 elif line.find('ligand id') != -1: |
50 return 'drf' | 57 return 'drf' |
51 elif possible_inchi and re.findall('^InChI=', line): | 58 elif possible_inchi and re.findall('^InChI=', line): |
52 return 'inchi' | 59 return 'inchi' |
53 elif re.findall('^M\s+END', line): | 60 elif re.findall(r'^M\s+END', line): |
54 mol = True | 61 mol = True |
55 # first line is not an InChI, so it can't be an InChI file | 62 # first line is not an InChI, so it can't be an InChI file |
56 possible_inchi = False | 63 possible_inchi = False |
57 | 64 |
58 if mol: | 65 if mol: |
59 # END can occures before $$$$, so and SDF file will | 66 # END can occures before $$$$, so and SDF file will |
60 # be recognised as mol, if you not using this hack' | 67 # be recognised as mol, if you not using this hack' |
61 return 'mol' | 68 return 'mol' |
62 return 'smi' | 69 return 'smi' |
63 | 70 |
71 | |
64 def db_connect(args): | 72 def db_connect(args): |
65 try: | 73 try: |
66 db_conn = psycopg2.connect("dbname=%s user=%s host=%s password=%s" % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd)); | 74 db_conn = psycopg2.connect("dbname=%s user=%s host=%s password=%s" % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd)) |
67 return db_conn | 75 return db_conn |
68 except: | 76 except psycopg2.Error: |
69 sys.exit('Unable to connect to the db') | 77 sys.exit('Unable to connect to the db') |
70 | 78 |
79 | |
71 ColumnNames = { | 80 ColumnNames = { |
72 'can_smiles' : 'Canonical SMILES', | 81 'can_smiles': 'Canonical SMILES', |
73 'can' : 'Canonical SMILES', | 82 'can': 'Canonical SMILES', |
74 'inchi' : 'InChI', | 83 'inchi': 'InChI', |
75 'inchi_key' : 'InChI key', | 84 'inchi_key': 'InChI key', |
76 'inchi_key_first' : 'InChI key first', | 85 'inchi_key_first': 'InChI key first', |
77 'inchi_key_last' : 'InChI key last', | 86 'inchi_key_last': 'InChI key last', |
78 'molwt' : 'Molecular weight', | 87 'molwt': 'Molecular weight', |
79 'hbd' : 'Hydrogen-bond donors', | 88 'hbd': 'Hydrogen-bond donors', |
80 'donors' : 'Hydrogen-bond donors', | 89 'donors': 'Hydrogen-bond donors', |
81 'hba' : 'Hydrogen-bond acceptors', | 90 'hba': 'Hydrogen-bond acceptors', |
82 'acceptors' : 'Hydrogen-bond acceptors', | 91 'acceptors': 'Hydrogen-bond acceptors', |
83 'rotbonds' : 'Rotatable bonds', | 92 'rotbonds': 'Rotatable bonds', |
84 'logp' : 'logP', | 93 'logp': 'logP', |
85 'psa' : 'Polar surface area', | 94 'psa': 'Polar surface area', |
86 'mr' : 'Molecular refractivity', | 95 'mr': 'Molecular refractivity', |
87 'atoms' : 'Number of heavy atoms', | 96 'atoms': 'Number of heavy atoms', |
88 'rings' : 'Number of rings', | 97 'rings': 'Number of rings', |
89 'set_bits' : 'FP2 bits', | 98 'set_bits': 'FP2 bits', |
90 'id' : 'Internal identifier', | 99 'id': 'Internal identifier', |
91 'tani' : 'Tanimoto coefficient', | 100 'tani': 'Tanimoto coefficient', |
92 'spectrophore' : 'Spectrophores(TM)', | 101 'spectrophore': 'Spectrophores(TM)', |
93 'dist_spectrophore' : 'Spectrophores(TM) distance to target', | 102 'dist_spectrophore': 'Spectrophores(TM) distance to target', |
94 'synonym' : 'Entry id', | 103 'synonym': 'Entry id', |
95 } | 104 } |
96 | 105 |
97 OBDescriptor = { | 106 OBDescriptor = { |
98 'atoms': ["atoms","Number of atoms"], | 107 'atoms': ["atoms", "Number of atoms"], |
99 'hatoms': ["hatoms","Number of heavy atoms"], # self defined tag hatoms in plugindefines.txt | 108 'hatoms': ["hatoms", "Number of heavy atoms"], # self defined tag hatoms in plugindefines.txt |
100 'can_smiles' : ["cansmi","Canonical SMILES"], | 109 'can_smiles': ["cansmi", "Canonical SMILES"], |
101 'can_smilesNS' : ["cansmiNS","Canonical SMILES without isotopes or stereo"], | 110 'can_smilesNS': ["cansmiNS", "Canonical SMILES without isotopes or stereo"], |
102 #["abonds","Number of aromatic bonds"], | 111 # ["abonds", "Number of aromatic bonds"], |
103 #["bonds","Number of bonds"], | 112 # ["bonds", "Number of bonds"], |
104 #["dbonds","Number of double bonds"], | 113 # ["dbonds", "Number of double bonds"], |
105 #["formula","Chemical formula"], | 114 # ["formula", "Chemical formula"], |
106 'hba': ["HBA1","Number of Hydrogen Bond Acceptors 1 (JoelLib)"], | 115 'hba': ["HBA1", "Number of Hydrogen Bond Acceptors 1 (JoelLib)"], |
107 'hba2': ["HBA2","Number of Hydrogen Bond Acceptors 2 (JoelLib)"], | 116 'hba2': ["HBA2", "Number of Hydrogen Bond Acceptors 2 (JoelLib)"], |
108 'hbd': ["HBD","Number of Hydrogen Bond Donors (JoelLib)"], | 117 'hbd': ["HBD", "Number of Hydrogen Bond Donors (JoelLib)"], |
109 'inchi': ["InChI","IUPAC InChI identifier"], | 118 'inchi': ["InChI", "IUPAC InChI identifier"], |
110 'inchi_key': ["InChIKey","InChIKey"], | 119 'inchi_key': ["InChIKey", "InChIKey"], |
111 #["L5","Lipinski Rule of Five"], | 120 # ["L5", "Lipinski Rule of Five"], |
112 'logp': ["logP","octanol/water partition coefficient"], | 121 'logp': ["logP", "octanol/water partition coefficient"], |
113 'mr': ["MR","molar refractivity"], | 122 'mr': ["MR", "molar refractivity"], |
114 'molwt': ["MW","Molecular Weight filter"], | 123 'molwt': ["MW", "Molecular Weight filter"], |
115 #["nF","Number of Fluorine Atoms"], | 124 # ["nF", "Number of Fluorine Atoms"], |
116 #["s","SMARTS filter"], | 125 # ["s", "SMARTS filter"], |
117 #["sbonds","Number of single bonds"], | 126 # ["sbonds", "Number of single bonds"], |
118 #["smarts","SMARTS filter"], | 127 # ["smarts", "SMARTS filter"], |
119 #["tbonds","Number of triple bonds"], | 128 # ["tbonds", "Number of triple bonds"], |
120 #["title","For comparing a molecule's title"], | 129 # ["title", "For comparing a molecule's title"], |
121 'psa': ["TPSA","topological polar surface area"], | 130 'psa': ["TPSA", "topological polar surface area"], |
122 'rotbonds' : ['ROTATABLE_BOND', 'rotatable bonds'], | 131 'rotbonds': ['ROTATABLE_BOND', 'rotatable bonds'], |
123 } | 132 } |
124 | 133 |
125 | 134 |
126 def print_output(args, rows): | 135 def print_output(args, rows): |
127 if args.oformat == 'table': | 136 if args.oformat == 'table': |
128 outfile = open(args.output, 'w') | 137 outfile = open(args.output, 'w') |
129 requested_fields = (filter(lambda x: x not in ["[", "]", "'"], args.fetch)).split(', ') | 138 requested_fields = (filter(lambda x: x not in ["[", "]", "'"], args.fetch)).split(', ') |
130 if args.header: | 139 if args.header: |
131 outfile.write( 'Identifier\t' + '\t'.join( [ColumnNames[key] for key in requested_fields] ) + '\n' ) | 140 outfile.write('Identifier\t' + '\t'.join([ColumnNames[key] for key in requested_fields]) + '\n') |
132 for row in rows: | 141 for row in rows: |
133 outfile.write( row['synonym'] + '\t' + '\t'.join( [str(row[key]) for key in requested_fields] ) + '\n' ) | 142 outfile.write(row['synonym'] + '\t' + '\t'.join([str(row[key]) for key in requested_fields]) + '\n') |
134 | 143 |
135 elif args.oformat in ['sdf', 'mol2']: | 144 elif args.oformat in ['sdf', 'mol2']: |
136 outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True) | 145 outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True) |
137 for row in rows: | 146 for row in rows: |
138 try: | 147 try: |
139 mol = pybel.readstring('sdf', row['mol']) | 148 mol = pybel.readstring('sdf', row['mol']) |
140 if args.oformat == 'sdf': | 149 if args.oformat == 'sdf': |
141 keys = filter(lambda x: x not in ["[", "]", "'"], args.fetch).split(', ') | 150 keys = filter(lambda x: x not in ["[", "]", "'"], args.fetch).split(', ') |
142 mol.data.update( { ColumnNames['synonym'] : row['synonym'] } ) | 151 mol.data.update({ColumnNames['synonym']: row['synonym']}) |
143 if 'inchi_key' in keys: | 152 if 'inchi_key' in keys: |
144 keys = (', '.join(keys).replace( "inchi_key", "inchi_key_first, inchi_key_last" )).split(', ') | 153 keys = (', '.join(keys).replace("inchi_key", "inchi_key_first, inchi_key_last")).split(', ') |
145 [ mol.data.update( { ColumnNames[key] : row[key] } ) for key in keys if key] | 154 [mol.data.update({ColumnNames[key]: row[key]}) for key in keys if key] |
146 outfile.write(mol) | 155 outfile.write(mol) |
147 except: | 156 except OSError: |
148 pass | 157 pass |
149 else: | 158 else: |
150 outfile = open(args.output, 'w') | 159 outfile = open(args.output, 'w') |
151 outfile.write( '\n'.join( [ '%s\t%s' % (row[args.oformat], row['synonym'] ) for row in rows ] ) ) | 160 outfile.write('\n'.join(['%s\t%s' % (row[args.oformat], row['synonym']) for row in rows])) |
152 outfile.close() | 161 outfile.close() |
162 | |
153 | 163 |
154 def pybel_stop_logging(): | 164 def pybel_stop_logging(): |
155 openbabel.obErrorLog.StopLogging() | 165 openbabel.obErrorLog.StopLogging() |
156 | 166 |
167 | |
157 def get_properties_ext(mol): | 168 def get_properties_ext(mol): |
158 | |
159 HBD = pybel.Smarts("[!#6;!H0]") | 169 HBD = pybel.Smarts("[!#6;!H0]") |
160 HBA = pybel.Smarts("[$([$([#8,#16]);!$(*=N~O);" + | 170 HBA = pybel.Smarts(("[$([$([#8,#16]);!$(*=N~O);" |
161 "!$(*~N=O);X1,X2]),$([#7;v3;" + | 171 "!$(*~N=O);X1,X2]),$([#7;v3;" |
162 "!$([nH]);!$(*(-a)-a)])]" | 172 "!$([nH]);!$(*(-a)-a)])]" |
163 ) | 173 )) |
164 calc_desc_dict = mol.calcdesc() | 174 calc_desc_dict = mol.calcdesc() |
165 | 175 |
166 try: | 176 try: |
167 logp = calc_desc_dict['logP'] | 177 logp = calc_desc_dict['logP'] |
168 except: | 178 except KeyError: |
169 logp = calc_desc_dict['LogP'] | 179 logp = calc_desc_dict['LogP'] |
170 | 180 |
171 return {"molwt": mol.molwt, | 181 return {"molwt": mol.molwt, |
172 "logp": logp, | 182 "logp": logp, |
173 "donors": len(HBD.findall(mol)), | 183 "donors": len(HBD.findall(mol)), |
174 "acceptors": len(HBA.findall(mol)), | 184 "acceptors": len(HBA.findall(mol)), |
175 "psa": calc_desc_dict['TPSA'], | 185 "psa": calc_desc_dict['TPSA'], |
176 "mr": calc_desc_dict['MR'], | 186 "mr": calc_desc_dict['MR'], |
177 "rotbonds": mol.OBMol.NumRotors(), | 187 "rotbonds": mol.OBMol.NumRotors(), |
178 "can": mol.write("can").split()[0].strip(), ### tthis one works fine for both zinc and chembl (no ZINC code added after can descriptor string) | 188 "can": mol.write("can").split()[0].strip(), # tthis one works fine for both zinc and chembl (no ZINC code added after can descriptor string) |
179 "inchi": mol.write("inchi").strip(), | 189 "inchi": mol.write("inchi").strip(), |
180 "inchi_key": get_inchikey(mol).strip(), | 190 "inchi_key": get_inchikey(mol).strip(), |
181 "rings": len(mol.sssr), | 191 "rings": len(mol.sssr), |
182 "atoms": mol.OBMol.NumHvyAtoms(), | 192 "atoms": mol.OBMol.NumHvyAtoms(), |
183 "spectrophore" : OBspectrophore(mol), | 193 "spectrophore": OBspectrophore(mol), |
184 } | 194 } |
195 | |
185 | 196 |
186 def get_inchikey(mol): | 197 def get_inchikey(mol): |
187 conv = openbabel.OBConversion() | 198 conv = openbabel.OBConversion() |
188 conv.SetInAndOutFormats("mol", "inchi") | 199 conv.SetInAndOutFormats("mol", "inchi") |
189 conv.SetOptions("K", conv.OUTOPTIONS) | 200 conv.SetOptions("K", conv.OUTOPTIONS) |
190 inchikey = conv.WriteString( mol.OBMol ) | 201 inchikey = conv.WriteString(mol.OBMol) |
191 return inchikey | 202 return inchikey |
203 | |
192 | 204 |
193 def OBspectrophore(mol): | 205 def OBspectrophore(mol): |
194 spectrophore = pybel.ob.OBSpectrophore() | 206 spectrophore = pybel.ob.OBSpectrophore() |
195 # Parameters: rotation angle = 20, normalization for mean and sd, accuracy = 3.0 A and non-stereospecific cages. | 207 # Parameters: rotation angle = 20, normalization for mean and sd, accuracy = 3.0 A and non-stereospecific cages. |
196 spectrophore.SetNormalization( spectrophore.NormalizationTowardsZeroMeanAndUnitStd ) | 208 spectrophore.SetNormalization(spectrophore.NormalizationTowardsZeroMeanAndUnitStd) |
197 return ', '.join( [ "%.3f" % value for value in spectrophore.GetSpectrophore( mol.OBMol ) ] ) | 209 return ', '.join(["%.3f" % value for value in spectrophore.GetSpectrophore(mol.OBMol)]) |
198 | 210 |
199 def squared_euclidean_distance(a, b): | 211 |
200 try: | 212 def split_library(lib_path, lib_format='sdf', package_size=None): |
201 return ((np.asarray( a ) - np.asarray( b ))**2).sum() | 213 """ |
202 except ValueError: | 214 Split a library of compounds. Usage: split_library(lib_path, lib_format, package_size) |
203 return 0 | 215 IT currently ONLY WORKS FOR SD-Files |
204 | |
205 def split_library( lib_path, lib_format = 'sdf', package_size = None ): | |
206 """ | |
207 Split a library of compounds. Usage: split_library( lib_path, lib_format, package_size ) | |
208 IT currently ONLY WORKS FOR SD-Files | |
209 """ | 216 """ |
210 pack = 1 | 217 pack = 1 |
211 mol_counter = 0 | 218 mol_counter = 0 |
212 | 219 |
213 outfile = open('/%s/%s_pack_%i.%s' % ( '/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w' ) | 220 outfile = open('/%s/%s_pack_%i.%s' % ('/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w') |
214 | 221 |
215 for line in open(lib_path, 'r'): | 222 for line in open(lib_path, 'r'): |
216 outfile.write( line ) | 223 outfile.write(line) |
217 if line.strip() == '$$$$': | 224 if line.strip() == '$$$$': |
218 mol_counter += 1 | 225 mol_counter += 1 |
219 if mol_counter % package_size == 0: | 226 if mol_counter % package_size == 0: |
220 outfile.close() | 227 outfile.close() |
221 pack += 1 | 228 pack += 1 |
222 outfile = open('/%s/%s_pack_%i.%s' % ( '/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w' ) | 229 outfile = open('/%s/%s_pack_%i.%s' % ('/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w') |
223 if mol_counter*10 % package_size == 0: | 230 if mol_counter * 10 % package_size == 0: |
224 print('%i molecules parsed, starting pack nr. %i' % ( mol_counter, pack - 1 )) | 231 print('%i molecules parsed, starting pack nr. %i' % (mol_counter, pack - 1)) |
225 outfile.close() | 232 outfile.close() |
226 | 233 |
227 return True | 234 return True |
228 | 235 |
229 def split_smi_library( smiles_file, structures_in_one_file ): | 236 |
230 """ | 237 def split_smi_library(smiles_file, structures_in_one_file): |
231 Split a file with SMILES to several files for multiprocessing usage. | 238 """ |
232 Usage: split_smi_library( smiles_file, 10 ) | 239 Split a file with SMILES to several files for multiprocessing usage. |
240 Usage: split_smi_library(smiles_file, 10) | |
233 """ | 241 """ |
234 output_files = [] | 242 output_files = [] |
235 tfile = tempfile.NamedTemporaryFile(delete=False) | 243 tfile = tempfile.NamedTemporaryFile(delete=False) |
236 | 244 |
237 smiles_handle = open(smiles_file, 'r') | 245 smiles_handle = open(smiles_file, 'r') |
238 for count, line in enumerate( smiles_handle ): | 246 for count, line in enumerate(smiles_handle): |
239 if count % structures_in_one_file == 0 and count != 0: | 247 if count % structures_in_one_file == 0 and count != 0: |
240 tfile.close() | 248 tfile.close() |
241 output_files.append(tfile.name) | 249 output_files.append(tfile.name) |
242 tfile = tempfile.NamedTemporaryFile(delete=False) | 250 tfile = tempfile.NamedTemporaryFile(delete=False) |
243 tfile.write(line) | 251 tfile.write(line) |
245 output_files.append(tfile.name) | 253 output_files.append(tfile.name) |
246 smiles_handle.close() | 254 smiles_handle.close() |
247 return output_files | 255 return output_files |
248 | 256 |
249 | 257 |
250 def mp_run(input_path, regex, PROCESSES, function_to_call ): | 258 def mp_run(input_path, regex, PROCESSES, function_to_call): |
251 paths = [] | 259 paths = [] |
252 [ paths.append(compound_file) for compound_file in glob.glob(str(input_path) + str(regex)) ] | 260 [paths.append(compound_file) for compound_file in glob.glob(str(input_path) + str(regex))] |
253 paths.sort() | 261 paths.sort() |
254 | 262 |
255 pool = Pool(processes=PROCESSES) | 263 pool = Pool(processes=PROCESSES) |
256 print('Process initialized with', PROCESSES, 'processors') | 264 print('Process initialized with', PROCESSES, 'processors') |
257 result = pool.map_async(function_to_call, paths) | 265 result = pool.map_async(function_to_call, paths) |
258 result.get() | 266 result.get() |
259 | 267 |
260 return paths | 268 return paths |
261 | 269 |
270 | |
262 if __name__ == '__main__': | 271 if __name__ == '__main__': |
263 print(check_filetype(sys.argv[1])) | 272 print(check_filetype(sys.argv[1])) |
264 |