Mercurial > repos > bgruening > openbabel_remsmall
comparison cheminfolib.py @ 0:2704d4017b13 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/openbabel commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
author | bgruening |
---|---|
date | Sat, 20 May 2017 08:39:53 -0400 |
parents | |
children | aebc671bae78 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2704d4017b13 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Small library with cheminformatic functions based on openbabel and pgchem. | |
4 Copyright 2012, Bjoern Gruening and Xavier Lucas | |
5 """ | |
6 | |
7 import os, sys | |
8 | |
9 try: | |
10 from galaxy import eggs | |
11 eggs.require('psycopg2') | |
12 except: | |
13 print('psycopg2 is not available. It is currently used in the pgchem wrappers, that are not shipped with default CTB') | |
14 | |
15 try: | |
16 import pybel | |
17 import openbabel | |
18 except: | |
19 print('OpenBabel could not be found. A few functions are not available without OpenBabel.') | |
20 | |
21 from multiprocessing import Pool | |
22 import glob, tempfile, re | |
23 import subprocess | |
24 | |
def CountLines( path ):
    """
    Return the number of newline characters in the file at ``path``.

    The result matches what ``wc -l`` reports, but it is computed in-process
    so it does not depend on an external ``wc`` binary or on the exact layout
    of its output (BSD/macOS ``wc`` pads the count with leading blanks, which
    broke the former ``partition(b' ')`` parsing).

    Raises IOError/OSError if the file cannot be opened.
    """
    newline_count = 0
    # Binary mode: no decoding needed, and fixed-size chunks keep memory flat
    # even for very large compound libraries.
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(1024 * 1024), b''):
            newline_count += chunk.count(b'\n')
    return newline_count
31 | |
def grep(pattern, file_obj):
    """
    Return True if any line of ``file_obj`` matches the regular expression
    ``pattern`` (searched anywhere in the line), otherwise False.

    ``file_obj`` may be any iterable of strings; iteration stops at the
    first matching line.
    """
    grepper = re.compile(pattern)
    return any(grepper.search(line) for line in file_obj)
38 | |
def check_filetype(filepath):
    """
    Guess the chemical file format of ``filepath`` from its content.

    Lines are scanned until a format marker is found or the line index
    exceeds 10000. Returns one of 'sdf', 'mol2', 'drf', 'inchi', 'mol'
    or, when no marker was seen, 'smi'.
    """
    mol = False
    possible_inchi = True
    # The context manager guarantees the handle is closed on every return path.
    with open(filepath) as handle:
        for line_counter, line in enumerate(handle):
            if line_counter > 10000:
                break
            if '$$$$' in line:
                return 'sdf'
            elif '@<TRIPOS>MOLECULE' in line:
                return 'mol2'
            elif 'ligand id' in line:
                return 'drf'
            elif possible_inchi and line.startswith('InChI='):
                return 'inchi'
            elif re.search(r'^M\s+END', line):
                mol = True
            # first line is not an InChI, so it can't be an InChI file
            possible_inchi = False

    if mol:
        # 'M  END' can occur before '$$$$', so an SD file would be
        # recognised as mol if we returned as soon as 'M  END' was seen;
        # therefore the decision is deferred until the scan has finished.
        return 'mol'
    return 'smi'
63 | |
def db_connect(args):
    """
    Open a PostgreSQL connection described by ``args``.

    ``args`` must provide the attributes ``dbname``, ``dbuser``, ``dbhost``
    and ``dbpasswd``. Returns the psycopg2 connection object. If psycopg2
    is not importable or the connection cannot be established, the process
    exits with the message 'Unable to connect to the db'.
    """
    try:
        # Imported here because the module header only makes the egg
        # available (eggs.require) and never binds the name ``psycopg2``.
        import psycopg2
        # Keyword arguments instead of a hand-formatted DSN string, so that
        # values containing blanks or quotes are passed through correctly.
        return psycopg2.connect(
            dbname=args.dbname,
            user=args.dbuser,
            host=args.dbhost,
            password=args.dbpasswd,
        )
    except Exception:
        sys.exit('Unable to connect to the db')
70 | |
# Maps internal field / column keys to the human-readable labels that are
# written as table headers and as SD-file data tags (see print_output).
# Several keys are aliases that share one label (e.g. 'hbd' and 'donors').
ColumnNames = {
    'can_smiles': 'Canonical SMILES',
    'can': 'Canonical SMILES',
    'inchi': 'InChI',
    'inchi_key': 'InChI key',
    'inchi_key_first': 'InChI key first',
    'inchi_key_last': 'InChI key last',
    'molwt': 'Molecular weight',
    'hbd': 'Hydrogen-bond donors',
    'donors': 'Hydrogen-bond donors',
    'hba': 'Hydrogen-bond acceptors',
    'acceptors': 'Hydrogen-bond acceptors',
    'rotbonds': 'Rotatable bonds',
    'logp': 'logP',
    'psa': 'Polar surface area',
    'mr': 'Molecular refractivity',
    'atoms': 'Number of heavy atoms',
    'rings': 'Number of rings',
    'set_bits': 'FP2 bits',
    'id': 'Internal identifier',
    'tani': 'Tanimoto coefficient',
    'spectrophore': 'Spectrophores(TM)',
    'dist_spectrophore': 'Spectrophores(TM) distance to target',
    'synonym': 'Entry id',
}
96 | |
# Maps internal property keys to a two-element list:
# [descriptor identifier, human-readable description].
# The identifiers look like Open Babel descriptor IDs - confirm against the
# descriptor plugins of the installed Open Babel version.
# Entries that are commented out are known descriptors that are currently
# not exposed.
OBDescriptor = {
    'atoms': ["atoms", "Number of atoms"],
    # self defined tag hatoms in plugindefines.txt
    'hatoms': ["hatoms", "Number of heavy atoms"],
    'can_smiles': ["cansmi", "Canonical SMILES"],
    'can_smilesNS': ["cansmiNS", "Canonical SMILES without isotopes or stereo"],
    #["abonds","Number of aromatic bonds"],
    #["bonds","Number of bonds"],
    #["dbonds","Number of double bonds"],
    #["formula","Chemical formula"],
    'hba': ["HBA1", "Number of Hydrogen Bond Acceptors 1 (JoelLib)"],
    'hba2': ["HBA2", "Number of Hydrogen Bond Acceptors 2 (JoelLib)"],
    'hbd': ["HBD", "Number of Hydrogen Bond Donors (JoelLib)"],
    'inchi': ["InChI", "IUPAC InChI identifier"],
    'inchi_key': ["InChIKey", "InChIKey"],
    #["L5","Lipinski Rule of Five"],
    'logp': ["logP", "octanol/water partition coefficient"],
    'mr': ["MR", "molar refractivity"],
    'molwt': ["MW", "Molecular Weight filter"],
    #["nF","Number of Fluorine Atoms"],
    #["s","SMARTS filter"],
    #["sbonds","Number of single bonds"],
    #["smarts","SMARTS filter"],
    #["tbonds","Number of triple bonds"],
    #["title","For comparing a molecule's title"],
    'psa': ["TPSA", "topological polar surface area"],
    'rotbonds': ['ROTATABLE_BOND', 'rotatable bonds'],
}
124 | |
125 | |
def _parse_fetch(fetch):
    """Turn the string form of a list, e.g. "['molwt', 'logp']", into a list of field names."""
    # Built with join() because filter() returns an iterator on Python 3,
    # which has no split() method.
    return ''.join(ch for ch in fetch if ch not in "[]'").split(', ')


def _expand_inchi_key(keys):
    """Replace every exact 'inchi_key' entry by 'inchi_key_first' and 'inchi_key_last'."""
    expanded = []
    for key in keys:
        if key == 'inchi_key':
            expanded.extend(['inchi_key_first', 'inchi_key_last'])
        else:
            expanded.append(key)
    return expanded


def print_output(args, rows):
    """
    Write ``rows`` to the file ``args.output`` in the format ``args.oformat``.

    * 'table': one tab-separated line per row, starting with row['synonym']
      followed by the fields listed in ``args.fetch``; a header line is
      written first when ``args.header`` is true.
    * 'sdf' / 'mol2': each row['mol'] (an SD-format string) is parsed with
      pybel and written out; for 'sdf' the synonym and the fetched fields are
      attached as data tags. Rows that cannot be converted are skipped.
    * anything else: ``args.oformat`` is used as a row key and one
      '<value>\\t<synonym>' line is written per row (no trailing newline).

    ``args.fetch`` is expected to be the string representation of a list of
    field names. ``rows`` must be an iterable of mappings.
    """
    if args.oformat == 'table':
        requested_fields = _parse_fetch(args.fetch)
        with open(args.output, 'w') as outfile:
            if args.header:
                outfile.write( 'Identifier\t' + '\t'.join( [ColumnNames[key] for key in requested_fields] ) + '\n' )
            for row in rows:
                outfile.write( row['synonym'] + '\t' + '\t'.join( [str(row[key]) for key in requested_fields] ) + '\n' )

    elif args.oformat in ['sdf', 'mol2']:
        # The field list does not change between rows, so it is parsed once.
        # It is only needed (and only read) for SD output.
        keys = []
        if args.oformat == 'sdf':
            keys = _expand_inchi_key(_parse_fetch(args.fetch))
        outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True)
        try:
            for row in rows:
                try:
                    mol = pybel.readstring('sdf', row['mol'])
                    if args.oformat == 'sdf':
                        mol.data.update( { ColumnNames['synonym'] : row['synonym'] } )
                        for key in keys:
                            if key:
                                mol.data.update( { ColumnNames[key] : row[key] } )
                    outfile.write(mol)
                except Exception:
                    # Deliberate best effort: an entry that cannot be parsed
                    # or annotated is skipped so that the rest is still written.
                    continue
        finally:
            outfile.close()

    else:
        with open(args.output, 'w') as outfile:
            outfile.write( '\n'.join( [ '%s\t%s' % (row[args.oformat], row['synonym'] ) for row in rows ] ) )
153 | |
def pybel_stop_logging():
    """Switch off Open Babel's global error log by calling ``obErrorLog.StopLogging()``."""
    openbabel.obErrorLog.StopLogging()
156 | |
def get_properties_ext(mol):
    """
    Calculate a set of properties for the pybel molecule ``mol``.

    Returns a dict with the keys 'molwt', 'logp', 'donors', 'acceptors',
    'psa', 'mr', 'rotbonds', 'can', 'inchi', 'inchi_key', 'rings', 'atoms'
    and 'spectrophore'.
    """
    # Hydrogen-bond donors / acceptors are counted via SMARTS matches.
    HBD = pybel.Smarts("[!#6;!H0]")
    HBA = pybel.Smarts("[$([$([#8,#16]);!$(*=N~O);" +
                       "!$(*~N=O);X1,X2]),$([#7;v3;" +
                       "!$([nH]);!$(*(-a)-a)])]"
                      )
    calc_desc_dict = mol.calcdesc()

    # The descriptor is looked up as 'logP' first and as 'LogP' otherwise
    # (presumably the spelling differs between Open Babel versions - confirm).
    try:
        logp = calc_desc_dict['logP']
    except KeyError:
        logp = calc_desc_dict['LogP']

    return {"molwt": mol.molwt,
            "logp": logp,
            "donors": len(HBD.findall(mol)),
            "acceptors": len(HBA.findall(mol)),
            "psa": calc_desc_dict['TPSA'],
            "mr": calc_desc_dict['MR'],
            "rotbonds": mol.OBMol.NumRotors(),
            # Only the first whitespace-separated token is kept, so a title
            # appended after the SMILES (e.g. a ZINC code) is dropped; this
            # works for both ZINC and ChEMBL.
            "can": mol.write("can").split()[0].strip(),
            "inchi": mol.write("inchi").strip(),
            "inchi_key": get_inchikey(mol).strip(),
            "rings": len(mol.sssr),
            "atoms": mol.OBMol.NumHvyAtoms(),
            "spectrophore" : OBspectrophore(mol),
            }
185 | |
def get_inchikey(mol):
    """
    Return the string Open Babel writes for the pybel molecule ``mol`` in
    'inchi' format with the output option "K" set (the InChIKey, judging by
    the function name). The string is returned as written by
    ``OBConversion.WriteString``; callers strip it themselves.
    """
    conv = openbabel.OBConversion()
    conv.SetInAndOutFormats("mol", "inchi")
    conv.SetOptions("K", conv.OUTOPTIONS)
    return conv.WriteString( mol.OBMol )
192 | |
def OBspectrophore(mol):
    """
    Calculate the Spectrophore of the pybel molecule ``mol`` and return it as
    a single string of comma-separated values with three decimals each.
    """
    spectrophore = pybel.ob.OBSpectrophore()
    # Only the normalization (towards zero mean and unit standard deviation)
    # is set explicitly. All other parameters are left at the library
    # defaults - presumably rotation angle = 20, accuracy = 3.0 A and
    # non-stereospecific cages; TODO confirm against the Open Babel docs.
    spectrophore.SetNormalization( spectrophore.NormalizationTowardsZeroMeanAndUnitStd )
    return ', '.join( [ "%.3f" % value for value in spectrophore.GetSpectrophore( mol.OBMol ) ] )
198 | |
def squared_euclidean_distance(a, b):
    """
    Return the squared Euclidean distance between the numeric sequences
    ``a`` and ``b``.

    If the element-wise subtraction raises ValueError (e.g. because the two
    sequences have incompatible lengths), 0 is returned instead.
    """
    # Imported locally: this module never imports numpy at file level, so the
    # name ``np`` would otherwise be undefined here.
    import numpy as np
    try:
        return ((np.asarray( a ) - np.asarray( b ))**2).sum()
    except ValueError:
        return 0
204 | |
def split_library( lib_path, lib_format = 'sdf', package_size = None ):
    """
    Split a library of compounds into packs of ``package_size`` molecules.
    Usage: split_library( lib_path, lib_format, package_size )

    It currently ONLY WORKS FOR SD-files; ``lib_format`` is accepted but
    not evaluated. The packs are written next to the input file as
    '<name>_pack_<n>.sdf', where <name> is the input file name up to its
    first dot and <n> starts at 1. A molecule ends at a line consisting of
    '$$$$'.

    If ``package_size`` is None, everything is written to a single pack.
    A pack file is only created once there is a line to write into it, so
    no empty trailing pack is left behind.

    Raises ValueError if ``package_size`` is given but not positive.
    Returns True.
    """
    if package_size is not None and package_size <= 0:
        raise ValueError('package_size must be a positive number of molecules')

    # os.path keeps relative input paths relative; the former hand-built
    # '/%s/...' path resolved them against the filesystem root.
    directory = os.path.dirname(lib_path)
    stem = os.path.basename(lib_path).split('.')[0]

    pack = 1
    mol_counter = 0
    outfile = None
    try:
        with open(lib_path, 'r') as library:
            for line in library:
                if outfile is None:
                    outfile = open(os.path.join(directory, '%s_pack_%i.%s' % ( stem, pack, 'sdf' )), 'w')
                outfile.write( line )
                if line.strip() == '$$$$':
                    mol_counter += 1
                    if package_size is not None and mol_counter % package_size == 0:
                        # Current pack is full: close it, the next one is
                        # opened lazily when another line arrives.
                        outfile.close()
                        outfile = None
                        pack += 1
                        print('%i molecules parsed, starting pack nr. %i' % ( mol_counter, pack - 1 ))
    finally:
        if outfile is not None:
            outfile.close()

    return True
228 | |
def split_smi_library( smiles_file, structures_in_one_file ):
    """
    Split a file with SMILES to several files for multiprocessing usage.
    Usage: split_smi_library( smiles_file, 10 )

    Every chunk of ``structures_in_one_file`` consecutive lines is written to
    its own temporary file. The temporary files are NOT deleted
    automatically; the caller owns them. Returns the list of their paths in
    input order (an empty input yields one empty file).
    """
    output_files = []
    # Text mode is required: the lines read below are str, and the default
    # binary mode of NamedTemporaryFile rejects them on Python 3.
    tfile = tempfile.NamedTemporaryFile(mode='w', delete=False)
    try:
        with open(smiles_file, 'r') as smiles_handle:
            for count, line in enumerate( smiles_handle ):
                if count % structures_in_one_file == 0 and count != 0:
                    tfile.close()
                    output_files.append(tfile.name)
                    tfile = tempfile.NamedTemporaryFile(mode='w', delete=False)
                tfile.write(line)
    finally:
        tfile.close()
    output_files.append(tfile.name)
    return output_files
248 | |
249 | |
def mp_run(input_path, regex, PROCESSES, function_to_call ):
    """
    Apply ``function_to_call`` to every matching file using a process pool.

    The files are found with ``glob.glob(str(input_path) + str(regex))``, so
    despite its name ``regex`` is a shell-style glob pattern that is appended
    directly to ``input_path`` (no path separator is inserted).
    ``PROCESSES`` is the number of worker processes. The call blocks until
    all files have been processed; exceptions raised by a worker are
    re-raised here.

    Returns the sorted list of matched paths (not the function results).
    """
    paths = sorted(glob.glob(str(input_path) + str(regex)))

    pool = Pool(processes=PROCESSES)
    try:
        print('Process initialized with', PROCESSES, 'processors')
        pool.map_async(function_to_call, paths).get()
    finally:
        # Release the worker processes even if a task failed.
        pool.close()
        pool.join()

    return paths
261 | |
if __name__ == '__main__':
    # Command-line helper: print the detected file type of the given file.
    if len(sys.argv) < 2:
        sys.exit('Usage: %s <structure file>' % sys.argv[0])
    print(check_filetype(sys.argv[1]))
264 |