comparison cheminfolib.py @ 0:2704d4017b13 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/openbabel commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
author bgruening
date Sat, 20 May 2017 08:39:53 -0400
parents
children aebc671bae78
comparison
equal deleted inserted replaced
-1:000000000000 0:2704d4017b13
1 #!/usr/bin/env python
2 """
3 Small library with cheminformatic functions based on openbabel and pgchem.
4 Copyright 2012, Bjoern Gruening and Xavier Lucas
5 """
6
7 import os, sys
8
9 try:
10 from galaxy import eggs
11 eggs.require('psycopg2')
12 except:
13 print('psycopg2 is not available. It is currently used in the pgchem wrappers, that are not shipped with default CTB')
14
15 try:
16 import pybel
17 import openbabel
18 except:
19 print('OpenBabel could not be found. A few functions are not available without OpenBabel.')
20
21 from multiprocessing import Pool
22 import glob, tempfile, re
23 import subprocess
24
def CountLines( path ):
    """
    Return the number of newline characters in the file at ``path``.

    This follows the counting rule of ``wc -l`` (a final line without a
    trailing newline is not counted), but is done in pure Python. The
    previous implementation parsed the output of the external ``wc`` tool,
    which fails on platforms that pad the count with leading blanks and
    requires ``wc`` to be installed.

    Raises IOError/OSError if ``path`` cannot be opened.
    """
    newline_count = 0
    with open(path, 'rb') as handle:
        # Read fixed-size binary chunks so huge libraries never have to be
        # held in memory and no text decoding is needed.
        for chunk in iter(lambda: handle.read(1 << 16), b''):
            newline_count += chunk.count(b'\n')
    return newline_count
31
def grep(pattern, file_obj):
    """
    Tell whether any line of ``file_obj`` matches the regular expression
    ``pattern``.

    ``file_obj`` can be any iterable of strings; iteration stops at the
    first matching line. Returns True on a match, otherwise False.
    """
    matches = re.compile(pattern).search
    return any(matches(text) for text in file_obj)
38
def check_filetype(filepath):
    """
    Guess the chemical file format of ``filepath`` from its content.

    At most the first 10001 lines are inspected. Returns one of:
      'sdf'   - a line contains '$$$$'
      'mol2'  - a line contains '@<TRIPOS>MOLECULE'
      'drf'   - a line contains 'ligand id'
      'inchi' - the very first line starts with 'InChI='
      'mol'   - an 'M  END' line was seen but none of the markers above
      'smi'   - fallback when nothing else matched (also for empty files)
    """
    inchi_start = re.compile(r'^InChI=')
    mol_end = re.compile(r'^M\s+END')
    saw_mol_end = False

    # The context manager closes the handle on every exit path, including
    # the early returns below.
    with open(filepath) as handle:
        for line_counter, line in enumerate(handle):
            if line_counter > 10000:
                break
            if '$$$$' in line:
                return 'sdf'
            if '@<TRIPOS>MOLECULE' in line:
                return 'mol2'
            if 'ligand id' in line:
                return 'drf'
            # Only the first line may identify an InChI file; if it is not
            # an InChI, the file cannot be an InChI file.
            if line_counter == 0 and inchi_start.search(line):
                return 'inchi'
            if mol_end.search(line):
                # 'M  END' can occur before '$$$$', so do not return here:
                # an SD-file would otherwise be recognised as a mol file.
                saw_mol_end = True

    return 'mol' if saw_mol_end else 'smi'
63
def db_connect(args):
    """
    Open a PostgreSQL connection described by ``args`` and return it.

    ``args`` must provide the attributes ``dbname``, ``dbuser``, ``dbhost``
    and ``dbpasswd``. The process is terminated through ``sys.exit`` when
    psycopg2 is not importable or the connection cannot be established.
    """
    # psycopg2 is imported here because the module never binds that name at
    # top level; without this import every call ended in a NameError that
    # was reported as a connection failure.
    try:
        import psycopg2
    except ImportError:
        sys.exit('Unable to connect to the db: the psycopg2 module is not installed')

    try:
        # Keyword arguments instead of a hand-formatted DSN string, so that
        # values containing blanks or quotes (e.g. passwords) are passed
        # through intact.
        return psycopg2.connect(
            dbname=args.dbname,
            user=args.dbuser,
            host=args.dbhost,
            password=args.dbpasswd,
        )
    except Exception:
        sys.exit('Unable to connect to the db')
70
# Human-readable labels for the property keys used in this module; they are
# looked up when writing table headers and SD-file data fields.
ColumnNames = {
    # structure identifiers
    'can_smiles': 'Canonical SMILES',
    'can': 'Canonical SMILES',
    'inchi': 'InChI',
    'inchi_key': 'InChI key',
    'inchi_key_first': 'InChI key first',
    'inchi_key_last': 'InChI key last',
    # physico-chemical properties
    'molwt': 'Molecular weight',
    'hbd': 'Hydrogen-bond donors',
    'donors': 'Hydrogen-bond donors',
    'hba': 'Hydrogen-bond acceptors',
    'acceptors': 'Hydrogen-bond acceptors',
    'rotbonds': 'Rotatable bonds',
    'logp': 'logP',
    'psa': 'Polar surface area',
    'mr': 'Molecular refractivity',
    'atoms': 'Number of heavy atoms',
    'rings': 'Number of rings',
    # fingerprints, similarity and bookkeeping
    'set_bits': 'FP2 bits',
    'id': 'Internal identifier',
    'tani': 'Tanimoto coefficient',
    'spectrophore': 'Spectrophores(TM)',
    'dist_spectrophore': 'Spectrophores(TM) distance to target',
    'synonym': 'Entry id',
}
96
# Maps a property key to a two-element list: [descriptor name, description].
# Descriptors listed by the original author but not enabled here:
#   abonds (aromatic bonds), bonds, dbonds (double bonds), formula,
#   L5 (Lipinski Rule of Five), nF (fluorine atoms), s / smarts (SMARTS
#   filter), sbonds (single bonds), tbonds (triple bonds), title.
OBDescriptor = {
    'atoms': ['atoms', 'Number of atoms'],
    # self defined tag hatoms in plugindefines.txt
    'hatoms': ['hatoms', 'Number of heavy atoms'],
    'can_smiles': ['cansmi', 'Canonical SMILES'],
    'can_smilesNS': ['cansmiNS', 'Canonical SMILES without isotopes or stereo'],
    'hba': ['HBA1', 'Number of Hydrogen Bond Acceptors 1 (JoelLib)'],
    'hba2': ['HBA2', 'Number of Hydrogen Bond Acceptors 2 (JoelLib)'],
    'hbd': ['HBD', 'Number of Hydrogen Bond Donors (JoelLib)'],
    'inchi': ['InChI', 'IUPAC InChI identifier'],
    'inchi_key': ['InChIKey', 'InChIKey'],
    'logp': ['logP', 'octanol/water partition coefficient'],
    'mr': ['MR', 'molar refractivity'],
    'molwt': ['MW', 'Molecular Weight filter'],
    'psa': ['TPSA', 'topological polar surface area'],
    'rotbonds': ['ROTATABLE_BOND', 'rotatable bonds'],
}
124
125
def _fetch_to_keys(fetch):
    """Turn a stringified list such as "['molwt', 'logp']" into ['molwt', 'logp']."""
    # Works on Python 2 and 3; filter() on a string yields an iterator on
    # Python 3, which has no split().
    return ''.join(char for char in fetch if char not in "[]'").split(', ')


def print_output(args, rows):
    """
    Write ``rows`` to the file ``args.output`` in the format ``args.oformat``.

    args.oformat == 'table':
        tab-separated text, one line per row: the row's 'synonym' followed
        by the fields named in ``args.fetch``; a header line is written
        first when ``args.header`` is true.
    args.oformat in ('sdf', 'mol2'):
        each row's 'mol' entry is parsed as SDF and written with pybel; for
        'sdf' the synonym and the fetched fields are attached as data
        fields. Rows that cannot be processed are skipped.
    any other value:
        lines of '<row[args.oformat]>\\t<synonym>', joined by newlines
        (no trailing newline).

    ``args.fetch`` is expected to be the string form of a list of keys,
    e.g. "['molwt', 'logp']". ``rows`` is an iterable of mappings.
    """
    if args.oformat == 'table':
        requested_fields = _fetch_to_keys(args.fetch)
        outfile = open(args.output, 'w')
        try:
            if args.header:
                outfile.write('Identifier\t' + '\t'.join(ColumnNames[key] for key in requested_fields) + '\n')
            for row in rows:
                outfile.write(row['synonym'] + '\t' + '\t'.join(str(row[key]) for key in requested_fields) + '\n')
        finally:
            outfile.close()

    elif args.oformat in ('sdf', 'mol2'):
        keys = []
        if args.oformat == 'sdf':
            for key in _fetch_to_keys(args.fetch):
                if key == 'inchi_key':
                    # The InChI key is split into its two halves. Expanding
                    # per element avoids corrupting other keys that merely
                    # contain the text 'inchi_key'.
                    keys.extend(['inchi_key_first', 'inchi_key_last'])
                elif key:
                    keys.append(key)

        outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True)
        try:
            for row in rows:
                try:
                    mol = pybel.readstring('sdf', row['mol'])
                    if args.oformat == 'sdf':
                        mol.data.update({ColumnNames['synonym']: row['synonym']})
                        for key in keys:
                            mol.data.update({ColumnNames[key]: row[key]})
                    outfile.write(mol)
                except Exception:
                    # Best effort, as before: a row that cannot be parsed or
                    # annotated is skipped instead of aborting the export.
                    continue
        finally:
            outfile.close()

    else:
        outfile = open(args.output, 'w')
        try:
            outfile.write('\n'.join('%s\t%s' % (row[args.oformat], row['synonym']) for row in rows))
        finally:
            outfile.close()
153
def pybel_stop_logging():
    """Turn off Open Babel's error log by calling ``obErrorLog.StopLogging()``."""
    error_log = openbabel.obErrorLog
    error_log.StopLogging()
156
def get_properties_ext(mol):
    """
    Compute a dictionary of properties for ``mol``.

    ``mol`` is used like a pybel molecule: it must provide ``molwt``,
    ``calcdesc()``, ``write()``, ``sssr`` and the underlying ``OBMol``.

    Returned keys: molwt, logp, donors, acceptors, psa, mr, rotbonds, can,
    inchi, inchi_key, rings, atoms, spectrophore.
    """
    # SMARTS patterns whose match counts are reported as the numbers of
    # hydrogen-bond donors and acceptors.
    donor_pattern = pybel.Smarts("[!#6;!H0]")
    acceptor_pattern = pybel.Smarts("[$([$([#8,#16]);!$(*=N~O);" +
                                    "!$(*~N=O);X1,X2]),$([#7;v3;" +
                                    "!$([nH]);!$(*(-a)-a)])]"
                                    )
    calc_desc_dict = mol.calcdesc()

    # The descriptor is looked up as 'logP' first and as 'LogP' otherwise.
    # An explicit key test replaces the former bare ``except`` so unrelated
    # errors are no longer swallowed.
    if 'logP' in calc_desc_dict:
        logp = calc_desc_dict['logP']
    else:
        logp = calc_desc_dict['LogP']

    return {"molwt": mol.molwt,
            "logp": logp,
            "donors": len(donor_pattern.findall(mol)),
            "acceptors": len(acceptor_pattern.findall(mol)),
            "psa": calc_desc_dict['TPSA'],
            "mr": calc_desc_dict['MR'],
            "rotbonds": mol.OBMol.NumRotors(),
            # Keep only the first whitespace-separated token, i.e. the SMILES
            # itself without anything written after it (the original author
            # notes this works for both ZINC and ChEMBL entries).
            "can": mol.write("can").split()[0].strip(),
            "inchi": mol.write("inchi").strip(),
            "inchi_key": get_inchikey(mol).strip(),
            "rings": len(mol.sssr),
            "atoms": mol.OBMol.NumHvyAtoms(),
            "spectrophore": OBspectrophore(mol),
            }
185
def get_inchikey(mol):
    """
    Return the string produced by Open Babel's "inchi" writer with the
    output option "K" set, for the ``OBMol`` wrapped by ``mol``.

    The result is returned exactly as written by the converter; callers
    strip surrounding whitespace themselves.
    """
    converter = openbabel.OBConversion()
    converter.SetInAndOutFormats("mol", "inchi")
    converter.SetOptions("K", converter.OUTOPTIONS)
    return converter.WriteString(mol.OBMol)
192
def OBspectrophore(mol):
    """
    Return the Spectrophore of ``mol`` as a string of comma-separated
    values, each formatted with three decimals.

    Only the normalization (zero mean and unit standard deviation) is set
    explicitly. The original author lists the remaining parameters as
    rotation angle = 20, accuracy = 3.0 A and non-stereospecific cages --
    presumably the library defaults, TODO confirm.
    """
    calculator = pybel.ob.OBSpectrophore()
    calculator.SetNormalization(calculator.NormalizationTowardsZeroMeanAndUnitStd)
    values = calculator.GetSpectrophore(mol.OBMol)
    return ', '.join("%.3f" % value for value in values)
198
def squared_euclidean_distance(a, b):
    """
    Return the sum of squared element-wise differences between ``a`` and ``b``.

    Both arguments are converted with ``numpy.asarray``. When that
    conversion or the subtraction raises ValueError (for example because
    the shapes are incompatible), 0 is returned.
    """
    # Imported locally: the module never imports numpy at top level, so the
    # former reference to ``np`` raised a NameError on every call.
    import numpy as np

    try:
        return ((np.asarray( a ) - np.asarray( b ))**2).sum()
    except ValueError:
        return 0
204
def split_library( lib_path, lib_format = 'sdf', package_size = None ):
    """
    Split a library of compounds into numbered pack files.
    Usage: split_library( lib_path, lib_format, package_size )
    It currently ONLY WORKS FOR SD-files: records are delimited by '$$$$'
    lines, ``lib_format`` is ignored and packs always get the extension
    'sdf'.

    Packs are written next to the input file as '<name>_pack_<n>.sdf',
    where <name> is the input file name up to its first dot and <n> starts
    at 1. Each pack holds ``package_size`` molecules; with
    ``package_size=None`` everything goes into a single pack. As before, a
    new (possibly empty) pack is opened as soon as the previous one is full.

    Returns True. Raises ValueError for a ``package_size`` smaller than 1.
    """
    if package_size is not None and package_size < 1:
        raise ValueError('package_size must be at least 1')

    # os.path keeps relative input paths relative; the former
    # '/%s/...' formatting redirected them to the filesystem root.
    target_dir = os.path.dirname(lib_path)
    stem = os.path.basename(lib_path).split('.')[0]

    def open_pack(number):
        return open(os.path.join(target_dir, '%s_pack_%i.%s' % (stem, number, 'sdf')), 'w')

    pack = 1
    mol_counter = 0

    with open(lib_path, 'r') as library:
        outfile = open_pack(pack)
        try:
            for line in library:
                outfile.write( line )
                if line.strip() != '$$$$':
                    continue
                mol_counter += 1
                if package_size is not None and mol_counter % package_size == 0:
                    outfile.close()
                    pack += 1
                    outfile = open_pack(pack)
                    # NOTE(review): the source guarded this message with
                    # `mol_counter*10 % package_size == 0`; assuming it was
                    # nested in this branch (indentation was lost), that
                    # test is always true here -- confirm against upstream.
                    print('%i molecules parsed, starting pack nr. %i' % ( mol_counter, pack - 1 ))
        finally:
            outfile.close()

    return True
228
def split_smi_library( smiles_file, structures_in_one_file ):
    """
    Split a file with SMILES to several files for multiprocessing usage.
    Usage: split_smi_library( smiles_file, 10 )

    Every chunk of ``structures_in_one_file`` consecutive lines is written
    to its own named temporary file. The temporary files are not deleted
    automatically; removing them is up to the caller.

    Returns the list of temporary file names in input order. An empty
    input yields a single, empty temporary file.
    """
    output_files = []

    with open(smiles_file, 'r') as smiles_handle:
        # Text mode is required: the default 'w+b' mode rejects the str
        # lines read from the input on Python 3.
        tfile = tempfile.NamedTemporaryFile(mode='w', delete=False)
        try:
            for count, line in enumerate( smiles_handle ):
                if count != 0 and count % structures_in_one_file == 0:
                    tfile.close()
                    output_files.append(tfile.name)
                    tfile = tempfile.NamedTemporaryFile(mode='w', delete=False)
                tfile.write(line)
        finally:
            tfile.close()
        output_files.append(tfile.name)

    return output_files
248
249
def mp_run(input_path, regex, PROCESSES, function_to_call ):
    """
    Apply ``function_to_call`` to every matching file using a process pool.

    The files are found with ``glob.glob(str(input_path) + str(regex))``,
    so ``regex`` is a glob pattern appended to ``input_path``. The sorted
    paths are distributed over ``PROCESSES`` worker processes and the call
    blocks until all of them are processed; an exception raised in a worker
    is re-raised here.

    Returns the sorted list of processed paths (not the workers' results).
    """
    paths = sorted(glob.glob(str(input_path) + str(regex)))

    pool = Pool(processes=PROCESSES)
    try:
        print('Process initialized with', PROCESSES, 'processors')
        result = pool.map_async(function_to_call, paths)
        result.get()
    finally:
        # Shut the workers down on success and on failure; the pool was
        # previously never closed, leaving its processes behind.
        pool.close()
        pool.join()

    return paths
261
if __name__ == '__main__':
    # Command-line use: print the detected format of the file given as the
    # first argument. A missing argument gives a usage message instead of
    # an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit('Usage: %s <structure file>' % sys.argv[0])
    print(check_filetype(sys.argv[1]))
264