Mercurial > repos > bgruening > openbabel_structure_distance_finder
comparison cheminfolib.py @ 0:c066b5accacf draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/openbabel commit 6c84abdd07f292048bf2194073e2e938e94158c4"
| author | bgruening |
|---|---|
| date | Wed, 25 Mar 2020 16:47:13 -0400 |
| parents | |
| children | 4c9d6b47045c |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c066b5accacf |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 Small library with cheminformatic functions based on openbabel and pgchem. | |
| 4 Copyright 2012, Bjoern Gruening and Xavier Lucas | |
| 5 """ | |
| 6 | |
| 7 import os, sys | |
| 8 | |
| 9 try: | |
| 10 from galaxy import eggs | |
| 11 eggs.require('psycopg2') | |
| 12 except: | |
| 13 print('psycopg2 is not available. It is currently used in the pgchem wrappers, that are not shipped with default CTB') | |
| 14 | |
| 15 try: | |
| 16 import pybel | |
| 17 import openbabel | |
| 18 except: | |
| 19 print('OpenBabel could not be found. A few functions are not available without OpenBabel.') | |
| 20 | |
| 21 from multiprocessing import Pool | |
| 22 import glob, tempfile, re | |
| 23 import subprocess | |
| 24 | |
| 25 def CountLines( path ): | |
| 26 out = subprocess.Popen(['wc', '-l', path], | |
| 27 stdout=subprocess.PIPE, | |
| 28 stderr=subprocess.STDOUT | |
| 29 ).communicate()[0] | |
| 30 return int(out.partition(b' ')[0]) | |
| 31 | |
| 32 def grep(pattern, file_obj): | |
| 33 grepper = re.compile(pattern) | |
| 34 for line in file_obj: | |
| 35 if grepper.search(line): | |
| 36 return True | |
| 37 return False | |
| 38 | |
| 39 def check_filetype(filepath): | |
| 40 mol = False | |
| 41 possible_inchi = True | |
| 42 for line_counter, line in enumerate(open(filepath)): | |
| 43 if line_counter > 10000: | |
| 44 break | |
| 45 if line.find('$$$$') != -1: | |
| 46 return 'sdf' | |
| 47 elif line.find('@<TRIPOS>MOLECULE') != -1: | |
| 48 return 'mol2' | |
| 49 elif line.find('ligand id') != -1: | |
| 50 return 'drf' | |
| 51 elif possible_inchi and re.findall('^InChI=', line): | |
| 52 return 'inchi' | |
| 53 elif re.findall('^M\s+END', line): | |
| 54 mol = True | |
| 55 # first line is not an InChI, so it can't be an InChI file | |
| 56 possible_inchi = False | |
| 57 | |
| 58 if mol: | |
| 59 # END can occures before $$$$, so and SDF file will | |
| 60 # be recognised as mol, if you not using this hack' | |
| 61 return 'mol' | |
| 62 return 'smi' | |
| 63 | |
| 64 def db_connect(args): | |
| 65 try: | |
| 66 db_conn = psycopg2.connect("dbname=%s user=%s host=%s password=%s" % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd)); | |
| 67 return db_conn | |
| 68 except: | |
| 69 sys.exit('Unable to connect to the db') | |
| 70 | |
| 71 ColumnNames = { | |
| 72 'can_smiles' : 'Canonical SMILES', | |
| 73 'can' : 'Canonical SMILES', | |
| 74 'inchi' : 'InChI', | |
| 75 'inchi_key' : 'InChI key', | |
| 76 'inchi_key_first' : 'InChI key first', | |
| 77 'inchi_key_last' : 'InChI key last', | |
| 78 'molwt' : 'Molecular weight', | |
| 79 'hbd' : 'Hydrogen-bond donors', | |
| 80 'donors' : 'Hydrogen-bond donors', | |
| 81 'hba' : 'Hydrogen-bond acceptors', | |
| 82 'acceptors' : 'Hydrogen-bond acceptors', | |
| 83 'rotbonds' : 'Rotatable bonds', | |
| 84 'logp' : 'logP', | |
| 85 'psa' : 'Polar surface area', | |
| 86 'mr' : 'Molecular refractivity', | |
| 87 'atoms' : 'Number of heavy atoms', | |
| 88 'rings' : 'Number of rings', | |
| 89 'set_bits' : 'FP2 bits', | |
| 90 'id' : 'Internal identifier', | |
| 91 'tani' : 'Tanimoto coefficient', | |
| 92 'spectrophore' : 'Spectrophores(TM)', | |
| 93 'dist_spectrophore' : 'Spectrophores(TM) distance to target', | |
| 94 'synonym' : 'Entry id', | |
| 95 } | |
| 96 | |
| 97 OBDescriptor = { | |
| 98 'atoms': ["atoms","Number of atoms"], | |
| 99 'hatoms': ["hatoms","Number of heavy atoms"], # self defined tag hatoms in plugindefines.txt | |
| 100 'can_smiles' : ["cansmi","Canonical SMILES"], | |
| 101 'can_smilesNS' : ["cansmiNS","Canonical SMILES without isotopes or stereo"], | |
| 102 #["abonds","Number of aromatic bonds"], | |
| 103 #["bonds","Number of bonds"], | |
| 104 #["dbonds","Number of double bonds"], | |
| 105 #["formula","Chemical formula"], | |
| 106 'hba': ["HBA1","Number of Hydrogen Bond Acceptors 1 (JoelLib)"], | |
| 107 'hba2': ["HBA2","Number of Hydrogen Bond Acceptors 2 (JoelLib)"], | |
| 108 'hbd': ["HBD","Number of Hydrogen Bond Donors (JoelLib)"], | |
| 109 'inchi': ["InChI","IUPAC InChI identifier"], | |
| 110 'inchi_key': ["InChIKey","InChIKey"], | |
| 111 #["L5","Lipinski Rule of Five"], | |
| 112 'logp': ["logP","octanol/water partition coefficient"], | |
| 113 'mr': ["MR","molar refractivity"], | |
| 114 'molwt': ["MW","Molecular Weight filter"], | |
| 115 #["nF","Number of Fluorine Atoms"], | |
| 116 #["s","SMARTS filter"], | |
| 117 #["sbonds","Number of single bonds"], | |
| 118 #["smarts","SMARTS filter"], | |
| 119 #["tbonds","Number of triple bonds"], | |
| 120 #["title","For comparing a molecule's title"], | |
| 121 'psa': ["TPSA","topological polar surface area"], | |
| 122 'rotbonds' : ['ROTATABLE_BOND', 'rotatable bonds'], | |
| 123 } | |
| 124 | |
| 125 | |
| 126 def print_output(args, rows): | |
| 127 if args.oformat == 'table': | |
| 128 outfile = open(args.output, 'w') | |
| 129 requested_fields = (filter(lambda x: x not in ["[", "]", "'"], args.fetch)).split(', ') | |
| 130 if args.header: | |
| 131 outfile.write( 'Identifier\t' + '\t'.join( [ColumnNames[key] for key in requested_fields] ) + '\n' ) | |
| 132 for row in rows: | |
| 133 outfile.write( row['synonym'] + '\t' + '\t'.join( [str(row[key]) for key in requested_fields] ) + '\n' ) | |
| 134 | |
| 135 elif args.oformat in ['sdf', 'mol2']: | |
| 136 outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True) | |
| 137 for row in rows: | |
| 138 try: | |
| 139 mol = pybel.readstring('sdf', row['mol']) | |
| 140 if args.oformat == 'sdf': | |
| 141 keys = filter(lambda x: x not in ["[", "]", "'"], args.fetch).split(', ') | |
| 142 mol.data.update( { ColumnNames['synonym'] : row['synonym'] } ) | |
| 143 if 'inchi_key' in keys: | |
| 144 keys = (', '.join(keys).replace( "inchi_key", "inchi_key_first, inchi_key_last" )).split(', ') | |
| 145 [ mol.data.update( { ColumnNames[key] : row[key] } ) for key in keys if key] | |
| 146 outfile.write(mol) | |
| 147 except: | |
| 148 pass | |
| 149 else: | |
| 150 outfile = open(args.output, 'w') | |
| 151 outfile.write( '\n'.join( [ '%s\t%s' % (row[args.oformat], row['synonym'] ) for row in rows ] ) ) | |
| 152 outfile.close() | |
| 153 | |
| 154 def pybel_stop_logging(): | |
| 155 openbabel.obErrorLog.StopLogging() | |
| 156 | |
| 157 def get_properties_ext(mol): | |
| 158 | |
| 159 HBD = pybel.Smarts("[!#6;!H0]") | |
| 160 HBA = pybel.Smarts("[$([$([#8,#16]);!$(*=N~O);" + | |
| 161 "!$(*~N=O);X1,X2]),$([#7;v3;" + | |
| 162 "!$([nH]);!$(*(-a)-a)])]" | |
| 163 ) | |
| 164 calc_desc_dict = mol.calcdesc() | |
| 165 | |
| 166 try: | |
| 167 logp = calc_desc_dict['logP'] | |
| 168 except: | |
| 169 logp = calc_desc_dict['LogP'] | |
| 170 | |
| 171 return {"molwt": mol.molwt, | |
| 172 "logp": logp, | |
| 173 "donors": len(HBD.findall(mol)), | |
| 174 "acceptors": len(HBA.findall(mol)), | |
| 175 "psa": calc_desc_dict['TPSA'], | |
| 176 "mr": calc_desc_dict['MR'], | |
| 177 "rotbonds": mol.OBMol.NumRotors(), | |
| 178 "can": mol.write("can").split()[0].strip(), ### tthis one works fine for both zinc and chembl (no ZINC code added after can descriptor string) | |
| 179 "inchi": mol.write("inchi").strip(), | |
| 180 "inchi_key": get_inchikey(mol).strip(), | |
| 181 "rings": len(mol.sssr), | |
| 182 "atoms": mol.OBMol.NumHvyAtoms(), | |
| 183 "spectrophore" : OBspectrophore(mol), | |
| 184 } | |
| 185 | |
| 186 def get_inchikey(mol): | |
| 187 conv = openbabel.OBConversion() | |
| 188 conv.SetInAndOutFormats("mol", "inchi") | |
| 189 conv.SetOptions("K", conv.OUTOPTIONS) | |
| 190 inchikey = conv.WriteString( mol.OBMol ) | |
| 191 return inchikey | |
| 192 | |
| 193 def OBspectrophore(mol): | |
| 194 spectrophore = pybel.ob.OBSpectrophore() | |
| 195 # Parameters: rotation angle = 20, normalization for mean and sd, accuracy = 3.0 A and non-stereospecific cages. | |
| 196 spectrophore.SetNormalization( spectrophore.NormalizationTowardsZeroMeanAndUnitStd ) | |
| 197 return ', '.join( [ "%.3f" % value for value in spectrophore.GetSpectrophore( mol.OBMol ) ] ) | |
| 198 | |
| 199 def squared_euclidean_distance(a, b): | |
| 200 try: | |
| 201 return ((np.asarray( a ) - np.asarray( b ))**2).sum() | |
| 202 except ValueError: | |
| 203 return 0 | |
| 204 | |
| 205 def split_library( lib_path, lib_format = 'sdf', package_size = None ): | |
| 206 """ | |
| 207 Split a library of compounds. Usage: split_library( lib_path, lib_format, package_size ) | |
| 208 IT currently ONLY WORKS FOR SD-Files | |
| 209 """ | |
| 210 pack = 1 | |
| 211 mol_counter = 0 | |
| 212 | |
| 213 outfile = open('/%s/%s_pack_%i.%s' % ( '/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w' ) | |
| 214 | |
| 215 for line in open(lib_path, 'r'): | |
| 216 outfile.write( line ) | |
| 217 if line.strip() == '$$$$': | |
| 218 mol_counter += 1 | |
| 219 if mol_counter % package_size == 0: | |
| 220 outfile.close() | |
| 221 pack += 1 | |
| 222 outfile = open('/%s/%s_pack_%i.%s' % ( '/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w' ) | |
| 223 if mol_counter*10 % package_size == 0: | |
| 224 print('%i molecules parsed, starting pack nr. %i' % ( mol_counter, pack - 1 )) | |
| 225 outfile.close() | |
| 226 | |
| 227 return True | |
| 228 | |
| 229 def split_smi_library( smiles_file, structures_in_one_file ): | |
| 230 """ | |
| 231 Split a file with SMILES to several files for multiprocessing usage. | |
| 232 Usage: split_smi_library( smiles_file, 10 ) | |
| 233 """ | |
| 234 output_files = [] | |
| 235 tfile = tempfile.NamedTemporaryFile(delete=False) | |
| 236 | |
| 237 smiles_handle = open(smiles_file, 'r') | |
| 238 for count, line in enumerate( smiles_handle ): | |
| 239 if count % structures_in_one_file == 0 and count != 0: | |
| 240 tfile.close() | |
| 241 output_files.append(tfile.name) | |
| 242 tfile = tempfile.NamedTemporaryFile(delete=False) | |
| 243 tfile.write(line) | |
| 244 tfile.close() | |
| 245 output_files.append(tfile.name) | |
| 246 smiles_handle.close() | |
| 247 return output_files | |
| 248 | |
| 249 | |
| 250 def mp_run(input_path, regex, PROCESSES, function_to_call ): | |
| 251 paths = [] | |
| 252 [ paths.append(compound_file) for compound_file in glob.glob(str(input_path) + str(regex)) ] | |
| 253 paths.sort() | |
| 254 | |
| 255 pool = Pool(processes=PROCESSES) | |
| 256 print('Process initialized with', PROCESSES, 'processors') | |
| 257 result = pool.map_async(function_to_call, paths) | |
| 258 result.get() | |
| 259 | |
| 260 return paths | |
| 261 | |
| 262 if __name__ == '__main__': | |
| 263 print(check_filetype(sys.argv[1])) | |
| 264 |
