openbabel_remduplicates: cheminfolib.py comparison

comparison cheminfolib.py @ 13:12aca74f07d7 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/openbabel commit 1fe240ef0064a1a4a66d9be1ccace53824280b75"

author	bgruening
date	Mon, 19 Oct 2020 14:47:33 +0000
parents	50ca8845e7f5
children	c5de6c19eb06

comparison

equal deleted inserted replaced

-:50ca8845e7f5
+:12aca74f07d7
 """
 Small library with cheminformatic functions based on openbabel and pgchem.
 Copyright 2012, Bjoern Gruening and Xavier Lucas
 """
-import os, sys
+import glob
+import re
+import subprocess
+import sys
+import tempfile
+from multiprocessing import Pool
 try:
 from galaxy import eggs
 eggs.require('psycopg2')
-except:
+except ImportError:
+psycopg2 = None
 print('psycopg2 is not available. It is currently used in the pgchem wrappers, that are not shipped with default CTB')
 try:
 from openbabel import openbabel, pybel
 openbabel.obErrorLog.StopLogging()
-except:
+except ImportError:
+openbabel, pybel = None, None
 print('OpenBabel could not be found. A few functions are not available without OpenBabel.')
-from multiprocessing import Pool
-import glob, tempfile, re
+def CountLines(path):
-import subprocess
-def CountLines( path ):
 out = subprocess.Popen(['wc', '-l', path],
 stdout=subprocess.PIPE,
 stderr=subprocess.STDOUT
 ).communicate()[0]
 return int(out.partition(b' ')[0])
 def grep(pattern, file_obj):
 grepper = re.compile(pattern)
 for line in file_obj:
 if grepper.search(line):
 return True
 return False
 def check_filetype(filepath):
 mol = False
 possible_inchi = True
 for line_counter, line in enumerate(open(filepath)):
 return 'mol2'
 elif line.find('ligand id') != -1:
 return 'drf'
 elif possible_inchi and re.findall('^InChI=', line):
 return 'inchi'
-elif re.findall('^M\s+END', line):
+elif re.findall(r'^M\s+END', line):
 mol = True
 # first line is not an InChI, so it can't be an InChI file
 possible_inchi = False
 if mol:
 # END can occures before $$$$, so and SDF file will
 # be recognised as mol, if you not using this hack'
 return 'mol'
 return 'smi'
 def db_connect(args):
 try:
-db_conn = psycopg2.connect("dbname=%s user=%s host=%s password=%s" % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd));
+db_conn = psycopg2.connect("dbname=%s user=%s host=%s password=%s" % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd))
 return db_conn
-except:
+except psycopg2.Error:
 sys.exit('Unable to connect to the db')
 ColumnNames = {
-'can_smiles' : 'Canonical SMILES',
+'can_smiles': 'Canonical SMILES',
-'can' : 'Canonical SMILES',
+'can': 'Canonical SMILES',
-'inchi' : 'InChI',
+'inchi': 'InChI',
-'inchi_key' : 'InChI key',
+'inchi_key': 'InChI key',
-'inchi_key_first' : 'InChI key first',
+'inchi_key_first': 'InChI key first',
-'inchi_key_last' : 'InChI key last',
+'inchi_key_last': 'InChI key last',
-'molwt' : 'Molecular weight',
+'molwt': 'Molecular weight',
-'hbd' : 'Hydrogen-bond donors',
+'hbd': 'Hydrogen-bond donors',
-'donors' : 'Hydrogen-bond donors',
+'donors': 'Hydrogen-bond donors',
-'hba' : 'Hydrogen-bond acceptors',
+'hba': 'Hydrogen-bond acceptors',
-'acceptors' : 'Hydrogen-bond acceptors',
+'acceptors': 'Hydrogen-bond acceptors',
-'rotbonds' : 'Rotatable bonds',
+'rotbonds': 'Rotatable bonds',
-'logp' : 'logP',
+'logp': 'logP',
-'psa' : 'Polar surface area',
+'psa': 'Polar surface area',
-'mr' : 'Molecular refractivity',
+'mr': 'Molecular refractivity',
-'atoms' : 'Number of heavy atoms',
+'atoms': 'Number of heavy atoms',
-'rings' : 'Number of rings',
+'rings': 'Number of rings',
-'set_bits' : 'FP2 bits',
+'set_bits': 'FP2 bits',
-'id' : 'Internal identifier',
+'id': 'Internal identifier',
-'tani' : 'Tanimoto coefficient',
+'tani': 'Tanimoto coefficient',
-'spectrophore' : 'Spectrophores(TM)',
+'spectrophore': 'Spectrophores(TM)',
-'dist_spectrophore' : 'Spectrophores(TM) distance to target',
+'dist_spectrophore': 'Spectrophores(TM) distance to target',
-'synonym' : 'Entry id',
+'synonym': 'Entry id',
 }
 OBDescriptor = {
-'atoms': ["atoms","Number of atoms"],
+'atoms': ["atoms", "Number of atoms"],
-'hatoms': ["hatoms","Number of heavy atoms"], # self defined tag hatoms in plugindefines.txt
+'hatoms': ["hatoms", "Number of heavy atoms"],  # self defined tag hatoms in plugindefines.txt
-'can_smiles' : ["cansmi","Canonical SMILES"],
+'can_smiles': ["cansmi", "Canonical SMILES"],
-'can_smilesNS' : ["cansmiNS","Canonical SMILES without isotopes or stereo"],
+'can_smilesNS': ["cansmiNS", "Canonical SMILES without isotopes or stereo"],
-#["abonds","Number of aromatic bonds"],
+# ["abonds", "Number of aromatic bonds"],
-#["bonds","Number of bonds"],
+# ["bonds", "Number of bonds"],
-#["dbonds","Number of double bonds"],
+# ["dbonds", "Number of double bonds"],
-#["formula","Chemical formula"],
+# ["formula", "Chemical formula"],
-'hba': ["HBA1","Number of Hydrogen Bond Acceptors 1 (JoelLib)"],
+'hba': ["HBA1", "Number of Hydrogen Bond Acceptors 1 (JoelLib)"],
-'hba2': ["HBA2","Number of Hydrogen Bond Acceptors 2 (JoelLib)"],
+'hba2': ["HBA2", "Number of Hydrogen Bond Acceptors 2 (JoelLib)"],
-'hbd': ["HBD","Number of Hydrogen Bond Donors (JoelLib)"],
+'hbd': ["HBD", "Number of Hydrogen Bond Donors (JoelLib)"],
-'inchi': ["InChI","IUPAC InChI identifier"],
+'inchi': ["InChI", "IUPAC InChI identifier"],
-'inchi_key': ["InChIKey","InChIKey"],
+'inchi_key': ["InChIKey", "InChIKey"],
-#["L5","Lipinski Rule of Five"],
+# ["L5", "Lipinski Rule of Five"],
-'logp': ["logP","octanol/water partition coefficient"],
+'logp': ["logP", "octanol/water partition coefficient"],
-'mr': ["MR","molar refractivity"],
+'mr': ["MR", "molar refractivity"],
-'molwt': ["MW","Molecular Weight filter"],
+'molwt': ["MW", "Molecular Weight filter"],
-#["nF","Number of Fluorine Atoms"],
+# ["nF", "Number of Fluorine Atoms"],
-#["s","SMARTS filter"],
+# ["s", "SMARTS filter"],
-#["sbonds","Number of single bonds"],
+# ["sbonds", "Number of single bonds"],
-#["smarts","SMARTS filter"],
+# ["smarts", "SMARTS filter"],
-#["tbonds","Number of triple bonds"],
+# ["tbonds", "Number of triple bonds"],
-#["title","For comparing a molecule's title"],
+# ["title", "For comparing a molecule's title"],
-'psa': ["TPSA","topological polar surface area"],
+'psa': ["TPSA", "topological polar surface area"],
-'rotbonds' : ['ROTATABLE_BOND', 'rotatable bonds'],
+'rotbonds': ['ROTATABLE_BOND', 'rotatable bonds'],
 }
 def print_output(args, rows):
 if args.oformat == 'table':
 outfile = open(args.output, 'w')
 requested_fields = (filter(lambda x: x not in ["[", "]", "'"], args.fetch)).split(', ')
 if args.header:
-outfile.write( 'Identifier\t' + '\t'.join( [ColumnNames[key] for key in requested_fields] ) + '\n' )
+outfile.write('Identifier\t' + '\t'.join([ColumnNames[key] for key in requested_fields]) + '\n')
 for row in rows:
-outfile.write( row['synonym'] + '\t' + '\t'.join( [str(row[key]) for key in requested_fields] ) + '\n' )
+outfile.write(row['synonym'] + '\t' + '\t'.join([str(row[key]) for key in requested_fields]) + '\n')
 elif args.oformat in ['sdf', 'mol2']:
 outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True)
 for row in rows:
 try:
 mol = pybel.readstring('sdf', row['mol'])
 if args.oformat == 'sdf':
 keys = filter(lambda x: x not in ["[", "]", "'"], args.fetch).split(', ')
-mol.data.update( { ColumnNames['synonym'] : row['synonym'] } )
+mol.data.update({ColumnNames['synonym']: row['synonym']})
 if 'inchi_key' in keys:
-keys = (', '.join(keys).replace( "inchi_key", "inchi_key_first, inchi_key_last" )).split(', ')
+keys = (', '.join(keys).replace("inchi_key", "inchi_key_first, inchi_key_last")).split(', ')
-[ mol.data.update( { ColumnNames[key] : row[key] } ) for key in keys if key]
+[mol.data.update({ColumnNames[key]: row[key]}) for key in keys if key]
 outfile.write(mol)
-except:
+except OSError:
 pass
 else:
 outfile = open(args.output, 'w')
-outfile.write( '\n'.join( [ '%s\t%s' % (row[args.oformat], row['synonym'] ) for row in rows ] ) )
+outfile.write('\n'.join(['%s\t%s' % (row[args.oformat], row['synonym']) for row in rows]))
 outfile.close()
 def pybel_stop_logging():
 openbabel.obErrorLog.StopLogging()
 def get_properties_ext(mol):
 HBD = pybel.Smarts("[!#6;!H0]")
-HBA = pybel.Smarts("[$([$([#8,#16]);!$(*=N~O);" +
+HBA = pybel.Smarts(("[$([$([#8,#16]);!$(*=N~O);"
-"!$(*~N=O);X1,X2]),$([#7;v3;" +
+"!$(*~N=O);X1,X2]),$([#7;v3;"
 "!$([nH]);!$(*(-a)-a)])]"
-)
+))
 calc_desc_dict = mol.calcdesc()
 try:
 logp = calc_desc_dict['logP']
-except:
+except KeyError:
 logp = calc_desc_dict['LogP']
 return {"molwt": mol.molwt,
 "logp": logp,
 "donors": len(HBD.findall(mol)),
 "acceptors": len(HBA.findall(mol)),
 "psa": calc_desc_dict['TPSA'],
 "mr": calc_desc_dict['MR'],
 "rotbonds": mol.OBMol.NumRotors(),
-"can": mol.write("can").split()[0].strip(), ### tthis one works fine for both zinc and chembl (no ZINC code added after can descriptor string)
+"can": mol.write("can").split()[0].strip(),  # tthis one works fine for both zinc and chembl (no ZINC code added after can descriptor string)
 "inchi": mol.write("inchi").strip(),
 "inchi_key": get_inchikey(mol).strip(),
 "rings": len(mol.sssr),
 "atoms": mol.OBMol.NumHvyAtoms(),
-"spectrophore" : OBspectrophore(mol),
+"spectrophore": OBspectrophore(mol),
 }
 def get_inchikey(mol):
 conv = openbabel.OBConversion()
 conv.SetInAndOutFormats("mol", "inchi")
 conv.SetOptions("K", conv.OUTOPTIONS)
-inchikey = conv.WriteString( mol.OBMol )
+inchikey = conv.WriteString(mol.OBMol)
 return inchikey
 def OBspectrophore(mol):
 spectrophore = pybel.ob.OBSpectrophore()
 # Parameters: rotation angle = 20, normalization for mean and sd, accuracy = 3.0 A and non-stereospecific cages.
-spectrophore.SetNormalization( spectrophore.NormalizationTowardsZeroMeanAndUnitStd )
+spectrophore.SetNormalization(spectrophore.NormalizationTowardsZeroMeanAndUnitStd)
-return ', '.join( [ "%.3f" % value for value in spectrophore.GetSpectrophore( mol.OBMol ) ] )
+return ', '.join(["%.3f" % value for value in spectrophore.GetSpectrophore(mol.OBMol)])
-def squared_euclidean_distance(a, b):
-try:
+def split_library(lib_path, lib_format='sdf', package_size=None):
-return ((np.asarray( a ) - np.asarray( b ))**2).sum()
+"""
-except ValueError:
+Split a library of compounds. Usage: split_library(lib_path, lib_format, package_size)
-return 0
+IT currently ONLY WORKS FOR SD-Files
-def split_library( lib_path, lib_format = 'sdf', package_size = None ):
-"""
-Split a library of compounds. Usage: split_library( lib_path, lib_format, package_size )
-IT currently ONLY WORKS FOR SD-Files
 """
 pack = 1
 mol_counter = 0
-outfile = open('/%s/%s_pack_%i.%s' % ( '/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w' )
+outfile = open('/%s/%s_pack_%i.%s' % ('/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w')
 for line in open(lib_path, 'r'):
-outfile.write( line )
+outfile.write(line)
 if line.strip() == '$$$$':
 mol_counter += 1
 if mol_counter % package_size == 0:
 outfile.close()
 pack += 1
-outfile = open('/%s/%s_pack_%i.%s' % ( '/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w' )
+outfile = open('/%s/%s_pack_%i.%s' % ('/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w')
-if mol_counter*10 % package_size == 0:
+if mol_counter * 10 % package_size == 0:
-print('%i molecules parsed, starting pack nr. %i' % ( mol_counter, pack - 1 ))
+print('%i molecules parsed, starting pack nr. %i' % (mol_counter, pack - 1))
 outfile.close()
 return True
-def split_smi_library( smiles_file, structures_in_one_file ):
-"""
+def split_smi_library(smiles_file, structures_in_one_file):
-Split a file with SMILES to several files for multiprocessing usage.
+"""
-Usage: split_smi_library( smiles_file, 10 )
+Split a file with SMILES to several files for multiprocessing usage.
+Usage: split_smi_library(smiles_file, 10)
 """
 output_files = []
 tfile = tempfile.NamedTemporaryFile(delete=False)
 smiles_handle = open(smiles_file, 'r')
-for count, line in enumerate( smiles_handle ):
+for count, line in enumerate(smiles_handle):
 if count % structures_in_one_file == 0 and count != 0:
 tfile.close()
 output_files.append(tfile.name)
 tfile = tempfile.NamedTemporaryFile(delete=False)
 tfile.write(line)
 output_files.append(tfile.name)
 smiles_handle.close()
 return output_files
-def mp_run(input_path, regex, PROCESSES, function_to_call ):
+def mp_run(input_path, regex, PROCESSES, function_to_call):
 paths = []
-[ paths.append(compound_file) for compound_file in glob.glob(str(input_path) + str(regex)) ]
+[paths.append(compound_file) for compound_file in glob.glob(str(input_path) + str(regex))]
 paths.sort()
 pool = Pool(processes=PROCESSES)
 print('Process initialized with', PROCESSES, 'processors')
 result = pool.map_async(function_to_call, paths)
 result.get()
 return paths
 if __name__ == '__main__':
 print(check_filetype(sys.argv[1]))

Mercurial > repos > bgruening > openbabel_remduplicates

comparison cheminfolib.py @ 13:12aca74f07d7 draft