Mercurial > repos > bgruening > openbabel_remduplicates
diff cheminfolib.py @ 13:12aca74f07d7 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/openbabel commit 1fe240ef0064a1a4a66d9be1ccace53824280b75"
author | bgruening |
---|---|
date | Mon, 19 Oct 2020 14:47:33 +0000 |
parents | 50ca8845e7f5 |
children | c5de6c19eb06 |
line wrap: on
line diff
--- a/cheminfolib.py Tue Jul 28 08:38:56 2020 -0400 +++ b/cheminfolib.py Mon Oct 19 14:47:33 2020 +0000 @@ -4,31 +4,37 @@ Copyright 2012, Bjoern Gruening and Xavier Lucas """ -import os, sys +import glob +import re +import subprocess +import sys +import tempfile +from multiprocessing import Pool + try: from galaxy import eggs eggs.require('psycopg2') -except: +except ImportError: + psycopg2 = None print('psycopg2 is not available. It is currently used in the pgchem wrappers, that are not shipped with default CTB') try: from openbabel import openbabel, pybel openbabel.obErrorLog.StopLogging() -except: +except ImportError: + openbabel, pybel = None, None print('OpenBabel could not be found. A few functions are not available without OpenBabel.') -from multiprocessing import Pool -import glob, tempfile, re -import subprocess -def CountLines( path ): +def CountLines(path): out = subprocess.Popen(['wc', '-l', path], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT - ).communicate()[0] + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT + ).communicate()[0] return int(out.partition(b' ')[0]) + def grep(pattern, file_obj): grepper = re.compile(pattern) for line in file_obj: @@ -36,6 +42,7 @@ return True return False + def check_filetype(filepath): mol = False possible_inchi = True @@ -50,76 +57,78 @@ return 'drf' elif possible_inchi and re.findall('^InChI=', line): return 'inchi' - elif re.findall('^M\s+END', line): + elif re.findall(r'^M\s+END', line): mol = True # first line is not an InChI, so it can't be an InChI file possible_inchi = False if mol: - # END can occures before $$$$, so and SDF file will + # END can occures before $$$$, so and SDF file will # be recognised as mol, if you not using this hack' return 'mol' return 'smi' + def db_connect(args): try: - db_conn = psycopg2.connect("dbname=%s user=%s host=%s password=%s" % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd)); + db_conn = psycopg2.connect("dbname=%s user=%s host=%s password=%s" % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd)) return db_conn - except: + except psycopg2.Error: sys.exit('Unable to connect to the db') + ColumnNames = { - 'can_smiles' : 'Canonical SMILES', - 'can' : 'Canonical SMILES', - 'inchi' : 'InChI', - 'inchi_key' : 'InChI key', - 'inchi_key_first' : 'InChI key first', - 'inchi_key_last' : 'InChI key last', - 'molwt' : 'Molecular weight', - 'hbd' : 'Hydrogen-bond donors', - 'donors' : 'Hydrogen-bond donors', - 'hba' : 'Hydrogen-bond acceptors', - 'acceptors' : 'Hydrogen-bond acceptors', - 'rotbonds' : 'Rotatable bonds', - 'logp' : 'logP', - 'psa' : 'Polar surface area', - 'mr' : 'Molecular refractivity', - 'atoms' : 'Number of heavy atoms', - 'rings' : 'Number of rings', - 'set_bits' : 'FP2 bits', - 'id' : 'Internal identifier', - 'tani' : 'Tanimoto coefficient', - 'spectrophore' : 'Spectrophores(TM)', - 'dist_spectrophore' : 'Spectrophores(TM) distance to target', - 'synonym' : 'Entry id', + 'can_smiles': 'Canonical SMILES', + 'can': 'Canonical SMILES', + 'inchi': 'InChI', + 'inchi_key': 'InChI key', + 'inchi_key_first': 'InChI key first', + 'inchi_key_last': 'InChI key last', + 'molwt': 'Molecular weight', + 'hbd': 'Hydrogen-bond donors', + 'donors': 'Hydrogen-bond donors', + 'hba': 'Hydrogen-bond acceptors', + 'acceptors': 'Hydrogen-bond acceptors', + 'rotbonds': 'Rotatable bonds', + 'logp': 'logP', + 'psa': 'Polar surface area', + 'mr': 'Molecular refractivity', + 'atoms': 'Number of heavy atoms', + 'rings': 'Number of rings', + 'set_bits': 'FP2 bits', + 'id': 'Internal identifier', + 'tani': 'Tanimoto coefficient', + 'spectrophore': 'Spectrophores(TM)', + 'dist_spectrophore': 'Spectrophores(TM) distance to target', + 'synonym': 'Entry id', } OBDescriptor = { - 'atoms': ["atoms","Number of atoms"], - 'hatoms': ["hatoms","Number of heavy atoms"], # self defined tag hatoms in plugindefines.txt - 'can_smiles' : ["cansmi","Canonical SMILES"], - 'can_smilesNS' : ["cansmiNS","Canonical SMILES without isotopes or stereo"], - #["abonds","Number of aromatic bonds"], - #["bonds","Number of bonds"], - #["dbonds","Number of double bonds"], - #["formula","Chemical formula"], - 'hba': ["HBA1","Number of Hydrogen Bond Acceptors 1 (JoelLib)"], - 'hba2': ["HBA2","Number of Hydrogen Bond Acceptors 2 (JoelLib)"], - 'hbd': ["HBD","Number of Hydrogen Bond Donors (JoelLib)"], - 'inchi': ["InChI","IUPAC InChI identifier"], - 'inchi_key': ["InChIKey","InChIKey"], - #["L5","Lipinski Rule of Five"], - 'logp': ["logP","octanol/water partition coefficient"], - 'mr': ["MR","molar refractivity"], - 'molwt': ["MW","Molecular Weight filter"], - #["nF","Number of Fluorine Atoms"], - #["s","SMARTS filter"], - #["sbonds","Number of single bonds"], - #["smarts","SMARTS filter"], - #["tbonds","Number of triple bonds"], - #["title","For comparing a molecule's title"], - 'psa': ["TPSA","topological polar surface area"], - 'rotbonds' : ['ROTATABLE_BOND', 'rotatable bonds'], + 'atoms': ["atoms", "Number of atoms"], + 'hatoms': ["hatoms", "Number of heavy atoms"], # self defined tag hatoms in plugindefines.txt + 'can_smiles': ["cansmi", "Canonical SMILES"], + 'can_smilesNS': ["cansmiNS", "Canonical SMILES without isotopes or stereo"], + # ["abonds", "Number of aromatic bonds"], + # ["bonds", "Number of bonds"], + # ["dbonds", "Number of double bonds"], + # ["formula", "Chemical formula"], + 'hba': ["HBA1", "Number of Hydrogen Bond Acceptors 1 (JoelLib)"], + 'hba2': ["HBA2", "Number of Hydrogen Bond Acceptors 2 (JoelLib)"], + 'hbd': ["HBD", "Number of Hydrogen Bond Donors (JoelLib)"], + 'inchi': ["InChI", "IUPAC InChI identifier"], + 'inchi_key': ["InChIKey", "InChIKey"], + # ["L5", "Lipinski Rule of Five"], + 'logp': ["logP", "octanol/water partition coefficient"], + 'mr': ["MR", "molar refractivity"], + 'molwt': ["MW", "Molecular Weight filter"], + # ["nF", "Number of Fluorine Atoms"], + # ["s", "SMARTS filter"], + # ["sbonds", "Number of single bonds"], + # ["smarts", "SMARTS filter"], + # ["tbonds", "Number of triple bonds"], + # ["title", "For comparing a molecule's title"], + 'psa': ["TPSA", "topological polar surface area"], + 'rotbonds': ['ROTATABLE_BOND', 'rotatable bonds'], } @@ -128,9 +137,9 @@ outfile = open(args.output, 'w') requested_fields = (filter(lambda x: x not in ["[", "]", "'"], args.fetch)).split(', ') if args.header: - outfile.write( 'Identifier\t' + '\t'.join( [ColumnNames[key] for key in requested_fields] ) + '\n' ) + outfile.write('Identifier\t' + '\t'.join([ColumnNames[key] for key in requested_fields]) + '\n') for row in rows: - outfile.write( row['synonym'] + '\t' + '\t'.join( [str(row[key]) for key in requested_fields] ) + '\n' ) + outfile.write(row['synonym'] + '\t' + '\t'.join([str(row[key]) for key in requested_fields]) + '\n') elif args.oformat in ['sdf', 'mol2']: outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True) @@ -139,103 +148,102 @@ mol = pybel.readstring('sdf', row['mol']) if args.oformat == 'sdf': keys = filter(lambda x: x not in ["[", "]", "'"], args.fetch).split(', ') - mol.data.update( { ColumnNames['synonym'] : row['synonym'] } ) + mol.data.update({ColumnNames['synonym']: row['synonym']}) if 'inchi_key' in keys: - keys = (', '.join(keys).replace( "inchi_key", "inchi_key_first, inchi_key_last" )).split(', ') - [ mol.data.update( { ColumnNames[key] : row[key] } ) for key in keys if key] + keys = (', '.join(keys).replace("inchi_key", "inchi_key_first, inchi_key_last")).split(', ') + [mol.data.update({ColumnNames[key]: row[key]}) for key in keys if key] outfile.write(mol) - except: + except OSError: pass else: outfile = open(args.output, 'w') - outfile.write( '\n'.join( [ '%s\t%s' % (row[args.oformat], row['synonym'] ) for row in rows ] ) ) + outfile.write('\n'.join(['%s\t%s' % (row[args.oformat], row['synonym']) for row in rows])) outfile.close() + def pybel_stop_logging(): openbabel.obErrorLog.StopLogging() + def get_properties_ext(mol): - HBD = pybel.Smarts("[!#6;!H0]") - HBA = pybel.Smarts("[$([$([#8,#16]);!$(*=N~O);" + - "!$(*~N=O);X1,X2]),$([#7;v3;" + - "!$([nH]);!$(*(-a)-a)])]" - ) + HBA = pybel.Smarts(("[$([$([#8,#16]);!$(*=N~O);" + "!$(*~N=O);X1,X2]),$([#7;v3;" + "!$([nH]);!$(*(-a)-a)])]" + )) calc_desc_dict = mol.calcdesc() try: logp = calc_desc_dict['logP'] - except: + except KeyError: logp = calc_desc_dict['LogP'] return {"molwt": mol.molwt, "logp": logp, "donors": len(HBD.findall(mol)), - "acceptors": len(HBA.findall(mol)), + "acceptors": len(HBA.findall(mol)), "psa": calc_desc_dict['TPSA'], "mr": calc_desc_dict['MR'], "rotbonds": mol.OBMol.NumRotors(), - "can": mol.write("can").split()[0].strip(), ### tthis one works fine for both zinc and chembl (no ZINC code added after can descriptor string) + "can": mol.write("can").split()[0].strip(), # tthis one works fine for both zinc and chembl (no ZINC code added after can descriptor string) "inchi": mol.write("inchi").strip(), "inchi_key": get_inchikey(mol).strip(), "rings": len(mol.sssr), "atoms": mol.OBMol.NumHvyAtoms(), - "spectrophore" : OBspectrophore(mol), - } + "spectrophore": OBspectrophore(mol), + } + def get_inchikey(mol): conv = openbabel.OBConversion() conv.SetInAndOutFormats("mol", "inchi") conv.SetOptions("K", conv.OUTOPTIONS) - inchikey = conv.WriteString( mol.OBMol ) + inchikey = conv.WriteString(mol.OBMol) return inchikey + def OBspectrophore(mol): spectrophore = pybel.ob.OBSpectrophore() # Parameters: rotation angle = 20, normalization for mean and sd, accuracy = 3.0 A and non-stereospecific cages. - spectrophore.SetNormalization( spectrophore.NormalizationTowardsZeroMeanAndUnitStd ) - return ', '.join( [ "%.3f" % value for value in spectrophore.GetSpectrophore( mol.OBMol ) ] ) + spectrophore.SetNormalization(spectrophore.NormalizationTowardsZeroMeanAndUnitStd) + return ', '.join(["%.3f" % value for value in spectrophore.GetSpectrophore(mol.OBMol)]) + -def squared_euclidean_distance(a, b): - try: - return ((np.asarray( a ) - np.asarray( b ))**2).sum() - except ValueError: - return 0 - -def split_library( lib_path, lib_format = 'sdf', package_size = None ): +def split_library(lib_path, lib_format='sdf', package_size=None): """ - Split a library of compounds. Usage: split_library( lib_path, lib_format, package_size ) - IT currently ONLY WORKS FOR SD-Files + Split a library of compounds. Usage: split_library(lib_path, lib_format, package_size) + IT currently ONLY WORKS FOR SD-Files """ pack = 1 mol_counter = 0 - outfile = open('/%s/%s_pack_%i.%s' % ( '/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w' ) + outfile = open('/%s/%s_pack_%i.%s' % ('/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w') for line in open(lib_path, 'r'): - outfile.write( line ) + outfile.write(line) if line.strip() == '$$$$': mol_counter += 1 if mol_counter % package_size == 0: outfile.close() pack += 1 - outfile = open('/%s/%s_pack_%i.%s' % ( '/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w' ) - if mol_counter*10 % package_size == 0: - print('%i molecules parsed, starting pack nr. %i' % ( mol_counter, pack - 1 )) + outfile = open('/%s/%s_pack_%i.%s' % ('/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w') + if mol_counter * 10 % package_size == 0: + print('%i molecules parsed, starting pack nr. %i' % (mol_counter, pack - 1)) outfile.close() return True -def split_smi_library( smiles_file, structures_in_one_file ): + +def split_smi_library(smiles_file, structures_in_one_file): """ - Split a file with SMILES to several files for multiprocessing usage. - Usage: split_smi_library( smiles_file, 10 ) + Split a file with SMILES to several files for multiprocessing usage. + Usage: split_smi_library(smiles_file, 10) """ output_files = [] tfile = tempfile.NamedTemporaryFile(delete=False) smiles_handle = open(smiles_file, 'r') - for count, line in enumerate( smiles_handle ): + for count, line in enumerate(smiles_handle): if count % structures_in_one_file == 0 and count != 0: tfile.close() output_files.append(tfile.name) @@ -247,9 +255,9 @@ return output_files -def mp_run(input_path, regex, PROCESSES, function_to_call ): +def mp_run(input_path, regex, PROCESSES, function_to_call): paths = [] - [ paths.append(compound_file) for compound_file in glob.glob(str(input_path) + str(regex)) ] + [paths.append(compound_file) for compound_file in glob.glob(str(input_path) + str(regex))] paths.sort() pool = Pool(processes=PROCESSES) @@ -259,6 +267,6 @@ return paths + if __name__ == '__main__': print(check_filetype(sys.argv[1])) -