Mercurial > repos > bgruening > qed
view qed.py @ 3:52a8d34dd08f draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/silicos-it/qed commit 943ff93be2257426d69a8406ed55c838495ecf3f"
author | bgruening |
---|---|
date | Fri, 30 Jul 2021 12:51:45 +0000 |
parents | ab73abead7fa |
children |
line wrap: on
line source
#!/usr/bin/env python __all__ = ["weights_max", "weights_mean", "weights_none", "default"] import argparse import os import re import sys # General from copy import deepcopy from math import exp, log from rdkit import Chem # RDKit from rdkit.Chem import Descriptors class SilicosItError(Exception): """Base class for exceptions in Silicos-it code""" class WrongArgument(SilicosItError): """ Exception raised when argument to function is not of correct type. Attributes: function -- function in which error occurred msg -- explanation of the error """ def __init__(self, function, msg): self.function = function self.msg = msg def check_filetype(filepath): mol = False possible_inchi = True for line_counter, line in enumerate(open(filepath)): if line_counter > 10000: break if line.find("$$$$") != -1: return "sdf" elif line.find("@<TRIPOS>MOLECULE") != -1: return "mol2" elif line.find("ligand id") != -1: return "drf" elif possible_inchi and re.findall("^InChI=", line): return "inchi" elif re.findall("^M\s+END", line): # noqa W605 mol = True # first line is not an InChI, so it can't be an InChI file possible_inchi = False if mol: # END can occures before $$$$, so and SDF file will # be recognised as mol, if you not using this hack' return "mol" return "smi" AliphaticRings = Chem.MolFromSmarts("[$([A;R][!a])]") AcceptorSmarts = [ "[oH0;X2]", "[OH1;X2;v2]", "[OH0;X2;v2]", "[OH0;X1;v2]", "[O-;X1]", "[SH0;X2;v2]", "[SH0;X1;v2]", "[S-;X1]", "[nH0;X2]", "[NH0;X1;v3]", "[$([N;+0;X3;v3]);!$(N[C,S]=O)]", ] Acceptors = [] for hba in AcceptorSmarts: Acceptors.append(Chem.MolFromSmarts(hba)) StructuralAlertSmarts = [ "*1[O,S,N]*1", "[S,C](=[O,S])[F,Br,Cl,I]", "[CX4][Cl,Br,I]", "[C,c]S(=O)(=O)O[C,c]", "[$([CH]),$(CC)]#CC(=O)[C,c]", "[$([CH]),$(CC)]#CC(=O)O[C,c]", "n[OH]", "[$([CH]),$(CC)]#CS(=O)(=O)[C,c]", "C=C(C=O)C=O", "n1c([F,Cl,Br,I])cccc1", "[CH1](=O)", "[O,o][O,o]", "[C;!R]=[N;!R]", "[N!R]=[N!R]", "[#6](=O)[#6](=O)", "[S,s][S,s]", "[N,n][NH2]", "C(=O)N[NH2]", "[C,c]=S", "[$([CH2]),$([CH][CX4]),$(C([CX4])[CX4])]=[$([CH2]),$([CH][CX4]),$(C([CX4])[CX4])]", "C1(=[O,N])C=CC(=[O,N])C=C1", "C1(=[O,N])C(=[O,N])C=CC=C1", "a21aa3a(aa1aaaa2)aaaa3", "a31a(a2a(aa1)aaaa2)aaaa3", "a1aa2a3a(a1)A=AA=A3=AA=A2", "c1cc([NH2])ccc1", "[Hg,Fe,As,Sb,Zn,Se,se,Te,B,Si,Na,Ca,Ge,Ag,Mg,K,Ba,Sr,Be,Ti,Mo,Mn,Ru,Pd,Ni,Cu,Au,Cd,Al,Ga,Sn,Rh,Tl,Bi,Nb,Li,Pb,Hf,Ho]", "I", "OS(=O)(=O)[O-]", "[N+](=O)[O-]", "C(=O)N[OH]", "C1NC(=O)NC(=O)1", "[SH]", "[S-]", "c1ccc([Cl,Br,I,F])c([Cl,Br,I,F])c1[Cl,Br,I,F]", "c1cc([Cl,Br,I,F])cc([Cl,Br,I,F])c1[Cl,Br,I,F]", "[CR1]1[CR1][CR1][CR1][CR1][CR1][CR1]1", "[CR1]1[CR1][CR1]cc[CR1][CR1]1", "[CR2]1[CR2][CR2][CR2][CR2][CR2][CR2][CR2]1", "[CR2]1[CR2][CR2]cc[CR2][CR2][CR2]1", "[CH2R2]1N[CH2R2][CH2R2][CH2R2][CH2R2][CH2R2]1", "[CH2R2]1N[CH2R2][CH2R2][CH2R2][CH2R2][CH2R2][CH2R2]1", "C#C", "[OR2,NR2]@[CR2]@[CR2]@[OR2,NR2]@[CR2]@[CR2]@[OR2,NR2]", "[$([N+R]),$([n+R]),$([N+]=C)][O-]", "[C,c]=N[OH]", "[C,c]=NOC=O", "[C,c](=O)[CX4,CR0X3,O][C,c](=O)", "c1ccc2c(c1)ccc(=O)o2", "[O+,o+,S+,s+]", "N=C=O", "[NX3,NX4][F,Cl,Br,I]", "c1ccccc1OC(=O)[#6]", "[CR0]=[CR0][CR0]=[CR0]", "[C+,c+,C-,c-]", "N=[N+]=[N-]", "C12C(NC(N1)=O)CSC2", "c1c([OH])c([OH,NH2,NH])ccc1", "P", "[N,O,S]C#N", "C=C=O", "[Si][F,Cl,Br,I]", "[SX2]O", "[SiR0,CR0](c1ccccc1)(c2ccccc2)(c3ccccc3)", "O1CCCCC1OC2CCC3CCCCC3C2", "N=[CR0][N,n,O,S]", "[cR2]1[cR2][cR2]([Nv3X3,Nv4X4])[cR2][cR2][cR2]1[cR2]2[cR2][cR2][cR2]([Nv3X3,Nv4X4])[cR2][cR2]2", "C=[C!r]C#N", "[cR2]1[cR2]c([N+0X3R0,nX3R0])c([N+0X3R0,nX3R0])[cR2][cR2]1", "[cR2]1[cR2]c([N+0X3R0,nX3R0])[cR2]c([N+0X3R0,nX3R0])[cR2]1", "[cR2]1[cR2]c([N+0X3R0,nX3R0])[cR2][cR2]c1([N+0X3R0,nX3R0])", "[OH]c1ccc([OH,NH2,NH])cc1", "c1ccccc1OC(=O)O", "[SX2H0][N]", "c12ccccc1(SC(S)=N2)", "c12ccccc1(SC(=S)N2)", "c1nnnn1C=O", "s1c(S)nnc1NC=O", "S1C=CSC1=S", "C(=O)Onnn", "OS(=O)(=O)C(F)(F)F", "N#CC[OH]", "N#CC(=O)", "S(=O)(=O)C#N", "N[CH2]C#N", "C1(=O)NCC1", "S(=O)(=O)[O-,OH]", "NC[F,Cl,Br,I]", "C=[C!r]O", "[NX2+0]=[O+0]", "[OR0,NR0][OR0,NR0]", "C(=O)O[C,H1].C(=O)O[C,H1].C(=O)O[C,H1]", "[CX2R0][NX3R0]", "c1ccccc1[C;!R]=[C;!R]c2ccccc2", "[NX3R0,NX4R0,OR0,SX2R0][CX4][NX3R0,NX4R0,OR0,SX2R0]", "[s,S,c,C,n,N,o,O]~[n+,N+](~[s,S,c,C,n,N,o,O])(~[s,S,c,C,n,N,o,O])~[s,S,c,C,n,N,o,O]", "[s,S,c,C,n,N,o,O]~[nX3+,NX3+](~[s,S,c,C,n,N])~[s,S,c,C,n,N]", "[*]=[N+]=[*]", "[SX3](=O)[O-,OH]", "N#N", "F.F.F.F", "[R0;D2][R0;D2][R0;D2][R0;D2]", "[cR,CR]~C(=O)NC(=O)~[cR,CR]", "C=!@CC=[O,S]", "[#6,#8,#16][C,c](=O)O[C,c]", "c[C;R0](=[O,S])[C,c]", "c[SX2][C;!R]", "C=C=C", "c1nc([F,Cl,Br,I,S])ncc1", "c1ncnc([F,Cl,Br,I,S])c1", "c1nc(c2c(n1)nc(n2)[F,Cl,Br,I])", "[C,c]S(=O)(=O)c1ccc(cc1)F", "[15N]", "[13C]", "[18O]", "[34S]", ] StructuralAlerts = [] for smarts in StructuralAlertSmarts: StructuralAlerts.append(Chem.MolFromSmarts(smarts)) # ADS parameters for the 8 molecular properties: [row][column] # rows[8]: MW, ALOGP, HBA, HBD, PSA, ROTB, AROM, ALERTS # columns[7]: A, B, C, D, E, F, DMAX # ALOGP parameters from Gregory Gerebtzoff (2012, Roche) pads1 = [ [ 2.817065973, 392.5754953, 290.7489764, 2.419764353, 49.22325677, 65.37051707, 104.9805561, ], [ 0.486849448, 186.2293718, 2.066177165, 3.902720615, 1.027025453, 0.913012565, 145.4314800, ], [ 2.948620388, 160.4605972, 3.615294657, 4.435986202, 0.290141953, 1.300669958, 148.7763046, ], [ 1.618662227, 1010.051101, 0.985094388, 0.000000001, 0.713820843, 0.920922555, 258.1632616, ], [ 1.876861559, 125.2232657, 62.90773554, 87.83366614, 12.01999824, 28.51324732, 104.5686167, ], [ 0.010000000, 272.4121427, 2.558379970, 1.565547684, 1.271567166, 2.758063707, 105.4420403, ], [ 3.217788970, 957.7374108, 2.274627939, 0.000000001, 1.317690384, 0.375760881, 312.3372610, ], [ 0.010000000, 1199.094025, -0.09002883, 0.000000001, 0.185904477, 0.875193782, 417.7253140, ], ] # ALOGP parameters from the original publication pads2 = [ [ 2.817065973, 392.5754953, 290.7489764, 2.419764353, 49.22325677, 65.37051707, 104.9805561, ], [ 3.172690585, 137.8624751, 2.534937431, 4.581497897, 0.822739154, 0.576295591, 131.3186604, ], [ 2.948620388, 160.4605972, 3.615294657, 4.435986202, 0.290141953, 1.300669958, 148.7763046, ], [ 1.618662227, 1010.051101, 0.985094388, 0.000000001, 0.713820843, 0.920922555, 258.1632616, ], [ 1.876861559, 125.2232657, 62.90773554, 87.83366614, 12.01999824, 28.51324732, 104.5686167, ], [ 0.010000000, 272.4121427, 2.558379970, 1.565547684, 1.271567166, 2.758063707, 105.4420403, ], [ 3.217788970, 957.7374108, 2.274627939, 0.000000001, 1.317690384, 0.375760881, 312.3372610, ], [ 0.010000000, 1199.094025, -0.09002883, 0.000000001, 0.185904477, 0.875193782, 417.7253140, ], ] def ads(x, a, b, c, d, e, f, dmax): return ( a + ( b / (1 + exp(-1 * (x - c + d / 2) / e)) * (1 - 1 / (1 + exp(-1 * (x - c - d / 2) / f))) ) ) / dmax def properties(mol): """ Calculates the properties that are required to calculate the QED descriptor. """ matches = [] if mol is None: raise WrongArgument("properties(mol)", "mol argument is 'None'") x = [0] * 9 x[0] = Descriptors.MolWt(mol) # MW x[1] = Descriptors.MolLogP(mol) # ALOGP for hba in Acceptors: # HBA if mol.HasSubstructMatch(hba): matches = mol.GetSubstructMatches(hba) x[2] += len(matches) x[3] = Descriptors.NumHDonors(mol) # HBD x[4] = Descriptors.TPSA(mol) # PSA x[5] = Descriptors.NumRotatableBonds(mol) # ROTB x[6] = Chem.GetSSSR(Chem.DeleteSubstructs(deepcopy(mol), AliphaticRings)) # AROM for alert in StructuralAlerts: # ALERTS if mol.HasSubstructMatch(alert): x[7] += 1 ro5_failed = 0 if x[3] > 5: ro5_failed += 1 # HBD if x[2] > 10: ro5_failed += 1 # HBA if x[0] >= 500: ro5_failed += 1 if x[1] > 5: ro5_failed += 1 x[8] = ro5_failed return x def qed(w, p, gerebtzoff): d = [0.00] * 8 if gerebtzoff: for i in range(0, 8): d[i] = ads( p[i], pads1[i][0], pads1[i][1], pads1[i][2], pads1[i][3], pads1[i][4], pads1[i][5], pads1[i][6], ) else: for i in range(0, 8): d[i] = ads( p[i], pads2[i][0], pads2[i][1], pads2[i][2], pads2[i][3], pads2[i][4], pads2[i][5], pads2[i][6], ) t = 0.0 for i in range(0, 8): t += w[i] * log(d[i]) return exp(t / sum(w)) def weights_max(mol, gerebtzoff=True, props=False): """ Calculates the QED descriptor using maximal descriptor weights. If props is specified we skip the calculation step and use the props-list of properties. """ if not props: props = properties(mol) return qed([0.50, 0.25, 0.00, 0.50, 0.00, 0.50, 0.25, 1.00], props, gerebtzoff) def weights_mean(mol, gerebtzoff=True, props=False): """ Calculates the QED descriptor using average descriptor weights. If props is specified we skip the calculation step and use the props-list of properties. """ if not props: props = properties(mol) return qed([0.66, 0.46, 0.05, 0.61, 0.06, 0.65, 0.48, 0.95], props, gerebtzoff) def weights_none(mol, gerebtzoff=True, props=False): """ Calculates the QED descriptor using unit weights. If props is specified we skip the calculation step and use the props-list of properties. """ if not props: props = properties(mol) return qed([1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00], props, gerebtzoff) def default(mol, gerebtzoff=True): """ Calculates the QED descriptor using average descriptor weights and Gregory Gerebtzoff parameters. """ return weights_mean(mol, gerebtzoff) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "-i", "--input", required=True, help="path to the input file name" ) parser.add_argument( "-m", "--method", dest="method", choices=["max", "mean", "unweighted"], default="mean", help="Specify the method you want to use.", ) parser.add_argument( "--iformat", help="Input format. It must be supported by openbabel." ) parser.add_argument( "-o", "--outfile", type=argparse.FileType("w+"), default=sys.stdout, help="path to the result file, default it sdtout", ) parser.add_argument( "--header", dest="header", action="store_true", default=False, help="Write header line.", ) args = parser.parse_args() # Elucidate filetype and open supplier ifile = os.path.abspath(args.input) if not os.path.isfile(ifile): print("Error: ", ifile, " is not a file or cannot be found.") sys.exit(1) if not os.path.exists(ifile): print("Error: ", ifile, " does not exist or cannot be found.") sys.exit(1) if not os.access(ifile, os.R_OK): print("Error: ", ifile, " is not readable.") sys.exit(1) if not args.iformat: # try to guess the filetype filetype = check_filetype(ifile) else: filetype = args.iformat # sdf or smi """ We want to store the original SMILES in the output. So in case of a SMILES file iterate over the file and convert each line separate. """ if filetype == "sdf": supplier = Chem.SDMolSupplier(ifile) # Process file if args.header: args.outfile.write( "MW\tALOGP\tHBA\tHBD\tPSA\tROTB\tAROM\tALERTS\tLRo5\tQED\tNAME\n" ) count = 0 for mol in supplier: count += 1 if mol is None: print( "Warning: skipping molecule ", count, " and continuing with next." ) continue props = properties(mol) if args.method == "max": calc_qed = weights_max(mol, True, props) elif args.method == "unweighted": calc_qed = weights_none(mol, True, props) else: calc_qed = weights_mean(mol, True, props) args.outfile.write( "%.2f\t%.3f\t%d\t%d\t%.2f\t%d\t%d\t%d\t%s\t%.3f\t%-s\n" % ( props[0], props[1], props[2], props[3], props[4], props[5], props[6], props[7], props[8], calc_qed, mol.GetProp("_Name"), ) ) elif filetype == "smi": supplier = Chem.SmilesMolSupplier(ifile, " \t", 0, 1, False, True) # Process file if args.header: args.outfile.write( "MW\tALOGP\tHBA\tHBD\tPSA\tROTB\tAROM\tALERTS\tLRo5\tQED\tNAME\tSMILES\n" ) count = 0 for line in open(ifile): tokens = line.strip().split("\t") if len(tokens) > 1: smiles, title = tokens else: smiles = tokens[0] title = "" mol = Chem.MolFromSmiles(smiles) count += 1 if mol is None: print( "Warning: skipping molecule ", count, " and continuing with next." ) continue props = properties(mol) if args.method == "max": calc_qed = weights_max(mol, True, props) elif args.method == "unweighted": calc_qed = weights_none(mol, True, props) else: calc_qed = weights_mean(mol, True, props) args.outfile.write( "%.2f\t%.3f\t%d\t%d\t%.2f\t%d\t%d\t%d\t%s\t%.3f\t%-s\t%s\n" % ( props[0], props[1], props[2], props[3], props[4], props[5], props[6], props[7], props[8], calc_qed, title, smiles, ) ) else: sys.exit("Error: unknown file-type: %s" % filetype)