Mercurial > repos > bgruening > sucos_clustering
diff sucos.py @ 0:f80cfac80c53 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
author | bgruening |
---|---|
date | Wed, 02 Oct 2019 12:58:19 -0400 |
parents | |
children | 58d18838e244 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sucos.py Wed Oct 02 12:58:19 2019 -0400 @@ -0,0 +1,202 @@ +#!/usr/bin/env python +""" +Basic SuCOS scoring. Allows a set of molecules from a SD file to be overlayed to a reference molecule, +with the resulting scores being written as properties in the output SD file. + +SuCOS is the work of Susan Leung. +GitHub: https://github.com/susanhleung/SuCOS +Publication: https://doi.org/10.26434/chemrxiv.8100203.v1 +""" + +from __future__ import print_function +import argparse, os, sys, gzip +import numpy as np +from rdkit import Chem, rdBase, RDConfig +from rdkit.Chem import AllChem, rdShapeHelpers +from rdkit.Chem.FeatMaps import FeatMaps +import utils + + +### start function definitions ######################################### + +# Setting up the features to use in FeatureMap +fdef = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')) + +fmParams = {} +for k in fdef.GetFeatureFamilies(): + fparams = FeatMaps.FeatMapParams() + fmParams[k] = fparams + +keep = ('Donor', 'Acceptor', 'NegIonizable', 'PosIonizable', 'ZnBinder', + 'Aromatic', 'Hydrophobe', 'LumpedHydrophobe') + +def filterFeature(f): + result = f.GetFamily() in keep + # TODO - nothing ever seems to be filtered. Is this expected? + if not result: + utils.log("Filtered out feature type", f.GetFamily()) + return result + +def getRawFeatures(mol): + + rawFeats = fdef.GetFeaturesForMol(mol) + # filter that list down to only include the ones we're interested in + filtered = list(filter(filterFeature, rawFeats)) + return filtered + +def get_FeatureMapScore(small_feats, large_feats, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All): + """ + Generate the feature map score. + + :param small_feats: + :param large_feats: + :param tani: + :return: + """ + + featLists = [] + for rawFeats in [small_feats, large_feats]: + # filter that list down to only include the ones we're interested in + featLists.append(rawFeats) + fms = [FeatMaps.FeatMap(feats=x, weights=[1] * len(x), params=fmParams) for x in featLists] + # set the score mode + fms[0].scoreMode = score_mode + + try: + if tani: + c = fms[0].ScoreFeats(featLists[1]) + A = fms[0].GetNumFeatures() + B = len(featLists[1]) + if B != fms[1].GetNumFeatures(): + utils.log("Why isn't B equal to number of features...?!") + tani_score = float(c) / (A+B-c) + return tani_score + else: + fm_score = fms[0].ScoreFeats(featLists[1]) / min(fms[0].GetNumFeatures(), len(featLists[1])) + return fm_score + except ZeroDivisionError: + utils.log("ZeroDivisionError") + return 0 + + if tani: + tani_score = float(c) / (A+B-c) + return tani_score + else: + fm_score = fms[0].ScoreFeats(featLists[1]) / min(fms[0].GetNumFeatures(), len(featLists[1])) + return fm_score + + +def get_SucosScore(ref_mol, query_mol, tani=False, ref_features=None, query_features=None, score_mode=FeatMaps.FeatMapScoreMode.All): + """ + This is the key function that calculates the SuCOS scores and is expected to be called from other modules. + To improve performance you can pre-calculate the features and pass them in as optional parameters to avoid having + to recalculate them. Use the getRawFeatures function to pre-calculate the features. + + :param ref_mol: The reference molecule to compare to + :param query_mol: The molecule to align to the reference + :param tani: Whether to calculate Tanimoto distances + :param ref_features: An optional feature map for the reference molecule, avoiding the need to re-calculate it. + :param query_features: An optional feature map for the query molecule, avoiding the need to re-calculate it. + :return: A tuple of 3 values. 1 the sucos score, 2 the feature map score, + 3 the Tanimoto distance or 1 minus the protrude distance + """ + + if not ref_features: + ref_features = getRawFeatures(ref_mol) + if not query_features: + query_features = getRawFeatures(query_mol) + + fm_score = get_FeatureMapScore(ref_features, query_features, tani, score_mode) + fm_score = np.clip(fm_score, 0, 1) + + if tani: + tani_sim = 1 - float(rdShapeHelpers.ShapeTanimotoDist(ref_mol, query_mol)) + tani_sim = np.clip(tani_sim, 0, 1) + SuCOS_score = 0.5*fm_score + 0.5*tani_sim + return SuCOS_score, fm_score, tani_sim + else: + protrude_dist = rdShapeHelpers.ShapeProtrudeDist(ref_mol, query_mol, allowReordering=False) + protrude_dist = np.clip(protrude_dist, 0, 1) + protrude_val = 1.0 - protrude_dist + SuCOS_score = 0.5 * fm_score + 0.5 * protrude_val + return SuCOS_score, fm_score, protrude_val + +def process(refmol_filename, inputs_filename, outputs_filename, refmol_index=None, + refmol_format=None, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All): + + ref_mol = utils.read_single_molecule(refmol_filename, index=refmol_index, format=refmol_format) + #utils.log("Reference mol has", ref_mol.GetNumHeavyAtoms(), "heavy atoms") + ref_features = getRawFeatures(ref_mol) + + input_file = utils.open_file_for_reading(inputs_filename) + suppl = Chem.ForwardSDMolSupplier(input_file) + output_file = utils.open_file_for_writing(outputs_filename) + writer = Chem.SDWriter(output_file) + + count = 0 + total = 0 + errors = 0 + for mol in suppl: + count +=1 + if mol is None: + continue + #utils.log("Mol has", str(mol.GetNumHeavyAtoms()), "heavy atoms") + try: + sucos_score, fm_score, val3 = get_SucosScore(ref_mol, mol, tani=tani, ref_features=ref_features, score_mode=score_mode) + mol.SetDoubleProp("SuCOS_Score", sucos_score) + mol.SetDoubleProp("SuCOS_FeatureMap_Score", fm_score) + if tani: + mol.SetDoubleProp("SuCOS_Tanimoto_Score", val3) + else: + mol.SetDoubleProp("SuCOS_Protrude_Score", val3) + utils.log("Scores:", sucos_score, fm_score, val3) + writer.write(mol) + total +=1 + except ValueError as e: + errors +=1 + utils.log("Molecule", count, "failed to score:", e.message) + + input_file.close() + writer.flush() + writer.close() + output_file.close() + + utils.log("Completed.", total, "processed, ", count, "succeeded, ", errors, "errors") + +def parse_score_mode(value): + if value == None or value == 'all': + return FeatMaps.FeatMapScoreMode.All + elif value == 'closest': + return FeatMaps.FeatMapScoreMode.Closest + elif value == 'best': + return FeatMaps.FeatMapScoreMode.Best + else: + raise ValueError(value + " is not a valid scoring mode option") + + +### start main execution ######################################### + +def main(): + + parser = argparse.ArgumentParser(description='SuCOS with RDKit') + parser.add_argument('-i', '--input', help='Input file in SDF format. Can be gzipped (*.gz).') + parser.add_argument('-r', '--refmol', help='Molecule to compare against in Molfile (.mol) or SDF (.sdf) format') + parser.add_argument('--refmol-format', help="Format for the reference molecule (mol or sdf). " + + "Only needed if files don't have the expected extensions") + parser.add_argument('--refmolidx', help='Reference molecule index in SD file if not the first', type=int, default=1) + parser.add_argument('-o', '--output', help='Output file in SDF format. Can be gzipped (*.gz).') + parser.add_argument('--tanimoto', action='store_true', help='Include Tanimoto distance in score') + parser.add_argument('--score_mode', choices=['all', 'closest', 'best'], + help="choose the scoring mode for the feature map, default is 'all'.") + + args = parser.parse_args() + utils.log("SuCOS Args: ", args) + + score_mode = parse_score_mode(args.score_mode) + + process(args.refmol, args.input, args.output, refmol_index=args.refmolidx, + refmol_format=args.refmol_format, tani=args.tanimoto, score_mode=score_mode) + + +if __name__ == "__main__": + main()