Mercurial > repos > bgruening > sucos_clustering
comparison sucos_max.py @ 0:f80cfac80c53 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
| author | bgruening |
|---|---|
| date | Wed, 02 Oct 2019 12:58:19 -0400 |
| parents | |
| children | 58d18838e244 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:f80cfac80c53 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 Assess ligands against a second set of molecules using SuCOS scores. | |
| 4 This is a quite specialised function that is designed to take a set of potential follow up | |
| 5 compounds and compare them to a set of clustered fragment hits to help identify which follow up | |
| 6 ligands best map to the binding space of the hits. | |
| 7 | |
| 8 The clustering of the fragment hits is expected to be performed with the sucos_cluster.py module | |
| 9 and will generate a set of SD files, one for each cluster of hits (presumably corresponding to a | |
| 10 binding pocket in the protein target). | |
| 11 | |
| 12 Each molecule in the input ligands is then compared (using SuCOS) to each hit in the clusters. There | |
| 13 are different modes which determine how the ligand is assessed. | |
| 14 | |
| 15 In mode 'max' the hit with the best SuCOS score is identified. The output is a SD file with each of the ligands, | |
| 16 with these additional fields for each molecule: | |
| 17 Max_SuCOS_Score - the best score | |
| 18 Max_SuCOS_FeatureMap_Score - the feature map score for the hit that has the best SuCOS score | |
| 19 Max_SuCOS_Protrude_Score - the protrude volume for the hit that has the best SuCOS score | |
| 20 Max_SuCOS_Cluster - the name of the cluster SD file that contains the best hit | |
| 21 Max_SuCOS_Index - the index of the best hit in the SD file | |
| 22 | |
| 23 In mode 'cum' the sum of all the scores is calculated and reported as the following properties for each molecule: | |
| 24 Cum_SuCOS_Score property: the sum of the SuCOS scores | |
| 25 Cum_SuCOS_FeatureMap_Score: the sum of the feature map scores | |
| 26 Cum_SuCOS_Protrude_Score: the sum of the protrude volume scores | |
| 27 | |
| 28 If a molecule has no alignment to any of the clustered hits (all alignment scores of zero) then it is not | |
| 29 included in the results. | |
| 30 | |
| 31 | |
| 32 SuCOS is the work of Susan Leung. | |
| 33 GitHub: https://github.com/susanhleung/SuCOS | |
| 34 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1 | |
| 35 """ | |
| 36 | |
| 37 import sucos, utils | |
| 38 import argparse, gzip, os | |
| 39 from rdkit import Chem | |
| 40 | |
| 41 | |
| 42 def process(inputfilename, clusterfilenames, outputfilename, mode): | |
| 43 | |
| 44 all_clusters = {} | |
| 45 for filename in clusterfilenames: | |
| 46 cluster = [] | |
| 47 cluster_file = utils.open_file_for_reading(filename) | |
| 48 suppl = Chem.ForwardSDMolSupplier(cluster_file) | |
| 49 i = 0 | |
| 50 for mol in suppl: | |
| 51 i += 1 | |
| 52 if not mol: | |
| 53 utils.log("WARNING: failed to generate molecule", i, "in cluster", filename) | |
| 54 continue | |
| 55 try: | |
| 56 features = sucos.getRawFeatures(mol) | |
| 57 cluster.append((mol, features)) | |
| 58 except: | |
| 59 utils.log("WARNING: failed to generate features for molecule", i, "in cluster", filename) | |
| 60 | |
| 61 cluster_file.close() | |
| 62 all_clusters[filename] = cluster | |
| 63 | |
| 64 input_file = utils.open_file_for_reading(inputfilename) | |
| 65 suppl = Chem.ForwardSDMolSupplier(input_file) | |
| 66 output_file = utils.open_file_for_writing(outputfilename) | |
| 67 writer = Chem.SDWriter(output_file) | |
| 68 | |
| 69 comparisons = 0 | |
| 70 mol_num = 0 | |
| 71 | |
| 72 for mol in suppl: | |
| 73 mol_num += 1 | |
| 74 if not mol: | |
| 75 utils.log("WARNING: failed to generate molecule", mol_num, "in input") | |
| 76 continue | |
| 77 try: | |
| 78 query_features = sucos.getRawFeatures(mol) | |
| 79 except: | |
| 80 utils.log("WARNING: failed to generate features for molecule", mol_num, "in input") | |
| 81 continue | |
| 82 scores = [0, 0, 0] | |
| 83 for clusterfilename in all_clusters: | |
| 84 cluster = all_clusters[clusterfilename] | |
| 85 index = 0 | |
| 86 for entry in cluster: | |
| 87 hit = entry[0] | |
| 88 ref_features = entry[1] | |
| 89 index += 1 | |
| 90 comparisons += 1 | |
| 91 sucos_score, fm_score, vol_score = sucos.get_SucosScore(hit, mol, | |
| 92 tani=False, ref_features=ref_features, query_features=query_features) | |
| 93 if mode == 'max': | |
| 94 if sucos_score > scores[0]: | |
| 95 scores[0] = sucos_score | |
| 96 scores[1] = fm_score | |
| 97 scores[2] = vol_score | |
| 98 cluster_name = clusterfilename | |
| 99 cluster_index = index | |
| 100 elif mode == 'cum': | |
| 101 scores[0] += sucos_score | |
| 102 scores[1] += fm_score | |
| 103 scores[2] += vol_score | |
| 104 else: | |
| 105 raise ValueError("Invalid mode: " + mode) | |
| 106 | |
| 107 if scores[0] > 0: | |
| 108 if mode == 'max': | |
| 109 cluster_file_name_only = cluster_name.split(os.sep)[-1] | |
| 110 #utils.log("Max SuCOS:", scores[0], "FM:", scores[1], "P:", scores[2],"File:", cluster_file_name_only, "Index:", cluster_index) | |
| 111 mol.SetDoubleProp("Max_SuCOS_Score", scores[0]) | |
| 112 mol.SetDoubleProp("Max_SuCOS_FeatureMap_Score", scores[1]) | |
| 113 mol.SetDoubleProp("Max_SuCOS_Protrude_Score", scores[2]) | |
| 114 mol.SetProp("Max_SuCOS_Cluster", cluster_file_name_only) | |
| 115 mol.SetIntProp("Max_SuCOS_Index", cluster_index) | |
| 116 | |
| 117 else: | |
| 118 #utils.log("Cum SuCOS:", scores[0], "FM:", scores[1], "P:", scores[2]) | |
| 119 mol.SetDoubleProp("Cum_SuCOS_Score", scores[0]) | |
| 120 mol.SetDoubleProp("Cum_SuCOS_FeatureMap_Score", scores[1]) | |
| 121 mol.SetDoubleProp("Cum_SuCOS_Protrude_Score", scores[2]) | |
| 122 | |
| 123 writer.write(mol) | |
| 124 | |
| 125 else: | |
| 126 utils.log("Molecule", mol_num, "did not overlay. Omitting from results") | |
| 127 | |
| 128 | |
| 129 input_file.close() | |
| 130 writer.flush() | |
| 131 writer.close() | |
| 132 output_file.close() | |
| 133 | |
| 134 utils.log("Completed", comparisons, "comparisons") | |
| 135 | |
| 136 | |
| 137 ### start main execution ######################################### | |
| 138 | |
| 139 def main(): | |
| 140 parser = argparse.ArgumentParser(description='Max SuCOS scores with RDKit') | |
| 141 parser.add_argument('-i', '--input', help='Input file to score in SDF format. Can be gzipped (*.gz).') | |
| 142 parser.add_argument('-o', '--output', help='Output file in SDF format. Can be gzipped (*.gz).') | |
| 143 parser.add_argument('-m', '--mode', choices=['max', 'cum'], | |
| 144 default='max', help='Score mode: max = best score, cum = sum of all scores') | |
| 145 parser.add_argument('clusters', nargs='*', help="One or more SDF files with the clustered hits") | |
| 146 | |
| 147 args = parser.parse_args() | |
| 148 utils.log("Max SuCOS Args: ", args) | |
| 149 | |
| 150 process(args.input, args.clusters, args.output, args.mode) | |
| 151 | |
| 152 | |
| 153 if __name__ == "__main__": | |
| 154 main() |
