Mercurial > repos > bgruening > chemfp
view nxn_clustering.py @ 12:3b14765c22ee draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 7fb96a3844b4771084f18de2346ed6d5e241d839"
| author | bgruening | 
|---|---|
| date | Sat, 25 Sep 2021 19:07:44 +0000 | 
| parents | 198b1e30c739 | 
| children | 
line wrap: on
 line source
# !/usr/bin/env python """ Modified version of code examples from the chemfp project. http://code.google.com/p/chem-fingerprints/ Thanks to Andrew Dalke of Andrew Dalke Scientific! """ import argparse import chemfp import matplotlib matplotlib.use("Agg") # noqa from matplotlib import rcParams # noqa rcParams.update({"figure.autolayout": True}) # noqa import numpy # noqa import pylab # noqa import scipy.cluster.hierarchy as hcluster # noqa def distance_matrix(arena, tanimoto_threshold=0.0): n = len(arena) # Start off a similarity matrix with 1.0s along the diagonal try: similarities = numpy.identity(n, "d") except Exception: raise Exception("Input dataset is to large!") chemfp.set_num_threads(args.processors) # Compute the full similarity matrix. # The implementation computes the upper-triangle then copies # the upper-triangle into lower-triangle. It does not include # terms for the diagonal. results = chemfp.search.threshold_tanimoto_search_symmetric( arena, threshold=tanimoto_threshold ) # Copy the results into the NumPy array. for row_index, row in enumerate(results.iter_indices_and_scores()): for target_index, target_score in row: similarities[row_index, target_index] = target_score # Return the distance matrix using the similarity matrix return 1.0 - similarities if __name__ == "__main__": parser = argparse.ArgumentParser( description="""NxN clustering for fps files. For more details please see the chemfp documentation: https://chemfp.readthedocs.org """ ) parser.add_argument( "-i", "--input", dest="input_path", required=True, help="Path to the input file.", ) parser.add_argument( "-c", "--cluster", dest="cluster_image", help="Path to the output cluster image.", ) parser.add_argument( "-s", "--smatrix", dest="similarity_matrix", help="Path to the similarity matrix output file.", ) parser.add_argument( "-t", "--threshold", dest="tanimoto_threshold", type=float, default=0.0, help="Tanimoto threshold [0.0]", ) parser.add_argument("--oformat", default="png", help="Output format (png, svg)") parser.add_argument("-p", "--processors", type=int, default=4) args = parser.parse_args() targets = chemfp.open(args.input_path, format="fps") arena = chemfp.load_fingerprints(targets) distances = distance_matrix(arena, args.tanimoto_threshold) if args.similarity_matrix: numpy.savetxt(args.similarity_matrix, distances) if args.cluster_image: linkage = hcluster.linkage(distances, method="single", metric="euclidean") hcluster.dendrogram(linkage, labels=arena.ids, leaf_rotation=90.0) pylab.savefig(args.cluster_image, format=args.oformat)
