Mercurial > repos > bgruening > chemfp
comparison nxn_clustering.py @ 12:3b14765c22ee draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 7fb96a3844b4771084f18de2346ed6d5e241d839"
| author | bgruening | 
|---|---|
| date | Sat, 25 Sep 2021 19:07:44 +0000 | 
| parents | 198b1e30c739 | 
| children | |

Comparison legend:
- equal
- deleted
- inserted
- replaced

| 11:92c7cdc243e8 | 12:3b14765c22ee | 
|---|---|
| 1 #!/usr/bin/env python | 1 # !/usr/bin/env python | 
| 2 """ | 2 """ | 
| 3 Modified version of code examples from the chemfp project. | 3 Modified version of code examples from the chemfp project. | 
| 4 http://code.google.com/p/chem-fingerprints/ | 4 http://code.google.com/p/chem-fingerprints/ | 
| 5 Thanks to Andrew Dalke of Andrew Dalke Scientific! | 5 Thanks to Andrew Dalke of Andrew Dalke Scientific! | 
| 6 """ | 6 """ | 
| 7 | |
| 8 import argparse | |
| 9 | |
| 10 import chemfp | |
| 7 import matplotlib | 11 import matplotlib | 
| 8 matplotlib.use('Agg') | 12 matplotlib.use("Agg") # noqa | 
| 9 from matplotlib import rcParams | 13 from matplotlib import rcParams # noqa | 
| 10 rcParams.update({'figure.autolayout': True}) | 14 rcParams.update({"figure.autolayout": True}) # noqa | 
| 11 import argparse | 15 import numpy # noqa | 
| 12 import os | 16 import pylab # noqa | 
| 13 import chemfp | 17 import scipy.cluster.hierarchy as hcluster # noqa | 
| 14 import scipy.cluster.hierarchy as hcluster | |
| 15 import pylab | |
| 16 import numpy | |
| 17 | 18 | 
| 18 def distance_matrix(arena, tanimoto_threshold = 0.0): | 19 | 
| 20 def distance_matrix(arena, tanimoto_threshold=0.0): | |
| 19 n = len(arena) | 21 n = len(arena) | 
| 20 # Start off a similarity matrix with 1.0s along the diagonal | 22 # Start off a similarity matrix with 1.0s along the diagonal | 
| 21 try: | 23 try: | 
| 22 similarities = numpy.identity(n, "d") | 24 similarities = numpy.identity(n, "d") | 
| 23 except: | 25 except Exception: | 
| 24 raise Exception('Input dataset is to large!') | 26 raise Exception("Input dataset is to large!") | 
| 25 chemfp.set_num_threads( args.processors ) | 27 chemfp.set_num_threads(args.processors) | 
| 26 | 28 | 
| 27 ## Compute the full similarity matrix. | 29 # Compute the full similarity matrix. | 
| 28 # The implementation computes the upper-triangle then copies | 30 # The implementation computes the upper-triangle then copies | 
| 29 # the upper-triangle into lower-triangle. It does not include | 31 # the upper-triangle into lower-triangle. It does not include | 
| 30 # terms for the diagonal. | 32 # terms for the diagonal. | 
| 31 results = chemfp.search.threshold_tanimoto_search_symmetric(arena, threshold=tanimoto_threshold) | 33 results = chemfp.search.threshold_tanimoto_search_symmetric( | 
| 34 arena, threshold=tanimoto_threshold | |
| 35 ) | |
| 32 | 36 | 
| 33 # Copy the results into the NumPy array. | 37 # Copy the results into the NumPy array. | 
| 34 for row_index, row in enumerate(results.iter_indices_and_scores()): | 38 for row_index, row in enumerate(results.iter_indices_and_scores()): | 
| 35 for target_index, target_score in row: | 39 for target_index, target_score in row: | 
| 36 similarities[row_index, target_index] = target_score | 40 similarities[row_index, target_index] = target_score | 
| 38 # Return the distance matrix using the similarity matrix | 42 # Return the distance matrix using the similarity matrix | 
| 39 return 1.0 - similarities | 43 return 1.0 - similarities | 
| 40 | 44 | 
| 41 | 45 | 
| 42 if __name__ == "__main__": | 46 if __name__ == "__main__": | 
| 43 parser = argparse.ArgumentParser(description="""NxN clustering for fps files. | 47 parser = argparse.ArgumentParser( | 
| 48 description="""NxN clustering for fps files. | |
| 44 For more details please see the chemfp documentation: | 49 For more details please see the chemfp documentation: | 
| 45 https://chemfp.readthedocs.org | 50 https://chemfp.readthedocs.org | 
| 46 """) | 51 """ | 
| 52 ) | |
| 47 | 53 | 
| 48 parser.add_argument("-i", "--input", dest="input_path", | 54 parser.add_argument( | 
| 49 required=True, | 55 "-i", | 
| 50 help="Path to the input file.") | 56 "--input", | 
| 57 dest="input_path", | |
| 58 required=True, | |
| 59 help="Path to the input file.", | |
| 60 ) | |
| 51 | 61 | 
| 52 parser.add_argument("-c", "--cluster", dest="cluster_image", | 62 parser.add_argument( | 
| 53 help="Path to the output cluster image.") | 63 "-c", | 
| 64 "--cluster", | |
| 65 dest="cluster_image", | |
| 66 help="Path to the output cluster image.", | |
| 67 ) | |
| 54 | 68 | 
| 55 parser.add_argument("-s", "--smatrix", dest="similarity_matrix", | 69 parser.add_argument( | 
| 56 help="Path to the similarity matrix output file.") | 70 "-s", | 
| 71 "--smatrix", | |
| 72 dest="similarity_matrix", | |
| 73 help="Path to the similarity matrix output file.", | |
| 74 ) | |
| 57 | 75 | 
| 58 parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", | 76 parser.add_argument( | 
| 59 type=float, default=0.0, | 77 "-t", | 
| 60 help="Tanimoto threshold [0.0]") | 78 "--threshold", | 
| 79 dest="tanimoto_threshold", | |
| 80 type=float, | |
| 81 default=0.0, | |
| 82 help="Tanimoto threshold [0.0]", | |
| 83 ) | |
| 61 | 84 | 
| 62 parser.add_argument("--oformat", default='png', help="Output format (png, svg)") | 85 parser.add_argument("--oformat", default="png", help="Output format (png, svg)") | 
| 63 | 86 | 
| 64 parser.add_argument('-p', '--processors', type=int, | 87 parser.add_argument("-p", "--processors", type=int, default=4) | 
| 65 default=4) | |
| 66 | 88 | 
| 67 args = parser.parse_args() | 89 args = parser.parse_args() | 
| 68 | 90 | 
| 69 targets = chemfp.open( args.input_path, format='fps' ) | 91 targets = chemfp.open(args.input_path, format="fps") | 
| 70 arena = chemfp.load_fingerprints( targets ) | 92 arena = chemfp.load_fingerprints(targets) | 
| 71 distances = distance_matrix( arena, args.tanimoto_threshold ) | 93 distances = distance_matrix(arena, args.tanimoto_threshold) | 
| 72 | 94 | 
| 73 if args.similarity_matrix: | 95 if args.similarity_matrix: | 
| 74 numpy.savetxt(args.similarity_matrix, distances) | 96 numpy.savetxt(args.similarity_matrix, distances) | 
| 75 | 97 | 
| 76 if args.cluster_image: | 98 if args.cluster_image: | 
| 77 linkage = hcluster.linkage(distances, method="single", metric="euclidean") | 99 linkage = hcluster.linkage(distances, method="single", metric="euclidean") | 
| 78 hcluster.dendrogram(linkage, labels=arena.ids, leaf_rotation=90.) | 100 hcluster.dendrogram(linkage, labels=arena.ids, leaf_rotation=90.0) | 
| 79 pylab.savefig(args.cluster_image, format=args.oformat) | 101 pylab.savefig(args.cluster_image, format=args.oformat) | 
| 80 | 
