| 0 | 1 #!/usr/bin/env python | 
|  | 2 """ | 
|  | 3     Modified version of code examples from the chemfp project. | 
|  | 4     http://code.google.com/p/chem-fingerprints/ | 
|  | 5     Thanks to Andrew Dalke of Andrew Dalke Scientific! | 
|  | 6 """ | 
|  | 7 | 
|  | 8 import chemfp | 
|  | 9 import sys | 
|  | 10 import os | 
|  | 11 import tempfile | 
|  | 12 | 
# chemfp is handed a path that ends in ".fps" instead of the raw input path
# (presumably because it selects the file format from the extension -- TODO
# confirm against the chemfp documentation).  The NamedTemporaryFile is used
# only to obtain a unique path name; closing it deletes the placeholder file.
temp_file = tempfile.NamedTemporaryFile()
temp_link = "%s.%s" % (temp_file.name, 'fps')
temp_file.close()
# Create the symlink directly rather than shelling out to `ln -s`: no shell
# is involved, so paths containing spaces or shell metacharacters work, and a
# failure raises OSError here instead of being silently ignored.
os.symlink(os.path.realpath(sys.argv[1]), temp_link)


# Command line: <fingerprint file> <tanimoto threshold> <output file> <threads>
chemfp_fingerprint_file = temp_link
tanimoto_threshold = float(sys.argv[2])
outfile = sys.argv[3]
processors = int(sys.argv[4])
|  | 23 | 
|  | 24 | 
def get_hit_indicies(hits):
    """Return the first element of every (index, score) pair in *hits*.

    The scores are discarded; the order of *hits* is preserved.
    """
    return [hit_index for hit_index, _score in hits]
|  | 27 | 
# The output file is opened up front; it is written to and closed further down.
out = open(outfile, 'w')
dataset = chemfp.load_fingerprints(chemfp_fingerprint_file)

# Search the dataset against itself, keeping every pair whose Tanimoto
# similarity reaches the requested threshold.
chemfp.set_num_threads(processors)
search = dataset.threshold_tanimoto_search_arena(dataset, threshold=tanimoto_threshold)
# Alternative kept for reference:
#search = chemfp.search.threshold_tanimoto_search_symmetric (dataset, threshold = tanimoto_threshold)

# Build one (hit count, fingerprint index, hits) record per fingerprint and
# order them so the candidate centroid with the most hits comes first (hence
# the reverse sort).  Ties on hit count are broken by the fingerprint index,
# which is arbitrary but deterministic; because the index is unique the hits
# objects themselves are never compared.
results = [
    (len(hits), i, hits)
    for (i, hits) in enumerate(search.iter_indices_and_scores())
]
results.sort(reverse=True)
|  | 39 | 
|  | 40 | 
# Determine the true/false singletons and the clusters.
#
# `results` is walked from the fingerprint with the most hits downwards.
# Each fingerprint that has not yet been claimed becomes a candidate centroid:
#   * true singleton  - its only hit is itself (size == 1)
#   * false singleton - it has other hits, but all were already claimed
#   * cluster         - it claims every hit that is still unassigned
true_singletons = []
false_singletons = []
clusters = []

# Indices of fingerprints already used as a centroid or assigned to a cluster.
seen = set()

for (size, fp_idx, hits) in results:
    if fp_idx in seen:
        # Can't use a centroid which is already assigned
        continue
    seen.add(fp_idx)
    # Trace line on stdout.  Written as a single formatted string so the
    # statement is valid on both Python 2 and Python 3 and produces the same
    # text as the former `print size, fp_idx, hits` statement.
    print("%s %s %s" % (size, fp_idx, hits))
    if size == 1:
        # The only fingerprint in the exclusion sphere is itself
        true_singletons.append(fp_idx)
        continue

    members = get_hit_indicies(hits)
    # Figure out which ones haven't yet been assigned.  The centroid itself
    # was added to `seen` above, so it never appears in its own member list.
    unassigned = [target_idx for target_idx in members if target_idx not in seen]

    if not unassigned:
        # Every neighbour already belongs to another cluster.
        false_singletons.append(fp_idx)
        continue

    # this is a new cluster
    clusters.append((fp_idx, unassigned))
    seen.update(unassigned)
|  | 70 | 
len_cluster = len(clusters)
# Verbose variants that also list the singleton ids, kept for reference:
#out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(dataset.ids[idx] for idx in true_singletons)) ) )
#out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(dataset.ids[idx] for idx in false_singletons)) ) )

# Three '#'-prefixed summary lines: singleton counts and the cluster count.
out.writelines([
    "#%s true singletons\n" % len(true_singletons),
    "#%s false singletons\n" % len(false_singletons),
    "#clusters: %s\n" % len_cluster,
])
|  | 78 | 
def cluster_sort_key(cluster):
    """Sort key for a (centroid index, member list) cluster tuple.

    Clusters with more members sort first (the member count is negated);
    clusters of equal size are ordered by the centroid's id as looked up in
    the module-level ``dataset.ids``.
    """
    centroid, assigned = cluster
    centroid_id = dataset.ids[centroid]
    return (-len(assigned), centroid_id)
|  | 84 | 
# Write the result rows, then release the output file and the temporary
# symlink.  The cleanup runs in `finally` so neither the file handle nor the
# symlink is leaked when sorting, an id lookup or a write fails part-way.
try:
    # Largest cluster first, ties broken by centroid id (see cluster_sort_key).
    clusters.sort(key=cluster_sort_key)

    # One row per cluster: centroid id, member count, space-separated member ids.
    for centroid_idx, members in clusters:
        centroid_name = dataset.ids[centroid_idx]
        out.write("%s\t%s\t%s\n" % (centroid_name, len(members), " ".join(sorted(dataset.ids[idx] for idx in members))))
        #ToDo: len(members) need to be some biggest top 90% or something ...

    # True singletons are listed last, each with a member count of 0.
    # NOTE(review): false singletons are counted in the header but get no row
    # here -- confirm whether that omission is intentional.
    for idx in sorted(true_singletons):
        out.write("%s\t%s\n" % (dataset.ids[idx], 0))
finally:
    try:
        out.close()
    finally:
        # Remove the temporary ".fps" symlink even if closing the file failed.
        os.remove(temp_link)