Mercurial > repos > bgruening > chemfp
comparison chemfp_clustering/old/butina_clustering_old.py @ 0:354d3c6bb894 draft
Uploaded
| author | bgruening |
|---|---|
| date | Thu, 15 Aug 2013 03:27:06 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:354d3c6bb894 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 Modified version of code examples from the chemfp project. | |
| 4 http://code.google.com/p/chem-fingerprints/ | |
| 5 Thanks to Andrew Dalke of Andrew Dalke Scientific! | |
| 6 """ | |
| 7 | |
| 8 import chemfp | |
| 9 import sys | |
| 10 import os | |
| 11 import tempfile | |
| 12 | |
# Reserve a unique temporary path, then release the file again so its name
# (plus an ".fps" suffix) can be used for a symlink to the input file.
# NOTE(review): presumably the ".fps" suffix lets chemfp recognise the
# fingerprint format from the file name -- confirm against the chemfp docs.
temp_file = tempfile.NamedTemporaryFile()
temp_link = "%s.%s" % (temp_file.name, 'fps')
temp_file.close()
# Create the link directly instead of shelling out to `ln -s`: the input path
# comes from the command line, so spaces or shell metacharacters in it must
# not be interpreted by a shell.  A failure now raises OSError instead of
# being silently ignored.
os.symlink(os.path.realpath(sys.argv[1]), temp_link)


# Command line: <fingerprint file> <tanimoto threshold> <output file> <threads>
chemfp_fingerprint_file = temp_link
tanimoto_threshold = float(sys.argv[2])
outfile = sys.argv[3]
processors = int(sys.argv[4])
| 23 | |
| 24 | |
def get_hit_indicies(hits):
    """Return the target indices from an iterable of (index, score) pairs.

    The scores are discarded and the input order is preserved.
    """
    # Avoid naming the loop variable `id`, which would shadow the builtin.
    return [target_idx for (target_idx, _score) in hits]
| 27 | |
# Open the report file, then load the fingerprints through the ".fps" link.
out = open(outfile, 'w')
dataset = chemfp.load_fingerprints(chemfp_fingerprint_file)

# Search the dataset against itself for all pairs at or above the threshold,
# using the requested number of threads.
chemfp.set_num_threads(processors)
search = dataset.threshold_tanimoto_search_arena(dataset, threshold=tanimoto_threshold)
#search = chemfp.search.threshold_tanimoto_search_symmetric (dataset, threshold = tanimoto_threshold)

# Build one (hit count, fingerprint index, hits) record per fingerprint and
# order them so the candidate centroid with the most hits comes first
# (descending sort).  Ties on hit count are broken, arbitrarily, by the
# fingerprint index.
results = list(
    (len(hits), fp_index, hits)
    for (fp_index, hits) in enumerate(search.iter_indices_and_scores())
)
results.sort(reverse=True)
| 39 | |
| 40 | |
# Determine the true/false singletons and the clusters.
#
# `results` is walked from the candidate with the most hits to the one with
# the fewest.  Each fingerprint ends up in exactly one of:
#   * true_singletons  - its hit list has exactly one entry (itself),
#   * false_singletons - it has other hits, but all of them were already
#                        assigned by the time it was visited,
#   * clusters         - (centroid index, [newly assigned member indices]),
# or it was claimed earlier as a member of another centroid's cluster.
true_singletons = []
false_singletons = []
clusters = []

# Indices that are already a centroid, a singleton or a cluster member.
seen = set()

for (size, fp_idx, hits) in results:
    if fp_idx in seen:
        # Can't use a centroid which is already assigned
        continue
    seen.add(fp_idx)
    if size == 1:
        # The only fingerprint in the exclusion sphere is itself
        true_singletons.append(fp_idx)
        continue

    members = get_hit_indicies(hits)
    # Figure out which ones haven't yet been assigned.  fp_idx itself is
    # already in `seen`, so the centroid never appears in its own member list.
    unassigned = [target_idx for target_idx in members if target_idx not in seen]

    if not unassigned:
        false_singletons.append(fp_idx)
        continue

    # this is a new cluster
    clusters.append((fp_idx, unassigned))
    seen.update(unassigned)
| 70 | |
# Summary header: three '#'-prefixed lines giving the number of true
# singletons, false singletons and clusters.  Only the counts are written;
# the singleton ids themselves are not listed in the header.
len_cluster = len(clusters)
out.writelines([
    "#%s true singletons\n" % len(true_singletons),
    "#%s false singletons\n" % len(false_singletons),
    "#clusters: %s\n" % len_cluster,
])
| 78 | |
def cluster_sort_key(cluster):
    """Sort key for a (centroid index, member indices) cluster.

    Orders clusters by descending member count; clusters of equal size are
    ordered by the centroid's id as found in the module-level ``dataset.ids``.
    """
    centroid, assigned = cluster
    # Negate the size so that an ascending sort puts the largest cluster first.
    size_rank = -len(assigned)
    return (size_rank, dataset.ids[centroid])
| 84 | |
# Write the report body and always release the output file and the temporary
# ".fps" symlink, even if sorting or writing fails part-way through.
try:
    clusters.sort(key=cluster_sort_key)

    # One line per cluster: centroid id, number of members, and the
    # space-separated, sorted member ids.
    for centroid_idx, members in clusters:
        centroid_name = dataset.ids[centroid_idx]
        out.write("%s\t%s\t%s\n" % (centroid_name, len(members), " ".join(sorted(dataset.ids[idx] for idx in members))))
        #ToDo: len(members) need to be some biggest top 90% or something ...

    # True singletons are reported as centroids with a member count of 0.
    # NOTE(review): false singletons are counted in the header but are not
    # listed anywhere in the body -- confirm that this is intended.
    for idx in sorted(true_singletons):
        out.write("%s\t%s\n" % (dataset.ids[idx], 0))
finally:
    try:
        out.close()
    finally:
        # Remove the link even if closing the output file raised.
        os.remove(temp_link)
