Mercurial > repos > bgruening > chemfp
comparison chemfp_clustering/butina_clustering.py @ 0:354d3c6bb894 draft
Uploaded
| author | bgruening |
|---|---|
| date | Thu, 15 Aug 2013 03:27:06 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:354d3c6bb894 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 Modified version of code examples from the chemfp project. | |
| 4 http://code.google.com/p/chem-fingerprints/ | |
| 5 Thanks to Andrew Dalke of Andrew Dalke Scientific! | |
| 6 """ | |
| 7 | |
| 8 import chemfp | |
| 9 import sys | |
| 10 import os | |
| 11 import tempfile | |
| 12 import argparse | |
| 13 import subprocess | |
| 14 from chemfp import search | |
| 15 | |
| 16 def unix_sort(results): | |
| 17 temp_unsorted = tempfile.NamedTemporaryFile(delete=False) | |
| 18 for (i,indices) in enumerate( results.iter_indices() ): | |
| 19 temp_unsorted.write('%s %s\n' % (len(indices), i)) | |
| 20 temp_unsorted.close() | |
| 21 temp_sorted = tempfile.NamedTemporaryFile(delete=False) | |
| 22 temp_sorted.close() | |
| 23 p = subprocess.Popen(['sort', '-n', '-r', '-k', '1,1'], stdin=open(temp_unsorted.name), stdout=open(temp_sorted.name, 'w+')) | |
| 24 stdout, stderr = p.communicate() | |
| 25 return_code = p.returncode | |
| 26 | |
| 27 if return_code: | |
| 28 sys.stdout.write(stdout) | |
| 29 sys.stderr.write(stderr) | |
| 30 sys.stderr.write("Return error code %i from command:\n" % return_code) | |
| 31 temp_sorted.close() | |
| 32 os.remove(temp_unsorted.name) | |
| 33 | |
| 34 for line in open(temp_sorted.name): | |
| 35 size, fp_idx = line.strip().split() | |
| 36 yield (int(size), int(fp_idx)) | |
| 37 | |
| 38 os.remove(temp_sorted.name) | |
| 39 | |
| 40 def butina( args ): | |
| 41 """ | |
| 42 Taylor-Butina clustering from the chemfp help. | |
| 43 """ | |
| 44 out = args.output_path | |
| 45 targets = chemfp.open( args.input_path, format='fps' ) | |
| 46 arena = chemfp.load_fingerprints( targets ) | |
| 47 | |
| 48 chemfp.set_num_threads( args.processors ) | |
| 49 results = search.threshold_tanimoto_search_symmetric(arena, threshold = args.tanimoto_threshold) | |
| 50 results.reorder_all("move-closest-first") | |
| 51 | |
| 52 sorted_ids = unix_sort(results) | |
| 53 | |
| 54 # Determine the true/false singletons and the clusters | |
| 55 true_singletons = [] | |
| 56 false_singletons = [] | |
| 57 clusters = [] | |
| 58 | |
| 59 seen = set() | |
| 60 #for (size, fp_idx, members) in results: | |
| 61 for (size, fp_idx) in sorted_ids: | |
| 62 members = results[fp_idx].get_indices() | |
| 63 #print arena.ids[ fp_idx ], [arena.ids[ m ] for m in members] | |
| 64 if fp_idx in seen: | |
| 65 # Can't use a centroid which is already assigned | |
| 66 continue | |
| 67 seen.add(fp_idx) | |
| 68 | |
| 69 if size == 0: | |
| 70 # The only fingerprint in the exclusion sphere is itself | |
| 71 true_singletons.append( fp_idx ) | |
| 72 continue | |
| 73 | |
| 74 # Figure out which ones haven't yet been assigned | |
| 75 unassigned = set(members) - seen | |
| 76 | |
| 77 if not unassigned: | |
| 78 false_singletons.append(fp_idx) | |
| 79 continue | |
| 80 | |
| 81 # this is a new cluster | |
| 82 clusters.append( (fp_idx, unassigned) ) | |
| 83 seen.update(unassigned) | |
| 84 | |
| 85 len_cluster = len(clusters) | |
| 86 #out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(arena.ids[idx] for idx in true_singletons)) ) ) | |
| 87 #out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(arena.ids[idx] for idx in false_singletons)) ) ) | |
| 88 | |
| 89 out.write( "#%s true singletons\n" % len(true_singletons) ) | |
| 90 out.write( "#%s false singletons\n" % len(false_singletons) ) | |
| 91 out.write( "#clusters: %s\n" % len_cluster ) | |
| 92 | |
| 93 # Sort so the cluster with the most compounds comes first, | |
| 94 # then by alphabetically smallest id | |
| 95 def cluster_sort_key(cluster): | |
| 96 centroid_idx, members = cluster | |
| 97 return -len(members), arena.ids[centroid_idx] | |
| 98 | |
| 99 clusters.sort(key=cluster_sort_key) | |
| 100 | |
| 101 for centroid_idx, members in clusters: | |
| 102 centroid_name = arena.ids[centroid_idx] | |
| 103 out.write("%s\t%s\t%s\n" % (centroid_name, len(members), " ".join(arena.ids[idx] for idx in members))) | |
| 104 #ToDo: len(members) need to be some biggest top 90% or something ... | |
| 105 | |
| 106 for idx in true_singletons: | |
| 107 out.write("%s\t%s\n" % (arena.ids[idx], 0)) | |
| 108 | |
| 109 out.close() | |
| 110 | |
| 111 | |
| 112 if __name__ == "__main__": | |
| 113 parser = argparse.ArgumentParser(description="""Taylor-Butina clustering for fps files. | |
| 114 For more details please see the original publication or the chemfp documentation: | |
| 115 http://www.chemomine.co.uk/dbclus-paper.pdf | |
| 116 https://chemfp.readthedocs.org | |
| 117 """) | |
| 118 | |
| 119 parser.add_argument("-i", "--input", dest="input_path", | |
| 120 required=True, | |
| 121 help="Path to the input file.") | |
| 122 | |
| 123 parser.add_argument("-o", "--output", dest="output_path", type=argparse.FileType('w'), | |
| 124 default=sys.stdout, | |
| 125 help="Path to the output file.") | |
| 126 | |
| 127 parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", type=float, | |
| 128 default=0.8, | |
| 129 help="Tanimoto threshold [0.8]") | |
| 130 | |
| 131 parser.add_argument('-p', '--processors', type=int, | |
| 132 default=4) | |
| 133 | |
| 134 options = parser.parse_args() | |
| 135 butina( options ) |
