| 
0
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
"""
    Modified version of code examples from the chemfp project.
    http://code.google.com/p/chem-fingerprints/
    Thanks to Andrew Dalke of Andrew Dalke Scientific!
"""
 | 
| 
 | 
     7 
 | 
| 
 | 
     8 import chemfp
 | 
| 
 | 
     9 import sys
 | 
| 
 | 
    10 import os
 | 
| 
 | 
    11 import tempfile
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 temp_file = tempfile.NamedTemporaryFile()
 | 
| 
 | 
    14 temp_link = "%s.%s" % (temp_file.name, 'fps')
 | 
| 
 | 
    15 temp_file.close()
 | 
| 
 | 
    16 os.system('ln -s %s %s' % (os.path.realpath(sys.argv[1]), temp_link) )
 | 
| 
 | 
    17 
 | 
| 
 | 
    18 
 | 
| 
 | 
    19 chemfp_fingerprint_file = temp_link
 | 
| 
 | 
    20 tanimoto_threshold = float(sys.argv[2])
 | 
| 
 | 
    21 outfile = sys.argv[3]
 | 
| 
 | 
    22 processors = int(sys.argv[4])
 | 
| 
 | 
    23 
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 def get_hit_indicies(hits):
 | 
| 
 | 
    26     return [id for (id, score) in hits]
 | 
| 
 | 
    27 
 | 
| 
 | 
    28 out = open(outfile, 'w')
 | 
| 
 | 
    29 dataset = chemfp.load_fingerprints( chemfp_fingerprint_file )
 | 
| 
 | 
    30 
 | 
| 
 | 
    31 chemfp.set_num_threads( processors )
 | 
| 
 | 
    32 search = dataset.threshold_tanimoto_search_arena(dataset, threshold = tanimoto_threshold)
 | 
| 
 | 
    33 #search = chemfp.search.threshold_tanimoto_search_symmetric (dataset, threshold = tanimoto_threshold)
 | 
| 
 | 
    34 
 | 
| 
 | 
    35 # Reorder so the centroid with the most hits comes first.
 | 
| 
 | 
    36 # (That's why I do a reverse search.)
 | 
| 
 | 
    37 # Ignore the arbitrariness of breaking ties by fingerprint index
 | 
| 
 | 
    38 results = sorted( (  (len(hits), i, hits) for (i, hits) in enumerate(search.iter_indices_and_scores())  ),reverse=True)
 | 
| 
 | 
    39 
 | 
| 
 | 
    40 
 | 
| 
 | 
    41 # Determine the true/false singletons and the clusters
 | 
| 
 | 
    42 true_singletons = []
 | 
| 
 | 
    43 false_singletons = []
 | 
| 
 | 
    44 clusters = []
 | 
| 
 | 
    45 
 | 
| 
 | 
    46 seen = set()
 | 
| 
 | 
    47 
 | 
| 
 | 
    48 for (size, fp_idx, hits) in results:
 | 
| 
 | 
    49     if fp_idx in seen:
 | 
| 
 | 
    50         # Can't use a centroid which is already assigned
 | 
| 
 | 
    51         continue
 | 
| 
 | 
    52     seen.add(fp_idx)
 | 
| 
 | 
    53     print size, fp_idx, hits
 | 
| 
 | 
    54     if size == 1:
 | 
| 
 | 
    55         # The only fingerprint in the exclusion sphere is itself
 | 
| 
 | 
    56         true_singletons.append(fp_idx)
 | 
| 
 | 
    57         continue
 | 
| 
 | 
    58 
 | 
| 
 | 
    59     members = get_hit_indicies(hits)
 | 
| 
 | 
    60     # Figure out which ones haven't yet been assigned
 | 
| 
 | 
    61     unassigned = [target_idx for target_idx in members if target_idx not in seen]
 | 
| 
 | 
    62 
 | 
| 
 | 
    63     if not unassigned:
 | 
| 
 | 
    64         false_singletons.append(fp_idx)
 | 
| 
 | 
    65         continue
 | 
| 
 | 
    66 
 | 
| 
 | 
    67     # this is a new cluster
 | 
| 
 | 
    68     clusters.append( (fp_idx, unassigned) )
 | 
| 
 | 
    69     seen.update(unassigned)
 | 
| 
 | 
    70 
 | 
| 
 | 
    71 len_cluster = len(clusters)
 | 
| 
 | 
    72 #out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(dataset.ids[idx] for idx in true_singletons)) ) )
 | 
| 
 | 
    73 #out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(dataset.ids[idx] for idx in false_singletons)) ) )
 | 
| 
 | 
    74 
 | 
| 
 | 
    75 out.write( "#%s true singletons\n" % len(true_singletons) )
 | 
| 
 | 
    76 out.write( "#%s false singletons\n" % len(false_singletons) )
 | 
| 
 | 
    77 out.write( "#clusters: %s\n" % len_cluster )
 | 
| 
 | 
    78 
 | 
| 
 | 
    79 # Sort so the cluster with the most compounds comes first,
 | 
| 
 | 
    80 # then by alphabetically smallest id
 | 
| 
 | 
    81 def cluster_sort_key(cluster):
 | 
| 
 | 
    82     centroid_idx, members = cluster
 | 
| 
 | 
    83     return -len(members), dataset.ids[centroid_idx]
 | 
| 
 | 
    84 
 | 
| 
 | 
    85 clusters.sort(key=cluster_sort_key)
 | 
| 
 | 
    86 
 | 
| 
 | 
    87 
 | 
| 
 | 
    88 for centroid_idx, members in clusters:
 | 
| 
 | 
    89     centroid_name = dataset.ids[centroid_idx]
 | 
| 
 | 
    90     out.write("%s\t%s\t%s\n" % (centroid_name, len(members), " ".join(sorted(dataset.ids[idx] for idx in members))))
 | 
| 
 | 
    91     #ToDo: len(members) need to be some biggest top 90% or something ...
 | 
| 
 | 
    92 
 | 
| 
 | 
    93 for idx in sorted(true_singletons):
 | 
| 
 | 
    94     out.write("%s\t%s\n" % (dataset.ids[idx], 0))
 | 
| 
 | 
    95 
 | 
| 
 | 
    96 out.close()
 | 
| 
 | 
    97 os.remove( temp_link )
 |