Mercurial > repos > iuc > clustering_from_distmat
comparison clustering_from_distmat.py @ 0:8192b416f945 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/clustering_from_distmat/ commit a34052b87a2d05cabed5001c50f1bb10e74f97ee
| author | iuc |
|---|---|
| date | Thu, 08 Aug 2024 19:34:36 +0000 |
| parents | |
| children | c0b01c55a0e0 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:8192b416f945 |
|---|---|
| 1 import argparse | |
| 2 import sys | |
| 3 | |
| 4 import scipy | |
| 5 | |
| 6 | |
| 7 def linkage_as_newick(linkage, tip_names): | |
| 8 newick_parts = tip_names[::] | |
| 9 within_cluster_distances = [0] * len(tip_names) | |
| 10 for step in linkage: | |
| 11 n1 = int(step[0]) | |
| 12 n2 = int(step[1]) | |
| 13 d = float(step[2]) | |
| 14 d1 = d - within_cluster_distances[n1] | |
| 15 d2 = d - within_cluster_distances[n2] | |
| 16 id1 = newick_parts[n1] | |
| 17 id2 = newick_parts[n2] | |
| 18 part = f'({id1}:{d1 / 2},{id2}:{d2 / 2})' | |
| 19 within_cluster_distances.append(d) | |
| 20 newick_parts.append(part) | |
| 21 return newick_parts[-1].format(*newick_parts) + ';' | |
| 22 | |
| 23 | |
| 24 if __name__ == "__main__": | |
| 25 parser = argparse.ArgumentParser() | |
| 26 parser.add_argument( | |
| 27 'infile', | |
| 28 help='Distance matrix input file' | |
| 29 ) | |
| 30 parser.add_argument( | |
| 31 'out_prefix', | |
| 32 help="Output prefix" | |
| 33 ) | |
| 34 parser.add_argument | |
| 35 parser.add_argument( | |
| 36 '-m', '--method', default="average", | |
| 37 choices=[ | |
| 38 "single", | |
| 39 "complete", | |
| 40 "average", | |
| 41 "weighted", | |
| 42 "centroid", | |
| 43 "median", | |
| 44 "ward" | |
| 45 ], | |
| 46 help="Clustering method to use" | |
| 47 ) | |
| 48 cut_mode = parser.add_mutually_exclusive_group() | |
| 49 cut_mode.add_argument( | |
| 50 "-n", "--n-clusters", nargs="*", type=int | |
| 51 ) | |
| 52 cut_mode.add_argument( | |
| 53 "--height", nargs="*", type=float | |
| 54 ) | |
| 55 args = parser.parse_args() | |
| 56 | |
| 57 # TO DO: | |
| 58 # - parse outputs to generate | |
| 59 | |
| 60 # read from input and check that | |
| 61 # we have been passed a symmetric distance matrix | |
| 62 with open(args.infile) as i: | |
| 63 col_names = next(i).rstrip("\n\r").split("\t")[1:] | |
| 64 col_count = len(col_names) | |
| 65 if not col_count: | |
| 66 sys.exit( | |
| 67 'No data columns found. ' | |
| 68 'This tool expects tabular input with column names on the first line ' | |
| 69 'and a row name in the first column of each row followed by data columns.' | |
| 70 ) | |
| 71 row_count = 0 | |
| 72 matrix = [] | |
| 73 for line in i: | |
| 74 if not line.strip(): | |
| 75 # skip empty lines | |
| 76 continue | |
| 77 row_count += 1 | |
| 78 if row_count > col_count: | |
| 79 sys.exit( | |
| 80 'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' | |
| 81 'but got more rows than columns.' | |
| 82 ) | |
| 83 row_name, *row_data = line.strip(" \n\r").split("\t") | |
| 84 col_name = col_names[row_count - 1] | |
| 85 if not row_name: | |
| 86 # tolerate omitted row names, use col name instead | |
| 87 row_name = col_name | |
| 88 if row_name != col_name: | |
| 89 sys.exit( | |
| 90 'This tool expects a symmetric distance matrix with identical names for rows and columns, ' | |
| 91 f'but got "{col_name}" in column {row_count} and "{row_name}" on row {row_count}.' | |
| 92 ) | |
| 93 if len(row_data) != col_count: | |
| 94 sys.exit( | |
| 95 'This tool expects a symmetric distance matrix with the same number of columns on each row, ' | |
| 96 f'but row {row_count} ("{row_name}") has {len(row_data)} columns instead of {col_count}.' | |
| 97 ) | |
| 98 try: | |
| 99 matrix.append([float(x) for x in row_data]) | |
| 100 except ValueError as e: | |
| 101 sys.exit(str(e) + f' on row {row_count} ("{row_name}")') | |
| 102 if row_count < col_count: | |
| 103 sys.exit( | |
| 104 'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' | |
| 105 'but got more columns than rows.' | |
| 106 ) | |
| 107 | |
| 108 # turn the distance matrix into "condensed" vector form | |
| 109 # this gives us further checks and raises ValueErrors if: | |
| 110 # - the values on the diagonal aren't zero | |
| 111 # - the upper and lower triangle of the matrix aren't identical | |
| 112 D = scipy.spatial.distance.squareform(matrix) | |
| 113 | |
| 114 # perform the requested clustering and retrieve the result as a linkage object | |
| 115 linkage = scipy.cluster.hierarchy.linkage(D, args.method) | |
| 116 | |
| 117 with open(args.out_prefix + '.tree.newick', 'w') as o: | |
| 118 o.write(linkage_as_newick(linkage, col_names)) | |
| 119 | |
| 120 # cut the tree as specified and report sample to cluster assignments | |
| 121 if args.n_clusters or args.height: | |
| 122 if args.n_clusters: | |
| 123 cut_values = args.n_clusters | |
| 124 colname_template = "cluster_id_n{}" | |
| 125 else: | |
| 126 cut_values = args.height | |
| 127 colname_template = "cluster_id_h{}" | |
| 128 header_cols = ["sample"] + [ | |
| 129 colname_template.format(x) for x in cut_values | |
| 130 ] | |
| 131 cluster_assignments = [] | |
| 132 for name, cluster_ids in zip( | |
| 133 col_names, | |
| 134 scipy.cluster.hierarchy.cut_tree( | |
| 135 linkage, | |
| 136 args.n_clusters, | |
| 137 args.height | |
| 138 ) | |
| 139 ): | |
| 140 cluster_assignments.append( | |
| 141 [name] | |
| 142 + [str(c + 1) for c in cluster_ids] | |
| 143 ) | |
| 144 with open(args.out_prefix + '.cluster_assignments.tsv', 'w') as o: | |
| 145 print("\t".join(header_cols), file=o) | |
| 146 for ass in cluster_assignments: | |
| 147 print("\t".join(ass), file=o) |
