Mercurial > repos > iuc > clustering_from_distmat
comparison clustering_from_distmat.py @ 1:c0b01c55a0e0 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/clustering_from_distmat/ commit 65b5c6f177478883ce664aeb6f27d0bec7155fdc
| author | iuc |
|---|---|
| date | Mon, 19 Aug 2024 15:33:16 +0000 |
| parents | 8192b416f945 |
| children |
comparison
equal
deleted
inserted
replaced
| 0:8192b416f945 | 1:c0b01c55a0e0 |
|---|---|
| 1 import argparse | 1 import argparse |
| 2 import sys | 2 import sys |
| 3 from collections import Counter | |
| 3 | 4 |
| 4 import scipy | 5 import scipy |
| 5 | 6 |
| 6 | 7 |
| 7 def linkage_as_newick(linkage, tip_names): | 8 def linkage_as_newick(linkage, tip_names): |
| 43 "median", | 44 "median", |
| 44 "ward" | 45 "ward" |
| 45 ], | 46 ], |
| 46 help="Clustering method to use" | 47 help="Clustering method to use" |
| 47 ) | 48 ) |
| 49 missing_names = parser.add_mutually_exclusive_group() | |
| 50 missing_names.add_argument( | |
| 51 "--nc", "--no-colnames", action="store_true", | |
| 52 help="Indicate that the distance matrix input does not feature column names" | |
| 53 ) | |
| 54 missing_names.add_argument( | |
| 55 "--nr", "--no-rownames", action="store_true", | |
| 56 help="Indicate that the distance matrix input does not feature row names" | |
| 57 ) | |
| 48 cut_mode = parser.add_mutually_exclusive_group() | 58 cut_mode = parser.add_mutually_exclusive_group() |
| 49 cut_mode.add_argument( | 59 cut_mode.add_argument( |
| 50 "-n", "--n-clusters", nargs="*", type=int | 60 "-n", "--n-clusters", nargs="*", type=int |
| 51 ) | 61 ) |
| 52 cut_mode.add_argument( | 62 cut_mode.add_argument( |
| 53 "--height", nargs="*", type=float | 63 "--height", nargs="*", type=float |
| 54 ) | 64 ) |
| 65 parser.add_argument("-s", "--min-cluster-size", type=int, default=2) | |
| 55 args = parser.parse_args() | 66 args = parser.parse_args() |
| 56 | |
| 57 # TO DO: | |
| 58 # - parse outputs to generate | |
| 59 | 67 |
| 60 # read from input and check that | 68 # read from input and check that |
| 61 # we have been passed a symmetric distance matrix | 69 # we have been passed a symmetric distance matrix |
| 62 with open(args.infile) as i: | 70 with open(args.infile) as i: |
| 63 col_names = next(i).rstrip("\n\r").split("\t")[1:] | 71 col_count = None |
| 64 col_count = len(col_names) | |
| 65 if not col_count: | |
| 66 sys.exit( | |
| 67 'No data columns found. ' | |
| 68 'This tool expects tabular input with column names on the first line ' | |
| 69 'and a row name in the first column of each row followed by data columns.' | |
| 70 ) | |
| 71 row_count = 0 | 72 row_count = 0 |
| 72 matrix = [] | 73 matrix = [] |
| 74 if args.nc: | |
| 75 col_names = col_count = None | |
| 76 else: | |
| 77 while True: | |
| 78 # skip leading empty lines | |
| 79 line = next(i).rstrip("\n\r") | |
| 80 if line: | |
| 81 break | |
| 82 if args.nr: | |
| 83 col_names = line.split("\t") | |
| 84 else: | |
| 85 # first column is for row names, rest are column names | |
| 86 col_names = line.split("\t")[1:] | |
| 87 col_count = len(col_names) | |
| 88 if not col_count: | |
| 89 sys.exit( | |
| 90 'No data columns found. ' | |
| 91 'By default, this tool expects tabular input with column names on the first line ' | |
| 92 'and a row name in the first column of each row followed by data columns. ' | |
| 93 'Use --no-colnames or --no-rownames to modify the expected format.' | |
| 94 ) | |
| 73 for line in i: | 95 for line in i: |
| 74 if not line.strip(): | 96 if not line.strip(): |
| 75 # skip empty lines | 97 # skip empty lines |
| 76 continue | 98 continue |
| 77 row_count += 1 | 99 row_count += 1 |
| 78 if row_count > col_count: | 100 if col_count is not None and row_count > col_count: |
| 79 sys.exit( | 101 sys.exit( |
| 80 'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' | 102 'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' |
| 81 'but got more rows than columns.' | 103 'but got more rows than columns.' |
| 82 ) | 104 ) |
| 83 row_name, *row_data = line.strip(" \n\r").split("\t") | 105 if args.nr: |
| 106 row_name = None | |
| 107 row_data = line.rstrip("\n\r").split("\t") | |
| 108 else: | |
| 109 row_name, *row_data = line.rstrip("\n\r").split("\t") | |
| 110 if col_count is None: | |
| 111 col_count = len(row_data) | |
| 112 col_names = [None] * col_count | |
| 84 col_name = col_names[row_count - 1] | 113 col_name = col_names[row_count - 1] |
| 85 if not row_name: | 114 if not row_name and col_name: |
| 86 # tolerate omitted row names, use col name instead | 115 # tolerate omitted row names, use col name instead |
| 87 row_name = col_name | 116 row_name = col_name |
| 117 elif row_name and not col_name: | |
| 118 # likewise for column names | |
| 119 # plus update list of col names with row name | |
| 120 col_name = col_names[row_count - 1] = row_name | |
| 121 elif not row_name and not col_name: | |
| 122 sys.exit( | |
| 123 'Each sample in the distance matrix must have its name specified via a row name, a column name, or both, ' | |
| 124 f'but found no name for sample number {row_count}' | |
| 125 ) | |
| 88 if row_name != col_name: | 126 if row_name != col_name: |
| 89 sys.exit( | 127 sys.exit( |
| 90 'This tool expects a symmetric distance matrix with identical names for rows and columns, ' | 128 'This tool expects a symmetric distance matrix with identical names for rows and columns, ' |
| 91 f'but got "{col_name}" in column {row_count} and "{row_name}" on row {row_count}.' | 129 f'but got "{col_name}" in column {row_count} and "{row_name}" on row {row_count}.' |
| 92 ) | 130 ) |
| 96 f'but row {row_count} ("{row_name}") has {len(row_data)} columns instead of {col_count}.' | 134 f'but row {row_count} ("{row_name}") has {len(row_data)} columns instead of {col_count}.' |
| 97 ) | 135 ) |
| 98 try: | 136 try: |
| 99 matrix.append([float(x) for x in row_data]) | 137 matrix.append([float(x) for x in row_data]) |
| 100 except ValueError as e: | 138 except ValueError as e: |
| 101 sys.exit(str(e) + f' on row {row_count} ("{row_name}")') | 139 if args.nr: |
| 140 sys.exit(str(e) + f' on row {row_count}') | |
| 141 else: | |
| 142 sys.exit(str(e) + f' on row {row_count} ("{row_name}")') | |
| 102 if row_count < col_count: | 143 if row_count < col_count: |
| 103 sys.exit( | 144 sys.exit( |
| 104 'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' | 145 'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' |
| 105 'but got more columns than rows.' | 146 'but got more columns than rows.' |
| 106 ) | 147 ) |
| 126 cut_values = args.height | 167 cut_values = args.height |
| 127 colname_template = "cluster_id_h{}" | 168 colname_template = "cluster_id_h{}" |
| 128 header_cols = ["sample"] + [ | 169 header_cols = ["sample"] + [ |
| 129 colname_template.format(x) for x in cut_values | 170 colname_template.format(x) for x in cut_values |
| 130 ] | 171 ] |
| 172 cut_result = scipy.cluster.hierarchy.cut_tree( | |
| 173 linkage, | |
| 174 args.n_clusters, | |
| 175 args.height | |
| 176 ) | |
| 177 | |
| 178 # Go through the cut results once to determine cluster sizes | |
| 179 | |
| 180 # In the final report, the ids of clusters with fewer members than | |
| 181 # args.min_cluster_size will be masked with "-". | |
| 182 # The remaining cluster ids will be renumbered to start fom 1. | |
| 183 # This has to be done for each clustering resulting from the | |
| 184 # user-specified cut_values. | |
| 185 cluster_member_counts = [Counter() for _ in cut_values] | |
| 186 effective_cluster_ids = [{} for _ in cut_values] | |
| 187 for cluster_ids in cut_result: | |
| 188 for cl_count, cl_id, eff_id in zip(cluster_member_counts, cluster_ids, effective_cluster_ids): | |
| 189 cl_count[cl_id] += 1 | |
| 190 for counter, eff_ids in zip(cluster_member_counts, effective_cluster_ids): | |
| 191 eff_id = 1 | |
| 192 for item, count in counter.items(): | |
| 193 # Since Python 3.7, Counter objects (like dicts) preserve | |
| 194 # insertion order so we can be sure that in the mapping | |
| 195 # constructed below, clusters will get renumbered in | |
| 196 # the order they will be reported later. | |
| 197 if count >= args.min_cluster_size: | |
| 198 eff_ids[item] = str(eff_id) | |
| 199 eff_id += 1 | |
| 200 else: | |
| 201 eff_ids[item] = "-" | |
| 202 | |
| 203 # build and write the cluster assignment report | |
| 204 # with remapped cluster ids | |
| 131 cluster_assignments = [] | 205 cluster_assignments = [] |
| 132 for name, cluster_ids in zip( | 206 for name, cluster_ids in zip(col_names, cut_result): |
| 133 col_names, | |
| 134 scipy.cluster.hierarchy.cut_tree( | |
| 135 linkage, | |
| 136 args.n_clusters, | |
| 137 args.height | |
| 138 ) | |
| 139 ): | |
| 140 cluster_assignments.append( | 207 cluster_assignments.append( |
| 141 [name] | 208 [name] |
| 142 + [str(c + 1) for c in cluster_ids] | 209 + [ |
| 210 eff_ids[c] | |
| 211 for c, eff_ids in zip(cluster_ids, effective_cluster_ids) | |
| 212 ] | |
| 143 ) | 213 ) |
| 144 with open(args.out_prefix + '.cluster_assignments.tsv', 'w') as o: | 214 with open(args.out_prefix + '.cluster_assignments.tsv', 'w') as o: |
| 145 print("\t".join(header_cols), file=o) | 215 print("\t".join(header_cols), file=o) |
| 146 for ass in cluster_assignments: | 216 for ass in cluster_assignments: |
| 147 print("\t".join(ass), file=o) | 217 print("\t".join(ass), file=o) |
