Mercurial > repos > bgruening > flexynesis_utils
comparison flexynesis_utils.xml @ 0:1fea022f106e draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit b2463fb68d0ae54864d87718ee72f5e063aa4587
| author | bgruening |
|---|---|
| date | Tue, 24 Jun 2025 05:56:08 +0000 |
| parents | |
| children | 35f41ee7ca20 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1fea022f106e |
|---|---|
| 1 <tool id="flexynesis_utils" name="Flexynesis utils" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> | |
| 2 <description>Utility functions for Flexynesis</description> | |
| 3 <macros> | |
| 4 <import>macros.xml</import> | |
| 5 </macros> | |
| 6 <expand macro="requirements"/> | |
| 7 <required_files> | |
| 8 <include path="flexynesis_plot.py" /> | |
| 9 <include path="flexynesis_utils.py" /> | |
| 10 </required_files> | |
| 11 <command detect_errors="exit_code"><![CDATA[ | |
| 12 @CHECK_NON_COMMERCIAL_USE@ | |
| 13 mkdir -p inputs/ output/ && | |
| 14 #if $utils_conditional.util != "compute_ami_ari" and $utils_conditional.util != "split_data" and $utils_conditional.util != "binarize": | |
| 15 ln -s '$utils_conditional.X' 'inputs/$utils_conditional.X.element_identifier.$utils_conditional.X.ext' && | |
| 16 #end if | |
| 17 #if $utils_conditional.util != "split_data" and $utils_conditional.util != "binarize": | |
| 18 ln -s '$utils_conditional.labels' 'inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext' && | |
| 19 cat '$flexynesis_utils_config' && | |
| 20 python '$flexynesis_utils_config' | |
| 21 #end if | |
| 22 #if $utils_conditional.util == "split_data": | |
| 23 ln -s '$utils_conditional.clin' inputs/clin.csv && | |
| 24 #set $omics_names = [] | |
| 25 #for $omics_file in $utils_conditional.omics: | |
| 26 ln -s '$omics_file' 'inputs/${omics_file.element_identifier}.${omics_file.ext}' && | |
| 27 #silent $omics_names.append('inputs/' + str($omics_file.element_identifier) + '.' + str($omics_file.ext)) | |
| 28 #end for | |
| 29 | |
| 30 python '$__tool_directory__/flexynesis_utils.py' | |
| 31 --util split | |
| 32 --clin inputs/clin.csv | |
| 33 --omics '$(",".join($omics_names))' | |
| 34 --split $utils_conditional.split | |
| 35 --out output | |
| 36 #end if | |
| 37 #if $utils_conditional.util == "binarize": | |
| 38 ln -s '$utils_conditional.mutation' 'inputs/${utils_conditional.mutation.element_identifier}.${utils_conditional.mutation.ext}' && | |
| 39 python '$__tool_directory__/flexynesis_utils.py' | |
| 40 --util binarize | |
| 41 --mutation 'inputs/${utils_conditional.mutation.element_identifier}.${utils_conditional.mutation.ext}' | |
| 42 --gene_idx $utils_conditional.gene_idx | |
| 43 --sample_idx $utils_conditional.sample_idx | |
| 44 --out output | |
| 45 #end if | |
| 46 ]]></command> | |
| 47 <configfiles> | |
| 48 <configfile name="flexynesis_utils_config"><![CDATA[ | |
| 49 import sys | |
| 50 sys.path.append('$__tool_directory__/') | |
| 51 | |
| 52 import numpy as np | |
| 53 import pandas as pd | |
| 54 from flexynesis import ( | |
| 55 louvain_clustering, | |
| 56 get_optimal_clusters, | |
| 57 compute_ami_ari, | |
| 58 k_means_clustering | |
| 59 ) | |
| 60 | |
| 61 from flexynesis_plot import ( | |
| 62 load_omics | |
| 63 ) | |
| 64 | |
| 65 #if $utils_conditional.util == "louvain_clustering": | |
| 66 label_data = load_omics('inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext') | |
| 67 X = load_omics('inputs/$utils_conditional.X.element_identifier.$utils_conditional.X.ext') | |
| 68 | |
| 69 cluster_labels, G, partition = louvain_clustering( | |
| 70 #if $utils_conditional.threshold != "": | |
| 71 threshold=$utils_conditional.threshold, | |
| 72 #end if | |
| 73 #if $utils_conditional.k != "": | |
| 74 k=$utils_conditional.k, | |
| 75 #end if | |
| 76 X=X) | |
| 77 cluster_df = pd.DataFrame(data=cluster_labels, index=X.index, columns=['louvain_cluster']) | |
| 78 label_data = label_data.merge(cluster_df[['louvain_cluster']], left_index=True, right_index=True, how='left') | |
| 79 | |
| 80 output_path = f"output/clustered_labels.csv" | |
| 81 label_data.to_csv(output_path, index=True) | |
| 82 | |
| 83 #else if $utils_conditional.util == "get_optimal_clusters": | |
| 84 label_data = load_omics('inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext') | |
| 85 X = load_omics('inputs/$utils_conditional.X.element_identifier.$utils_conditional.X.ext') | |
| 86 | |
| 87 kmeans_cluster_labels, optimal_k, silhouette_scores = get_optimal_clusters( | |
| 88 data=X, | |
| 89 min_k=$utils_conditional.min_k, | |
| 90 max_k=$utils_conditional.max_k) | |
| 91 | |
| 92 print(f"Optimal number of clusters: {optimal_k}\n") | |
| 93 print(f"Silhouette scores: \n{silhouette_scores}") | |
| 94 | |
| 95 cluster_df = pd.DataFrame(data=kmeans_cluster_labels, index=X.index, columns=['optimal_kmeans_cluster']) | |
| 96 label_data = label_data.merge(cluster_df[['optimal_kmeans_cluster']], left_index=True, right_index=True, how='left') | |
| 97 | |
| 98 output_path = f"output/optimal_clusters_labels.csv" | |
| 99 label_data.to_csv(output_path, index=True) | |
| 100 | |
| 101 #else if $utils_conditional.util == "k_means_clustering": | |
| 102 label_data = load_omics('inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext') | |
| 103 X = load_omics('inputs/$utils_conditional.X.element_identifier.$utils_conditional.X.ext') | |
| 104 | |
| 105 cluster_labels, kmeans = k_means_clustering( | |
| 106 data=X, | |
| 107 k=$utils_conditional.k) | |
| 108 | |
| 109 print(f"{kmeans}") | |
| 110 cluster_df = pd.DataFrame(data=cluster_labels, index=X.index, columns=['kmeans_cluster']) | |
| 111 label_data = label_data.merge(cluster_df[['kmeans_cluster']], left_index=True, right_index=True, how='left') | |
| 112 | |
| 113 output_path = f"output/kmeans_labels.csv" | |
| 114 label_data.to_csv(output_path, index=True) | |
| 115 | |
| 116 #else if $utils_conditional.util == "compute_ami_ari": | |
| 117 label_data = load_omics('inputs/$utils_conditional.labels.element_identifier.$utils_conditional.labels.ext') | |
| 118 | |
| 119 true_label = label_data.columns[$utils_conditional.true_label-2] | |
| 120 predicted_label = label_data.columns[$utils_conditional.predicted_label-2] | |
| 121 | |
| 122 true_labels = label_data[true_label] | |
| 123 predicted_labels = label_data[predicted_label] | |
| 124 | |
| 125 ami_ari = compute_ami_ari(labels1=true_labels, labels2=predicted_labels) | |
| 126 | |
| 127 print(f"AMI: {ami_ari['ami']}") | |
| 128 print(f"ARI: {ami_ari['ari']}") | |
| 129 | |
| 130 output_path = f"output/ami_ari.txt" | |
| 131 with open(output_path, 'w') as f: | |
| 132 f.write(f"AMI: {ami_ari['ami']}\n") | |
| 133 f.write(f"ARI: {ami_ari['ari']}\n") | |
| 134 | |
| 135 #end if | |
| 136 ]]></configfile> | |
| 137 </configfiles> | |
| 138 <inputs> | |
| 139 <expand macro="commercial_use_param"/> | |
| 140 <conditional name="utils_conditional"> | |
| 141 <param name="util" type="select" label="Flexynesis utils"> | |
| 142 <option value="louvain_clustering">Louvain Clustering</option> | |
| 143 <option value="get_optimal_clusters">Get Optimal Clusters</option> | |
| 144 <option value="k_means_clustering">K-Means Clustering</option> | |
| 145 <option value="compute_ami_ari">Compute AMI and ARI</option> | |
| 146 <option value="split_data">Split data to train and test</option> | |
| 147 <option value="binarize">Binarize mutation data</option> | |
| 148 </param> | |
| 149 <when value="louvain_clustering"> | |
| 150 <param argument="--X" type="data" format="tabular,csv" label="Matrix" help="Input matrix, (samples, features)"/> | |
| 151 <expand macro="plots_common_input"/> | |
| 152 <param argument="--threshold" type="float" min="0" optional="true" label="Distance threshold to create an edge between two nodes"/> | |
| 153 <param argument="--k" type="integer" min="0" optional="true" label="Number of nearest neighbors to connect for each node"/> | |
| 154 </when> | |
| 155 <when value="get_optimal_clusters"> <!-- silhouette-based selection needs at least 2 clusters, so both bounds start at 2 --> | |
| 156 <param argument="--X" type="data" format="tabular,csv" label="Matrix" help="Input matrix, (samples, features)"/> | |
| 157 <expand macro="plots_common_input"/> | |
| 158 <param argument="--min_k" type="integer" min="2" value="2" optional="false" label="Minimum number of clusters to try"/> | |
| 159 <param argument="--max_k" type="integer" min="2" value="10" optional="false" label="Maximum number of clusters to try"/> | |
| 160 </when> | |
| 161 <when value="k_means_clustering"> <!-- k is written unconditionally into the generated Python script (k=$utils_conditional.k), so it must always carry a value --> | |
| 162 <param argument="--X" type="data" format="tabular,csv" label="Matrix" help="Input matrix, (samples, features)"/> | |
| 163 <expand macro="plots_common_input"/> | |
| 164 <param argument="--k" type="integer" min="1" value="2" optional="false" label="The number of clusters to form"/> | |
| 165 </when> | |
| 166 <when value="compute_ami_ari"> | |
| 167 <expand macro="plots_common_input"/> | |
| 168 <param name="true_label" type="data_column" data_ref="labels" label="Column name in the labels file to use for the true labels"/> | |
| 169 <param name="predicted_label" type="data_column" data_ref="labels" label="Column name in the labels file to use for the predicted labels"/> | |
| 170 </when> | |
| 171 <when value="split_data"> | |
| 172 <param argument="--clin" type="data" format="csv" optional="false" label="Clinical data" help="Samples in rows"/> | |
| 173 <param argument="--omics" type="data" format="tabular,csv" optional="false" multiple="true" label="Omics data" help="samples in columns"/> | |
| 174 <param argument="--split" type="float" min="0" max="1" value="0.7" label="Training/Test split ratio" help="Proportion of data to use for training (e.g., 0.7 means 70% train, 30% test)"/> | |
| 175 </when> | |
| 176 <when value="binarize"> | |
| 177 <param argument="--mutation" type="data" format="tabular,csv" label="Mutation data" help="Mutation data with both genes and samples in rows"/> | |
| 178 <param argument="--gene_idx" type="data_column" data_ref="mutation" label="Column in the mutation file with genes"/> | |
| 179 <param argument="--sample_idx" type="data_column" data_ref="mutation" label="Column in the mutation file with samples"/> | |
| 180 </when> | |
| 181 </conditional> | |
| 182 </inputs> | |
| 183 <outputs> | |
| 184 <data name="util_out" auto_format="true" from_work_dir="output/*" label="${tool.name} on ${on_string}: ${utils_conditional.util}"> | |
| 185 <filter>utils_conditional['util'] != "split_data"</filter> | |
| 186 </data> | |
| 187 <collection name="train_out" type="list" label="${tool.name} on ${on_string}: train datasets"> | |
| 188 <discover_datasets pattern="__name_and_ext__" format="csv" directory="output/train"/> | |
| 189 <filter>utils_conditional['util'] == "split_data"</filter> | |
| 190 </collection> | |
| 191 <collection name="test_out" type="list" label="${tool.name} on ${on_string}: test datasets"> | |
| 192 <discover_datasets pattern="__name_and_ext__" format="csv" directory="output/test"/> | |
| 193 <filter>utils_conditional['util'] == "split_data"</filter> | |
| 194 </collection> | |
| 195 </outputs> | |
| 196 <tests> | |
| 197 <!-- test 1: Louvain clustering --> | |
| 198 <test expect_num_outputs="1"> | |
| 199 <param name="non_commercial_use" value="True"/> | |
| 200 <conditional name="utils_conditional"> | |
| 201 <param name="util" value="louvain_clustering"/> | |
| 202 <param name="X" value="embeddings.csv"/> | |
| 203 <param name="labels" value="labels_pr.csv"/> | |
| 204 <param name="k" value="15"/> | |
| 205 </conditional> | |
| 206 <output name="util_out"> | |
| 207 <assert_contents> | |
| 208 <has_text text="sample_id,variable,class_label,probability,known_label,predicted_label,split,louvain_cluster"/> | |
| 209 <has_text text="MB-4818,CLAUDIN_SUBTYPE,LumA,0.8582904,LumB,LumA,test,3.0"/> | |
| 210 </assert_contents> | |
| 211 </output> | |
| 212 </test> | |
| 213 <!-- test 2: Get optimal clusters --> | |
| 214 <test expect_num_outputs="1"> | |
| 215 <param name="non_commercial_use" value="True"/> | |
| 216 <conditional name="utils_conditional"> | |
| 217 <param name="util" value="get_optimal_clusters"/> | |
| 218 <param name="X" value="embeddings.csv"/> | |
| 219 <param name="labels" value="labels_pr.csv"/> | |
| 220 <param name="min_k" value="2"/> | |
| 221 <param name="max_k" value="10"/> | |
| 222 </conditional> | |
| 223 <assert_stdout> | |
| 224 <has_text text="Optimal number of clusters: 2"/> | |
| 225 <has_text text="Silhouette scores: "/> | |
| 226 </assert_stdout> | |
| 227 <output name="util_out"> | |
| 228 <assert_contents> | |
| 229 <has_text text="sample_id,variable,class_label,probability,known_label,predicted_label,split,optimal_kmeans_cluster"/> | |
| 230 <has_text text="MB-4818,CLAUDIN_SUBTYPE,LumA,0.8582904,LumB,LumA,test,0.0"/> | |
| 231 </assert_contents> | |
| 232 </output> | |
| 233 </test> | |
| 234 <!-- test 3: K-Means clustering --> | |
| 235 <test expect_num_outputs="1"> | |
| 236 <param name="non_commercial_use" value="True"/> | |
| 237 <conditional name="utils_conditional"> | |
| 238 <param name="util" value="k_means_clustering"/> | |
| 239 <param name="X" value="embeddings.csv"/> | |
| 240 <param name="labels" value="labels_pr.csv"/> | |
| 241 <param name="k" value="2"/> | |
| 242 </conditional> | |
| 243 <assert_stdout> | |
| 244 <has_text text="KMeans(n_clusters=2, random_state=42)"/> | |
| 245 </assert_stdout> | |
| 246 <output name="util_out"> | |
| 247 <assert_contents> | |
| 248 <has_text text="sample_id,variable,class_label,probability,known_label,predicted_label,split,kmeans_cluster"/> | |
| 249 <has_text text="MB-4818,CLAUDIN_SUBTYPE,LumA,0.8582904,LumB,LumA,test,0.0"/> | |
| 250 </assert_contents> | |
| 251 </output> | |
| 252 </test> | |
| 253 <!-- test 4: Compute AMI and ARI --> | |
| 254 <test expect_num_outputs="1"> | |
| 255 <param name="non_commercial_use" value="True"/> | |
| 256 <conditional name="utils_conditional"> | |
| 257 <param name="util" value="compute_ami_ari"/> | |
| 258 <param name="labels" value="labels.csv"/> | |
| 259 <param name="true_label" value="5"/> | |
| 260 <param name="predicted_label" value="6"/> | |
| 261 </conditional> | |
| 262 <assert_stdout> | |
| 263 <has_text_matching expression="AMI: 0.5108[0-9]+"/> | |
| 264 <has_text_matching expression="ARI: 0.5258[0-9]+"/> | |
| 265 </assert_stdout> | |
| 266 <output name="util_out"> | |
| 267 <assert_contents> | |
| 268 <has_text_matching expression="AMI: 0.5108[0-9]+"/> | |
| 269 <has_text_matching expression="ARI: 0.5258[0-9]+"/> | |
| 270 </assert_contents> | |
| 271 </output> | |
| 272 </test> | |
| 273 <!-- test 5: Split data to train and test --> | |
| 274 <test expect_num_outputs="2"> | |
| 275 <param name="non_commercial_use" value="True"/> | |
| 276 <conditional name="utils_conditional"> | |
| 277 <param name="util" value="split_data"/> | |
| 278 <param name="clin" value="train/clin"/> | |
| 279 <param name="omics" value="train/cnv,train/gex"/> | |
| 280 <param name="split" value="0.7"/> | |
| 281 </conditional> | |
| 282 <output_collection name="train_out" type="list" count="3"> | |
| 283 <element name="clin"> | |
| 284 <assert_contents> | |
| 285 <has_text_matching expression="sample"/> | |
| 286 <has_n_lines n="645"/> | |
| 287 </assert_contents> | |
| 288 </element> | |
| 289 <element name="cnv"> | |
| 290 <assert_contents> | |
| 291 <has_text_matching expression="gene"/> | |
| 292 <has_n_lines n="25"/> | |
| 293 </assert_contents> | |
| 294 </element> | |
| 295 <element name="gex"> | |
| 296 <assert_contents> | |
| 297 <has_text_matching expression="gene"/> | |
| 298 <has_n_lines n="25"/> | |
| 299 </assert_contents> | |
| 300 </element> | |
| 301 </output_collection> | |
| 302 <output_collection name="test_out" type="list" count="3"> | |
| 303 <element name="clin"> | |
| 304 <assert_contents> | |
| 305 <has_text_matching expression="sample"/> | |
| 306 <has_n_lines n="277"/> | |
| 307 </assert_contents> | |
| 308 </element> | |
| 309 <element name="cnv"> | |
| 310 <assert_contents> | |
| 311 <has_text_matching expression="gene"/> | |
| 312 <has_n_lines n="25"/> | |
| 313 </assert_contents> | |
| 314 </element> | |
| 315 <element name="gex"> | |
| 316 <assert_contents> | |
| 317 <has_text_matching expression="gene"/> | |
| 318 <has_n_lines n="25"/> | |
| 319 </assert_contents> | |
| 320 </element> | |
| 321 </output_collection> | |
| 322 </test> | |
| 323 <!-- test 6: Binarize mutation data --> | |
| 324 <test expect_num_outputs="1"> | |
| 325 <param name="non_commercial_use" value="True"/> | |
| 326 <conditional name="utils_conditional"> | |
| 327 <param name="util" value="binarize"/> | |
| 328 <param name="mutation" value="mut.tabular"/> | |
| 329 <param name="gene_idx" value="1"/> | |
| 330 <param name="sample_idx" value="17"/> | |
| 331 </conditional> | |
| 332 <output name="util_out"> | |
| 333 <assert_contents> | |
| 334 <has_n_lines n="1611"/> | |
| 335 <has_text text="Hugo_Symbol"/> | |
| 336 <has_text text="AADACL2,0.0,0.0"/> | |
| 337 <has_text text="ABCB1,0.0,0.0,0.0,1.0"/> | |
| 338 </assert_contents> | |
| 339 </output> | |
| 340 </test> | |
| 341 </tests> | |
| 342 <help><![CDATA[ | |
| 343 @COMMON_HELP@ | |
| 344 | |
| 345 Flexynesis Utils provides a collection of clustering, evaluation, and data-preparation utilities for multi-omics data analysis. This tool offers six main functionalities: | |
| 346 | |
| 347 1. **Louvain Clustering**: Community detection clustering algorithm that partitions data into clusters based on network modularity optimization | |
| 348 2. **Get Optimal Clusters**: Determines the optimal number of clusters using K-means clustering with silhouette score evaluation | |
| 349 3. **K-Means Clustering**: Standard K-means clustering algorithm for partitioning data into k clusters | |
| 350 4. **Compute AMI and ARI**: Calculates Adjusted Mutual Information (AMI) and Adjusted Rand Index (ARI) to evaluate clustering performance | |
| 351 5. **Split data to train and test**: Splits multi-omics data into training and testing sets based on a specified ratio | |
| 352 6. **Binarize mutation data**: Converts mutation data into a binary format, indicating presence or absence of mutations for each gene in each sample | |
| 353 | |
| 354 **Louvain Clustering** | |
| 355 Uses the Louvain algorithm for community detection in networks. This method builds a graph from the input data and finds communities (clusters) by optimizing modularity. | |
| 356 | |
| 357 Outputs: the original labels file with an added 'louvain_cluster' column. | |
| 358 | |
| 359 **Get Optimal Clusters** | |
| 360 Performs K-means clustering for a range of k values and determines the optimal number of clusters using silhouette analysis. | |
| 361 | |
| 362 Outputs: the original labels file with an added 'optimal_kmeans_cluster' column. | |
| 363 Console output shows the optimal k value and silhouette scores for each k. | |
| 364 | |
| 365 **K-Means Clustering** | |
| 366 Standard K-means clustering algorithm that partitions data into k clusters by minimizing within-cluster sum of squares. | |
| 367 | |
| 368 Outputs: the original labels file with an added 'kmeans_cluster' column. | |
| 369 | |
| 370 **Compute AMI and ARI** | |
| 371 Evaluates clustering performance by comparing true labels with predicted labels using two standard metrics: | |
| 372 - *AMI (Adjusted Mutual Information)*: Measures mutual information between clusterings, adjusted for chance | |
| 373 - *ARI (Adjusted Rand Index)*: Measures similarity between clusterings, adjusted for chance | |
| 374 | |
| 375 Outputs: a text file containing the AMI and ARI scores. | |
| 376 Both metrics reach 1 for perfect agreement and are close to 0 for random labelings; because they are adjusted for chance, they can also be negative. | |
| 377 | |
| 378 **Search cBioPortal** | |
| 379 Fetches available data files from cBioPortal for a specified study ID. This allows users to explore and retrieve data from cBioPortal studies. | |
| 380 Outputs a text file listing available data files for the specified study ID | |
| 381 | |
| 382 **Split data to train and test** | |
| 383 Splits multi-omics data into training and testing sets based on a specified ratio. This is useful for preparing datasets for machine learning tasks. | |
| 384 You can use `Flexynesis cBioPortal import` to fetch data from cBioPortal and then use this tool to split the data into training and testing sets. | |
| 385 | |
| 386 **Binarize mutation data** | |
| 387 Converts mutation data into a binary format, indicating presence or absence of mutations for each gene in each sample. This is useful for preparing mutation data for analysis. | |
| 388 The output will be a comma-separated table with genes as rows and samples as columns, where each cell indicates whether a mutation is present (1) or absent (0). | |
| 389 | |
| 390 | |
| 391 .. _Documentation: https://bimsbstatic.mdc-berlin.de/akalin/buyar/flexynesis/site/ | |
| 392 .. _copyright holders: https://github.com/BIMSBbioinfo/flexynesis | |
| 393 ]]></help> | |
| 394 <expand macro="creator"/> | |
| 395 <expand macro="citations"/> | |
| 396 </tool> |
