Mercurial > repos > bgruening > flexynesis_plot
comparison macros.xml @ 0:1b0e4bd4815b draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit b2463fb68d0ae54864d87718ee72f5e063aa4587
| author | bgruening |
|---|---|
| date | Tue, 24 Jun 2025 05:56:44 +0000 |
| parents | |
| children | b768c6025ba1 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1b0e4bd4815b |
|---|---|
| 1 <macros> | |
| 2 <token name="@TOOL_VERSION@">0.2.20</token> | |
| 3 <token name="@VERSION_SUFFIX@">0</token> | |
| 4 <token name="@PROFILE@">24.1</token> | |
| 5 <xml name="requirements"> | |
| 6 <requirements> | |
| 7 <requirement type="package" version="@TOOL_VERSION@">flexynesis</requirement> | |
| 8 <yield/> | |
| 9 </requirements> | |
| 10 </xml> | |
| 11 <xml name="edam"> | |
| 12 <edam_topics> | |
| 13 <edam_topic>topic_0622</edam_topic> | |
| 14 <edam_topic>topic_3474</edam_topic> | |
| 15 <edam_topic>topic_2640</edam_topic> | |
| 16 </edam_topics> | |
| 17 <edam_operations> | |
| 18 <edam_operation>operation_3197</edam_operation> | |
| 19 <edam_operation>operation_2403</edam_operation> | |
| 20 <edam_operation>operation_2426</edam_operation> | |
| 21 </edam_operations> | |
| 22 </xml> | |
| 23 <xml name="sanitizer_printable"> | |
| 24 <sanitizer invalid_char=""> | |
| 25 <valid initial="string.printable"> | |
| 26 <remove value="'"/> | |
| 27 <remove value='"'/> | |
| 28 <remove value=" "/> | |
| 29 <yield/> | |
| 30 </valid> | |
| 31 </sanitizer> | |
| 32 </xml> | |
| 33 <xml name="sanitizer_letters"> | |
| 34 <sanitizer invalid_char=" "> | |
| 35 <valid initial="string.letters"> | |
| 36 <add value="_"/> | |
| 37 </valid> | |
| 38 </sanitizer> | |
| 39 </xml> | |
| 40 <token name="@CHECK_NON_COMMERCIAL_USE@"><![CDATA[ | |
| 41 #if not $non_commercial_use | |
| 42 >&2 echo "this tool is only available for non commercial use"; | |
| 43 exit 1; | |
| 44 #end if | |
| 45 ]]></token> | |
| 46 <xml name="commercial_use_param"> | |
| 47 <param name="non_commercial_use" label="I certify that I am not using this tool for commercial purposes." type="boolean" truevalue="NON_COMMERCIAL_USE" falsevalue="COMMERCIAL_USE" checked="False"> | |
| 48 <validator type="expression" message="This tool is only available for non-commercial use.">value == True</validator> | |
| 49 </param> | |
| 50 </xml> | |
| 51 <xml name="main_inputs"> | |
| 52 <param name="train_clin" type="data" format="csv" label="Training clinical data"/> | |
| 53 <param name="test_clin" type="data" format="csv" label="Test clinical data"/> | |
| 54 <param name="train_omics_main" type="data" format="csv" label="Training omics data"/> | |
| 55 <param name="test_omics_main" type="data" format="csv" label="Test omics data"/> | |
| 56 <param name="assay_main" type="text" optional="true" label="What type of assay is your input?" help="This will be used as the output name."> | |
| 57 <expand macro="sanitizer_letters"/> | |
| 58 </param> | |
| 59 </xml> | |
| 60 <xml name="extra_inputs"> | |
| 61 <param name="train_omics" type="data" optional="true" format="csv" label="Training omics data"/> | |
| 62 <param name="test_omics" type="data" optional="true" format="csv" label="Test omics data"/> | |
| 63 <param name="assay" type="text" optional="true" label="What type of assay is your input?" help="This will be used as the output name."> | |
| 64 <expand macro="sanitizer_letters"/> | |
| 65 </param> | |
| 66 </xml> | |
| 67 <xml name="advanced"> | |
| 68 <section name="advanced" title="Advanced Options"> | |
| 69 <param argument="--fusion_type" type="select" label="Fusion method" help="How to fuse the omics layers?"> | |
| 70 <option value="intermediate">intermediate</option> | |
| 71 <option value="early">early</option> | |
| 72 </param> | |
| 73 <param argument="--finetuning_samples" type="integer" min="0" value="0" label="Number of samples from the test dataset to use for fine-tuning the model." help="Set to 0 to disable fine-tuning."/> | |
| 74 <param argument="--variance_threshold" type="float" min="0" max="100" value="1" label="Variance threshold (as percentile) to drop low variance features." help="Set to 0 for no variance filtering."/> | |
| 75 <param argument="--correlation_threshold" type="float" min="0" max="1" value="0.8" label="Correlation threshold to drop highly redundant features." help="Set to 1 for no redundancy filtering."/> | |
| 76 <param argument="--subsample" type="integer" min="0" value="0" label="Downsample training set to randomly drawn N samples for training."/> | |
| 77 <param argument="--features_min" type="integer" min="0" value="500" label="Minimum number of features to retain after feature selection."/> | |
| 78 <param argument="--features_top_percentile" type="float" min="0" max="100" value="20" label="Top percentile features (among the features remaining after variance filtering and data cleanup) to retain after feature selection."/> | |
| 79 <param argument="--log_transform" type="boolean" truevalue="--log_transform True" falsevalue="" checked="false" label="Whether to apply log-transformation to input data matrices"/> | |
| 80 <param argument="--early_stop_patience" type="integer" min="-1" value="10" label="How many epochs to wait when no improvements in validation loss are observed." help="Set to -1 to disable early stopping."/> | |
| 81 <param argument="--hpo_iter" type="integer" min="1" value="100" label="Number of iterations for hyperparameter optimization."/> | |
| 82 <param argument="--val_size" type="float" min="0.0" max="1" value="0.2" label="Proportion of training data to be used as validation split"/> | |
| 83 <param argument="--hpo_patience" type="integer" min="0" value="10" label="How many hyperparameter optimization iterations to wait for when no improvements are observed." help="Set to 0 to disable early stopping."/> | |
| 84 <param argument="--use_cv" type="boolean" truevalue="--use_cv" falsevalue="" checked="false" label="Cross validation" help="If set, a 5-fold cross-validation training will be done. Otherwise, a single training on 80 percent of the dataset is done. "/> | |
| 85 <param argument="--use_loss_weighting" type="boolean" truevalue="--use_loss_weighting True" falsevalue="" checked="true" label="Whether to apply loss-balancing using uncertainty weights method."/> | |
| 86 <param argument="--evaluate_baseline_performance" type="boolean" truevalue="--evaluate_baseline_performance" falsevalue="" checked="false" label="Enable modeling also with Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset."/> | |
| 87 <param argument="--feature_importance_method" type="select" label="Which method(s) to use to compute feature importance scores."> | |
| 88 <option value="Both" selected="true">Both</option> | |
| 89 <option value="IntegratedGradients">IntegratedGradients</option> | |
| 90 <option value="GradientShap">GradientShap</option> | |
| 91 </param> | |
| 92 </section> | |
| 93 </xml> | |
| 94 <xml name="plots_common_param"> | |
| 95 <yield/> | |
| 96 <param name="format" type="select" label="Output format"> | |
| 97 <option value="jpg" selected="true">jpg</option> | |
| 98 <option value="png">png</option> | |
| 99 <option value="pdf">pdf</option> | |
| 100 <option value="svg">svg</option> | |
| 101 </param> | |
| 102 <param name="dpi" type="integer" min="0" max="1200" value="300" label="DPI"/> | |
| 103 </xml> | |
| 104 <xml name="plots_common_input"> | |
| 105 <yield/> | |
| 106 <param argument="--labels" type="data" format="tabular,csv" label="Predicted labels" help="Generated by flexynesis"/> | |
| 107 </xml> | |
| 108 <token name="@PLOT_COMMON_CONFIG@"><![CDATA[ | |
| 109 label_data = load_labels('inputs/$plot_conditional.labels.element_identifier.$plot_conditional.labels.ext') | |
| 110 ]]></token> | |
| 111 <token name="@PR_ROC_BOX_CONFIG@"><![CDATA[ | |
| 112 @PLOT_COMMON_CONFIG@ | |
| 113 | |
| 114 # Check if this is a regression problem (no class probabilities) | |
| 115 non_na_probs = label_data['probability'].notna().sum() | |
| 116 | |
| 117 print(f" Non-NaN probabilities: {non_na_probs}/{len(label_data)}") | |
| 118 | |
| 119 # If most probabilities are NaN, this is likely a regression problem | |
| 120 if non_na_probs < len(label_data) * 0.1: # Less than 10% valid probabilities | |
| 121 raise ValueError(" Detected regression problem - precision-recall curves not applicable") | |
| 122 | |
| 123 # Debug: Check data quality | |
| 124 total_rows = len(label_data) | |
| 125 missing_labels = label_data['known_label'].isna().sum() | |
| 126 missing_probs = label_data['probability'].isna().sum() | |
| 127 unique_samples = label_data['sample_id'].nunique() | |
| 128 unique_classes = label_data['class_label'].nunique() | |
| 129 | |
| 130 print(f" Data summary: {total_rows} total rows, {unique_samples} unique samples, {unique_classes} unique classes") | |
| 131 print(f" Missing data: {missing_labels} missing known_label, {missing_probs} missing probability") | |
| 132 | |
| 133 if missing_labels > 0: | |
| 134 print(f" Warning: Found {missing_labels} missing known_label values") | |
| 135 missing_samples = label_data[label_data['known_label'].isna()]['sample_id'].unique()[:5] | |
| 136 print(f" Sample IDs with missing known_label: {list(missing_samples)}") | |
| 137 | |
| 138 # Remove rows with missing known_label | |
| 139 label_data = label_data.dropna(subset=['known_label']) | |
| 140 if label_data.empty: | |
| 141 raise ValueError("Error: No valid known_label data remaining") | |
| 142 | |
| 143 ]]></token> | |
| 144 <token name="@PR_ROC_CONFIG@"><![CDATA[ | |
| 145 @PR_ROC_BOX_CONFIG@ | |
| 146 | |
| 147 # 1. Pivot to wide format | |
| 148 prob_df = label_data.pivot(index='sample_id', columns='class_label', values='probability') | |
| 149 | |
| 150 print(f" After pivot: {prob_df.shape[0]} samples x {prob_df.shape[1]} classes") | |
| 151 print(f" Class columns: {list(prob_df.columns)}") | |
| 152 | |
| 153 # Check for NaN values in probability data | |
| 154 nan_counts = prob_df.isna().sum() | |
| 155 if nan_counts.any(): | |
| 156 print(f" NaN counts per class: {dict(nan_counts)}") | |
| 157 print(f" Samples with any NaN: {prob_df.isna().any(axis=1).sum()}/{len(prob_df)}") | |
| 158 | |
| 159 # Drop only rows where ALL probabilities are NaN | |
| 160 all_nan_rows = prob_df.isna().all(axis=1) | |
| 161 if all_nan_rows.any(): | |
| 162 print(f" Dropping {all_nan_rows.sum()} samples with all NaN probabilities") | |
| 163 prob_df = prob_df[~all_nan_rows] | |
| 164 | |
| 165 remaining_nans = prob_df.isna().sum().sum() | |
| 166 if remaining_nans > 0: | |
| 167 print(f" Warning: {remaining_nans} individual NaN values remain - filling with 0") | |
| 168 prob_df = prob_df.fillna(0) | |
| 169 | |
| 170 if prob_df.empty: | |
| 171 raise ValueError(f" Error: No valid probability data remaining") | |
| 172 | |
| 173 # 2. Get true labels | |
| 174 true_labels_df = label_data.drop_duplicates('sample_id')[['sample_id', 'known_label']].set_index('sample_id') | |
| 175 | |
| 176 # 3. Align indices - only keep samples that exist in both datasets | |
| 177 common_indices = prob_df.index.intersection(true_labels_df.index) | |
| 178 if len(common_indices) == 0: | |
| 179 raise ValueError(f" Error: No common sample_ids between probability and true label data") | |
| 180 | |
| 181 print(f" Found {len(common_indices)} samples with both probability and true label data") | |
| 182 | |
| 183 # Filter both datasets to common indices | |
| 184 prob_df_aligned = prob_df.loc[common_indices] | |
| 185 y_true = true_labels_df.loc[common_indices]['known_label'] | |
| 186 | |
| 187 # 4. Final check for NaN values | |
| 188 if y_true.isna().any(): | |
| 189 raise ValueError(f" Error: True labels still contain NaN after alignment") | |
| 190 | |
| 191 if prob_df_aligned.isna().any().any(): | |
| 192 raise ValueError(f" Error: Probability data still contains NaN after alignment") | |
| 193 | |
| 194 # 5. Convert categorical labels to integer labels | |
| 195 # Create a mapping from class names to integers | |
| 196 class_names = list(prob_df_aligned.columns) | |
| 197 class_to_int = {class_name: i for i, class_name in enumerate(class_names)} | |
| 198 | |
| 199 print(f" Class mapping: {class_to_int}") | |
| 200 | |
| 201 # Convert true labels to integers | |
| 202 y_true_np = y_true.map(class_to_int).to_numpy() | |
| 203 y_probs_np = prob_df_aligned.to_numpy() | |
| 204 | |
| 205 print(f" Data shape: y_true={y_true_np.shape}, y_probs={y_probs_np.shape}") | |
| 206 print(f" Unique true labels (integers): {set(y_true_np)}") | |
| 207 print(f" Class labels (columns): {class_names}") | |
| 208 print(f" Label distribution: {dict(zip(*np.unique(y_true_np, return_counts=True)))}") | |
| 209 | |
| 210 # Check for any unmapped labels (will be NaN) | |
| 211 if pd.isna(y_true_np).any(): | |
| 212 raise ValueError(" Error: Some true labels could not be mapped to class columns") | |
| 213 | |
| 214 ]]></token> | |
| 215 <xml name="common_test"> | |
| 216 <param name="non_commercial_use" value="True"/> | |
| 217 <conditional name="training_type"> | |
| 218 <param name="model" value="s_train"/> | |
| 219 <param name="train_clin" value="train/clin" ftype="csv"/> | |
| 220 <param name="test_clin" value="test/clin" ftype="csv"/> | |
| 221 <param name="train_omics_main" value="train/gex" ftype="csv"/> | |
| 222 <param name="test_omics_main" value="test/gex" ftype="csv"/> | |
| 223 <param name="assay_main" value="bar"/> | |
| 224 <repeat name="omics"> | |
| 225 <param name="train_omics" value="train/cnv" ftype="csv"/> | |
| 226 <param name="test_omics" value="test/cnv" ftype="csv"/> | |
| 227 <param name="assay" value="foo"/> | |
| 228 </repeat> | |
| 229 <conditional name="model_class"> | |
| 230 <param name="model_class" value="DirectPred"/> | |
| 231 </conditional> | |
| 232 <param name="target_variables" value="Erlotinib"/> | |
| 233 <param name="surv_event_var" value="OS_STATUS"/> | |
| 234 <param name="surv_time_var" value="OS_MONTHS"/> | |
| 235 <section name="advanced"> | |
| 236 <param name="hpo_iter" value="1"/> | |
| 237 </section> | |
| 238 </conditional> | |
| 239 <yield/> | |
| 240 <output_collection name="results" type="list"> | |
| 241 <element name="job.embeddings_test"> | |
| 242 <assert_contents> | |
| 243 <has_n_lines n="50"/> | |
| 244 </assert_contents> | |
| 245 </element> | |
| 246 <element name="job.embeddings_train"> | |
| 247 <assert_contents> | |
| 248 <has_n_lines n="50"/> | |
| 249 </assert_contents> | |
| 250 </element> | |
| 251 <element name="job.feature_importance.GradientShap"> | |
| 252 <assert_contents> | |
| 253 <has_text_matching expression="Erlotinib,0,,bar,A2M,"/> | |
| 254 <has_text_matching expression="Erlotinib,0,,bar,ABCC4,"/> | |
| 255 <has_text_matching expression="GradientShap"/> | |
| 256 </assert_contents> | |
| 257 </element> | |
| 258 <element name="job.feature_importance.IntegratedGradients"> | |
| 259 <assert_contents> | |
| 260 <has_text_matching expression="Erlotinib,0,,bar,A2M,"/> | |
| 261 <has_text_matching expression="Erlotinib,0,,bar,ABCC4,"/> | |
| 262 <has_text_matching expression="IntegratedGradients"/> | |
| 263 </assert_contents> | |
| 264 </element> | |
| 265 <element name="job.feature_logs.bar"> | |
| 266 <assert_contents> | |
| 267 <has_n_lines n="25"/> | |
| 268 </assert_contents> | |
| 269 </element> | |
| 270 <element name="job.feature_logs.omics_foo"> | |
| 271 <assert_contents> | |
| 272 <has_n_lines n="25"/> | |
| 273 </assert_contents> | |
| 274 </element> | |
| 275 <element name="job.predicted_labels"> | |
| 276 <assert_contents> | |
| 277 <has_text_matching expression="source_dataset:A-704,Erlotinib,"/> | |
| 278 <has_text_matching expression="target_dataset:KMRC-20,Erlotinib,"/> | |
| 279 </assert_contents> | |
| 280 </element> | |
| 281 <element name="job.stats"> | |
| 282 <assert_contents> | |
| 283 <has_text_matching expression="DirectPred,Erlotinib,numerical,mse,"/> | |
| 284 <has_text_matching expression="DirectPred,Erlotinib,numerical,r2,"/> | |
| 285 <has_text_matching expression="DirectPred,Erlotinib,numerical,pearson_corr,"/> | |
| 286 </assert_contents> | |
| 287 </element> | |
| 288 </output_collection> | |
| 289 </xml> | |
| 290 <token name="@COMMON_HELP@"> | |
| 291 | |
| 292 .. class:: warningmark | |
| 293 | |
| 294 **WARNING: This tool is only available for NON-COMMERCIAL use. Permission is only granted for academic, research, and educational purposes. Before using, be sure to review, agree, and comply with the license.** | |
| 295 | |
| 296 Flexynesis is a deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction. | |
| 297 The package includes multiple types of deep learning architectures such as simple fully connected networks, supervised variational autoencoders, graph convolutional networks, multi-triplet networks with different options of data layer fusion, and automates feature selection and hyperparameter optimization. | |
| 298 | |
| 299 For more information, please check the Documentation_ : | |
| 300 | |
| 301 For commercial use, please review the flexynesis license on GitHub and contact the `copyright holders`_ . | |
| 302 | |
| 303 ----- | |
| 304 | |
| 305 </token> | |
| 306 <xml name="creator"> | |
| 307 <creator> | |
| 308 <organization name="European Galaxy Team" url="https://galaxyproject.org/eu/"/> | |
| 309 <person givenName="Amirhossein" familyName="Naghsh Nilchi" email="nilchia@informatik.uni-freiburg.de"/> | |
| 310 <yield/> | |
| 311 <person givenName="Björn" familyName="Grüning" email="gruening@informatik.uni-freiburg.de"/> | |
| 312 </creator> | |
| 313 </xml> | |
| 314 <xml name="citations"> | |
| 315 <citations> | |
| 316 <citation type="doi">10.1101/2024.07.16.603606</citation> | |
| 317 </citations> | |
| 318 </xml> | |
| 319 </macros> |
