comparison macros.xml @ 0:1b0e4bd4815b draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit b2463fb68d0ae54864d87718ee72f5e063aa4587
author bgruening
date Tue, 24 Jun 2025 05:56:44 +0000
parents
children b768c6025ba1
comparison
equal deleted inserted replaced
-1:000000000000 0:1b0e4bd4815b
1 <macros>
<!-- Version tokens: @TOOL_VERSION@ pins the flexynesis package release, @VERSION_SUFFIX@ is the wrapper-only revision counter, @PROFILE@ is the minimum Galaxy profile the tools target. -->
2 <token name="@TOOL_VERSION@">0.2.20</token>
3 <token name="@VERSION_SUFFIX@">0</token>
4 <token name="@PROFILE@">24.1</token>
<!-- Shared <requirements> block: every tool gets the flexynesis package at @TOOL_VERSION@; <yield/> lets an including tool append extra requirements. -->
5 <xml name="requirements">
6 <requirements>
7 <requirement type="package" version="@TOOL_VERSION@">flexynesis</requirement>
8 <yield/>
9 </requirements>
10 </xml>
<!-- EDAM ontology annotations shared by the tools (topic_3474 = machine learning, topic_0622 = genomics; NOTE(review): confirm the remaining topic/operation IDs against the EDAM browser). -->
11 <xml name="edam">
12 <edam_topics>
13 <edam_topic>topic_0622</edam_topic>
14 <edam_topic>topic_3474</edam_topic>
15 <edam_topic>topic_2640</edam_topic>
16 </edam_topics>
17 <edam_operations>
18 <edam_operation>operation_3197</edam_operation>
19 <edam_operation>operation_2403</edam_operation>
20 <edam_operation>operation_2426</edam_operation>
21 </edam_operations>
22 </xml>
<!-- Sanitizer accepting all printable characters except single/double quotes and spaces; invalid characters are dropped (invalid_char=""). <yield/> lets callers remove further characters. -->
23 <xml name="sanitizer_printable">
24 <sanitizer invalid_char="">
25 <valid initial="string.printable">
26 <remove value="'"/>
27 <remove value='"'/>
28 <remove value=" "/>
29 <yield/>
30 </valid>
31 </sanitizer>
32 </xml>
<!-- Sanitizer accepting only letters plus underscore; any other character is replaced with a space (invalid_char=" "). Used for assay names that become output identifiers. -->
33 <xml name="sanitizer_letters">
34 <sanitizer invalid_char=" ">
35 <valid initial="string.letters">
36 <add value="_"/>
37 </valid>
38 </sanitizer>
39 </xml>
<!-- Cheetah/shell fragment prepended to the command block: aborts the job (exit 1, message on stderr) unless the user ticked the non_commercial_use checkbox. -->
40 <token name="@CHECK_NON_COMMERCIAL_USE@"><![CDATA[
41 #if not $non_commercial_use
42 >&2 echo "this tool is only available for non commercial use";
43 exit 1;
44 #end if
45 ]]></token>
<!-- License gate checkbox; the expression validator rejects the form unless it is checked, and @CHECK_NON_COMMERCIAL_USE@ re-checks it at runtime. -->
46 <xml name="commercial_use_param">
47 <param name="non_commercial_use" label="I certify that I am not using this tool for commercial purposes." type="boolean" truevalue="NON_COMMERCIAL_USE" falsevalue="COMMERCIAL_USE" checked="False">
48 <validator type="expression" message="This tool is only available for non-commercial use.">value == True</validator>
49 </param>
50 </xml>
<!-- Primary inputs: train/test clinical CSVs plus the first omics layer; assay_main names that layer in the outputs (letters/underscore only via sanitizer_letters). -->
51 <xml name="main_inputs">
52 <param name="train_clin" type="data" format="csv" label="Training clinical data"/>
53 <param name="test_clin" type="data" format="csv" label="Test clinical data"/>
54 <param name="train_omics_main" type="data" format="csv" label="Training omics data"/>
55 <param name="test_omics_main" type="data" format="csv" label="Test omics data"/>
56 <param name="assay_main" type="text" optional="true" label="What type of assay is your input?" help="This would be used as output name.">
57 <expand macro="sanitizer_letters"/>
58 </param>
59 </xml>
<!-- Optional additional omics layers (used inside a repeat, see common_test); mirrors main_inputs but every field is optional. -->
60 <xml name="extra_inputs">
61 <param name="train_omics" type="data" optional="true" format="csv" label="Training omics data"/>
62 <param name="test_omics" type="data" optional="true" format="csv" label="Test omics data"/>
63 <param name="assay" type="text" optional="true" label="What type of assay is your input?" help="This would be used as output name." >
64 <expand macro="sanitizer_letters"/>
65 </param>
66 </xml>
<!-- Advanced options section: feature filtering (variance/correlation thresholds, min features, top percentile), training controls (fine-tuning samples, early stopping, HPO iterations/patience, validation split, cross-validation), fusion method, loss weighting, baseline comparison and feature-importance method. Each argument="--x" maps 1:1 to a flexynesis CLI flag. -->
67 <xml name="advanced">
68 <section name="advanced" title="Advanced Options">
69 <param argument="--fusion_type" type="select" label="Fusion method" help="How to fuse the omics layers?">
70 <option value="intermediate">intermediate</option>
71 <option value="early">early</option>
72 </param>
73 <param argument="--finetuning_samples" type="integer" min="0" value="0" label="Number of samples from the test dataset to use for fine-tuning the model." help="Set to 0 to disable fine-tuning."/>
74 <param argument="--variance_threshold" type="float" min="0" max="100" value="1" label="Variance threshold (as percentile) to drop low variance features." help="Set to 0 for no variance filtering."/>
75 <param argument="--correlation_threshold" type="float" min="0" max="1" value="0.8" label="Correlation threshold to drop highly redundant features." help="Set to 1 for no redundancy filtering."/>
76 <param argument="--subsample" type="integer" min="0" value="0" label="Downsample training set to randomly drawn N samples for training."/>
77 <param argument="--features_min" type="integer" min="0" value="500" label="Minimum number of features to retain after feature selection."/>
78 <param argument="--features_top_percentile" type="float" min="0" max="100" value="20" label="Top percentile features (among the features remaining after variance filtering and data cleanup) to retain after feature selection."/>
79 <param argument="--log_transform" type="boolean" truevalue="--log_transform True" falsevalue="" checked="false" label="Whether to apply log-transformation to input data matrices"/>
80 <param argument="--early_stop_patience" type="integer" min="-1" value="10" label="How many epochs to wait when no improvements in validation loss are observed." help="Set to -1 to disable early stopping."/>
81 <param argument="--hpo_iter" type="integer" min="1" value="100" label="Number of iterations for hyperparameter optimization."/>
82 <param argument="--val_size" type="float" min="0.0" max="1" value="0.2" label="Proportion of training data to be used as validation split"/>
83 <param argument="--hpo_patience" type="integer" min="0" value="10" label="How many hyperparameter optimization iterations to wait for when no improvements are observed." help="Set to 0 to disable early stopping."/>
84 <param argument="--use_cv" type="boolean" truevalue="--use_cv" falsevalue="" checked="false" label="Cross validation" help="If set, a 5-fold cross-validation training will be done. Otherwise, a single training on 80 percent of the dataset is done. "/>
85 <param argument="--use_loss_weighting" type="boolean" truevalue="--use_loss_weighting True" falsevalue="" checked="true" label="Whether to apply loss-balancing using uncertainty weights method."/>
86 <param argument="--evaluate_baseline_performance" type="boolean" truevalue="--evaluate_baseline_performance" falsevalue="" checked="false" label="Enable modeling also with Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset."/>
87 <param argument="--feature_importance_method" type="select" label="which method(s) to use to compute feature importance scores.">
88 <option value="Both" selected="true">Both</option>
89 <option value="IntegratedGradients">IntegratedGradients</option>
90 <option value="GradientShap">GradientShap</option>
91 </param>
92 </section>
93 </xml>
<!-- Shared plotting options (output format + DPI); <yield/> first so tool-specific parameters appear above these in the form. -->
94 <xml name="plots_common_param">
95 <yield/>
96 <param name="format" type="select" label="Output format">
97 <option value="jpg" selected="true">jpg</option>
98 <option value="png">png</option>
99 <option value="pdf">pdf</option>
100 <option value="svg">svg</option>
101 </param>
102 <param name="dpi" type="integer" min="0" max="1200" value="300" label="DPI"/>
103 </xml>
<!-- Shared plotting input: the predicted-labels table produced by a previous flexynesis run. -->
104 <xml name="plots_common_input">
105 <yield/>
106 <param argument="--labels" type="data" format="tabular,csv" label="Predicted labels" help="Generated by flexynesis"/>
107 </xml>
<!-- Python/Cheetah fragment: loads the predicted-labels dataset (staged under inputs/ by element identifier and extension) into label_data for the plotting tokens below. -->
108 <token name="@PLOT_COMMON_CONFIG@"><![CDATA[
109 label_data = load_labels('inputs/$plot_conditional.labels.element_identifier.$plot_conditional.labels.ext')
110 ]]></token>
<!-- Shared validation fragment for PR/ROC/box plots: bails out (ValueError) when <10% of rows carry class probabilities (regression output), reports data-quality stats, and drops rows with missing known_label. Leaves a cleaned label_data for @PR_ROC_CONFIG@. -->
111 <token name="@PR_ROC_BOX_CONFIG@"><![CDATA[
112 @PLOT_COMMON_CONFIG@
113
114 # Check if this is a regression problem (no class probabilities)
115 non_na_probs = label_data['probability'].notna().sum()
116
117 print(f" Non-NaN probabilities: {non_na_probs}/{len(label_data)}")
118
119 # If most probabilities are NaN, this is likely a regression problem
120 if non_na_probs < len(label_data) * 0.1: # Less than 10% valid probabilities
121 raise ValueError(" Detected regression problem - precision-recall curves not applicable")
122
123 # Debug: Check data quality
124 total_rows = len(label_data)
125 missing_labels = label_data['known_label'].isna().sum()
126 missing_probs = label_data['probability'].isna().sum()
127 unique_samples = label_data['sample_id'].nunique()
128 unique_classes = label_data['class_label'].nunique()
129
130 print(f" Data summary: {total_rows} total rows, {unique_samples} unique samples, {unique_classes} unique classes")
131 print(f" Missing data: {missing_labels} missing known_label, {missing_probs} missing probability")
132
133 if missing_labels > 0:
134 print(f" Warning: Found {missing_labels} missing known_label values")
135 missing_samples = label_data[label_data['known_label'].isna()]['sample_id'].unique()[:5]
136 print(f" Sample IDs with missing known_label: {list(missing_samples)}")
137
138 # Remove rows with missing known_label
139 label_data = label_data.dropna(subset=['known_label'])
140 if label_data.empty:
141 raise ValueError("Error: No valid known_label data remaining")
142
143 ]]></token>
<!-- Fragment for PR/ROC plots: builds on @PR_ROC_BOX_CONFIG@, pivots label_data to a samples-x-classes probability matrix, aligns it with the true labels, and produces y_true_np / y_probs_np integer-label arrays. Raises ValueError on empty or unalignable data. Fixed: truncated error message ("... remaining for") and f-prefixes on strings without placeholders. -->
144 <token name="@PR_ROC_CONFIG@"><![CDATA[
145 @PR_ROC_BOX_CONFIG@
146
147 # 1. Pivot to wide format
148 prob_df = label_data.pivot(index='sample_id', columns='class_label', values='probability')
149
150 print(f" After pivot: {prob_df.shape[0]} samples x {prob_df.shape[1]} classes")
151 print(f" Class columns: {list(prob_df.columns)}")
152
153 # Check for NaN values in probability data
154 nan_counts = prob_df.isna().sum()
155 if nan_counts.any():
156 print(f" NaN counts per class: {dict(nan_counts)}")
157 print(f" Samples with any NaN: {prob_df.isna().any(axis=1).sum()}/{len(prob_df)}")
158
159 # Drop only rows where ALL probabilities are NaN
160 all_nan_rows = prob_df.isna().all(axis=1)
161 if all_nan_rows.any():
162 print(f" Dropping {all_nan_rows.sum()} samples with all NaN probabilities")
163 prob_df = prob_df[~all_nan_rows]
164
165 remaining_nans = prob_df.isna().sum().sum()
166 if remaining_nans > 0:
167 print(f" Warning: {remaining_nans} individual NaN values remain - filling with 0")
168 prob_df = prob_df.fillna(0)
169
170 if prob_df.empty:
171 raise ValueError(" Error: No valid probability data remaining")
172
173 # 2. Get true labels
174 true_labels_df = label_data.drop_duplicates('sample_id')[['sample_id', 'known_label']].set_index('sample_id')
175
176 # 3. Align indices - only keep samples that exist in both datasets
177 common_indices = prob_df.index.intersection(true_labels_df.index)
178 if len(common_indices) == 0:
179 raise ValueError(" Error: No common sample_ids between probability and true label data")
180
181 print(f" Found {len(common_indices)} samples with both probability and true label data")
182
183 # Filter both datasets to common indices
184 prob_df_aligned = prob_df.loc[common_indices]
185 y_true = true_labels_df.loc[common_indices]['known_label']
186
187 # 4. Final check for NaN values
188 if y_true.isna().any():
189 raise ValueError(" Error: True labels still contain NaN after alignment")
190
191 if prob_df_aligned.isna().any().any():
192 raise ValueError(" Error: Probability data still contains NaN after alignment")
193
194 # 5. Convert categorical labels to integer labels
195 # Create a mapping from class names to integers
196 class_names = list(prob_df_aligned.columns)
197 class_to_int = {class_name: i for i, class_name in enumerate(class_names)}
198
199 print(f" Class mapping: {class_to_int}")
200
201 # Convert true labels to integers
202 y_true_np = y_true.map(class_to_int).to_numpy()
203 y_probs_np = prob_df_aligned.to_numpy()
204
205 print(f" Data shape: y_true={y_true_np.shape}, y_probs={y_probs_np.shape}")
206 print(f" Unique true labels (integers): {set(y_true_np)}")
207 print(f" Class labels (columns): {class_names}")
208 print(f" Label distribution: {dict(zip(*np.unique(y_true_np, return_counts=True)))}")
209
210 # Check for any unmapped labels (will be NaN)
211 if pd.isna(y_true_np).any():
212 raise ValueError(" Error: Some true labels could not be mapped to class columns")
213
214 ]]></token>
<!-- Reusable test fixture: a single-training DirectPred run on the bundled train/test CSVs (one main gex layer + one repeated cnv layer), with hpo_iter=1 for speed; asserts line counts and key rows in the embeddings, feature-importance, feature-log, predicted-label and stats collection elements. <yield/> lets a tool add its own params/assertions. -->
215 <xml name="common_test">
216 <param name="non_commercial_use" value="True"/>
217 <conditional name="training_type">
218 <param name="model" value="s_train"/>
219 <param name="train_clin" value="train/clin" ftype="csv"/>
220 <param name="test_clin" value="test/clin" ftype="csv"/>
221 <param name="train_omics_main" value="train/gex" ftype="csv"/>
222 <param name="test_omics_main" value="test/gex" ftype="csv"/>
223 <param name="assay_main" value="bar"/>
224 <repeat name="omics">
225 <param name="train_omics" value="train/cnv" ftype="csv"/>
226 <param name="test_omics" value="test/cnv" ftype="csv"/>
227 <param name="assay" value="foo"/>
228 </repeat>
229 <conditional name="model_class">
230 <param name="model_class" value="DirectPred"/>
231 </conditional>
232 <param name="target_variables" value="Erlotinib"/>
233 <param name="surv_event_var" value="OS_STATUS"/>
234 <param name="surv_time_var" value="OS_MONTHS"/>
235 <section name="advanced">
236 <param name="hpo_iter" value="1"/>
237 </section>
238 </conditional>
239 <yield/>
240 <output_collection name="results" type="list">
241 <element name="job.embeddings_test">
242 <assert_contents>
243 <has_n_lines n="50"/>
244 </assert_contents>
245 </element>
246 <element name="job.embeddings_train">
247 <assert_contents>
248 <has_n_lines n="50"/>
249 </assert_contents>
250 </element>
251 <element name="job.feature_importance.GradientShap">
252 <assert_contents>
253 <has_text_matching expression="Erlotinib,0,,bar,A2M,"/>
254 <has_text_matching expression="Erlotinib,0,,bar,ABCC4,"/>
255 <has_text_matching expression="GradientShap"/>
256 </assert_contents>
257 </element>
258 <element name="job.feature_importance.IntegratedGradients">
259 <assert_contents>
260 <has_text_matching expression="Erlotinib,0,,bar,A2M,"/>
261 <has_text_matching expression="Erlotinib,0,,bar,ABCC4,"/>
262 <has_text_matching expression="IntegratedGradients"/>
263 </assert_contents>
264 </element>
265 <element name="job.feature_logs.bar">
266 <assert_contents>
267 <has_n_lines n="25"/>
268 </assert_contents>
269 </element>
270 <element name="job.feature_logs.omics_foo">
271 <assert_contents>
272 <has_n_lines n="25"/>
273 </assert_contents>
274 </element>
275 <element name="job.predicted_labels">
276 <assert_contents>
277 <has_text_matching expression="source_dataset:A-704,Erlotinib,"/>
278 <has_text_matching expression="target_dataset:KMRC-20,Erlotinib,"/>
279 </assert_contents>
280 </element>
281 <element name="job.stats">
282 <assert_contents>
283 <has_text_matching expression="DirectPred,Erlotinib,numerical,mse,"/>
284 <has_text_matching expression="DirectPred,Erlotinib,numerical,r2,"/>
285 <has_text_matching expression="DirectPred,Erlotinib,numerical,pearson_corr,"/>
286 </assert_contents>
287 </element>
288 </output_collection>
289 </xml>
<!-- Shared reStructuredText help footer (license warning + tool overview). NOTE(review): the Documentation_ and `copyright holders`_ link targets are not defined in this token — presumably each including tool supplies them; confirm. -->
290 <token name="@COMMON_HELP@">
291
292 .. class:: warningmark
293
294 **WARNING: This tool is only available for NON-COMMERCIAL use. Permission is only granted for academic, research, and educational purposes. Before using, be sure to review, agree, and comply with the license.**
295
296 Flexynesis is a deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction.
297 The package includes multiple types of deep learning architectures such as simple fully connected networks, supervised variational autoencoders, graph convolutional networks, multi-triplet networks different options of data layer fusion, and automates feature selection and hyperparameter optimization.
298
299 For more information, please check the Documentation_ :
300
301 For commercial use, please review the flexynesis license on GitHub and contact the `copyright holders`_ .
302
303 -----
304
305 </token>
<!-- Shared <creator> metadata; <yield/> lets individual tools add further contributors between the listed people. -->
306 <xml name="creator">
307 <creator>
308 <organization name="European Galaxy Team" url="https://galaxyproject.org/eu/"/>
309 <person givenName="Amirhossein" familyName="Naghsh Nilchi" email="nilchia@informatik.uni-freiburg.de"/>
310 <yield/>
311 <person givenName="Björn" familyName="Grüning" email="gruening@informatik.uni-freiburg.de"/>
312 </creator>
313 </xml>
<!-- Shared citation: the flexynesis preprint DOI. -->
314 <xml name="citations">
315 <citations>
316 <citation type="doi">10.1101/2024.07.16.603606</citation>
317 </citations>
318 </xml>
319 </macros>