annotate mil_bag.xml @ 0:e6e9ea0703ef draft default tip

planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
author goeckslab
date Thu, 19 Jun 2025 23:31:55 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
1 <tool id="bagging_tool" name="Bagging Embeddings Processor" version="1.0.0+0">
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
2 <description>Process CSV files to create bags of embeddings for machine learning</description>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
3 <requirements>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
4 <container type="docker">quay.io/goeckslab/milbag:1.0.0</container>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
5 </requirements>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
6 <stdio>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
7 <exit_code range="137" level="fatal_oom" description="Out of Memory" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
8 <exit_code range="1:" level="fatal" description="Error occurred. Please check Tool Standard Error" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
9 </stdio>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
10 <command>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
11 python "$__tool_directory__/mil_bag.py"
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
12 --embeddings_csv "$embeddings_csv"
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
13 --metadata_csv "$metadata_csv"
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
14 --split_proportions "$split_proportions"
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
15 --bag_size "$bag_size"
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
16 --pooling_method "$pooling_method"
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
17 --repeats "$repeats"
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
18 --output_csv "$output_csv"
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
19 #if $dataleak
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
20 --dataleak
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
21 #end if
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
22 #if $balance_enforced
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
23 --balance_enforced
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
24 #end if
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
25 #if $ludwig_format
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
26 --ludwig_format
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
27 #end if
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
28 #if $random_seed != ""
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
29 --random_seed "$random_seed"
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
30 #end if
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
31 #if $imbalance_cap != ""
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
32 --imbalance_cap "$imbalance_cap"
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
33 #end if
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
34 #if $truncate_bags
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
35 --truncate_bags
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
36 #end if
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
37 #if $use_gpu
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
38 --use_gpu
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
39 #end if
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
40 #if $by_sample != ""
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
41 --by_sample "$by_sample"
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
42 #end if
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
43 </command>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
44 <inputs>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
45 <param name="embeddings_csv" type="data" format="csv" label="Embeddings CSV File" help="CSV file containing embeddings with a 'sample_name' column."/>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
46 <param name="metadata_csv" type="data" format="csv" label="Metadata CSV File" help="CSV file with metadata containing 'sample_name' and 'label' columns."/>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
47 <param name="split_proportions" type="text" value="0.7,0.1,0.2" label="Split Proportions (train,val,test)" help="Comma-separated proportions (e.g., '0.7,0.1,0.2') for train, validation, and test splits."/>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
48 <param name="bag_size" type="text" value="3-5" label="Bag Size" help="Single number (e.g., '4') or range (e.g., '3-5') for bag sizes."/>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
49 <param name="pooling_method" type="select" label="Pooling Method" help="Method to aggregate embeddings into bags.">
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
50 <option value="max_pooling">Max Pooling</option>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
51 <option value="mean_pooling">Mean Pooling</option>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
52 <option value="sum_pooling">Sum Pooling</option>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
53 <option value="min_pooling">Min Pooling</option>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
54 <option value="median_pooling">Median Pooling</option>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
55 <option value="l2_norm_pooling">L2 Norm Pooling</option>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
56 <option value="geometric_mean_pooling">Geometric Mean Pooling</option>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
57 <option value="first_embedding">First Embedding</option>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
58 <option value="last_embedding">Last Embedding</option>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
59 <option value="attention_pooling">Attention Pooling</option>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
60 </param>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
61 <param name="repeats" type="integer" value="1" min="1" label="Number of Repeats" help="Number of times to repeat the bagging process."/>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
62 <param name="dataleak" type="boolean" truevalue="--dataleak" falsevalue="" checked="false" label="Prevent Data Leakage?" help="If checked, prevents data leakage by splitting on unique sample names."/>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
63 <param name="balance_enforced" type="boolean" truevalue="--balance_enforced" falsevalue="" checked="false" label="Enforce Balanced Bags?" help="If checked, alternates between classes to create balanced bags."/>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
64 <param name="ludwig_format" type="boolean" truevalue="--ludwig_format" falsevalue="" checked="false" label="Ludwig Format?" help="If checked, outputs embeddings as a single string column for Ludwig compatibility."/>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
65 <param name="by_sample" type="text" value="" optional="true" label="Splits for Within-Sample Bagging" help="Optional comma-separated list of splits (0=train, 1=val, 2=test) to bag within samples (e.g., '0,1'). Defaults to random or balanced bagging if empty."/>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
66 <param name="random_seed" type="integer" value="" optional="true" label="Random Seed" help="Optional integer seed for reproducibility (e.g., 42). Leave blank for random behavior."/>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
67 <param name="imbalance_cap" type="integer" value="" optional="true" label="Maximum Imbalance Percentage" help="Optional maximum allowable imbalance percentage between classes (e.g., 50). If set, balances bags to this threshold."/>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
68 <param name="truncate_bags" type="boolean" truevalue="--truncate_bags" falsevalue="" checked="false" label="Truncate Bags for Balance?" help="If checked, truncates bags to ensure equal counts of positive and negative bags."/>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
69 <param name="use_gpu" type="boolean" truevalue="--use_gpu" falsevalue="" checked="false" label="Use GPU?" help="If checked, uses GPU for pooling operations (requires compatible hardware and libraries)."/>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
70 </inputs>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
71 <outputs>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
72 <data name="output_csv" format="csv" label="processed_bags.csv"/>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
73 </outputs>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
74 <tests>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
75 <test>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
76 <param name="embeddings_csv" value="100_digits_embeddings.csv" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
77 <param name="metadata_csv" value="100_digits_metadata.csv" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
78 <param name="split_proportions" value="0.7,0.2,0.1" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
79 <param name="bag_size" value="2" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
80 <param name="pooling_method" value="mean_pooling" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
81 <param name="repeats" value="1" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
82 <param name="dataleak" value="true" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
83 <param name="balance_enforced" value="false" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
84 <param name="ludwig_format" value="true" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
85 <param name="by_sample" value="" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
86 <param name="random_seed" value="42" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
87 <param name="imbalance_cap" value="" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
88 <param name="truncate_bags" value="false" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
89 <param name="use_gpu" value="false" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
90 <output name="output_csv">
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
91 <assert_contents>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
92 <has_text text="bag_size" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
93 <has_n_columns min="1" sep="," /> <!-- output is CSV; has_n_columns splits on tab unless sep is given -->
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
94 </assert_contents>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
95 </output>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
96 </test>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
97 <test>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
98 <param name="embeddings_csv" value="100_digits_embeddings.csv" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
99 <param name="metadata_csv" value="100_digits_metadata.csv" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
100 <param name="split_proportions" value="0.7,0.2,0.1" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
101 <param name="bag_size" value="2" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
102 <param name="pooling_method" value="mean_pooling" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
103 <param name="repeats" value="1" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
104 <param name="dataleak" value="true" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
105 <param name="balance_enforced" value="false" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
106 <param name="ludwig_format" value="true" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
107 <param name="by_sample" value="2" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
108 <param name="random_seed" value="123" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
109 <param name="imbalance_cap" value="50" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
110 <param name="truncate_bags" value="true" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
111 <param name="use_gpu" value="true" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
112 <output name="output_csv">
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
113 <assert_contents>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
114 <has_text text="bag_size" />
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
115 <has_n_columns min="1" sep="," /> <!-- output is CSV; has_n_columns splits on tab unless sep is given -->
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
116 </assert_contents>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
117 </output>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
118 </test>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
119 </tests>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
120 <help>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
121 <![CDATA[
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
122 **What it does**
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
123 This tool processes embedding and metadata CSV files to create bags of samples with specified sizes and pooling methods, suitable for machine learning tasks.
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
124
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
125 **Inputs**
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
126 - **Embeddings CSV File**: A CSV file containing embeddings with a `sample_name` column.
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
127 - **Metadata CSV File**: A CSV file with metadata containing `sample_name` and `label` columns.
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
128 - **Split Proportions**: Define train, validation, and test split ratios (e.g., '0.7,0.1,0.2').
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
129 - **Bag Size**: Set a fixed number (e.g., '4') or range (e.g., '3-5') for bag sizes.
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
130 - **Pooling Method**: Choose how embeddings are aggregated into bags (e.g., mean, max).
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
131 - **Number of Repeats**: Specify how many times to repeat bagging (useful for augmentation).
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
132 - **Prevent Data Leakage**: Avoid leakage by splitting on unique sample names.
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
133 - **Enforce Balanced Bags**: Alternate classes for balanced bagging.
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
134 - **Ludwig Format**: Convert embeddings to a single string column for Ludwig compatibility.
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
135 - **Splits for Within-Sample Bagging**: Optional splits (0, 1, 2) to bag within samples (e.g., '0,1').
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
136 - **Random Seed**: Optional seed for reproducible results.
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
137 - **Maximum Imbalance Percentage**: Optional cap (e.g., 50) to balance class distribution.
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
138 - **Truncate Bags for Balance**: Truncate bags to equalize positive and negative counts.
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
139 - **Use GPU**: Enable GPU acceleration for pooling operations (if available).
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
140
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
141 **Outputs**
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
142 - A CSV file with bags of embeddings, including labels, split information, and processed embedding vectors.
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
143 ]]>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
144 </help>
e6e9ea0703ef planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff changeset
145 </tool>