Mercurial > repos > goeckslab > bagging_tool
annotate mil_bag.xml @ 0:e6e9ea0703ef draft default tip
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
| author | goeckslab |
|---|---|
| date | Thu, 19 Jun 2025 23:31:55 +0000 |
| parents | |
| children |
| rev | line source |
|---|---|
|
0
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
1 <tool id="bagging_tool" name="Bagging Embeddings Processor" version="1.0.0+0"> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
2 <description>Process CSV files to create bags of embeddings for machine learning</description> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
3 <requirements> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
4 <container type="docker">quay.io/goeckslab/milbag:1.0.0</container> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
5 </requirements> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
6 <stdio> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
7 <exit_code range="137" level="fatal_oom" description="Out of Memory" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
8 <exit_code range="1:" level="fatal" description="Error occurred. Please check Tool Standard Error" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
9 </stdio> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
10 <command> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
11 python "$__tool_directory__/mil_bag.py" |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
12 --embeddings_csv "$embeddings_csv" |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
13 --metadata_csv "$metadata_csv" |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
14 --split_proportions "$split_proportions" |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
15 --bag_size "$bag_size" |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
16 --pooling_method "$pooling_method" |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
17 --repeats "$repeats" |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
18 --output_csv "$output_csv" |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
19 #if $dataleak |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
20 --dataleak |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
21 #end if |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
22 #if $balance_enforced |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
23 --balance_enforced |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
24 #end if |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
25 #if $ludwig_format |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
26 --ludwig_format |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
27 #end if |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
28 #if str($random_seed) != "" |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
29 --random_seed "$random_seed" |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
30 #end if |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
31 #if str($imbalance_cap) != "" |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
32 --imbalance_cap "$imbalance_cap" |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
33 #end if |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
34 #if $truncate_bags |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
35 --truncate_bags |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
36 #end if |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
37 #if $use_gpu |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
38 --use_gpu |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
39 #end if |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
40 #if str($by_sample) != "" |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
41 --by_sample "$by_sample" |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
42 #end if |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
43 </command> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
44 <inputs> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
45 <param name="embeddings_csv" type="data" format="csv" label="Embeddings CSV File" help="CSV file containing embeddings with a 'sample_name' column."/> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
46 <param name="metadata_csv" type="data" format="csv" label="Metadata CSV File" help="CSV file with metadata containing 'sample_name' and 'label' columns."/> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
47 <param name="split_proportions" type="text" value="0.7,0.1,0.2" label="Split Proportions (train,val,test)" help="Comma-separated proportions (e.g., '0.7,0.1,0.2') for train, validation, and test splits."/> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
48 <param name="bag_size" type="text" value="3-5" label="Bag Size" help="Single number (e.g., '4') or range (e.g., '3-5') for bag sizes."/> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
49 <param name="pooling_method" type="select" label="Pooling Method" help="Method to aggregate embeddings into bags."> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
50 <option value="max_pooling">Max Pooling</option> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
51 <option value="mean_pooling">Mean Pooling</option> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
52 <option value="sum_pooling">Sum Pooling</option> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
53 <option value="min_pooling">Min Pooling</option> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
54 <option value="median_pooling">Median Pooling</option> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
55 <option value="l2_norm_pooling">L2 Norm Pooling</option> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
56 <option value="geometric_mean_pooling">Geometric Mean Pooling</option> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
57 <option value="first_embedding">First Embedding</option> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
58 <option value="last_embedding">Last Embedding</option> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
59 <option value="attention_pooling">Attention Pooling</option> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
60 </param> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
61 <param name="repeats" type="integer" value="1" min="1" label="Number of Repeats" help="Number of times to repeat the bagging process."/> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
62 <param name="dataleak" type="boolean" truevalue="--dataleak" falsevalue="" checked="false" label="Prevent Data Leakage?" help="If checked, prevents data leakage by splitting on unique sample names."/> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
63 <param name="balance_enforced" type="boolean" truevalue="--balance_enforced" falsevalue="" checked="false" label="Enforce Balanced Bags?" help="If checked, alternates between classes to create balanced bags."/> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
64 <param name="ludwig_format" type="boolean" truevalue="--ludwig_format" falsevalue="" checked="false" label="Ludwig Format?" help="If checked, outputs embeddings as a single string column for Ludwig compatibility."/> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
65 <param name="by_sample" type="text" value="" optional="true" label="Splits for Within-Sample Bagging" help="Optional comma-separated list of splits (0=train, 1=val, 2=test) to bag within samples (e.g., '0,1'). Defaults to random or balanced bagging if empty."/> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
66 <param name="random_seed" type="integer" value="" optional="true" label="Random Seed" help="Optional integer seed for reproducibility (e.g., 42). Leave blank for random behavior."/> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
67 <param name="imbalance_cap" type="integer" value="" optional="true" label="Maximum Imbalance Percentage" help="Optional maximum allowable imbalance percentage between classes (e.g., 50). If set, balances bags to this threshold."/> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
68 <param name="truncate_bags" type="boolean" truevalue="--truncate_bags" falsevalue="" checked="false" label="Truncate Bags for Balance?" help="If checked, truncates bags to ensure equal counts of positive and negative bags."/> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
69 <param name="use_gpu" type="boolean" truevalue="--use_gpu" falsevalue="" checked="false" label="Use GPU?" help="If checked, uses GPU for pooling operations (requires compatible hardware and libraries)."/> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
70 </inputs> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
71 <outputs> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
72 <data name="output_csv" format="csv" label="processed_bags.csv"/> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
73 </outputs> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
74 <tests> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
75 <test> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
76 <param name="embeddings_csv" value="100_digits_embeddings.csv" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
77 <param name="metadata_csv" value="100_digits_metadata.csv" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
78 <param name="split_proportions" value="0.7,0.2,0.1" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
79 <param name="bag_size" value="2" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
80 <param name="pooling_method" value="mean_pooling" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
81 <param name="repeats" value="1" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
82 <param name="dataleak" value="true" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
83 <param name="balance_enforced" value="false" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
84 <param name="ludwig_format" value="true" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
85 <param name="by_sample" value="" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
86 <param name="random_seed" value="42" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
87 <param name="imbalance_cap" value="" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
88 <param name="truncate_bags" value="false" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
89 <param name="use_gpu" value="false" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
90 <output name="output_csv"> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
91 <assert_contents> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
92 <has_text text="bag_size" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
93 <has_n_columns min="1" sep="," /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
94 </assert_contents> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
95 </output> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
96 </test> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
97 <test> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
98 <param name="embeddings_csv" value="100_digits_embeddings.csv" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
99 <param name="metadata_csv" value="100_digits_metadata.csv" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
100 <param name="split_proportions" value="0.7,0.2,0.1" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
101 <param name="bag_size" value="2" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
102 <param name="pooling_method" value="mean_pooling" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
103 <param name="repeats" value="1" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
104 <param name="dataleak" value="true" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
105 <param name="balance_enforced" value="false" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
106 <param name="ludwig_format" value="true" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
107 <param name="by_sample" value="2" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
108 <param name="random_seed" value="123" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
109 <param name="imbalance_cap" value="50" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
110 <param name="truncate_bags" value="true" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
111 <param name="use_gpu" value="true" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
112 <output name="output_csv"> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
113 <assert_contents> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
114 <has_text text="bag_size" /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
115 <has_n_columns min="1" sep="," /> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
116 </assert_contents> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
117 </output> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
118 </test> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
119 </tests> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
120 <help> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
121 <![CDATA[ |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
122 **What it does** |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
123 This tool processes embedding and metadata CSV files to create bags of samples with specified sizes and pooling methods, suitable for machine learning tasks. |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
124 |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
125 **Inputs** |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
126 - **Embeddings CSV File**: A CSV file containing embeddings with a `sample_name` column. |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
127 - **Metadata CSV File**: A CSV file with metadata containing `sample_name` and `label` columns. |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
128 - **Split Proportions**: Define train, validation, and test split ratios (e.g., '0.7,0.1,0.2'). |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
129 - **Bag Size**: Set a fixed number (e.g., '4') or range (e.g., '3-5') for bag sizes. |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
130 - **Pooling Method**: Choose how embeddings are aggregated into bags (e.g., mean, max). |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
131 - **Number of Repeats**: Specify how many times to repeat bagging (useful for augmentation). |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
132 - **Prevent Data Leakage**: Avoid leakage by splitting on unique sample names. |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
133 - **Enforce Balanced Bags**: Alternate classes for balanced bagging. |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
134 - **Ludwig Format**: Convert embeddings to a single string column for Ludwig compatibility. |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
135 - **Splits for Within-Sample Bagging**: Optional comma-separated list of splits (0=train, 1=val, 2=test) to bag within samples (e.g., '0,1'). If left empty, random or balanced bagging is used.
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
136 - **Random Seed**: Optional seed for reproducible results. |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
137 - **Maximum Imbalance Percentage**: Optional maximum allowable imbalance percentage between classes (e.g., 50). If set, bags are balanced to this threshold.
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
138 - **Truncate Bags for Balance**: Truncate bags to equalize positive and negative counts. |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
139 - **Use GPU**: Enable GPU acceleration for pooling operations (if available). |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
140 |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
141 **Outputs** |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
142 - A CSV file with bags of embeddings, including labels, split information, and processed embedding vectors. |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
143 ]]> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
144 </help> |
|
e6e9ea0703ef
planemo upload for repository https://github.com/goeckslab/gleam.git commit 783551569c645073698fce50f1ed9c4605b3e65a
goeckslab
parents:
diff
changeset
|
145 </tool> |
