<?xml version="1.0" encoding="UTF-8"?>
<!--
    Galaxy tool wrapper for chopin2.

    The wrapper links the selected CSV dataset into the job working directory,
    runs chopin2 with k-fold cross-validation on it (optionally with feature
    selection) and collects "summary.txt" and, when feature selection is
    enabled, "selection.txt" from the working directory.

    NOTE(review): this wrapper has no <tests> section; consider adding one
    together with a small test dataset.
-->
<tool name="chopin2" id="chopin2" version="1.0.6">
    <description>Domain-Agnostic Supervised Learning with Hyperdimensional Computing</description>

    <!-- Tool developer -->
    <creator>
        <person givenName="Fabio" familyName="Cumbo"
                url="https://fabio-cumbo.github.io/"
                email="fabio.cumbo@gmail.com" />
    </creator>

    <!-- Define dependencies (package version matches the tool version) -->
    <requirements>
        <requirement type="package" version="1.0.6">chopin2</requirement>
    </requirements>

    <!-- Command template (Cheetah); a non-zero exit code marks the job as failed -->
    <command detect_errors="exit_code">
<![CDATA[
        ## Link the input dataset into the working directory under its own file name.
        ## The path is quoted and the steps are chained with && so that chopin2
        ## is not started when the link cannot be created.
        ln -s '${dataset}' &&

        chopin2

            --dataset "`basename '${dataset}'`"
            --dimensionality ${dimensionality}
            --levels ${levels}
            --retrain ${retrain}
            --stop
            --crossv_k ${folds}

        ## Feature selection options are passed only when the user enabled them
        #if str($feature_selection.enable_fs) == "true":
            --select_features
            --group_min ${feature_selection.group_min}
            --accuracy_threshold ${feature_selection.accuracy_threshold}
            --accuracy_uncertainty_perc ${feature_selection.accuracy_uncertainty_perc}
        #end if

            --dump
            --cleanup
            ## Use the slots allocated by Galaxy, falling back to 4 when unset
            --nproc "\${GALAXY_SLOTS:-4}"
            --verbose
]]>
    </command>

    <inputs>
        <!-- Select a dataset -->
        <param name="dataset" type="data" format="csv"
               label="Select a dataset"
               help="Input dataset with features on columns and observations on rows. Last column must contain classes." />

        <!-- Vector dimensionality -->
        <param name="dimensionality" type="integer" value="10000" min="100"
               label="Vectors dimensionality"
               help="Size of hypervectors is usually 10,000 in vector-symbolic architectures. However, lower values could work with small datasets in terms of number of features and observations. Please note that you may require to increase this number in case of datasets with a huge number of features." />

        <!-- Number of levels -->
        <param name="levels" type="integer" value="1000" min="2"
               label="Levels"
               help="Number of level vectors. You may consider to look at the distribution of your data in order to choose the most appropriate value." />

        <!-- Number of retraining iterations -->
        <param name="retrain" type="integer" value="0" min="0"
               label="Model retraining iterations"
               help="Maximum number of retraining iterations. Class hypervectors are retrained to minimize errors caused by noise." />

        <!-- Number of folds for cross-validation -->
        <param name="folds" type="integer" value="2" min="2"
               label="Number of folds for cross-validation"
               help="This tool makes use of k-folds cross-validation to evaluate the accuracy of the hyperdimensional model. Make sure to choose a good number of folds for validating the classification model. Please note that higher number of folds could significantly increase the running time." />

        <!-- Optional feature selection and its tuning parameters -->
        <conditional name="feature_selection">
            <!-- Enable feature selection -->
            <param name="enable_fs" type="boolean" checked="false" truevalue="true" falsevalue="false"
                   label="Enable feature selection"
                   help="If selected, this will extract a set of features with the better discriminative power among classes. The feature selection algorithm is defined as a backward variable selection method." />

            <when value="true">
                <!-- Minimum group size -->
                <param name="group_min" type="integer" value="1" min="1"
                       label="Minimum number of selected features"
                       help="Tool will stop removing features if its number will reach this value." />

                <!-- Accuracy threshold -->
                <param name="accuracy_threshold" type="float" value="60.0" min="0.0" max="100.0"
                       label="Accuracy threshold"
                       help="Stop the execution if the best accuracy reached for a group of features is lower than this value." />

                <!-- Accuracy uncertainty percentage -->
                <param name="accuracy_uncertainty_perc" type="float" value="5.0" min="0.0" max="100.0"
                       label="Accuracy uncertainty percentage"
                       help="Consider non optimal solutions if model accuracy is greater than the best accuracy minus this percentage." />
            </when>
            <!-- No additional parameters when feature selection is disabled -->
            <when value="false" />
        </conditional>
    </inputs>

    <outputs>
        <!-- Output summary file -->
        <data format="txt" name="summary" label="${tool.name} on ${on_string}: Summary" from_work_dir="summary.txt" />
        <!-- Output file with selected features; created only when feature selection is enabled -->
        <data format="txt" name="selection" label="${tool.name} on ${on_string}: Selection" from_work_dir="selection.txt">
            <filter>feature_selection["enable_fs"]</filter>
        </data>
    </outputs>

    <!-- Help text (reStructuredText); kept at column 0 so that it is not rendered as a block quote -->
    <help><![CDATA[
This is a domain-agnostic supervised learning tool that exploits the Hyperdimensional Computing paradigm to encode and compare data.

-----

**Input file**

The input is a CSV file representing a matrix with the observations on the rows and features on columns.
The last column contains classes associated with the observations.

The tool doesn't support datasets with missing values. It also supports numerical datasets only.

Please, refer to the example below to know how to structure your dataset:

+------------------+--------------+--------------+--------------+------+--------------+-------------+
|                  | **Feature1** | **Feature2** | **Feature3** | ...  | **FeatureM** | **Class**   |
+==================+==============+==============+==============+======+==============+=============+
| **Observation1** | 0.68         | 1.97         | 0.02         | ...  | 0.01         | Case        |
+------------------+--------------+--------------+--------------+------+--------------+-------------+
| **Observation2** | 0.52         | 0.60         | 1.16         | ...  | 0.07         | Case        |
+------------------+--------------+--------------+--------------+------+--------------+-------------+
| **Observation3** | 0.56         | 0.01         | 0.50         | ...  | 1.16         | Control     |
+------------------+--------------+--------------+--------------+------+--------------+-------------+
| ...              | ...          | ...          | ...          | ...  | ...          | ...         |
+------------------+--------------+--------------+--------------+------+--------------+-------------+
| **ObservationN** | 0.05         | 1.86         | 0.03         | ...  | 2.83         | Control     |
+------------------+--------------+--------------+--------------+------+--------------+-------------+

-----

**Output**

The output is a summary table with information about the accuracy of the hyperdimensional model and
the number of retraining iterations that were required to achieve that level of accuracy.

In case the feature selection is enabled, it also returns a file with the list of selected features
that come out from the hyperdimensional model that reached the best accuracy.

-----

.. class:: infomark

**Notes**

Please visit the official GitHub repository_ for other information about `chopin2`.

.. _repository: https://github.com/fabio-cumbo/chopin2
    ]]></help>

    <citations>
        <citation type="doi">10.3390/a13090233</citation>
    </citations>
</tool>