comparison train_test_split.xml @ 0:0985b0dd6f1a draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
author bgruening
date Fri, 01 Nov 2019 17:26:59 -0400
parents
children ce2fd1edbc6e
comparison
equal deleted inserted replaced
-1:000000000000 0:0985b0dd6f1a
1 <tool id="sklearn_train_test_split" name="Split Dataset" version="@VERSION@">
2 <description>into training and test subsets</description>
3 <macros> <!-- Macros for this tool: shared ones are imported from main_macros.xml; label_input is defined here. -->
4 <import>main_macros.xml</import>
5 <macro name="label_input" token_label="Select the dataset containing labels"> <!-- Dataset + header flag + column picker; the dataset label defaults to token_label and can be overridden with label="..." on expand. -->
6 <param name="labels" type="data" format="tabular" label="@LABEL@"/>
7 <param name="header1" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Does the dataset contain header?" />
8 <param name="col" type="data_column" data_ref="labels" label="Select target column"/> <!-- data_ref ties the column choices to the dataset selected in "labels". -->
9 </macro>
10 </macros>
11 <expand macro="python_requirements"/> <!-- Not defined in this file; presumably provided by main_macros.xml. -->
12 <expand macro="macro_stdio"/> <!-- Not defined in this file; presumably provided by main_macros.xml. -->
13 <version_command>echo "@VERSION@"</version_command> <!-- Echoes the same @VERSION@ token that the tool's version attribute uses. -->
14 <!-- Runs train_test_split.py on the selected table. The infile_labels and infile_groups options are appended only for the shuffle strategies / CV splitters named in the #if conditions; the path of the inputs config file is passed through the inputs option. --> <command detect_errors="exit_code"><![CDATA[
15 python '$__tool_directory__/train_test_split.py'
16 --inputs '$inputs'
17 --infile_array '$infile_array'
18 #if $mode_selection.selected_mode == 'train_test_split' and $mode_selection.options.shuffle_selection.shuffle not in ['None', 'simple']
19 --infile_labels '$mode_selection.options.shuffle_selection.labels'
20 #end if
21 #if $mode_selection.selected_mode == 'cv_splitter' and $mode_selection.cv_selector.selected_cv in ['StratifiedKFold', 'RepeatedStratifiedKFold', 'StratifiedShuffleSplit', 'OrderedKFold', 'RepeatedOrderedKFold']
22 --infile_labels '$mode_selection.cv_selector.target_input.labels'
23 #end if
24 #if $mode_selection.selected_mode == 'cv_splitter' and $mode_selection.cv_selector.selected_cv in ['GroupKFold', 'GroupShuffleSplit', 'LeaveOneGroupOut', 'LeavePGroupsOut']
25 --infile_groups '$mode_selection.cv_selector.groups_selector.infile_g'
26 #end if
27 --outfile_train '$out_train'
28 --outfile_test '$out_test'
29 ]]>
30 </command>
31 <configfiles> <!-- Per the Galaxy tool schema, an inputs entry here makes Galaxy write the tool parameters to a JSON file. -->
32 <inputs name="inputs"/> <!-- Referenced as $inputs in the command. -->
33 </configfiles>
34 <inputs>
35 <param name="infile_array" type="data" format="tabular" label="Select the dataset containing array to split" help="This tool only supports to split one array at each tool run. If X, y are in separate files, the splitting task could be done by invoking this tool twice in which this input dataset is swapped while all other parameters are kept the same."/> <!-- Passed to the script through the infile_array option of the command. -->
36 <param name="header0" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Does the dataset contain header?" /> <!-- Not referenced in the command line; presumably read by the script from the inputs config file (TODO confirm). -->
37 <conditional name="mode_selection">
38 <param name="selected_mode" type="select" label="Select the splitting mode">
39 <option value="train_test_split" selected="true">Train Test Split</option>
40 <option value="cv_splitter">Cross-Validation Splitter</option>
41 </param>
42 <when value="train_test_split"> <!-- Options for the plain train/test split mode. -->
43 <section name="options" title="Options" expanded="true">
44 <param argument="test_size" type="float" min="0" max="1.0" optional="false" value="0.25" label="Test size:"
45 help="A float number, 0.0 - 1.0, represents the proportion of the dataset to be included in the test split."/> <!-- max mirrors the 0.0 - 1.0 range stated in the help, so an out-of-range proportion is rejected in the form instead of failing at run time. -->
46 <param argument="random_state" type="integer" optional="true" value="" label="Random seed number:"/>
47 <conditional name="shuffle_selection"> <!-- The command passes shuffle_selection.labels as infile_labels for every strategy other than None and simple. -->
48 <param name="shuffle" type="select" label="Shuffle strategy">
49 <option value="None">None - No shuffle</option>
50 <option value="simple">Shuffle</option>
51 <option value="stratified">Stratified Shuffle</option>
52 <option value="group">Group Shuffle</option>
53 </param>
54 <when value="None"/>
55 <when value="simple"/>
56 <when value="stratified">
57 <expand macro="label_input"/>
58 </when>
59 <when value="group"> <!-- Reuses label_input with an overridden label, so the groups dataset is also exposed as "labels". -->
60 <expand macro="label_input" label="Select the dataset containing groups"/>
61 </when>
62 </conditional>
63 </section>
64 </when>
65 <when value="cv_splitter">
66 <conditional name="cv_selector">
67 <param name="selected_cv" type="select" label="Select the cv splitter:"> <!-- Selects which <when> branch below applies; the command also checks this value to decide whether labels or groups are passed to the script. -->
68 <option value="KFold">KFold</option>
69 <option value="RepeatedKFold">RepeatedKFold</option>
70 <option value="StratifiedKFold">StratifiedKFold</option>
71 <option value="RepeatedStratifiedKFold">RepeatedStratifiedKFold</option>
72 <option value="LeaveOneOut">LeaveOneOut</option>
73 <option value="LeavePOut">LeavePOut</option>
74 <option value="ShuffleSplit">ShuffleSplit</option>
75 <option value="StratifiedShuffleSplit">StratifiedShuffleSplit</option>
76 <option value="TimeSeriesSplit">TimeSeriesSplit</option>
77 <option value="PredefinedSplit">PredefinedSplit</option>
78 <option value="OrderedKFold">OrderedKFold</option>
79 <option value="RepeatedOrderedKFold">RepeatedOrderedKFold</option>
80 <option value="GroupKFold">GroupKFold</option>
81 <option value="GroupShuffleSplit">GroupShuffleSplit</option>
82 <option value="LeaveOneGroupOut">LeaveOneGroupOut</option>
83 <option value="LeavePGroupsOut">LeavePGroupsOut</option>
84 </param>
85 <when value="KFold"> <!-- The cv_n_splits, cv_shuffle, cv_test_size and random_state macros used in these branches are not defined in this file; presumably imported from main_macros.xml. -->
86 <expand macro="cv_n_splits"/>
87 <expand macro="cv_shuffle"/>
88 <expand macro="random_state"/>
89 </when>
90 <when value="RepeatedKFold">
91 <expand macro="cv_n_splits" value="5"/>
92 <param argument="n_repeats" type="integer" value="10" label="n_repeats" help="Number of times cross-validator needs to be repeated." />
93 <expand macro="random_state" />
94 </when>
95 <when value="StratifiedKFold"> <!-- target_input.labels is what the command passes as infile_labels for this splitter. -->
96 <expand macro="cv_n_splits"/>
97 <expand macro="cv_shuffle"/>
98 <expand macro="random_state"/>
99 <section name="target_input" title="Target values" expanded="true">
100 <expand macro="label_input"/>
101 </section>
102 </when>
103 <when value="RepeatedStratifiedKFold"> <!-- target_input.labels is what the command passes as infile_labels for this splitter. -->
104 <expand macro="cv_n_splits" value="5"/>
105 <param argument="n_repeats" type="integer" value="10" label="n_repeats" help="Number of times cross-validator needs to be repeated." />
106 <expand macro="random_state" />
107 <section name="target_input" title="Target values" expanded="true">
108 <expand macro="label_input"/>
109 </section>
110 </when>
111 <when value="LeaveOneOut"> <!-- No additional inputs for this splitter. -->
112 </when>
113 <when value="LeavePOut">
114 <param argument="p" type="integer" value="" label="p" help="Integer. Size of the test sets."/> <!-- NOTE(review): empty default and not marked optional; presumably the user must supply a value (confirm). -->
115 </when>
116 <when value="ShuffleSplit">
117 <expand macro="cv_n_splits" value="10" help="Number of re-shuffling and splitting iterations."/>
118 <expand macro="cv_test_size" value="0.1" />
119 <expand macro="random_state"/>
120 </when>
121 <when value="StratifiedShuffleSplit"> <!-- target_input.labels is what the command passes as infile_labels for this splitter. -->
122 <expand macro="cv_n_splits" value="10" help="Number of re-shuffling and splitting iterations."/>
123 <expand macro="cv_test_size" value="0.1" />
124 <expand macro="random_state"/>
125 <section name="target_input" title="Target values" expanded="true">
126 <expand macro="label_input"/>
127 </section>
128 </when>
129 <when value="TimeSeriesSplit">
130 <expand macro="cv_n_splits"/>
131 <param argument="max_train_size" type="integer" value="" optional="true" label="Maximum size of the training set" help="Maximum size for a single training set." />
132 </when>
133 <when value="PredefinedSplit">
134 <param argument="test_fold" type="text" value="" area="true" label="test_fold" help="List, e.g., [0, 1, -1, 1], represents two test sets, [X[0]] and [X[1], X[3]], X[2] is excluded from any test set due to '-1'."/>
135 </when>
136 <when value="OrderedKFold"> <!-- As with the stratified splitters, the command passes target_input.labels as infile_labels. -->
137 <expand macro="cv_n_splits"/>
138 <expand macro="cv_shuffle"/>
139 <expand macro="random_state"/>
140 <section name="target_input" title="Target values" expanded="true">
141 <expand macro="label_input" label="Select the dataset containing target values"/>
142 </section>
143 </when>
144 <when value="RepeatedOrderedKFold"> <!-- The command passes target_input.labels as infile_labels for this splitter. -->
145 <expand macro="cv_n_splits"/>
146 <param argument="n_repeats" type="integer" value="5" label="n_repeats" help="Number of times cross-validator needs to be repeated." /> <!-- label/help match the n_repeats inputs of RepeatedKFold and RepeatedStratifiedKFold; the default of 5 is unchanged. -->
147 <expand macro="random_state"/>
148 <section name="target_input" title="Target values" expanded="true">
149 <expand macro="label_input" label="Select the dataset containing target values"/>
150 </section>
151 </when>
152 <when value="GroupKFold"> <!-- Group-based splitters: the command passes groups_selector.infile_g as infile_groups; cv_groups is not defined in this file and presumably declares that input (confirm in main_macros.xml). -->
153 <expand macro="cv_n_splits"/>
154 <expand macro="cv_groups" />
155 </when>
156 <when value="GroupShuffleSplit">
157 <expand macro="cv_n_splits" value="5"/>
158 <expand macro="cv_test_size"/>
159 <expand macro="random_state"/>
160 <expand macro="cv_groups"/>
161 </when>
162 <when value="LeaveOneGroupOut">
163 <expand macro="cv_groups"/>
164 </when>
165 <when value="LeavePGroupsOut">
166 <param argument="n_groups" type="integer" value="" label="n_groups" help="Number of groups (p) to leave out in the test split." /> <!-- NOTE(review): empty default and not marked optional; presumably the user must supply a value (confirm). -->
167 <expand macro="cv_groups"/>
168 </when>
169 </conditional>
170 <param name="nth_split" type="integer" min="1" value="1" label="Type the index of split to output" help="Split index starts from 1 to total = n_splits (x n_repeats). (nth_split)"/> <!-- 1-based index (min="1"); not referenced in the command line, presumably read by the script from the inputs config file (TODO confirm). -->
171 </when>
172 </conditional>
173 </inputs>
174 <outputs> <!-- Both datasets are handed to the script by path through the outfile_train and outfile_test options of the command. -->
175 <data format="tabular" name="out_train" label="${tool.name} on ${on_string} (train)"/>
176 <data format="tabular" name="out_test" label="${tool.name} on ${on_string} (test)"/>
177 </outputs>
178 <tests>
179 <test> <!-- Train/test split mode, simple shuffle with seed 123; test_size is left at its default. -->
180 <param name="infile_array" value="regression_X.tabular" ftype="tabular"/>
181 <param name="header0" value="true"/>
182 <conditional name="mode_selection">
183 <param name="selected_mode" value="train_test_split"/>
184 <section name="options">
185 <param name="random_state" value="123"/>
186 <conditional name="shuffle_selection">
187 <param name="shuffle" value="simple"/>
188 </conditional>
189 </section>
190 </conditional>
191 <output name="out_train" file="train_test_split_train01.tabular" ftype="tabular"/>
192 <output name="out_test" file="train_test_split_test01.tabular" ftype="tabular"/>
193 </test>
194 <test> <!-- CV splitter mode, ShuffleSplit with seed 123 and test_size 0.25; nth_split is left at its default. Compared against the train_test_split_*01.tabular files. -->
195 <param name="infile_array" value="regression_X.tabular" ftype="tabular"/>
196 <param name="header0" value="true"/>
197 <conditional name="mode_selection">
198 <param name="selected_mode" value="cv_splitter"/>
199 <conditional name="cv_selector">
200 <param name="selected_cv" value="ShuffleSplit"/>
201 <param name="random_state" value="123"/>
202 <param name="n_splits" value="2"/>
203 <param name="test_size" value="0.25"/>
204 </conditional>
205 </conditional>
206 <output name="out_train" file="train_test_split_train01.tabular" ftype="tabular"/>
207 <output name="out_test" file="train_test_split_test01.tabular" ftype="tabular"/>
208 </test>
209 <test> <!-- Train/test split mode, stratified shuffle on a headerless table: test_size 0.2, seed 123, labels taken from column 1 of imblearn_y.tabular. -->
210 <param name="infile_array" value="imblearn_X.tabular" ftype="tabular"/>
211 <param name="header0" value="false"/>
212 <conditional name="mode_selection">
213 <param name="selected_mode" value="train_test_split"/>
214 <section name="options">
215 <param name="test_size" value="0.2"/>
216 <param name="random_state" value="123"/>
217 <conditional name="shuffle_selection">
218 <param name="shuffle" value="stratified"/>
219 <param name="labels" value="imblearn_y.tabular" ftype="tabular"/>
220 <param name="header1" value="false"/>
221 <param name="col" value="1"/>
222 </conditional>
223 </section>
224 </conditional>
225 <output name="out_train" file="train_test_split_train02.tabular" ftype="tabular"/>
226 <output name="out_test" file="train_test_split_test02.tabular" ftype="tabular"/>
227 </test>
228 <test> <!-- CV splitter mode, StratifiedShuffleSplit with a single split: test_size 0.2, seed 123, labels taken from column 1 of imblearn_y.tabular. Compared against the train_test_split_*02.tabular files. -->
229 <param name="infile_array" value="imblearn_X.tabular" ftype="tabular"/>
230 <param name="header0" value="false"/>
231 <conditional name="mode_selection">
232 <param name="selected_mode" value="cv_splitter"/>
233 <conditional name="cv_selector">
234 <param name="selected_cv" value="StratifiedShuffleSplit"/>
235 <param name="random_state" value="123"/>
236 <param name="test_size" value="0.2"/>
237 <param name="n_splits" value="1"/>
238 <section name="target_input">
239 <param name="labels" value="imblearn_y.tabular" ftype="tabular"/>
240 <param name="header1" value="false"/>
241 <param name="col" value="1"/>
242 </section>
243 </conditional>
244 </conditional>
245 <output name="out_train" file="train_test_split_train02.tabular" ftype="tabular"/>
246 <output name="out_test" file="train_test_split_test02.tabular" ftype="tabular"/>
247 </test>
248 <test> <!-- CV splitter mode, OrderedKFold with shuffle enabled: 5 splits, seed 123, targets taken from column 1 of regression_y.tabular; nth_split is left at its default. -->
249 <param name="infile_array" value="regression_X.tabular" ftype="tabular"/>
250 <param name="header0" value="true"/>
251 <conditional name="mode_selection">
252 <param name="selected_mode" value="cv_splitter"/>
253 <conditional name="cv_selector">
254 <param name="selected_cv" value="OrderedKFold"/>
255 <param name="random_state" value="123"/>
256 <param name="shuffle" value="true"/>
257 <param name="n_splits" value="5"/>
258 <section name="target_input">
259 <param name="labels" value="regression_y.tabular" ftype="tabular"/>
260 <param name="header1" value="true"/>
261 <param name="col" value="1"/>
262 </section>
263 </conditional>
264 </conditional>
265 <output name="out_train" file="train_test_split_train03.tabular" ftype="tabular"/>
266 <output name="out_test" file="train_test_split_test03.tabular" ftype="tabular"/>
267 </test>
268 </tests>
269 <help><![CDATA[
270 **What it does**
271 This tool uses the splitter function and classes from the `sklearn.model_selection` module to split the contents (rows) of a table into two subsets, for training and testing respectively. The simple train test split mode supports not only the shuffle split and stratified shuffle split provided natively by the `train_test_split` function, but is also extended to support group shuffle split. The cross-validation splitter mode supports more diverse splitting strategies. Each tool run outputs one split (a train subset and a test subset). To get different splits, for example for nested CV, run the tool multiple times with different `nth_split` values.
272
273 - Train Test Split mode
274 - direct split, no shuffle
275 - shuffle split
276 - stratified shuffle split
277 - group shuffle split
278 - Cross-Validation Splitter mode
279 - KFold
280 - StratifiedKFold
281 - LeaveOneOut
282 - LeavePOut
283 - ...
284
285 Input: a tabular dataset.
286
287 Output: two tabular datasets containing training and test subsets, respectively.
288
289 ]]></help>
290 <expand macro="sklearn_citation"/>
291 </tool>