Mercurial > repos > bgruening > sklearn_train_test_split

diff train_test_split.xml @ 11:5da2217cd788 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb
author: bgruening
date: Wed, 09 Aug 2023 13:33:25 +0000
parents: ce2fd1edbc6e
--- a/train_test_split.xml	Thu Aug 11 09:36:17 2022 +0000
+++ b/train_test_split.xml	Wed Aug 09 13:33:25 2023 +0000
@@ -1,15 +1,15 @@
-<tool id="sklearn_train_test_split" name="Split Dataset" version="@VERSION@">
+<tool id="sklearn_train_test_split" name="Split Dataset" version="@VERSION@" profile="@PROFILE@">
     <description>into training and test subsets</description>
     <macros>
         <import>main_macros.xml</import>
         <macro name="label_input" token_label="Select the dataset containing labels">
-            <param name="labels" type="data" format="tabular" label="@LABEL@"/>
+            <param name="labels" type="data" format="tabular" label="@LABEL@" />
             <param name="header1" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Does the dataset contain header?" />
-            <param name="col" type="data_column" data_ref="labels" label="Select target column"/>
+            <param name="col" type="data_column" data_ref="labels" label="Select target column" />
         </macro>
     </macros>
-    <expand macro="python_requirements"/>
-    <expand macro="macro_stdio"/>
+    <expand macro="python_requirements" />
+    <expand macro="macro_stdio" />
     <version_command>echo "@VERSION@"</version_command>
     <command detect_errors="exit_code"><![CDATA[
         python '$__tool_directory__/train_test_split.py'
@@ -29,10 +29,10 @@
     ]]>
     </command>
     <configfiles>
-        <inputs name="inputs"/>
+        <inputs name="inputs" />
     </configfiles>
     <inputs>
-        <param name="infile_array" type="data" format="tabular" label="Select the dataset containing array to split" help="This tool only supports to split one array at each tool run. If X, y are in separate files, the splitting task could be done by invoking this tool twice in which this input dataset is swapped while all other parameters are kept the same."/>
+        <param name="infile_array" type="data" format="tabular" label="Select the dataset containing array to split" help="This tool only supports to split one array at each tool run. If X, y are in separate files, the splitting task could be done by invoking this tool twice in which this input dataset is swapped while all other parameters are kept the same." />
         <param name="header0" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Does the dataset contain header?" />
         <conditional name="mode_selection">
             <param name="selected_mode" type="select" label="Select the splitting mode">
@@ -42,8 +42,8 @@
             <when value="train_test_split">
                 <section name="options" title="Options" expanded="true">
                     <param argument="test_size" type="float" min="0" optional="false" value="0.25" label="Test size:"
-                        help="A float number, 0.0 - 1.0, represents the proportion of the dataset to be included in the test split."/>
-                    <param argument="random_state" type="integer" optional="true" value="" label="Random seed number:"/>
+                        help="A float number, 0.0 - 1.0, represents the proportion of the dataset to be included in the test split." />
+                    <param argument="random_state" type="integer" optional="true" value="" label="Random seed number:" />
                     <conditional name="shuffle_selection">
                         <param name="shuffle" type="select" label="Shuffle strategy">
                             <option value="None">None - No shuffle</option>
@@ -51,13 +51,13 @@
                             <option value="stratified">Stratified Shuffle</option>
                             <option value="group">Group Shuffle</option>
                         </param>
-                        <when value="None"/>
-                        <when value="simple"/>
+                        <when value="None" />
+                        <when value="simple" />
                         <when value="stratified">
-                            <expand macro="label_input"/>
+                            <expand macro="label_input" />
                         </when>
                         <when value="group">
-                            <expand macro="label_input" label="Select the dataset containing groups"/>
+                            <expand macro="label_input" label="Select the dataset containing groups" />
                         </when>
                     </conditional>
                 </section>
@@ -83,198 +83,194 @@
                         <option value="LeavePGroupsOut">LeavePGroupsOut</option>
                     </param>
                     <when value="KFold">
-                        <expand macro="cv_n_splits"/>
-                        <expand macro="cv_shuffle"/>
-                        <expand macro="random_state"/>
+                        <expand macro="cv_n_splits" />
+                        <expand macro="cv_shuffle" />
+                        <expand macro="random_state" />
                     </when>
                     <when value="RepeatedKFold">
-                        <expand macro="cv_n_splits" value="5"/>
+                        <expand macro="cv_n_splits" value="5" />
                         <param argument="n_repeats" type="integer" value="10" label="n_repeats" help="Number of times cross-validator needs to be repeated." />
                         <expand macro="random_state" />
                     </when>
                     <when value="StratifiedKFold">
-                        <expand macro="cv_n_splits"/>
-                        <expand macro="cv_shuffle"/>
-                        <expand macro="random_state"/>
+                        <expand macro="cv_n_splits" />
+                        <expand macro="cv_shuffle" />
+                        <expand macro="random_state" />
                         <section name="target_input" title="Target values" expanded="true">
-                            <expand macro="label_input"/>
+                            <expand macro="label_input" />
                         </section>
                     </when>
                     <when value="RepeatedStratifiedKFold">
-                        <expand macro="cv_n_splits" value="5"/>
+                        <expand macro="cv_n_splits" value="5" />
                         <param argument="n_repeats" type="integer" value="10" label="n_repeats" help="Number of times cross-validator needs to be repeated." />
                         <expand macro="random_state" />
                         <section name="target_input" title="Target values" expanded="true">
-                            <expand macro="label_input"/>
+                            <expand macro="label_input" />
                         </section>
                     </when>
                     <when value="LeaveOneOut">
                     </when>
                     <when value="LeavePOut">
-                        <param argument="p" type="integer" value="" label="p" help="Integer. Size of the test sets."/>
+                        <param argument="p" type="integer" value="" label="p" help="Integer. Size of the test sets." />
                     </when>
                     <when value="ShuffleSplit">
-                        <expand macro="cv_n_splits" value="10" help="Number of re-shuffling and splitting iterations."/>
+                        <expand macro="cv_n_splits" value="10" help="Number of re-shuffling and splitting iterations." />
                         <expand macro="cv_test_size" value="0.1" />
-                        <expand macro="random_state"/>
+                        <expand macro="random_state" />
                     </when>
                     <when value="StratifiedShuffleSplit">
-                        <expand macro="cv_n_splits" value="10" help="Number of re-shuffling and splitting iterations."/>
+                        <expand macro="cv_n_splits" value="10" help="Number of re-shuffling and splitting iterations." />
                         <expand macro="cv_test_size" value="0.1" />
-                        <expand macro="random_state"/>
+                        <expand macro="random_state" />
                         <section name="target_input" title="Target values" expanded="true">
-                            <expand macro="label_input"/>
+                            <expand macro="label_input" />
                         </section>
                     </when>
                     <when value="TimeSeriesSplit">
-                        <expand macro="cv_n_splits"/>
+                        <expand macro="cv_n_splits" />
                         <param argument="max_train_size" type="integer" value="" optional="true" label="Maximum size of the training set" help="Maximum size for a single training set." />
                     </when>
                     <when value="PredefinedSplit">
-                        <param argument="test_fold" type="text" value="" area="true" label="test_fold" help="List, e.g., [0, 1, -1, 1], represents two test sets, [X[0]] and [X[1], X[3]], X[2] is excluded from any test set due to '-1'."/>
+                        <param argument="test_fold" type="text" value="" area="true" label="test_fold" help="List, e.g., [0, 1, -1, 1], represents two test sets, [X[0]] and [X[1], X[3]], X[2] is excluded from any test set due to '-1'." />
                     </when>
                     <when value="OrderedKFold">
-                        <expand macro="cv_n_splits"/>
-                        <expand macro="cv_shuffle"/>
-                        <expand macro="random_state"/>
+                        <expand macro="cv_n_splits" />
+                        <expand macro="cv_shuffle" />
+                        <expand macro="random_state" />
+                        <expand macro="cv_n_stratification_bins" />
                         <section name="target_input" title="Target values" expanded="true">
-                            <expand macro="label_input" label="Select the dataset containing target values"/>
+                            <expand macro="label_input" label="Select the dataset containing target values" />
                         </section>
                     </when>
                     <when value="RepeatedOrderedKFold">
-                        <expand macro="cv_n_splits"/>
-                        <param argument="n_repeats" type="integer" value="5"/>
-                        <expand macro="random_state"/>
+                        <expand macro="cv_n_splits" />
+                        <param argument="n_repeats" type="integer" value="5" />
+                        <expand macro="random_state" />
+                        <expand macro="cv_n_stratification_bins" />
                         <section name="target_input" title="Target values" expanded="true">
-                            <expand macro="label_input" label="Select the dataset containing target values"/>
+                            <expand macro="label_input" label="Select the dataset containing target values" />
                         </section>
                     </when>
                     <when value="GroupKFold">
-                        <expand macro="cv_n_splits"/>
+                        <expand macro="cv_n_splits" />
                         <expand macro="cv_groups" />
                     </when>
                     <when value="GroupShuffleSplit">
-                        <expand macro="cv_n_splits" value="5"/>
-                        <expand macro="cv_test_size"/>
-                        <expand macro="random_state"/>
-                        <expand macro="cv_groups"/>
+                        <expand macro="cv_n_splits" value="5" />
+                        <expand macro="cv_test_size" />
+                        <expand macro="random_state" />
+                        <expand macro="cv_groups" />
                     </when>
                     <when value="LeaveOneGroupOut">
-                        <expand macro="cv_groups"/>
+                        <expand macro="cv_groups" />
                     </when>
                     <when value="LeavePGroupsOut">
                         <param argument="n_groups" type="integer" value="" label="n_groups" help="Number of groups (p) to leave out in the test split." />
-                        <expand macro="cv_groups"/>
+                        <expand macro="cv_groups" />
                     </when>
                 </conditional>
-                <param name="nth_split" type="integer" min="1" value="1" label="Type the index of split to output" help="Split index starts from 1 to  total = n_splits (x n_repeats). (nth_split)"/>
+                <param name="nth_split" type="integer" min="1" value="1" label="Type the index of split to output" help="Split index starts from 1 to  total = n_splits (x n_repeats). (nth_split)" />
             </when>
         </conditional>
     </inputs>
     <outputs>
-        <data format="tabular" name="out_train" label="${tool.name} on ${on_string} (train)"/>
-        <data format="tabular" name="out_test" label="${tool.name} on ${on_string} (test)"/>
+        <data format="tabular" name="out_train" label="${tool.name} on ${on_string} (train)" />
+        <data format="tabular" name="out_test" label="${tool.name} on ${on_string} (test)" />
     </outputs>
     <tests>
         <test>
-            <param name="infile_array" value="regression_X.tabular" ftype="tabular"/>
-            <param name="header0" value="true"/>
+            <param name="infile_array" value="regression_X.tabular" ftype="tabular" />
+            <param name="header0" value="true" />
             <conditional name="mode_selection">
-                <param name="selected_mode" value="train_test_split"/>
+                <param name="selected_mode" value="train_test_split" />
                 <section name="options">
-                    <param name="random_state" value="123"/>
+                    <param name="random_state" value="123" />
                     <conditional name="shuffle_selection">
-                        <param name="shuffle" value="simple"/>
+                        <param name="shuffle" value="simple" />
                     </conditional>
                 </section>
             </conditional>
-            <output name="out_train" file="train_test_split_train01.tabular" ftype="tabular"/>
-            <output name="out_test" file="train_test_split_test01.tabular" ftype="tabular"/>
+            <output name="out_train" file="train_test_split_train01.tabular" ftype="tabular" />
+            <output name="out_test" file="train_test_split_test01.tabular" ftype="tabular" />
         </test>
          <test>
-            <param name="infile_array" value="regression_X.tabular" ftype="tabular"/>
-            <param name="header0" value="true"/>
+            <param name="infile_array" value="regression_X.tabular" ftype="tabular" />
+            <param name="header0" value="true" />
             <conditional name="mode_selection">
-                <param name="selected_mode" value="cv_splitter"/>
+                <param name="selected_mode" value="cv_splitter" />
                 <conditional name="cv_selector">
-                    <param name="selected_cv" value="ShuffleSplit"/>
-                    <param name="random_state" value="123"/>
-                    <param name="n_splits" value="2"/>
-                    <param name="test_size" value="0.25"/>
+                    <param name="selected_cv" value="ShuffleSplit" />
+                    <param name="random_state" value="123" />
+                    <param name="n_splits" value="2" />
+                    <param name="test_size" value="0.25" />
                 </conditional>
             </conditional>
-            <output name="out_train" file="train_test_split_train01.tabular" ftype="tabular"/>
-            <output name="out_test" file="train_test_split_test01.tabular" ftype="tabular"/>
+            <output name="out_train" file="train_test_split_train01.tabular" ftype="tabular" />
+            <output name="out_test" file="train_test_split_test01.tabular" ftype="tabular" />
         </test>
         <test>
-            <param name="infile_array" value="imblearn_X.tabular" ftype="tabular"/>
-            <param name="header0" value="false"/>
+            <param name="infile_array" value="imblearn_X.tabular" ftype="tabular" />
+            <param name="header0" value="false" />
             <conditional name="mode_selection">
-                <param name="selected_mode" value="train_test_split"/>
+                <param name="selected_mode" value="train_test_split" />
                 <section name="options">
-                    <param name="test_size" value="0.2"/>
-                    <param name="random_state" value="123"/>
+                    <param name="test_size" value="0.2" />
+                    <param name="random_state" value="123" />
                     <conditional name="shuffle_selection">
-                        <param name="shuffle" value="stratified"/>
-                        <param name="labels" value="imblearn_y.tabular" ftype="tabular"/>
-                        <param name="header1" value="false"/>
-                        <param name="col" value="1"/>
+                        <param name="shuffle" value="stratified" />
+                        <param name="labels" value="imblearn_y.tabular" ftype="tabular" />
+                        <param name="header1" value="false" />
+                        <param name="col" value="1" />
                     </conditional>
                 </section>
             </conditional>
-            <output name="out_train" file="train_test_split_train02.tabular" ftype="tabular"/>
-            <output name="out_test" file="train_test_split_test02.tabular" ftype="tabular"/>
+            <output name="out_train" file="train_test_split_train02.tabular" ftype="tabular" />
+            <output name="out_test" file="train_test_split_test02.tabular" ftype="tabular" />
         </test>
         <test>
-            <param name="infile_array" value="imblearn_X.tabular" ftype="tabular"/>
-            <param name="header0" value="false"/>
+            <param name="infile_array" value="imblearn_X.tabular" ftype="tabular" />
+            <param name="header0" value="false" />
             <conditional name="mode_selection">
-                <param name="selected_mode" value="cv_splitter"/>
+                <param name="selected_mode" value="cv_splitter" />
                 <conditional name="cv_selector">
-                    <param name="selected_cv" value="StratifiedShuffleSplit"/>
-                    <param name="random_state" value="123"/>
-                    <param name="test_size" value="0.2"/>
-                    <param name="n_splits" value="1"/>
+                    <param name="selected_cv" value="StratifiedShuffleSplit" />
+                    <param name="random_state" value="123" />
+                    <param name="test_size" value="0.2" />
+                    <param name="n_splits" value="1" />
                     <section name="target_input">
-                        <param name="labels" value="imblearn_y.tabular" ftype="tabular"/>
-                        <param name="header1" value="false"/>
-                        <param name="col" value="1"/>
+                        <param name="labels" value="imblearn_y.tabular" ftype="tabular" />
+                        <param name="header1" value="false" />
+                        <param name="col" value="1" />
                     </section>
                 </conditional>
             </conditional>
-            <output name="out_train" file="train_test_split_train02.tabular" ftype="tabular"/>
-            <output name="out_test" file="train_test_split_test02.tabular" ftype="tabular"/>
+            <output name="out_train" file="train_test_split_train02.tabular" ftype="tabular" />
+            <output name="out_test" file="train_test_split_test02.tabular" ftype="tabular" />
         </test>
         <test>
-            <param name="infile_array" value="regression_X.tabular" ftype="tabular"/>
-            <param name="header0" value="true"/>
+            <param name="infile_array" value="regression_X.tabular" ftype="tabular" />
+            <param name="header0" value="true" />
             <conditional name="mode_selection">
-                <param name="selected_mode" value="cv_splitter"/>
+                <param name="selected_mode" value="cv_splitter" />
                 <conditional name="cv_selector">
-                    <param name="selected_cv" value="OrderedKFold"/>
-                    <param name="random_state" value="123"/>
-                    <param name="shuffle" value="true"/>
-                    <param name="n_splits" value="5"/>
+                    <param name="selected_cv" value="OrderedKFold" />
+                    <param name="random_state" value="123" />
+                    <param name="shuffle" value="true" />
+                    <param name="n_splits" value="5" />
                     <section name="target_input">
-                        <param name="labels" value="regression_y.tabular" ftype="tabular"/>
-                        <param name="header1" value="true"/>
-                        <param name="col" value="1"/>
+                        <param name="labels" value="regression_y.tabular" ftype="tabular" />
+                        <param name="header1" value="true" />
+                        <param name="col" value="1" />
                     </section>
                 </conditional>
             </conditional>
-            <output name="out_train" file="train_test_split_train03.tabular" ftype="tabular"/>
-            <output name="out_test" file="train_test_split_test03.tabular" ftype="tabular"/>
+            <output name="out_train" file="train_test_split_train03.tabular" ftype="tabular" />
+            <output name="out_test" file="train_test_split_test03.tabular" ftype="tabular" />
         </test>
     </tests>
     <help><![CDATA[
 **What it does**
-
-This tool implements splitter function and classes from `sklearn.model_selection` module to split contents (rows) of a table into
-two subsets for training and test, respectively . The simple train test split mode not only supports shuffle split and stratified
-shuffle split natively carried by the `train_test_split` function, but also gets extended to do group shuffle.
-The cross-validation splitter mode supports more diverse splitting strategies. Each tool run outputs one split, train and test.
-To get different splitting sets, for example, nested CV, multiple tool runs are needed with different `nth_split`.
-Example: 6-fold CV. Set `n_splits` to 6. Run the tool 6 times with the same parameters, but set `nth_split` according to the number of the run (1-6). 
+This tool implements splitter function and classes from `sklearn.model_selection` module to split contents (rows) of a table into two subsets for training and test, respectively . The simple train test split mode not only supports shuffle split and stratified shuffle split natively carried by the `train_test_split` function, but also gets extended to do group shuffle. The cross-validation splitter mode supports more diverse splitting strategies. Each tool run outputs one split, train and test. To get different splitting sets, for example, nested CV, multiple tool runs are needed with different `nth_split`.
 
 - Train Test Split mode
     - direct split, no shuffle
@@ -293,5 +289,5 @@
 Output: two tabular datasets containing training and test subsets, respectively.
 
     ]]></help>
-    <expand macro="sklearn_citation"/>
-</tool>
+    <expand macro="sklearn_citation" />
+</tool>
\ No newline at end of file
author	bgruening
date	Wed, 09 Aug 2023 13:33:25 +0000
parents	ce2fd1edbc6e
children