Mercurial > repos > bgruening > scipy_sparse
annotate train_test_split.py @ 45:b3d80cb1ed87 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 5eca9041ce0154eded5aec07195502d5eb3cdd4f
author | bgruening |
---|---|
date | Fri, 03 Nov 2023 22:46:12 +0000 |
parents | 5af054432771 |
children |
rev | line source |
---|---|
29
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
1 import argparse |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
2 import json |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
3 import warnings |
40
5af054432771
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb
bgruening
parents:
36
diff
changeset
|
4 from distutils.version import LooseVersion as Version |
29
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
5 |
35
318484f56b6a
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 208a8d348e7c7a182cfbe1b6f17868146428a7e2"
bgruening
parents:
34
diff
changeset
|
6 import pandas as pd |
40
5af054432771
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb
bgruening
parents:
36
diff
changeset
|
7 from galaxy_ml import __version__ as galaxy_ml_version |
29
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
8 from galaxy_ml.model_validations import train_test_split |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
9 from galaxy_ml.utils import get_cv, read_columns |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
10 |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
11 |
34
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
def _get_single_cv_split(params, array, infile_labels=None, infile_groups=None):
    """Return the (train, test) subsets produced by one split of a cv splitter.

    Parameters
    ----------
    params : dict
        Galaxy tool inputs. The splitter configuration is read from
        ``params["mode_selection"]["cv_selector"]`` and the 1-based split
        number from ``params["mode_selection"]["nth_split"]``.
        NOTE: the dict is modified in place -- ``target_input`` is popped
        from the cv selector when `infile_labels` is given, and
        ``groups_selector`` is replaced by the loaded group values when
        `infile_groups` is given.
    array : pandas DataFrame object
        The target dataset to split
    infile_labels : str
        File path to dataset containing target values
    infile_groups : str
        File path to dataset containing group values

    Returns
    -------
    train, test : pandas DataFrame objects
        Row subsets of `array` selected by the `nth_split`-th split.

    Raises
    ------
    ValueError
        If `nth_split` is smaller than 1, or larger than the total number
        of splits reported by the splitter.
    """
    mode_selection = params["mode_selection"]
    nth_split = mode_selection["nth_split"]

    # Split numbers are 1-based. Without this guard a value < 1 would never
    # match inside the loop below and the last split would be returned
    # silently. Fail early, before any file is read or `params` is mutated.
    if nth_split < 1:
        raise ValueError("`nth_split` must be >= 1, but got {}".format(nth_split))

    y = None
    groups = None

    # read groups
    if infile_groups:
        groups_selector = mode_selection["cv_selector"]["groups_selector"]
        header = "infer" if groups_selector["header_g"] else None
        column_options = groups_selector["column_selector_options_g"]
        column_option = column_options["selected_column_selector_option_g"]
        # Only these selector modes carry an explicit column specification.
        if column_option in (
            "by_index_number",
            "all_but_by_index_number",
            "by_header_name",
            "all_but_by_header_name",
        ):
            c = column_options["col_g"]
        else:
            c = None

        groups = read_columns(
            infile_groups,
            c=c,
            c_option=column_option,
            sep="\t",
            header=header,
            parse_dates=True,
        )
        groups = groups.ravel()

        # Hand the loaded values to `get_cv` through the selector dict.
        mode_selection["cv_selector"]["groups_selector"] = groups

    # read labels
    if infile_labels:
        target_input = mode_selection["cv_selector"].pop("target_input")
        header = "infer" if target_input["header1"] else None
        # The tool parameter is a 1-based column number; iloc is 0-based.
        col_index = target_input["col"][0] - 1
        df = pd.read_csv(infile_labels, sep="\t", header=header, parse_dates=True)
        y = df.iloc[:, col_index].values

    # construct the cv splitter object
    cv_selector = mode_selection["cv_selector"]
    if Version(galaxy_ml_version) < Version("0.8.3"):
        # Drop this option for galaxy_ml releases older than 0.8.3
        # (presumably unsupported there -- confirm against galaxy_ml).
        cv_selector.pop("n_stratification_bins", None)
    splitter, groups = get_cv(cv_selector)

    total_n_splits = splitter.get_n_splits(array.values, y=y, groups=groups)
    if nth_split > total_n_splits:
        raise ValueError(
            "Total number of splits is {}, but got `nth_split` "
            "= {}".format(total_n_splits, nth_split)
        )

    # Walk the splits until the requested (1-based) one is reached; the
    # loop variables keep the indices of that split after the break.
    splits = splitter.split(array.values, y=y, groups=groups)
    for i, (train_index, test_index) in enumerate(splits, start=1):
        if i == nth_split:
            break

    train = array.iloc[train_index, :]
    test = array.iloc[test_index, :]

    return train, test
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
98 |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
99 |
34
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
100 def main( |
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
101 inputs, |
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
102 infile_array, |
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
103 outfile_train, |
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
104 outfile_test, |
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
105 infile_labels=None, |
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
106 infile_groups=None, |
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
107 ): |
29
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
108 """ |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
109 Parameter |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
110 --------- |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
111 inputs : str |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
112 File path to galaxy tool parameter |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
113 |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
114 infile_array : str |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
115 File paths of input arrays separated by comma |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
116 |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
117 infile_labels : str |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
118 File path to dataset containing labels |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
119 |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
120 infile_groups : str |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
121 File path to dataset containing groups |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
122 |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
123 outfile_train : str |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
124 File path to dataset containing train split |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
125 |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
126 outfile_test : str |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
127 File path to dataset containing test split |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
128 """ |
34
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
129 warnings.simplefilter("ignore") |
29
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
130 |
34
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
131 with open(inputs, "r") as param_handler: |
29
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
132 params = json.load(param_handler) |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
133 |
34
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
134 input_header = params["header0"] |
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
135 header = "infer" if input_header else None |
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
136 array = pd.read_csv(infile_array, sep="\t", header=header, parse_dates=True) |
29
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
137 |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
138 # train test split |
34
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
139 if params["mode_selection"]["selected_mode"] == "train_test_split": |
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
140 options = params["mode_selection"]["options"] |
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
141 shuffle_selection = options.pop("shuffle_selection") |
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
142 options["shuffle"] = shuffle_selection["shuffle"] |
29
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
143 if infile_labels: |
34
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
144 header = "infer" if shuffle_selection["header1"] else None |
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
145 col_index = shuffle_selection["col"][0] - 1 |
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
146 df = pd.read_csv(infile_labels, sep="\t", header=header, parse_dates=True) |
29
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
147 labels = df.iloc[:, col_index].values |
34
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
148 options["labels"] = labels |
29
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
149 |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
150 train, test = train_test_split(array, **options) |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
151 |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
152 # cv splitter |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
153 else: |
36
92e09b827300
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
35
diff
changeset
|
154 train, test = _get_single_cv_split( |
92e09b827300
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
35
diff
changeset
|
155 params, array, infile_labels=infile_labels, infile_groups=infile_groups |
92e09b827300
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
bgruening
parents:
35
diff
changeset
|
156 ) |
29
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
157 |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
158 print("Input shape: %s" % repr(array.shape)) |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
159 print("Train shape: %s" % repr(train.shape)) |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
160 print("Test shape: %s" % repr(test.shape)) |
34
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
161 train.to_csv(outfile_train, sep="\t", header=input_header, index=False) |
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
162 test.to_csv(outfile_test, sep="\t", header=input_header, index=False) |
29
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
163 |
1658dff33c46
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff
changeset
|
164 |
34
4efd73be98bb
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents:
29
diff
changeset
|
if __name__ == "__main__":
    # Command-line entry point: parse the file paths handed over on the
    # command line and forward them, positionally, to main().
    cli = argparse.ArgumentParser()

    # The JSON parameter file is the only mandatory argument.
    cli.add_argument("-i", "--inputs", dest="inputs", required=True)

    # Remaining flags are optional paths; registration order is kept so the
    # generated usage/help text stays the same.
    optional_paths = (
        ("-X", "infile_array"),
        ("-y", "infile_labels"),
        ("-g", "infile_groups"),
        ("-o", "outfile_train"),
        ("-t", "outfile_test"),
    )
    for short_flag, dest_name in optional_paths:
        cli.add_argument(short_flag, "--" + dest_name, dest=dest_name)

    parsed = cli.parse_args()

    # NOTE: main() is called positionally, so this order (outputs before the
    # labels/groups inputs) must match main()'s signature.
    main(
        parsed.inputs,
        parsed.infile_array,
        parsed.outfile_train,
        parsed.outfile_test,
        parsed.infile_labels,
        parsed.infile_groups,
    )