annotate train_test_split.py @ 36:b75cae00f980 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 208a8d348e7c7a182cfbe1b6f17868146428a7e2"
author bgruening
date Tue, 13 Apr 2021 22:16:07 +0000
parents 0e5fcf7ddc75
children 1bef885255e0
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
1 import argparse
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
2 import json
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
3 import warnings
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
4
36
b75cae00f980 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 208a8d348e7c7a182cfbe1b6f17868146428a7e2"
bgruening
parents: 35
diff changeset
5 import pandas as pd
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
6 from galaxy_ml.model_validations import train_test_split
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
7 from galaxy_ml.utils import get_cv, read_columns
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
8
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
9
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
10 def _get_single_cv_split(params, array, infile_labels=None, infile_groups=None):
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
11 """output (train, test) subset from a cv splitter
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
12
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
13 Parameters
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
14 ----------
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
15 params : dict
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
16 Galaxy tool inputs
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
17 array : pandas DataFrame object
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
18 The target dataset to split
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
19 infile_labels : str
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
20 File path to dataset containing target values
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
21 infile_groups : str
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
22 File path to dataset containing group values
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
23 """
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
24 y = None
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
25 groups = None
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
26
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
27 nth_split = params["mode_selection"]["nth_split"]
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
28
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
29 # read groups
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
30 if infile_groups:
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
31 header = "infer" if (params["mode_selection"]["cv_selector"]["groups_selector"]["header_g"]) else None
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
32 column_option = params["mode_selection"]["cv_selector"]["groups_selector"]["column_selector_options_g"][
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
33 "selected_column_selector_option_g"
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
34 ]
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
35 if column_option in [
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
36 "by_index_number",
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
37 "all_but_by_index_number",
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
38 "by_header_name",
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
39 "all_but_by_header_name",
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
40 ]:
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
41 c = params["mode_selection"]["cv_selector"]["groups_selector"]["column_selector_options_g"]["col_g"]
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
42 else:
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
43 c = None
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
44
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
45 groups = read_columns(
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
46 infile_groups,
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
47 c=c,
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
48 c_option=column_option,
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
49 sep="\t",
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
50 header=header,
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
51 parse_dates=True,
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
52 )
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
53 groups = groups.ravel()
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
54
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
55 params["mode_selection"]["cv_selector"]["groups_selector"] = groups
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
56
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
57 # read labels
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
58 if infile_labels:
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
59 target_input = params["mode_selection"]["cv_selector"].pop("target_input")
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
60 header = "infer" if target_input["header1"] else None
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
61 col_index = target_input["col"][0] - 1
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
62 df = pd.read_csv(infile_labels, sep="\t", header=header, parse_dates=True)
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
63 y = df.iloc[:, col_index].values
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
64
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
65 # construct the cv splitter object
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
66 splitter, groups = get_cv(params["mode_selection"]["cv_selector"])
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
67
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
68 total_n_splits = splitter.get_n_splits(array.values, y=y, groups=groups)
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
69 if nth_split > total_n_splits:
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
70 raise ValueError("Total number of splits is {}, but got `nth_split` " "= {}".format(total_n_splits, nth_split))
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
71
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
72 i = 1
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
73 for train_index, test_index in splitter.split(array.values, y=y, groups=groups):
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
74 # suppose nth_split >= 1
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
75 if i == nth_split:
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
76 break
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
77 else:
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
78 i += 1
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
79
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
80 train = array.iloc[train_index, :]
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
81 test = array.iloc[test_index, :]
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
82
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
83 return train, test
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
84
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
85
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
86 def main(
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
87 inputs,
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
88 infile_array,
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
89 outfile_train,
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
90 outfile_test,
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
91 infile_labels=None,
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
92 infile_groups=None,
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
93 ):
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
94 """
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
95 Parameter
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
96 ---------
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
97 inputs : str
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
98 File path to galaxy tool parameter
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
99
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
100 infile_array : str
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
101 File paths of input arrays separated by comma
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
102
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
103 infile_labels : str
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
104 File path to dataset containing labels
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
105
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
106 infile_groups : str
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
107 File path to dataset containing groups
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
108
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
109 outfile_train : str
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
110 File path to dataset containing train split
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
111
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
112 outfile_test : str
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
113 File path to dataset containing test split
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
114 """
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
115 warnings.simplefilter("ignore")
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
116
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
117 with open(inputs, "r") as param_handler:
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
118 params = json.load(param_handler)
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
119
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
120 input_header = params["header0"]
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
121 header = "infer" if input_header else None
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
122 array = pd.read_csv(infile_array, sep="\t", header=header, parse_dates=True)
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
123
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
124 # train test split
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
125 if params["mode_selection"]["selected_mode"] == "train_test_split":
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
126 options = params["mode_selection"]["options"]
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
127 shuffle_selection = options.pop("shuffle_selection")
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
128 options["shuffle"] = shuffle_selection["shuffle"]
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
129 if infile_labels:
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
130 header = "infer" if shuffle_selection["header1"] else None
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
131 col_index = shuffle_selection["col"][0] - 1
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
132 df = pd.read_csv(infile_labels, sep="\t", header=header, parse_dates=True)
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
133 labels = df.iloc[:, col_index].values
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
134 options["labels"] = labels
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
135
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
136 train, test = train_test_split(array, **options)
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
137
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
138 # cv splitter
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
139 else:
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
140 train, test = _get_single_cv_split(params, array, infile_labels=infile_labels, infile_groups=infile_groups)
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
141
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
142 print("Input shape: %s" % repr(array.shape))
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
143 print("Train shape: %s" % repr(train.shape))
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
144 print("Test shape: %s" % repr(test.shape))
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
145 train.to_csv(outfile_train, sep="\t", header=input_header, index=False)
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
146 test.to_csv(outfile_test, sep="\t", header=input_header, index=False)
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
147
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
148
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
149 if __name__ == "__main__":
29
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
150 aparser = argparse.ArgumentParser()
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
151 aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
152 aparser.add_argument("-X", "--infile_array", dest="infile_array")
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
153 aparser.add_argument("-y", "--infile_labels", dest="infile_labels")
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
154 aparser.add_argument("-g", "--infile_groups", dest="infile_groups")
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
155 aparser.add_argument("-o", "--outfile_train", dest="outfile_train")
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
156 aparser.add_argument("-t", "--outfile_test", dest="outfile_test")
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
157 args = aparser.parse_args()
66df2aa6cd6b "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
bgruening
parents:
diff changeset
158
35
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
159 main(
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
160 args.inputs,
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
161 args.infile_array,
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
162 args.outfile_train,
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
163 args.outfile_test,
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
164 args.infile_labels,
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
165 args.infile_groups,
0e5fcf7ddc75 "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"
bgruening
parents: 29
diff changeset
166 )