bgruening / sklearn_data_preprocess: model_validations.py @ 24:9e43ee712723 (draft)
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7

author: bgruening
date:   Tue, 14 May 2019 18:19:35 -0400
parent: 23:d6b8103c909c
1 """ | |
2 class | |
3 ----- | |
4 OrderedKFold | |
5 RepeatedOrderedKold | |
6 | |
7 | |
8 function | |
9 -------- | |
10 train_test_split | |
11 """ | |
12 | |
import numpy as np
import warnings

from itertools import chain
from math import ceil, floor
from sklearn.model_selection import (GroupShuffleSplit, ShuffleSplit,
                                     StratifiedShuffleSplit)
from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits
from sklearn.utils import check_random_state, indexable, safe_indexing
from sklearn.utils.validation import _num_samples, check_array


def _validate_shuffle_split(n_samples, test_size, train_size,
                            default_test_size=None):
    """
    Validation helper to check if the test/train sizes are meaningful with
    respect to the size of the data (n_samples).
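
    Examples
    --------
    Illustrative doctest (hypothetical inputs; the counts follow from the
    ceil/floor arithmetic below):

    >>> _validate_shuffle_split(10, 0.2, None, default_test_size=0.25)
    (8, 2)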
30 """ | |
    if test_size is None and train_size is None:
        test_size = default_test_size

    test_size_type = np.asarray(test_size).dtype.kind
    train_size_type = np.asarray(train_size).dtype.kind

    if (test_size_type == 'i' and (test_size >= n_samples or test_size <= 0)
            or test_size_type == 'f' and (test_size <= 0 or test_size >= 1)):
        raise ValueError('test_size={0} should be either positive and smaller'
                         ' than the number of samples {1} or a float in the '
                         '(0, 1) range'.format(test_size, n_samples))

    if (train_size_type == 'i' and (train_size >= n_samples or train_size <= 0)
            or train_size_type == 'f' and (train_size <= 0 or train_size >= 1)):
        raise ValueError('train_size={0} should be either positive and smaller'
                         ' than the number of samples {1} or a float in the '
                         '(0, 1) range'.format(train_size, n_samples))

    if train_size is not None and train_size_type not in ('i', 'f'):
        raise ValueError("Invalid value for train_size: {}".format(train_size))
    if test_size is not None and test_size_type not in ('i', 'f'):
        raise ValueError("Invalid value for test_size: {}".format(test_size))

    if (train_size_type == 'f' and test_size_type == 'f' and
            train_size + test_size > 1):
        raise ValueError(
            'The sum of test_size and train_size = {}, should be in the (0, 1)'
            ' range. Reduce test_size and/or train_size.'
            .format(train_size + test_size))

    if test_size_type == 'f':
        n_test = ceil(test_size * n_samples)
    elif test_size_type == 'i':
        n_test = float(test_size)

    if train_size_type == 'f':
        n_train = floor(train_size * n_samples)
    elif train_size_type == 'i':
        n_train = float(train_size)

    if train_size is None:
        n_train = n_samples - n_test
    elif test_size is None:
        n_test = n_samples - n_train

    if n_train + n_test > n_samples:
        raise ValueError('The sum of train_size and test_size = %d, '
                         'should be smaller than the number of '
                         'samples %d. Reduce test_size and/or '
                         'train_size.' % (n_train + n_test, n_samples))

    n_train, n_test = int(n_train), int(n_test)

    if n_train == 0:
        raise ValueError(
            'With n_samples={}, test_size={} and train_size={}, the '
            'resulting train set will be empty. Adjust any of the '
            'aforementioned parameters.'.format(n_samples, test_size,
                                                train_size)
        )

    return n_train, n_test


def train_test_split(*arrays, **options):
    """Extend sklearn.model_selection.train_test_split to support group split.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.

    train_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : None or str (default='simple')
        How to shuffle the data before splitting.
        If None, no shuffle.
        For str, one of 'simple', 'stratified' and 'group', corresponding to
        `ShuffleSplit`, `StratifiedShuffleSplit` and `GroupShuffleSplit`,
        respectively.

    labels : array-like or None (default=None)
        Ignored if shuffle is None or 'simple'.
        When shuffle='stratified', this array is used as class labels.
        When shuffle='group', this array is used as groups.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

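    Examples
    --------
    A minimal sketch of a group-aware split (the arrays and group ids below
    are illustrative only); each group ends up entirely in either the train
    or the test portion. With test_size=0.2 and 5 groups, one group of two
    samples is held out:

    >>> import numpy as np
    >>> X = np.arange(20).reshape(10, 2)
    >>> y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
    >>> groups = np.array([1, 1, 2, 2, 3, 3, 4, 4, 5, 5])
    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...     X, y, shuffle='group', labels=groups, test_size=0.2,
    ...     random_state=42)
    >>> X_test.shape
    (2, 2)
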
140 """ | |
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    shuffle = options.pop('shuffle', 'simple')
    labels = options.pop('labels', None)

    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    if shuffle == 'group':
        if labels is None:
            raise ValueError("When shuffle='group', "
                             "labels should not be None!")
        labels = check_array(labels, ensure_2d=False, dtype=None)
        uniques = np.unique(labels)
        n_samples = uniques.size

    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,
                                              default_test_size=0.25)

    shuffle_options = dict(test_size=n_test,
                           train_size=n_train,
                           random_state=random_state)

    if shuffle is None:
        if labels is not None:
            warnings.warn("`labels` is ignored when "
                          "shuffle is None!")

        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)

    elif shuffle == 'simple':
        if labels is not None:
            warnings.warn("`labels` is not needed and therefore "
                          "ignored for ShuffleSplit, as shuffle='simple'!")

        cv = ShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None))

    elif shuffle == 'stratified':
        cv = StratifiedShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=labels))

    elif shuffle == 'group':
        cv = GroupShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None, groups=labels))

    else:
        raise ValueError("The argument `shuffle` only supports None, "
                         "'simple', 'stratified' and 'group', but got `%s`!"
                         % shuffle)

    return list(chain.from_iterable((safe_indexing(a, train),
                                     safe_indexing(a, test)) for a in arrays))


class OrderedKFold(_BaseKFold):
    """
    Split into K folds based on the ordered target values.

    Parameters
    ----------
    n_splits : int, default=3
        Number of folds. Must be at least 2.
    shuffle : bool, default=False
        Whether to shuffle each consecutive block of n_splits
        target-sorted samples before assigning them to folds.
    random_state : None or int
        Controls the randomness used when ``shuffle=True``.
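
    Examples
    --------
    Illustrative doctest (toy target values): with shuffle=False, the i-th
    fold takes every n_splits-th sample of the target-sorted index, so the
    target values are spread evenly across folds.

    >>> import numpy as np
    >>> y = np.array([1, 5, 2, 4, 3, 6])
    >>> cv = OrderedKFold(n_splits=3)
    >>> for _, test in cv.split(np.zeros((6, 1)), y):
    ...     print(sorted(y[test].tolist()))
    [1, 4]
    [2, 5]
    [3, 6]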
214 """ | |
215 | |
    def __init__(self, n_splits=3, shuffle=False, random_state=None):
        super(OrderedKFold, self).__init__(n_splits, shuffle, random_state)

    def _iter_test_indices(self, X, y, groups=None):
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        y = np.asarray(y)
        sorted_index = np.argsort(y)
        if self.shuffle:
            # Shuffle within each consecutive block of n_splits
            # target-sorted samples, so fold membership is randomized
            # while each fold still spans the full range of y.
            current = 0
            rng = check_random_state(self.random_state)
            for i in range(n_samples // n_splits):
                start, stop = current, current + n_splits
                rng.shuffle(sorted_index[start:stop])
                current = stop
            rng.shuffle(sorted_index[current:])

        # Assign the target-sorted samples to folds in round-robin order.
        for i in range(n_splits):
            yield sorted_index[i:n_samples:n_splits]


class RepeatedOrderedKFold(_RepeatedSplits):
    """Repeated OrderedKFold runs multiple times with different randomization.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.

    n_repeats : int, default=5
        Number of times the cross-validator is repeated.

    random_state : int, RandomState instance or None, optional (default=None)
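
    Examples
    --------
    Illustrative doctest: 5 folds repeated twice yield 10 splits in total.

    >>> cv = RepeatedOrderedKFold(n_splits=5, n_repeats=2, random_state=42)
    >>> cv.get_n_splits()
    10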
249 """ | |
250 def __init__(self, n_splits=5, n_repeats=5, random_state=None): | |
251 super(RepeatedOrderedKFold, self).__init__( | |
252 OrderedKFold, n_repeats, random_state, n_splits=n_splits) |