comparison model_validations.py @ 24:97b467e06354 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7
author bgruening
date Tue, 14 May 2019 18:07:39 -0400
parents
children
comparison
equal deleted inserted replaced
23:4ba68dd788b3 24:97b467e06354
1 """
2 class
3 -----
4 OrderedKFold
5 RepeatedOrderedKFold
6
7
8 function
9 --------
10 train_test_split
11 """
12
13 import numpy as np
14 import warnings
15
16 from itertools import chain
17 from math import ceil, floor
18 from sklearn.model_selection import (GroupShuffleSplit, ShuffleSplit,
19 StratifiedShuffleSplit)
20 from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits
21 from sklearn.utils import check_random_state, indexable, safe_indexing
22 from sklearn.utils.validation import _num_samples, check_array
23
24
25 def _validate_shuffle_split(n_samples, test_size, train_size,
26 default_test_size=None):
27 """
28 Validation helper to check if the test/test sizes are meaningful wrt to the
29 size of the data (n_samples)
30 """
31 if test_size is None and train_size is None:
32 test_size = default_test_size
33
34 test_size_type = np.asarray(test_size).dtype.kind
35 train_size_type = np.asarray(train_size).dtype.kind
36
37 if (test_size_type == 'i' and (test_size >= n_samples or test_size <= 0)
38 or test_size_type == 'f' and (test_size <= 0 or test_size >= 1)):
39 raise ValueError('test_size={0} should be either positive and smaller'
40 ' than the number of samples {1} or a float in the '
41 '(0, 1) range'.format(test_size, n_samples))
42
43 if (train_size_type == 'i' and (train_size >= n_samples or train_size <= 0)
44 or train_size_type == 'f' and (train_size <= 0 or train_size >= 1)):
45 raise ValueError('train_size={0} should be either positive and smaller'
46 ' than the number of samples {1} or a float in the '
47 '(0, 1) range'.format(train_size, n_samples))
48
49 if train_size is not None and train_size_type not in ('i', 'f'):
50 raise ValueError("Invalid value for train_size: {}".format(train_size))
51 if test_size is not None and test_size_type not in ('i', 'f'):
52 raise ValueError("Invalid value for test_size: {}".format(test_size))
53
54 if (train_size_type == 'f' and test_size_type == 'f' and
55 train_size + test_size > 1):
56 raise ValueError(
57 'The sum of test_size and train_size = {}, should be in the (0, 1)'
58 ' range. Reduce test_size and/or train_size.'
59 .format(train_size + test_size))
60
61 if test_size_type == 'f':
62 n_test = ceil(test_size * n_samples)
63 elif test_size_type == 'i':
64 n_test = float(test_size)
65
66 if train_size_type == 'f':
67 n_train = floor(train_size * n_samples)
68 elif train_size_type == 'i':
69 n_train = float(train_size)
70
71 if train_size is None:
72 n_train = n_samples - n_test
73 elif test_size is None:
74 n_test = n_samples - n_train
75
76 if n_train + n_test > n_samples:
77 raise ValueError('The sum of train_size and test_size = %d, '
78 'should be smaller than the number of '
79 'samples %d. Reduce test_size and/or '
80 'train_size.' % (n_train + n_test, n_samples))
81
82 n_train, n_test = int(n_train), int(n_test)
83
84 if n_train == 0:
85 raise ValueError(
86 'With n_samples={}, test_size={} and train_size={}, the '
87 'resulting train set will be empty. Adjust any of the '
88 'aforementioned parameters.'.format(n_samples, test_size,
89 train_size)
90 )
91
92 return n_train, n_test
93
94
def train_test_split(*arrays, **options):
    """Extend sklearn.model_selection.train_test_split to have group split.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.

    train_size : float, int, or None, (default=None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : None or str (default='simple')
        How to shuffle the data before splitting.
        None, no shuffle.
        For str, one of 'simple', 'stratified' and 'group', corresponding to
        `ShuffleSplit`, `StratifiedShuffleSplit` and `GroupShuffleSplit`,
        respectively.

    labels : array-like or None (default=None)
        Ignored if shuffle is None or 'simple'.
        When shuffle='stratified', this array is used as class labels.
        When shuffle='group', this array is used as groups.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

    Raises
    ------
    ValueError
        If no array is given, if `labels` is None while shuffle is
        'stratified' or 'group', if the sizes are invalid, or if `shuffle`
        is not one of the supported values.
    TypeError
        If an unknown keyword option is passed.

    Notes
    -----
    When shuffle='group', `test_size` and `train_size` are validated against
    the number of unique groups in `labels` rather than the number of samples.
    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    shuffle = options.pop('shuffle', 'simple')
    labels = options.pop('labels', None)

    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    # Both label-driven strategies need `labels`; fail early with a clear
    # message instead of letting the underlying splitter choke on None.
    if shuffle in ('stratified', 'group') and labels is None:
        raise ValueError("When shuffle='{}', "
                         "labels should not be None!".format(shuffle))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    if shuffle == 'group':
        labels = check_array(labels, ensure_2d=False, dtype=None)
        # Group splitting sizes are expressed in number of groups.
        uniques = np.unique(labels)
        n_samples = uniques.size

    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,
                                              default_test_size=0.25)

    shuffle_options = dict(test_size=n_test,
                           train_size=n_train,
                           random_state=random_state)

    if shuffle is None:
        if labels is not None:
            warnings.warn("The `labels` is ignored for "
                          "shuffle being None!")

        # No shuffling: leading samples form the train set, the following
        # ones form the test set.
        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)

    elif shuffle == 'simple':
        if labels is not None:
            warnings.warn("The `labels` is not needed and therefore "
                          "ignored for ShuffleSplit, as shuffle='simple'!")

        cv = ShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None))

    elif shuffle == 'stratified':
        cv = StratifiedShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=labels))

    elif shuffle == 'group':
        cv = GroupShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None, groups=labels))

    else:
        raise ValueError("The argument `shuffle` only supports None, "
                         "'simple', 'stratified' and 'group', but got `%s`!"
                         % shuffle)

    # Interleave results as [a_train, a_test, b_train, b_test, ...].
    return list(chain.from_iterable((safe_indexing(a, train),
                                     safe_indexing(a, test)) for a in arrays))
202
203
class OrderedKFold(_BaseKFold):
    """
    Split into K fold based on ordered target value

    Samples are ranked by their target value and fold ``i`` receives every
    ``n_splits``-th ranked sample starting at rank ``i``.

    Parameters
    ----------
    n_splits : int, default=3
        Number of folds. Must be at least 2.
    shuffle: bool
        If True, ranked samples are shuffled within each consecutive window
        of ``n_splits`` samples before being dealt out to the folds.
    random_state: None or int
        Seed or random state used when ``shuffle`` is True.
    """

    def __init__(self, n_splits=3, shuffle=False, random_state=None):
        # Pass shuffle/random_state by keyword so the call works whether or
        # not the base class declares them as keyword-only.
        super(OrderedKFold, self).__init__(
            n_splits, shuffle=shuffle, random_state=random_state)

    def _iter_test_indices(self, X, y, groups=None):
        """Yield the test indices of each fold, based on the order of `y`.

        Raises
        ------
        ValueError
            If `y` is None, since folds are built from the target order.
        """
        if y is None:
            raise ValueError("OrderedKFold requires the target `y` "
                             "to order the samples, but got None!")

        n_samples = _num_samples(X)
        n_splits = self.n_splits
        y = np.asarray(y)
        sorted_index = np.argsort(y)
        if self.shuffle:
            current = 0
            rng = check_random_state(self.random_state)
            # Slices are views, so shuffling them permutes sorted_index in
            # place, one window of n_splits ranked samples at a time.
            for _ in range(n_samples // int(n_splits)):
                start, stop = current, current + n_splits
                rng.shuffle(sorted_index[start:stop])
                current = stop
            # Trailing samples that do not fill a whole window.
            rng.shuffle(sorted_index[current:])

        for i in range(n_splits):
            yield sorted_index[i:n_samples:n_splits]
235
236
class RepeatedOrderedKFold(_RepeatedSplits):
    """ Repeated OrderedKFold runs multiple times with different randomization.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.

    n_repeats : int, default=5
        Number of times cross-validator to be repeated.

    random_state: int, RandomState instance or None. Optional
    """
    def __init__(self, n_splits=5, n_repeats=5, random_state=None):
        # Pass n_repeats/random_state by keyword so the call works whether or
        # not the base class declares them as keyword-only.
        super(RepeatedOrderedKFold, self).__init__(
            OrderedKFold, n_repeats=n_repeats, random_state=random_state,
            n_splits=n_splits)