Mercurial > repos > bgruening > sklearn_data_preprocess
diff model_validations.py @ 24:9e43ee712723 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7
author | bgruening |
---|---|
date | Tue, 14 May 2019 18:19:35 -0400 |
parents | |
children |
line wrap: on
line diff
# Reconstructed from the changeset diff:
#   --- /dev/null                Thu Jan 01 00:00:00 1970 +0000
#   +++ b/model_validations.py   Tue May 14 18:19:35 2019 -0400
"""
class
-----
OrderedKFold
RepeatedOrderedKFold


function
--------
train_test_split
"""

import numpy as np
import warnings

from itertools import chain
from math import ceil, floor
from sklearn.model_selection import (GroupShuffleSplit, ShuffleSplit,
                                     StratifiedShuffleSplit)
from sklearn.model_selection._split import _BaseKFold, _RepeatedSplits
from sklearn.utils import check_random_state, indexable
from sklearn.utils.validation import _num_samples, check_array

try:
    from sklearn.utils import safe_indexing
except ImportError:
    # Later scikit-learn releases dropped the public ``safe_indexing`` alias
    # and only ship ``_safe_indexing``; keep the old name usable here.
    from sklearn.utils import _safe_indexing as safe_indexing


def _validate_shuffle_split(n_samples, test_size, train_size,
                            default_test_size=None):
    """
    Validation helper to check if the train/test sizes are meaningful wrt to
    the size of the data (n_samples).

    Parameters
    ----------
    n_samples : int
        Number of items being split.
    test_size : float, int or None
        Fraction in (0, 1) or absolute count for the test part.
    train_size : float, int or None
        Fraction in (0, 1) or absolute count for the train part.
    default_test_size : float, int or None
        Used as ``test_size`` when both sizes are None.

    Returns
    -------
    n_train, n_test : int
        Absolute sizes of the train and test parts.

    Raises
    ------
    ValueError
        If a size is out of range or of an unsupported type, if the two
        sizes do not fit into ``n_samples``, or if the train part would be
        empty.
    """
    if test_size is None and train_size is None:
        test_size = default_test_size

    # dtype kind: 'i' for integers, 'f' for floats, 'O' for None.
    test_size_type = np.asarray(test_size).dtype.kind
    train_size_type = np.asarray(train_size).dtype.kind

    if ((test_size_type == 'i'
         and (test_size >= n_samples or test_size <= 0))
            or (test_size_type == 'f'
                and (test_size <= 0 or test_size >= 1))):
        raise ValueError('test_size={0} should be either positive and smaller'
                         ' than the number of samples {1} or a float in the '
                         '(0, 1) range'.format(test_size, n_samples))

    if ((train_size_type == 'i'
         and (train_size >= n_samples or train_size <= 0))
            or (train_size_type == 'f'
                and (train_size <= 0 or train_size >= 1))):
        raise ValueError('train_size={0} should be either positive and smaller'
                         ' than the number of samples {1} or a float in the '
                         '(0, 1) range'.format(train_size, n_samples))

    if train_size is not None and train_size_type not in ('i', 'f'):
        raise ValueError("Invalid value for train_size: {}".format(train_size))
    if test_size is not None and test_size_type not in ('i', 'f'):
        raise ValueError("Invalid value for test_size: {}".format(test_size))

    if (train_size_type == 'f' and test_size_type == 'f' and
            train_size + test_size > 1):
        raise ValueError(
            'The sum of test_size and train_size = {}, should be in the (0, 1)'
            ' range. Reduce test_size and/or train_size.'
            .format(train_size + test_size))

    # Fractions round in opposite directions: the test part is rounded up,
    # the train part is rounded down.
    if test_size_type == 'f':
        n_test = ceil(test_size * n_samples)
    elif test_size_type == 'i':
        n_test = float(test_size)

    if train_size_type == 'f':
        n_train = floor(train_size * n_samples)
    elif train_size_type == 'i':
        n_train = float(train_size)

    # A missing size is the complement of the one that was given.
    if train_size is None:
        n_train = n_samples - n_test
    elif test_size is None:
        n_test = n_samples - n_train

    if n_train + n_test > n_samples:
        raise ValueError('The sum of train_size and test_size = %d, '
                         'should be smaller than the number of '
                         'samples %d. Reduce test_size and/or '
                         'train_size.' % (n_train + n_test, n_samples))

    n_train, n_test = int(n_train), int(n_test)

    if n_train == 0:
        raise ValueError(
            'With n_samples={}, test_size={} and train_size={}, the '
            'resulting train set will be empty. Adjust any of the '
            'aforementioned parameters.'.format(n_samples, test_size,
                                                train_size)
        )

    return n_train, n_test


def train_test_split(*arrays, **options):
    """Extend sklearn.model_selection.train_test_split to have group split.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.

    train_size : float, int, or None, (default=None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : None or str (default='simple')
        How to shuffle the data before splitting.
        None, no shuffle.
        For str, one of 'simple', 'stratified' and 'group', corresponding to
        `ShuffleSplit`, `StratifiedShuffleSplit` and `GroupShuffleSplit`,
        respectively.

    labels : array-like or None (default=None)
        Ignored if shuffle is None or 'simple'.
        When shuffle='stratified', this array is used as class labels.
        When shuffle='group', this array is used as groups.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

    Raises
    ------
    ValueError
        If no array is given, if ``labels`` is missing for a 'stratified'
        or 'group' split, if the sizes are invalid, or if ``shuffle`` is not
        one of the supported values.
    TypeError
        If unknown keyword options are passed.
    """
    n_arrays = len(arrays)
    if n_arrays == 0:
        raise ValueError("At least one array required as input")
    test_size = options.pop('test_size', None)
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    shuffle = options.pop('shuffle', 'simple')
    labels = options.pop('labels', None)

    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    if shuffle == 'group':
        if labels is None:
            raise ValueError("When shuffle='group', "
                             "labels should not be None!")
        labels = check_array(labels, ensure_2d=False, dtype=None)
        # For a group split the sizes are counted in groups, not in rows.
        uniques = np.unique(labels)
        n_samples = uniques.size
    elif shuffle == 'stratified' and labels is None:
        # Fail early with a clear message instead of letting the splitter
        # choke on a missing target.
        raise ValueError("When shuffle='stratified', "
                         "labels should not be None!")

    n_train, n_test = _validate_shuffle_split(n_samples, test_size, train_size,
                                              default_test_size=0.25)

    shuffle_options = dict(test_size=n_test,
                           train_size=n_train,
                           random_state=random_state)

    if shuffle is None:
        if labels is not None:
            warnings.warn("The `labels` is ignored for "
                          "shuffle being None!")

        # No shuffling: the leading rows train, the following rows test.
        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)

    elif shuffle == 'simple':
        if labels is not None:
            warnings.warn("The `labels` is not needed and therefore "
                          "ignored for ShuffleSplit, as shuffle='simple'!")

        cv = ShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None))

    elif shuffle == 'stratified':
        cv = StratifiedShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=labels))

    elif shuffle == 'group':
        cv = GroupShuffleSplit(**shuffle_options)
        train, test = next(cv.split(X=arrays[0], y=None, groups=labels))

    else:
        raise ValueError("The argument `shuffle` only supports None, "
                         "'simple', 'stratified' and 'group', but got `%s`!"
                         % shuffle)

    return list(chain.from_iterable((safe_indexing(a, train),
                                     safe_indexing(a, test)) for a in arrays))


class OrderedKFold(_BaseKFold):
    """
    Split into K fold based on ordered target value

    Samples are ranked by ``y`` and dealt out to the folds in turn, so fold
    ``i`` holds every ``n_splits``-th sample of the sorted order starting at
    position ``i``.

    Parameters
    ----------
    n_splits : int, default=3
        Number of folds. Must be at least 2.
    shuffle: bool
        If True, the sorted order is shuffled within each consecutive run of
        ``n_splits`` samples before the folds are drawn.
    random_state: None or int
    """

    def __init__(self, n_splits=3, shuffle=False, random_state=None):
        # Keywords rather than positions: accepted by the base class whether
        # or not it declares ``shuffle``/``random_state`` as keyword-only.
        super(OrderedKFold, self).__init__(
            n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def _iter_test_indices(self, X, y, groups=None):
        """Yield the test indices of each fold, ordered by target value.

        Raises
        ------
        ValueError
            If ``y`` is None or is not a 1d array / column vector.
        """
        if y is None:
            raise ValueError("OrderedKFold needs the target `y` to order "
                             "the samples, but got None.")

        n_samples = _num_samples(X)
        n_splits = int(self.n_splits)

        y = np.asarray(y)
        if y.ndim == 2 and y.shape[1] == 1:
            # argsort works along the last axis, so a column vector has to
            # be flattened first to be ranked at all.
            y = y.ravel()
        if y.ndim != 1:
            raise ValueError("OrderedKFold expects a 1d target `y`, but got "
                             "an array of shape {}.".format(y.shape))

        sorted_index = np.argsort(y)
        if self.shuffle:
            rng = check_random_state(self.random_state)
            # Shuffling stays inside each run of n_splits neighbours (slices
            # are views, so this permutes sorted_index in place).
            n_full = n_samples - n_samples % n_splits
            for start in range(0, n_full, n_splits):
                rng.shuffle(sorted_index[start:start + n_splits])
            rng.shuffle(sorted_index[n_full:])

        for i in range(n_splits):
            yield sorted_index[i:n_samples:n_splits]


class RepeatedOrderedKFold(_RepeatedSplits):
    """ Repeated OrderedKFold runs multiple times with different randomization.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.

    n_repeats : int, default=5
        Number of times cross-validator to be repeated.

    random_state: int, RandomState instance or None. Optional
    """
    def __init__(self, n_splits=5, n_repeats=5, random_state=None):
        # Keywords rather than positions, for the same reason as in
        # OrderedKFold.__init__.
        super(RepeatedOrderedKFold, self).__init__(
            OrderedKFold, n_repeats=n_repeats, random_state=random_state,
            n_splits=n_splits)