diff iraps_classifier.py @ 24:9e43ee712723 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7
author bgruening
date Tue, 14 May 2019 18:19:35 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/iraps_classifier.py	Tue May 14 18:19:35 2019 -0400
@@ -0,0 +1,569 @@
+"""
+class IRAPSCore
+class IRAPSClassifier
+class BinarizeTargetClassifier
+class BinarizeTargetRegressor
+class _BinarizeTargetScorer
+class _BinarizeTargetProbaScorer
+
+binarize_auc_scorer
+binarize_average_precision_scorer
+
+binarize_accuracy_scorer
+binarize_balanced_accuracy_scorer
+binarize_precision_scorer
+binarize_recall_scorer
+"""
+
+
+import numpy as np
+import random
+import warnings
+
+from abc import ABCMeta
+from scipy.stats import ttest_ind
+from sklearn import metrics
+from sklearn.base import BaseEstimator, clone, RegressorMixin
+from sklearn.externals import six
+from sklearn.feature_selection.univariate_selection import _BaseFilter
+from sklearn.metrics.scorer import _BaseScorer
+from sklearn.pipeline import Pipeline
+from sklearn.utils import as_float_array, check_X_y
+from sklearn.utils._joblib import Parallel, delayed
+from sklearn.utils.validation import (check_array, check_is_fitted,
+                                      check_memory, column_or_1d)
+
+
+VERSION = '0.1.1'
+
+
+class IRAPSCore(six.with_metaclass(ABCMeta, BaseEstimator)):
+    """
+    Base class of IRAPSClassifier
+    From sklearn BaseEstimator:
+        get_params()
+        set_params()
+
+    Parameters
+    ----------
+    n_iter : int
+        sample count
+
+    positive_thres : float
+        z_score shreshold to discretize positive target values
+
+    negative_thres : float
+        z_score threshold to discretize negative target values
+
+    verbose : int
+        0 or geater, if not 0, print progress
+
+    n_jobs : int, default=1
+        The number of CPUs to use to do the computation.
+
+    pre_dispatch : int, or string.
+        Controls the number of jobs that get dispatched during parallel
+        execution. Reducing this number can be useful to avoid an
+        explosion of memory consumption when more jobs get dispatched
+        than CPUs can process. This parameter can be:
+            - None, in which case all the jobs are immediately
+              created and spawned. Use this for lightweight and
+              fast-running jobs, to avoid delays due to on-demand
+              spawning of the jobs
+            - An int, giving the exact number of total jobs that are
+              spawned
+            - A string, giving an expression as a function of n_jobs,
+              as in '2*n_jobs'
+
+    random_state : int or None
+    """
+
+    def __init__(self, n_iter=1000, positive_thres=-1, negative_thres=0,
+                 verbose=0, n_jobs=1, pre_dispatch='2*n_jobs',
+                 random_state=None):
+        """
+        IRAPS turns towwards general Anomaly Detection
+        It comapares positive_thres with negative_thres,
+        and decide which portion is the positive target.
+        e.g.:
+        (positive_thres=-1, negative_thres=0)
+                 => positive = Z_score of target < -1
+        (positive_thres=1, negative_thres=0)
+                 => positive = Z_score of target > 1
+
+        Note: The positive targets here is always the
+            abnormal minority group.
+        """
+        self.n_iter = n_iter
+        self.positive_thres = positive_thres
+        self.negative_thres = negative_thres
+        self.verbose = verbose
+        self.n_jobs = n_jobs
+        self.pre_dispatch = pre_dispatch
+        self.random_state = random_state
+
+    def fit(self, X, y):
+        """
+        X: array-like (n_samples x n_features)
+        y: 1-d array-like (n_samples)
+        """
+        X, y = check_X_y(X, y, ['csr', 'csc'], multi_output=False)
+
+        def _stochastic_sampling(X, y, random_state=None, positive_thres=-1,
+                                 negative_thres=0):
+            # each iteration select a random number of random subset of
+            # training samples. this is somewhat different from the original
+            # IRAPS method, but effect is almost the same.
+            SAMPLE_SIZE = [0.25, 0.75]
+            n_samples = X.shape[0]
+
+            if random_state is None:
+                n_select = random.randint(int(n_samples * SAMPLE_SIZE[0]),
+                                          int(n_samples * SAMPLE_SIZE[1]))
+                index = random.sample(list(range(n_samples)), n_select)
+            else:
+                n_select = random.Random(random_state).randint(
+                                    int(n_samples * SAMPLE_SIZE[0]),
+                                    int(n_samples * SAMPLE_SIZE[1]))
+                index = random.Random(random_state).sample(
+                                    list(range(n_samples)), n_select)
+
+            X_selected, y_selected = X[index], y[index]
+
+            # Spliting by z_scores.
+            y_selected = (y_selected - y_selected.mean()) / y_selected.std()
+            if positive_thres < negative_thres:
+                X_selected_positive = X_selected[y_selected < positive_thres]
+                X_selected_negative = X_selected[y_selected > negative_thres]
+            else:
+                X_selected_positive = X_selected[y_selected > positive_thres]
+                X_selected_negative = X_selected[y_selected < negative_thres]
+
+            # For every iteration, at least 5 responders are selected
+            if X_selected_positive.shape[0] < 5:
+                warnings.warn("Warning: fewer than 5 positives were selected!")
+                return
+
+            # p_values
+            _, p = ttest_ind(X_selected_positive, X_selected_negative,
+                             axis=0, equal_var=False)
+
+            # fold_change == mean change?
+            # TODO implement other normalization method
+            positive_mean = X_selected_positive.mean(axis=0)
+            negative_mean = X_selected_negative.mean(axis=0)
+            mean_change = positive_mean - negative_mean
+            # mean_change = np.select(
+            #       [positive_mean > negative_mean,
+            #           positive_mean < negative_mean],
+            #       [positive_mean / negative_mean,
+            #           -negative_mean / positive_mean])
+            # mean_change could be adjusted by power of 2
+            # mean_change = 2**mean_change \
+            #       if mean_change>0 else -2**abs(mean_change)
+
+            return p, mean_change, negative_mean
+
+        parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
+                            pre_dispatch=self.pre_dispatch)
+        if self.random_state is None:
+            res = parallel(delayed(_stochastic_sampling)(
+                    X, y, random_state=None,
+                    positive_thres=self.positive_thres,
+                    negative_thres=self.negative_thres)
+                        for i in range(self.n_iter))
+        else:
+            res = parallel(delayed(_stochastic_sampling)(
+                    X, y, random_state=seed,
+                    positive_thres=self.positive_thres,
+                    negative_thres=self.negative_thres)
+                        for seed in range(self.random_state,
+                                          self.random_state+self.n_iter))
+        res = [_ for _ in res if _]
+        if len(res) < 50:
+            raise ValueError("too few (%d) valid feature lists "
+                             "were generated!" % len(res))
+        pvalues = np.vstack([x[0] for x in res])
+        fold_changes = np.vstack([x[1] for x in res])
+        base_values = np.vstack([x[2] for x in res])
+
+        self.pvalues_ = np.asarray(pvalues)
+        self.fold_changes_ = np.asarray(fold_changes)
+        self.base_values_ = np.asarray(base_values)
+
+        return self
+
+
+def _iraps_core_fit(iraps_core, X, y):
+    return iraps_core.fit(X, y)
+
+
+class IRAPSClassifier(six.with_metaclass(ABCMeta, _BaseFilter,
+                                         BaseEstimator, RegressorMixin)):
+    """
+    Extend the bases of both sklearn feature_selector and classifier.
+    From sklearn BaseEstimator:
+        get_params()
+        set_params()
+    From sklearn _BaseFilter:
+        get_support()
+        fit_transform(X)
+        transform(X)
+    From sklearn RegressorMixin:
+        score(X, y): R2
+    New:
+        predict(X)
+        predict_label(X)
+        get_signature()
+    Properties:
+        discretize_value
+
+    Parameters
+    ----------
+    iraps_core: object
+    p_thres: float, threshold for p_values
+    fc_thres: float, threshold for fold change or mean difference
+    occurrence: float, occurrence rate selected by set of p_thres and fc_thres
+    discretize: float, threshold of z_score to discretize target value
+    memory: None, str or joblib.Memory object
+    min_signature_features: int, the mininum number of features in a signature
+    """
+
+    def __init__(self, iraps_core, p_thres=1e-4, fc_thres=0.1,
+                 occurrence=0.8, discretize=-1, memory=None,
+                 min_signature_features=1):
+        self.iraps_core = iraps_core
+        self.p_thres = p_thres
+        self.fc_thres = fc_thres
+        self.occurrence = occurrence
+        self.discretize = discretize
+        self.memory = memory
+        self.min_signature_features = min_signature_features
+
+    def fit(self, X, y):
+        memory = check_memory(self.memory)
+        cached_fit = memory.cache(_iraps_core_fit)
+        iraps_core = clone(self.iraps_core)
+        # allow pre-fitted iraps_core here
+        if not hasattr(iraps_core, 'pvalues_'):
+            iraps_core = cached_fit(iraps_core, X, y)
+        self.iraps_core_ = iraps_core
+
+        pvalues = as_float_array(iraps_core.pvalues_, copy=True)
+        # why np.nan is here?
+        pvalues[np.isnan(pvalues)] = np.finfo(pvalues.dtype).max
+
+        fold_changes = as_float_array(iraps_core.fold_changes_, copy=True)
+        fold_changes[np.isnan(fold_changes)] = 0.0
+
+        base_values = as_float_array(iraps_core.base_values_, copy=True)
+
+        p_thres = self.p_thres
+        fc_thres = self.fc_thres
+        occurrence = self.occurrence
+
+        mask_0 = np.zeros(pvalues.shape, dtype=np.int32)
+        # mark p_values less than the threashold
+        mask_0[pvalues <= p_thres] = 1
+        # mark fold_changes only when greater than the threashold
+        mask_0[abs(fold_changes) < fc_thres] = 0
+
+        # count the occurrence and mask greater than the threshold
+        counts = mask_0.sum(axis=0)
+        occurrence_thres = int(occurrence * iraps_core.n_iter)
+        mask = np.zeros(counts.shape, dtype=bool)
+        mask[counts >= occurrence_thres] = 1
+
+        # generate signature
+        fold_changes[mask_0 == 0] = 0.0
+        signature = fold_changes[:, mask].sum(axis=0) / counts[mask]
+        signature = np.vstack((signature, base_values[:, mask].mean(axis=0)))
+        # It's not clearn whether min_size could impact prediction
+        # performance
+        if signature is None\
+                or signature.shape[1] < self.min_signature_features:
+            raise ValueError("The classifier got None signature or the number "
+                             "of sinature feature is less than minimum!")
+
+        self.signature_ = np.asarray(signature)
+        self.mask_ = mask
+        # TODO: support other discretize method: fixed value, upper
+        # third quater, etc.
+        self.discretize_value = y.mean() + y.std() * self.discretize
+        if iraps_core.negative_thres > iraps_core.positive_thres:
+            self.less_is_positive = True
+        else:
+            self.less_is_positive = False
+
+        return self
+
+    def _get_support_mask(self):
+        """
+        return mask of feature selection indices
+        """
+        check_is_fitted(self, 'mask_')
+
+        return self.mask_
+
+    def get_signature(self):
+        """
+        return signature
+        """
+        check_is_fitted(self, 'signature_')
+
+        return self.signature_
+
+    def predict(self, X):
+        """
+        compute the correlation coefficient with irpas signature
+        """
+        signature = self.get_signature()
+
+        X = as_float_array(X)
+        X_transformed = self.transform(X) - signature[1]
+        corrcoef = np.array(
+            [np.corrcoef(signature[0], e)[0][1] for e in X_transformed])
+        corrcoef[np.isnan(corrcoef)] = np.finfo(np.float32).min
+
+        return corrcoef
+
+    def predict_label(self, X, clf_cutoff=0.4):
+        return self.predict(X) >= clf_cutoff
+
+
+class BinarizeTargetClassifier(BaseEstimator, RegressorMixin):
+    """
+    Convert continuous target to binary labels (True and False)
+    and apply a classification estimator.
+
+    Parameters
+    ----------
+    classifier: object
+        Estimator object such as derived from sklearn `ClassifierMixin`.
+
+    z_score: float, default=-1.0
+        Threshold value based on z_score. Will be ignored when
+        fixed_value is set
+
+    value: float, default=None
+        Threshold value
+
+    less_is_positive: boolean, default=True
+        When target is less the threshold value, it will be converted
+        to True, False otherwise.
+
+    Attributes
+    ----------
+    classifier_: object
+        Fitted classifier
+
+    discretize_value: float
+        The threshold value used to discretize True and False targets
+    """
+
+    def __init__(self, classifier, z_score=-1, value=None,
+                 less_is_positive=True):
+        self.classifier = classifier
+        self.z_score = z_score
+        self.value = value
+        self.less_is_positive = less_is_positive
+
+    def fit(self, X, y, sample_weight=None):
+        """
+        Convert y to True and False labels and then fit the classifier
+        with X and new y
+
+        Returns
+        ------
+        self: object
+        """
+        y = check_array(y, accept_sparse=False, force_all_finite=True,
+                        ensure_2d=False, dtype='numeric')
+        y = column_or_1d(y)
+
+        if self.value is None:
+            discretize_value = y.mean() + y.std() * self.z_score
+        else:
+            discretize_value = self.Value
+        self.discretize_value = discretize_value
+
+        if self.less_is_positive:
+            y_trans = y < discretize_value
+        else:
+            y_trans = y > discretize_value
+
+        self.classifier_ = clone(self.classifier)
+
+        if sample_weight is not None:
+            self.classifier_.fit(X, y_trans, sample_weight=sample_weight)
+        else:
+            self.classifier_.fit(X, y_trans)
+
+        if hasattr(self.classifier_, 'feature_importances_'):
+            self.feature_importances_ = self.classifier_.feature_importances_
+        if hasattr(self.classifier_, 'coef_'):
+            self.coef_ = self.classifier_.coef_
+        if hasattr(self.classifier_, 'n_outputs_'):
+            self.n_outputs_ = self.classifier_.n_outputs_
+        if hasattr(self.classifier_, 'n_features_'):
+            self.n_features_ = self.classifier_.n_features_
+
+        return self
+
+    def predict(self, X):
+        """
+        Predict class probabilities of X.
+        """
+        check_is_fitted(self, 'classifier_')
+        proba = self.classifier_.predict_proba(X)
+        return proba[:, 1]
+
+    def predict_label(self, X):
+        """Predict class label of X
+        """
+        check_is_fitted(self, 'classifier_')
+        return self.classifier_.predict(X)
+
+
+class _BinarizeTargetProbaScorer(_BaseScorer):
+    """
+    base class to make binarized target specific scorer
+    """
+
+    def __call__(self, clf, X, y, sample_weight=None):
+        clf_name = clf.__class__.__name__
+        # support pipeline object
+        if isinstance(clf, Pipeline):
+            main_estimator = clf.steps[-1][-1]
+        # support stacking ensemble estimators
+        # TODO support nested pipeline/stacking estimators
+        elif clf_name in ['StackingCVClassifier', 'StackingClassifier']:
+            main_estimator = clf.meta_clf_
+        elif clf_name in ['StackingCVRegressor', 'StackingRegressor']:
+            main_estimator = clf.meta_regr_
+        else:
+            main_estimator = clf
+
+        discretize_value = main_estimator.discretize_value
+        less_is_positive = main_estimator.less_is_positive
+
+        if less_is_positive:
+            y_trans = y < discretize_value
+        else:
+            y_trans = y > discretize_value
+
+        y_pred = clf.predict(X)
+        if sample_weight is not None:
+            return self._sign * self._score_func(y_trans, y_pred,
+                                                 sample_weight=sample_weight,
+                                                 **self._kwargs)
+        else:
+            return self._sign * self._score_func(y_trans, y_pred,
+                                                 **self._kwargs)
+
+
+# roc_auc
+binarize_auc_scorer =\
+        _BinarizeTargetProbaScorer(metrics.roc_auc_score, 1, {})
+
+# average_precision_scorer
+binarize_average_precision_scorer =\
+        _BinarizeTargetProbaScorer(metrics.average_precision_score, 1, {})
+
+# roc_auc_scorer
+iraps_auc_scorer = binarize_auc_scorer
+
+# average_precision_scorer
+iraps_average_precision_scorer = binarize_average_precision_scorer
+
+
+class BinarizeTargetRegressor(BaseEstimator, RegressorMixin):
+    """
+    Extend regression estimator to have discretize_value
+
+    Parameters
+    ----------
+    regressor: object
+        Estimator object such as derived from sklearn `RegressionMixin`.
+
+    z_score: float, default=-1.0
+        Threshold value based on z_score. Will be ignored when
+        fixed_value is set
+
+    value: float, default=None
+        Threshold value
+
+    less_is_positive: boolean, default=True
+        When target is less the threshold value, it will be converted
+        to True, False otherwise.
+
+    Attributes
+    ----------
+    regressor_: object
+        Fitted regressor
+
+    discretize_value: float
+        The threshold value used to discretize True and False targets
+    """
+
+    def __init__(self, regressor, z_score=-1, value=None,
+                 less_is_positive=True):
+        self.regressor = regressor
+        self.z_score = z_score
+        self.value = value
+        self.less_is_positive = less_is_positive
+
+    def fit(self, X, y, sample_weight=None):
+        """
+        Calculate the discretize_value fit the regressor with traning data
+
+        Returns
+        ------
+        self: object
+        """
+        y = check_array(y, accept_sparse=False, force_all_finite=True,
+                        ensure_2d=False, dtype='numeric')
+        y = column_or_1d(y)
+
+        if self.value is None:
+            discretize_value = y.mean() + y.std() * self.z_score
+        else:
+            discretize_value = self.Value
+        self.discretize_value = discretize_value
+
+        self.regressor_ = clone(self.regressor)
+
+        if sample_weight is not None:
+            self.regressor_.fit(X, y, sample_weight=sample_weight)
+        else:
+            self.regressor_.fit(X, y)
+
+        # attach classifier attributes
+        if hasattr(self.regressor_, 'feature_importances_'):
+            self.feature_importances_ = self.regressor_.feature_importances_
+        if hasattr(self.regressor_, 'coef_'):
+            self.coef_ = self.regressor_.coef_
+        if hasattr(self.regressor_, 'n_outputs_'):
+            self.n_outputs_ = self.regressor_.n_outputs_
+        if hasattr(self.regressor_, 'n_features_'):
+            self.n_features_ = self.regressor_.n_features_
+
+        return self
+
+    def predict(self, X):
+        """Predict target value of X
+        """
+        check_is_fitted(self, 'regressor_')
+        y_pred = self.regressor_.predict(X)
+        if not np.all((y_pred >= 0) & (y_pred <= 1)):
+            y_pred = (y_pred - y_pred.min()) / (y_pred.max() - y_pred.min())
+        if self.less_is_positive:
+            y_pred = 1 - y_pred
+        return y_pred
+
+
+# roc_auc_scorer
+regression_auc_scorer = binarize_auc_scorer
+
+# average_precision_scorer
+regression_average_precision_scorer = binarize_average_precision_scorer