comparison preprocessors.py @ 24:97b467e06354 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7
author bgruening
date Tue, 14 May 2019 18:07:39 -0400
parents
children
comparison
equal deleted inserted replaced
23:4ba68dd788b3 24:97b467e06354
1 """
2 Z_RandomOverSampler
3 """
4
5 import imblearn
6 import numpy as np
7
8 from collections import Counter
9 from imblearn.over_sampling.base import BaseOverSampler
10 from imblearn.over_sampling import RandomOverSampler
11 from imblearn.pipeline import Pipeline as imbPipeline
12 from imblearn.utils import check_target_type
13 from scipy import sparse
14 from sklearn.base import BaseEstimator, TransformerMixin
15 from sklearn.preprocessing.data import _handle_zeros_in_scale
16 from sklearn.utils import check_array, safe_indexing
17 from sklearn.utils.fixes import nanpercentile
18 from sklearn.utils.validation import (check_is_fitted, check_X_y,
19 FLOAT_DTYPES)
20
21
class Z_RandomOverSampler(BaseOverSampler):
    """
    Random over-sampler for a numeric target, driven by z-score labels.

    The target ``y`` is converted to z-scores and every sample receives a
    temporary label: 0 when its z-score is greater than ``negative_thres``,
    1 when it is less than or equal to ``positive_thres`` and -1 otherwise.
    A ``RandomOverSampler`` is fitted on those temporary labels and the
    sample indices it selects are used to index the original ``X`` and ``y``.

    Parameters
    ----------
    sampling_strategy : forwarded to the base class and RandomOverSampler
    return_indices : bool, default=False
        When true, ``_fit_resample`` also returns the selected indices.
    random_state : forwarded to RandomOverSampler
    ratio : forwarded to the base class and RandomOverSampler
    negative_thres : number, default=0
        Samples with a z-score above this value get temporary label 0.
    positive_thres : number, default=-1
        Samples with a z-score at or below this value get temporary label 1.
    """

    def __init__(self, sampling_strategy='auto',
                 return_indices=False,
                 random_state=None,
                 ratio=None,
                 negative_thres=0,
                 positive_thres=-1):
        super(Z_RandomOverSampler, self).__init__(
            sampling_strategy=sampling_strategy, ratio=ratio)
        self.random_state = random_state
        self.return_indices = return_indices
        self.negative_thres = negative_thres
        self.positive_thres = positive_thres

    @staticmethod
    def _check_X_y(X, y):
        """
        Validate ``X`` and ``y``; sparse ``X`` is accepted as CSR or CSC.

        Returns
        -------
        X, y, binarize_y : the validated inputs and the flag returned by
            ``check_target_type``.
        """
        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None)
        return X, y, binarize_y

    def _fit_resample(self, X, y):
        """
        Over-sample ``X`` and ``y`` according to z-score derived labels.

        Returns
        -------
        X_resampled, y_resampled[, sample_indices]
            The indexed ``X`` and original (unlabelled) ``y``; the indices
            are appended only when ``self.return_indices`` is true.
        """
        # convert y to z_score
        # NOTE(review): a zero standard deviation yields NaN z-scores, so no
        # sample matches either threshold and all are labelled -1.
        y_z = (y - y.mean()) / y.std()

        # Evaluate both thresholds before any label is written, so the
        # labels themselves never take part in a comparison.
        is_negative = y_z > self.negative_thres
        is_positive = y_z <= self.positive_thres
        is_unclassified = ~(is_negative | is_positive)

        # Same write order as before: where the two thresholds overlap,
        # the positive label overrides the negative one.
        y_z[is_negative] = 0
        y_z[is_positive] = 1
        y_z[is_unclassified] = -1

        ros = RandomOverSampler(
            sampling_strategy=self.sampling_strategy,
            random_state=self.random_state,
            ratio=self.ratio)
        # Only the chosen indices are needed; the resampled arrays returned
        # by the helper carry the temporary labels and are discarded.
        ros.fit_resample(X, y_z)
        sample_indices = ros.sample_indices_

        print("Before sampler: %s. Total after: %s"
              % (Counter(y_z), sample_indices.shape))

        self.sample_indices_ = np.array(sample_indices)

        if self.return_indices:
            return (safe_indexing(X, sample_indices),
                    safe_indexing(y, sample_indices),
                    sample_indices)
        return (safe_indexing(X, sample_indices),
                safe_indexing(y, sample_indices))
78
79
def _get_quantiles(X, quantile_range):
    """
    Calculate column percentiles for 2d array

    Parameters
    ----------
    X : {ndarray, sparse matrix}, shape [n_samples, n_features]
        Sparse input in any layout is converted to CSC before use.
    quantile_range : sequence of percentiles to compute for every column

    Returns
    -------
    quantiles : ndarray, shape [len(quantile_range), n_features]
        Row ``i`` holds the ``quantile_range[i]`` percentile of each column.
    """
    is_sparse = sparse.issparse(X)
    if is_sparse:
        # The indptr slicing below walks columns, which is only valid for
        # the CSC layout; convert so other sparse layouts are not misread.
        X = X.tocsc()

    quantiles = []
    for feature_idx in range(X.shape[1]):
        if is_sparse:
            column_nnz_data = X.data[
                X.indptr[feature_idx]: X.indptr[feature_idx + 1]]
            # Rebuild the dense column: stored values first, implicit zeros
            # after. Ordering is irrelevant for a percentile.
            column_data = np.zeros(shape=X.shape[0], dtype=X.dtype)
            column_data[:len(column_nnz_data)] = column_nnz_data
        else:
            column_data = X[:, feature_idx]
        quantiles.append(np.nanpercentile(column_data, quantile_range))

    # One row per requested percentile, one column per feature.
    return np.transpose(quantiles)
102
103
class TDMScaler(BaseEstimator, TransformerMixin):
    """
    Scale features using Training Distribution Matching (TDM) algorithm

    ``fit`` records the lower/upper percentiles, their spread and the
    minimum/maximum of the training data, all computed over the whole
    array (no axis is passed). ``transform`` clips new data to bounds
    derived from its own percentiles and the training ratios, then rescales
    it linearly onto the training ``[min_, max_]`` range.

    Parameters
    ----------
    q_lower : float, default=25.0
        Lower percentile, must satisfy ``0 <= q_lower <= q_upper``.
    q_upper : float, default=75.0
        Upper percentile, must satisfy ``q_lower <= q_upper <= 100``.

    References
    ----------
    .. [1] Thompson JA, Tan J and Greene CS (2016) Cross-platform
           normalization of microarray and RNA-seq data for machine
           learning applications. PeerJ 4, e1621.
    """

    def __init__(self, q_lower=25.0, q_upper=75.0, ):
        self.q_lower = q_lower
        self.q_upper = q_upper

    def fit(self, X, y=None):
        """
        Learn the training distribution statistics.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
        y : ignored

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If not ``0 <= q_lower <= q_upper <= 100``.
        """
        X = check_array(X, copy=True, estimator=self, dtype=FLOAT_DTYPES,
                        force_all_finite=True)

        if not 0 <= self.q_lower <= self.q_upper <= 100:
            raise ValueError("Invalid quantile parameter values: "
                             "q_lower %s, q_upper: %s"
                             % (str(self.q_lower), str(self.q_upper)))

        # TODO sparse data
        quantiles = nanpercentile(X, (self.q_lower, self.q_upper))
        iqr = quantiles[1] - quantiles[0]

        self.q_lower_ = quantiles[0]
        self.q_upper_ = quantiles[1]
        # The spread is used as a divisor in transform, so it is passed
        # through the zero-handling helper before being stored.
        self.iqr_ = _handle_zeros_in_scale(iqr, copy=False)

        self.max_ = np.nanmax(X)
        self.min_ = np.nanmin(X)

        return self

    def transform(self, X):
        """
        Clip and rescale ``X`` onto the training value range.

        Parameters
        ----------
        X : array-like
            The data to scale. It is validated with ``check_array`` without
            ``accept_sparse``, so dense input is expected.

        Returns
        -------
        X : ndarray
            A scaled copy of the input.
        """
        # Both fitted attributes go in a single list: the third positional
        # parameter of check_is_fitted is not an attribute name.
        check_is_fitted(self, ['iqr_', 'max_'])
        X = check_array(X, copy=True, estimator=self, dtype=FLOAT_DTYPES,
                        force_all_finite=True)

        # TODO sparse data
        # How far the training extremes lie beyond the training percentiles,
        # measured in units of the training spread.
        train_upper_scale = (self.max_ - self.q_upper_) / self.iqr_
        train_lower_scale = (self.q_lower_ - self.min_) / self.iqr_

        test_quantiles = nanpercentile(X, (self.q_lower, self.q_upper))
        test_iqr = _handle_zeros_in_scale(
            test_quantiles[1] - test_quantiles[0], copy=False)

        # Apply the same ratios to the spread of the data being transformed.
        test_upper_bound = test_quantiles[1] + train_upper_scale * test_iqr
        test_lower_bound = test_quantiles[0] - train_lower_scale * test_iqr

        # The lower bound is never allowed below the observed minimum.
        test_min = np.nanmin(X)
        if test_lower_bound < test_min:
            test_lower_bound = test_min

        # X is the copy made by check_array, so in-place clipping is safe.
        X[X > test_upper_bound] = test_upper_bound
        X[X < test_lower_bound] = test_lower_bound

        # Map [test_lower_bound, test_upper_bound] onto [min_, max_].
        X = ((X - test_lower_bound) / (test_upper_bound - test_lower_bound)
             * (self.max_ - self.min_) + self.min_)

        return X

    def inverse_transform(self, X):
        """
        Scale the data back to the original state

        Raises
        ------
        NotImplementedError
            Always; the inverse transformation is not available.
        """
        raise NotImplementedError("Inverse transformation is not implemented!")