Mercurial > repos > bgruening > sklearn_sample_generator
comparison preprocessors.py @ 24:97b467e06354 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7
author | bgruening |
---|---|
date | Tue, 14 May 2019 18:07:39 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
23:4ba68dd788b3 | 24:97b467e06354 |
---|---|
1 """ | |
2 Z_RandomOverSampler | |
3 """ | |
4 | |
5 import imblearn | |
6 import numpy as np | |
7 | |
8 from collections import Counter | |
9 from imblearn.over_sampling.base import BaseOverSampler | |
10 from imblearn.over_sampling import RandomOverSampler | |
11 from imblearn.pipeline import Pipeline as imbPipeline | |
12 from imblearn.utils import check_target_type | |
13 from scipy import sparse | |
14 from sklearn.base import BaseEstimator, TransformerMixin | |
15 from sklearn.preprocessing.data import _handle_zeros_in_scale | |
16 from sklearn.utils import check_array, safe_indexing | |
17 from sklearn.utils.fixes import nanpercentile | |
18 from sklearn.utils.validation import (check_is_fitted, check_X_y, | |
19 FLOAT_DTYPES) | |
20 | |
21 | |
class Z_RandomOverSampler(BaseOverSampler):
    """
    Random over-sampler driven by z-score bins of the target

    The target `y` is converted to z-scores and every sample is given a
    temporary label:

    - 0 when its z-score is greater than `negative_thres`,
    - 1 when its z-score is less than or equal to `positive_thres`
      (this label wins if both conditions hold),
    - -1 otherwise.

    A `RandomOverSampler` is run on these temporary labels and the
    sample indices it selected are used to index the original `X` and
    the original (unlabelled) `y`.

    Parameters
    ----------
    sampling_strategy, ratio :
        Forwarded to the base class and to `RandomOverSampler`.
    return_indices : bool, default False
        If True, `_fit_resample` also returns the selected indices.
    random_state :
        Forwarded to `RandomOverSampler`.
    negative_thres : number, default 0
        z-scores strictly above this value are labelled 0.
    positive_thres : number, default -1
        z-scores at or below this value are labelled 1.

    Attributes
    ----------
    sample_indices_ : ndarray
        Indices selected by the last call to `_fit_resample`.
    """

    def __init__(self, sampling_strategy='auto',
                 return_indices=False,
                 random_state=None,
                 ratio=None,
                 negative_thres=0,
                 positive_thres=-1):
        super(Z_RandomOverSampler, self).__init__(
            sampling_strategy=sampling_strategy, ratio=ratio)
        self.random_state = random_state
        self.return_indices = return_indices
        self.negative_thres = negative_thres
        self.positive_thres = positive_thres

    @staticmethod
    def _check_X_y(X, y):
        """
        Validate `X` and `y`; returns `(X, y, binarize_y)`
        """
        y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], dtype=None)
        return X, y, binarize_y

    def _fit_resample(self, X, y):
        """
        Over-sample `X` and `y` according to the z-score bins of `y`

        Returns
        -------
        (X_resampled, y_resampled) or, when `return_indices` is True,
        (X_resampled, y_resampled, sample_indices).
        """
        # convert y to z_score
        y_z = (y - y.mean()) / y.std()

        # Boolean masks replace the former per-element `x not in array`
        # scan, which was quadratic in the number of samples.
        is_negative = y_z > self.negative_thres
        is_positive = y_z <= self.positive_thres

        # Start from "unclassified" (-1) and overwrite; the positive
        # label is assigned last so it wins when both masks are set.
        labels = np.full_like(y_z, -1)
        labels[is_negative] = 0
        labels[is_positive] = 1

        ros = RandomOverSampler(
            sampling_strategy=self.sampling_strategy,
            random_state=self.random_state,
            ratio=self.ratio)
        # Only the selected indices are needed, not the resampled data.
        ros.fit_resample(X, labels)
        sample_indices = ros.sample_indices_

        print("Before sampler: %s. Total after: %s"
              % (Counter(labels), sample_indices.shape))

        self.sample_indices_ = np.array(sample_indices)

        if self.return_indices:
            return (safe_indexing(X, sample_indices),
                    safe_indexing(y, sample_indices),
                    sample_indices)
        return (safe_indexing(X, sample_indices),
                safe_indexing(y, sample_indices))
78 | |
79 | |
def _get_quantiles(X, quantile_range):
    """
    Compute percentiles of every column of a 2d array

    Parameters
    ----------
    X : array-like or sparse matrix, shape [n_samples, n_features]
        For sparse input the column slices are taken through
        `X.indptr`, so a column-compressed layout is presumably
        expected -- TODO confirm against callers.
    quantile_range :
        Percentile(s) passed unchanged to `nanpercentile`.

    Returns
    -------
    The transposed stack of the per-column `nanpercentile` results.
    """
    is_sparse = sparse.issparse(X)
    n_rows, n_cols = X.shape[0], X.shape[1]

    per_column = []
    for col in range(n_cols):
        if not is_sparse:
            values = X[:, col]
        else:
            stored = X.data[X.indptr[col]: X.indptr[col + 1]]
            # Stored entries go first, the rest stays zero; the order
            # of the values does not matter for a percentile.
            values = np.zeros(shape=n_rows, dtype=X.dtype)
            values[:len(stored)] = stored
        per_column.append(nanpercentile(values, quantile_range))

    return np.transpose(per_column)
102 | |
103 | |
class TDMScaler(BaseEstimator, TransformerMixin):
    """
    Scale features using Training Distribution Matching (TDM) algorithm

    Parameters
    ----------
    q_lower : float, default 25.0
        Lower percentile, must satisfy 0 <= q_lower <= q_upper.
    q_upper : float, default 75.0
        Upper percentile, must satisfy q_lower <= q_upper <= 100.

    Attributes
    ----------
    q_lower_, q_upper_ :
        Training percentiles at `q_lower` and `q_upper`.
    iqr_ :
        `q_upper_ - q_lower_`, passed through `_handle_zeros_in_scale`.
    min_, max_ :
        `np.nanmin` / `np.nanmax` of the training data.

    References
    ----------
    .. [1] Thompson JA, Tan J and Greene CS (2016) Cross-platform
           normalization of microarray and RNA-seq data for machine
           learning applications. PeerJ 4, e1621.
    """

    def __init__(self, q_lower=25.0, q_upper=75.0):
        self.q_lower = q_lower
        self.q_upper = q_upper

    def fit(self, X, y=None):
        """
        Record the percentiles, their range and min / max of `X`

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
        y : ignored

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If not 0 <= q_lower <= q_upper <= 100.
        """
        X = check_array(X, copy=True, estimator=self, dtype=FLOAT_DTYPES,
                        force_all_finite=True)

        if not 0 <= self.q_lower <= self.q_upper <= 100:
            raise ValueError("Invalid quantile parameter values: "
                             "q_lower %s, q_upper: %s"
                             % (str(self.q_lower), str(self.q_upper)))

        # TODO sparse data
        # No axis is given, so the statistics below are presumably taken
        # over all entries of X pooled together -- TODO confirm.
        quantiles = nanpercentile(X, (self.q_lower, self.q_upper))
        iqr = quantiles[1] - quantiles[0]

        self.q_lower_ = quantiles[0]
        self.q_upper_ = quantiles[1]
        self.iqr_ = _handle_zeros_in_scale(iqr, copy=False)

        self.max_ = np.nanmax(X)
        self.min_ = np.nanmin(X)

        return self

    def transform(self, X):
        """
        Clip `X` to bounds derived from the training spread, then
        rescale it linearly onto the training [min_, max_] range

        Parameters
        ----------
        X : array-like
            The data to scale. A validated copy is modified and
            returned; the input itself is not changed.
        """
        # Both attribute names go in ONE list: a third positional
        # argument of check_is_fitted is not an attribute name (it was
        # `msg` in older scikit-learn releases).
        check_is_fitted(self, ['iqr_', 'max_'])
        X = check_array(X, copy=True, estimator=self, dtype=FLOAT_DTYPES,
                        force_all_finite=True)

        # TODO sparse data
        # Distance of the training extremes from the training
        # percentiles, expressed in units of the training range iqr_.
        train_upper_scale = (self.max_ - self.q_upper_) / self.iqr_
        train_lower_scale = (self.q_lower_ - self.min_) / self.iqr_

        test_quantiles = nanpercentile(X, (self.q_lower, self.q_upper))
        test_iqr = _handle_zeros_in_scale(
            test_quantiles[1] - test_quantiles[0], copy=False)

        # Apply the same relative distances to the percentiles of X.
        test_upper_bound = test_quantiles[1] + train_upper_scale * test_iqr
        test_lower_bound = test_quantiles[0] - train_lower_scale * test_iqr

        # The lower bound is never allowed below the smallest value in X.
        test_min = np.nanmin(X)
        if test_lower_bound < test_min:
            test_lower_bound = test_min

        X[X > test_upper_bound] = test_upper_bound
        X[X < test_lower_bound] = test_lower_bound

        X = (X - test_lower_bound) / (test_upper_bound - test_lower_bound)\
            * (self.max_ - self.min_) + self.min_

        return X

    def inverse_transform(self, X):
        """
        Scale the data back to the original state

        Raises
        ------
        NotImplementedError
            Always; the inverse transformation is not implemented.
        """
        raise NotImplementedError("Inverse transformation is not implemented!")