Mercurial > repos > bgruening > sklearn_data_preprocess
comparison iraps_classifier.py @ 24:9e43ee712723 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7
author | bgruening |
---|---|
date | Tue, 14 May 2019 18:19:35 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
23:d6b8103c909c | 24:9e43ee712723 |
---|---|
1 """ | |
2 class IRAPSCore | |
3 class IRAPSClassifier | |
4 class BinarizeTargetClassifier | |
5 class BinarizeTargetRegressor | |
6 class _BinarizeTargetScorer | |
7 class _BinarizeTargetProbaScorer | |
8 | |
9 binarize_auc_scorer | |
10 binarize_average_precision_scorer | |
11 | |
12 binarize_accuracy_scorer | |
13 binarize_balanced_accuracy_scorer | |
14 binarize_precision_scorer | |
15 binarize_recall_scorer | |
16 """ | |
17 | |
18 | |
19 import numpy as np | |
20 import random | |
21 import warnings | |
22 | |
23 from abc import ABCMeta | |
24 from scipy.stats import ttest_ind | |
25 from sklearn import metrics | |
26 from sklearn.base import BaseEstimator, clone, RegressorMixin | |
27 from sklearn.externals import six | |
28 from sklearn.feature_selection.univariate_selection import _BaseFilter | |
29 from sklearn.metrics.scorer import _BaseScorer | |
30 from sklearn.pipeline import Pipeline | |
31 from sklearn.utils import as_float_array, check_X_y | |
32 from sklearn.utils._joblib import Parallel, delayed | |
33 from sklearn.utils.validation import (check_array, check_is_fitted, | |
34 check_memory, column_or_1d) | |
35 | |
36 | |
37 VERSION = '0.1.1' | |
38 | |
39 | |
40 class IRAPSCore(six.with_metaclass(ABCMeta, BaseEstimator)): | |
41 """ | |
42 Base class of IRAPSClassifier | |
43 From sklearn BaseEstimator: | |
44 get_params() | |
45 set_params() | |
46 | |
47 Parameters | |
48 ---------- | |
49 n_iter : int | |
50 sample count | |
51 | |
52 positive_thres : float | |
53 z_score shreshold to discretize positive target values | |
54 | |
55 negative_thres : float | |
56 z_score threshold to discretize negative target values | |
57 | |
58 verbose : int | |
59 0 or geater, if not 0, print progress | |
60 | |
61 n_jobs : int, default=1 | |
62 The number of CPUs to use to do the computation. | |
63 | |
64 pre_dispatch : int, or string. | |
65 Controls the number of jobs that get dispatched during parallel | |
66 execution. Reducing this number can be useful to avoid an | |
67 explosion of memory consumption when more jobs get dispatched | |
68 than CPUs can process. This parameter can be: | |
69 - None, in which case all the jobs are immediately | |
70 created and spawned. Use this for lightweight and | |
71 fast-running jobs, to avoid delays due to on-demand | |
72 spawning of the jobs | |
73 - An int, giving the exact number of total jobs that are | |
74 spawned | |
75 - A string, giving an expression as a function of n_jobs, | |
76 as in '2*n_jobs' | |
77 | |
78 random_state : int or None | |
79 """ | |
80 | |
81 def __init__(self, n_iter=1000, positive_thres=-1, negative_thres=0, | |
82 verbose=0, n_jobs=1, pre_dispatch='2*n_jobs', | |
83 random_state=None): | |
84 """ | |
85 IRAPS turns towwards general Anomaly Detection | |
86 It comapares positive_thres with negative_thres, | |
87 and decide which portion is the positive target. | |
88 e.g.: | |
89 (positive_thres=-1, negative_thres=0) | |
90 => positive = Z_score of target < -1 | |
91 (positive_thres=1, negative_thres=0) | |
92 => positive = Z_score of target > 1 | |
93 | |
94 Note: The positive targets here is always the | |
95 abnormal minority group. | |
96 """ | |
97 self.n_iter = n_iter | |
98 self.positive_thres = positive_thres | |
99 self.negative_thres = negative_thres | |
100 self.verbose = verbose | |
101 self.n_jobs = n_jobs | |
102 self.pre_dispatch = pre_dispatch | |
103 self.random_state = random_state | |
104 | |
105 def fit(self, X, y): | |
106 """ | |
107 X: array-like (n_samples x n_features) | |
108 y: 1-d array-like (n_samples) | |
109 """ | |
110 X, y = check_X_y(X, y, ['csr', 'csc'], multi_output=False) | |
111 | |
112 def _stochastic_sampling(X, y, random_state=None, positive_thres=-1, | |
113 negative_thres=0): | |
114 # each iteration select a random number of random subset of | |
115 # training samples. this is somewhat different from the original | |
116 # IRAPS method, but effect is almost the same. | |
117 SAMPLE_SIZE = [0.25, 0.75] | |
118 n_samples = X.shape[0] | |
119 | |
120 if random_state is None: | |
121 n_select = random.randint(int(n_samples * SAMPLE_SIZE[0]), | |
122 int(n_samples * SAMPLE_SIZE[1])) | |
123 index = random.sample(list(range(n_samples)), n_select) | |
124 else: | |
125 n_select = random.Random(random_state).randint( | |
126 int(n_samples * SAMPLE_SIZE[0]), | |
127 int(n_samples * SAMPLE_SIZE[1])) | |
128 index = random.Random(random_state).sample( | |
129 list(range(n_samples)), n_select) | |
130 | |
131 X_selected, y_selected = X[index], y[index] | |
132 | |
133 # Spliting by z_scores. | |
134 y_selected = (y_selected - y_selected.mean()) / y_selected.std() | |
135 if positive_thres < negative_thres: | |
136 X_selected_positive = X_selected[y_selected < positive_thres] | |
137 X_selected_negative = X_selected[y_selected > negative_thres] | |
138 else: | |
139 X_selected_positive = X_selected[y_selected > positive_thres] | |
140 X_selected_negative = X_selected[y_selected < negative_thres] | |
141 | |
142 # For every iteration, at least 5 responders are selected | |
143 if X_selected_positive.shape[0] < 5: | |
144 warnings.warn("Warning: fewer than 5 positives were selected!") | |
145 return | |
146 | |
147 # p_values | |
148 _, p = ttest_ind(X_selected_positive, X_selected_negative, | |
149 axis=0, equal_var=False) | |
150 | |
151 # fold_change == mean change? | |
152 # TODO implement other normalization method | |
153 positive_mean = X_selected_positive.mean(axis=0) | |
154 negative_mean = X_selected_negative.mean(axis=0) | |
155 mean_change = positive_mean - negative_mean | |
156 # mean_change = np.select( | |
157 # [positive_mean > negative_mean, | |
158 # positive_mean < negative_mean], | |
159 # [positive_mean / negative_mean, | |
160 # -negative_mean / positive_mean]) | |
161 # mean_change could be adjusted by power of 2 | |
162 # mean_change = 2**mean_change \ | |
163 # if mean_change>0 else -2**abs(mean_change) | |
164 | |
165 return p, mean_change, negative_mean | |
166 | |
167 parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, | |
168 pre_dispatch=self.pre_dispatch) | |
169 if self.random_state is None: | |
170 res = parallel(delayed(_stochastic_sampling)( | |
171 X, y, random_state=None, | |
172 positive_thres=self.positive_thres, | |
173 negative_thres=self.negative_thres) | |
174 for i in range(self.n_iter)) | |
175 else: | |
176 res = parallel(delayed(_stochastic_sampling)( | |
177 X, y, random_state=seed, | |
178 positive_thres=self.positive_thres, | |
179 negative_thres=self.negative_thres) | |
180 for seed in range(self.random_state, | |
181 self.random_state+self.n_iter)) | |
182 res = [_ for _ in res if _] | |
183 if len(res) < 50: | |
184 raise ValueError("too few (%d) valid feature lists " | |
185 "were generated!" % len(res)) | |
186 pvalues = np.vstack([x[0] for x in res]) | |
187 fold_changes = np.vstack([x[1] for x in res]) | |
188 base_values = np.vstack([x[2] for x in res]) | |
189 | |
190 self.pvalues_ = np.asarray(pvalues) | |
191 self.fold_changes_ = np.asarray(fold_changes) | |
192 self.base_values_ = np.asarray(base_values) | |
193 | |
194 return self | |
195 | |
196 | |
197 def _iraps_core_fit(iraps_core, X, y): | |
198 return iraps_core.fit(X, y) | |
199 | |
200 | |
201 class IRAPSClassifier(six.with_metaclass(ABCMeta, _BaseFilter, | |
202 BaseEstimator, RegressorMixin)): | |
203 """ | |
204 Extend the bases of both sklearn feature_selector and classifier. | |
205 From sklearn BaseEstimator: | |
206 get_params() | |
207 set_params() | |
208 From sklearn _BaseFilter: | |
209 get_support() | |
210 fit_transform(X) | |
211 transform(X) | |
212 From sklearn RegressorMixin: | |
213 score(X, y): R2 | |
214 New: | |
215 predict(X) | |
216 predict_label(X) | |
217 get_signature() | |
218 Properties: | |
219 discretize_value | |
220 | |
221 Parameters | |
222 ---------- | |
223 iraps_core: object | |
224 p_thres: float, threshold for p_values | |
225 fc_thres: float, threshold for fold change or mean difference | |
226 occurrence: float, occurrence rate selected by set of p_thres and fc_thres | |
227 discretize: float, threshold of z_score to discretize target value | |
228 memory: None, str or joblib.Memory object | |
229 min_signature_features: int, the mininum number of features in a signature | |
230 """ | |
231 | |
232 def __init__(self, iraps_core, p_thres=1e-4, fc_thres=0.1, | |
233 occurrence=0.8, discretize=-1, memory=None, | |
234 min_signature_features=1): | |
235 self.iraps_core = iraps_core | |
236 self.p_thres = p_thres | |
237 self.fc_thres = fc_thres | |
238 self.occurrence = occurrence | |
239 self.discretize = discretize | |
240 self.memory = memory | |
241 self.min_signature_features = min_signature_features | |
242 | |
243 def fit(self, X, y): | |
244 memory = check_memory(self.memory) | |
245 cached_fit = memory.cache(_iraps_core_fit) | |
246 iraps_core = clone(self.iraps_core) | |
247 # allow pre-fitted iraps_core here | |
248 if not hasattr(iraps_core, 'pvalues_'): | |
249 iraps_core = cached_fit(iraps_core, X, y) | |
250 self.iraps_core_ = iraps_core | |
251 | |
252 pvalues = as_float_array(iraps_core.pvalues_, copy=True) | |
253 # why np.nan is here? | |
254 pvalues[np.isnan(pvalues)] = np.finfo(pvalues.dtype).max | |
255 | |
256 fold_changes = as_float_array(iraps_core.fold_changes_, copy=True) | |
257 fold_changes[np.isnan(fold_changes)] = 0.0 | |
258 | |
259 base_values = as_float_array(iraps_core.base_values_, copy=True) | |
260 | |
261 p_thres = self.p_thres | |
262 fc_thres = self.fc_thres | |
263 occurrence = self.occurrence | |
264 | |
265 mask_0 = np.zeros(pvalues.shape, dtype=np.int32) | |
266 # mark p_values less than the threashold | |
267 mask_0[pvalues <= p_thres] = 1 | |
268 # mark fold_changes only when greater than the threashold | |
269 mask_0[abs(fold_changes) < fc_thres] = 0 | |
270 | |
271 # count the occurrence and mask greater than the threshold | |
272 counts = mask_0.sum(axis=0) | |
273 occurrence_thres = int(occurrence * iraps_core.n_iter) | |
274 mask = np.zeros(counts.shape, dtype=bool) | |
275 mask[counts >= occurrence_thres] = 1 | |
276 | |
277 # generate signature | |
278 fold_changes[mask_0 == 0] = 0.0 | |
279 signature = fold_changes[:, mask].sum(axis=0) / counts[mask] | |
280 signature = np.vstack((signature, base_values[:, mask].mean(axis=0))) | |
281 # It's not clearn whether min_size could impact prediction | |
282 # performance | |
283 if signature is None\ | |
284 or signature.shape[1] < self.min_signature_features: | |
285 raise ValueError("The classifier got None signature or the number " | |
286 "of sinature feature is less than minimum!") | |
287 | |
288 self.signature_ = np.asarray(signature) | |
289 self.mask_ = mask | |
290 # TODO: support other discretize method: fixed value, upper | |
291 # third quater, etc. | |
292 self.discretize_value = y.mean() + y.std() * self.discretize | |
293 if iraps_core.negative_thres > iraps_core.positive_thres: | |
294 self.less_is_positive = True | |
295 else: | |
296 self.less_is_positive = False | |
297 | |
298 return self | |
299 | |
300 def _get_support_mask(self): | |
301 """ | |
302 return mask of feature selection indices | |
303 """ | |
304 check_is_fitted(self, 'mask_') | |
305 | |
306 return self.mask_ | |
307 | |
308 def get_signature(self): | |
309 """ | |
310 return signature | |
311 """ | |
312 check_is_fitted(self, 'signature_') | |
313 | |
314 return self.signature_ | |
315 | |
316 def predict(self, X): | |
317 """ | |
318 compute the correlation coefficient with irpas signature | |
319 """ | |
320 signature = self.get_signature() | |
321 | |
322 X = as_float_array(X) | |
323 X_transformed = self.transform(X) - signature[1] | |
324 corrcoef = np.array( | |
325 [np.corrcoef(signature[0], e)[0][1] for e in X_transformed]) | |
326 corrcoef[np.isnan(corrcoef)] = np.finfo(np.float32).min | |
327 | |
328 return corrcoef | |
329 | |
330 def predict_label(self, X, clf_cutoff=0.4): | |
331 return self.predict(X) >= clf_cutoff | |
332 | |
333 | |
334 class BinarizeTargetClassifier(BaseEstimator, RegressorMixin): | |
335 """ | |
336 Convert continuous target to binary labels (True and False) | |
337 and apply a classification estimator. | |
338 | |
339 Parameters | |
340 ---------- | |
341 classifier: object | |
342 Estimator object such as derived from sklearn `ClassifierMixin`. | |
343 | |
344 z_score: float, default=-1.0 | |
345 Threshold value based on z_score. Will be ignored when | |
346 fixed_value is set | |
347 | |
348 value: float, default=None | |
349 Threshold value | |
350 | |
351 less_is_positive: boolean, default=True | |
352 When target is less the threshold value, it will be converted | |
353 to True, False otherwise. | |
354 | |
355 Attributes | |
356 ---------- | |
357 classifier_: object | |
358 Fitted classifier | |
359 | |
360 discretize_value: float | |
361 The threshold value used to discretize True and False targets | |
362 """ | |
363 | |
364 def __init__(self, classifier, z_score=-1, value=None, | |
365 less_is_positive=True): | |
366 self.classifier = classifier | |
367 self.z_score = z_score | |
368 self.value = value | |
369 self.less_is_positive = less_is_positive | |
370 | |
371 def fit(self, X, y, sample_weight=None): | |
372 """ | |
373 Convert y to True and False labels and then fit the classifier | |
374 with X and new y | |
375 | |
376 Returns | |
377 ------ | |
378 self: object | |
379 """ | |
380 y = check_array(y, accept_sparse=False, force_all_finite=True, | |
381 ensure_2d=False, dtype='numeric') | |
382 y = column_or_1d(y) | |
383 | |
384 if self.value is None: | |
385 discretize_value = y.mean() + y.std() * self.z_score | |
386 else: | |
387 discretize_value = self.Value | |
388 self.discretize_value = discretize_value | |
389 | |
390 if self.less_is_positive: | |
391 y_trans = y < discretize_value | |
392 else: | |
393 y_trans = y > discretize_value | |
394 | |
395 self.classifier_ = clone(self.classifier) | |
396 | |
397 if sample_weight is not None: | |
398 self.classifier_.fit(X, y_trans, sample_weight=sample_weight) | |
399 else: | |
400 self.classifier_.fit(X, y_trans) | |
401 | |
402 if hasattr(self.classifier_, 'feature_importances_'): | |
403 self.feature_importances_ = self.classifier_.feature_importances_ | |
404 if hasattr(self.classifier_, 'coef_'): | |
405 self.coef_ = self.classifier_.coef_ | |
406 if hasattr(self.classifier_, 'n_outputs_'): | |
407 self.n_outputs_ = self.classifier_.n_outputs_ | |
408 if hasattr(self.classifier_, 'n_features_'): | |
409 self.n_features_ = self.classifier_.n_features_ | |
410 | |
411 return self | |
412 | |
413 def predict(self, X): | |
414 """ | |
415 Predict class probabilities of X. | |
416 """ | |
417 check_is_fitted(self, 'classifier_') | |
418 proba = self.classifier_.predict_proba(X) | |
419 return proba[:, 1] | |
420 | |
421 def predict_label(self, X): | |
422 """Predict class label of X | |
423 """ | |
424 check_is_fitted(self, 'classifier_') | |
425 return self.classifier_.predict(X) | |
426 | |
427 | |
428 class _BinarizeTargetProbaScorer(_BaseScorer): | |
429 """ | |
430 base class to make binarized target specific scorer | |
431 """ | |
432 | |
433 def __call__(self, clf, X, y, sample_weight=None): | |
434 clf_name = clf.__class__.__name__ | |
435 # support pipeline object | |
436 if isinstance(clf, Pipeline): | |
437 main_estimator = clf.steps[-1][-1] | |
438 # support stacking ensemble estimators | |
439 # TODO support nested pipeline/stacking estimators | |
440 elif clf_name in ['StackingCVClassifier', 'StackingClassifier']: | |
441 main_estimator = clf.meta_clf_ | |
442 elif clf_name in ['StackingCVRegressor', 'StackingRegressor']: | |
443 main_estimator = clf.meta_regr_ | |
444 else: | |
445 main_estimator = clf | |
446 | |
447 discretize_value = main_estimator.discretize_value | |
448 less_is_positive = main_estimator.less_is_positive | |
449 | |
450 if less_is_positive: | |
451 y_trans = y < discretize_value | |
452 else: | |
453 y_trans = y > discretize_value | |
454 | |
455 y_pred = clf.predict(X) | |
456 if sample_weight is not None: | |
457 return self._sign * self._score_func(y_trans, y_pred, | |
458 sample_weight=sample_weight, | |
459 **self._kwargs) | |
460 else: | |
461 return self._sign * self._score_func(y_trans, y_pred, | |
462 **self._kwargs) | |
463 | |
464 | |
465 # roc_auc | |
466 binarize_auc_scorer =\ | |
467 _BinarizeTargetProbaScorer(metrics.roc_auc_score, 1, {}) | |
468 | |
469 # average_precision_scorer | |
470 binarize_average_precision_scorer =\ | |
471 _BinarizeTargetProbaScorer(metrics.average_precision_score, 1, {}) | |
472 | |
473 # roc_auc_scorer | |
474 iraps_auc_scorer = binarize_auc_scorer | |
475 | |
476 # average_precision_scorer | |
477 iraps_average_precision_scorer = binarize_average_precision_scorer | |
478 | |
479 | |
480 class BinarizeTargetRegressor(BaseEstimator, RegressorMixin): | |
481 """ | |
482 Extend regression estimator to have discretize_value | |
483 | |
484 Parameters | |
485 ---------- | |
486 regressor: object | |
487 Estimator object such as derived from sklearn `RegressionMixin`. | |
488 | |
489 z_score: float, default=-1.0 | |
490 Threshold value based on z_score. Will be ignored when | |
491 fixed_value is set | |
492 | |
493 value: float, default=None | |
494 Threshold value | |
495 | |
496 less_is_positive: boolean, default=True | |
497 When target is less the threshold value, it will be converted | |
498 to True, False otherwise. | |
499 | |
500 Attributes | |
501 ---------- | |
502 regressor_: object | |
503 Fitted regressor | |
504 | |
505 discretize_value: float | |
506 The threshold value used to discretize True and False targets | |
507 """ | |
508 | |
509 def __init__(self, regressor, z_score=-1, value=None, | |
510 less_is_positive=True): | |
511 self.regressor = regressor | |
512 self.z_score = z_score | |
513 self.value = value | |
514 self.less_is_positive = less_is_positive | |
515 | |
516 def fit(self, X, y, sample_weight=None): | |
517 """ | |
518 Calculate the discretize_value fit the regressor with traning data | |
519 | |
520 Returns | |
521 ------ | |
522 self: object | |
523 """ | |
524 y = check_array(y, accept_sparse=False, force_all_finite=True, | |
525 ensure_2d=False, dtype='numeric') | |
526 y = column_or_1d(y) | |
527 | |
528 if self.value is None: | |
529 discretize_value = y.mean() + y.std() * self.z_score | |
530 else: | |
531 discretize_value = self.Value | |
532 self.discretize_value = discretize_value | |
533 | |
534 self.regressor_ = clone(self.regressor) | |
535 | |
536 if sample_weight is not None: | |
537 self.regressor_.fit(X, y, sample_weight=sample_weight) | |
538 else: | |
539 self.regressor_.fit(X, y) | |
540 | |
541 # attach classifier attributes | |
542 if hasattr(self.regressor_, 'feature_importances_'): | |
543 self.feature_importances_ = self.regressor_.feature_importances_ | |
544 if hasattr(self.regressor_, 'coef_'): | |
545 self.coef_ = self.regressor_.coef_ | |
546 if hasattr(self.regressor_, 'n_outputs_'): | |
547 self.n_outputs_ = self.regressor_.n_outputs_ | |
548 if hasattr(self.regressor_, 'n_features_'): | |
549 self.n_features_ = self.regressor_.n_features_ | |
550 | |
551 return self | |
552 | |
553 def predict(self, X): | |
554 """Predict target value of X | |
555 """ | |
556 check_is_fitted(self, 'regressor_') | |
557 y_pred = self.regressor_.predict(X) | |
558 if not np.all((y_pred >= 0) & (y_pred <= 1)): | |
559 y_pred = (y_pred - y_pred.min()) / (y_pred.max() - y_pred.min()) | |
560 if self.less_is_positive: | |
561 y_pred = 1 - y_pred | |
562 return y_pred | |
563 | |
564 | |
565 # roc_auc_scorer | |
566 regression_auc_scorer = binarize_auc_scorer | |
567 | |
568 # average_precision_scorer | |
569 regression_average_precision_scorer = binarize_average_precision_scorer |