comparison iraps_classifier.py @ 18:ec25331946b8 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7
author bgruening
date Tue, 14 May 2019 18:17:57 -0400
parents
children
comparison
equal deleted inserted replaced
17:2bbbac61e48d 18:ec25331946b8
1 """
2 class IRAPSCore
3 class IRAPSClassifier
4 class BinarizeTargetClassifier
5 class BinarizeTargetRegressor
6 class _BinarizeTargetScorer
7 class _BinarizeTargetProbaScorer
8
9 binarize_auc_scorer
10 binarize_average_precision_scorer
11
12 binarize_accuracy_scorer
13 binarize_balanced_accuracy_scorer
14 binarize_precision_scorer
15 binarize_recall_scorer
16 """
17
18
19 import numpy as np
20 import random
21 import warnings
22
23 from abc import ABCMeta
24 from scipy.stats import ttest_ind
25 from sklearn import metrics
26 from sklearn.base import BaseEstimator, clone, RegressorMixin
27 from sklearn.externals import six
28 from sklearn.feature_selection.univariate_selection import _BaseFilter
29 from sklearn.metrics.scorer import _BaseScorer
30 from sklearn.pipeline import Pipeline
31 from sklearn.utils import as_float_array, check_X_y
32 from sklearn.utils._joblib import Parallel, delayed
33 from sklearn.utils.validation import (check_array, check_is_fitted,
34 check_memory, column_or_1d)
35
36
# Version of this IRAPS classifier module (independent of the Galaxy tool version).
VERSION = '0.1.1'
38
39
class IRAPSCore(six.with_metaclass(ABCMeta, BaseEstimator)):
    """
    Base class of IRAPSClassifier.

    From sklearn BaseEstimator:
        get_params()
        set_params()

    Parameters
    ----------
    n_iter : int
        sample count; number of stochastic sampling iterations

    positive_thres : float
        z_score threshold to discretize positive target values

    negative_thres : float
        z_score threshold to discretize negative target values

    verbose : int
        0 or greater, if not 0, print progress

    n_jobs : int, default=1
        The number of CPUs to use to do the computation.

    pre_dispatch : int, or string.
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:
            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs
            - An int, giving the exact number of total jobs that are
              spawned
            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    random_state : int or None
        Base seed; iteration i uses seed ``random_state + i`` so runs
        are reproducible while each iteration samples differently.
    """

    def __init__(self, n_iter=1000, positive_thres=-1, negative_thres=0,
                 verbose=0, n_jobs=1, pre_dispatch='2*n_jobs',
                 random_state=None):
        """
        IRAPS turns towards general Anomaly Detection.
        It compares positive_thres with negative_thres,
        and decides which portion is the positive target.
        e.g.:
        (positive_thres=-1, negative_thres=0)
            => positive = Z_score of target < -1
        (positive_thres=1, negative_thres=0)
            => positive = Z_score of target > 1

        Note: The positive targets here is always the
        abnormal minority group.
        """
        # Per sklearn convention, __init__ only stores parameters verbatim.
        self.n_iter = n_iter
        self.positive_thres = positive_thres
        self.negative_thres = negative_thres
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.pre_dispatch = pre_dispatch
        self.random_state = random_state

    def fit(self, X, y):
        """
        Run ``n_iter`` stochastic sampling rounds and stack the per-round
        statistics into ``pvalues_``, ``fold_changes_`` and ``base_values_``
        (each of shape (n_valid_rounds, n_features)).

        X: array-like (n_samples x n_features)
        y: 1-d array-like (n_samples)
        """
        X, y = check_X_y(X, y, ['csr', 'csc'], multi_output=False)

        def _stochastic_sampling(X, y, random_state=None, positive_thres=-1,
                                 negative_thres=0):
            # each iteration select a random number of random subset of
            # training samples. this is somewhat different from the original
            # IRAPS method, but effect is almost the same.
            # Subset size is drawn uniformly between 25% and 75% of samples.
            SAMPLE_SIZE = [0.25, 0.75]
            n_samples = X.shape[0]

            if random_state is None:
                n_select = random.randint(int(n_samples * SAMPLE_SIZE[0]),
                                          int(n_samples * SAMPLE_SIZE[1]))
                index = random.sample(list(range(n_samples)), n_select)
            else:
                # Two fresh Random(seed) instances: the size draw and the
                # index draw are each deterministic functions of the seed.
                n_select = random.Random(random_state).randint(
                    int(n_samples * SAMPLE_SIZE[0]),
                    int(n_samples * SAMPLE_SIZE[1]))
                index = random.Random(random_state).sample(
                    list(range(n_samples)), n_select)

            X_selected, y_selected = X[index], y[index]

            # Spliting by z_scores.
            y_selected = (y_selected - y_selected.mean()) / y_selected.std()
            if positive_thres < negative_thres:
                X_selected_positive = X_selected[y_selected < positive_thres]
                X_selected_negative = X_selected[y_selected > negative_thres]
            else:
                X_selected_positive = X_selected[y_selected > positive_thres]
                X_selected_negative = X_selected[y_selected < negative_thres]

            # For every iteration, at least 5 responders are selected
            # (implicit None return marks the round as invalid).
            if X_selected_positive.shape[0] < 5:
                warnings.warn("Warning: fewer than 5 positives were selected!")
                return

            # p_values from Welch's t-test (unequal variances) per feature.
            _, p = ttest_ind(X_selected_positive, X_selected_negative,
                             axis=0, equal_var=False)

            # fold_change == mean change?
            # TODO implement other normalization method
            positive_mean = X_selected_positive.mean(axis=0)
            negative_mean = X_selected_negative.mean(axis=0)
            mean_change = positive_mean - negative_mean
            # mean_change = np.select(
            #       [positive_mean > negative_mean,
            #           positive_mean < negative_mean],
            #       [positive_mean / negative_mean,
            #           -negative_mean / positive_mean])
            # mean_change could be adjusted by power of 2
            # mean_change = 2**mean_change \
            #       if mean_change>0 else -2**abs(mean_change)

            return p, mean_change, negative_mean

        parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                            pre_dispatch=self.pre_dispatch)
        if self.random_state is None:
            res = parallel(delayed(_stochastic_sampling)(
                X, y, random_state=None,
                positive_thres=self.positive_thres,
                negative_thres=self.negative_thres)
                for i in range(self.n_iter))
        else:
            # Consecutive seeds keep the whole run reproducible.
            res = parallel(delayed(_stochastic_sampling)(
                X, y, random_state=seed,
                positive_thres=self.positive_thres,
                negative_thres=self.negative_thres)
                for seed in range(self.random_state,
                                  self.random_state+self.n_iter))
        # Drop rounds that returned None (too few positives).
        res = [_ for _ in res if _]
        if len(res) < 50:
            raise ValueError("too few (%d) valid feature lists "
                             "were generated!" % len(res))
        pvalues = np.vstack([x[0] for x in res])
        fold_changes = np.vstack([x[1] for x in res])
        base_values = np.vstack([x[2] for x in res])

        self.pvalues_ = np.asarray(pvalues)
        self.fold_changes_ = np.asarray(fold_changes)
        self.base_values_ = np.asarray(base_values)

        return self
195
196
197 def _iraps_core_fit(iraps_core, X, y):
198 return iraps_core.fit(X, y)
199
200
class IRAPSClassifier(six.with_metaclass(ABCMeta, _BaseFilter,
                                         BaseEstimator, RegressorMixin)):
    """
    Extend the bases of both sklearn feature_selector and classifier.

    From sklearn BaseEstimator:
        get_params()
        set_params()
    From sklearn _BaseFilter:
        get_support()
        fit_transform(X)
        transform(X)
    From sklearn RegressorMixin:
        score(X, y): R2
    New:
        predict(X)
        predict_label(X)
        get_signature()
    Properties:
        discretize_value

    Parameters
    ----------
    iraps_core: object
    p_thres: float, threshold for p_values
    fc_thres: float, threshold for fold change or mean difference
    occurrence: float, occurrence rate selected by set of p_thres and fc_thres
    discretize: float, threshold of z_score to discretize target value
    memory: None, str or joblib.Memory object
    min_signature_features: int, the mininum number of features in a signature
    """

    def __init__(self, iraps_core, p_thres=1e-4, fc_thres=0.1,
                 occurrence=0.8, discretize=-1, memory=None,
                 min_signature_features=1):
        # Per sklearn convention, __init__ only stores parameters verbatim.
        self.iraps_core = iraps_core
        self.p_thres = p_thres
        self.fc_thres = fc_thres
        self.occurrence = occurrence
        self.discretize = discretize
        self.memory = memory
        self.min_signature_features = min_signature_features

    def fit(self, X, y):
        """
        Fit (or reuse) the IRAPS core, then derive the feature mask and
        the two-row signature (mean fold change, mean base value).

        Returns
        -------
        self: object
        """
        memory = check_memory(self.memory)
        cached_fit = memory.cache(_iraps_core_fit)
        iraps_core = clone(self.iraps_core)
        # allow pre-fitted iraps_core here
        if not hasattr(iraps_core, 'pvalues_'):
            iraps_core = cached_fit(iraps_core, X, y)
        self.iraps_core_ = iraps_core

        pvalues = as_float_array(iraps_core.pvalues_, copy=True)
        # NaN p-values (e.g. zero-variance features) must never pass the
        # threshold, so replace them with the dtype maximum.
        pvalues[np.isnan(pvalues)] = np.finfo(pvalues.dtype).max

        fold_changes = as_float_array(iraps_core.fold_changes_, copy=True)
        fold_changes[np.isnan(fold_changes)] = 0.0

        base_values = as_float_array(iraps_core.base_values_, copy=True)

        p_thres = self.p_thres
        fc_thres = self.fc_thres
        occurrence = self.occurrence

        mask_0 = np.zeros(pvalues.shape, dtype=np.int32)
        # mark p_values less than the threshold
        mask_0[pvalues <= p_thres] = 1
        # keep fold_changes only when greater than the threshold
        mask_0[abs(fold_changes) < fc_thres] = 0

        # count the occurrence and mask greater than the threshold
        counts = mask_0.sum(axis=0)
        occurrence_thres = int(occurrence * iraps_core.n_iter)
        mask = np.zeros(counts.shape, dtype=bool)
        mask[counts >= occurrence_thres] = 1

        # generate signature: row 0 = average fold change over the
        # iterations in which the feature was selected, row 1 = average
        # negative-group base value.
        fold_changes[mask_0 == 0] = 0.0
        signature = fold_changes[:, mask].sum(axis=0) / counts[mask]
        signature = np.vstack((signature, base_values[:, mask].mean(axis=0)))
        # It's not clear whether min_size could impact prediction
        # performance
        if signature is None\
                or signature.shape[1] < self.min_signature_features:
            # BUGFIX: error message typo "sinature" corrected.
            raise ValueError("The classifier got None signature or the number "
                             "of signature features is less than minimum!")

        self.signature_ = np.asarray(signature)
        self.mask_ = mask
        # TODO: support other discretize method: fixed value, upper
        # third quarter, etc.
        self.discretize_value = y.mean() + y.std() * self.discretize
        if iraps_core.negative_thres > iraps_core.positive_thres:
            self.less_is_positive = True
        else:
            self.less_is_positive = False

        return self

    def _get_support_mask(self):
        """
        return mask of feature selection indices
        """
        check_is_fitted(self, 'mask_')

        return self.mask_

    def get_signature(self):
        """
        return signature
        """
        check_is_fitted(self, 'signature_')

        return self.signature_

    def predict(self, X):
        """
        compute the correlation coefficient with iraps signature

        Rows whose correlation is NaN (e.g. constant input) are mapped to
        the smallest float32 so they rank last.
        """
        signature = self.get_signature()

        X = as_float_array(X)
        X_transformed = self.transform(X) - signature[1]
        corrcoef = np.array(
            [np.corrcoef(signature[0], e)[0][1] for e in X_transformed])
        corrcoef[np.isnan(corrcoef)] = np.finfo(np.float32).min

        return corrcoef

    def predict_label(self, X, clf_cutoff=0.4):
        """Binarize predicted correlations at ``clf_cutoff``."""
        return self.predict(X) >= clf_cutoff
332
333
class BinarizeTargetClassifier(BaseEstimator, RegressorMixin):
    """
    Convert continuous target to binary labels (True and False)
    and apply a classification estimator.

    Parameters
    ----------
    classifier: object
        Estimator object such as derived from sklearn `ClassifierMixin`.

    z_score: float, default=-1.0
        Threshold value based on z_score. Will be ignored when
        value is set

    value: float, default=None
        Fixed threshold value

    less_is_positive: boolean, default=True
        When target is less the threshold value, it will be converted
        to True, False otherwise.

    Attributes
    ----------
    classifier_: object
        Fitted classifier

    discretize_value: float
        The threshold value used to discretize True and False targets
    """

    def __init__(self, classifier, z_score=-1, value=None,
                 less_is_positive=True):
        self.classifier = classifier
        self.z_score = z_score
        self.value = value
        self.less_is_positive = less_is_positive

    def fit(self, X, y, sample_weight=None):
        """
        Convert y to True and False labels and then fit the classifier
        with X and new y

        Returns
        -------
        self: object
        """
        y = check_array(y, accept_sparse=False, force_all_finite=True,
                        ensure_2d=False, dtype='numeric')
        y = column_or_1d(y)

        if self.value is None:
            discretize_value = y.mean() + y.std() * self.z_score
        else:
            # BUGFIX: was `self.Value`, which raised AttributeError
            # whenever a fixed threshold was supplied.
            discretize_value = self.value
        self.discretize_value = discretize_value

        if self.less_is_positive:
            y_trans = y < discretize_value
        else:
            y_trans = y > discretize_value

        self.classifier_ = clone(self.classifier)

        if sample_weight is not None:
            self.classifier_.fit(X, y_trans, sample_weight=sample_weight)
        else:
            self.classifier_.fit(X, y_trans)

        # Mirror commonly-inspected attributes of the wrapped classifier
        # so feature selectors and model inspectors keep working.
        if hasattr(self.classifier_, 'feature_importances_'):
            self.feature_importances_ = self.classifier_.feature_importances_
        if hasattr(self.classifier_, 'coef_'):
            self.coef_ = self.classifier_.coef_
        if hasattr(self.classifier_, 'n_outputs_'):
            self.n_outputs_ = self.classifier_.n_outputs_
        if hasattr(self.classifier_, 'n_features_'):
            self.n_features_ = self.classifier_.n_features_

        return self

    def predict(self, X):
        """
        Predict class probabilities of X (probability of the positive
        class, column 1).
        """
        check_is_fitted(self, 'classifier_')
        proba = self.classifier_.predict_proba(X)
        return proba[:, 1]

    def predict_label(self, X):
        """Predict class label of X
        """
        check_is_fitted(self, 'classifier_')
        return self.classifier_.predict(X)
426
427
class _BinarizeTargetProbaScorer(_BaseScorer):
    """
    base class to make binarized target specific scorer
    """

    def __call__(self, clf, X, y, sample_weight=None):
        estimator_name = clf.__class__.__name__
        # Unwrap to the estimator that carries discretize_value and
        # less_is_positive.
        if isinstance(clf, Pipeline):
            # support pipeline object: use the final step's estimator
            target_est = clf.steps[-1][-1]
        elif estimator_name in ('StackingCVClassifier', 'StackingClassifier'):
            # support stacking ensemble estimators
            # TODO support nested pipeline/stacking estimators
            target_est = clf.meta_clf_
        elif estimator_name in ('StackingCVRegressor', 'StackingRegressor'):
            target_est = clf.meta_regr_
        else:
            target_est = clf

        cutoff = target_est.discretize_value
        if target_est.less_is_positive:
            y_trans = y < cutoff
        else:
            y_trans = y > cutoff

        y_pred = clf.predict(X)
        score_kwargs = dict(self._kwargs)
        if sample_weight is not None:
            score_kwargs['sample_weight'] = sample_weight
        return self._sign * self._score_func(y_trans, y_pred, **score_kwargs)
463
464
# roc_auc scorer on the binarized target
binarize_auc_scorer =\
    _BinarizeTargetProbaScorer(metrics.roc_auc_score, 1, {})

# average_precision_scorer on the binarized target
binarize_average_precision_scorer =\
    _BinarizeTargetProbaScorer(metrics.average_precision_score, 1, {})

# roc_auc_scorer (backward-compatible alias for IRAPS models)
iraps_auc_scorer = binarize_auc_scorer

# average_precision_scorer (backward-compatible alias for IRAPS models)
iraps_average_precision_scorer = binarize_average_precision_scorer
478
479
class BinarizeTargetRegressor(BaseEstimator, RegressorMixin):
    """
    Extend regression estimator to have discretize_value

    Parameters
    ----------
    regressor: object
        Estimator object such as derived from sklearn `RegressionMixin`.

    z_score: float, default=-1.0
        Threshold value based on z_score. Will be ignored when
        value is set

    value: float, default=None
        Fixed threshold value

    less_is_positive: boolean, default=True
        When target is less the threshold value, it will be converted
        to True, False otherwise.

    Attributes
    ----------
    regressor_: object
        Fitted regressor

    discretize_value: float
        The threshold value used to discretize True and False targets
    """

    def __init__(self, regressor, z_score=-1, value=None,
                 less_is_positive=True):
        self.regressor = regressor
        self.z_score = z_score
        self.value = value
        self.less_is_positive = less_is_positive

    def fit(self, X, y, sample_weight=None):
        """
        Calculate the discretize_value and fit the regressor with
        training data

        Returns
        -------
        self: object
        """
        y = check_array(y, accept_sparse=False, force_all_finite=True,
                        ensure_2d=False, dtype='numeric')
        y = column_or_1d(y)

        if self.value is None:
            discretize_value = y.mean() + y.std() * self.z_score
        else:
            # BUGFIX: was `self.Value`, which raised AttributeError
            # whenever a fixed threshold was supplied.
            discretize_value = self.value
        self.discretize_value = discretize_value

        self.regressor_ = clone(self.regressor)

        if sample_weight is not None:
            self.regressor_.fit(X, y, sample_weight=sample_weight)
        else:
            self.regressor_.fit(X, y)

        # attach classifier attributes
        if hasattr(self.regressor_, 'feature_importances_'):
            self.feature_importances_ = self.regressor_.feature_importances_
        if hasattr(self.regressor_, 'coef_'):
            self.coef_ = self.regressor_.coef_
        if hasattr(self.regressor_, 'n_outputs_'):
            self.n_outputs_ = self.regressor_.n_outputs_
        if hasattr(self.regressor_, 'n_features_'):
            self.n_features_ = self.regressor_.n_features_

        return self

    def predict(self, X):
        """Predict target value of X

        Predictions are min-max rescaled into [0, 1] when out of range,
        and inverted when less_is_positive, so larger scores always mean
        "more positive" for the binarized scorers.
        """
        check_is_fitted(self, 'regressor_')
        y_pred = self.regressor_.predict(X)
        if not np.all((y_pred >= 0) & (y_pred <= 1)):
            y_pred = (y_pred - y_pred.min()) / (y_pred.max() - y_pred.min())
        if self.less_is_positive:
            y_pred = 1 - y_pred
        return y_pred
563
564
# roc_auc_scorer (alias for use with BinarizeTargetRegressor)
regression_auc_scorer = binarize_auc_scorer

# average_precision_scorer (alias for use with BinarizeTargetRegressor)
regression_average_precision_scorer = binarize_average_precision_scorer