Mercurial > repos > bgruening > sklearn_data_preprocess

diff keras_train_and_eval.py @ 37:1bef885255e0 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
author: bgruening
date: Sat, 01 May 2021 01:41:14 +0000
parents: b75cae00f980
children: a16f33c6ca64
--- a/keras_train_and_eval.py	Tue Apr 13 22:16:07 2021 +0000
+++ b/keras_train_and_eval.py	Sat May 01 01:41:14 2021 +0000
@@ -11,16 +11,9 @@
 from galaxy_ml.externals.selene_sdk.utils import compute_score
 from galaxy_ml.keras_galaxy_models import _predict_generator
 from galaxy_ml.model_validations import train_test_split
-from galaxy_ml.utils import (
-    clean_params,
-    get_main_estimator,
-    get_module,
-    get_scoring,
-    load_model,
-    read_columns,
-    SafeEval,
-    try_get_attr,
-)
+from galaxy_ml.utils import (clean_params, get_main_estimator,
+                             get_module, get_scoring, load_model, read_columns,
+                             SafeEval, try_get_attr)
 from scipy.io import mmread
 from sklearn.metrics.scorer import _check_multimetric_scoring
 from sklearn.model_selection import _search, _validation
@@ -28,7 +21,6 @@
 from sklearn.pipeline import Pipeline
 from sklearn.utils import indexable, safe_indexing
 
-
 _fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score")
 setattr(_search, "_fit_and_score", _fit_and_score)
 setattr(_validation, "_fit_and_score", _fit_and_score)
@@ -56,7 +48,10 @@
 
         param_name = p["sp_name"]
         if param_name.lower().endswith(NON_SEARCHABLE):
-            warnings.warn("Warning: `%s` is not eligible for search and was " "omitted!" % param_name)
+            warnings.warn(
+                "Warning: `%s` is not eligible for search and was "
+                "omitted!" % param_name
+            )
             continue
 
         if not swap_value.startswith(":"):
@@ -99,7 +94,11 @@
         index_arr = np.arange(n_samples)
         test = index_arr[np.isin(groups, group_names)]
         train = index_arr[~np.isin(groups, group_names)]
-        rval = list(chain.from_iterable((safe_indexing(a, train), safe_indexing(a, test)) for a in new_arrays))
+        rval = list(
+            chain.from_iterable(
+                (safe_indexing(a, train), safe_indexing(a, test)) for a in new_arrays
+            )
+        )
     else:
         rval = train_test_split(*new_arrays, **kwargs)
 
@@ -127,14 +126,22 @@
         pred_labels = (pred_probas > 0.5).astype("int32")
         targets = y_true.ravel().astype("int32")
         if not is_multimetric:
-            preds = pred_labels if scorer.__class__.__name__ == "_PredictScorer" else pred_probas
+            preds = (
+                pred_labels
+                if scorer.__class__.__name__ == "_PredictScorer"
+                else pred_probas
+            )
             score = scorer._score_func(targets, preds, **scorer._kwargs)
 
             return score
         else:
             scores = {}
             for name, one_scorer in scorer.items():
-                preds = pred_labels if one_scorer.__class__.__name__ == "_PredictScorer" else pred_probas
+                preds = (
+                    pred_labels
+                    if one_scorer.__class__.__name__ == "_PredictScorer"
+                    else pred_probas
+                )
                 score = one_scorer._score_func(targets, preds, **one_scorer._kwargs)
                 scores[name] = score
 
@@ -144,13 +151,21 @@
         pred_labels = (pred_probas > 0.5).astype("int32")
         targets = y_true.astype("int32")
         if not is_multimetric:
-            preds = pred_labels if scorer.__class__.__name__ == "_PredictScorer" else pred_probas
+            preds = (
+                pred_labels
+                if scorer.__class__.__name__ == "_PredictScorer"
+                else pred_probas
+            )
             score, _ = compute_score(preds, targets, scorer._score_func)
             return score
         else:
             scores = {}
             for name, one_scorer in scorer.items():
-                preds = pred_labels if one_scorer.__class__.__name__ == "_PredictScorer" else pred_probas
+                preds = (
+                    pred_labels
+                    if one_scorer.__class__.__name__ == "_PredictScorer"
+                    else pred_probas
+                )
                 score, _ = compute_score(preds, targets, one_scorer._score_func)
                 scores[name] = score
 
@@ -243,7 +258,9 @@
     # tabular input
     if input_type == "tabular":
         header = "infer" if params["input_options"]["header1"] else None
-        column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
+        column_option = params["input_options"]["column_selector_options_1"][
+            "selected_column_selector_option"
+        ]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
@@ -295,7 +312,9 @@
 
     # Get target y
     header = "infer" if params["input_options"]["header2"] else None
-    column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
+    column_option = params["input_options"]["column_selector_options_2"][
+        "selected_column_selector_option2"
+    ]
     if column_option in [
         "by_index_number",
         "all_but_by_index_number",
@@ -313,12 +332,9 @@
         infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True)
         loaded_df[df_key] = infile2
 
-    y = read_columns(infile2,
-                     c=c,
-                     c_option=column_option,
-                     sep='\t',
-                     header=header,
-                     parse_dates=True)
+    y = read_columns(
+        infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True
+    )
     if len(y.shape) == 2 and y.shape[1] == 1:
         y = y.ravel()
     if input_type == "refseq_and_interval":
@@ -328,10 +344,14 @@
 
     # load groups
     if groups:
-        groups_selector = (params["experiment_schemes"]["test_split"]["split_algos"]).pop("groups_selector")
+        groups_selector = (
+            params["experiment_schemes"]["test_split"]["split_algos"]
+        ).pop("groups_selector")
 
         header = "infer" if groups_selector["header_g"] else None
-        column_option = groups_selector["column_selector_options_g"]["selected_column_selector_option_g"]
+        column_option = groups_selector["column_selector_options_g"][
+            "selected_column_selector_option_g"
+        ]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
@@ -346,12 +366,14 @@
         if df_key in loaded_df:
             groups = loaded_df[df_key]
 
-        groups = read_columns(groups,
-                              c=c,
-                              c_option=column_option,
-                              sep='\t',
-                              header=header,
-                              parse_dates=True)
+        groups = read_columns(
+            groups,
+            c=c,
+            c_option=column_option,
+            sep="\t",
+            header=header,
+            parse_dates=True,
+        )
         groups = groups.ravel()
 
     # del loaded_df
@@ -364,7 +386,7 @@
         main_est.set_params(memory=memory)
 
     # handle scorer, convert to scorer dict
-    scoring = params['experiment_schemes']['metrics']['scoring']
+    scoring = params["experiment_schemes"]["metrics"]["scoring"]
     if scoring is not None:
         # get_scoring() expects secondary_scoring to be a comma separated string (not a list)
         # Check if secondary_scoring is specified
@@ -385,7 +407,9 @@
         if y is not None:
             test_split_options["labels"] = y
         else:
-            raise ValueError("Stratified shuffle split is not " "applicable on empty target values!")
+            raise ValueError(
+                "Stratified shuffle split is not " "applicable on empty target values!"
+            )
 
     (
         X_train,
@@ -408,7 +432,10 @@
             if y_train is not None:
                 val_split_options["labels"] = y_train
             else:
-                raise ValueError("Stratified shuffle split is not " "applicable on empty target values!")
+                raise ValueError(
+                    "Stratified shuffle split is not "
+                    "applicable on empty target values!"
+                )
 
         (
             X_train,
@@ -431,8 +458,12 @@
     if hasattr(estimator, "evaluate"):
         steps = estimator.prediction_steps
         batch_size = estimator.batch_size
-        generator = estimator.data_generator_.flow(X_test, y=y_test, batch_size=batch_size)
-        predictions, y_true = _predict_generator(estimator.model_, generator, steps=steps)
+        generator = estimator.data_generator_.flow(
+            X_test, y=y_test, batch_size=batch_size
+        )
+        predictions, y_true = _predict_generator(
+            estimator.model_, generator, steps=steps
+        )
         scores = _evaluate(y_true, predictions, scorer, is_multimetric=True)
 
     else:
author	bgruening
date	Sat, 01 May 2021 01:41:14 +0000
parents	b75cae00f980
children	a16f33c6ca64