Mercurial > repos > bgruening > sklearn_feature_selection
changeset 31:5773e98921fc draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
| author | bgruening | 
|---|---|
| date | Sat, 01 May 2021 01:20:14 +0000 | 
| parents | 1d20e0dce176 | 
| children | a7c667ff83fe | 
| files | association_rules.py fitted_model_eval.py keras_deep_learning.py keras_train_and_eval.py label_encoder.py ml_visualization_ex.py model_prediction.py search_model_validation.py simple_model_fit.py stacking_ensembles.py test-data/le_input_w_header.tabular test-data/le_input_wo_header.tabular test-data/le_output.tabular test-data/mba_input_int_w.tabular test-data/mba_input_int_wo.tabular test-data/mba_input_str_w.tabular test-data/mba_input_str_wo.tabular test-data/mba_out_str.tabular test-data/mba_output_int.tabular test-data/mba_output_str.tabular to_categorical.py train_test_eval.py train_test_split.py | 
| diffstat | 23 files changed, 649 insertions(+), 266 deletions(-) [+] | 
line wrap: on
 line diff
# --- /dev/null  +++ b/association_rules.py  (Sat May 01 01:20:14 2021 +0000)
# New file in this changeset, shown as plain Python instead of collapsed "+" diff lines.
import argparse
import json
import warnings

import pandas as pd
from mlxtend.frequent_patterns import association_rules, fpgrowth
from mlxtend.preprocessing import TransactionEncoder


def _read_transactions(infile, skip_header):
    """Read a tab-separated file into a list of transactions (lists of str)."""
    with open(infile) as fp:
        lines = fp.read().splitlines()

    if skip_header:
        lines = lines[1:]

    return [line.split("\t") for line in lines]


def main(
    inputs,
    infile,
    outfile,
    min_support=0.5,
    min_confidence=0.5,
    min_lift=1.0,
    min_conviction=1.0,
    max_length=None,
):
    """
    Mine association rules from a transaction file and write them as TSV.

    Parameters
    ----------
    inputs : str
        File path to galaxy tool parameter (JSON). Only the `header0`
        entry is read; when truthy the first line of `infile` is skipped.

    infile : str
        File path of the input transactions, one transaction per line,
        items separated by tabs.

    outfile : str
        File path the resulting rules table is written to (tab separated).

    min_support : float
        Minimum support passed to `fpgrowth`.

    min_confidence : float
        Minimum confidence passed to `association_rules`.

    min_lift : float
        Rules with a lift below this value are dropped.

    min_conviction : float
        Rules with a conviction below this value are dropped.

    max_length : int or None
        Passed to `fpgrowth` as `max_len`.
    """
    warnings.simplefilter('ignore')

    with open(inputs, 'r') as param_handler:
        params = json.load(param_handler)

    dataset = _read_transactions(infile, skip_header=bool(params['header0']))

    # TransactionEncoder learns the unique labels in the dataset and transforms the
    # input dataset (a Python list of lists) into a one-hot encoded NumPy boolean array
    te = TransactionEncoder()
    te_ary = te.fit_transform(dataset)

    # Turn the encoded NumPy array into a DataFrame
    df = pd.DataFrame(te_ary, columns=te.columns_)

    # Extract frequent itemsets for association rule mining
    # use_colnames: use the DataFrame's column names in the result instead of column indices
    frequent_itemsets = fpgrowth(df, min_support=min_support, use_colnames=True, max_len=max_length)

    # Get association rules with confidence of at least min_confidence
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)

    # Keep rules whose lift and conviction are at least min_lift and min_conviction.
    # Copy the filtered slice so the column assignments below write to an
    # independent frame rather than to a view of the unfiltered one.
    keep = (rules['lift'] >= min_lift) & (rules['conviction'] >= min_conviction)
    rules = rules[keep].copy()

    # The remaining steps fix the order of the generated rules, so tests that
    # diff the produced output against an expected output can pass.

    # 1) Convert each frozenset to a sorted list (more readable, stable order)
    rules['antecedents'] = rules['antecedents'].apply(sorted)
    rules['consequents'] = rules['consequents'].apply(sorted)

    # 2) Create two temporary string columns to sort on
    rules['ant_str'] = rules['antecedents'].apply(" ".join)
    rules['con_str'] = rules['consequents'].apply(" ".join)

    # 3) Sort the rows so results are reproducible, then drop the helper columns
    rules = (
        rules.sort_values(by=['ant_str', 'con_str'])
        .drop(columns=['ant_str', 'con_str'])
        .reset_index(drop=True)
    )

    # Write association rules and metrics to file
    rules.to_csv(outfile, sep="\t", index=False)


if __name__ == '__main__':
    aparser = argparse.ArgumentParser()
    aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
    aparser.add_argument("-y", "--infile", dest="infile", required=True)
    aparser.add_argument("-o", "--outfile", dest="outfile", required=True)
    # Let argparse convert and validate the numeric options
    aparser.add_argument("-s", "--support", dest="support", type=float, default=0.5)
    aparser.add_argument("-c", "--confidence", dest="confidence", type=float, default=0.5)
    aparser.add_argument("-l", "--lift", dest="lift", type=float, default=1.0)
    aparser.add_argument("-v", "--conviction", dest="conviction", type=float, default=1.0)
    aparser.add_argument("-t", "--length", dest="length", type=int, default=5)
    args = aparser.parse_args()

    main(
        args.inputs,
        args.infile,
        args.outfile,
        min_support=args.support,
        min_confidence=args.confidence,
        min_lift=args.lift,
        min_conviction=args.conviction,
        max_length=args.length,
    )
--- a/fitted_model_eval.py Tue Apr 13 22:00:10 2021 +0000 +++ b/fitted_model_eval.py Sat May 01 01:20:14 2021 +0000 @@ -30,7 +30,9 @@ # tabular input if input_type == "tabular": header = "infer" if params["input_options"]["header1"] else None - column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"] + column_option = params["input_options"]["column_selector_options_1"][ + "selected_column_selector_option" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -52,7 +54,9 @@ # Get target y header = "infer" if params["input_options"]["header2"] else None - column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"] + column_option = params["input_options"]["column_selector_options_2"][ + "selected_column_selector_option2" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -70,7 +74,9 @@ infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True) loaded_df[df_key] = infile2 - y = read_columns(infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True) + y = read_columns( + infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True + ) if len(y.shape) == 2 and y.shape[1] == 1: y = y.ravel() @@ -123,7 +129,8 @@ if hasattr(main_est, "config") and hasattr(main_est, "load_weights"): if not infile_weights or infile_weights == "None": raise ValueError( - "The selected model skeleton asks for weights, " "but no dataset for weights was provided!" + "The selected model skeleton asks for weights, " + "but no dataset for weights was provided!" 
) main_est.load_weights(infile_weights) @@ -142,7 +149,9 @@ scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer) if hasattr(estimator, "evaluate"): - scores = estimator.evaluate(X_test, y_test=y_test, scorer=scorer, is_multimetric=True) + scores = estimator.evaluate( + X_test, y_test=y_test, scorer=scorer, is_multimetric=True + ) else: scores = _score(estimator, X_test, y_test, scorer, is_multimetric=True)
--- a/keras_deep_learning.py Tue Apr 13 22:00:10 2021 +0000 +++ b/keras_deep_learning.py Sat May 01 01:20:14 2021 +0000 @@ -10,12 +10,12 @@ from galaxy_ml.utils import get_search_params, SafeEval, try_get_attr from keras.models import Model, Sequential - safe_eval = SafeEval() def _handle_shape(literal): - """Eval integer or list/tuple of integers from string + """ + Eval integer or list/tuple of integers from string Parameters: ----------- @@ -32,7 +32,8 @@ def _handle_regularizer(literal): - """Construct regularizer from string literal + """ + Construct regularizer from string literal Parameters ---------- @@ -48,15 +49,16 @@ return None if l1 is None: - l1 = 0. + l1 = 0.0 if l2 is None: - l2 = 0. + l2 = 0.0 return keras.regularizers.l1_l2(l1=l1, l2=l2) def _handle_constraint(config): - """Construct constraint from galaxy tool parameters. + """ + Construct constraint from galaxy tool parameters. Suppose correct dictionary format Parameters @@ -72,14 +74,14 @@ "MinMaxNorm" } """ - constraint_type = config['constraint_type'] - if constraint_type in ('None', ''): + constraint_type = config["constraint_type"] + if constraint_type in ("None", ""): return None klass = getattr(keras.constraints, constraint_type) - options = config.get('constraint_options', {}) - if 'axis' in options: - options['axis'] = literal_eval(options['axis']) + options = config.get("constraint_options", {}) + if "axis" in options: + options["axis"] = literal_eval(options["axis"]) return klass(**options) @@ -89,62 +91,82 @@ def _handle_layer_parameters(params): - """Access to handle all kinds of parameters + """ + Access to handle all kinds of parameters """ for key, value in six.iteritems(params): - if value in ('None', ''): + if value in ("None", ""): params[key] = None continue - if type(value) in [int, float, bool]\ - or (type(value) is str and value.isalpha()): + if type(value) in [int, float, bool] or ( + type(value) is str and value.isalpha() + ): continue - if key in ['input_shape', 
'noise_shape', 'shape', 'batch_shape', - 'target_shape', 'dims', 'kernel_size', 'strides', - 'dilation_rate', 'output_padding', 'cropping', 'size', - 'padding', 'pool_size', 'axis', 'shared_axes'] \ - and isinstance(value, str): + if ( + key + in [ + "input_shape", + "noise_shape", + "shape", + "batch_shape", + "target_shape", + "dims", + "kernel_size", + "strides", + "dilation_rate", + "output_padding", + "cropping", + "size", + "padding", + "pool_size", + "axis", + "shared_axes", + ] + and isinstance(value, str) + ): params[key] = _handle_shape(value) - elif key.endswith('_regularizer') and isinstance(value, dict): + elif key.endswith("_regularizer") and isinstance(value, dict): params[key] = _handle_regularizer(value) - elif key.endswith('_constraint') and isinstance(value, dict): + elif key.endswith("_constraint") and isinstance(value, dict): params[key] = _handle_constraint(value) - elif key == 'function': # No support for lambda/function eval + elif key == "function": # No support for lambda/function eval params.pop(key) return params def get_sequential_model(config): - """Construct keras Sequential model from Galaxy tool parameters + """ + Construct keras Sequential model from Galaxy tool parameters Parameters: ----------- config : dictionary, galaxy tool parameters loaded by JSON """ model = Sequential() - input_shape = _handle_shape(config['input_shape']) - layers = config['layers'] + input_shape = _handle_shape(config["input_shape"]) + layers = config["layers"] for layer in layers: - options = layer['layer_selection'] - layer_type = options.pop('layer_type') + options = layer["layer_selection"] + layer_type = options.pop("layer_type") klass = getattr(keras.layers, layer_type) - kwargs = options.pop('kwargs', '') + kwargs = options.pop("kwargs", "") # parameters needs special care options = _handle_layer_parameters(options) if kwargs: - kwargs = safe_eval('dict(' + kwargs + ')') + kwargs = safe_eval("dict(" + kwargs + ")") options.update(kwargs) # add 
input_shape to the first layer only - if not getattr(model, '_layers') and input_shape is not None: - options['input_shape'] = input_shape + if not getattr(model, "_layers") and input_shape is not None: + options["input_shape"] = input_shape model.add(klass(**options)) @@ -152,31 +174,32 @@ def get_functional_model(config): - """Construct keras functional model from Galaxy tool parameters + """ + Construct keras functional model from Galaxy tool parameters Parameters ----------- config : dictionary, galaxy tool parameters loaded by JSON """ - layers = config['layers'] + layers = config["layers"] all_layers = [] for layer in layers: - options = layer['layer_selection'] - layer_type = options.pop('layer_type') + options = layer["layer_selection"] + layer_type = options.pop("layer_type") klass = getattr(keras.layers, layer_type) - inbound_nodes = options.pop('inbound_nodes', None) - kwargs = options.pop('kwargs', '') + inbound_nodes = options.pop("inbound_nodes", None) + kwargs = options.pop("kwargs", "") # parameters needs special care options = _handle_layer_parameters(options) if kwargs: - kwargs = safe_eval('dict(' + kwargs + ')') + kwargs = safe_eval("dict(" + kwargs + ")") options.update(kwargs) # merge layers - if 'merging_layers' in options: - idxs = literal_eval(options.pop('merging_layers')) + if "merging_layers" in options: + idxs = literal_eval(options.pop("merging_layers")) merging_layers = [all_layers[i - 1] for i in idxs] new_layer = klass(**options)(merging_layers) # non-input layers @@ -188,41 +211,43 @@ all_layers.append(new_layer) - input_indexes = _handle_shape(config['input_layers']) + input_indexes = _handle_shape(config["input_layers"]) input_layers = [all_layers[i - 1] for i in input_indexes] - output_indexes = _handle_shape(config['output_layers']) + output_indexes = _handle_shape(config["output_layers"]) output_layers = [all_layers[i - 1] for i in output_indexes] return Model(inputs=input_layers, outputs=output_layers) def 
get_batch_generator(config): - """Construct keras online data generator from Galaxy tool parameters + """ + Construct keras online data generator from Galaxy tool parameters Parameters ----------- config : dictionary, galaxy tool parameters loaded by JSON """ - generator_type = config.pop('generator_type') - if generator_type == 'none': + generator_type = config.pop("generator_type") + if generator_type == "none": return None - klass = try_get_attr('galaxy_ml.preprocessors', generator_type) + klass = try_get_attr("galaxy_ml.preprocessors", generator_type) - if generator_type == 'GenomicIntervalBatchGenerator': - config['ref_genome_path'] = 'to_be_determined' - config['intervals_path'] = 'to_be_determined' - config['target_path'] = 'to_be_determined' - config['features'] = 'to_be_determined' + if generator_type == "GenomicIntervalBatchGenerator": + config["ref_genome_path"] = "to_be_determined" + config["intervals_path"] = "to_be_determined" + config["target_path"] = "to_be_determined" + config["features"] = "to_be_determined" else: - config['fasta_path'] = 'to_be_determined' + config["fasta_path"] = "to_be_determined" return klass(**config) def config_keras_model(inputs, outfile): - """ config keras model layers and output JSON + """ + config keras model layers and output JSON Parameters ---------- @@ -232,23 +257,30 @@ outfile : str Path to galaxy dataset containing keras model JSON. 
""" - model_type = inputs['model_selection']['model_type'] - layers_config = inputs['model_selection'] + model_type = inputs["model_selection"]["model_type"] + layers_config = inputs["model_selection"] - if model_type == 'sequential': + if model_type == "sequential": model = get_sequential_model(layers_config) else: model = get_functional_model(layers_config) json_string = model.to_json() - with open(outfile, 'w') as f: + with open(outfile, "w") as f: json.dump(json.loads(json_string), f, indent=2) -def build_keras_model(inputs, outfile, model_json, infile_weights=None, - batch_mode=False, outfile_params=None): - """ for `keras_model_builder` tool +def build_keras_model( + inputs, + outfile, + model_json, + infile_weights=None, + batch_mode=False, + outfile_params=None, +): + """ + for `keras_model_builder` tool Parameters ---------- @@ -265,75 +297,81 @@ outfile_params : str, default=None File path to search parameters output. """ - with open(model_json, 'r') as f: + with open(model_json, "r") as f: json_model = json.load(f) - config = json_model['config'] + config = json_model["config"] options = {} - if json_model['class_name'] == 'Sequential': - options['model_type'] = 'sequential' + if json_model["class_name"] == "Sequential": + options["model_type"] = "sequential" klass = Sequential - elif json_model['class_name'] == 'Model': - options['model_type'] = 'functional' + elif json_model["class_name"] == "Model": + options["model_type"] = "functional" klass = Model else: - raise ValueError("Unknow Keras model class: %s" - % json_model['class_name']) + raise ValueError("Unknow Keras model class: %s" % json_model["class_name"]) # load prefitted model - if inputs['mode_selection']['mode_type'] == 'prefitted': + if inputs["mode_selection"]["mode_type"] == "prefitted": estimator = klass.from_config(config) estimator.load_weights(infile_weights) # build train model else: - cls_name = inputs['mode_selection']['learning_type'] - klass = 
try_get_attr('galaxy_ml.keras_galaxy_models', cls_name) + cls_name = inputs["mode_selection"]["learning_type"] + klass = try_get_attr("galaxy_ml.keras_galaxy_models", cls_name) - options['loss'] = (inputs['mode_selection'] - ['compile_params']['loss']) - options['optimizer'] =\ - (inputs['mode_selection']['compile_params'] - ['optimizer_selection']['optimizer_type']).lower() + options["loss"] = inputs["mode_selection"]["compile_params"]["loss"] + options["optimizer"] = ( + inputs["mode_selection"]["compile_params"]["optimizer_selection"][ + "optimizer_type" + ] + ).lower() - options.update((inputs['mode_selection']['compile_params'] - ['optimizer_selection']['optimizer_options'])) + options.update( + ( + inputs["mode_selection"]["compile_params"]["optimizer_selection"][ + "optimizer_options" + ] + ) + ) - train_metrics = inputs['mode_selection']['compile_params']['metrics'] - if train_metrics[-1] == 'none': + train_metrics = inputs["mode_selection"]["compile_params"]["metrics"] + if train_metrics[-1] == "none": train_metrics = train_metrics[:-1] - options['metrics'] = train_metrics + options["metrics"] = train_metrics - options.update(inputs['mode_selection']['fit_params']) - options['seed'] = inputs['mode_selection']['random_seed'] + options.update(inputs["mode_selection"]["fit_params"]) + options["seed"] = inputs["mode_selection"]["random_seed"] if batch_mode: - generator = get_batch_generator(inputs['mode_selection'] - ['generator_selection']) - options['data_batch_generator'] = generator - options['prediction_steps'] = \ - inputs['mode_selection']['prediction_steps'] - options['class_positive_factor'] = \ - inputs['mode_selection']['class_positive_factor'] + generator = get_batch_generator( + inputs["mode_selection"]["generator_selection"] + ) + options["data_batch_generator"] = generator + options["prediction_steps"] = inputs["mode_selection"]["prediction_steps"] + options["class_positive_factor"] = inputs["mode_selection"][ + "class_positive_factor" + ] 
estimator = klass(config, **options) if outfile_params: hyper_params = get_search_params(estimator) # TODO: remove this after making `verbose` tunable for h_param in hyper_params: - if h_param[1].endswith('verbose'): - h_param[0] = '@' - df = pd.DataFrame(hyper_params, columns=['', 'Parameter', 'Value']) - df.to_csv(outfile_params, sep='\t', index=False) + if h_param[1].endswith("verbose"): + h_param[0] = "@" + df = pd.DataFrame(hyper_params, columns=["", "Parameter", "Value"]) + df.to_csv(outfile_params, sep="\t", index=False) print(repr(estimator)) # save model by pickle - with open(outfile, 'wb') as f: + with open(outfile, "wb") as f: pickle.dump(estimator, f, pickle.HIGHEST_PROTOCOL) -if __name__ == '__main__': - warnings.simplefilter('ignore') +if __name__ == "__main__": + warnings.simplefilter("ignore") aparser = argparse.ArgumentParser() aparser.add_argument("-i", "--inputs", dest="inputs", required=True) @@ -345,7 +383,7 @@ args = aparser.parse_args() input_json_path = args.inputs - with open(input_json_path, 'r') as param_handler: + with open(input_json_path, "r") as param_handler: inputs = json.load(param_handler) tool_id = args.tool_id @@ -355,18 +393,20 @@ infile_weights = args.infile_weights # for keras_model_config tool - if tool_id == 'keras_model_config': + if tool_id == "keras_model_config": config_keras_model(inputs, outfile) # for keras_model_builder tool else: batch_mode = False - if tool_id == 'keras_batch_models': + if tool_id == "keras_batch_models": batch_mode = True - build_keras_model(inputs=inputs, - model_json=model_json, - infile_weights=infile_weights, - batch_mode=batch_mode, - outfile=outfile, - outfile_params=outfile_params) + build_keras_model( + inputs=inputs, + model_json=model_json, + infile_weights=infile_weights, + batch_mode=batch_mode, + outfile=outfile, + outfile_params=outfile_params, + )
--- a/keras_train_and_eval.py Tue Apr 13 22:00:10 2021 +0000 +++ b/keras_train_and_eval.py Sat May 01 01:20:14 2021 +0000 @@ -11,16 +11,9 @@ from galaxy_ml.externals.selene_sdk.utils import compute_score from galaxy_ml.keras_galaxy_models import _predict_generator from galaxy_ml.model_validations import train_test_split -from galaxy_ml.utils import ( - clean_params, - get_main_estimator, - get_module, - get_scoring, - load_model, - read_columns, - SafeEval, - try_get_attr, -) +from galaxy_ml.utils import (clean_params, get_main_estimator, + get_module, get_scoring, load_model, read_columns, + SafeEval, try_get_attr) from scipy.io import mmread from sklearn.metrics.scorer import _check_multimetric_scoring from sklearn.model_selection import _search, _validation @@ -28,7 +21,6 @@ from sklearn.pipeline import Pipeline from sklearn.utils import indexable, safe_indexing - _fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score") setattr(_search, "_fit_and_score", _fit_and_score) setattr(_validation, "_fit_and_score", _fit_and_score) @@ -56,7 +48,10 @@ param_name = p["sp_name"] if param_name.lower().endswith(NON_SEARCHABLE): - warnings.warn("Warning: `%s` is not eligible for search and was " "omitted!" % param_name) + warnings.warn( + "Warning: `%s` is not eligible for search and was " + "omitted!" 
% param_name + ) continue if not swap_value.startswith(":"): @@ -99,7 +94,11 @@ index_arr = np.arange(n_samples) test = index_arr[np.isin(groups, group_names)] train = index_arr[~np.isin(groups, group_names)] - rval = list(chain.from_iterable((safe_indexing(a, train), safe_indexing(a, test)) for a in new_arrays)) + rval = list( + chain.from_iterable( + (safe_indexing(a, train), safe_indexing(a, test)) for a in new_arrays + ) + ) else: rval = train_test_split(*new_arrays, **kwargs) @@ -127,14 +126,22 @@ pred_labels = (pred_probas > 0.5).astype("int32") targets = y_true.ravel().astype("int32") if not is_multimetric: - preds = pred_labels if scorer.__class__.__name__ == "_PredictScorer" else pred_probas + preds = ( + pred_labels + if scorer.__class__.__name__ == "_PredictScorer" + else pred_probas + ) score = scorer._score_func(targets, preds, **scorer._kwargs) return score else: scores = {} for name, one_scorer in scorer.items(): - preds = pred_labels if one_scorer.__class__.__name__ == "_PredictScorer" else pred_probas + preds = ( + pred_labels + if one_scorer.__class__.__name__ == "_PredictScorer" + else pred_probas + ) score = one_scorer._score_func(targets, preds, **one_scorer._kwargs) scores[name] = score @@ -144,13 +151,21 @@ pred_labels = (pred_probas > 0.5).astype("int32") targets = y_true.astype("int32") if not is_multimetric: - preds = pred_labels if scorer.__class__.__name__ == "_PredictScorer" else pred_probas + preds = ( + pred_labels + if scorer.__class__.__name__ == "_PredictScorer" + else pred_probas + ) score, _ = compute_score(preds, targets, scorer._score_func) return score else: scores = {} for name, one_scorer in scorer.items(): - preds = pred_labels if one_scorer.__class__.__name__ == "_PredictScorer" else pred_probas + preds = ( + pred_labels + if one_scorer.__class__.__name__ == "_PredictScorer" + else pred_probas + ) score, _ = compute_score(preds, targets, one_scorer._score_func) scores[name] = score @@ -243,7 +258,9 @@ # tabular input if 
input_type == "tabular": header = "infer" if params["input_options"]["header1"] else None - column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"] + column_option = params["input_options"]["column_selector_options_1"][ + "selected_column_selector_option" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -295,7 +312,9 @@ # Get target y header = "infer" if params["input_options"]["header2"] else None - column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"] + column_option = params["input_options"]["column_selector_options_2"][ + "selected_column_selector_option2" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -313,12 +332,9 @@ infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True) loaded_df[df_key] = infile2 - y = read_columns(infile2, - c=c, - c_option=column_option, - sep='\t', - header=header, - parse_dates=True) + y = read_columns( + infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True + ) if len(y.shape) == 2 and y.shape[1] == 1: y = y.ravel() if input_type == "refseq_and_interval": @@ -328,10 +344,14 @@ # load groups if groups: - groups_selector = (params["experiment_schemes"]["test_split"]["split_algos"]).pop("groups_selector") + groups_selector = ( + params["experiment_schemes"]["test_split"]["split_algos"] + ).pop("groups_selector") header = "infer" if groups_selector["header_g"] else None - column_option = groups_selector["column_selector_options_g"]["selected_column_selector_option_g"] + column_option = groups_selector["column_selector_options_g"][ + "selected_column_selector_option_g" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -346,12 +366,14 @@ if df_key in loaded_df: groups = loaded_df[df_key] - groups = read_columns(groups, - c=c, - c_option=column_option, - sep='\t', - header=header, - parse_dates=True) + groups = 
read_columns( + groups, + c=c, + c_option=column_option, + sep="\t", + header=header, + parse_dates=True, + ) groups = groups.ravel() # del loaded_df @@ -364,7 +386,7 @@ main_est.set_params(memory=memory) # handle scorer, convert to scorer dict - scoring = params['experiment_schemes']['metrics']['scoring'] + scoring = params["experiment_schemes"]["metrics"]["scoring"] if scoring is not None: # get_scoring() expects secondary_scoring to be a comma separated string (not a list) # Check if secondary_scoring is specified @@ -385,7 +407,9 @@ if y is not None: test_split_options["labels"] = y else: - raise ValueError("Stratified shuffle split is not " "applicable on empty target values!") + raise ValueError( + "Stratified shuffle split is not " "applicable on empty target values!" + ) ( X_train, @@ -408,7 +432,10 @@ if y_train is not None: val_split_options["labels"] = y_train else: - raise ValueError("Stratified shuffle split is not " "applicable on empty target values!") + raise ValueError( + "Stratified shuffle split is not " + "applicable on empty target values!" + ) ( X_train, @@ -431,8 +458,12 @@ if hasattr(estimator, "evaluate"): steps = estimator.prediction_steps batch_size = estimator.batch_size - generator = estimator.data_generator_.flow(X_test, y=y_test, batch_size=batch_size) - predictions, y_true = _predict_generator(estimator.model_, generator, steps=steps) + generator = estimator.data_generator_.flow( + X_test, y=y_test, batch_size=batch_size + ) + predictions, y_true = _predict_generator( + estimator.model_, generator, steps=steps + ) scores = _evaluate(y_true, predictions, scorer, is_multimetric=True) else:
# --- /dev/null  +++ b/label_encoder.py  (Sat May 01 01:20:14 2021 +0000)
# New file in this changeset, shown as plain Python instead of collapsed "+" diff lines.
import argparse
import json
import warnings

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


def main(inputs, infile, outfile):
    """
    Encode the labels of a tabular file as integers and write them out.

    Parameters
    ----------
    inputs : str
        File path to galaxy tool parameter (JSON). Only the `header0`
        entry is read; when truthy the first row of `infile` is treated
        as a header.

    infile : str
        File path of the input vector (tab separated).

    outfile : str
        File path the encoded vector is written to, one integer per line.
    """
    warnings.simplefilter('ignore')

    with open(inputs, 'r') as param_handler:
        params = json.load(param_handler)

    input_header = params['header0']
    header = 'infer' if input_header else None

    input_vector = pd.read_csv(infile, sep='\t', header=header)

    # LabelEncoder works on 1-D input. Flatten a single-column table
    # explicitly instead of relying on sklearn's implicit (and warned-about)
    # column-vector conversion; any other shape is passed through unchanged
    # so sklearn still reports it.
    labels = input_vector.values
    if labels.ndim == 2 and labels.shape[1] == 1:
        labels = labels.ravel()

    le = LabelEncoder()

    output_vector = le.fit_transform(labels)

    np.savetxt(outfile, output_vector, fmt="%d", delimiter='\t')


if __name__ == '__main__':
    aparser = argparse.ArgumentParser()
    aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
    # main() cannot run without these two paths, so fail early in argparse
    aparser.add_argument("-y", "--infile", dest="infile", required=True)
    aparser.add_argument("-o", "--outfile", dest="outfile", required=True)
    args = aparser.parse_args()

    main(args.inputs, args.infile, args.outfile)
--- a/ml_visualization_ex.py Tue Apr 13 22:00:10 2021 +0000 +++ b/ml_visualization_ex.py Sat May 01 01:20:14 2021 +0000 @@ -13,10 +13,10 @@ from keras.models import model_from_json from keras.utils import plot_model from sklearn.feature_selection.base import SelectorMixin -from sklearn.metrics import auc, average_precision_score, confusion_matrix, precision_recall_curve, roc_curve +from sklearn.metrics import (auc, average_precision_score, confusion_matrix, + precision_recall_curve, roc_curve) from sklearn.pipeline import Pipeline - safe_eval = SafeEval() # plotly default colors @@ -51,7 +51,9 @@ y_true = df1.iloc[:, idx].values y_score = df2.iloc[:, idx].values - precision, recall, _ = precision_recall_curve(y_true, y_score, pos_label=pos_label) + precision, recall, _ = precision_recall_curve( + y_true, y_score, pos_label=pos_label + ) ap = average_precision_score(y_true, y_score, pos_label=pos_label or 1) trace = go.Scatter( @@ -111,7 +113,9 @@ y_true = df1.iloc[:, idx].values y_score = df2.iloc[:, idx].values - precision, recall, _ = precision_recall_curve(y_true, y_score, pos_label=pos_label) + precision, recall, _ = precision_recall_curve( + y_true, y_score, pos_label=pos_label + ) ap = average_precision_score(y_true, y_score, pos_label=pos_label or 1) plt.step( @@ -155,7 +159,9 @@ y_true = df1.iloc[:, idx].values y_score = df2.iloc[:, idx].values - fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate) + fpr, tpr, _ = roc_curve( + y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate + ) roc_auc = auc(fpr, tpr) trace = go.Scatter( @@ -168,7 +174,9 @@ data.append(trace) layout = go.Layout( - xaxis=dict(title="False Positive Rate", linecolor="lightslategray", linewidth=1), + xaxis=dict( + title="False Positive Rate", linecolor="lightslategray", linewidth=1 + ), yaxis=dict(title="True Positive Rate", linecolor="lightslategray", linewidth=1), title=dict( text=title or "Receiver Operating 
Characteristic (ROC) Curve", @@ -204,7 +212,9 @@ os.rename("output.html", "output") -def visualize_roc_curve_matplotlib(df1, df2, pos_label, drop_intermediate=True, title=None): +def visualize_roc_curve_matplotlib( + df1, df2, pos_label, drop_intermediate=True, title=None +): """visualize roc-curve using matplotlib and output svg image""" backend = matplotlib.get_backend() if "inline" not in backend: @@ -216,7 +226,9 @@ y_true = df1.iloc[:, idx].values y_score = df2.iloc[:, idx].values - fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate) + fpr, tpr, _ = roc_curve( + y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate + ) roc_auc = auc(fpr, tpr) plt.step( @@ -253,11 +265,15 @@ col = plot_selection[column_name]["col1"] else: col = None - _, input_df = read_columns(file_path, c=col, - c_option=column_option, - return_df=True, - sep='\t', header=header, - parse_dates=True) + _, input_df = read_columns( + file_path, + c=col, + c_option=column_option, + return_df=True, + sep="\t", + header=header, + parse_dates=True, + ) return input_df @@ -344,7 +360,9 @@ with open(infile_estimator, "rb") as estimator_handler: estimator = load_model(estimator_handler) - column_option = params["plotting_selection"]["column_selector_options"]["selected_column_selector_option"] + column_option = params["plotting_selection"]["column_selector_options"][ + "selected_column_selector_option" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -379,7 +397,11 @@ else: coefs = getattr(estimator, "feature_importances_", None) if coefs is None: - raise RuntimeError("The classifier does not expose " '"coef_" or "feature_importances_" ' "attributes") + raise RuntimeError( + "The classifier does not expose " + '"coef_" or "feature_importances_" ' + "attributes" + ) threshold = params["plotting_selection"]["threshold"] if threshold is not None: @@ -454,7 +476,9 @@ layout = go.Layout( xaxis=dict(title="Number of 
features selected"), yaxis=dict(title="Cross validation score"), - title=dict(text=title or None, x=0.5, y=0.92, xanchor="center", yanchor="top"), + title=dict( + text=title or None, x=0.5, y=0.92, xanchor="center", yanchor="top" + ), font=dict(family="sans-serif", size=11), # control backgroud colors plot_bgcolor="rgba(255,255,255,0)", @@ -548,9 +572,13 @@ elif plot_type == "classification_confusion_matrix": plot_selection = params["plotting_selection"] - input_true = get_dataframe(true_labels, plot_selection, "header_true", "column_selector_options_true") + input_true = get_dataframe( + true_labels, plot_selection, "header_true", "column_selector_options_true" + ) header_predicted = "infer" if plot_selection["header_predicted"] else None - input_predicted = pd.read_csv(predicted_labels, sep="\t", parse_dates=True, header=header_predicted) + input_predicted = pd.read_csv( + predicted_labels, sep="\t", parse_dates=True, header=header_predicted + ) true_classes = input_true.iloc[:, -1].copy() predicted_classes = input_predicted.iloc[:, -1].copy() axis_labels = list(set(true_classes))
--- a/model_prediction.py Tue Apr 13 22:00:10 2021 +0000 +++ b/model_prediction.py Sat May 01 01:20:14 2021 +0000 @@ -63,7 +63,8 @@ if hasattr(main_est, "config") and hasattr(main_est, "load_weights"): if not infile_weights or infile_weights == "None": raise ValueError( - "The selected model skeleton asks for weights, " "but dataset for weights wan not selected!" + "The selected model skeleton asks for weights, " + "but dataset for weights wan not selected!" ) main_est.load_weights(infile_weights) @@ -72,7 +73,9 @@ # tabular input if input_type == "tabular": header = "infer" if params["input_options"]["header1"] else None - column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"] + column_option = params["input_options"]["column_selector_options_1"][ + "selected_column_selector_option" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -122,9 +125,13 @@ pred_data_generator = klass(fasta_path, seq_length=seq_length) if params["method"] == "predict": - preds = estimator.predict(X, data_generator=pred_data_generator, steps=steps) + preds = estimator.predict( + X, data_generator=pred_data_generator, steps=steps + ) else: - preds = estimator.predict_proba(X, data_generator=pred_data_generator, steps=steps) + preds = estimator.predict_proba( + X, data_generator=pred_data_generator, steps=steps + ) # vcf input elif input_type == "variant_effect": @@ -135,7 +142,9 @@ if options["blacklist_regions"] == "none": options["blacklist_regions"] = None - pred_data_generator = klass(ref_genome_path=ref_seq, vcf_path=vcf_path, **options) + pred_data_generator = klass( + ref_genome_path=ref_seq, vcf_path=vcf_path, **options + ) pred_data_generator.set_processing_attrs()
--- a/search_model_validation.py Tue Apr 13 22:00:10 2021 +0000 +++ b/search_model_validation.py Sat May 01 01:20:14 2021 +0000 @@ -11,31 +11,16 @@ import numpy as np import pandas as pd import skrebate -from galaxy_ml.utils import ( - clean_params, - get_cv, - get_main_estimator, - get_module, - get_scoring, - load_model, - read_columns, - SafeEval, - try_get_attr -) +from galaxy_ml.utils import (clean_params, get_cv, + get_main_estimator, get_module, get_scoring, + load_model, read_columns, SafeEval, try_get_attr) from scipy.io import mmread -from sklearn import ( - cluster, - decomposition, - feature_selection, - kernel_approximation, - model_selection, - preprocessing, -) +from sklearn import (cluster, decomposition, feature_selection, + kernel_approximation, model_selection, preprocessing) from sklearn.exceptions import FitFailedWarning from sklearn.model_selection import _search, _validation from sklearn.model_selection._validation import _score, cross_validate - _fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score") setattr(_search, "_fit_and_score", _fit_and_score) setattr(_validation, "_fit_and_score", _fit_and_score) @@ -57,7 +42,10 @@ param_name = p["sp_name"] if param_name.lower().endswith(NON_SEARCHABLE): - print("Warning: `%s` is not eligible for search and was " "omitted!" % param_name) + print( + "Warning: `%s` is not eligible for search and was " + "omitted!" 
% param_name + ) continue if not search_list.startswith(":"): @@ -90,7 +78,9 @@ decomposition.IncrementalPCA(), decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS), decomposition.LatentDirichletAllocation(random_state=0, n_jobs=N_JOBS), - decomposition.MiniBatchDictionaryLearning(random_state=0, n_jobs=N_JOBS), + decomposition.MiniBatchDictionaryLearning( + random_state=0, n_jobs=N_JOBS + ), decomposition.MiniBatchSparsePCA(random_state=0, n_jobs=N_JOBS), decomposition.NMF(random_state=0), decomposition.PCA(random_state=0), @@ -107,14 +97,26 @@ skrebate.MultiSURF(n_jobs=N_JOBS), skrebate.MultiSURFstar(n_jobs=N_JOBS), imblearn.under_sampling.ClusterCentroids(random_state=0, n_jobs=N_JOBS), - imblearn.under_sampling.CondensedNearestNeighbour(random_state=0, n_jobs=N_JOBS), - imblearn.under_sampling.EditedNearestNeighbours(random_state=0, n_jobs=N_JOBS), - imblearn.under_sampling.RepeatedEditedNearestNeighbours(random_state=0, n_jobs=N_JOBS), + imblearn.under_sampling.CondensedNearestNeighbour( + random_state=0, n_jobs=N_JOBS + ), + imblearn.under_sampling.EditedNearestNeighbours( + random_state=0, n_jobs=N_JOBS + ), + imblearn.under_sampling.RepeatedEditedNearestNeighbours( + random_state=0, n_jobs=N_JOBS + ), imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS), - imblearn.under_sampling.InstanceHardnessThreshold(random_state=0, n_jobs=N_JOBS), + imblearn.under_sampling.InstanceHardnessThreshold( + random_state=0, n_jobs=N_JOBS + ), imblearn.under_sampling.NearMiss(random_state=0, n_jobs=N_JOBS), - imblearn.under_sampling.NeighbourhoodCleaningRule(random_state=0, n_jobs=N_JOBS), - imblearn.under_sampling.OneSidedSelection(random_state=0, n_jobs=N_JOBS), + imblearn.under_sampling.NeighbourhoodCleaningRule( + random_state=0, n_jobs=N_JOBS + ), + imblearn.under_sampling.OneSidedSelection( + random_state=0, n_jobs=N_JOBS + ), imblearn.under_sampling.RandomUnderSampler(random_state=0), imblearn.under_sampling.TomekLinks(random_state=0, n_jobs=N_JOBS), 
imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS), @@ -122,7 +124,9 @@ imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS), imblearn.over_sampling.BorderlineSMOTE(random_state=0, n_jobs=N_JOBS), - imblearn.over_sampling.SMOTENC(categorical_features=[], random_state=0, n_jobs=N_JOBS), + imblearn.over_sampling.SMOTENC( + categorical_features=[], random_state=0, n_jobs=N_JOBS + ), imblearn.combine.SMOTEENN(random_state=0), imblearn.combine.SMOTETomek(random_state=0), ) @@ -205,7 +209,9 @@ # tabular input if input_type == "tabular": header = "infer" if params["input_options"]["header1"] else None - column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"] + column_option = params["input_options"]["column_selector_options_1"][ + "selected_column_selector_option" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -261,7 +267,9 @@ # Get target y header = "infer" if params["input_options"]["header2"] else None - column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"] + column_option = params["input_options"]["column_selector_options_2"][ + "selected_column_selector_option2" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -279,7 +287,9 @@ infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True) loaded_df[df_key] = infile2 - y = read_columns(infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True) + y = read_columns( + infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True + ) if len(y.shape) == 2 and y.shape[1] == 1: y = y.ravel() if input_type == "refseq_and_interval": @@ -378,12 +388,16 @@ X, X_test, y, y_test = train_test_split(X, y, **split_options) elif split_options["shuffle"] == "group": if groups is None: - raise ValueError("No group based CV option was choosen for " "group 
shuffle!") + raise ValueError( + "No group based CV option was choosen for " "group shuffle!" + ) split_options["labels"] = groups if y is None: X, X_test, groups, _ = train_test_split(X, groups, **split_options) else: - X, X_test, y, y_test, groups, _ = train_test_split(X, y, groups, **split_options) + X, X_test, y, y_test, groups, _ = train_test_split( + X, y, groups, **split_options + ) else: if split_options["shuffle"] == "None": split_options["shuffle"] = None @@ -411,9 +425,13 @@ # TODO Solve deep learning models in pipeline if best_estimator_.__class__.__name__ == "KerasGBatchClassifier": - test_score = best_estimator_.evaluate(X_test, scorer=scorer_, is_multimetric=is_multimetric) + test_score = best_estimator_.evaluate( + X_test, scorer=scorer_, is_multimetric=is_multimetric + ) else: - test_score = _score(best_estimator_, X_test, y_test, scorer_, is_multimetric=is_multimetric) + test_score = _score( + best_estimator_, X_test, y_test, scorer_, is_multimetric=is_multimetric + ) if not is_multimetric: test_score = {primary_scoring: test_score} @@ -487,7 +505,9 @@ params = json.load(param_handler) # Override the refit parameter - params["search_schemes"]["options"]["refit"] = True if params["save"] != "nope" else False + params["search_schemes"]["options"]["refit"] = ( + True if params["save"] != "nope" else False + ) with open(infile_estimator, "rb") as estimator_handler: estimator = load_model(estimator_handler) @@ -499,17 +519,21 @@ options = params["search_schemes"]["options"] if groups: - header = "infer" if (options["cv_selector"]["groups_selector"]["header_g"]) else None - column_option = options["cv_selector"]["groups_selector"]["column_selector_options_g"][ - "selected_column_selector_option_g" - ] + header = ( + "infer" if (options["cv_selector"]["groups_selector"]["header_g"]) else None + ) + column_option = options["cv_selector"]["groups_selector"][ + "column_selector_options_g" + ]["selected_column_selector_option_g"] if column_option in [ 
"by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name", ]: - c = options["cv_selector"]["groups_selector"]["column_selector_options_g"]["col_g"] + c = options["cv_selector"]["groups_selector"]["column_selector_options_g"][ + "col_g" + ] else: c = None @@ -537,12 +561,14 @@ secondary_scoring = options["scoring"].get("secondary_scoring", None) if secondary_scoring is not None: # If secondary_scoring is specified, convert the list into comman separated string - options["scoring"]["secondary_scoring"] = ",".join(options["scoring"]["secondary_scoring"]) + options["scoring"]["secondary_scoring"] = ",".join( + options["scoring"]["secondary_scoring"] + ) options["scoring"] = get_scoring(options["scoring"]) if options["error_score"]: options["error_score"] = "raise" else: - options["error_score"] = np.NaN + options["error_score"] = np.nan if options["refit"] and isinstance(options["scoring"], dict): options["refit"] = primary_scoring if "pre_dispatch" in options and options["pre_dispatch"] == "": @@ -588,7 +614,9 @@ # make sure refit is choosen # this could be True for sklearn models, but not the case for # deep learning models - if not options["refit"] and not all(hasattr(estimator, attr) for attr in ("config", "model_type")): + if not options["refit"] and not all( + hasattr(estimator, attr) for attr in ("config", "model_type") + ): warnings.warn("Refit is change to `True` for nested validation!") setattr(searcher, "refit", True) @@ -687,7 +715,9 @@ cv_results = pd.DataFrame(searcher.cv_results_) cv_results = cv_results[sorted(cv_results.columns)] - cv_results.to_csv(path_or_buf=outfile_result, sep="\t", header=True, index=False) + cv_results.to_csv( + path_or_buf=outfile_result, sep="\t", header=True, index=False + ) memory.clear(warn=False)
--- a/simple_model_fit.py Tue Apr 13 22:00:10 2021 +0000 +++ b/simple_model_fit.py Sat May 01 01:20:14 2021 +0000 @@ -7,7 +7,6 @@ from scipy.io import mmread from sklearn.pipeline import Pipeline - N_JOBS = int(__import__("os").environ.get("GALAXY_SLOTS", 1)) @@ -36,7 +35,7 @@ if name == "memory" or name.endswith("__memory") or name.endswith("_path"): new_p = {name: None} estimator.set_params(**new_p) - elif n_jobs is not None and (name == 'n_jobs' or name.endswith('__n_jobs')): + elif n_jobs is not None and (name == "n_jobs" or name.endswith("__n_jobs")): new_p = {name: n_jobs} estimator.set_params(**new_p) elif name.endswith("callbacks"): @@ -68,7 +67,9 @@ # tabular input if input_type == "tabular": header = "infer" if params["input_options"]["header1"] else None - column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"] + column_option = params["input_options"]["column_selector_options_1"][ + "selected_column_selector_option" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -90,7 +91,9 @@ # Get target y header = "infer" if params["input_options"]["header2"] else None - column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"] + column_option = params["input_options"]["column_selector_options_2"][ + "selected_column_selector_option2" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -108,12 +111,9 @@ infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True) loaded_df[df_key] = infile2 - y = read_columns(infile2, - c=c, - c_option=column_option, - sep='\t', - header=header, - parse_dates=True) + y = read_columns( + infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True + ) if len(y.shape) == 2 and y.shape[1] == 1: y = y.ravel()
--- a/stacking_ensembles.py Tue Apr 13 22:00:10 2021 +0000 +++ b/stacking_ensembles.py Sat May 01 01:20:14 2021 +0000 @@ -8,8 +8,8 @@ import mlxtend.classifier import mlxtend.regressor import pandas as pd -from galaxy_ml.utils import get_cv, get_estimator, get_search_params, load_model - +from galaxy_ml.utils import (get_cv, get_estimator, get_search_params, + load_model) warnings.filterwarnings("ignore") @@ -62,7 +62,9 @@ with open(meta_path, "rb") as f: meta_estimator = load_model(f) else: - estimator_json = params["algo_selection"]["meta_estimator"]["estimator_selector"] + estimator_json = params["algo_selection"]["meta_estimator"][ + "estimator_selector" + ] meta_estimator = get_estimator(estimator_json) options = params["algo_selection"]["options"] @@ -89,10 +91,14 @@ ensemble_estimator = klass(base_estimators, **options) elif mod == mlxtend.classifier: - ensemble_estimator = klass(classifiers=base_estimators, meta_classifier=meta_estimator, **options) + ensemble_estimator = klass( + classifiers=base_estimators, meta_classifier=meta_estimator, **options + ) else: - ensemble_estimator = klass(regressors=base_estimators, meta_regressor=meta_estimator, **options) + ensemble_estimator = klass( + regressors=base_estimators, meta_regressor=meta_estimator, **options + ) print(ensemble_estimator) for base_est in base_estimators:
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/le_input_w_header.tabular Sat May 01 01:20:14 2021 +0000 @@ -0,0 +1,5 @@ +Class +Liverpool +Real Madrid +Bayern Munich +A.C. Milan
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/le_input_wo_header.tabular Sat May 01 01:20:14 2021 +0000 @@ -0,0 +1,4 @@ +Liverpool +Real Madrid +Bayern Munich +A.C. Milan
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/le_output.tabular Sat May 01 01:20:14 2021 +0000 @@ -0,0 +1,4 @@ +2 +3 +1 +0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mba_input_int_w.tabular Sat May 01 01:20:14 2021 +0000 @@ -0,0 +1,6 @@ +Transactions +10 11 12 13 14 15 +16 11 12 13 14 15 +10 17 13 14 +10 18 19 13 15 +19 11 11 13 20 14
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mba_input_int_wo.tabular Sat May 01 01:20:14 2021 +0000 @@ -0,0 +1,5 @@ +10 11 12 13 14 15 +16 11 12 13 14 15 +10 17 13 14 +10 18 19 13 15 +19 11 11 13 20 14
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mba_input_str_w.tabular Sat May 01 01:20:14 2021 +0000 @@ -0,0 +1,6 @@ +Transactions +Milk Onion Nutmeg Kidney Beans Eggs Yogurt +Dill Onion Nutmeg Kidney Beans Eggs Yogurt +Milk Apple Kidney Beans Eggs +Milk Unicorn Corn Kidney Beans Yogurt +Corn Onion Onion Kidney Beans Ice cream Eggs
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mba_input_str_wo.tabular Sat May 01 01:20:14 2021 +0000 @@ -0,0 +1,5 @@ +Milk Onion Nutmeg Kidney Beans Eggs Yogurt +Dill Onion Nutmeg Kidney Beans Eggs Yogurt +Milk Apple Kidney Beans Eggs +Milk Unicorn Corn Kidney Beans Yogurt +Corn Onion Onion Kidney Beans Ice cream Eggs
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mba_out_str.tabular Sat May 01 01:20:14 2021 +0000 @@ -0,0 +1,7 @@ +antecedents consequents antecedent support consequent support support confidence lift leverage conviction +['Eggs'] ['Kidney Beans', 'Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994 +['Eggs'] ['Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994 +['Eggs', 'Kidney Beans'] ['Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994 +['Kidney Beans', 'Onion'] ['Eggs'] 0.6 0.8 0.6 1.0 1.25 0.12 inf +['Onion'] ['Eggs'] 0.6 0.8 0.6 1.0 1.25 0.12 inf +['Onion'] ['Eggs', 'Kidney Beans'] 0.6 0.8 0.6 1.0 1.25 0.12 inf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mba_output_int.tabular Sat May 01 01:20:14 2021 +0000 @@ -0,0 +1,7 @@ +antecedents consequents antecedent support consequent support support confidence lift leverage conviction +['11'] ['13', '14'] 0.6 0.8 0.6 1.0 1.25 0.12 inf +['11'] ['14'] 0.6 0.8 0.6 1.0 1.25 0.12 inf +['11', '13'] ['14'] 0.6 0.8 0.6 1.0 1.25 0.12 inf +['13', '14'] ['11'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994 +['14'] ['11'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994 +['14'] ['11', '13'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mba_output_str.tabular Sat May 01 01:20:14 2021 +0000 @@ -0,0 +1,7 @@ +antecedents consequents antecedent support consequent support support confidence lift leverage conviction +['Eggs'] ['Kidney Beans', 'Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994 +['Eggs'] ['Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994 +['Eggs', 'Kidney Beans'] ['Onion'] 0.8 0.6 0.6 0.7499999999999999 1.2499999999999998 0.12 1.5999999999999994 +['Kidney Beans', 'Onion'] ['Eggs'] 0.6 0.8 0.6 1.0 1.25 0.12 inf +['Onion'] ['Eggs'] 0.6 0.8 0.6 1.0 1.25 0.12 inf +['Onion'] ['Eggs', 'Kidney Beans'] 0.6 0.8 0.6 1.0 1.25 0.12 inf
--- a/to_categorical.py Tue Apr 13 22:00:10 2021 +0000 +++ b/to_categorical.py Sat May 01 01:20:14 2021 +0000 @@ -43,7 +43,9 @@ aparser = argparse.ArgumentParser() aparser.add_argument("-i", "--inputs", dest="inputs", required=True) aparser.add_argument("-y", "--infile", dest="infile") - aparser.add_argument("-n", "--num_classes", dest="num_classes", type=int, default=None) + aparser.add_argument( + "-n", "--num_classes", dest="num_classes", type=int, default=None + ) aparser.add_argument("-o", "--outfile", dest="outfile") args = aparser.parse_args()
--- a/train_test_eval.py Tue Apr 13 22:00:10 2021 +0000 +++ b/train_test_eval.py Sat May 01 01:20:14 2021 +0000 @@ -9,14 +9,8 @@ import numpy as np import pandas as pd from galaxy_ml.model_validations import train_test_split -from galaxy_ml.utils import ( - get_module, - get_scoring, - load_model, - read_columns, - SafeEval, - try_get_attr, -) +from galaxy_ml.utils import (get_module, get_scoring, load_model, + read_columns, SafeEval, try_get_attr) from scipy.io import mmread from sklearn import pipeline from sklearn.metrics.scorer import _check_multimetric_scoring @@ -24,7 +18,6 @@ from sklearn.model_selection._validation import _score from sklearn.utils import indexable, safe_indexing - _fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score") setattr(_search, "_fit_and_score", _fit_and_score) setattr(_validation, "_fit_and_score", _fit_and_score) @@ -262,12 +255,9 @@ infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True) loaded_df[df_key] = infile2 - y = read_columns(infile2, - c=c, - c_option=column_option, - sep='\t', - header=header, - parse_dates=True) + y = read_columns( + infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True + ) if len(y.shape) == 2 and y.shape[1] == 1: y = y.ravel() if input_type == "refseq_and_interval": @@ -299,12 +289,14 @@ if df_key in loaded_df: groups = loaded_df[df_key] - groups = read_columns(groups, - c=c, - c_option=column_option, - sep='\t', - header=header, - parse_dates=True) + groups = read_columns( + groups, + c=c, + c_option=column_option, + sep="\t", + header=header, + parse_dates=True, + ) groups = groups.ravel() # del loaded_df @@ -371,9 +363,14 @@ "Stratified shuffle split is not " "applicable on empty target values!" 
) - X_train, X_test, y_train, y_test, groups_train, _groups_test = train_test_split_none( - X, y, groups, **test_split_options - ) + ( + X_train, + X_test, + y_train, + y_test, + groups_train, + _groups_test, + ) = train_test_split_none(X, y, groups, **test_split_options) exp_scheme = params["experiment_schemes"]["selected_exp_scheme"]
--- a/train_test_split.py Tue Apr 13 22:00:10 2021 +0000 +++ b/train_test_split.py Sat May 01 01:20:14 2021 +0000 @@ -28,17 +28,23 @@ # read groups if infile_groups: - header = "infer" if (params["mode_selection"]["cv_selector"]["groups_selector"]["header_g"]) else None - column_option = params["mode_selection"]["cv_selector"]["groups_selector"]["column_selector_options_g"][ - "selected_column_selector_option_g" - ] + header = ( + "infer" + if (params["mode_selection"]["cv_selector"]["groups_selector"]["header_g"]) + else None + ) + column_option = params["mode_selection"]["cv_selector"]["groups_selector"][ + "column_selector_options_g" + ]["selected_column_selector_option_g"] if column_option in [ "by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name", ]: - c = params["mode_selection"]["cv_selector"]["groups_selector"]["column_selector_options_g"]["col_g"] + c = params["mode_selection"]["cv_selector"]["groups_selector"][ + "column_selector_options_g" + ]["col_g"] else: c = None @@ -67,7 +73,10 @@ total_n_splits = splitter.get_n_splits(array.values, y=y, groups=groups) if nth_split > total_n_splits: - raise ValueError("Total number of splits is {}, but got `nth_split` " "= {}".format(total_n_splits, nth_split)) + raise ValueError( + "Total number of splits is {}, but got `nth_split` " + "= {}".format(total_n_splits, nth_split) + ) i = 1 for train_index, test_index in splitter.split(array.values, y=y, groups=groups): @@ -137,7 +146,9 @@ # cv splitter else: - train, test = _get_single_cv_split(params, array, infile_labels=infile_labels, infile_groups=infile_groups) + train, test = _get_single_cv_split( + params, array, infile_labels=infile_labels, infile_groups=infile_groups + ) print("Input shape: %s" % repr(array.shape)) print("Train shape: %s" % repr(train.shape))
