sklearn_train_test_eval: ml_visualization

comparison ml_visualization_ex.py @ 9:ead7adad8d0e draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit e2a5eade6d0e5ddf3a47630381a0ad90d80e8a04"

author	bgruening
date	Tue, 13 Apr 2021 18:45:35 +0000
parents	1b68acd5ac08
children	a9e0b963b7bb

comparison

equal deleted inserted replaced

-:e03a58b31c12
+:ead7adad8d0e
 safe_eval = SafeEval()
 # plotly default colors
 default_colors = [
-'#1f77b4',  # muted blue
+"#1f77b4",  # muted blue
-'#ff7f0e',  # safety orange
+"#ff7f0e",  # safety orange
-'#2ca02c',  # cooked asparagus green
+"#2ca02c",  # cooked asparagus green
-'#d62728',  # brick red
+"#d62728",  # brick red
-'#9467bd',  # muted purple
+"#9467bd",  # muted purple
-'#8c564b',  # chestnut brown
+"#8c564b",  # chestnut brown
-'#e377c2',  # raspberry yogurt pink
+"#e377c2",  # raspberry yogurt pink
-'#7f7f7f',  # middle gray
+"#7f7f7f",  # middle gray
-'#bcbd22',  # curry yellow-green
+"#bcbd22",  # curry yellow-green
-'#17becf'   # blue-teal
+"#17becf",  # blue-teal
 ]
 def visualize_pr_curve_plotly(df1, df2, pos_label, title=None):
 """output pr-curve in html using plotly
 data = []
 for idx in range(df1.shape[1]):
 y_true = df1.iloc[:, idx].values
 y_score = df2.iloc[:, idx].values
-precision, recall, _ = precision_recall_curve(
+precision, recall, _ = precision_recall_curve(y_true, y_score, pos_label=pos_label)
-y_true, y_score, pos_label=pos_label)
+ap = average_precision_score(y_true, y_score, pos_label=pos_label or 1)
-ap = average_precision_score(
-y_true, y_score, pos_label=pos_label or 1)
 trace = go.Scatter(
 x=recall,
 y=precision,
-mode='lines',
+mode="lines",
-marker=dict(
+marker=dict(color=default_colors[idx % len(default_colors)]),
-color=default_colors[idx % len(default_colors)]
+name="%s (area = %.3f)" % (idx, ap),
-),
-name='%s (area = %.3f)' % (idx, ap)
 )
 data.append(trace)
 layout = go.Layout(
-xaxis=dict(
+xaxis=dict(title="Recall", linecolor="lightslategray", linewidth=1),
-title='Recall',
+yaxis=dict(title="Precision", linecolor="lightslategray", linewidth=1),
-linecolor='lightslategray',
-linewidth=1
-),
-yaxis=dict(
-title='Precision',
-linecolor='lightslategray',
-linewidth=1
-),
 title=dict(
-text=title or 'Precision-Recall Curve',
+text=title or "Precision-Recall Curve",
 x=0.5,
 y=0.92,
-xanchor='center',
+xanchor="center",
-yanchor='top'
+yanchor="top",
 ),
-font=dict(
+font=dict(family="sans-serif", size=11),
-family="sans-serif",
-size=11
-),
 # control background colors
-plot_bgcolor='rgba(255,255,255,0)'
+plot_bgcolor="rgba(255,255,255,0)",
 )
 """
 legend=dict(
 x=0.95,
 y=0,
 fig = go.Figure(data=data, layout=layout)
 plotly.offline.plot(fig, filename="output.html", auto_open=False)
 # to be discovered by `from_work_dir`
-os.rename('output.html', 'output')
+os.rename("output.html", "output")
 def visualize_pr_curve_matplotlib(df1, df2, pos_label, title=None):
-"""visualize pr-curve using matplotlib and output svg image
+"""visualize pr-curve using matplotlib and output svg image"""
-"""
 backend = matplotlib.get_backend()
 if "inline" not in backend:
 matplotlib.use("SVG")
-plt.style.use('seaborn-colorblind')
+plt.style.use("seaborn-colorblind")
 plt.figure()
 for idx in range(df1.shape[1]):
 y_true = df1.iloc[:, idx].values
 y_score = df2.iloc[:, idx].values
-precision, recall, _ = precision_recall_curve(
+precision, recall, _ = precision_recall_curve(y_true, y_score, pos_label=pos_label)
-y_true, y_score, pos_label=pos_label)
+ap = average_precision_score(y_true, y_score, pos_label=pos_label or 1)
-ap = average_precision_score(
-y_true, y_score, pos_label=pos_label or 1)
+plt.step(
+recall,
-plt.step(recall, precision, 'r-', color="black", alpha=0.3,
+precision,
-lw=1, where="post", label='%s (area = %.3f)' % (idx, ap))
+"r-",
+color="black",
+alpha=0.3,
+lw=1,
+where="post",
+label="%s (area = %.3f)" % (idx, ap),
+)
 plt.xlim([0.0, 1.0])
 plt.ylim([0.0, 1.05])
-plt.xlabel('Recall')
+plt.xlabel("Recall")
-plt.ylabel('Precision')
+plt.ylabel("Precision")
-title = title or 'Precision-Recall Curve'
+title = title or "Precision-Recall Curve"
 plt.title(title)
 folder = os.getcwd()
 plt.savefig(os.path.join(folder, "output.svg"), format="svg")
-os.rename(os.path.join(folder, "output.svg"),
+os.rename(os.path.join(folder, "output.svg"), os.path.join(folder, "output"))
-os.path.join(folder, "output"))
+def visualize_roc_curve_plotly(df1, df2, pos_label, drop_intermediate=True, title=None):
-def visualize_roc_curve_plotly(df1, df2, pos_label,
-drop_intermediate=True,
-title=None):
 """output roc-curve in html using plotly
 df1 : pandas.DataFrame
 Containing y_true
 df2 : pandas.DataFrame
 data = []
 for idx in range(df1.shape[1]):
 y_true = df1.iloc[:, idx].values
 y_score = df2.iloc[:, idx].values
-fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label,
+fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate)
-drop_intermediate=drop_intermediate)
 roc_auc = auc(fpr, tpr)
 trace = go.Scatter(
 x=fpr,
 y=tpr,
-mode='lines',
+mode="lines",
-marker=dict(
+marker=dict(color=default_colors[idx % len(default_colors)]),
-color=default_colors[idx % len(default_colors)]
+name="%s (area = %.3f)" % (idx, roc_auc),
-),
-name='%s (area = %.3f)' % (idx, roc_auc)
 )
 data.append(trace)
 layout = go.Layout(
-xaxis=dict(
+xaxis=dict(title="False Positive Rate", linecolor="lightslategray", linewidth=1),
-title='False Positive Rate',
+yaxis=dict(title="True Positive Rate", linecolor="lightslategray", linewidth=1),
-linecolor='lightslategray',
-linewidth=1
-),
-yaxis=dict(
-title='True Positive Rate',
-linecolor='lightslategray',
-linewidth=1
-),
 title=dict(
-text=title or 'Receiver Operating Characteristic (ROC) Curve',
+text=title or "Receiver Operating Characteristic (ROC) Curve",
 x=0.5,
 y=0.92,
-xanchor='center',
+xanchor="center",
-yanchor='top'
+yanchor="top",
 ),
-font=dict(
+font=dict(family="sans-serif", size=11),
-family="sans-serif",
-size=11
-),
 # control background colors
-plot_bgcolor='rgba(255,255,255,0)'
+plot_bgcolor="rgba(255,255,255,0)",
 )
 """
 # legend=dict(
 # x=0.95,
 # y=0,
 fig = go.Figure(data=data, layout=layout)
 plotly.offline.plot(fig, filename="output.html", auto_open=False)
 # to be discovered by `from_work_dir`
-os.rename('output.html', 'output')
+os.rename("output.html", "output")
-def visualize_roc_curve_matplotlib(df1, df2, pos_label,
+def visualize_roc_curve_matplotlib(df1, df2, pos_label, drop_intermediate=True, title=None):
-drop_intermediate=True,
+"""visualize roc-curve using matplotlib and output svg image"""
-title=None):
-"""visualize roc-curve using matplotlib and output svg image
-"""
 backend = matplotlib.get_backend()
 if "inline" not in backend:
 matplotlib.use("SVG")
-plt.style.use('seaborn-colorblind')
+plt.style.use("seaborn-colorblind")
 plt.figure()
 for idx in range(df1.shape[1]):
 y_true = df1.iloc[:, idx].values
 y_score = df2.iloc[:, idx].values
-fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label,
+fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label, drop_intermediate=drop_intermediate)
-drop_intermediate=drop_intermediate)
 roc_auc = auc(fpr, tpr)
-plt.step(fpr, tpr, 'r-', color="black", alpha=0.3, lw=1,
+plt.step(
-where="post", label='%s (area = %.3f)' % (idx, roc_auc))
+fpr,
+tpr,
+"r-",
+color="black",
+alpha=0.3,
+lw=1,
+where="post",
+label="%s (area = %.3f)" % (idx, roc_auc),
+)
 plt.xlim([0.0, 1.0])
 plt.ylim([0.0, 1.05])
-plt.xlabel('False Positive Rate')
+plt.xlabel("False Positive Rate")
-plt.ylabel('True Positive Rate')
+plt.ylabel("True Positive Rate")
-title = title or 'Receiver Operating Characteristic (ROC) Curve'
+title = title or "Receiver Operating Characteristic (ROC) Curve"
 plt.title(title)
 folder = os.getcwd()
 plt.savefig(os.path.join(folder, "output.svg"), format="svg")
-os.rename(os.path.join(folder, "output.svg"),
+os.rename(os.path.join(folder, "output.svg"), os.path.join(folder, "output"))
-os.path.join(folder, "output"))
 def get_dataframe(file_path, plot_selection, header_name, column_name):
-header = 'infer' if plot_selection[header_name] else None
+header = "infer" if plot_selection[header_name] else None
 column_option = plot_selection[column_name]["selected_column_selector_option"]
-if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
+if column_option in [
+"by_index_number",
+"all_but_by_index_number",
+"by_header_name",
+"all_but_by_header_name",
+]:
 col = plot_selection[column_name]["col1"]
 else:
 col = None
 _, input_df = read_columns(file_path, c=col,
 c_option=column_option,
 return_df=True,
 sep='\t', header=header,
 parse_dates=True)
 return input_df
-def main(inputs, infile_estimator=None, infile1=None,
+def main(
-infile2=None, outfile_result=None,
+inputs,
-outfile_object=None, groups=None,
+infile_estimator=None,
-ref_seq=None, intervals=None,
+infile1=None,
-targets=None, fasta_path=None,
+infile2=None,
-model_config=None, true_labels=None,
+outfile_result=None,
-predicted_labels=None, plot_color=None,
+outfile_object=None,
-title=None):
+groups=None,
+ref_seq=None,
+intervals=None,
+targets=None,
+fasta_path=None,
+model_config=None,
+true_labels=None,
+predicted_labels=None,
+plot_color=None,
+title=None,
+):
 """
 Parameter
 ---------
 inputs : str
 File path to galaxy tool parameter
 Color of the confusion matrix heatmap
 title : str, default is None
 Title of the confusion matrix heatmap
 """
-warnings.simplefilter('ignore')
+warnings.simplefilter("ignore")
-with open(inputs, 'r') as param_handler:
+with open(inputs, "r") as param_handler:
 params = json.load(param_handler)
-title = params['plotting_selection']['title'].strip()
+title = params["plotting_selection"]["title"].strip()
-plot_type = params['plotting_selection']['plot_type']
+plot_type = params["plotting_selection"]["plot_type"]
-plot_format = params['plotting_selection']['plot_format']
+plot_format = params["plotting_selection"]["plot_format"]
-if plot_type == 'feature_importances':
+if plot_type == "feature_importances":
-with open(infile_estimator, 'rb') as estimator_handler:
+with open(infile_estimator, "rb") as estimator_handler:
 estimator = load_model(estimator_handler)
-column_option = (params['plotting_selection']
+column_option = params["plotting_selection"]["column_selector_options"]["selected_column_selector_option"]
-['column_selector_options']
+if column_option in [
-['selected_column_selector_option'])
+"by_index_number",
-if column_option in ['by_index_number', 'all_but_by_index_number',
+"all_but_by_index_number",
-'by_header_name', 'all_but_by_header_name']:
+"by_header_name",
-c = (params['plotting_selection']
+"all_but_by_header_name",
-['column_selector_options']['col1'])
+]:
+c = params["plotting_selection"]["column_selector_options"]["col1"]
 else:
 c = None
-_, input_df = read_columns(infile1, c=c,
+_, input_df = read_columns(
-c_option=column_option,
+infile1,
-return_df=True,
+c=c,
-sep='\t', header='infer',
+c_option=column_option,
-parse_dates=True)
+return_df=True,
+sep="\t",
+header="infer",
+parse_dates=True,
+)
 feature_names = input_df.columns.values
 if isinstance(estimator, Pipeline):
 for st in estimator.steps[:-1]:
 if isinstance(st[-1], SelectorMixin):
 mask = st[-1].get_support()
 feature_names = feature_names[mask]
 estimator = estimator.steps[-1][-1]
-if hasattr(estimator, 'coef_'):
+if hasattr(estimator, "coef_"):
 coefs = estimator.coef_
 else:
-coefs = getattr(estimator, 'feature_importances_', None)
+coefs = getattr(estimator, "feature_importances_", None)
 if coefs is None:
-raise RuntimeError('The classifier does not expose '
+raise RuntimeError("The classifier does not expose " '"coef_" or "feature_importances_" ' "attributes")
-'"coef_" or "feature_importances_" '
-'attributes')
+threshold = params["plotting_selection"]["threshold"]
-threshold = params['plotting_selection']['threshold']
 if threshold is not None:
 mask = (coefs > threshold) | (coefs < -threshold)
 coefs = coefs[mask]
 feature_names = feature_names[mask]
 # sort
 indices = np.argsort(coefs)[::-1]
-trace = go.Bar(x=feature_names[indices],
+trace = go.Bar(x=feature_names[indices], y=coefs[indices])
-y=coefs[indices])
 layout = go.Layout(title=title or "Feature Importances")
 fig = go.Figure(data=[trace], layout=layout)
-plotly.offline.plot(fig, filename="output.html",
+plotly.offline.plot(fig, filename="output.html", auto_open=False)
-auto_open=False)
 # to be discovered by `from_work_dir`
-os.rename('output.html', 'output')
+os.rename("output.html", "output")
 return 0
-elif plot_type in ('pr_curve', 'roc_curve'):
+elif plot_type in ("pr_curve", "roc_curve"):
-df1 = pd.read_csv(infile1, sep='\t', header='infer')
+df1 = pd.read_csv(infile1, sep="\t", header="infer")
-df2 = pd.read_csv(infile2, sep='\t', header='infer').astype(np.float32)
+df2 = pd.read_csv(infile2, sep="\t", header="infer").astype(np.float32)
-minimum = params['plotting_selection']['report_minimum_n_positives']
+minimum = params["plotting_selection"]["report_minimum_n_positives"]
 # filter out columns whose n_positives is below the threshold
 if minimum:
 mask = df1.sum(axis=0) >= minimum
 df1 = df1.loc[:, mask]
 df2 = df2.loc[:, mask]
-pos_label = params['plotting_selection']['pos_label'].strip() \
+pos_label = params["plotting_selection"]["pos_label"].strip() or None
-or None
+if plot_type == "pr_curve":
-if plot_type == 'pr_curve':
+if plot_format == "plotly_html":
-if plot_format == 'plotly_html':
 visualize_pr_curve_plotly(df1, df2, pos_label, title=title)
 else:
 visualize_pr_curve_matplotlib(df1, df2, pos_label, title)
-else:          # 'roc_curve'
+else:  # 'roc_curve'
-drop_intermediate = (params['plotting_selection']
+drop_intermediate = params["plotting_selection"]["drop_intermediate"]
-['drop_intermediate'])
+if plot_format == "plotly_html":
-if plot_format == 'plotly_html':
+visualize_roc_curve_plotly(
-visualize_roc_curve_plotly(df1, df2, pos_label,
+df1,
-drop_intermediate=drop_intermediate,
+df2,
-title=title)
+pos_label,
+drop_intermediate=drop_intermediate,
+title=title,
+)
 else:
 visualize_roc_curve_matplotlib(
-df1, df2, pos_label,
+df1,
+df2,
+pos_label,
 drop_intermediate=drop_intermediate,
-title=title)
+title=title,
+)
 return 0
-elif plot_type == 'rfecv_gridscores':
+elif plot_type == "rfecv_gridscores":
-input_df = pd.read_csv(infile1, sep='\t', header='infer')
+input_df = pd.read_csv(infile1, sep="\t", header="infer")
 scores = input_df.iloc[:, 0]
-steps = params['plotting_selection']['steps'].strip()
+steps = params["plotting_selection"]["steps"].strip()
 steps = safe_eval(steps)
 data = go.Scatter(
 x=list(range(len(scores))),
 y=scores,
 text=[str(_) for _ in steps] if steps else None,
-mode='lines'
+mode="lines",
 )
 layout = go.Layout(
 xaxis=dict(title="Number of features selected"),
 yaxis=dict(title="Cross validation score"),
-title=dict(
+title=dict(text=title or None, x=0.5, y=0.92, xanchor="center", yanchor="top"),
-text=title or None,
+font=dict(family="sans-serif", size=11),
-x=0.5,
-y=0.92,
-xanchor='center',
-yanchor='top'
-),
-font=dict(
-family="sans-serif",
-size=11
-),
 # control background colors
-plot_bgcolor='rgba(255,255,255,0)'
+plot_bgcolor="rgba(255,255,255,0)",
 )
 """
 # legend=dict(
 # x=0.95,
 # y=0,
 # borderwidth=2
 # ),
 """
 fig = go.Figure(data=[data], layout=layout)
-plotly.offline.plot(fig, filename="output.html",
+plotly.offline.plot(fig, filename="output.html", auto_open=False)
-auto_open=False)
 # to be discovered by `from_work_dir`
-os.rename('output.html', 'output')
+os.rename("output.html", "output")
 return 0
-elif plot_type == 'learning_curve':
+elif plot_type == "learning_curve":
-input_df = pd.read_csv(infile1, sep='\t', header='infer')
+input_df = pd.read_csv(infile1, sep="\t", header="infer")
-plot_std_err = params['plotting_selection']['plot_std_err']
+plot_std_err = params["plotting_selection"]["plot_std_err"]
 data1 = go.Scatter(
-x=input_df['train_sizes_abs'],
+x=input_df["train_sizes_abs"],
-y=input_df['mean_train_scores'],
+y=input_df["mean_train_scores"],
-error_y=dict(
+error_y=dict(array=input_df["std_train_scores"]) if plot_std_err else None,
-array=input_df['std_train_scores']
+mode="lines",
-) if plot_std_err else None,
-mode='lines',
 name="Train Scores",
 )
 data2 = go.Scatter(
-x=input_df['train_sizes_abs'],
+x=input_df["train_sizes_abs"],
-y=input_df['mean_test_scores'],
+y=input_df["mean_test_scores"],
-error_y=dict(
+error_y=dict(array=input_df["std_test_scores"]) if plot_std_err else None,
-array=input_df['std_test_scores']
+mode="lines",
-) if plot_std_err else None,
-mode='lines',
 name="Test Scores",
 )
 layout = dict(
-xaxis=dict(
+xaxis=dict(title="No. of samples"),
-title='No. of samples'
+yaxis=dict(title="Performance Score"),
-),
-yaxis=dict(
-title='Performance Score'
-),
 # modify these configurations to customize image
 title=dict(
-text=title or 'Learning Curve',
+text=title or "Learning Curve",
 x=0.5,
 y=0.92,
-xanchor='center',
+xanchor="center",
-yanchor='top'
+yanchor="top",
 ),
-font=dict(
+font=dict(family="sans-serif", size=11),
-family="sans-serif",
-size=11
-),
 # control background colors
-plot_bgcolor='rgba(255,255,255,0)'
+plot_bgcolor="rgba(255,255,255,0)",
 )
 """
 # legend=dict(
 # x=0.95,
 # y=0,
 # borderwidth=2
 # ),
 """
 fig = go.Figure(data=[data1, data2], layout=layout)
-plotly.offline.plot(fig, filename="output.html",
+plotly.offline.plot(fig, filename="output.html", auto_open=False)
-auto_open=False)
 # to be discovered by `from_work_dir`
-os.rename('output.html', 'output')
+os.rename("output.html", "output")
 return 0
-elif plot_type == 'keras_plot_model':
+elif plot_type == "keras_plot_model":
-with open(model_config, 'r') as f:
+with open(model_config, "r") as f:
 model_str = f.read()
 model = model_from_json(model_str)
 plot_model(model, to_file="output.png")
-os.rename('output.png', 'output')
+os.rename("output.png", "output")
 return 0
-elif plot_type == 'classification_confusion_matrix':
+elif plot_type == "classification_confusion_matrix":
 plot_selection = params["plotting_selection"]
 input_true = get_dataframe(true_labels, plot_selection, "header_true", "column_selector_options_true")
-header_predicted = 'infer' if plot_selection["header_predicted"] else None
+header_predicted = "infer" if plot_selection["header_predicted"] else None
-input_predicted = pd.read_csv(predicted_labels, sep='\t', parse_dates=True, header=header_predicted)
+input_predicted = pd.read_csv(predicted_labels, sep="\t", parse_dates=True, header=header_predicted)
 true_classes = input_true.iloc[:, -1].copy()
 predicted_classes = input_predicted.iloc[:, -1].copy()
 axis_labels = list(set(true_classes))
 c_matrix = confusion_matrix(true_classes, predicted_classes)
 fig, ax = plt.subplots(figsize=(7, 7))
 im = plt.imshow(c_matrix, cmap=plot_color)
 for i in range(len(c_matrix)):
 for j in range(len(c_matrix)):
 ax.text(j, i, c_matrix[i, j], ha="center", va="center", color="k")
-ax.set_ylabel('True class labels')
+ax.set_ylabel("True class labels")
-ax.set_xlabel('Predicted class labels')
+ax.set_xlabel("Predicted class labels")
 ax.set_title(title)
 ax.set_xticks(axis_labels)
 ax.set_yticks(axis_labels)
 fig.colorbar(im, ax=ax)
 fig.tight_layout()
 plt.savefig("output.png", dpi=125)
-os.rename('output.png', 'output')
+os.rename("output.png", "output")
 return 0
 # save pdf file to disk
 # fig.write_image("image.pdf", format='pdf')
 # fig.write_image("image.pdf", format='pdf', width=340*2, height=226*2)
-if __name__ == '__main__':
+if __name__ == "__main__":
 aparser = argparse.ArgumentParser()
 aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
 aparser.add_argument("-e", "--estimator", dest="infile_estimator")
 aparser.add_argument("-X", "--infile1", dest="infile1")
 aparser.add_argument("-y", "--infile2", dest="infile2")
 aparser.add_argument("-pl", "--predicted_labels", dest="predicted_labels")
 aparser.add_argument("-pc", "--plot_color", dest="plot_color")
 aparser.add_argument("-pt", "--title", dest="title")
 args = aparser.parse_args()
-main(args.inputs, args.infile_estimator, args.infile1, args.infile2,
+main(
-args.outfile_result, outfile_object=args.outfile_object,
+args.inputs,
-groups=args.groups, ref_seq=args.ref_seq, intervals=args.intervals,
+args.infile_estimator,
-targets=args.targets, fasta_path=args.fasta_path,
+args.infile1,
-model_config=args.model_config, true_labels=args.true_labels,
+args.infile2,
-predicted_labels=args.predicted_labels,
+args.outfile_result,
-plot_color=args.plot_color,
+outfile_object=args.outfile_object,
-title=args.title)
+groups=args.groups,
+ref_seq=args.ref_seq,
+intervals=args.intervals,
+targets=args.targets,
+fasta_path=args.fasta_path,
+model_config=args.model_config,
+true_labels=args.true_labels,
+predicted_labels=args.predicted_labels,
+plot_color=args.plot_color,
+title=args.title,
+)

Mercurial > repos > bgruening > sklearn_train_test_eval

comparison ml_visualization_ex.py @ 9:ead7adad8d0e draft