sklearn_train_test_split: ml_visualization

comparison ml_visualization_ex.py @ 2:5a092779412e draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 5b2ac730ec6d3b762faa9034eddd19ad1b347476"

author	bgruening
date	Mon, 16 Dec 2019 05:34:17 -0500
parents	0985b0dd6f1a
children	145208b3579d

comparison

equal deleted inserted replaced

-:910ebff96ddc
+:5a092779412e
 import argparse
 import json
+import matplotlib
+import matplotlib.pyplot as plt
 import numpy as np
+import os
 import pandas as pd
 import plotly
 import plotly.graph_objs as go
 import warnings
 from galaxy_ml.utils import load_model, read_columns, SafeEval
 safe_eval = SafeEval()
# Plotly's default trace colors; the plotly curve plotters below pick from
# this list by curve index, wrapping around when there are more curves
# than colors.
default_colors = [
    "#1f77b4",  # muted blue
    "#ff7f0e",  # safety orange
    "#2ca02c",  # cooked asparagus green
    "#d62728",  # brick red
    "#9467bd",  # muted purple
    "#8c564b",  # chestnut brown
    "#e377c2",  # raspberry yogurt pink
    "#7f7f7f",  # middle gray
    "#bcbd22",  # curry yellow-green
    "#17becf",  # blue-teal
]
def visualize_pr_curve_plotly(df1, df2, pos_label, title=None):
    """Write precision-recall curves to an HTML file using plotly.

    One curve is drawn per column of `df1`; column ``i`` of `df1` is
    paired by position with column ``i`` of `df2`. The figure is saved
    as ``output.html`` in the current directory and then renamed to
    ``output``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Containing y_true
    df2 : pandas.DataFrame
        Containing y_score
    pos_label : label or None
        The label of positive class. It is passed unchanged to
        `precision_recall_curve`; for `average_precision_score` a falsy
        value is replaced by 1.
    title : str
        Plot title. Falls back to 'Precision-Recall Curve' when falsy.
    """
    # Both axes share the same line styling.
    axis_line = dict(linecolor='lightslategray', linewidth=1)
    layout = go.Layout(
        xaxis=dict(title='Recall', **axis_line),
        yaxis=dict(title='Precision', **axis_line),
        title=dict(
            text=title or 'Precision-Recall Curve',
            x=0.5,
            y=0.92,
            xanchor='center',
            yanchor='top'
        ),
        font=dict(family="sans-serif", size=11),
        # control background colors
        plot_bgcolor='rgba(255,255,255,0)'
        # Optional legend customization, kept for reference:
        # legend=dict(
        #     x=0.95,
        #     y=0,
        #     traceorder="normal",
        #     font=dict(family="sans-serif", size=9, color="black"),
        #     bgcolor="LightSteelBlue",
        #     bordercolor="Black",
        #     borderwidth=2
        # ),
    )

    traces = []
    n_colors = len(default_colors)
    for col_idx in range(df1.shape[1]):
        truth = df1.iloc[:, col_idx].values
        scores = df2.iloc[:, col_idx].values

        precision, recall, _ = precision_recall_curve(
            truth, scores, pos_label=pos_label)
        avg_precision = average_precision_score(
            truth, scores, pos_label=pos_label or 1)

        curve_color = default_colors[col_idx % n_colors]
        traces.append(go.Scatter(
            x=recall,
            y=precision,
            mode='lines',
            marker=dict(color=curve_color),
            name='%s (area = %.3f)' % (col_idx, avg_precision)
        ))

    fig = go.Figure(data=traces, layout=layout)
    plotly.offline.plot(fig, filename="output.html", auto_open=False)
    # to be discovered by `from_work_dir`
    os.rename('output.html', 'output')
def visualize_pr_curve_matplotlib(df1, df2, pos_label, title=None):
    """Draw precision-recall curves with matplotlib and save them as SVG.

    One step curve is drawn per column of `df1`; column ``i`` of `df1`
    is paired by position with column ``i`` of `df2`. The image is
    written to ``output.svg`` in the current working directory and then
    renamed to ``output``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Containing y_true
    df2 : pandas.DataFrame
        Containing y_score
    pos_label : label or None
        The label of positive class. It is passed unchanged to
        `precision_recall_curve`; for `average_precision_score` a falsy
        value is replaced by 1.
    title : str
        Plot title. Falls back to 'Precision-Recall Curve' when falsy.
    """
    backend = matplotlib.get_backend()
    if "inline" not in backend:
        matplotlib.use("SVG")

    # Matplotlib 3.6 renamed the bundled seaborn styles with a "v0_8"
    # infix; try the new name first and fall back to the legacy one so
    # both old and new releases work. If neither exists the default
    # style is used.
    for style_name in ('seaborn-v0_8-colorblind', 'seaborn-colorblind'):
        try:
            plt.style.use(style_name)
        except OSError:
            continue
        break

    fig = plt.figure()
    try:
        for idx in range(df1.shape[1]):
            y_true = df1.iloc[:, idx].values
            y_score = df2.iloc[:, idx].values

            precision, recall, _ = precision_recall_curve(
                y_true, y_score, pos_label=pos_label)
            ap = average_precision_score(
                y_true, y_score, pos_label=pos_label or 1)

            # No 'r-' format string here: it conflicted with the explicit
            # color keyword, which already took precedence (solid black).
            plt.step(recall, precision, color="black", alpha=0.3,
                     lw=1, where="post",
                     label='%s (area = %.3f)' % (idx, ap))

        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title(title or 'Precision-Recall Curve')

        folder = os.getcwd()
        svg_path = os.path.join(folder, "output.svg")
        plt.savefig(svg_path, format="svg")
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

    os.rename(svg_path, os.path.join(folder, "output"))
def visualize_roc_curve_plotly(df1, df2, pos_label,
                               drop_intermediate=True,
                               title=None):
    """Write ROC curves to an HTML file using plotly.

    One curve is drawn per column of `df1`; column ``i`` of `df1` is
    paired by position with column ``i`` of `df2`. The figure is saved
    as ``output.html`` in the current directory and then renamed to
    ``output``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Containing y_true
    df2 : pandas.DataFrame
        Containing y_score
    pos_label : label or None
        The label of positive class, passed unchanged to `roc_curve`.
    drop_intermediate : bool
        Whether to drop some suboptimal thresholds
    title : str
        Plot title. Falls back to
        'Receiver Operating Characteristic (ROC) Curve' when falsy.
    """
    # Both axes share the same line styling.
    axis_line = dict(linecolor='lightslategray', linewidth=1)
    layout = go.Layout(
        xaxis=dict(title='False Positive Rate', **axis_line),
        yaxis=dict(title='True Positive Rate', **axis_line),
        title=dict(
            text=title or 'Receiver Operating Characteristic (ROC) Curve',
            x=0.5,
            y=0.92,
            xanchor='center',
            yanchor='top'
        ),
        font=dict(family="sans-serif", size=11),
        # control background colors
        plot_bgcolor='rgba(255,255,255,0)'
        # Optional legend customization, kept for reference:
        # legend=dict(
        #     x=0.95,
        #     y=0,
        #     traceorder="normal",
        #     font=dict(family="sans-serif", size=9, color="black"),
        #     bgcolor="LightSteelBlue",
        #     bordercolor="Black",
        #     borderwidth=2
        # ),
    )

    traces = []
    n_colors = len(default_colors)
    for col_idx in range(df1.shape[1]):
        truth = df1.iloc[:, col_idx].values
        scores = df2.iloc[:, col_idx].values

        fpr, tpr, _ = roc_curve(truth, scores, pos_label=pos_label,
                                drop_intermediate=drop_intermediate)
        area = auc(fpr, tpr)

        curve_color = default_colors[col_idx % n_colors]
        traces.append(go.Scatter(
            x=fpr,
            y=tpr,
            mode='lines',
            marker=dict(color=curve_color),
            name='%s (area = %.3f)' % (col_idx, area)
        ))

    fig = go.Figure(data=traces, layout=layout)
    plotly.offline.plot(fig, filename="output.html", auto_open=False)
    # to be discovered by `from_work_dir`
    os.rename('output.html', 'output')
def visualize_roc_curve_matplotlib(df1, df2, pos_label,
                                   drop_intermediate=True,
                                   title=None):
    """Draw ROC curves with matplotlib and save them as SVG.

    One step curve is drawn per column of `df1`; column ``i`` of `df1`
    is paired by position with column ``i`` of `df2`. The image is
    written to ``output.svg`` in the current working directory and then
    renamed to ``output``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Containing y_true
    df2 : pandas.DataFrame
        Containing y_score
    pos_label : label or None
        The label of positive class, passed unchanged to `roc_curve`.
    drop_intermediate : bool
        Whether to drop some suboptimal thresholds
    title : str
        Plot title. Falls back to
        'Receiver Operating Characteristic (ROC) Curve' when falsy.
    """
    backend = matplotlib.get_backend()
    if "inline" not in backend:
        matplotlib.use("SVG")

    # Matplotlib 3.6 renamed the bundled seaborn styles with a "v0_8"
    # infix; try the new name first and fall back to the legacy one so
    # both old and new releases work. If neither exists the default
    # style is used.
    for style_name in ('seaborn-v0_8-colorblind', 'seaborn-colorblind'):
        try:
            plt.style.use(style_name)
        except OSError:
            continue
        break

    fig = plt.figure()
    try:
        for idx in range(df1.shape[1]):
            y_true = df1.iloc[:, idx].values
            y_score = df2.iloc[:, idx].values

            fpr, tpr, _ = roc_curve(y_true, y_score, pos_label=pos_label,
                                    drop_intermediate=drop_intermediate)
            roc_auc = auc(fpr, tpr)

            # No 'r-' format string here: it conflicted with the explicit
            # color keyword, which already took precedence (solid black).
            plt.step(fpr, tpr, color="black", alpha=0.3, lw=1,
                     where="post",
                     label='%s (area = %.3f)' % (idx, roc_auc))

        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(title or 'Receiver Operating Characteristic (ROC) Curve')

        folder = os.getcwd()
        svg_path = os.path.join(folder, "output.svg")
        plt.savefig(svg_path, format="svg")
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

    os.rename(svg_path, os.path.join(folder, "output"))
 def main(inputs, infile_estimator=None, infile1=None,
 infile2=None, outfile_result=None,
 outfile_object=None, groups=None,
 ref_seq=None, intervals=None,
 with open(inputs, 'r') as param_handler:
 params = json.load(param_handler)
 title = params['plotting_selection']['title'].strip()
 plot_type = params['plotting_selection']['plot_type']
+plot_format = params['plotting_selection']['plot_format']
 if plot_type == 'feature_importances':
 with open(infile_estimator, 'rb') as estimator_handler:
 estimator = load_model(estimator_handler)
 column_option = (params['plotting_selection']
 trace = go.Bar(x=feature_names[indices],
 y=coefs[indices])
 layout = go.Layout(title=title or "Feature Importances")
 fig = go.Figure(data=[trace], layout=layout)
-elif plot_type == 'pr_curve':
+plotly.offline.plot(fig, filename="output.html",
-df1 = pd.read_csv(infile1, sep='\t', header=None)
+auto_open=False)
-df2 = pd.read_csv(infile2, sep='\t', header=None)
+# to be discovered by `from_work_dir`
+os.rename('output.html', 'output')
-precision = {}
-recall = {}
+return 0
-ap = {}
+elif plot_type in ('pr_curve', 'roc_curve'):
+df1 = pd.read_csv(infile1, sep='\t', header='infer')
+df2 = pd.read_csv(infile2, sep='\t', header='infer').astype(np.float32)
+minimum = params['plotting_selection']['report_minimum_n_positives']
+# filter out columns whose n_positives is below the threshold
+if minimum:
+mask = df1.sum(axis=0) >= minimum
+df1 = df1.loc[:, mask]
+df2 = df2.loc[:, mask]
 pos_label = params['plotting_selection']['pos_label'].strip() \
 or None
-for col in df1.columns:
-y_true = df1[col].values
+if plot_type == 'pr_curve':
-y_score = df2[col].values
+if plot_format == 'plotly_html':
+visualize_pr_curve_plotly(df1, df2, pos_label, title=title)
-precision[col], recall[col], _ = precision_recall_curve(
+else:
-y_true, y_score, pos_label=pos_label)
+visualize_pr_curve_matplotlib(df1, df2, pos_label, title)
-ap[col] = average_precision_score(
+else:          # 'roc_curve'
-y_true, y_score, pos_label=pos_label or 1)
+drop_intermediate = (params['plotting_selection']
+['drop_intermediate'])
-if len(df1.columns) > 1:
+if plot_format == 'plotly_html':
-precision["micro"], recall["micro"], _ = precision_recall_curve(
+visualize_roc_curve_plotly(df1, df2, pos_label,
-df1.values.ravel(), df2.values.ravel(), pos_label=pos_label)
+drop_intermediate=drop_intermediate,
-ap['micro'] = average_precision_score(
+title=title)
-df1.values, df2.values, average='micro',
+else:
-pos_label=pos_label or 1)
+visualize_roc_curve_matplotlib(
+df1, df2, pos_label,
-data = []
+drop_intermediate=drop_intermediate,
-for key in precision.keys():
+title=title)
-trace = go.Scatter(
-x=recall[key],
+return 0
-y=precision[key],
-mode='lines',
-name='%s (area = %.2f)' % (key, ap[key]) if key == 'micro'
-else 'column %s (area = %.2f)' % (key, ap[key])
-)
-data.append(trace)
-layout = go.Layout(
-title=title or "Precision-Recall curve",
-xaxis=dict(title='Recall'),
-yaxis=dict(title='Precision')
-)
-fig = go.Figure(data=data, layout=layout)
-elif plot_type == 'roc_curve':
-df1 = pd.read_csv(infile1, sep='\t', header=None)
-df2 = pd.read_csv(infile2, sep='\t', header=None)
-fpr = {}
-tpr = {}
-roc_auc = {}
-pos_label = params['plotting_selection']['pos_label'].strip() \
-or None
-for col in df1.columns:
-y_true = df1[col].values
-y_score = df2[col].values
-fpr[col], tpr[col], _ = roc_curve(
-y_true, y_score, pos_label=pos_label)
-roc_auc[col] = auc(fpr[col], tpr[col])
-if len(df1.columns) > 1:
-fpr["micro"], tpr["micro"], _ = roc_curve(
-df1.values.ravel(), df2.values.ravel(), pos_label=pos_label)
-roc_auc['micro'] = auc(fpr["micro"], tpr["micro"])
-data = []
-for key in fpr.keys():
-trace = go.Scatter(
-x=fpr[key],
-y=tpr[key],
-mode='lines',
-name='%s (area = %.2f)' % (key, roc_auc[key]) if key == 'micro'
-else 'column %s (area = %.2f)' % (key, roc_auc[key])
-)
-data.append(trace)
-trace = go.Scatter(x=[0, 1], y=[0, 1],
-mode='lines',
-line=dict(color='black', dash='dash'),
-showlegend=False)
-data.append(trace)
-layout = go.Layout(
-title=title or "Receiver operating characteristic curve",
-xaxis=dict(title='False Positive Rate'),
-yaxis=dict(title='True Positive Rate')
-)
-fig = go.Figure(data=data, layout=layout)
 elif plot_type == 'rfecv_gridscores':
 input_df = pd.read_csv(infile1, sep='\t', header='infer')
 scores = input_df.iloc[:, 0]
 steps = params['plotting_selection']['steps'].strip()
 mode='lines'
 )
 layout = go.Layout(
 xaxis=dict(title="Number of features selected"),
 yaxis=dict(title="Cross validation score"),
-title=title or None
+title=dict(
-)
+text=title or None,
+x=0.5,
+y=0.92,
+xanchor='center',
+yanchor='top'
+),
+font=dict(
+family="sans-serif",
+size=11
+),
+# control background colors
+plot_bgcolor='rgba(255,255,255,0)'
+)
+"""
+# legend=dict(
+# x=0.95,
+# y=0,
+# traceorder="normal",
+# font=dict(
+#    family="sans-serif",
+#    size=9,
+#    color="black"
+# ),
+# bgcolor="LightSteelBlue",
+# bordercolor="Black",
+# borderwidth=2
+# ),
+"""
 fig = go.Figure(data=[data], layout=layout)
+plotly.offline.plot(fig, filename="output.html",
+auto_open=False)
+# to be discovered by `from_work_dir`
+os.rename('output.html', 'output')
+return 0
 elif plot_type == 'learning_curve':
 input_df = pd.read_csv(infile1, sep='\t', header='infer')
 plot_std_err = params['plotting_selection']['plot_std_err']
 data1 = go.Scatter(
 title='No. of samples'
 ),
 yaxis=dict(
 title='Performance Score'
 ),
-title=title or 'Learning Curve'
+# modify these configurations to customize image
-)
+title=dict(
+text=title or 'Learning Curve',
+x=0.5,
+y=0.92,
+xanchor='center',
+yanchor='top'
+),
+font=dict(
+family="sans-serif",
+size=11
+),
+# control background colors
+plot_bgcolor='rgba(255,255,255,0)'
+)
+"""
+# legend=dict(
+# x=0.95,
+# y=0,
+# traceorder="normal",
+# font=dict(
+#    family="sans-serif",
+#    size=9,
+#    color="black"
+# ),
+# bgcolor="LightSteelBlue",
+# bordercolor="Black",
+# borderwidth=2
+# ),
+"""
 fig = go.Figure(data=[data1, data2], layout=layout)
+plotly.offline.plot(fig, filename="output.html",
+auto_open=False)
+# to be discovered by `from_work_dir`
+os.rename('output.html', 'output')
+return 0
 elif plot_type == 'keras_plot_model':
 with open(model_config, 'r') as f:
 model_str = f.read()
 model = model_from_json(model_str)
 plot_model(model, to_file="output.png")
-__import__('os').rename('output.png', 'output')
+os.rename('output.png', 'output')
 return 0
-plotly.offline.plot(fig, filename="output.html",
+# save pdf file to disk
-auto_open=False)
+# fig.write_image("image.pdf", format='pdf')
-# to be discovered by `from_work_dir`
+# fig.write_image("image.pdf", format='pdf', width=340*2, height=226*2)
-__import__('os').rename('output.html', 'output')
 if __name__ == '__main__':
 aparser = argparse.ArgumentParser()
 aparser.add_argument("-i", "--inputs", dest="inputs", required=True)

Mercurial > repos > bgruening > sklearn_train_test_split

comparison ml_visualization_ex.py @ 2:5a092779412e draft