add new example regarding svm hyperparameter plotting (#834)

mfeurer · web-flow · commit c40e474fff97 · 2019-10-16T22:17:26.000+02:00
* add new example regarding svm hyperparameter plotting

* implement Neeratyoy's suggestions

* add title & fix pep8
diff --git a/examples/30_extended/plot_svm_hyperparameters_tutorial.py b/examples/30_extended/plot_svm_hyperparameters_tutorial.py
@@ -0,0 +1,78 @@
+"""
+================================
+Plotting hyperparameter surfaces
+================================
+"""
+import openml
+import numpy as np
+
+####################################################################################################
+# First step - obtaining the data
+# ===============================
+# First, we need to choose an SVM flow, for example 8353, and a task. Finding their IDs is not
+# part of this tutorial; this could for example be done via the website.
+#
+# For this we use the function ``list_evaluations_setups`` which can automatically join
+# evaluations conducted by the server with the hyperparameter settings extracted from the
+# uploaded runs (called *setup*).
+df = openml.evaluations.list_evaluations_setups(
+    function='predictive_accuracy',
+    flow=[8353],
+    task=[6],
+    output_format='dataframe',
+    # Using this flag incorporates the hyperparameters into the returned dataframe. Otherwise,
+    # the dataframe would contain a field ``parameters`` containing an unparsed dictionary.
+    parameters_in_separate_columns=True,
+)
+print(df.head(n=10))
+
+####################################################################################################
+# We can see all the hyperparameter names in the columns of the dataframe:
+for name in df.columns:
+    print(name)
+
+####################################################################################################
+# Next, we cast the hyperparameters of interest (``C`` and ``gamma``) to float and transform them
+# to a log10 scale (matching the axis labels used below) so that we can nicely plot them.
+hyperparameters = ['sklearn.svm.classes.SVC(16)_C', 'sklearn.svm.classes.SVC(16)_gamma']
+df[hyperparameters] = df[hyperparameters].astype(float).apply(np.log10)
+
+####################################################################################################
+# Option 1 - plotting via the pandas helper functions
+# ===================================================
+# Each hexagonal bin is colored by the mean ``value`` (accuracy) of the runs falling into it.
+df.plot.hexbin(
+    x=hyperparameters[0],
+    y=hyperparameters[1],
+    C='value',
+    reduce_C_function=np.mean,
+    gridsize=25,
+    title='SVM performance landscape',
+)
+
+####################################################################################################
+# Option 2 - plotting via matplotlib
+# ==================================
+# Here we draw every evaluation as a dot and lay a filled contour plot of the accuracy on top.
+import matplotlib.pyplot as plt
+
+fig, ax = plt.subplots()
+
+C = df[hyperparameters[0]]
+gamma = df[hyperparameters[1]]
+score = df['value']
+
+# Plotting all evaluations:
+ax.plot(C, gamma, 'ko', ms=1)
+# Create a contour plot
+cntr = ax.tricontourf(C, gamma, score, levels=12, cmap="RdBu_r")
+# Adjusting the colorbar
+fig.colorbar(cntr, ax=ax, label="accuracy")
+# Adjusting the axis limits (``Series.min``/``max`` skip missing values)
+ax.set(
+    xlim=(C.min(), C.max()),
+    ylim=(gamma.min(), gamma.max()),
+    xlabel="C (log10)",
+    ylabel="gamma (log10)",
+)
+ax.set_title('SVM performance landscape')
diff --git a/tests/test_evaluations/test_evaluations_example.py b/tests/test_evaluations/test_evaluations_example.py
@@ -0,0 +1,31 @@
+import unittest
+
+
+class TestEvaluationsExample(unittest.TestCase):
+
+    def test_example_python_paper(self):
+        # Example script which will appear in the upcoming OpenML-Python paper
+        # This test ensures that the example will keep running!
+
+        import openml
+        import numpy as np
+        import matplotlib.pyplot as plt
+
+        df = openml.evaluations.list_evaluations_setups(
+            function='predictive_accuracy',
+            flow=[8353],
+            task=[6],
+            output_format='dataframe',
+            parameters_in_separate_columns=True,
+        )  # Choose an SVM flow, for example 8353, and a task.
+
+        hp_names = ['sklearn.svm.classes.SVC(16)_C', 'sklearn.svm.classes.SVC(16)_gamma']
+        # Use log10 so that the plotted values agree with the "(log10)" axis labels below.
+        df[hp_names] = df[hp_names].astype(float).apply(np.log10)
+        C, gamma, score = df[hp_names[0]], df[hp_names[1]], df['value']
+        cntr = plt.tricontourf(C, gamma, score, levels=12, cmap="RdBu_r")
+        plt.colorbar(cntr, label="accuracy")
+        plt.xlim((min(C), max(C)))
+        plt.ylim((min(gamma), max(gamma)))
+        plt.xlabel("C (log10)")
+        plt.ylabel("gamma (log10)")