openml
diff --git a/examples/datasets_tutorial.py b/examples/datasets_tutorial.py
Lines changed: 10 additions & 2 deletions
diff --git a/examples/tasks_tutorial.py b/examples/tasks_tutorial.py
Lines changed: 4 additions & 0 deletions
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
Lines changed: 57 additions & 23 deletions
diff --git a/openml/evaluations/functions.py b/openml/evaluations/functions.py
Lines changed: 92 additions & 27 deletions
@@ -10,8 +10,12 @@
 import pandas as pd
 
 ############################################################################
-# List datasets
-# =============
+# Exercise 0
+# **********
+#
+# * List datasets
+#   * Use the output_format parameter to select output type
+#   * Default gives 'dict' (other option: 'dataframe')
 
 openml_list = openml.datasets.list_datasets()  # returns a dict
 
@@ -25,6 +29,10 @@
 print("First 10 of %s datasets..." % len(datalist))
 datalist.head(n=10)
 
+# The same can be done with fewer lines of code
+openml_df = openml.datasets.list_datasets(output_format='dataframe')
+openml_df.head(n=10)
+
 ############################################################################
 # Exercise 1
 # **********
 
@@ -42,6 +42,10 @@
 print("First 5 of %s tasks:" % len(tasks))
 pprint(tasks.head())
 
+# The same can be obtained with fewer lines of code
+tasks_df = openml.tasks.list_tasks(task_type_id=1, output_format='dataframe')
+pprint(tasks_df.head())
+
 ############################################################################
 # We can filter the list of tasks to only contain datasets with more than
 # 500 samples, but less than 1000 samples:
 
@@ -82,7 +82,9 @@ def _get_cached_datasets():
     return datasets
 
 
-def _get_cached_dataset(dataset_id):
+def _get_cached_dataset(
+    dataset_id: int
+) -> OpenMLDataset:
     """Get cached dataset for ID.
 
     Returns
@@ -163,7 +165,14 @@ def _get_cache_directory(dataset: OpenMLDataset) -> str:
     return _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset.dataset_id)
 
 
-def list_datasets(offset=None, size=None, status=None, tag=None, **kwargs):
+def list_datasets(
+    offset: Optional[int] = None,
+    size: Optional[int] = None,
+    status: Optional[str] = None,
+    tag: Optional[str] = None,
+    output_format: str = 'dict',
+    **kwargs
+) -> Union[Dict, pd.DataFrame]:
 
     """
     Return a list of all dataset which are on OpenML.
@@ -180,61 +189,83 @@ def list_datasets(offset=None, size=None, status=None, tag=None, **kwargs):
         default active datasets are returned, but also datasets
         from another status can be requested.
     tag : str, optional
+    output_format: str, optional (default='dict')
+        The parameter decides the format of the output.
+        - If 'dict' the output is a dict of dicts
+        - If 'dataframe' the output is a pandas DataFrame
     kwargs : dict, optional
         Legal filter operators (keys in the dict):
         data_name, data_version, number_instances,
         number_features, number_classes, number_missing_values.
 
     Returns
     -------
-    datasets : dict of dicts
-        A mapping from dataset ID to dict.
-
-        Every dataset is represented by a dictionary containing
-        the following information:
-        - dataset id
-        - name
-        - format
-        - status
-
-        If qualities are calculated for the dataset, some of
-        these are also returned.
+    datasets : dict of dicts, or dataframe
+        - If output_format='dict'
+            A mapping from dataset ID to dict.
+
+            Every dataset is represented by a dictionary containing
+            the following information:
+            - dataset id
+            - name
+            - format
+            - status
+            If qualities are calculated for the dataset, some of
+            these are also returned.
+
+        - If output_format='dataframe'
+            Each row maps to a dataset
+            Each column contains the following information:
+            - dataset id
+            - name
+            - format
+            - status
+            If qualities are calculated for the dataset, some of
+            these are also included as columns.
     """
+    if output_format not in ['dataframe', 'dict']:
+        raise ValueError("Invalid output format selected. "
+                         "Only 'dict' or 'dataframe' applicable.")
 
-    return openml.utils._list_all(_list_datasets,
+    return openml.utils._list_all(output_format=output_format,
+                                  listing_call=_list_datasets,
                                   offset=offset,
                                   size=size,
                                   status=status,
                                   tag=tag,
                                   **kwargs)
 
 
-def _list_datasets(**kwargs):
+def _list_datasets(output_format='dict', **kwargs):
 
     """
     Perform api call to return a list of all datasets.
 
     Parameters
     ----------
+    output_format: str, optional (default='dict')
+        The parameter decides the format of the output.
+        - If 'dict' the output is a dict of dicts
+        - If 'dataframe' the output is a pandas DataFrame
     kwargs : dict, optional
         Legal filter operators (keys in the dict):
         tag, status, limit, offset, data_name, data_version, number_instances,
         number_features, number_classes, number_missing_values.
 
     Returns
     -------
-    datasets : dict of dicts
+    datasets : dict of dicts, or dataframe
     """
 
     api_call = "data/list"
 
     if kwargs is not None:
         for operator, value in kwargs.items():
             api_call += "/%s/%s" % (operator, value)
-    return __list_datasets(api_call)
+    return __list_datasets(api_call=api_call, output_format=output_format)
 
 
-def __list_datasets(api_call):
+def __list_datasets(api_call, output_format='dict'):
 
     xml_string = openml._api_calls._perform_api_call(api_call, 'get')
     datasets_dict = xmltodict.parse(xml_string, force_list=('oml:dataset',))
@@ -262,6 +293,9 @@ def __list_datasets(api_call):
                 dataset[quality['@name']] = float(quality['#text'])
         datasets[dataset['did']] = dataset
 
+    if output_format == 'dataframe':
+        datasets = pd.DataFrame.from_dict(datasets, orient='index')
+
     return datasets
 
 
@@ -341,8 +375,8 @@ def _name_to_id(
 
 
 def get_datasets(
-        dataset_ids: List[Union[str, int]],
-        download_data: bool = True,
+    dataset_ids: List[Union[str, int]],
+    download_data: bool = True,
 ) -> List[OpenMLDataset]:
     """Download datasets.
 
@@ -667,8 +701,8 @@ def create_dataset(name, description, creator, contributor,
                              do not construct a valid ARFF file")
 
     return OpenMLDataset(
-        name,
-        description,
+        name=name,
+        description=description,
         data_format=data_format,
         creator=creator,
         contributor=contributor,
 
@@ -1,14 +1,26 @@
 import json
 import xmltodict
+import pandas as pd
+from typing import Union, List, Optional, Dict
 
 import openml.utils
 import openml._api_calls
 from ..evaluations import OpenMLEvaluation
 
 
-def list_evaluations(function, offset=None, size=None, id=None, task=None,
-                     setup=None, flow=None, uploader=None, tag=None,
-                     per_fold=None):
+def list_evaluations(
+    function: str,
+    offset: Optional[int] = None,
+    size: Optional[int] = None,
+    id: Optional[List] = None,
+    task: Optional[List] = None,
+    setup: Optional[List] = None,
+    flow: Optional[List] = None,
+    uploader: Optional[List] = None,
+    tag: Optional[str] = None,
+    per_fold: Optional[bool] = None,
+    output_format: str = 'object'
+) -> Union[Dict, pd.DataFrame]:
     """
     List all run-evaluation pairs matching all of the given filters.
     (Supports large amount of results)
@@ -36,21 +48,48 @@ def list_evaluations(function, offset=None, size=None, id=None, task=None,
 
     per_fold : bool, optional
 
+    output_format: str, optional (default='object')
+        The parameter decides the format of the output.
+        - If 'object' the output is a dict of OpenMLEvaluation objects
+        - If 'dict' the output is a dict of dicts
+        - If 'dataframe' the output is a pandas DataFrame
+
     Returns
     -------
-    dict
+    dict or dataframe
     """
-    if per_fold is not None:
-        per_fold = str(per_fold).lower()
-
-    return openml.utils._list_all(_list_evaluations, function, offset=offset,
-                                  size=size, id=id, task=task, setup=setup,
-                                  flow=flow, uploader=uploader, tag=tag,
-                                  per_fold=per_fold)
+    if output_format not in ['dataframe', 'dict', 'object']:
+        raise ValueError("Invalid output format selected. "
+                         "Only 'object', 'dataframe', or 'dict' applicable.")
 
-
-def _list_evaluations(function, id=None, task=None,
-                      setup=None, flow=None, uploader=None, **kwargs):
+    per_fold_str = None
+    if per_fold is not None:
+        per_fold_str = str(per_fold).lower()
+
+    return openml.utils._list_all(output_format=output_format,
+                                  listing_call=_list_evaluations,
+                                  function=function,
+                                  offset=offset,
+                                  size=size,
+                                  id=id,
+                                  task=task,
+                                  setup=setup,
+                                  flow=flow,
+                                  uploader=uploader,
+                                  tag=tag,
+                                  per_fold=per_fold_str)
+
+
+def _list_evaluations(
+    function: str,
+    id: Optional[List] = None,
+    task: Optional[List] = None,
+    setup: Optional[List] = None,
+    flow: Optional[List] = None,
+    uploader: Optional[List] = None,
+    output_format: str = 'object',
+    **kwargs
+) -> Union[Dict, pd.DataFrame]:
     """
     Perform API call ``/evaluation/function{function}/{filters}``
 
@@ -75,9 +114,17 @@ def _list_evaluations(function, id=None, task=None,
     kwargs: dict, optional
         Legal filter operators: tag, limit, offset.
 
+    output_format: str, optional (default='object')
+        The parameter decides the format of the output.
+        - If 'object' the output is a dict of OpenMLEvaluation objects,
+          keyed by run id
+        - If 'dict' the output is a dict of dicts, keyed by run id
+        - If 'dataframe' the output is a pandas DataFrame with one row
+          per run id
+
     Returns
     -------
-    dict
+    dict of objects, dict of dicts, or dataframe
     """
 
     api_call = "evaluation/list/function/%s" % function
@@ -95,10 +142,10 @@ def _list_evaluations(function, id=None, task=None,
     if uploader is not None:
         api_call += "/uploader/%s" % ','.join([str(int(i)) for i in uploader])
 
-    return __list_evaluations(api_call)
+    return __list_evaluations(api_call, output_format=output_format)
 
 
-def __list_evaluations(api_call):
+def __list_evaluations(api_call, output_format='object'):
     """Helper function to parse API calls which are lists of runs"""
     xml_string = openml._api_calls._perform_api_call(api_call, 'get')
     evals_dict = xmltodict.parse(xml_string, force_list=('oml:evaluation',))
@@ -123,15 +170,33 @@ def __list_evaluations(api_call):
         if 'oml:array_data' in eval_:
             array_data = eval_['oml:array_data']
 
-        evals[run_id] = OpenMLEvaluation(int(eval_['oml:run_id']),
-                                         int(eval_['oml:task_id']),
-                                         int(eval_['oml:setup_id']),
-                                         int(eval_['oml:flow_id']),
-                                         eval_['oml:flow_name'],
-                                         eval_['oml:data_id'],
-                                         eval_['oml:data_name'],
-                                         eval_['oml:function'],
-                                         eval_['oml:upload_time'],
-                                         value, values, array_data)
+        if output_format == 'object':
+            evals[run_id] = OpenMLEvaluation(int(eval_['oml:run_id']),
+                                             int(eval_['oml:task_id']),
+                                             int(eval_['oml:setup_id']),
+                                             int(eval_['oml:flow_id']),
+                                             eval_['oml:flow_name'],
+                                             eval_['oml:data_id'],
+                                             eval_['oml:data_name'],
+                                             eval_['oml:function'],
+                                             eval_['oml:upload_time'],
+                                             value, values, array_data)
+        else:
+            # for output_format in ['dict', 'dataframe']
+            evals[run_id] = {'run_id': int(eval_['oml:run_id']),
+                             'task_id': int(eval_['oml:task_id']),
+                             'setup_id': int(eval_['oml:setup_id']),
+                             'flow_id': int(eval_['oml:flow_id']),
+                             'flow_name': eval_['oml:flow_name'],
+                             'data_id': eval_['oml:data_id'],
+                             'data_name': eval_['oml:data_name'],
+                             'function': eval_['oml:function'],
+                             'upload_time': eval_['oml:upload_time'],
+                             'value': value,
+                             'values': values,
+                             'array_data': array_data}
+
+    if output_format == 'dataframe':
+        evals = pd.DataFrame.from_dict(evals, orient='index')
 
     return evals