Skip to content

Commit 9d56347

Browse files
committed
FIX #151: remove the undocumented private cache directory feature
1 parent a140586 commit 9d56347

6 files changed

Lines changed: 90 additions & 147 deletions

File tree

openml/config.py

Lines changed: 7 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
"""
2-
Stores module level information like the API key, cache director, private
3-
directory and the server.
2+
Stores module level information like the API key, cache directory and the server.
43
"""
54
import os
65
import sys
@@ -14,7 +13,6 @@
1413
server = "https://www.openml.org/api/v1/xml"
1514
apikey = ""
1615
cachedir = ""
17-
privatedir = ""
1816

1917

2018
if sys.version_info[0] < 3:
@@ -47,57 +45,35 @@ def _setup():
4745
config = _parse_config()
4846
apikey = config.get('FAKE_SECTION', 'apikey')
4947
server = config.get('FAKE_SECTION', 'server')
50-
private_dir = config.get('FAKE_SECTION', 'private_directory')
5148
cache_dir = config.get('FAKE_SECTION', 'cachedir')
52-
set_cache_directory(cache_dir, private_dir)
49+
set_cache_directory(cache_dir)
5350

5451

55-
def set_cache_directory(cachedir, privatedir=None):
52+
def set_cache_directory(cachedir):
5653
"""Set module-wide cache directory.
5754
5855
Sets the cache directory into which to download datasets, tasks etc.
59-
Also sets the private directory for storing local datasets.
6056
6157
Parameters
6258
----------
6359
cachedir : string
6460
Path to use as cache directory.
6561
66-
privatedir : string
67-
Path containing private datasets, tasks, etc.
68-
6962
See also
7063
--------
7164
get_cache_directory
72-
get_private_directory
7365
"""
74-
if privatedir is None:
75-
privatedir = cachedir
7666

7767
global _cachedir
78-
global _privatedir
7968
_cachedir = cachedir
80-
_privatedir = privatedir
8169

8270
# Set up the cache directories
8371
dataset_cache_dir = os.path.join(cachedir, "datasets")
8472
task_cache_dir = os.path.join(cachedir, "tasks")
8573
run_cache_dir = os.path.join(cachedir, 'runs')
8674

87-
# Set up the private directory
88-
_private_directory_datasets = os.path.join(
89-
privatedir, "datasets")
90-
_private_directory_tasks = os.path.join(
91-
privatedir, "tasks")
92-
_private_directory_runs = os.path.join(
93-
privatedir, "runs")
94-
95-
for dir_ in [cachedir, dataset_cache_dir,
96-
task_cache_dir, run_cache_dir,
97-
privatedir,
98-
_private_directory_datasets,
99-
_private_directory_tasks,
100-
_private_directory_runs]:
75+
76+
for dir_ in [cachedir, dataset_cache_dir, task_cache_dir, run_cache_dir]:
10177
if not os.path.exists(dir_) and not os.path.isdir(dir_):
10278
os.mkdir(dir_)
10379

@@ -108,8 +84,7 @@ def _parse_config():
10884
defaults = {'apikey': apikey,
10985
'server': server,
11086
'verbosity': 0,
111-
'cachedir': os.path.expanduser('~/.openml/cache'),
112-
'private_directory': os.path.expanduser('~/.openml/private')}
87+
'cachedir': os.path.expanduser('~/.openml/cache')}
11388

11489
config_file = os.path.expanduser('~/.openml/config')
11590
config = configparser.RawConfigParser(defaults=defaults)
@@ -147,26 +122,10 @@ def get_cache_directory():
147122
See also
148123
--------
149124
set_cache_directory
150-
get_private_directory
151125
"""
152126
return _cachedir
153127

154128

155-
def get_private_directory():
156-
"""Get the current private directory.
157-
158-
Returns
159-
-------
160-
privatecir : string
161-
The current private directory.
162-
163-
See also
164-
--------
165-
set_cache_directory
166-
get_cache_directory
167-
"""
168-
return _privatedir
169-
170-
__all__ = ["set_cache_directory", 'get_cache_directory', 'get_private_directory']
129+
__all__ = ["set_cache_directory", 'get_cache_directory']
171130

172131
_setup()

openml/datasets/functions.py

Lines changed: 49 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -23,28 +23,28 @@ def _list_cached_datasets():
2323
"""
2424
datasets = []
2525

26-
for dataset_cache in [config.get_cache_directory(), config.get_private_directory()]:
27-
dataset_cache_dir = os.path.join(dataset_cache, "datasets")
28-
directory_content = os.listdir(dataset_cache_dir)
29-
directory_content.sort()
30-
31-
# Find all dataset ids for which we have downloaded the dataset
32-
# description
33-
for directory_name in directory_content:
34-
# First check if the directory name could be an OpenML dataset id
35-
if not re.match(r"[0-9]*", directory_name):
36-
continue
26+
dataset_cache = config.get_cache_directory()
27+
dataset_cache_dir = os.path.join(dataset_cache, "datasets")
28+
directory_content = os.listdir(dataset_cache_dir)
29+
directory_content.sort()
30+
31+
# Find all dataset ids for which we have downloaded the dataset
32+
# description
33+
for directory_name in directory_content:
34+
# First check if the directory name could be an OpenML dataset id
35+
if not re.match(r"[0-9]*", directory_name):
36+
continue
3737

38-
dataset_id = int(directory_name)
38+
dataset_id = int(directory_name)
3939

40-
directory_name = os.path.join(dataset_cache_dir,
41-
directory_name)
42-
dataset_directory_content = os.listdir(directory_name)
40+
directory_name = os.path.join(dataset_cache_dir,
41+
directory_name)
42+
dataset_directory_content = os.listdir(directory_name)
4343

44-
if "dataset.arff" in dataset_directory_content and \
45-
"description.xml" in dataset_directory_content:
46-
if dataset_id not in datasets:
47-
datasets.append(dataset_id)
44+
if "dataset.arff" in dataset_directory_content and \
45+
"description.xml" in dataset_directory_content:
46+
if dataset_id not in datasets:
47+
datasets.append(dataset_id)
4848

4949
datasets.sort()
5050
return datasets
@@ -79,53 +79,44 @@ def _get_cached_dataset(dataset_id):
7979

8080

8181
def _get_cached_dataset_description(dataset_id):
82-
for cache_dir in [config.get_cache_directory(),
83-
config.get_private_directory()]:
84-
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
85-
description_file = os.path.join(did_cache_dir, "description.xml")
86-
try:
87-
with io.open(description_file, encoding='utf8') as fh:
88-
dataset_xml = fh.read()
89-
except (IOError, OSError):
90-
continue
91-
82+
cache_dir = config.get_cache_directory()
83+
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
84+
description_file = os.path.join(did_cache_dir, "description.xml")
85+
try:
86+
with io.open(description_file, encoding='utf8') as fh:
87+
dataset_xml = fh.read()
9288
return xmltodict.parse(dataset_xml)["oml:data_set_description"]
89+
except (IOError, OSError):
90+
raise OpenMLCacheException(
91+
"Dataset description for dataset id %d not "
92+
"cached" % dataset_id)
9393

94-
raise OpenMLCacheException("Dataset description for dataset id %d not "
95-
"cached" % dataset_id)
9694

9795
def _get_cached_dataset_features(dataset_id):
98-
for cache_dir in [config.get_cache_directory(),
99-
config.get_private_directory()]:
100-
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
101-
features_file = os.path.join(did_cache_dir, "features.xml")
102-
try:
103-
with io.open(features_file, encoding='utf8') as fh:
104-
features_xml = fh.read()
105-
except (IOError, OSError):
106-
continue
107-
108-
return xmltodict.parse(features_xml)["oml:data_features"]
109-
110-
raise OpenMLCacheException("Dataset features for dataset id %d not "
111-
"cached" % dataset_id)
96+
cache_dir = config.get_cache_directory()
97+
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
98+
features_file = os.path.join(did_cache_dir, "features.xml")
99+
try:
100+
with io.open(features_file, encoding='utf8') as fh:
101+
features_xml = fh.read()
102+
return xmltodict.parse(features_xml)["oml:data_features"]
103+
except (IOError, OSError):
104+
raise OpenMLCacheException("Dataset features for dataset id %d not "
105+
"cached" % dataset_id)
112106

113107

114108
def _get_cached_dataset_arff(dataset_id):
115-
for cache_dir in [config.get_cache_directory(),
116-
config.get_private_directory()]:
117-
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
118-
output_file = os.path.join(did_cache_dir, "dataset.arff")
109+
cache_dir = config.get_cache_directory()
110+
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
111+
output_file = os.path.join(did_cache_dir, "dataset.arff")
119112

120-
try:
121-
with io.open(output_file, encoding='utf8'):
122-
pass
123-
return output_file
124-
except (OSError, IOError):
125-
continue
126-
127-
raise OpenMLCacheException("ARFF file for dataset id %d not "
128-
"cached" % dataset_id)
113+
try:
114+
with io.open(output_file, encoding='utf8'):
115+
pass
116+
return output_file
117+
except (OSError, IOError):
118+
raise OpenMLCacheException("ARFF file for dataset id %d not "
119+
"cached" % dataset_id)
129120

130121

131122
def list_datasets(offset=None, size=None, tag=None):

openml/runs/functions.py

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -260,21 +260,18 @@ def _create_run_from_xml(xml):
260260

261261
def _get_cached_run(run_id):
262262
"""Load a run from the cache."""
263-
for cache_dir in [config.get_cache_directory(),
264-
config.get_private_directory()]:
265-
run_cache_dir = os.path.join(cache_dir, "runs")
266-
try:
267-
run_file = os.path.join(run_cache_dir,
268-
"run_%d.xml" % int(run_id))
269-
with io.open(run_file, encoding='utf8') as fh:
270-
run = _create_task_from_xml(xml=fh.read())
271-
return run
272-
273-
except (OSError, IOError):
274-
continue
275-
276-
raise OpenMLCacheException("Run file for run id %d not "
277-
"cached" % run_id)
263+
cache_dir = config.get_cache_directory()
264+
run_cache_dir = os.path.join(cache_dir, "runs")
265+
try:
266+
run_file = os.path.join(run_cache_dir,
267+
"run_%d.xml" % int(run_id))
268+
with io.open(run_file, encoding='utf8') as fh:
269+
run = _create_task_from_xml(xml=fh.read())
270+
return run
271+
272+
except (OSError, IOError):
273+
raise OpenMLCacheException("Run file for run id %d not "
274+
"cached" % run_id)
278275

279276

280277
def list_runs(offset=None, size=None, id=None, task=None,

openml/tasks/functions.py

Lines changed: 21 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -14,39 +14,37 @@
1414

1515
def _get_cached_tasks():
1616
tasks = OrderedDict()
17-
for cache_dir in [config.get_cache_directory(), config.get_private_directory()]:
17+
cache_dir = config.get_cache_directory()
1818

19-
task_cache_dir = os.path.join(cache_dir, "tasks")
20-
directory_content = os.listdir(task_cache_dir)
21-
directory_content.sort()
19+
task_cache_dir = os.path.join(cache_dir, "tasks")
20+
directory_content = os.listdir(task_cache_dir)
21+
directory_content.sort()
2222

23-
# Find all dataset ids for which we have downloaded the dataset
24-
# description
23+
# Find all dataset ids for which we have downloaded the dataset
24+
# description
2525

26-
for filename in directory_content:
27-
if not re.match(r"[0-9]*", filename):
28-
continue
26+
for filename in directory_content:
27+
if not re.match(r"[0-9]*", filename):
28+
continue
2929

30-
tid = int(filename)
31-
tasks[tid] = _get_cached_task(tid)
30+
tid = int(filename)
31+
tasks[tid] = _get_cached_task(tid)
3232

3333
return tasks
3434

3535

3636
def _get_cached_task(tid):
37-
for cache_dir in [config.get_cache_directory(), config.get_private_directory()]:
38-
task_cache_dir = os.path.join(cache_dir, "tasks")
39-
task_file = os.path.join(task_cache_dir, str(tid), "task.xml")
40-
41-
try:
42-
with io.open(task_file, encoding='utf8') as fh:
43-
task = _create_task_from_xml(xml=fh.read())
44-
return task
45-
except (OSError, IOError):
46-
continue
37+
cache_dir = config.get_cache_directory()
38+
task_cache_dir = os.path.join(cache_dir, "tasks")
39+
task_file = os.path.join(task_cache_dir, str(tid), "task.xml")
4740

48-
raise OpenMLCacheException("Task file for tid %d not "
49-
"cached" % tid)
41+
try:
42+
with io.open(task_file, encoding='utf8') as fh:
43+
task = _create_task_from_xml(xml=fh.read())
44+
return task
45+
except (OSError, IOError):
46+
raise OpenMLCacheException("Task file for tid %d not "
47+
"cached" % tid)
5048

5149

5250
def _get_estimation_procedure_list():

openml/testing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def setUp(self):
4646
self.test_server = "https://test.openml.org/api/v1/xml"
4747
openml.config.server = self.test_server
4848

49-
openml.config.set_cache_directory(self.workdir, self.workdir)
49+
openml.config.set_cache_directory(self.workdir)
5050

5151
def tearDown(self):
5252
os.chdir(self.cwd)

tests/test_runs/test_run_functions.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ def test_run_optimize_bagging_iris(self):
5353
num_folds = 10
5454
num_iterations = 36 # (num values for C times gamma)
5555

56-
task = openml.tasks.get_task(task_id)
5756
bag = BaggingClassifier(base_estimator=SVC())
5857
param_dist = {"base_estimator__C": [0.001, 0.01, 0.1, 1, 10, 100],
5958
"base_estimator__gamma": [0.001, 0.01, 0.1, 1, 10, 100]}
@@ -62,7 +61,6 @@ def test_run_optimize_bagging_iris(self):
6261
run = self._perform_run(task_id, num_instances, grid_search)
6362
self.assertEqual(len(run.trace_content), num_iterations * num_folds)
6463

65-
6664
def test__run_task_get_arffcontent(self):
6765
task = openml.tasks.get_task(1939)
6866
class_labels = task.class_labels

0 commit comments

Comments
 (0)