Skip to content

Commit 2f625fa

Browse files
committed
Merge pull request #14 from mfeurer/feature/download_run
Add API calls for /run/list/ and /run/{run_id}/
2 parents f83307f + 6b95a17 commit 2f625fa

4 files changed

Lines changed: 211 additions & 15 deletions

File tree

openml/apiconnector.py

Lines changed: 163 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from collections import OrderedDict
2-
import hashlib
32
import logging
43
import os
54
import re
@@ -25,8 +24,11 @@
2524
from .entities.dataset import OpenMLDataset
2625
from .entities.task import OpenMLTask
2726
from .entities.split import OpenMLSplit
27+
from .entities.run import OpenMLRun
2828
from .util import is_string
2929

30+
import numpy as np
31+
3032
logger = logging.getLogger(__name__)
3133

3234
OPENML_URL = "http://api_new.openml.org/v1/"
@@ -806,16 +808,163 @@ def _download_split(self, task, cache_file):
806808

807809
def _create_task_cache_dir(self, task_id):
808810
task_cache_dir = os.path.join(self.task_cache_dir, str(task_id))
811+
809812
try:
810813
os.makedirs(task_cache_dir)
811814
except (IOError, OSError):
812815
# TODO add debug information!
813816
pass
814817
return task_cache_dir
815818

816-
def _perform_api_call(self, call, data=None, filePath=None, add_authentication=True):
819+
############################################################################
820+
# Runs
821+
def get_runs_list(self, task_id=None, flow_id=None, setup_id=None):
    """Return a list of all runs for either a task, flow or setup.

    Exactly one of the optional parameters must be given.

    Parameters
    ----------
    task_id : int, optional
    flow_id : int, optional
    setup_id : int, optional

    Returns
    -------
    list of dict
        One dict per run with the keys ``run_id``, ``task_id``,
        ``setup_id``, ``implementation_id`` and ``uploader`` (all ints),
        sorted by ``run_id``.

    Raises
    ------
    ValueError
        If not exactly one of *task_id*, *flow_id* and *setup_id* is
        given.
    """
    # Exactly one filter must be set, i.e. exactly two of the three
    # arguments must still be None.  Plain sum() replaces the former
    # np.nansum() call -- a list of booleans never contains NaN, so
    # numpy was not needed here.
    if sum([task_id is None, flow_id is None, setup_id is None]) != 2:
        raise ValueError("Exactly one of task_id, flow_id and setup_id "
                         "must be given.")

    call = "run/list"
    if task_id is not None:
        call += "?task_id=%d" % task_id
    elif flow_id is not None:
        # The server-side query parameter for a flow is named
        # implementation_id.
        call += "?implementation_id=%d" % flow_id
    elif setup_id is not None:
        call += "?setup_id=%d" % setup_id

    return_code, xml_string = self._perform_api_call(call)
    runs_dict = xmltodict.parse(xml_string)

    # Minimalistic check if the XML is useful
    assert runs_dict['oml:runs']['@xmlns:oml'] == \
        'http://openml.org/openml'

    run_list = runs_dict['oml:runs']['oml:run']
    if isinstance(run_list, dict):
        # xmltodict returns a plain dict when the response holds a single
        # run; normalize to a list so the loop below works for single-run
        # responses as well (previously the wrapped list was discarded
        # and the loop iterated the dict's string keys).
        run_list = [run_list]
    assert type(run_list) == list, type(run_list)

    runs = []
    for run_ in run_list:
        run = {'run_id': int(run_['oml:run_id']),
               'task_id': int(run_['oml:task_id']),
               'setup_id': int(run_['oml:setup_id']),
               'implementation_id': int(run_['oml:implementation_id']),
               'uploader': int(run_['oml:uploader'])}
        runs.append(run)
    runs.sort(key=lambda t: t['run_id'])

    return runs
875+
876+
def download_run(self, run_id):
    """Download the OpenML run for a given run ID.

    The run description XML is cached on disk; a cached copy is used
    when present, otherwise the description is fetched from the server
    and written to the cache.

    Parameters
    ----------
    run_id : int
        The OpenML run id.

    Returns
    -------
    OpenMLRun

    Raises
    ------
    ValueError
        If *run_id* cannot be cast to an int, or if a cached run
        description differs from the one the server returns.
    """
    try:
        run_id = int(run_id)
    except (ValueError, TypeError):
        # Narrowed from a bare ``except`` (which would also swallow e.g.
        # KeyboardInterrupt) and fixed the message, which previously
        # talked about a task ID.
        raise ValueError("Run ID is neither an Integer nor can be "
                         "cast to an Integer.")

    xml_file = os.path.join(self._create_run_cache_dir(run_id),
                            "run.xml")

    try:
        with open(xml_file) as fh:
            run = self._create_run_from_xml(fh.read())
    except (OSError, IOError):
        # Cache miss: fetch the run description from the server.
        try:
            return_code, run_xml = self._perform_api_call(
                "run/%d" % run_id)
        except (URLError, UnicodeEncodeError) as e:
            print(e)
            raise e

        # Cache the xml run file; an already-cached copy must be
        # identical to what the server returned, otherwise the cache is
        # stale or corrupted.
        if os.path.exists(xml_file):
            with open(xml_file) as fh:
                local_xml = fh.read()

            if run_xml != local_xml:
                raise ValueError("Run description of run %d cached at %s "
                                 "has changed." % (run_id, xml_file))
        else:
            with open(xml_file, "w") as fh:
                fh.write(run_xml)

        run = self._create_run_from_xml(run_xml)

    return run
921+
922+
def _create_run_cache_dir(self, run_id):
    """Return the cache directory for *run_id*, creating it if needed.

    NOTE(review): this joins onto ``self.task_cache_dir``, so run caches
    live inside the task cache tree and a run ID can collide with a task
    ID directory -- presumably a copy/paste leftover from
    ``_create_task_cache_dir``; confirm whether a dedicated run cache
    directory should be used instead.
    """
    cache_dir = os.path.join(self.task_cache_dir, str(run_id))
    try:
        os.makedirs(cache_dir)
    except (IOError, OSError):
        # The directory most likely exists already.
        # TODO add debug information!
        pass
    return cache_dir
931+
932+
def _create_run_from_xml(self, xml):
    """Create an OpenMLRun object from a run description XML string.

    Parameters
    ----------
    xml : str
        Run description XML as returned by the ``/run/{run_id}`` API
        call.

    Returns
    -------
    OpenMLRun
    """
    dic = xmltodict.parse(xml)[u"oml:run"]

    # Collect the dataset IDs of all input data.
    # NOTE(review): assumes every entry under oml:input_data carries an
    # oml:did -- confirm against the server's response schema.
    datasets = []
    for key in dic[u'oml:input_data']:
        dataset = dic[u'oml:input_data'][key]
        did = dataset[u'oml:did']
        datasets.append(did)

    # NOTE(review): with a single tag xmltodict yields a plain string
    # and this loop would iterate its characters -- confirm runs always
    # carry more than one tag, or normalize as done for run lists.
    tags = []
    for tag in dic[u"oml:tag"]:
        tags.append(tag)

    # Map output file names to their download URLs.
    files = dict()
    for file_ in dic[u"oml:output_data"][u"oml:file"]:
        name = file_[u"oml:name"]
        url = file_[u"oml:url"]
        files[name] = url

    # Removed a leftover Python-2-style debug statement
    # (``print dic.keys()``), which is a syntax error under Python 3.
    # Evaluation measures map to (scalar value, array value) pairs; one
    # of the two entries may be None.
    evaluations = dict()
    for evaluation in dic[u"oml:output_data"][u"oml:evaluation"]:
        name = evaluation[u"oml:name"]
        value = evaluation.get(u"oml:value")
        value_array = evaluation.get(u"oml:array_data")
        evaluations[name] = (value, value_array)

    return OpenMLRun(
        dic[u"oml:run_id"], dic[u"oml:uploader"],
        dic[u"oml:task_id"], dic[u"oml:implementation_id"],
        dic[u"oml:setup_string"], dic[u'oml:setup_id'],
        tags, datasets, files, evaluations)
963+
964+
############################################################################
965+
# Internal stuff
966+
def _perform_api_call(self, call, data=None, file_path=None):
967+
"""Perform an API call at the OpenML server.
819968
return self._read_url(url, data=data, filePath=filePath,
820969
def _read_url(self, url, add_authentication=False, data=None, filePath=None):
821970
@@ -840,21 +989,21 @@ def _read_url(self, url, add_authentication=False, data=None, filePath=None):
840989
if not url.endswith("/"):
841990
url += "/"
842991
url += call
843-
return self._read_url(url, data=data,filePath= filePath)
992+
return self._read_url(url, data=data, file_path=file_path)
844993

845-
def _read_url(self, url, data=None, filePath=None):
994+
def _read_url(self, url, data=None, file_path=None):
846995
if data is None:
847996
data = {}
848997
data['session_hash'] = self.config.get('FAKE_SECTION', 'apikey')
849998

850-
if filePath is not None:
851-
if os.path.isabs(filePath):
999+
if file_path is not None:
1000+
if os.path.isabs(file_path):
8521001
try:
8531002
decoder = arff.ArffDecoder()
8541003
except:
8551004
raise "The file you provided is not a valid arff file"
8561005

857-
fileElement={'dataset': open(filePath, 'rb')}
1006+
fileElement={'dataset': open(file_path, 'rb')}
8581007
data['description']= data.get('description')
8591008
data.pop('dataset', None)
8601009

@@ -866,6 +1015,7 @@ def _read_url(self, url, data=None, filePath=None):
8661015
return response.status_code, response
8671016
else:
8681017
raise "File doesn't exists"
1018+
8691019
else:
8701020
data = urlencode(data)
8711021
data = data.encode('utf-8')
@@ -907,10 +1057,11 @@ def _read_url(self, url, data=None, filePath=None):
9071057
string.write(chunk)
9081058
return return_code, string.getvalue()
9091059

910-
def upload_dataset(self, description, filePath=None):
1060+
def upload_dataset(self, description, file_path=None):
9111061
try:
9121062
data = {'description': description}
913-
return_code, dataset_xml = self._perform_api_call("/data/",data=data, filePath=filePath)
1063+
return_code, dataset_xml = self._perform_api_call(
1064+
"/data/", data=data, file_path=file_path)
9141065

9151066
except URLError as e:
9161067
# TODO logger.debug
@@ -921,7 +1072,8 @@ def upload_dataset(self, description, filePath=None):
9211072
def upload_flow(self, description, binary, source):
9221073
try:
9231074
data = {'description': description, 'binary': binary, 'source': source}
924-
return_code, dataset_xml = self._perform_api_call("openml.implementation.upload", data=data)
1075+
return_code, dataset_xml = self._perform_api_call(
1076+
"openml.implementation.upload", data=data)
9251077

9261078
except URLError as e:
9271079
# TODO logger.debug

openml/entities/run.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
class OpenMLRun(object):
    """Container for the description of a single OpenML run.

    Every constructor argument is stored as a same-named attribute;
    *flow_id* holds the run's implementation ID.
    """

    def __init__(self, run_id, uploader, task_id, flow_id, setup_string,
                 setup_id, tags, datasets, files, evaluations):
        # Identifiers of the run and of the entities it links together.
        self.run_id = run_id
        self.task_id = task_id
        self.flow_id = flow_id
        self.setup_id = setup_id
        self.uploader = uploader
        # Textual description of the setup that produced the run.
        self.setup_string = setup_string
        # Free-form metadata and result artefacts.
        self.tags = tags
        self.datasets = datasets
        self.files = files
        self.evaluations = evaluations

source/progress.rst

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ API calls
1313
API call implemented tested properly tested loads json proper error handling
1414
=============================================== =========== ====== =============== ========== =====================
1515
/data/list/ yes yes
16-
/data/list/active/
1716
/data/list/tag/{tag}
1817
/data/{data_id} yes yes
1918
/data/delete/
@@ -26,7 +25,6 @@ API call implemented tested properly test
2625
/data/tag
2726
/data/untag
2827
/task/list yes yes
29-
/task/list/active
3028
/task/list/tag/{tag}
3129
/task/{task_id} yes yes
3230
/task/tag
@@ -40,8 +38,8 @@ API call implemented tested properly test
4038
/flow/
4139
/flow/exists/{name,ext_version}
4240
/flow/owned
43-
/run/list
44-
/run/{run_id}
41+
/run/list yes yes
42+
/run/{run_id} yes yes
4543
/run
4644
/run/tag
4745
/run/untag

tests/test_apiconnector.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,39 @@ def test_download_split(self):
221221
self.assertTrue(os.path.exists(
222222
os.path.join(os.getcwd(), "tasks", "1", "datasplits.arff")))
223223

224+
############################################################################
225+
# Runs
226+
def test_download_run_list(self):
    """Smoke-test get_runs_list for each of the three filter arguments."""

    def check_run(run):
        # Every entry must be a dict with exactly the five expected keys.
        self.assertIsInstance(run, dict)
        self.assertEqual(len(run), 5)

    # The lower bounds below reflect counts observed on openml.org when
    # this test was written; don't trust the counters on the website as
    # they also include private datasets.
    runs = self.connector.get_runs_list(task_id=1)
    self.assertGreaterEqual(len(runs), 800)
    for run in runs:
        check_run(run)

    runs = self.connector.get_runs_list(flow_id=1)
    self.assertGreaterEqual(len(runs), 1)
    for run in runs:
        check_run(run)

    runs = self.connector.get_runs_list(setup_id=1)
    self.assertGreaterEqual(len(runs), 261)
    for run in runs:
        check_run(run)
248+
249+
def test_download_run(self):
    """Download a known run and sanity-check the parsed fields."""
    run = self.connector.download_run(473350)
    self.assertGreaterEqual(len(run.tags), 2)
    self.assertEqual(len(run.datasets), 1)
    self.assertGreaterEqual(len(run.files), 2)
    self.assertGreaterEqual(len(run.evaluations), 18)
    # Each evaluation entry is a (value, array_data) pair.
    self.assertEqual(len(run.evaluations['f_measure']), 2)
256+
224257
def test_upload_dataset(self):
225258

226259
dataset = self.connector.download_dataset(3)

0 commit comments

Comments
 (0)