Skip to content

Commit f74b73a

Browse files
PGijsbers authored and mfeurer committed
Put shared logic of Publish into OpenMLBase (#849)
* Reworked Task publish and Dataset publish
* Use OpenMLBase publish method.
* Remove unused import. Add study as legal API entity.
* Use shared logic in Flow and fix resolving Study alias.
* Further extract shared logic.
* Fix flake8, mypy
1 parent 2a25ed3 commit f74b73a

12 files changed

Lines changed: 178 additions & 219 deletions

File tree

examples/30_extended/create_upload_tutorial.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,8 @@
119119

120120
############################################################################
121121

122-
upload_did = diabetes_dataset.publish()
123-
print(f"URL for dataset: {openml.config.server}/data/{upload_did}")
122+
diabetes_dataset.publish()
123+
print(f"URL for dataset: {diabetes_dataset.openml_url}")
124124

125125
############################################################################
126126
# Dataset is a list
@@ -192,8 +192,8 @@
192192

193193
############################################################################
194194

195-
upload_did = weather_dataset.publish()
196-
print(f"URL for dataset: {openml.config.server}/data/{upload_did}")
195+
weather_dataset.publish()
196+
print(f"URL for dataset: {weather_dataset.openml_url}")
197197

198198
############################################################################
199199
# Dataset is a pandas DataFrame
@@ -238,8 +238,8 @@
238238

239239
############################################################################
240240

241-
upload_did = weather_dataset.publish()
242-
print(f"URL for dataset: {openml.config.server}/data/{upload_did}")
241+
weather_dataset.publish()
242+
print(f"URL for dataset: {weather_dataset.openml_url}")
243243

244244
############################################################################
245245
# Dataset is a sparse matrix
@@ -275,8 +275,8 @@
275275

276276
############################################################################
277277

278-
upload_did = xor_dataset.publish()
279-
print(f"URL for dataset: {openml.config.server}/data/{upload_did}")
278+
xor_dataset.publish()
279+
print(f"URL for dataset: {xor_dataset.openml_url}")
280280

281281

282282
############################################################################
@@ -310,8 +310,8 @@
310310

311311
############################################################################
312312

313-
upload_did = xor_dataset.publish()
314-
print(f"URL for dataset: {openml.config.server}/data/{upload_did}")
313+
xor_dataset.publish()
314+
print(f"URL for dataset: {xor_dataset.openml_url}")
315315

316316

317317
############################################################################

openml/base.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
from abc import ABC, abstractmethod
22
from collections import OrderedDict
33
import re
4-
from typing import Optional, List, Tuple, Union
4+
from typing import Optional, List, Tuple, Union, Dict
55
import webbrowser
66

77
import xmltodict
88

99
import openml.config
10-
from .utils import _tag_openml_base
10+
from .utils import _tag_openml_base, _get_rest_api_type_alias
1111

1212

1313
class OpenMLBase(ABC):
@@ -104,6 +104,34 @@ def _to_xml(self) -> str:
104104
encoding_specification, xml_body = xml_representation.split('\n', 1)
105105
return xml_body
106106

107+
def _get_file_elements(self) -> Dict:
108+
""" Get file_elements to upload to the server, called during Publish.
109+
110+
Derived child classes should overwrite this method as necessary.
111+
The description field will be populated automatically if not provided.
112+
"""
113+
return {}
114+
115+
@abstractmethod
116+
def _parse_publish_response(self, xml_response: Dict):
117+
""" Parse the id from the xml_response and assign it to self. """
118+
pass
119+
120+
def publish(self) -> 'OpenMLBase':
121+
file_elements = self._get_file_elements()
122+
123+
if 'description' not in file_elements:
124+
file_elements['description'] = self._to_xml()
125+
126+
call = '{}/'.format(_get_rest_api_type_alias(self))
127+
response_text = openml._api_calls._perform_api_call(
128+
call, 'post', file_elements=file_elements
129+
)
130+
xml_response = xmltodict.parse(response_text)
131+
132+
self._parse_publish_response(xml_response)
133+
return self
134+
107135
def open_in_browser(self):
108136
""" Opens the OpenML web page corresponding to this object in your default browser. """
109137
webbrowser.open(self.openml_url)

openml/datasets/dataset.py

Lines changed: 19 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,8 @@
1111
import numpy as np
1212
import pandas as pd
1313
import scipy.sparse
14-
import xmltodict
1514
from warnings import warn
1615

17-
import openml._api_calls
1816
from openml.base import OpenMLBase
1917
from .data_feature import OpenMLDataFeature
2018
from ..exceptions import PyOpenMLError
@@ -728,49 +726,28 @@ def get_features_by_type(self, data_type, exclude=None,
728726
result.append(idx - offset)
729727
return result
730728

731-
def publish(self):
732-
"""Publish the dataset on the OpenML server.
729+
def _get_file_elements(self) -> Dict:
730+
""" Adds the 'dataset' to file elements. """
731+
file_elements = {}
732+
path = None if self.data_file is None else os.path.abspath(self.data_file)
733733

734-
Upload the dataset description and dataset content to openml.
735-
736-
Returns
737-
-------
738-
dataset_id: int
739-
Id of the dataset uploaded to the server.
740-
"""
741-
file_elements = {'description': self._to_xml()}
742-
743-
# the arff dataset string is available
744734
if self._dataset is not None:
745735
file_elements['dataset'] = self._dataset
746-
else:
747-
# the path to the arff dataset is given
748-
if self.data_file is not None:
749-
path = os.path.abspath(self.data_file)
750-
if os.path.exists(path):
751-
try:
752-
753-
with io.open(path, encoding='utf8') as fh:
754-
# check if arff is valid
755-
decoder = arff.ArffDecoder()
756-
decoder.decode(fh, encode_nominal=True)
757-
except arff.ArffException:
758-
raise ValueError("The file you have provided is not "
759-
"a valid arff file.")
760-
761-
with open(path, 'rb') as fp:
762-
file_elements['dataset'] = fp.read()
763-
else:
764-
if self.url is None:
765-
raise ValueError("No url/path to the data file was given")
766-
767-
return_value = openml._api_calls._perform_api_call(
768-
"data/", 'post',
769-
file_elements=file_elements,
770-
)
771-
response = xmltodict.parse(return_value)
772-
self.dataset_id = int(response['oml:upload_data_set']['oml:id'])
773-
return self.dataset_id
736+
elif path is not None and os.path.exists(path):
737+
with open(path, 'rb') as fp:
738+
file_elements['dataset'] = fp.read()
739+
try:
740+
dataset_utf8 = str(file_elements['dataset'], 'utf8')
741+
arff.ArffDecoder().decode(dataset_utf8, encode_nominal=True)
742+
except arff.ArffException:
743+
raise ValueError("The file you have provided is not a valid arff file.")
744+
elif self.url is None:
745+
raise ValueError("No valid url/path to the data file was given.")
746+
return file_elements
747+
748+
def _parse_publish_response(self, xml_response: Dict):
749+
""" Parse the id from the xml_response and assign it to self. """
750+
self.dataset_id = int(xml_response['oml:upload_data_set']['oml:id'])
774751

775752
def _to_dict(self) -> 'OrderedDict[str, OrderedDict]':
776753
""" Creates a dictionary representation of self. """

openml/flows/flow.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,10 @@ def from_filesystem(cls, input_directory) -> 'OpenMLFlow':
351351
xml_string = f.read()
352352
return OpenMLFlow._from_dict(xmltodict.parse(xml_string))
353353

354+
def _parse_publish_response(self, xml_response: Dict):
355+
""" Parse the id from the xml_response and assign it to self. """
356+
self.flow_id = int(xml_response['oml:upload_flow']['oml:id'])
357+
354358
def publish(self, raise_error_if_exists: bool = False) -> 'OpenMLFlow':
355359
""" Publish this flow to OpenML server.
356360
@@ -379,15 +383,8 @@ def publish(self, raise_error_if_exists: bool = False) -> 'OpenMLFlow':
379383
if self.flow_id:
380384
raise openml.exceptions.PyOpenMLError("Flow does not exist on the server, "
381385
"but 'flow.flow_id' is not None.")
382-
xml_description = self._to_xml()
383-
file_elements = {'description': xml_description}
384-
return_value = openml._api_calls._perform_api_call(
385-
"flow/",
386-
'post',
387-
file_elements=file_elements,
388-
)
389-
server_response = xmltodict.parse(return_value)
390-
flow_id = int(server_response['oml:upload_flow']['oml:id'])
386+
super().publish()
387+
flow_id = self.flow_id
391388
elif raise_error_if_exists:
392389
error_message = "This OpenMLFlow already exists with id: {}.".format(flow_id)
393390
raise openml.exceptions.PyOpenMLError(error_message)

openml/runs/run.py

Lines changed: 10 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
from collections import OrderedDict
22
import pickle
33
import time
4-
from typing import Any, IO, TextIO, List, Union, Tuple, Optional # noqa F401
4+
from typing import Any, IO, TextIO, List, Union, Tuple, Optional, Dict # noqa F401
55
import os
66

77
import arff
88
import numpy as np
9-
import xmltodict
109

1110
import openml
1211
import openml._api_calls
@@ -428,16 +427,15 @@ def _attribute_list_to_dict(attribute_list):
428427
scores.append(sklearn_fn(y_true, y_pred, **kwargs))
429428
return np.array(scores)
430429

431-
def publish(self) -> 'OpenMLRun':
432-
""" Publish a run (and if necessary, its flow) to the OpenML server.
430+
def _parse_publish_response(self, xml_response: Dict):
431+
""" Parse the id from the xml_response and assign it to self. """
432+
self.run_id = int(xml_response['oml:upload_run']['oml:run_id'])
433433

434-
Uploads the results of a run to OpenML.
435-
If the run is of an unpublished OpenMLFlow, the flow will be uploaded too.
436-
Sets the run_id on self.
434+
def _get_file_elements(self) -> Dict:
435+
""" Get file_elements to upload to the server.
437436
438-
Returns
439-
-------
440-
self : OpenMLRun
437+
Derived child classes should overwrite this method as necessary.
438+
The description field will be populated automatically if not provided.
441439
"""
442440
if self.model is None:
443441
raise PyOpenMLError(
@@ -463,8 +461,7 @@ def publish(self) -> 'OpenMLRun':
463461
self.model,
464462
)
465463

466-
description_xml = self._to_xml()
467-
file_elements = {'description': ("description.xml", description_xml)}
464+
file_elements = {'description': ("description.xml", self._to_xml())}
468465

469466
if self.error_message is None:
470467
predictions = arff.dumps(self._generate_arff_dict())
@@ -473,13 +470,7 @@ def publish(self) -> 'OpenMLRun':
473470
if self.trace is not None:
474471
trace_arff = arff.dumps(self.trace.trace_to_arff())
475472
file_elements['trace'] = ("trace.arff", trace_arff)
476-
477-
return_value = openml._api_calls._perform_api_call(
478-
"/run/", 'post', file_elements=file_elements
479-
)
480-
result = xmltodict.parse(return_value)
481-
self.run_id = int(result['oml:upload_run']['oml:run_id'])
482-
return self
473+
return file_elements
483474

484475
def _to_dict(self) -> 'OrderedDict[str, OrderedDict]':
485476
""" Creates a dictionary representation of self. """

openml/study/study.py

Lines changed: 3 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
from collections import OrderedDict
22
from typing import Dict, List, Optional, Tuple, Union, Any
33

4-
import xmltodict
5-
64
import openml
75
from openml.base import OpenMLBase
86

@@ -124,26 +122,9 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
124122
"Creator", "Upload Time"]
125123
return [(key, fields[key]) for key in order if key in fields]
126124

127-
def publish(self) -> int:
128-
"""
129-
Publish the study on the OpenML server.
130-
131-
Returns
132-
-------
133-
study_id: int
134-
Id of the study uploaded to the server.
135-
"""
136-
file_elements = {
137-
'description': self._to_xml()
138-
}
139-
return_value = openml._api_calls._perform_api_call(
140-
"study/",
141-
'post',
142-
file_elements=file_elements,
143-
)
144-
study_res = xmltodict.parse(return_value)
145-
self.study_id = int(study_res['oml:study_upload']['oml:id'])
146-
return self.study_id
125+
def _parse_publish_response(self, xml_response: Dict):
126+
""" Parse the id from the xml_response and assign it to self. """
127+
self.study_id = int(xml_response['oml:study_upload']['oml:id'])
147128

148129
def _to_dict(self) -> 'OrderedDict[str, OrderedDict]':
149130
""" Creates a dictionary representation of self. """

openml/tasks/task.py

Lines changed: 3 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import numpy as np
99
import pandas as pd
1010
import scipy.sparse
11-
import xmltodict
1211

1312
import openml._api_calls
1413
from openml.base import OpenMLBase
@@ -181,30 +180,9 @@ def _to_dict(self) -> 'OrderedDict[str, OrderedDict]':
181180

182181
return task_container
183182

184-
def publish(self) -> int:
185-
"""Publish task to OpenML server.
186-
187-
Returns
188-
-------
189-
task_id: int
190-
Returns the id of the uploaded task
191-
if successful.
192-
193-
"""
194-
195-
xml_description = self._to_xml()
196-
197-
file_elements = {'description': xml_description}
198-
199-
return_value = openml._api_calls._perform_api_call(
200-
"task/",
201-
'post',
202-
file_elements=file_elements,
203-
)
204-
205-
task_id = int(xmltodict.parse(return_value)['oml:upload_task']['oml:id'])
206-
207-
return task_id
183+
def _parse_publish_response(self, xml_response: Dict):
184+
""" Parse the id from the xml_response and assign it to self. """
185+
self.task_id = int(xml_response['oml:upload_task']['oml:id'])
208186

209187

210188
class OpenMLSupervisedTask(OpenMLTask, ABC):

openml/utils.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import hashlib
33
import xmltodict
44
import shutil
5-
from typing import TYPE_CHECKING
5+
from typing import TYPE_CHECKING, List, Tuple, Union, Type
66
import warnings
77
import pandas as pd
88
from functools import wraps
@@ -68,16 +68,23 @@ def extract_xml_tags(xml_tag_name, node, allow_none=True):
6868
(xml_tag_name, str(node)))
6969

7070

71-
def _tag_openml_base(oml_object: 'OpenMLBase', tag: str, untag: bool = False):
71+
def _get_rest_api_type_alias(oml_object: 'OpenMLBase') -> str:
72+
""" Return the alias of the openml entity as it is defined for the REST API. """
7273
rest_api_mapping = [
7374
(openml.datasets.OpenMLDataset, 'data'),
7475
(openml.flows.OpenMLFlow, 'flow'),
7576
(openml.tasks.OpenMLTask, 'task'),
76-
(openml.runs.OpenMLRun, 'run')
77-
]
77+
(openml.runs.OpenMLRun, 'run'),
78+
((openml.study.OpenMLStudy, openml.study.OpenMLBenchmarkSuite), 'study')
79+
] # type: List[Tuple[Union[Type, Tuple], str]]
7880
_, api_type_alias = [(python_type, api_alias)
7981
for (python_type, api_alias) in rest_api_mapping
8082
if isinstance(oml_object, python_type)][0]
83+
return api_type_alias
84+
85+
86+
def _tag_openml_base(oml_object: 'OpenMLBase', tag: str, untag: bool = False):
87+
api_type_alias = _get_rest_api_type_alias(oml_object)
8188
_tag_entity(api_type_alias, oml_object.id, tag, untag)
8289

8390

0 commit comments

Comments (0)