Skip to content

Commit 43596e0

Browse files
PGijsbers and mfeurer
authored and committed
Create OpenMLBase, have most OpenML objects derive from it (#828)
* Create OpenMLBase, have OpenMLFlow derive from it. * Derive ID and entity_letter based on class type. * Add #433 open_in_browser. * Use OpenMLBase in Dataset, Run, Task. * Use OpenMLBase for Study * Update tag functions to take into account entity type. * Quote OpenMLBase typing as it is only imported for type checking. * Remove _repr_pretty_ as the default __repr__ prints pretty in a notebook anyway. * Move _to_xml to base * Fix bug, actually check for instance type to determine entity. * Provide list to task type description in task __repr__ * Move fetching id to derived classes. * Share base_url logic. Fix mypy warnings. * Make child classes responsible for making sure _entity_letter is correct. * Docstring and type hint changes. * PEP8 * PEP8 * Fix mypy issues * Fix CI mypy issues. * Dont use Py3.6 syntax * Fix CI mypy issue
1 parent c40e474 commit 43596e0

15 files changed

Lines changed: 295 additions & 378 deletions

File tree

openml/base.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
from abc import ABC, abstractmethod
2+
from collections import OrderedDict
3+
import re
4+
from typing import Optional, List, Tuple, Union
5+
import webbrowser
6+
7+
import xmltodict
8+
9+
import openml.config
10+
from .utils import _tag_openml_base
11+
12+
13+
class OpenMLBase(ABC):
    """ Base object for functionality that is shared across entities.

    Derived classes have to implement ``id``, ``_get_repr_body_fields`` and ``_to_dict``.
    """

    def __repr__(self):
        body_fields = self._get_repr_body_fields()
        return self._apply_repr_template(body_fields)

    @property
    @abstractmethod
    def id(self) -> Optional[int]:
        """ The id of the entity, it is unique for its entity type. """
        pass

    @property
    def openml_url(self) -> Optional[str]:
        """ The URL of the object on the server, if it was uploaded, else None. """
        entity_id = self.id
        if entity_id is None:
            return None
        return self.__class__.url_for_id(entity_id)

    @classmethod
    def url_for_id(cls, id_: int) -> str:
        """ Return the OpenML URL for the object of the class entity with the given id. """
        # Sample url for a flow: openml.org/f/123
        return "{}/{}/{}".format(openml.config.server_base_url, cls._entity_letter(), id_)

    @classmethod
    def _entity_letter(cls) -> str:
        """ Return the letter which represents the entity type in urls, e.g. 'f' for flow."""
        # We take advantage of the class naming convention (OpenMLX),
        # which holds for all entities except studies and tasks, which overwrite this method.
        return cls.__name__[len('OpenML')].lower()

    @abstractmethod
    def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
        """ Collect all information to display in the __repr__ body.

        Returns
        -------
        body_fields : List[Tuple[str, Union[str, int, List[str]]]]
            A list of (name, value) pairs to display in the body of the __repr__.
            E.g.: [('metric', 'accuracy'), ('dataset', 'iris')]
            Values are rendered by ``_apply_repr_template`` with ``str.format``,
            so a list value is shown in its default string form.
        """
        # Should be implemented in the derived class.
        pass

    def _apply_repr_template(
        self,
        body_fields: List[Tuple[str, Union[str, int, List[str]]]],
    ) -> str:
        """ Generates the header and formats the body for string representation of the object.

        Parameters
        ----------
        body_fields: List[Tuple[str, Union[str, int, List[str]]]]
            A list of (name, value) pairs to display in the body of the __repr__.
            May be empty, in which case only the header is returned.

        Returns
        -------
        str
            The underlined header followed by one ``name: value`` line per field,
            with the names padded with dots to the length of the longest name.
        """
        # We add spaces between capitals, e.g. ClassificationTask -> Classification Task
        name_with_spaces = re.sub(r"(\w)([A-Z])", r"\1 \2",
                                  self.__class__.__name__[len('OpenML'):])
        header_text = 'OpenML {}'.format(name_with_spaces)
        header = '{}\n{}\n'.format(header_text, '=' * len(header_text))

        # `default=0` keeps an entity without any body fields from raising in max().
        longest_field_name_length = max((len(name) for name, _ in body_fields), default=0)
        field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length)
        body = '\n'.join(field_line_format.format(name, value) for name, value in body_fields)
        return header + body

    @abstractmethod
    def _to_dict(self) -> 'OrderedDict[str, OrderedDict]':
        """ Creates a dictionary representation of self.

        Uses OrderedDict to ensure consistent ordering when converting to xml.
        The return value (OrderedDict) will be used to create the upload xml file.
        The xml file must have the tags in exactly the order of the object's xsd.
        (see https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/).

        Returns
        -------
        OrderedDict
            Object represented as OrderedDict.

        """
        # Should be implemented in the derived class.
        pass

    def _to_xml(self) -> str:
        """ Generate xml representation of self for upload to server. """
        dict_representation = self._to_dict()
        xml_representation = xmltodict.unparse(dict_representation, pretty=True)

        # Entities (e.g. tasks, flows) may not be uploaded with the xml encoding specification:
        # <?xml version="1.0" encoding="utf-8"?>
        # It is the first line of the unparsed document, so everything after it is kept.
        _, xml_body = xml_representation.split('\n', 1)
        return xml_body

    def open_in_browser(self):
        """ Opens the OpenML web page corresponding to this object in your default browser.

        Raises
        ------
        ValueError
            If the object has no id yet, i.e. ``openml_url`` is None.
        """
        url = self.openml_url
        if url is None:
            # Without an id there is no page to show; fail with a clear message
            # instead of handing None to the webbrowser module.
            raise ValueError(
                "Cannot open element on OpenML.org when attribute `openml_url` is `None`"
            )
        webbrowser.open(url)

    def push_tag(self, tag: str):
        """Annotates this entity with a tag on the server.

        Parameters
        ----------
        tag : str
            Tag to attach to the entity.
        """
        _tag_openml_base(self, tag)

    def remove_tag(self, tag: str):
        """Removes a tag from this entity on the server.

        Parameters
        ----------
        tag : str
            Tag to remove from the entity.
        """
        _tag_openml_base(self, tag, untag=True)

openml/config.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@
2828

2929
# Default values are actually added here in the _setup() function which is
3030
# called at the end of this module
31-
server = _defaults['server']
31+
server = str(_defaults['server']) # so mypy knows it is a string
32+
server_base_url = server[:-len('/api/v1/xml')]
3233
apikey = _defaults['apikey']
3334
# The current cache directory (without the server name)
3435
cache_directory = _defaults['cachedir']

openml/datasets/dataset.py

Lines changed: 13 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,15 @@
1515
from warnings import warn
1616

1717
import openml._api_calls
18+
from openml.base import OpenMLBase
1819
from .data_feature import OpenMLDataFeature
1920
from ..exceptions import PyOpenMLError
20-
from ..utils import _tag_entity
2121

2222

2323
logger = logging.getLogger(__name__)
2424

2525

26-
class OpenMLDataset(object):
26+
class OpenMLDataset(OpenMLBase):
2727
"""Dataset object.
2828
2929
Allows fetching and uploading datasets to OpenML.
@@ -184,11 +184,12 @@ def __init__(self, name, description, format=None,
184184
else:
185185
self.data_pickle_file = None
186186

187-
def __repr__(self):
188-
header = "OpenML Dataset"
189-
header = '{}\n{}\n'.format(header, '=' * len(header))
187+
@property
188+
def id(self) -> Optional[int]:
189+
return self.dataset_id
190190

191-
base_url = "{}".format(openml.config.server[:-len('api/v1/xml')])
191+
def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
192+
""" Collect all information to display in the __repr__ body. """
192193
fields = {"Name": self.name,
193194
"Version": self.version,
194195
"Format": self.format,
@@ -201,19 +202,14 @@ def __repr__(self):
201202
if self.upload_date is not None:
202203
fields["Upload Date"] = self.upload_date.replace('T', ' ')
203204
if self.dataset_id is not None:
204-
fields["OpenML URL"] = "{}d/{}".format(base_url, self.dataset_id)
205+
fields["OpenML URL"] = self.openml_url
205206
if self.qualities is not None and self.qualities['NumberOfInstances'] is not None:
206207
fields["# of instances"] = int(self.qualities['NumberOfInstances'])
207208

208209
# determines the order in which the information will be printed
209210
order = ["Name", "Version", "Format", "Upload Date", "Licence", "Download URL",
210211
"OpenML URL", "Data File", "Pickle File", "# of features", "# of instances"]
211-
fields = [(key, fields[key]) for key in order if key in fields]
212-
213-
longest_field_name_length = max(len(name) for name, value in fields)
214-
field_line_format = "{{:.<{}}}: {{}}".format(longest_field_name_length)
215-
body = '\n'.join(field_line_format.format(name, value) for name, value in fields)
216-
return header + body
212+
return [(key, fields[key]) for key in order if key in fields]
217213

218214
def __eq__(self, other):
219215

@@ -462,26 +458,6 @@ def _load_data(self):
462458

463459
return data, categorical, attribute_names
464460

465-
def push_tag(self, tag):
466-
"""Annotates this data set with a tag on the server.
467-
468-
Parameters
469-
----------
470-
tag : str
471-
Tag to attach to the dataset.
472-
"""
473-
_tag_entity('data', self.dataset_id, tag)
474-
475-
def remove_tag(self, tag):
476-
"""Removes a tag from this dataset on the server.
477-
478-
Parameters
479-
----------
480-
tag : str
481-
Tag to attach to the dataset.
482-
"""
483-
_tag_entity('data', self.dataset_id, tag, untag=True)
484-
485461
@staticmethod
486462
def _convert_array_format(data, array_format, attribute_names):
487463
"""Convert a dataset to a given array format.
@@ -796,22 +772,16 @@ def publish(self):
796772
self.dataset_id = int(response['oml:upload_data_set']['oml:id'])
797773
return self.dataset_id
798774

799-
def _to_xml(self):
800-
""" Serialize object to xml for upload
801-
802-
Returns
803-
-------
804-
xml_dataset : str
805-
XML description of the data.
806-
"""
775+
def _to_dict(self) -> 'OrderedDict[str, OrderedDict]':
776+
""" Creates a dictionary representation of self. """
807777
props = ['id', 'name', 'version', 'description', 'format', 'creator',
808778
'contributor', 'collection_date', 'upload_date', 'language',
809779
'licence', 'url', 'default_target_attribute',
810780
'row_id_attribute', 'ignore_attribute', 'version_label',
811781
'citation', 'tag', 'visibility', 'original_data_url',
812782
'paper_url', 'update_comment', 'md5_checksum']
813783

814-
data_container = OrderedDict()
784+
data_container = OrderedDict() # type: 'OrderedDict[str, OrderedDict]'
815785
data_dict = OrderedDict([('@xmlns:oml', 'http://openml.org/openml')])
816786
data_container['oml:data_set_description'] = data_dict
817787

@@ -820,14 +790,7 @@ def _to_xml(self):
820790
if content is not None:
821791
data_dict["oml:" + prop] = content
822792

823-
xml_string = xmltodict.unparse(
824-
input_dict=data_container,
825-
pretty=True,
826-
)
827-
# A flow may not be uploaded with the xml encoding specification:
828-
# <?xml version="1.0" encoding="utf-8"?>
829-
xml_string = xml_string.split('\n', 1)[-1]
830-
return xml_string
793+
return data_container
831794

832795

833796
def _check_qualities(qualities):

openml/evaluations/evaluation.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,18 +61,17 @@ def __repr__(self):
6161
header = "OpenML Evaluation"
6262
header = '{}\n{}\n'.format(header, '=' * len(header))
6363

64-
base_url = "{}".format(openml.config.server[:-len('api/v1/xml')])
6564
fields = {"Upload Date": self.upload_time,
6665
"Run ID": self.run_id,
67-
"OpenML Run URL": "{}r/{}".format(base_url, self.run_id),
66+
"OpenML Run URL": openml.runs.OpenMLRun.url_for_id(self.run_id),
6867
"Task ID": self.task_id,
69-
"OpenML Task URL": "{}t/{}".format(base_url, self.task_id),
68+
"OpenML Task URL": openml.tasks.OpenMLTask.url_for_id(self.task_id),
7069
"Flow ID": self.flow_id,
71-
"OpenML Flow URL": "{}f/{}".format(base_url, self.flow_id),
70+
"OpenML Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id),
7271
"Setup ID": self.setup_id,
7372
"Data ID": self.data_id,
7473
"Data Name": self.data_name,
75-
"OpenML Data URL": "{}d/{}".format(base_url, self.data_id),
74+
"OpenML Data URL": openml.datasets.OpenMLDataset.url_for_id(self.data_id),
7675
"Metric Used": self.function,
7776
"Result": self.value}
7877

0 commit comments

Comments
 (0)