Skip to content

Commit c5f11cf

Browse files
security added for get_dor_uris_from_obj
1 parent e0eb195 commit c5f11cf

4 files changed

Lines changed: 153 additions & 21 deletions

File tree

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from energyml.utils.epc_utils import get_dor_uris_from_obj
2+
from energyml.utils.introspection import get_obj_uri, search_attribute_matching_type_with_path
3+
from energyml.utils.serialization import (
4+
serialize_xml,
5+
read_energyml_xml_str,
6+
read_energyml_xml_file,
7+
read_energyml_xml_bytes,
8+
read_energyml_json_str,
9+
read_energyml_json_bytes,
10+
JSON_VERSION,
11+
)
12+
13+
14+
def test_as_uri(xml_path: str):
    """
    Print the URIs that can be derived from an energyml XML file.

    Three sections are printed, separated by a line of ``=``:

    1. every URI returned by :func:`get_dor_uris_from_obj` for the object,
    2. the object's ``category_lookup`` attribute and the URI computed from it,
    3. every ``(path, value)`` pair returned by
       :func:`search_attribute_matching_type_with_path` for the type pattern
       ``"DataObjectreference"``, with the URI computed from each value.

    :param xml_path: path of the energyml XML file to read
    :return: None
    """
    separator = "=" * 40
    obj = read_energyml_xml_file(xml_path)

    # Section 1: URIs of the references found in the object.
    for dor_uri in get_dor_uris_from_obj(obj):
        print(dor_uri)

    # Section 2: the category lookup and its URI.
    print(separator)
    print(obj.category_lookup)
    print(get_obj_uri(obj.category_lookup))

    # Section 3: every matching attribute with its path and URI.
    print(separator)
    matches = search_attribute_matching_type_with_path(obj, "DataObjectreference")
    for attribute_path, attribute_value in matches:
        print(f"{attribute_path}: {attribute_value} ({get_obj_uri(attribute_value)})\n")
28+
29+
30+
if __name__ == "__main__":
    # Another sample file that can be used instead:
    #   "rc/ContinuousProperty_1d34249c-4c4f-4705-870e-b5dea9c0d78e.xml"
    sample_xml_path = "rc/DiscreteProperty.xml"
    test_as_uri(sample_xml_path)

energyml-utils/src/energyml/utils/epc.py

Lines changed: 116 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -17,18 +17,15 @@
1717
from functools import wraps
1818
from io import BytesIO
1919
from typing import List, Any, Union, Dict, Optional
20+
import numpy as np
21+
from xsdata.formats.dataclass.models.generics import DerivedElement
2022

2123
from energyml.opc.opc import (
2224
CoreProperties,
2325
Relationships,
2426
Types,
25-
Default,
2627
Relationship,
2728
Override,
28-
Created,
29-
Creator,
30-
Identifier,
31-
Keywords1,
3229
)
3330
from energyml.utils.epc_utils import (
3431
gen_core_props_path,
@@ -38,12 +35,9 @@
3835
create_h5_external_relationship,
3936
)
4037
from energyml.utils.storage_interface import DataArrayMetadata, EnergymlStorageInterface, ResourceMetadata
41-
import numpy as np
4238
from energyml.utils.uri import Uri, parse_uri
43-
from xsdata.formats.dataclass.models.generics import DerivedElement
4439

4540
from energyml.utils.constants import (
46-
RELS_CONTENT_TYPE,
4741
EpcExportVersion,
4842
RawFile,
4943
EPCRelsRelationshipType,
@@ -62,8 +56,6 @@
6256
get_obj_uuid,
6357
get_content_type_from_class,
6458
get_direct_dor_list,
65-
epoch_to_date,
66-
epoch,
6759
gen_uuid,
6860
get_obj_identifier,
6961
get_object_attribute,
@@ -80,6 +72,114 @@
8072
from energyml.utils.xml import is_energyml_content_type
8173

8274

75+
class EnergymlObjectCollection:
    """
    Insertion-ordered collection of energyml objects with dict-backed lookups.

    The collection offers the list operations implemented below (``append``,
    ``extend``, ``remove``, iteration, ``len``, indexing and truthiness) and
    keeps three indices in sync on every mutation:

    - identifier (result of ``get_obj_identifier``) -> object
    - URI string (``str(get_obj_uri(obj))``) -> object
    - uuid (result of ``get_obj_uuid``) -> list of objects sharing that uuid

    Appending an object whose identifier is already present replaces the stored
    object in place (same position) instead of adding a second entry.

    NOTE: index keys are computed when an object is appended. Changing the
    uuid/version of an object that is already stored is not tracked.
    """

    def __init__(self, objects: Optional[List[Any]] = None):
        """
        :param objects: optional initial objects, added one by one with :meth:`append`
        """
        self._by_identifier: Dict[str, Any] = {}
        self._by_uri: Dict[str, Any] = {}
        self._by_uuid: Dict[str, List[Any]] = {}
        self._objects_list: List[Any] = []

        if objects:
            self.extend(objects)

    def _position_of(self, stored: Any) -> int:
        """
        Return the position of ``stored`` in the ordered list.

        The search is done by identity first so that no ``==`` comparison
        between (potentially large) objects is needed; it falls back to the
        equality-based ``list.index`` if no identical element is found.

        :param stored: an object taken from one of the indices
        :return: the index of the object in the ordered list
        :raises ValueError: if the object is not in the list
        """
        for position, candidate in enumerate(self._objects_list):
            if candidate is stored:
                return position
        return self._objects_list.index(stored)

    def _unindex(self, stored: Any) -> None:
        """
        Remove ``stored`` from the URI and uuid indices.

        The identifier index is left untouched: callers either overwrite the
        entry (replacement) or delete it themselves (removal).

        :param stored: the object to drop from the secondary indices
        """
        self._by_uri.pop(str(get_obj_uri(stored)), None)

        uuid = get_obj_uuid(stored)
        bucket = self._by_uuid.get(uuid)
        if bucket is None:
            return
        # Filter by identity: only this exact instance must leave the bucket.
        bucket[:] = [candidate for candidate in bucket if candidate is not stored]
        if not bucket:
            del self._by_uuid[uuid]

    def append(self, obj: Any) -> None:
        """
        Add an object to the collection (list-compatible method).

        If an object with the same identifier is already stored, it is replaced
        at its current position and its index entries are refreshed.

        :param obj: the energyml object to add
        """
        identifier = get_obj_identifier(obj)
        uri = str(get_obj_uri(obj))
        uuid = get_obj_uuid(obj)

        if identifier in self._by_identifier:
            # Same identifier: swap the stored object in place and forget its
            # old URI/uuid entries before indexing the new one.
            existing = self._by_identifier[identifier]
            self._objects_list[self._position_of(existing)] = obj
            self._unindex(existing)
        else:
            self._objects_list.append(obj)

        self._by_identifier[identifier] = obj
        self._by_uri[uri] = obj

        bucket = self._by_uuid.setdefault(uuid, [])
        if not any(candidate is obj for candidate in bucket):
            bucket.append(obj)

    def extend(self, objects) -> None:
        """
        Add every object of an iterable, in order (list-compatible method).

        :param objects: iterable of energyml objects, each passed to :meth:`append`
        """
        for obj in objects:
            self.append(obj)

    def remove(self, obj: Any) -> None:
        """
        Remove an object from the collection (list-compatible method).

        The object to remove is looked up by the identifier of ``obj``, so an
        equivalent instance can be passed. Unlike ``list.remove``, nothing
        happens (no ``ValueError``) when no object with that identifier is stored.

        :param obj: the object (or an object with the same identifier) to remove
        """
        identifier = get_obj_identifier(obj)
        if identifier not in self._by_identifier:
            return

        stored = self._by_identifier[identifier]
        del self._objects_list[self._position_of(stored)]
        del self._by_identifier[identifier]
        self._unindex(stored)

    def get_by_identifier(self, identifier: Union[str, "Uri"]) -> Optional[Any]:
        """
        Get an object by identifier or by URI (dict lookups).

        :param identifier: an identifier, a URI, or the str representation of a URI
        :return: the matching object, or None if neither index knows the key
        """
        key = str(identifier)
        found = self._by_identifier.get(key)
        if found is not None:
            return found
        # Not a known identifier: the key may be a URI string.
        return self._by_uri.get(key)

    def get_by_uuid(self, uuid: str) -> List[Any]:
        """
        Get all objects with this uuid (dict lookup).

        :param uuid: the uuid to search for
        :return: a new list (possibly empty); mutating it does not affect the collection
        """
        # Copy so that callers cannot corrupt the internal uuid index.
        return list(self._by_uuid.get(uuid, ()))

    def __iter__(self):
        """Iterate over objects in insertion order."""
        return iter(self._objects_list)

    def __len__(self) -> int:
        """Get number of objects."""
        return len(self._objects_list)

    def __getitem__(self, index: int) -> Any:
        """Support indexing (e.g., energyml_objects[0])."""
        return self._objects_list[index]

    def __bool__(self) -> bool:
        """Support boolean checks (e.g., if energyml_objects:)."""
        return bool(self._objects_list)
181+
182+
83183
def log_timestamp(func):
84184
"""Decorator to log timestamps for function execution."""
85185

@@ -134,8 +234,8 @@ class Epc(EnergymlStorageInterface):
134234
core_props: Optional[CoreProperties] = field(default=None)
135235

136236
""" xml files referred in the [Content_Types].xml """
137-
energyml_objects: List = field(
138-
default_factory=list,
237+
energyml_objects: EnergymlObjectCollection = field(
238+
default_factory=EnergymlObjectCollection,
139239
)
140240

141241
""" Other files content like pdf etc """
@@ -564,20 +664,16 @@ def get_object_by_uuid(self, uuid: str) -> List[Any]:
564664
:param uuid:
565665
:return:
566666
"""
567-
return list(filter(lambda o: get_obj_uuid(o) == uuid, self.energyml_objects))
667+
return self.energyml_objects.get_by_uuid(uuid)
568668

569669
def get_object_by_identifier(self, identifier: Union[str, Uri]) -> Optional[Any]:
570670
"""
571671
Search an object by its identifier.
572672
:param identifier: given by the function :func:`get_obj_identifier`, or a URI (or its str representation)
573673
:return:
574674
"""
575-
is_uri = isinstance(identifier, Uri) or parse_uri(identifier) is not None
576-
id_str = str(identifier)
577-
for o in self.energyml_objects:
578-
if (get_obj_identifier(o) if not is_uri else str(get_obj_uri(o))) == id_str:
579-
return o
580-
return None
675+
# Use the O(1) dict lookup from the collection
676+
return self.energyml_objects.get_by_identifier(identifier)
581677

582678
def get_object(self, identifier: Union[str, Uri]) -> Optional[Any]:
583679
return self.get_object_by_identifier(identifier)
@@ -898,7 +994,7 @@ def read_stream(cls, epc_file_io: BytesIO): # returns an Epc instance
898994
)
899995

900996
return Epc(
901-
energyml_objects=obj_list,
997+
energyml_objects=EnergymlObjectCollection(obj_list),
902998
raw_files=raw_file_list,
903999
core_props=core_props,
9041000
additional_rels=additional_rels,

energyml-utils/src/energyml/utils/epc_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -721,7 +721,7 @@ def get_dor_uris_from_obj(obj: Any) -> Set[Uri]:
721721
for dor in dor_list:
722722
try:
723723
uri = get_obj_uri(dor)
724-
if uri:
724+
if uri and uri.is_object_uri():
725725
uri_set.add(uri)
726726
except Exception as e:
727727
logging.warning(f"Failed to extract uri from DOR: {e}")

energyml-utils/tests/test_introspection.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -724,6 +724,10 @@ def test_get_obj_uri(triangulated_set_no_version, fault_interpretation):
724724
"""Test URI generation for energyml objects."""
725725
uri_str = str(get_obj_uri(triangulated_set_no_version))
726726
assert uri_str == f"eml:///resqml22.TriangulatedSetRepresentation({triangulated_set_no_version.uuid})"
727+
assert (
728+
str(get_obj_uri(as_dor(triangulated_set_no_version)))
729+
== f"eml:///resqml22.TriangulatedSetRepresentation({triangulated_set_no_version.uuid})"
730+
)
727731

728732
uri_str_with_dataspace = str(get_obj_uri(triangulated_set_no_version, "/MyDataspace/"))
729733
assert (

0 commit comments

Comments
 (0)