Skip to content

Commit ed8a0e3

Browse files
forcing get_title in epc_stream for the metadata
1 parent 6fd4694 commit ed8a0e3

2 files changed

Lines changed: 131 additions & 22 deletions

File tree

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
from energyml.utils.epc_stream import EpcStreamReader, RelsUpdateMode
2+
from datetime import datetime
3+
4+
5+
def list_epc_classical(epc_file):
    """Print every object listed in an EPC file, the object count, and the elapsed time.

    Args:
        epc_file: Path of the EPC file handed to ``EpcStreamReader``.
    """
    epc = EpcStreamReader(epc_file, rels_update_mode=RelsUpdateMode.MANUAL)

    time_start = datetime.now()
    # List once and reuse the result: a second list_objects() call inside the
    # timed region would double the measured listing work.
    objects = epc.list_objects()
    for obj in objects:
        print(f"Object: {obj}")
    print(len(objects))
    time_end = datetime.now()
    print(f"Time taken: {time_end - time_start}")
15+
16+
17+
def list_epc_fast(epc_file):
    """Time ``list_objects_parallel()`` on an EPC file and print the result count.

    Args:
        epc_file: Path of the EPC file handed to ``EpcStreamReader``.
    """
    reader = EpcStreamReader(epc_file, rels_update_mode=RelsUpdateMode.MANUAL)

    started_at = datetime.now()
    # Only the number of listed objects is reported, not the objects themselves.
    listed = reader.list_objects_parallel()
    print(len(listed))
    elapsed = datetime.now() - started_at
    print(f"Time taken: {elapsed}")
30+
31+
32+
def list_epc_seq(epc_file):
    """Time ``list_objects_seq()`` on an EPC file and print the result count.

    Args:
        epc_file: Path of the EPC file handed to ``EpcStreamReader``.
    """
    reader = EpcStreamReader(epc_file, rels_update_mode=RelsUpdateMode.MANUAL)

    started_at = datetime.now()
    # Only the number of listed objects is reported, not the objects themselves.
    listed = reader.list_objects_seq()
    print(len(listed))
    elapsed = datetime.now() - started_at
    print(f"Time taken: {elapsed}")
45+
46+
47+
if __name__ == "__main__":
    import sys

    # Benchmark target: the first command-line argument when one is given,
    # otherwise the original developer-local default path.
    epc_file = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "D:/Geosiris/Clients/BRGM/git/pointset-extraction/rc/output/full-local/full-local.epc"
    )

    print("Listing EPC contents (classical method):")
    list_epc_classical(epc_file)

    # Alternative benchmarks, left disabled as in the original script:
    # print("\nListing EPC contents (fast method):")
    # list_epc_fast(epc_file)

    # print("\nListing EPC contents (sequential method):")
    # list_epc_seq(epc_file)

energyml-utils/src/energyml/utils/epc_stream.py

Lines changed: 75 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -680,37 +680,52 @@ def _process_energyml_object_metadata(self, zf: zipfile.ZipFile, override: Overr
680680
file_path = override.part_name.lstrip("/")
681681
content_type = override.content_type
682682

683+
uuid, version, title, last_changed = None, None, None, None
684+
683685
try:
684686
# First try to extract UUID and version from file path (works for EXPANDED mode)
685687
uuid, version = extract_uuid_and_version_from_obj_path(file_path)
686688

687689
# For CLASSIC mode, version is not in the path, so we need to extract it from XML content
688-
if uuid and version is None:
689-
try:
690-
# Read first chunk of XML to extract version without full parsing
691-
with zf.open(file_path) as f:
692-
chunk = f.read(2048) # 2KB should be enough for root element
693-
self.stats.bytes_read += len(chunk)
694-
chunk_str = chunk.decode("utf-8", errors="ignore")
695-
696-
# Extract version if present
697-
version_patterns = [
698-
r'object[Vv]ersion["\']?\s*[:=]\s*["\']([^"\']+)',
699-
]
690+
# => Finally I do it anyway to get the title.
691+
try:
692+
# Read first chunk of XML to extract version, title, and last_changed in one regex search
693+
# with self.zip_accessor.get_zip_file() as f:
694+
chunk = zf.read(4096) # 4KB to increase chance of catching citation block
695+
self.stats.bytes_read += len(chunk)
696+
chunk_str = chunk.decode("utf-8", errors="ignore")
697+
698+
# Single regex with named groups for version, title, and last_changed
699+
pattern = re.compile(
700+
r'object[Vv]ersion["\']?\s*[:=]\s*["\'](?P<version>[^"\']+)' # version attribute
701+
r"|<eml:Title>(?P<title>.*?)</eml:Title>" # eml:Title tag
702+
r"|<eml:LastUpdate>(?P<last_changed>.*?)</eml:LastUpdate>",
703+
re.DOTALL,
704+
)
700705

701-
for pattern in version_patterns:
702-
version_match = re.search(pattern, chunk_str)
703-
if version_match:
704-
version = version_match.group(1)
705-
if not isinstance(version, str):
706-
version = str(version)
707-
break
708-
except Exception as e:
709-
logging.debug(f"Failed to extract version from XML content for {file_path}: {e}")
706+
# Iterate all matches and assign the first found for each group
707+
found = {"version": None, "title": None, "last_changed": None}
708+
for match in pattern.finditer(chunk_str):
709+
for key in found:
710+
if found[key] is None and match.group(key) is not None:
711+
found[key] = match.group(key).strip()
712+
if version is None and found["version"] is not None:
713+
version = found["version"]
714+
if found["title"] is not None:
715+
title = found["title"]
716+
if found["last_changed"] is not None:
717+
last_changed = found["last_changed"]
718+
# Try to parse as datetime if possible
719+
try:
720+
last_changed = date_to_datetime(last_changed)
721+
except Exception:
722+
pass
723+
except Exception as e:
724+
logging.debug(f"Failed to extract version/title/last_update from XML content for {file_path}: {e}")
710725

711726
if uuid: # Only process if we successfully extracted UUID
712727
uri = create_uri_from_content_type_or_qualified_type(ct_or_qt=content_type, uuid=uuid, version=version)
713-
metadata = EpcObjectMetadata(uri=uri)
728+
metadata = EpcObjectMetadata(uri=uri, title=title, last_changed=last_changed)
714729

715730
# Store in indexes
716731
identifier = metadata.identifier
@@ -2055,6 +2070,44 @@ def list_objects(
20552070
) -> List[ResourceMetadata]:
20562071
return [m.to_resource_metadata() for m in self._metadata_mgr.list_metadata(qualified_type_filter=object_type)]
20572072

2073+
def list_objects_parallel(
    self, dataspace: Optional[str] = None, object_type: Optional[str] = None, *, max_workers: int = 8
) -> List[ResourceMetadata]:
    """List resource metadata for the objects that can actually be loaded, using threads.

    The metadata entries come from
    ``self._metadata_mgr.list_metadata(qualified_type_filter=object_type)``.
    ``self.get_object`` is called for each entry's identifier on a thread pool,
    and an entry is kept only when that call returns a non-``None`` object.
    Entries whose load raises are logged at debug level and skipped.

    Args:
        dataspace: Accepted for signature parity with ``list_objects``; not used here.
        object_type: Forwarded as ``qualified_type_filter`` to the metadata manager.
        max_workers: Size of the thread pool (defaults to the previous fixed value, 8).

    Returns:
        The ``to_resource_metadata()`` result of each successfully loaded entry,
        in the same order as the metadata listing.
    """
    from concurrent.futures import ThreadPoolExecutor

    metadata_list = self._metadata_mgr.list_metadata(qualified_type_filter=object_type)
    resource_metadata_list = []
    # NOTE(review): assumes self.get_object is safe to call from several threads
    # at once -- confirm against the reader's zip/cache handling.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit in listing order and collect in that same order, so the output
        # is deterministic instead of depending on which load finishes first.
        pending = [(m, executor.submit(self.get_object, m.identifier)) for m in metadata_list]
        for metadata, future in pending:
            try:
                if future.result() is not None:
                    resource_metadata_list.append(metadata.to_resource_metadata())
            except Exception as e:
                logging.debug(f"Failed to get object for metadata {metadata.identifier}: {e}")
    return resource_metadata_list
2095+
2096+
def list_objects_seq(
    self, dataspace: Optional[str] = None, object_type: Optional[str] = None
) -> List[ResourceMetadata]:
    """List resource metadata for the objects that can actually be loaded, one at a time.

    Each entry from ``self._metadata_mgr.list_metadata(qualified_type_filter=object_type)``
    is loaded through ``self.get_object``. Entries that load to ``None`` are
    dropped; entries whose processing raises are logged at debug level and dropped.

    Args:
        dataspace: Present in the signature but not used by this method.
        object_type: Forwarded as ``qualified_type_filter`` to the metadata manager.

    Returns:
        The ``to_resource_metadata()`` result of each successfully loaded entry.
    """
    loaded = []
    for entry in self._metadata_mgr.list_metadata(qualified_type_filter=object_type):
        try:
            if self.get_object(entry.identifier) is None:
                continue
            # Conversion stays inside the try so its failures are also logged and skipped.
            loaded.append(entry.to_resource_metadata())
        except Exception as e:
            logging.debug(f"Failed to get object for metadata {entry.identifier}: {e}")
    return loaded
2110+
20582111
def get_obj_rels(self, obj: Union[str, Uri, Any]) -> List[Relationship]:
20592112
_id = self._id_from_uri_or_identifier(obj)
20602113

0 commit comments

Comments
 (0)