Skip to content

Commit 7644352

Browse files
using FileHandlerRegistry for epc class
1 parent 069aefa commit 7644352

1 file changed

Lines changed: 171 additions & 57 deletions

File tree

  • energyml-utils/src/energyml/utils

energyml-utils/src/energyml/utils/epc.py

Lines changed: 171 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,7 @@
5353
OptimizedRegex,
5454
)
5555
from .data.datasets_io import (
56-
HDF5FileReader,
57-
HDF5FileWriter,
56+
get_handler_registry,
5857
read_external_dataset_array,
5958
)
6059
from .exception import UnparsableFile
@@ -329,20 +328,26 @@ def export_io(self) -> BytesIO:
329328

330329
return zip_buffer
331330

def get_obj_rels(self, obj: Union[str, Uri, Any]) -> List[Relationship]:
    """
    Return the relationships computed for a given energyml object.

    :param obj: The energyml object itself, or the identifier/URI used to look it up
    :return: The Relationship entries found under the object's rels path. An empty list is
        returned when the identifier resolves to no object, when no rels entry exists for
        the object, or when the entry holds no relationships.
    """
    target = obj
    if isinstance(target, (str, Uri)):
        # An identifier/URI was given: resolve it to the actual object first
        target = self.get_object_by_identifier(target)
        if target is None:
            return []

    rels_path = gen_rels_path(
        energyml_object=target,
        export_version=self.export_version,
    )
    rels_by_path = self.compute_rels()
    if rels_path not in rels_by_path:
        return []

    found = rels_by_path[rels_path].relationship
    return found if found else []
346351

347352
def compute_rels(self) -> Dict[str, Relationships]:
348353
"""
@@ -576,65 +581,112 @@ def read_external_array(
576581
epc=self,
577582
)
578583

def read_array(
    self,
    proxy: Union[str, Uri, Any],
    path_in_external: str,
    start_indices: Optional[List[int]] = None,
    counts: Optional[List[int]] = None,
    external_uri: Optional[str] = None,
) -> Optional[np.ndarray]:
    """
    Read a data array from an external file, optionally restricted to a sub-selection.

    Candidate files are tried in order; the first handler that returns a non-None
    array wins. The reading itself is delegated to the handler that the file handler
    registry returns for each candidate file.

    :param proxy: The object identifier/URI or the object itself that references the array
    :param path_in_external: Path of the dataset inside the external file
    :param start_indices: Optional start index for each dimension, forwarded to the handler
    :param counts: Optional element count for each dimension, forwarded to the handler
    :param external_uri: Optional file location that replaces the paths resolved from the object
    :return: The array returned by the first successful handler, or None if none succeeded
    """
    if isinstance(proxy, (str, Uri)):
        obj = self.get_object_by_identifier(proxy)
    else:
        obj = proxy

    # An explicit URI takes precedence over the paths attached to the object
    if external_uri:
        candidates = [external_uri]
    else:
        candidates = self.get_h5_file_paths(obj)
    if not candidates:
        # Nothing specific to this object: fall back to the EPC-level external files
        candidates = self.external_files_path
    if not candidates:
        logging.warning(f"No external file paths found for proxy: {proxy}")
        return None

    registry = get_handler_registry()
    for candidate in candidates:
        reader = registry.get_handler_for_file(candidate)
        if reader is None:
            logging.debug(f"No handler found for file: {candidate}")
            continue

        try:
            data = reader.read_array(candidate, path_in_external, start_indices, counts)
        except Exception as e:
            # Best effort: a failing file must not prevent trying the next candidate
            logging.debug(f"Failed to read dataset from {candidate}: {e}")
            continue

        if data is not None:
            return data

    logging.error(f"Failed to read array from any available file paths: {candidates}")
    return None
602636

def write_array(
    self,
    proxy: Union[str, Uri, Any],
    path_in_external: str,
    array: np.ndarray,
    start_indices: Optional[List[int]] = None,
    external_uri: Optional[str] = None,
    **kwargs,
) -> bool:
    """
    Write a data array to an external file, optionally at an offset.

    Candidate files are tried in order and the method returns as soon as one handler
    reports success. The writing itself is delegated to the handler that the file
    handler registry returns for each candidate file.

    :param proxy: The object identifier/URI or the object itself that references the array
    :param path_in_external: Path of the dataset inside the external file
    :param array: The array to write
    :param start_indices: Optional start index for each dimension for partial writes
    :param external_uri: Optional file location that replaces the paths resolved from the object
    :param kwargs: Additional format-specific parameters forwarded to the handler.
        The legacy ``in_memory`` flag of the previous signature is still accepted here:
        it is consumed (never forwarded to the handler) and, when true, the array is
        written to ``self.external_files_path`` instead of the object's own files.
    :return: True if a handler reported a successful write, False otherwise
    """
    # Backward compatibility: the previous signature was
    # write_array(proxy, path_in_external, array, in_memory=False).
    # Consume the flag so handlers never receive an unexpected keyword argument.
    in_memory = bool(kwargs.pop("in_memory", False))
    if isinstance(start_indices, bool):
        # A legacy positional ``in_memory`` lands in ``start_indices``; a bool is
        # never a valid list of indices, so reinterpret it.
        in_memory = in_memory or start_indices
        start_indices = None

    obj = proxy
    if isinstance(proxy, (str, Uri)):
        obj = self.get_object_by_identifier(proxy)

    # Determine which external files to use: explicit URI first, then either the
    # EPC-level files (legacy in_memory behaviour) or the files attached to the object.
    if external_uri:
        file_paths = [external_uri]
    elif in_memory:
        file_paths = self.external_files_path
    else:
        file_paths = self.get_h5_file_paths(obj)
    if not file_paths:
        file_paths = self.external_files_path

    if not file_paths:
        logging.warning(f"No external file paths found for proxy: {proxy}")
        return False

    handler_registry = get_handler_registry()

    # Try each file in turn and stop at the first successful write
    for file_path in file_paths:
        handler = handler_registry.get_handler_for_file(file_path)
        if handler is None:
            logging.debug(f"No handler found for file: {file_path}")
            continue

        try:
            if handler.write_array(file_path, array, path_in_external, start_indices, **kwargs):
                return True
        except Exception as e:
            # Best effort: report the failure and keep trying the remaining files
            logging.error(f"Failed to write dataset to {file_path}: {e}")

    logging.error(f"Failed to write array to any available file paths: {file_paths}")
    return False
639691

640692
# Class methods
@@ -804,14 +856,76 @@ def delete_object(self, identifier: Union[str, Any]) -> bool:
804856
return False
805857

def get_array_metadata(
    self,
    proxy: Union[str, Uri, Any],
    path_in_external: Optional[str] = None,
    start_indices: Optional[List[int]] = None,
    counts: Optional[List[int]] = None,
) -> Union[DataArrayMetadata, List[DataArrayMetadata], None]:
    """
    Get metadata describing the data array(s) referenced by an object.

    The lookup is delegated to the handler that the file handler registry returns for
    each candidate file; the first handler that yields a non-None result is used.

    :param proxy: The object identifier/URI or the object itself that references the array
    :param path_in_external: Optional path of a specific dataset, forwarded to the handler
    :param start_indices: Optional start index for each dimension; forwarded to the handler
        and recorded on every returned DataArrayMetadata
    :param counts: Optional element count for each dimension; forwarded to the handler only
    :return: A DataArrayMetadata when the handler returns a single description, a list of
        DataArrayMetadata when it returns a list, or None if no file produced metadata
    """

    def _to_metadata(info):
        # Map one handler metadata dict onto a DataArrayMetadata instance
        return DataArrayMetadata(
            path_in_resource=info.get("path"),
            array_type=info.get("dtype", "unknown"),
            dimensions=info.get("shape", []),
            start_indices=start_indices,
            custom_data={"size": info.get("size", 0)},
        )

    if isinstance(proxy, (str, Uri)):
        obj = self.get_object_by_identifier(proxy)
    else:
        obj = proxy

    candidates = self.get_h5_file_paths(obj)
    if not candidates:
        # Nothing specific to this object: fall back to the EPC-level external files
        candidates = self.external_files_path
    if not candidates:
        logging.warning(f"No external file paths found for proxy: {proxy}")
        return None

    registry = get_handler_registry()
    for candidate in candidates:
        handler = registry.get_handler_for_file(candidate)
        if handler is None:
            logging.debug(f"No handler found for file: {candidate}")
            continue

        try:
            raw = handler.get_array_metadata(candidate, path_in_external, start_indices, counts)
            if raw is None:
                continue
            if isinstance(raw, list):
                return [_to_metadata(entry) for entry in raw]
            return _to_metadata(raw)
        except Exception as e:
            # Best effort: a failing file must not prevent trying the next candidate
            logging.debug(f"Failed to get metadata from file {candidate}: {e}")

    return None
815929

816930
def dumps_epc_content_and_files_lists(self) -> str:
817931
"""

0 commit comments

Comments
 (0)