|
53 | 53 | OptimizedRegex, |
54 | 54 | ) |
55 | 55 | from .data.datasets_io import ( |
56 | | - HDF5FileReader, |
57 | | - HDF5FileWriter, |
| 56 | + get_handler_registry, |
58 | 57 | read_external_dataset_array, |
59 | 58 | ) |
60 | 59 | from .exception import UnparsableFile |
@@ -329,20 +328,26 @@ def export_io(self) -> BytesIO: |
329 | 328 |
|
330 | 329 | return zip_buffer |
331 | 330 |
|
332 | | - def get_obj_rels(self, obj: Any) -> Optional[Relationships]: |
| 331 | + def get_obj_rels(self, obj: Union[str, Uri, Any]) -> List[Relationship]: |
333 | 332 | """ |
334 | | - Get the Relationships object for a given energyml object |
335 | | - :param obj: |
336 | | - :return: |
| 333 | + Get the relationships for a given energyml object |
| 334 | + :param obj: The object identifier/URI or the object itself |
| 335 | + :return: List of Relationship objects |
337 | 336 | """ |
| 337 | + # Convert identifier to object if needed |
| 338 | + if isinstance(obj, str) or isinstance(obj, Uri): |
| 339 | + obj = self.get_object_by_identifier(obj) |
| 340 | + if obj is None: |
| 341 | + return [] |
| 342 | + |
338 | 343 | rels_path = gen_rels_path( |
339 | 344 | energyml_object=obj, |
340 | 345 | export_version=self.export_version, |
341 | 346 | ) |
342 | 347 | all_rels = self.compute_rels() |
343 | 348 | if rels_path in all_rels: |
344 | | - return all_rels[rels_path] |
345 | | - return None |
| 349 | + return all_rels[rels_path].relationship if all_rels[rels_path].relationship else [] |
| 350 | + return [] |
346 | 351 |
|
347 | 352 | def compute_rels(self) -> Dict[str, Relationships]: |
348 | 353 | """ |
@@ -576,65 +581,112 @@ def read_external_array( |
576 | 581 | epc=self, |
577 | 582 | ) |
578 | 583 |
|
579 | | - def read_array(self, proxy: Union[str, Uri, Any], path_in_external: str) -> Optional[np.ndarray]: |
| 584 | + def read_array( |
| 585 | + self, |
| 586 | + proxy: Union[str, Uri, Any], |
| 587 | + path_in_external: str, |
| 588 | + start_indices: Optional[List[int]] = None, |
| 589 | + counts: Optional[List[int]] = None, |
| 590 | + external_uri: Optional[str] = None, |
| 591 | + ) -> Optional[np.ndarray]: |
| 592 | + """ |
| 593 | + Read a data array from external storage (HDF5, Parquet, CSV, etc.) with optional sub-selection. |
| 594 | +
|
| 595 | + :param proxy: The object identifier/URI or the object itself that references the array |
| 596 | + :param path_in_external: Path within the external file (e.g., 'values/0') |
| 597 | + :param start_indices: Optional start index for each dimension (RESQML v2.2 StartIndex) |
| 598 | + :param counts: Optional count of elements for each dimension (RESQML v2.2 Count) |
| 599 | + :param external_uri: Optional URI to override default file path (RESQML v2.2 URI) |
| 600 | + :return: The data array as a numpy array, or None if not found |
| 601 | + """ |
580 | 602 | obj = proxy |
581 | 603 | if isinstance(proxy, str) or isinstance(proxy, Uri): |
582 | 604 | obj = self.get_object_by_identifier(proxy) |
583 | 605 |
|
584 | | - h5_path = self.get_h5_file_paths(obj) |
585 | | - h5_reader = HDF5FileReader() |
| 606 | + # Determine which external files to use |
| 607 | + file_paths = [external_uri] if external_uri else self.get_h5_file_paths(obj) |
| 608 | + if not file_paths or len(file_paths) == 0: |
| 609 | + file_paths = self.external_files_path |
586 | 610 |
|
587 | | - if h5_path is None or len(h5_path) == 0: |
588 | | - for h5_path in self.external_files_path: |
589 | | - try: |
590 | | - return h5_reader.read_array(source=h5_path, path_in_external_file=path_in_external) |
591 | | - except Exception: |
592 | | - pass |
593 | | - # logging.error(f"Failed to read HDF5 dataset from {h5_path}: {e}") |
594 | | - else: |
595 | | - for h5p in h5_path: |
596 | | - try: |
597 | | - return h5_reader.read_array(source=h5p, path_in_external_file=path_in_external) |
598 | | - except Exception: |
599 | | - pass |
600 | | - # logging.error(f"Failed to read HDF5 dataset from {h5p}: {e}") |
| 611 | + if not file_paths: |
| 612 | + logging.warning(f"No external file paths found for proxy: {proxy}") |
| 613 | + return None |
| 614 | + |
| 615 | + # Get the file handler registry |
| 616 | + handler_registry = get_handler_registry() |
| 617 | + |
| 618 | + for file_path in file_paths: |
| 619 | + # Get the appropriate handler for this file type |
| 620 | + handler = handler_registry.get_handler_for_file(file_path) |
| 621 | + if handler is None: |
| 622 | + logging.debug(f"No handler found for file: {file_path}") |
| 623 | + continue |
| 624 | + |
| 625 | + try: |
| 626 | + # Use handler to read array with sub-selection support |
| 627 | + array = handler.read_array(file_path, path_in_external, start_indices, counts) |
| 628 | + if array is not None: |
| 629 | + return array |
| 630 | + except Exception as e: |
| 631 | + logging.debug(f"Failed to read dataset from {file_path}: {e}") |
| 632 | + pass |
| 633 | + |
| 634 | + logging.error(f"Failed to read array from any available file paths: {file_paths}") |
601 | 635 | return None |
602 | 636 |
|
603 | 637 | def write_array( |
604 | | - self, proxy: Union[str, Uri, Any], path_in_external: str, array: Any, in_memory: bool = False |
| 638 | + self, |
| 639 | + proxy: Union[str, Uri, Any], |
| 640 | + path_in_external: str, |
| 641 | + array: np.ndarray, |
| 642 | + start_indices: Optional[List[int]] = None, |
| 643 | + external_uri: Optional[str] = None, |
| 644 | + **kwargs, |
605 | 645 | ) -> bool: |
606 | 646 | """ |
607 | | - Write a dataset in the HDF5 file linked to the proxy object. |
608 | | - :param proxy: the object or its identifier |
609 | | - :param path_in_external: the path in the external file |
610 | | - :param array: the data to write |
611 | | - :param in_memory: if True, write in the in-memory HDF5 files (epc.h5_io_files) |
612 | | -
|
613 | | - :return: True if successful |
| 647 | + Write a data array to external storage (HDF5, Parquet, CSV, etc.) with optional offset. |
| 648 | +
|
| 649 | + :param proxy: The object identifier/URI or the object itself that references the array |
| 650 | + :param path_in_external: Path within the external file (e.g., 'values/0') |
| 651 | + :param array: The numpy array to write |
| 652 | + :param start_indices: Optional start index for each dimension for partial writes |
| 653 | + :param external_uri: Optional URI to override default file path (RESQML v2.2 URI) |
| 654 | + :param kwargs: Additional format-specific parameters (e.g., dtype, column_titles) |
| 655 | + :return: True if successfully written, False otherwise |
614 | 656 | """ |
615 | 657 | obj = proxy |
616 | 658 | if isinstance(proxy, str) or isinstance(proxy, Uri): |
617 | 659 | obj = self.get_object_by_identifier(proxy) |
618 | 660 |
|
619 | | - h5_path = self.get_h5_file_paths(obj) |
620 | | - h5_writer = HDF5FileWriter() |
| 661 | + # Determine which external files to use |
| 662 | + file_paths = [external_uri] if external_uri else self.get_h5_file_paths(obj) |
| 663 | + if not file_paths or len(file_paths) == 0: |
| 664 | + file_paths = self.external_files_path |
621 | 665 |
|
622 | | - if in_memory or h5_path is None or len(h5_path) == 0: |
623 | | - for h5_path in self.external_files_path: |
624 | | - try: |
625 | | - h5_writer.write_array(target=h5_path, path_in_external_file=path_in_external, array=array) |
626 | | - return True |
627 | | - except Exception: |
628 | | - pass |
629 | | - # logging.error(f"Failed to write HDF5 dataset to {h5_path}: {e}") |
| 666 | + if not file_paths: |
| 667 | + logging.warning(f"No external file paths found for proxy: {proxy}") |
| 668 | + return False |
| 669 | + |
| 670 | + # Get the file handler registry |
| 671 | + handler_registry = get_handler_registry() |
| 672 | + |
| 673 | + # Try to write to the first available file |
| 674 | + for file_path in file_paths: |
| 675 | + # Get the appropriate handler for this file type |
| 676 | + handler = handler_registry.get_handler_for_file(file_path) |
| 677 | + if handler is None: |
| 678 | + logging.debug(f"No handler found for file: {file_path}") |
| 679 | + continue |
630 | 680 |
|
631 | | - for h5p in h5_path: |
632 | 681 | try: |
633 | | - h5_writer.write_array(target=h5p, path_in_external_file=path_in_external, array=array) |
634 | | - return True |
635 | | - except Exception: |
636 | | - pass |
637 | | - # logging.error(f"Failed to write HDF5 dataset to {h5p}: {e}") |
| 682 | + # Use handler to write array with optional partial write support |
| 683 | + success = handler.write_array(file_path, array, path_in_external, start_indices, **kwargs) |
| 684 | + if success: |
| 685 | + return True |
| 686 | + except Exception as e: |
| 687 | + logging.error(f"Failed to write dataset to {file_path}: {e}") |
| 688 | + |
| 689 | + logging.error(f"Failed to write array to any available file paths: {file_paths}") |
638 | 690 | return False |
639 | 691 |
|
640 | 692 | # Class methods |
@@ -804,14 +856,76 @@ def delete_object(self, identifier: Union[str, Any]) -> bool: |
804 | 856 | return False |
805 | 857 |
|
806 | 858 | def get_array_metadata( |
807 | | - self, proxy: str | Uri | Any, path_in_external: str | None = None |
808 | | - ) -> DataArrayMetadata | List[DataArrayMetadata] | None: |
809 | | - array = self.read_array(proxy=proxy, path_in_external=path_in_external) |
810 | | - if array is not None: |
811 | | - if isinstance(array, np.ndarray): |
812 | | - return DataArrayMetadata.from_numpy_array(path_in_resource=path_in_external, array=array) |
813 | | - elif isinstance(array, list): |
814 | | - return DataArrayMetadata.from_list(path_in_resource=path_in_external, data=array) |
| 859 | + self, |
| 860 | + proxy: Union[str, Uri, Any], |
| 861 | + path_in_external: Optional[str] = None, |
| 862 | + start_indices: Optional[List[int]] = None, |
| 863 | + counts: Optional[List[int]] = None, |
| 864 | + ) -> Union[DataArrayMetadata, List[DataArrayMetadata], None]: |
| 865 | + """ |
| 866 | + Get metadata for data array(s) without loading the full array data. |
| 867 | + Supports RESQML v2.2 sub-array selection metadata. |
| 868 | +
|
| 869 | + :param proxy: The object identifier/URI or the object itself that references the array |
| 870 | + :param path_in_external: Optional specific path. If None, returns all array metadata for the object |
| 871 | + :param start_indices: Optional start index for each dimension (RESQML v2.2 StartIndex) |
| 872 | + :param counts: Optional count of elements for each dimension (RESQML v2.2 Count) |
| 873 | + :return: DataArrayMetadata if path specified, List[DataArrayMetadata] if no path, or None if not found |
| 874 | + """ |
| 875 | + obj = proxy |
| 876 | + if isinstance(proxy, str) or isinstance(proxy, Uri): |
| 877 | + obj = self.get_object_by_identifier(proxy) |
| 878 | + |
| 879 | + # Get possible file paths for this object |
| 880 | + file_paths = self.get_h5_file_paths(obj) |
| 881 | + if not file_paths or len(file_paths) == 0: |
| 882 | + file_paths = self.external_files_path |
| 883 | + |
| 884 | + if not file_paths: |
| 885 | + logging.warning(f"No external file paths found for proxy: {proxy}") |
| 886 | + return None |
| 887 | + |
| 888 | + # Get the file handler registry |
| 889 | + handler_registry = get_handler_registry() |
| 890 | + |
| 891 | + for file_path in file_paths: |
| 892 | + # Get the appropriate handler for this file type |
| 893 | + handler = handler_registry.get_handler_for_file(file_path) |
| 894 | + if handler is None: |
| 895 | + logging.debug(f"No handler found for file: {file_path}") |
| 896 | + continue |
| 897 | + |
| 898 | + try: |
| 899 | + # Use handler to get metadata without loading full array |
| 900 | + metadata_dict = handler.get_array_metadata(file_path, path_in_external, start_indices, counts) |
| 901 | + |
| 902 | + if metadata_dict is None: |
| 903 | + continue |
| 904 | + |
| 905 | + # Convert dict(s) to DataArrayMetadata |
| 906 | + if isinstance(metadata_dict, list): |
| 907 | + return [ |
| 908 | + DataArrayMetadata( |
| 909 | + path_in_resource=m.get("path"), |
| 910 | + array_type=m.get("dtype", "unknown"), |
| 911 | + dimensions=m.get("shape", []), |
| 912 | + start_indices=start_indices, |
| 913 | + custom_data={"size": m.get("size", 0)}, |
| 914 | + ) |
| 915 | + for m in metadata_dict |
| 916 | + ] |
| 917 | + else: |
| 918 | + return DataArrayMetadata( |
| 919 | + path_in_resource=metadata_dict.get("path"), |
| 920 | + array_type=metadata_dict.get("dtype", "unknown"), |
| 921 | + dimensions=metadata_dict.get("shape", []), |
| 922 | + start_indices=start_indices, |
| 923 | + custom_data={"size": metadata_dict.get("size", 0)}, |
| 924 | + ) |
| 925 | + except Exception as e: |
| 926 | + logging.debug(f"Failed to get metadata from file {file_path}: {e}") |
| 927 | + |
| 928 | + return None |
815 | 929 |
|
816 | 930 | def dumps_epc_content_and_files_lists(self) -> str: |
817 | 931 | """ |
|
0 commit comments