Skip to content

Commit d9a4d76

Browse files
starting optimisation for mesh reading
1 parent 46d74e5 commit d9a4d76

8 files changed

Lines changed: 1766 additions & 41 deletions

File tree

energyml-utils/example/attic/arrays_test_fast.py

Lines changed: 437 additions & 0 deletions
Large diffs are not rendered by default.

energyml-utils/src/energyml/utils/data/datasets_io.py

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -616,7 +616,7 @@ def read_external_dataset_array(
616616
):
617617
if additional_sources is None:
618618
additional_sources = []
619-
result_array = []
619+
result_array = None
620620

621621
for path_in_obj, path_in_external in get_path_in_external_with_path(energyml_array):
622622
succeed = False
@@ -630,10 +630,15 @@ def read_external_dataset_array(
630630
)
631631
for s in sources:
632632
try:
633-
# TODO: take care of the "Counts" and "Starts" list in ExternalDataArrayPart to fill array correctly
634-
result_array = result_array + read_dataset(
635-
source=s, path_in_external_file=path_in_external, mimetype=mimetype
636-
)
633+
if result_array is None:
634+
result_array = read_dataset(
635+
source=s, path_in_external_file=path_in_external, mimetype=mimetype
636+
)
637+
else:
638+
# TODO: take care of the "Counts" and "Starts" list in ExternalDataArrayPart to fill array correctly
639+
result_array = result_array + read_dataset(
640+
source=s, path_in_external_file=path_in_external, mimetype=mimetype
641+
)
637642
succeed = True
638643
break # stop after the first read success
639644
except MissingExtraInstallation as mei:
@@ -855,7 +860,7 @@ def open_file_no_cache(self, file_path: str, mode: str = "r") -> Optional[Any]:
855860
try:
856861
return h5py.File(file_path, mode) # type: ignore
857862
except Exception as e:
858-
logging.error(f"Failed to open HDF5 file {file_path}: {e}")
863+
logging.debug(f"Failed to open HDF5 file {file_path}: {e}")
859864
return None
860865

861866
def read_array(
@@ -880,6 +885,41 @@ def read_array(
880885
with self.file_cache.get_or_open(source, self, "r") as f: # type: ignore
881886
return self.read_array(f, path_in_external_file, start_indices, counts)
882887

888+
def read_array_view(
    self,
    source: Union[BytesIO, str, Any],
    path_in_external_file: Optional[str] = None,
    start_indices: Optional[List[int]] = None,
    counts: Optional[List[int]] = None,
) -> Optional[np.ndarray]:
    """Read an HDF5 dataset, or a sub-selection of it, as a NumPy array.

    When both ``start_indices`` and ``counts`` are given, the selection is
    handed to h5py as one ``slice(start, start + count)`` per dimension, so
    the sub-selection is performed by h5py rather than by loading the full
    dataset and slicing it afterwards in Python. Otherwise the whole
    dataset is converted to an array.

    NOTE(review): no zero-copy guarantee is made. The conversion uses
    ``np.asarray``, which copies whenever the underlying object cannot
    expose its buffer directly; h5py dataset reads are expected to
    allocate a fresh array -- confirm against the h5py documentation
    before relying on view semantics.

    Callers should still treat the returned array as read-only.

    :param source: an already opened ``h5py.File``; any other value is
        opened through ``self.file_cache.get_or_open`` in read mode.
    :param path_in_external_file: path of the dataset inside the HDF5 file.
    :param start_indices: first index to read along each dimension.
    :param counts: number of elements to read along each dimension.
    :return: the data that was read, or ``None`` when
        ``path_in_external_file`` is empty or ``None``.
    """
    if not isinstance(source, h5py.File):  # type: ignore
        # Not an open HDF5 handle yet: open it (or reuse a cached handle)
        # and re-enter with the opened file.
        with self.file_cache.get_or_open(source, self, "r") as f:  # type: ignore
            return self.read_array_view(f, path_in_external_file, start_indices, counts)

    if not path_in_external_file:
        return None

    d_group = source[path_in_external_file]
    if start_indices is not None and counts is not None:
        # zip() stops at the shorter list, so trailing dimensions without a
        # (start, count) pair are read in full.
        slices = tuple(
            slice(start, start + count) for start, count in zip(start_indices, counts)
        )
        return d_group[slices]  # type: ignore

    # np.array(x, copy=False) raises ValueError on NumPy >= 2.0 whenever a
    # copy is unavoidable. np.asarray keeps the "copy only if needed"
    # behaviour on every NumPy version.
    return np.asarray(d_group)  # type: ignore
922+
883923
def write_array(
884924
self,
885925
target: Union[str, BytesIO, Any],

energyml-utils/src/energyml/utils/data/helper.py

Lines changed: 45 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -279,10 +279,15 @@ def apply_crs_transform(
279279
return transformed
280280

281281

282-
def get_crs_origin_offset(crs_obj: Any) -> List[float | int]:
282+
def get_crs_origin_offset(crs_obj: Any) -> np.ndarray:
283283
"""
284-
Return a list [X,Y,Z] corresponding to the crs Offset [XOffset/OriginProjectedCoordinate1, ... ] depending on the
285-
crs energyml version.
284+
Return a ``(3,) float64`` numpy array ``[X, Y, Z]`` corresponding to the
285+
CRS origin offset (``XOffset``/``OriginProjectedCoordinate1``, …) depending
286+
on the energyml version.
287+
288+
Returning an ndarray instead of a plain list avoids the ``np.asarray()``
289+
call in callers such as :func:`mesh_numpy.crs_displacement_np`.
290+
286291
:param crs_obj:
287292
:return:
288293
"""
@@ -298,17 +303,18 @@ def get_crs_origin_offset(crs_obj: Any) -> List[float | int]:
298303
if tmp_offset_z is None:
299304
tmp_offset_z = get_object_attribute_rgx(crs_obj, "OriginProjectedCoordinate3")
300305

301-
crs_point_offset = [0.0, 0.0, 0.0]
302306
try:
303-
crs_point_offset = [
304-
float(tmp_offset_x) if tmp_offset_x is not None else 0.0,
305-
float(tmp_offset_y) if tmp_offset_y is not None else 0.0,
306-
float(tmp_offset_z) if tmp_offset_z is not None else 0.0,
307-
]
307+
return np.array(
308+
[
309+
float(tmp_offset_x) if tmp_offset_x is not None else 0.0,
310+
float(tmp_offset_y) if tmp_offset_y is not None else 0.0,
311+
float(tmp_offset_z) if tmp_offset_z is not None else 0.0,
312+
],
313+
dtype=np.float64,
314+
)
308315
except Exception as e:
309316
logging.info(f"ERR reading crs offset {e}")
310-
311-
return crs_point_offset
317+
return np.zeros(3, dtype=np.float64)
312318

313319

314320
def get_datum_information(
@@ -1037,28 +1043,33 @@ def read_constant_array(
10371043
path_in_root: Optional[str] = None,
10381044
workspace: Optional[EnergymlStorageInterface] = None,
10391045
sub_indices: Optional[Union[List[int], np.ndarray]] = None,
1040-
) -> List[Any]:
1046+
) -> Union[np.ndarray, List[Any]]:
10411047
"""
1042-
Read a constant array ( BooleanConstantArray, DoubleConstantArray, FloatingPointConstantArray, IntegerConstantArray ...)
1048+
Read a constant array (BooleanConstantArray, DoubleConstantArray,
1049+
FloatingPointConstantArray, IntegerConstantArray …).
1050+
1051+
For numeric (int / float / bool) values a ``numpy.ndarray`` is returned
1052+
via :func:`numpy.full`, avoiding a Python-list allocation. String values
1053+
fall back to a plain list because numpy object arrays add no benefit.
1054+
10431055
:param energyml_array:
10441056
:param root_obj:
10451057
:param path_in_root:
10461058
:param workspace:
10471059
:param sub_indices:
10481060
:return:
10491061
"""
1050-
# logging.debug(f"Reading constant array\n\t{energyml_array}")
1051-
10521062
value = get_object_attribute_no_verif(energyml_array, "value")
10531063
count = (
10541064
len(sub_indices)
10551065
if sub_indices is not None and len(sub_indices) > 0
10561066
else get_object_attribute_no_verif(energyml_array, "count")
10571067
)
10581068

1059-
# logging.debug(f"\tValue : {[value for i in range(0, count)]}")
1060-
1061-
return [value] * count
1069+
if isinstance(value, (int, float, bool, np.integer, np.floating)):
1070+
return np.full(int(count), value)
1071+
# Non-numeric (e.g. string) — keep as Python list.
1072+
return [value] * int(count)
10621073

10631074

10641075
def read_xml_array(
@@ -1402,44 +1413,45 @@ def read_point3d_lattice_array(
14021413
# Add slowest offsets where i > 0
14031414
result_arr[1:, :, :] += slowest_cumsum[:-1, np.newaxis, :]
14041415

1405-
# Flatten to list of points
1406-
result = result_arr.reshape(-1, 3).tolist()
1416+
# Return the (N, 3) float64 numpy array directly — no .tolist().
1417+
result = result_arr.reshape(-1, 3)
14071418

14081419
except (ValueError, TypeError) as e:
1409-
# Fallback to original implementation if NumPy conversion fails
1420+
# Fallback to original implementation if NumPy conversion fails.
14101421
logging.warning(f"NumPy vectorization failed ({e}), falling back to iterative approach")
1422+
fallback: List = []
14111423
for i in range(slowest_size):
14121424
for j in range(fastest_size):
14131425
previous_value = origin
14141426

14151427
if j > 0:
14161428
if i > 0:
14171429
line_idx = i * fastest_size
1418-
previous_value = result[line_idx + j - 1]
1430+
previous_value = fallback[line_idx + j - 1]
14191431
else:
1420-
previous_value = result[j - 1]
1432+
previous_value = fallback[j - 1]
14211433
if zincreasing_downward:
1422-
result.append(sum_lists(previous_value, slowest_table[i - 1]))
1434+
fallback.append(sum_lists(previous_value, slowest_table[i - 1]))
14231435
else:
1424-
result.append(sum_lists(previous_value, fastest_table[j - 1]))
1436+
fallback.append(sum_lists(previous_value, fastest_table[j - 1]))
14251437
else:
14261438
if i > 0:
14271439
prev_line_idx = (i - 1) * fastest_size
1428-
previous_value = result[prev_line_idx]
1440+
previous_value = fallback[prev_line_idx]
14291441
if zincreasing_downward:
1430-
result.append(sum_lists(previous_value, fastest_table[j - 1]))
1442+
fallback.append(sum_lists(previous_value, fastest_table[j - 1]))
14311443
else:
1432-
result.append(sum_lists(previous_value, slowest_table[i - 1]))
1444+
fallback.append(sum_lists(previous_value, slowest_table[i - 1]))
14331445
else:
1434-
result.append(previous_value)
1446+
fallback.append(previous_value)
1447+
# Convert fallback list to ndarray to keep the return type consistent.
1448+
result = np.array(fallback, dtype=np.float64).reshape(-1, 3)
14351449
else:
14361450
raise Exception(f"{type(energyml_array)} read with an offset of length {len(offset)} is not supported")
14371451

14381452
if sub_indices is not None and len(sub_indices) > 0:
1439-
if isinstance(result, np.ndarray):
1440-
result = result[sub_indices].tolist()
1441-
else:
1442-
result = [result[idx] for idx in sub_indices]
1453+
# result is always an ndarray here; index directly without .tolist().
1454+
result = result[np.asarray(sub_indices, dtype=np.int64)]
14431455

14441456
return result
14451457

energyml-utils/src/energyml/utils/data/mesh.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ def crs_displacement(points: List[Point], crs_obj: Any) -> Tuple[List[Point], Po
172172
crs_point_offset = get_crs_origin_offset(crs_obj=crs_obj)
173173
zincreasing_downward = is_z_reversed(crs_obj)
174174

175-
if crs_point_offset != [0, 0, 0]:
175+
if np.any(crs_point_offset):
176176
for p in points:
177177
for xyz in range(len(p)):
178178
p[xyz] = (p[xyz] + crs_point_offset[xyz]) if p[xyz] is not None else None
@@ -241,7 +241,7 @@ def read_mesh_object(
241241
): # WellboreFrameRep already has the displacement applied
242242
# TODO: the displacement should be done in each reader function to manage specific cases
243243
for s in surfaces:
244-
logging.debug(f"CRS : {s.crs_object}")
244+
# logging.debug(f"CRS : {s.crs_object}")
245245
crs_displacement(
246246
s.point_list,
247247
s.crs_object[0] if isinstance(s.crs_object, list) and len(s.crs_object) > 0 else s.crs_object,

0 commit comments

Comments
 (0)