Skip to content

Commit 096c613

Browse files
optimization
1 parent 4e3103d commit 096c613

4 files changed

Lines changed: 172 additions & 12 deletions

File tree

energyml-utils/example/attic/compare_inmem_n_stream.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -40,19 +40,19 @@ def reexport_in_memory(filepath: str, output_folder: Optional[str] = None):
4040
os.makedirs(output_folder, exist_ok=True)
4141
path_in_memory = f"{output_folder}/{path_in_memory.split('/')[-1]}"
4242
epc = Epc.read_file(epc_file_path=filepath, read_rels_from_files=False, recompute_rels=False)
43-
43+
print(len(epc.list_objects()))
4444
if os.path.exists(path_in_memory):
4545
os.remove(path_in_memory)
4646
epc.export_file(path_in_memory)
4747

4848

4949
def reexport_in_memory_par_read(filepath: str, output_folder: Optional[str] = None):
    """Read an EPC file with the parallel in-memory reader and export it again.

    The output file name is the input name with ``.epc`` replaced by
    ``_in_memory_par_read_v<N>.epc``, where ``<N>`` is the value of the
    ``EPC_FAST_V2`` environment variable.

    :param filepath: path of the EPC file to read.
    :param output_folder: optional folder (created if missing) that receives
        the exported file; when omitted the export path is derived from
        ``filepath`` itself.
    """
    # An unset variable must not crash the script: fall back to "0", the same
    # default Epc.read_file applies when choosing between the v1 and v2 readers.
    fast_version = os.environ.get("EPC_FAST_V2", "0")
    path_in_memory = filepath.replace(".epc", f"_in_memory_par_read_v{fast_version}.epc")
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
        path_in_memory = f"{output_folder}/{path_in_memory.split('/')[-1]}"

    epc = Epc.read_file(epc_file_path=filepath, read_rels_from_files=False, read_parallel=True, recompute_rels=False)
    print(len(epc.list_objects()))

    # Remove any previous export before writing the new one.
    if os.path.exists(path_in_memory):
        os.remove(path_in_memory)
    epc.export_file(path_in_memory, parallel=True)
@@ -80,13 +80,23 @@ def time_comparison(
8080
print(f" ✓ Completed in {elapsed_inmem:.3f}s\n")
8181

8282
# Test 1b: In-Memory with Parallel Read
83+
os.environ["EPC_FAST_V2"] = "0"
8384
print("⏳ Testing In-Memory EPC processing with Parallel Read...")
8485
start = time.perf_counter()
8586
reexport_in_memory_par_read(filepath, output_folder)
8687
elapsed_inmem_par = time.perf_counter() - start
8788
results.append(("In-Memory (Epc) Parallel Read", elapsed_inmem_par))
8889
print(f" ✓ Completed in {elapsed_inmem_par:.3f}s\n")
8990

91+
# Test 1c: In-Memory with Parallel Read v2
92+
os.environ["EPC_FAST_V2"] = "1"
93+
print("⏳ Testing In-Memory EPC processing with Parallel Read v2...")
94+
start = time.perf_counter()
95+
reexport_in_memory_par_read(filepath, output_folder)
96+
elapsed_inmem_par = time.perf_counter() - start
97+
results.append(("In-Memory (Epc) Parallel Read v2", elapsed_inmem_par))
98+
print(f" ✓ Completed in {elapsed_inmem_par:.3f}s\n")
99+
90100
if not skip_sequential_stream:
91101
# Test 2: Streaming Sequential
92102
print("⏳ Testing Streaming Sequential processing...")
@@ -157,13 +167,13 @@ def recompute_rels(epc_file_path: str):
157167
# output_folder="rc/performance_results",
158168
# )
159169

160-
# time_comparison(
161-
# filepath=sys.argv[1] if len(sys.argv) > 1 else "rc/epc/80wells_surf.epc", output_folder="rc/performance_results"
162-
# )
170+
time_comparison(
171+
filepath=sys.argv[1] if len(sys.argv) > 1 else "rc/epc/80wells_surf.epc", output_folder="rc/performance_results"
172+
)
163173

164174
# time_comparison(
165175
# filepath=sys.argv[1] if len(sys.argv) > 1 else "wip/failingData/fix/sample_mini_firp_201_norels_with_media.epc",
166176
# output_folder="rc/performance_results",
167177
# )
168178

169-
recompute_rels("C:/Users/Cryptaro/Downloads/Galaxy384-[[Output] EPC file pointset extraction].epc")
179+
# recompute_rels("C:/Users/Cryptaro/Downloads/Galaxy384-[[Output] EPC file pointset extraction].epc")
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Benchmark de performance pour get_obj_uuid
2+
import time
3+
import re
4+
from types import SimpleNamespace
5+
6+
# Pattern for identifier names: "uid", "Uid", "uuid", "Uuid" or "UUID".
# It is applied with ``.match`` in this file, so it is anchored at the start
# of the name only (a longer name with one of these prefixes also matches).
UUID_RGX: re.Pattern = re.compile(r"[Uu]u?id|UUID")
7+
8+
9+
# Version originale
10+
def get_obj_uuid_original(obj):
    """Return the identifier of *obj* (baseline implementation of the benchmark).

    Lookup order:
      1. the ``uuid`` attribute, when present and truthy;
      2. otherwise the ``uid`` attribute;
      3. when ``uid`` is missing and *obj* is a dict, the value of the first
         key matched by ``UUID_RGX``.

    Returns ``None`` when nothing is found.
    """
    try:
        identifier = getattr(obj, "uuid", None)
        if not identifier:
            # No default here on purpose: a missing "uid" raises AttributeError
            # and sends us to the dict fallback below.
            identifier = getattr(obj, "uid")
        return identifier
    except AttributeError:
        if not isinstance(obj, dict):
            return None
        return next((obj[key] for key in obj.keys() if UUID_RGX.match(key)), None)
19+
20+
21+
# Version optimisée
22+
def get_obj_uuid_fast(obj):
    """Return the identifier of *obj* (candidate implementation of the benchmark).

    Every name listed by ``dir(obj)`` is tested against ``UUID_RGX``; the first
    matching attribute whose value is not ``None`` is returned. If no attribute
    qualifies and *obj* is a dict, its items are scanned the same way.

    Returns ``None`` when nothing is found.
    """
    for name in dir(obj):
        if not UUID_RGX.match(name):
            continue
        candidate = getattr(obj, name, None)
        if candidate is not None:
            return candidate

    if isinstance(obj, dict):
        for key, item in obj.items():
            if UUID_RGX.match(key) and item is not None:
                return item

    return None
34+
35+
36+
# Simulation d'une classe TriangulatedSetRepresentation
37+
class TriangulatedSetRepresentation:
    """Minimal stand-in object for the benchmark: it only carries a ``uuid``."""

    def __init__(self, uuid) -> None:
        # The single attribute the benchmarked lookup functions are expected to find.
        self.uuid = uuid
40+
41+
42+
def _time_calls(func, items):
    """Return the seconds spent calling ``func`` once on every element of ``items``.

    The calls are made unconditionally: wrapping them in ``assert`` would let
    ``python -O`` strip them and the benchmark would time an empty loop.
    Results are checked against each item's ``uuid`` only after the timer has
    stopped, so the comparison cost is not part of the measurement.

    :raises AssertionError: if ``func`` returns something other than ``item.uuid``.
    """
    start = time.perf_counter()
    results = [func(item) for item in items]
    elapsed = time.perf_counter() - start

    for item, result in zip(items, results):
        if result != item.uuid:
            raise AssertionError(f"{func.__name__} returned {result!r}, expected {item.uuid!r}")
    return elapsed


N = 10000
objs = [TriangulatedSetRepresentation(f"uuid-{i}") for i in range(N)]

# Baseline implementation
elapsed_original = _time_calls(get_obj_uuid_original, objs)

# Candidate implementation
elapsed_fast = _time_calls(get_obj_uuid_fast, objs)

print(f"Original version: {elapsed_original:.6f} s for {N} calls")
print(f"Optimized version: {elapsed_fast:.6f} s for {N} calls")

energyml-utils/src/energyml/utils/epc.py

Lines changed: 95 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1553,15 +1553,22 @@ def read_file(
15531553
"""
15541554
with open(epc_file_path, "rb") as f:
15551555
if read_parallel:
1556-
epc = cls.read_stream_ultra_fast(
1557-
BytesIO(f.read()), read_rels_from_files=read_rels_from_files, recompute_rels=recompute_rels
1556+
epc = (
1557+
cls.read_stream_ultra_fast(
1558+
BytesIO(f.read()), read_rels_from_files=read_rels_from_files, recompute_rels=recompute_rels
1559+
)
1560+
if not os.environ.get("EPC_FAST_V2", "0") == "1"
1561+
else cls.read_stream_ultra_fast_v2(
1562+
BytesIO(f.read()), read_rels_from_files=read_rels_from_files, recompute_rels=recompute_rels
1563+
)
15581564
)
15591565
else:
15601566
epc = cls.read_stream(
15611567
BytesIO(f.read()), read_rels_from_files=read_rels_from_files, recompute_rels=recompute_rels
15621568
)
1563-
epc.epc_file_path = epc_file_path
1564-
return epc
1569+
if epc is not None:
1570+
epc.epc_file_path = epc_file_path
1571+
return epc
15651572
raise IOError(f"Failed to open EPC file {epc_file_path}")
15661573

15671574
@classmethod
@@ -1575,6 +1582,7 @@ def read_stream(
15751582
:param recompute_rels: If True, recompute all relationships after loading
15761583
:return: an :class:`EPC` instance
15771584
"""
1585+
print("Reading EPC file seq...")
15781586
try:
15791587
_read_files = []
15801588
obj_list = []
@@ -1727,6 +1735,8 @@ def read_stream_ultra_fast(
17271735
from concurrent.futures import ProcessPoolExecutor, as_completed
17281736
import multiprocessing
17291737

1738+
print("Reading EPC file parrallel v1...")
1739+
17301740
obj_to_process = {}
17311741
rels_to_process = {}
17321742
raw_files = []
@@ -1813,6 +1823,87 @@ def read_stream_ultra_fast(
18131823

18141824
return epc
18151825

1826+
@classmethod
def read_stream_ultra_fast_v2(
    cls, epc_file_io: BytesIO, read_rels_from_files: bool = True, recompute_rels: bool = False
) -> Optional["Epc"]:
    """Read an EPC archive from a stream, parsing its XML parts on a thread pool.

    The content-types part is read first to learn which archive entries are
    energyml objects and which one holds the core properties. Each energyml
    object (and, optionally, each ``.rels`` entry) is then submitted to a
    ``ThreadPoolExecutor`` as soon as its bytes have been read from the zip.
    Remaining entries are kept as raw files.

    A part whose parsing fails is logged and skipped instead of aborting the
    whole read.

    :param epc_file_io: in-memory EPC (zip) content
    :param read_rels_from_files: If True, parse the ``.rels`` entries and attach
        them to the matching objects
    :param recompute_rels: If True, recompute all relationships after loading
    :return: an :class:`Epc` instance
    """
    import logging
    from concurrent.futures import ThreadPoolExecutor

    log = logging.getLogger(__name__)
    log.debug("Reading EPC file parallel v2...")

    obj_list = []
    path_to_obj = {}
    rels_content_map = {}
    raw_files = []
    core_props = None

    # Threads instead of processes: nothing has to be pickled between workers.
    # NOTE(review): the original author states that the XML parser (lxml)
    # releases the GIL, which is what makes threads worthwhile - confirm.
    with ThreadPoolExecutor() as executor:
        futures = []

        with zipfile.ZipFile(epc_file_io, "r") as epc_file:
            # 1. Read the content-types index first.
            ct_path = get_epc_content_type_path()
            content_type_obj = read_energyml_xml_bytes(epc_file.read(ct_path))

            # Map every energyml part path to its content type; pick up core properties.
            energyml_paths = {}
            for ov in content_type_obj.override:
                path = ov.part_name.lstrip("/\\")
                if is_energyml_content_type(ov.content_type):
                    energyml_paths[path] = ov.content_type
                elif get_class_from_content_type(ov.content_type) == CoreProperties:
                    core_props = read_energyml_xml_bytes(epc_file.read(path), CoreProperties)

            # Loop-invariant: computed once instead of once per archive entry.
            core_props_suffix = gen_core_props_path().lower()

            for info in epc_file.infolist():
                fname = info.filename
                fname_lower = fname.lower()

                if fname in energyml_paths:
                    # Submit the parsing task as soon as the bytes are available.
                    data = epc_file.read(fname)
                    futures.append((executor.submit(_parallel_xml_read, data, energyml_paths[fname]), "OBJ", fname))
                elif fname_lower.endswith(".rels"):
                    # .rels entries are never raw files; they are parsed only on request.
                    if read_rels_from_files:
                        data = epc_file.read(fname)
                        futures.append((executor.submit(_parallel_rels_read, data), "REL", fname))
                elif not fname_lower.endswith(core_props_suffix) and fname != ct_path:
                    raw_files.append(RawFile(path=fname, content=BytesIO(epc_file.read(fname))))

        # 2. Collect the results in submission order.
        for future, kind, path in futures:
            try:
                # Future.result() re-raises whatever the worker raised.
                res = future.result()
            except Exception:
                log.exception("Failed to parse EPC part '%s', skipping it", path)
                continue
            if res is None or isinstance(res, Exception):
                # Also covers workers that report a failure through their return value.
                log.warning("No usable content for EPC part '%s' (%r), skipping it", path, res)
                continue

            if kind == "OBJ":
                path_to_obj[path] = res
                obj_list.append(res)
            else:
                # "<dir>/_rels/<name>.rels" describes the part "<dir>/<name>".
                o_path = str(Path(path).parent.parent / Path(path).stem).replace("\\", "/")
                rels_content_map[o_path] = res

    # 3. Final assembly.
    epc = Epc(energyml_objects=EnergymlObjectCollection(obj_list), raw_files=raw_files, core_props=core_props)

    if read_rels_from_files:
        for obj_path, rels_obj in rels_content_map.items():
            target_obj = path_to_obj.get(obj_path)
            if target_obj is not None:
                epc._rels_cache.set_rels_from_file(target_obj, rels_obj)  # type: ignore

    if recompute_rels:
        epc._rels_cache.recompute_cache()  # type: ignore

    return epc
1906+
18161907

18171908
# ______ __ ____ __ _
18181909
# / ____/___ ___ _________ ___ ______ ___ / / / __/_ ______ _____/ /_(_)___ ____ _____

energyml-utils/src/energyml/utils/introspection.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -909,7 +909,8 @@ def search_attribute_matching_type_with_path(
909909
elif not is_primitive(obj):
910910
for att_name in get_class_attributes(obj):
911911
res = res + search_attribute_matching_type_with_path(
912-
obj=get_object_attribute_rgx(obj, att_name),
912+
obj=get_object_attribute_no_verif(obj, att_name),
913+
# obj=get_object_attribute_rgx(obj, att_name),
913914
type_rgx=type_rgx,
914915
re_flags=re_flags,
915916
return_self=True,

0 commit comments

Comments
 (0)