
Commit cb8d87b

[fix] fix clearing caches synchronization and add more logs (#4212)
* [fix] fix clearing caches synchronization and add more logs
* [chore] print cache_ready_signal in log
1 parent de4feff commit cb8d87b

3 files changed: 51 additions & 22 deletions
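The heart of the fix is in clear_or_update_caches (first file below): a rank could previously flip the global status to CLEARED before its peers had actually dropped their caches. A minimal sketch of the intended handshake, with a plain numpy array standing in for FastDeploy's process-shared cache_ready_signal (the commit itself performs the wait as an `if` inside its outer polling loop; the `while` here spells out the intent):

import time

import numpy as np

N_RANKS = 4
# 1 means "this rank's kv cache exists". FastDeploy shares this signal
# across processes; a local numpy array stands in for it here.
cache_ready_signal = np.ones(N_RANKS, dtype=np.int32)

def clear_caches(rank: int) -> None:
    # ... release this rank's gpu/cpu cache tensors here ...
    cache_ready_signal[rank] = 0  # publish "my cache is gone"
    # Only after every rank has published 0 is it safe to mark the
    # global kv_cache_status_signal as CLEARED; declaring it early
    # is the race this commit fixes.
    while np.sum(cache_ready_signal) != 0:
        time.sleep(0.1)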


fastdeploy/cache_manager/cache_transfer_manager.py

Lines changed: 46 additions & 17 deletions
@@ -201,12 +201,12 @@ def __init__(self, args):
     def _init_gpu_cache(self, args):
 
         if not args.create_cache_tensor:
-            logger.info("Waiting for runners to create kv cache.")
+            logger.info(f"[rank {self.rank}/{self.n_ranks}] Waiting for runners to create kv cache.")
             while self.cache_ready_signal.value[self.rank] != 1:
-                time.sleep(1)
-            logger.info("OK! Stop waiting.")
+                time.sleep(0.1)
+            logger.info(f"[rank {self.rank}/{self.n_ranks}] OK! Stop waiting.")
 
-        logger.info("Initializing kv cache for all layers.")
+        logger.info(f"[rank {self.rank}/{self.n_ranks}] Initializing kv cache for all layers.")
         paddle.set_device(f"gpu:{self.device}")
         for i in range(args.num_layers + self.num_extra_layers):
             num_gpu_blocks = args.num_gpu_blocks if i < args.num_layers else self.num_extra_layer_gpu_blocks
@@ -215,13 +215,13 @@ def _init_gpu_cache(self, args):
             val_name = f"value_caches_{i}_rank{self.rank}.device{self.device}"
 
             if args.create_cache_tensor:
-                logger.info(f"..creating kv cache for layer {i}: {cache_shape}")
+                logger.info(f"[rank {self.rank}/{self.n_ranks}] ..creating kv cache for layer {i}: {cache_shape}")
                 key_cache = paddle.full(shape=cache_shape, fill_value=0, dtype=args.cache_dtype)
                 val_cache = paddle.full(shape=cache_shape, fill_value=0, dtype=args.cache_dtype)
                 set_data_ipc(key_cache, key_name)
                 set_data_ipc(val_cache, val_name)
             else:
-                logger.info(f"..attaching kv cache for layer {i}: {cache_shape}")
+                logger.info(f"[rank {self.rank}/{self.n_ranks}] ..attaching kv cache for layer {i}: {cache_shape}")
                 key_cache = paddle.empty(shape=[], dtype=args.cache_dtype)
                 val_cache = paddle.empty(shape=[], dtype=args.cache_dtype)
                 key_cache = share_external_data(key_cache, key_name, cache_shape)
@@ -233,33 +233,37 @@ def _init_gpu_cache(self, args):
             self.gpu_cache_v_tensors.append(self.gpu_cache_kvs[val_name])
 
         if args.create_cache_tensor:
-            logger.info("✅ kv cache is ready!")
+            logger.info(f"[rank {self.rank}/{self.n_ranks}] ✅ kv cache is ready!")
             self.cache_ready_signal.value[self.rank] = 1
 
         cache_kv_size_byte = sum([tmp.numel() * 1 for key, tmp in self.gpu_cache_kvs.items()])
-        logger.info(f"device :{self.device}")
-        logger.info(f"cache_kv_size_byte : {cache_kv_size_byte}")
-        logger.info(f"done init cache (full) gmem alloc : {paddle.device.cuda.memory_allocated()}")
+        logger.info(f"[rank {self.rank}/{self.n_ranks}] device :{self.device}")
+        logger.info(f"[rank {self.rank}/{self.n_ranks}] cache_kv_size_byte : {cache_kv_size_byte}")
+        logger.info(
+            f"[rank {self.rank}/{self.n_ranks}] done init cache (full) gmem alloc : {paddle.device.cuda.memory_allocated()}"
+        )
 
     def _init_cpu_cache(self, args):
         if args.num_cpu_blocks == 0:
-            logger.info("💡 no swap space (cpu cache) is specified.")
+            logger.info(f"[rank {self.rank}/{self.n_ranks}] 💡 no swap space (cpu cache) is specified.")
             self.swap_space_ready_signal.value[self.rank] = 1
             return
-        logger.info("Initializing swap space (cpu cache) for all layers.")
+        logger.info(f"[rank {self.rank}/{self.n_ranks}] Initializing swap space (cpu cache) for all layers.")
         paddle.set_device("cpu")
         self.k_dst_ptrs = []
         self.v_dst_ptrs = []
         for i in range(args.num_layers + self.num_extra_layers):
             key_name = f"key_caches_{i}_rank{self.rank}"
             val_name = f"value_caches_{i}_rank{self.rank}"
             need_to_allocate_bytes = args.num_cpu_blocks * args.bytes_per_layer_per_block
-            logger.info(f"..creating cpu cache for layer {i}: {2 * need_to_allocate_bytes / 1024 ** 3:.2f}GB")
+            logger.info(
+                f"[rank {self.rank}/{self.n_ranks}] ..creating cpu cache for layer {i}: {2 * need_to_allocate_bytes / 1024 ** 3:.2f}GB"
+            )
             self.cpu_cache_kvs[key_name] = cuda_host_alloc(need_to_allocate_bytes)
             self.k_dst_ptrs.append(self.cpu_cache_kvs[key_name])
             self.cpu_cache_kvs[val_name] = cuda_host_alloc(need_to_allocate_bytes)
             self.v_dst_ptrs.append(self.cpu_cache_kvs[val_name])
-        logger.info("✅ swap space (cpu cache) is ready!")
+        logger.info(f"[rank {self.rank}/{self.n_ranks}] ✅ swap space (cpu cache) is ready!")
         self.swap_space_ready_signal.value[self.rank] = 1
 
     def _do_swap_to_cpu_task(
@@ -473,6 +477,10 @@ def clear_or_update_caches(self, args):
         while True:
             if kv_cache_status_signal.value[0] == KVCacheStatus.CLEARING:
                 try:
+                    logger.info(
+                        f"[rank {self.rank}/{self.n_ranks}] Start clearing caches {self.cache_ready_signal.value}"
+                    )
+                    # clear cpu caches
                     if envs.FD_ENABLE_SWAP_SPACE_CLEARING:
                         paddle.set_device("cpu")
                         for ptrs in self.k_dst_ptrs + self.v_dst_ptrs:
@@ -486,37 +494,58 @@ def clear_or_update_caches(self, args):
                         while np.sum(self.swap_space_ready_signal.value) != 0:
                             time.sleep(0.1)
 
+                    # clear gpu caches
                     paddle.set_device(f"gpu:{self.device}")
                     for name, tensor in self.gpu_cache_kvs.items():
                         unset_data_ipc(tensor, name, True, False)
                     self.gpu_cache_kvs.clear()
                     self.gpu_cache_k_tensors.clear()
                     self.gpu_cache_v_tensors.clear()
+
                     # reset cache_ready_signal
                     self.cache_ready_signal.value[self.rank] = 0
-                    if np.sum(self.cache_ready_signal.value) == 0:
+                    logger.info(
+                        f"[rank {self.rank}/{self.n_ranks}] Finish clearing caches {self.cache_ready_signal.value}"
+                    )
+
+                    # wait for all ranks caches to be cleared
+                    if np.sum(self.cache_ready_signal.value) != 0:
                         time.sleep(0.1)
 
+                    # reset kv_cache_status_signal
                     kv_cache_status_signal.value[0] = KVCacheStatus.CLEARED
+                    logger.info("All ranks finish clearing caches")
 
                 except Exception as e:
-                    logger.error(f"Failed to clear caches: {e}")
+                    logger.error(f"[rank {self.rank}/{self.n_ranks}] Failed to clear caches: {e}")
 
             elif kv_cache_status_signal.value[0] == KVCacheStatus.UPDATING:
                 try:
+                    logger.info(
+                        f"[rank {self.rank}/{self.n_ranks}] Start restoring caches {self.cache_ready_signal.value}"
+                    )
+                    # restore cpu cache
                     if envs.FD_ENABLE_SWAP_SPACE_CLEARING:
                         self._init_cpu_cache(args)
                         while np.sum(self.swap_space_ready_signal.value) != args.mp_num:
                             time.sleep(0.1)
 
+                    # restore gpu cache and set cache_ready_signal
                     self._init_gpu_cache(args)
+                    logger.info(
+                        f"[rank {self.rank}/{self.n_ranks}] Finish restoring caches {self.cache_ready_signal.value}"
+                    )
+
+                    # wait for all ranks caches to be ready
                     while np.sum(self.cache_ready_signal.value) != args.mp_num:
                         time.sleep(0.1)
 
+                    # set kv_cache_status_signal
+                    logger.info("All ranks finish restoring caches")
                    kv_cache_status_signal.value[0] = KVCacheStatus.NORMAL
 
                 except Exception as e:
-                    logger.error(f"Failed to restore caches: {e}")
+                    logger.error(f"[rank {self.rank}/{self.n_ranks}] Failed to restore caches: {e}")
 
             time.sleep(0.1)
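Beyond the extra logging, this file's _init_gpu_cache encodes the create-or-attach protocol the logs now make visible: the owning process materializes each cache tensor and exports it by name over IPC, while peers map the same memory. A condensed, hypothetical sketch for one layer's key cache (create_cache_tensor, cache_shape, and key_name are placeholder values; set_data_ipc and share_external_data are the FastDeploy helpers already used in the diff):

import paddle

# Placeholders for illustration only.
create_cache_tensor = True  # owner process; attaching peers pass False
cache_shape = [1024, 8, 64, 128]
key_name = "key_caches_0_rank0.device0"

if create_cache_tensor:
    # Owner: allocate, zero-fill, and export under a well-known name.
    key_cache = paddle.full(shape=cache_shape, fill_value=0, dtype="bfloat16")
    set_data_ipc(key_cache, key_name)
else:
    # Peer: start from an empty tensor, then attach to the owner's memory.
    key_cache = paddle.empty(shape=[], dtype="bfloat16")
    key_cache = share_external_data(key_cache, key_name, cache_shape)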

fastdeploy/rl/dynamic_weight_manager.py

Lines changed: 1 addition & 1 deletion
@@ -115,7 +115,7 @@ def clear_parameters(self, pid: int = 0) -> None:
         self._verify_parameters("clearance")
         if self.parallel_config.tensor_parallel_size > 1:
             paddle.distributed.barrier(self.parallel_config.tp_group)
-        paddle.distributed.shutdown_process_group(self.parallel_config.tp_group)
+            paddle.distributed.shutdown_process_group(self.parallel_config.tp_group)
         if self.parallel_config.enable_expert_parallel:
             paddle.distributed.barrier(self.parallel_config.ep_group)
             paddle.distributed.shutdown_process_group(self.parallel_config.ep_group)
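The change above is indentation only, so it is easy to miss: judging by the symmetric expert-parallel block, paddle.distributed.shutdown_process_group(self.parallel_config.tp_group) moves inside the tensor_parallel_size > 1 guard, keeping the barrier/shutdown pair restricted to the ranks that actually joined the tp_group.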

fastdeploy/worker/gpu_model_runner.py

Lines changed: 4 additions & 4 deletions
@@ -1028,12 +1028,12 @@ def initialize_kv_cache(self, profile: bool = False) -> None:
         create_cache_tensor = profile or self.parallel_config.splitwise_role == "mixed"
 
         if not create_cache_tensor:
-            logger.info("Waiting for cache managers to create kv cache..")
+            logger.info(f"Waiting for cache managers to create kv cache.. {cache_ready_signal.value}")
             while cache_ready_signal.value[self.local_rank] != 1:
                 time.sleep(1)
-            logger.info("OK! Stop waiting.")
+            logger.info(f"OK! Stop waiting. {cache_ready_signal.value}")
 
-        logger.info("Initializing kv cache for all layers.")
+        logger.info(f"Initializing kv cache for all layers. {cache_ready_signal.value}")
         cache_kvs_list = []
         for i in range(self.model_config.num_hidden_layers):
             key_cache_name = f"key_caches_{i}_rank{local_rank}.device{self.device_id}"
@@ -1054,8 +1054,8 @@ def initialize_kv_cache(self, profile: bool = False) -> None:
         self.share_inputs["caches"] = cache_kvs_list
 
         if not profile and create_cache_tensor:
-            logger.info("✅ kv cache is ready!")
             cache_ready_signal.value[self.local_rank] = 1
+            logger.info(f"✅ kv cache is ready! {cache_ready_signal.value}")
 
         paddle.device.cuda.empty_cache()
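The runner mirrors the cache manager's handshake, and the last hunk reorders the ready path so the signal is set before it is logged, making the printed cache_ready_signal.value reflect this rank's update. A simplified sketch of the control flow (free-standing function with assumed parameters, not the actual method):

import time

def initialize_kv_cache(profile: bool, create_cache_tensor: bool,
                        local_rank: int, cache_ready_signal) -> None:
    if not create_cache_tensor:
        # The cache manager owns the tensors: block until it publishes them.
        while cache_ready_signal.value[local_rank] != 1:
            time.sleep(1)
        # ... attach to the shared tensors ...
    else:
        # ... create the tensors and export them over IPC ...
        if not profile:
            cache_ready_signal.value[local_rank] = 1  # publish, then log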
