Skip to content

Commit 33661e7

Browse files
authored
Merge pull request #425 from kywch/stop-train
Adaptive early-stop thresholding in sweep
2 parents 8f88d9f + 12b284d commit 33661e7

5 files changed

Lines changed: 258 additions & 39 deletions

File tree

pufferlib/config/default.ini

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,13 @@ prio_beta0 = 0.2
6363
[sweep]
6464
method = Protein
6565
metric = score
66+
metric_distribution = linear
6667
goal = maximize
6768
max_suggestion_cost = 3600
6869
downsample = 5
6970
use_gpu = True
7071
prune_pareto = True
72+
early_stop_quantile = 0.3
7173

7274
#[sweep.vec.num_envs]
7375
#distribution = uniform_pow2
@@ -100,6 +102,12 @@ min = 0.00001
100102
max = 0.1
101103
scale = 0.5
102104

105+
[sweep.train.min_lr_ratio]
106+
distribution = uniform
107+
min = 0.0
108+
max = 0.5
109+
scale = auto
110+
103111
[sweep.train.ent_coef]
104112
distribution = log_normal
105113
min = 0.00001

pufferlib/config/ocean/breakout.ini

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ vf_coef = 1.6832989594296321
5353
vtrace_c_clip = 2.878171091654008
5454
vtrace_rho_clip = 0.7876748061547312
5555

56+
[sweep]
57+
5658
[sweep.train.total_timesteps]
5759
distribution = log_normal
5860
min = 3e7

pufferlib/config/ocean/tower_climb.ini

Lines changed: 35 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,53 +5,72 @@ policy_name = TowerClimb
55
rnn_name = TowerClimbLSTM
66

77
[vec]
8-
num_envs = 8
8+
num_envs = 4
99

1010
[env]
1111
num_envs = 1024
12-
num_maps = 50
13-
reward_climb_row = 0.636873185634613
14-
reward_fall_row = -0.15898257493972778
15-
reward_illegal_move = -0.003928301855921745
16-
reward_move_block = 0.235064297914505
12+
num_maps = 200
13+
reward_climb_row = 0.27
14+
reward_fall_row = 0
15+
reward_illegal_move = 0
16+
reward_move_block = 0.18
1717

1818
[train]
19-
total_timesteps = 150_000_000
20-
#gamma = 0.98
21-
#learning_rate = 0.05
22-
minibatch_size = 32768
19+
# https://wandb.ai/kywch/pufferlib/runs/8r3l9l1h?nw=nwuserkywch
20+
total_timesteps = 30_000_000
21+
anneal_lr = True
22+
batch_size = auto
23+
bptt_horizon = 64
24+
minibatch_size = 65536
25+
26+
clip_coef = 1.0
27+
ent_coef = 0.2
28+
gae_lambda = 0.96
29+
gamma = 0.92
30+
vf_clip_coef = 0.1
31+
vf_coef = 0.34
32+
33+
learning_rate = 0.029
34+
max_grad_norm = 0.8
35+
36+
adam_beta1 = 0.89
37+
adam_beta2 = 0.999
38+
adam_eps = 2e-11
39+
prio_alpha = 0.86
40+
prio_beta0 = 0.30
41+
vtrace_c_clip = 0.92
42+
vtrace_rho_clip = 1.44
43+
44+
[sweep]
45+
metric = perf
46+
metric_distribution = percentile
2347

2448
[sweep.train.total_timesteps]
2549
distribution = uniform
26-
min = 50_000_000
50+
min = 10_000_000
2751
max = 200_000_000
28-
mean = 100_000_000
2952
scale = 0.5
3053

3154
[sweep.env.reward_climb_row]
3255
distribution = uniform
3356
min = 0.0
3457
max = 1.0
35-
mean = 0.5
3658
scale = auto
3759

3860
[sweep.env.reward_fall_row]
3961
distribution = uniform
4062
min = -1.0
4163
max = 0.0
42-
mean = -0.5
4364
scale = auto
4465

4566
[sweep.env.reward_illegal_move]
4667
distribution = uniform
4768
min = -1e-2
4869
max = -1e-4
49-
mean = -1e-3
5070
scale = auto
5171

5272
[sweep.env.reward_move_block]
5373
distribution = uniform
5474
min = 0.0
5575
max = 1.0
56-
mean = 0.5
5776
scale = auto

pufferlib/pufferl.py

Lines changed: 47 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -828,7 +828,7 @@ def __init__(self, args):
828828
def log(self, logs, step):
829829
pass
830830

831-
def close(self, model_path):
831+
def close(self, model_path, early_stop):
832832
pass
833833

834834
class NeptuneLogger:
@@ -859,7 +859,8 @@ def log(self, logs, step):
859859
def upload_model(self, model_path):
860860
self.neptune['model'].track_files(model_path)
861861

862-
def close(self, model_path):
862+
def close(self, model_path, early_stop):
863+
self.neptune['early_stop'] = early_stop
863864
if self.should_upload_model:
864865
self.upload_model(model_path)
865866
self.neptune.stop()
@@ -894,7 +895,8 @@ def upload_model(self, model_path):
894895
artifact.add_file(model_path)
895896
self.wandb.run.log_artifact(artifact)
896897

897-
def close(self, model_path):
898+
def close(self, model_path, early_stop):
899+
self.wandb.run.summary['early_stop'] = early_stop
898900
if self.should_upload_model:
899901
self.upload_model(model_path)
900902
self.wandb.finish()
@@ -905,7 +907,7 @@ def download(self):
905907
model_file = max(os.listdir(data_dir))
906908
return f'{data_dir}/{model_file}'
907909

908-
def train(env_name, args=None, vecenv=None, policy=None, logger=None, should_stop_early=None):
910+
def train(env_name, args=None, vecenv=None, policy=None, logger=None, early_stop_fn=None):
909911
args = args or load_config(env_name)
910912

911913
# Assume TorchRun DDP is used if LOCAL_RANK is set
@@ -944,7 +946,10 @@ def train(env_name, args=None, vecenv=None, policy=None, logger=None, should_sto
944946
train_config = { **args['train'], 'env': env_name }
945947
pufferl = PuffeRL(train_config, vecenv, policy, logger)
946948

949+
# Sweep needs data even from early-stopped runs, so start collecting logs once
# steps exceed min(20% of total_timesteps, 100M)
950+
logging_threshold = min(0.20*train_config['total_timesteps'], 100_000_000)
947951
all_logs = []
952+
948953
while pufferl.global_step < train_config['total_timesteps']:
949954
if train_config['device'] == 'cuda':
950955
torch.compiler.cudagraph_mark_step_begin()
@@ -954,12 +959,19 @@ def train(env_name, args=None, vecenv=None, policy=None, logger=None, should_sto
954959
logs = pufferl.train()
955960

956961
if logs is not None:
957-
if pufferl.global_step > 0.20*train_config['total_timesteps']:
962+
should_stop_early = False
963+
if early_stop_fn is not None:
964+
should_stop_early = early_stop_fn(logs)
965+
# This is hacky, but we need to check whether the threshold looks reasonable
966+
if 'early_stop_threshold' in logs:
967+
pufferl.logger.log({'environment/early_stop_threshold': logs['early_stop_threshold']}, logs['agent_steps'])
968+
969+
if pufferl.global_step > logging_threshold:
958970
all_logs.append(logs)
959971

960-
if should_stop_early is not None and should_stop_early(logs):
972+
if should_stop_early:
961973
model_path = pufferl.close()
962-
pufferl.logger.close(model_path)
974+
pufferl.logger.close(model_path, early_stop=True)
963975
return all_logs
964976

965977
# Final eval. You can reset the env here, but depending on
@@ -976,7 +988,7 @@ def train(env_name, args=None, vecenv=None, policy=None, logger=None, should_sto
976988

977989
pufferl.print_dashboard()
978990
model_path = pufferl.close()
979-
pufferl.logger.close(model_path)
991+
pufferl.logger.close(model_path, early_stop=False)
980992
return all_logs
981993

982994
def eval(env_name, args=None, vecenv=None, policy=None):
@@ -1053,6 +1065,30 @@ def sweep(args=None, env_name=None):
10531065
sweep = sweep_cls(args['sweep'])
10541066
points_per_run = args['sweep']['downsample']
10551067
target_key = f'environment/{args["sweep"]["metric"]}'
1068+
running_target_buffer = deque(maxlen=30)
1069+
1070+
def stop_if_perf_below(logs):
1071+
if stop_if_loss_nan(logs):
1072+
logs['is_loss_nan'] = True
1073+
return True
1074+
1075+
if method != 'Protein':
1076+
return False
1077+
1078+
if ('uptime' in logs and target_key in logs):
1079+
metric_val, cost = logs[target_key], logs['uptime']
1080+
running_target_buffer.append(metric_val)
1081+
target_running_mean = np.mean(running_target_buffer)
1082+
1083+
# If the metric distribution is percentile, the threshold is also logit-transformed
1084+
threshold = sweep.get_early_stop_threshold(cost)
1085+
logs['early_stop_threshold'] = max(threshold, -5) # clipping for visualization
1086+
1087+
if sweep.should_stop(max(target_running_mean, metric_val), cost):
1088+
logs['is_loss_nan'] = False
1089+
return True
1090+
return False
1091+
10561092
for i in range(args['max_runs']):
10571093
seed = time.time_ns() & 0xFFFFFFFF
10581094
random.seed(seed)
@@ -1063,7 +1099,7 @@ def sweep(args=None, env_name=None):
10631099
if i > 0:
10641100
sweep.suggest(args)
10651101

1066-
all_logs = train(env_name, args=args, should_stop_early=stop_if_loss_nan)
1102+
all_logs = train(env_name, args=args, early_stop_fn=stop_if_perf_below)
10671103
all_logs = [e for e in all_logs if target_key in e]
10681104

10691105
if not all_logs:
@@ -1076,7 +1112,8 @@ def sweep(args=None, env_name=None):
10761112
costs = downsample([log['uptime'] for log in all_logs], points_per_run)
10771113
timesteps = downsample([log['agent_steps'] for log in all_logs], points_per_run)
10781114

1079-
if len(timesteps) > 0 and timesteps[-1] < 0.7 * total_timesteps: # 0.7 is arbitrary
1115+
is_final_loss_nan = all_logs[-1].get('is_loss_nan', False)
1116+
if is_final_loss_nan:
10801117
s = scores.pop()
10811118
c = costs.pop()
10821119
args['train']['total_timesteps'] = timesteps.pop()

0 commit comments

Comments
 (0)