ServiceNow
diff --git a/‎src/agentlab/agents/generic_agent/agent_configs.py‎
Lines changed: 1 addition & 1 deletion b/‎src/agentlab/agents/generic_agent/agent_configs.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/agentlab/analyze/error_analysis/__init__.py‎ b/‎src/agentlab/analyze/error_analysis/__init__.py‎
diff --git a/‎src/agentlab/analyze/error_analysis/pipeline.py‎
Lines changed: 74 additions & 0 deletions b/‎src/agentlab/analyze/error_analysis/pipeline.py‎
Lines changed: 74 additions & 0 deletions
diff --git a/‎src/agentlab/analyze/error_analysis.py‎ ‎…lab/analyze/error_analysis/summarizer.py‎src/agentlab/analyze/error_analysis.py renamed to src/agentlab/analyze/error_analysis/summarizer.py
Lines changed: 2 additions & 1 deletion b/‎src/agentlab/analyze/error_analysis.py‎ ‎…lab/analyze/error_analysis/summarizer.py‎src/agentlab/analyze/error_analysis.py renamed to src/agentlab/analyze/error_analysis/summarizer.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎tests/analyze/error_analysis/test_pipeline.py‎
Lines changed: 85 additions & 0 deletions b/‎tests/analyze/error_analysis/test_pipeline.py‎
Lines changed: 85 additions & 0 deletions
diff --git a/‎tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/exp_args.pkl‎
2.23 KB b/‎tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/exp_args.pkl‎
2.23 KB
diff --git a/‎tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/goal_object.pkl.gz‎
102 Bytes b/‎tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/goal_object.pkl.gz‎
102 Bytes
@@ -257,7 +257,7 @@
 )
 
 AGENT_4o_MINI = GenericAgentArgs(
-    chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4o-mini-2024-07-18"],
+    chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"],
     flags=FLAGS_GPT_4o,
 )
 
 
@@ -0,0 +1,74 @@
+import json
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Generator
+
+from bgym import ExpResult
+
+from agentlab.analyze.inspect_results import yield_all_exp_results
+
+from .summarizer import ChangeSummarizer, EpisodeSummarizer
+
+
+@dataclass
+class Analyzer:
+    # Placeholder error analyzer: calling an instance ignores every argument
+    # and returns the constant string "analysis".
+    prompt: str  # stored on the instance; not read anywhere in this class
+    # NOTE(review): `llm` carries no type annotation, so @dataclass does not
+    # register it as a field (it is not an __init__ argument); it stays a plain
+    # class attribute. Confirm whether that is intended.
+    llm = None
+
+    def __call__(self, *args, **kwds):
+        # Positional and keyword arguments are accepted but unused.
+        return "analysis"
+
+
+@dataclass
+class ErrorAnalysisPipeline:
+    exp_dir: Path
+    filter: str = None
+    step_summarizer: ChangeSummarizer = None
+    episode_summarizer: EpisodeSummarizer = None
+    analyzer: Analyzer = None
+
+    def filter_exp_results(self) -> Generator[ExpResult, None, None]:
+        # TODO:(thibault) improve filtering
+        exp_results = yield_all_exp_results(self.exp_dir)
+        for exp_result in exp_results:
+            if self.filter is None or self.filter in str(exp_result.exp_dir):
+                yield exp_result
+
+    def run_analysis(self):
+        filtered_results = self.filter_exp_results()
+
+        for exp_result in filtered_results:
+            step_analysis = self.analyze_step(exp_result)
+            episode_analysis = self.analyze_episode(exp_result, step_analysis)
+            error_analysis = self.analyze_errors(exp_result, episode_analysis, step_analysis)
+            self.save_analysis(exp_result, error_analysis)
+
+    def analyze_step(self, exp_result: ExpResult) -> list[str]:
+        step_summaries = []  # type: list[str]
+        # this assumes that there is always an extra step at the end of the episode
+        # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info
+        # TODO:(thibault) make some checks
+        for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]):
+            step_summaries.append(
+                self.step_summarizer.summarize(step, step.action, next_step, step_summaries)
+            )
+        return step_summaries
+
+    def analyze_episode(self, exp_result: ExpResult, step_analysis: list[str]) -> str:
+        episode_summary = self.episode_summarizer.summarize(exp_result, step_analysis)
+        return episode_summary
+
+    def analyze_errors(
+        self, exp_result: ExpResult, episode_analysis: str, step_analysis: list[str]
+    ) -> str:
+        error_analysis = self.analyzer(exp_result, episode_analysis, step_analysis)
+        return error_analysis
+
+    def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=True):
+        """Save the analysis to json"""
+        analysis_path = exp_result.exp_dir / "error_analysis.json"
+        if not exists_ok and analysis_path.exists():
+            raise FileExistsError(f"{analysis_path} already exists")
+        with analysis_path.open("w") as f:
+            json.dump(error_analysis, f)
@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+
 from bgym import StepInfo
 
 CHANGE_SUMMARIZER_PROMPT = """
@@ -227,7 +228,7 @@ def summarize(
         past_obs_message = self.obs_formatter(past_obs)
         current_obs_message = self.obs_formatter(current_obs)
 
-        goal = past_obs["goal"]    # Use goal object from agentlab
+        goal = past_obs["goal"]  # Use goal object from agentlab
         # Outsource everything to formatter
         plan = past_obs["plan"]
         if self.use_diff:
 
@@ -0,0 +1,85 @@
+from pathlib import Path
+
+import pytest
+from bgym import ExpResult, StepInfo
+
+from agentlab.analyze.error_analysis.pipeline import ErrorAnalysisPipeline
+
+# Test data directory: "data/error_analysis" under the third ancestor directory of this file.
+exp_dir = Path(__file__).parent.parent.parent / "data/error_analysis"
+
+
+class MockStepSummarizer:
+    def summarize(
+        self, step: StepInfo, action: str, next_step: StepInfo, step_summaries: list[str]
+    ) -> str:
+        return f"Agent took action {action} at step {len(step_summaries)}"
+
+
+class MockEpisodeSummarizer:
+    def summarize(self, exp_result: ExpResult, step_analysis: list[str]) -> str:
+        return f"Agent did actions {', '.join(step.action for step in exp_result.steps_info if step.action)}"
+
+
+class MockAnalyzer:
+    def __call__(
+        self, exp_result: ExpResult, episode_analysis: str, step_analysis: list[str]
+    ) -> str:
+        return {"error": "analysis", "episode": episode_analysis}
+
+
+@pytest.fixture(scope="module")
+def pipeline() -> ErrorAnalysisPipeline:
+    return ErrorAnalysisPipeline(
+        exp_dir=exp_dir,
+        filter=None,
+        episode_summarizer=MockEpisodeSummarizer(),
+        step_summarizer=MockStepSummarizer(),
+        analyzer=MockAnalyzer(),
+    )
+
+
+def test_yield_no_filter(pipeline: ErrorAnalysisPipeline):
+    assert len(list(pipeline.filter_exp_results())) == 4
+
+
+def test_yield_with_filter(pipeline: ErrorAnalysisPipeline):
+    pattern = "click-dialog"
+    pipeline.filter = pattern
+    assert len(list(pipeline.filter_exp_results())) == 2
+    pipeline.filter = None
+
+
+def test_analyze_step(pipeline: ErrorAnalysisPipeline):
+    exp_result = next(pipeline.filter_exp_results())
+    step_analysis = pipeline.analyze_step(exp_result)
+
+    assert len(exp_result.steps_info) == len(step_analysis) + 1
+    assert step_analysis[0] == f"Agent took action {exp_result.steps_info[0].action} at step 0"
+
+
+def test_analyze_episode(pipeline: ErrorAnalysisPipeline):
+    exp_result = next(pipeline.filter_exp_results())
+    step_analysis = pipeline.analyze_step(exp_result)
+    episode_analysis = pipeline.analyze_episode(exp_result, step_analysis)
+
+    for step_info in exp_result.steps_info:
+        if step_info.action:
+            assert step_info.action in episode_analysis
+
+
+def test_save_analysis(pipeline: ErrorAnalysisPipeline):
+    exp_result = next(pipeline.filter_exp_results())
+    step_analysis = pipeline.analyze_step(exp_result)
+    episode_analysis = pipeline.analyze_episode(exp_result, step_analysis)
+    error_analysis = pipeline.analyze_errors(exp_result, episode_analysis, step_analysis)
+
+    pipeline.save_analysis(exp_result, error_analysis, exists_ok=False)
+
+    assert (exp_result.exp_dir / "error_analysis.json").exists()
+
+    # remove the file
+    (exp_result.exp_dir / "error_analysis.json").unlink()
+
+
+if __name__ == "__main__":
+    test_yield_with_filter()
Original file line number	Diff line number	Diff line change
`@@ -257,7 +257,7 @@`
`257`	`257`	`)`
`258`	`258`
`259`	`259`	`AGENT_4o_MINI = GenericAgentArgs(`
`260`		`- chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4o-mini-2024-07-18"],`
	`260`	`+ chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"],`
`261`	`261`	`flags=FLAGS_GPT_4o,`
`262`	`262`	`)`
`263`	`263`