Parse error-analysis summarizer output as XML-style tags instead of JSON

TLSDC · TLSDC · commit 6163b47a7caf · 2025-01-29T19:30:10.000Z
diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py
@@ -69,15 +69,15 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T
 
     from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
 
-    llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"].make_model()
+    llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-2024-08-06"].make_model()
 
     step_summarizer = ChangeSummarizer(llm, lambda x: x)
     episode_summarizer = EpisodeSummarizer()
 
     pipeline = ErrorAnalysisPipeline(
         exp_dir=exp_dir,
         filter=filter,
-        episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm, HTML_FORMATTER), llm),
+        episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm, AXTREE_FORMATTER), llm),
     )
 
     pipeline.run_analysis()
diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py
@@ -7,7 +7,7 @@
     ERROR_CLASSIFICATION_PROMPT,
 )
 from agentlab.analyze.inspect_results import summarize
-from agentlab.llm.llm_utils import json_parser
+from agentlab.llm.llm_utils import json_parser, parse_html_tags
 
 
 def _diff(past_obs, current_obs):
@@ -39,7 +39,7 @@ def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str]
         if self.use_diff:
             next_obs_message = _diff(obs_message, next_obs_message)
 
-        return self.llm(
+        return self.parse(self.llm(
             self.make_prompt(
                 obs_message,
                 action,
@@ -48,7 +48,7 @@ def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str]
                 goal,
                 obs.obs.get("plan", "No plan available"),
             )
-        )
+        )['content'])
 
     def make_prompt(
         self, past_obs_message, action, current_obs_message, past_summaries, goal, plan
@@ -63,6 +63,10 @@ def make_prompt(
             action=action,
         )
 
+    def parse(self, raw_output: str) -> dict:
+        tags = ["changeSummary", "actionAssessment", "explanation", "suggestion"]  # per-step fields
+        return parse_html_tags(raw_output, keys=tags)[0]  # keep only element 0 of the helper's result
+
 
 @dataclass
 class EpisodeAnalysis:
@@ -83,13 +87,13 @@ def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ...
     def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis:
         """Run Change Summarizer for every step in the episode or extract a pre-computed one."""
 
-        if exp_results.steps_info[-1].reward == 1:
-            return {"analysis": "Success", "summaries": {}}
+        # if exp_results.steps_info[-1].reward == 1:
+        #     return {"analysis": "Success", "summaries": {}}
 
         summaries = self.make_change_summaries(exp_results)
         prompt = self.make_prompt(exp_results, summaries)
         raw_analysis = self.llm(prompt)["content"]
-        analysis = self.parser(raw_analysis)
+        analysis = self.parse(raw_analysis)
         return {
             "analysis": analysis,
             "summaries": {i: self.parser(a) for i, a in enumerate(summaries)},
@@ -102,10 +106,13 @@ def make_change_summaries(self, exp_result: ExpResult) -> list[str]:
         # TODO:(thibault) make some checks or w/e
         for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]):
             summaries.append(
-                self.change_summarizer.summarize(step, next_step, summaries)["content"]
+                self.change_summarizer.summarize(step, next_step, summaries)
             )
         return summaries
 
+    def parse(self, raw_output: str) -> dict:
+        tags = ["explanation", "success", "errorCategory"]  # episode-level fields
+        return parse_html_tags(raw_output, keys=tags)[0]  # keep only element 0 of the helper's result
 
 @dataclass
 class EpisodeErrorSummarizer(EpisodeSummarizer):
@@ -116,7 +123,13 @@ def make_prompt(self, exp_results: ExpResult, summaries: list[str]):
         """TODO: Implement the prompt."""
         goal = exp_results.steps_info[0].obs["goal"]
 
-        txt_summaries = "\n".join(summaries)
+        def format_summary(summary):
+            """Render one parsed step summary as newline-terminated 'key: value' lines."""
+            # Every entry keeps its own trailing newline; an empty mapping yields ''.
+            lines = (f"{name}: {text}\n" for name, text in summary.items())
+            return "".join(lines)
+
+        txt_summaries = "\n".join([format_summary(summary) for summary in summaries])
 
         thoughts = [step.agent_info.think for step in exp_results.steps_info[:-1]]
         actions = [step.action for step in exp_results.steps_info[:-1]]
diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
@@ -21,20 +21,19 @@
 OUTPUT FORMAT (per step):
 Return your analysis as a JSON-like structure, for example:
 
-{{
-  "changeSummary": "A new search results panel appeared on the right side.",
-  "actionAssessment": "Correct",
-  "explanation": "Clicking 'Search' was appropriate to display the results."
-}}
+<changeSummary>A new search results panel appeared on the right side.</changeSummary>
+<actionAssessment>Correct</actionAssessment>
+<explanation>Clicking 'Search' was appropriate to display the results.</explanation>
 
 Or for an incorrect action:
 
-{{
-  "changeSummary": "The page reloaded but the date fields were reset to defaults.",
-  "actionAssessment": "Incorrect",
-  "explanation": "The agent should have fixed the date format first instead of re-clicking 'Show report'.",
-  "suggestion": "Correct the date format or check for error messages."
-}}
+<changeSummary>The page reloaded but the date fields were reset to defaults.</changeSummary>
+<actionAssessment>Incorrect</actionAssessment>
+<explanation>The agent should have fixed the date format first instead of re-clicking 'Show report'.</explanation>
+<suggestion>Correct the date format or check for error messages.</suggestion>
+
+
+Please use single quotes '' to quote elements from the page, so as not to create parsing issues.
 
 Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
 
@@ -139,19 +138,17 @@
 3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
 
 Output format example for an unsuccessful interaction:
-{{
-  "explanation": "The agent opened the wrong GitLab page and never recovered...",
-  "success": False,
-  "errorCategory": ["Navigation & Planning"],
-}}
+
+<explanation>The agent opened the wrong GitLab page and never recovered...</explanation>
+<success>False</success>
+<errorCategory>["Navigation & Planning"]</errorCategory>
 
 Output format example for a successful interaction:
-{{
-  "explanation": "The agent opened the correct GitLab page and ...",
-  "success": True,
-  "errorCategory": [],
-}}
 
+<explanation>The agent opened the correct GitLab page and ...</explanation>
+<success>True</success>
+<errorCategory>[]</errorCategory>
+  
 Please follow this structure at every step. Keep your responses concise and clear. 
 
 Below are the details for the interaction.