|
import difflib
from dataclasses import dataclass

from bgym import ExpResult, StepInfo
| 4 | + |
# Prompt template for ChangeSummarizer.make_prompt.  It is filled with
# str.format, so every literal brace in the JSON examples below is doubled
# ("{{" / "}}") — otherwise .format() raises on the example objects.
CHANGE_SUMMARIZER_PROMPT = """
You are a specialized 'change summarizer' model. At a given step in the agent's interaction with the website,
you will receive the following pieces of information:

1. The user's MAIN GOAL (e.g., "Open a GitLab issue with label 'help wanted'").
2. The AGENT'S PREVIOUS OBSERVATION (HTML or AX Tree snippet) or a 'DIFF' that shows what changed since the last step, and the corresponding change summaries.
3. The AGENT'S CURRENT OBSERVATION (HTML or AX Tree snippet).
4. The ACTION the agent just took (e.g., "Clicked the button labeled 'Show report'").
5. (Optionally) The agent's CHAIN OF THOUGHT or short planning notes for this single step, if available.

YOUR TASK (each step):
A) SUMMARIZE THE CHANGE
   - Describe what visibly changed between the previous observation (or diff) and the current observation.
     For example, did a new panel open, did the form reset, did nothing happen, etc.?

B) ASSESS THE ACTION
   - Decide whether the agent's action seems helpful or correct given the user's main goal,
     or if it appears incorrect/unhelpful.
   - Briefly explain why.

OUTPUT FORMAT (per step):
Return your analysis as a JSON-like structure, for example:

{{
  "changeSummary": "A new search results panel appeared on the right side.",
  "actionAssessment": "Correct",
  "explanation": "Clicking 'Search' was appropriate to display the results."
}}

Or for an incorrect action:

{{
  "changeSummary": "The page reloaded but the date fields were reset to defaults.",
  "actionAssessment": "Incorrect",
  "explanation": "The agent should have fixed the date format first instead of re-clicking 'Show report'.",
  "suggestion": "Correct the date format or check for error messages."
}}

Please follow this structure at every step. Keep your responses concise and clear. Below are the details.

Goal: {goal}

LLM Plan: {plan}

Previous Observation: {past_observation}

Current Observation: {current_observation}

Past summaries: {past_summaries}

Action: {action}
"""
| 57 | + |
# Prompt template for EpisodeErrorSummarizer.make_prompt.  Filled with
# str.format, so the literal braces in the "Output Format Example" JSON are
# doubled ("{{" / "}}") — otherwise .format() raises on the example object.
ERROR_CLASSIFICATION_PROMPT = """
You are an expert evaluator that classifies web agent failures according to a predefined taxonomy.
Below are the high-level definitions of each top-level category (Agent Errors, Language Model Errors, and Benchmark/Environment Errors),
followed by an explanation of the inputs you will receive (planning history, chain of thought, etc.),
a set of labeled examples for reference (few-shot), and finally the classification task you must complete.

--------------------------------------------------------------------------------
TAXONOMY DEFINITIONS
--------------------------------------------------------------------------------

1. AGENT ERRORS
These errors arise when agents interact with web interfaces and fail due to limitations in perception, navigation, or manipulation.

   - Navigation & Planning Errors
     The agent cannot construct or execute a correct sequence of actions to reach its goal
     (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).

   - Interaction Execution Errors
     The agent enters data in the wrong format, forgets to click "Submit" after typing,
     repeats the same failing action without adaptation, or loses track of the changing webpage state.

   - Information Processing Errors
     The agent misreads or misinterprets visible data (e.g., extracting the wrong field values),
     misconstrues relationships between pieces of information, or fails to validate data against task requirements.

   - Observation & Action Errors
     The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
     or misaligns its actions (clicks the wrong element or stale link).

2. LANGUAGE MODEL ERRORS
These errors result from the model's inability to correctly interpret or reason about the task at a higher level,
independent of the low-level web interactions.

   - Task Understanding Errors
     The agent misreads or misunderstands the user's objective (goal interpretation),
     loses crucial context (context loss), or performs actions beyond or short of the intended scope.

   - Reasoning Failures
     The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps,
     or fails to prioritize important subtasks when handling complex goals.

3. BENCHMARK & ENVIRONMENT ERRORS
These errors are external to the agent's logic and the language model's reasoning,
arising from flaws in the system, network, or evaluation framework itself.

   - System Errors
     Network failures, API downtime, or dynamic web changes that break the agent's assumptions (e.g., layout shifts).

   - Benchmark Design Errors
     Ambiguous or contradictory task specifications, incorrect validation criteria (where correct solutions are flagged as failures),
     or inflexible evaluation systems that fail to account for valid alternative solutions.

--------------------------------------------------------------------------------
INPUT DESCRIPTION
--------------------------------------------------------------------------------

You will receive the following for each scenario:
1. User Goal
   - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").

2. Planning / Thought History
   - The internal reasoning or plan the agent considered. May include branches of logic or key decision points.

3. Current Observation (HTML / AX Tree Snippet)
   - The webpage structure or state that the agent sees at a given point in time.

4. Historical change summaries
   - A list of summaries of changes in the observation that the agent has seen during the course of actions.

5. Action History
   - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.)
     along with immediate outcomes or errors.

Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories.

--------------------------------------------------------------------------------
FEW-SHOT CLASSIFICATION EXAMPLES
--------------------------------------------------------------------------------

1) EXAMPLE A (Benchmark Error - Benchmark Design Error)
   • Context: The agent correctly finds a cheaper product meeting the user's criteria,
     but the benchmark expects a more expensive product and marks the solution as wrong.
   • Classification: ["Benchmark Design Error"]
   • Justification: The agent's solution is objectively valid, but the evaluation framework is too rigid
     and does not allow an alternative correct solution.

2) EXAMPLE B (Agent Error - Interaction Execution)
   • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format.
     Each time, the site resets to default dates. The agent never notices and keeps doing the same thing.
   • Classification: ["Agent Error - Interaction Execution"]
   • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action
     without adaptation ("Action Repetition").

3) EXAMPLE C (Benchmark Error - Benchmark Design Error)
   • Context: The user asks, "Where is the nearest In-N-Out to Upitts?"
     The query is ambiguous because "Upitts" is not a standard location.
     The agent flounders, eventually returning "No In-N-Out found," which is incorrect for the region.
   • Classification: ["Benchmark Design Error"]
   • Justification: The task goal is poorly specified ("Upitts" is ambiguous or unrealistic),
     leading the agent astray due to unclear context.

4) EXAMPLE D (Language Model Error - Task Understanding)
   • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted'
     that are older than 30 days and add a comment saying 'I can help fix this.'"
     The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue
     with label 'help wanted,' ignoring the user's actual request to find and comment on old issues.
   • Classification: ["Language Model Error - Task Understanding"]
   • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues,
     it focused on creating a new issue. This is a misinterpretation of the instructions,
     not a mechanical error in clicking or input format.

--------------------------------------------------------------------------------
CLASSIFICATION TASK
--------------------------------------------------------------------------------

1. Read through:
   - The planning and thought history
   - The action history
   - The current HTML or AX Tree observation
   - The user goal

2. Decide if the failure is:
   - An Agent Error (which subcategory/subcategories),
   - A Language Model Error (which subcategory/subcategories),
   - A Benchmark/Environment Error (which subcategory/subcategories),
   - Or a combination thereof (multi-label if needed).

3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.

4. If the agent succeeds (no error), label the errorCategory accordingly as "Success".

Output Format Example:
{{
  "errorCategory": ["Agent Error - Navigation & Planning"],
  "explanation": "The agent opened the wrong GitLab page and never recovered..."
}}

Please follow this structure at every step. Keep your responses concise and clear. Below are the details.

Overall goal: {goal}

LLM Plan and thought history: {plan}

Current Observation: {current_observation}

Historical change summaries: {historical_summaries}

Action history: {action_history}
"""
| 207 | + |
| 208 | + |
| 209 | +def _diff(past_obs, current_obs): |
| 210 | + """TODO: Implement the diff function. |
| 211 | +
|
| 212 | + Returns a diff version of current_obs compares to past_obs, unless there is too many changes. |
| 213 | + """ |
| 214 | + raise ValueError("Not implemented yet.") |
| 215 | + |
| 216 | + |
@dataclass
class ChangeSummarizer:
    """Summarizes the effect of a single agent action with an LLM.

    Compares the observation before and after an action (optionally as a
    diff) and asks the LLM for a structured change summary and action
    assessment, following CHANGE_SUMMARIZER_PROMPT.
    """

    llm: callable  # language model: prompt string -> response string
    obs_formatter: callable  # raw observation dict -> prompt-ready string
    use_diff: bool = False  # send a diff instead of the full current observation

    def summarize(
        self, past_obs: dict, action: str, current_obs: dict, past_summaries: list[str]
    ) -> str:
        """Produce a summary of the effect of `action` between two observations."""
        past_obs_message = self.obs_formatter(past_obs)
        current_obs_message = self.obs_formatter(current_obs)

        # Goal and plan are read straight off the observation dict (agentlab
        # convention); TODO: outsource this extraction to the formatter too.
        goal = past_obs["goal"]
        plan = past_obs["plan"]
        if self.use_diff:
            current_obs_message = _diff(past_obs_message, current_obs_message)

        return self.llm(
            self.make_prompt(
                past_obs_message, action, current_obs_message, past_summaries, goal, plan
            )
        )

    def make_prompt(
        self, past_obs_message, action, current_obs_message, past_summaries, goal, plan
    ):
        """Fill CHANGE_SUMMARIZER_PROMPT with this step's details."""
        return CHANGE_SUMMARIZER_PROMPT.format(
            goal=goal,
            plan=plan,
            past_observation=past_obs_message,
            current_observation=current_obs_message,
            past_summaries=past_summaries,
            action=action,
        )
| 255 | + |
| 256 | + |
@dataclass
class EpisodeAnalysis:
    """Structured result of analyzing one full episode of agent interaction."""

    analysis: str  # complete analysis of the episode
    summary: str  # short summary of the analysis
    categories: dict[str, float]  # score for each category e.g. type of error or difficulty levels
| 262 | + |
| 263 | + |
@dataclass
class EpisodeSummarizer:
    """Base class for turning an episode's step results into an EpisodeAnalysis."""

    change_summarizer: ChangeSummarizer = None  # optional per-step summarizer

    def summarize(
        self, exp_results: list[ExpResult], change_summaries: list[str]
    ) -> EpisodeAnalysis:
        """Run the ChangeSummarizer for every step in the episode or extract a
        pre-computed one.

        Subclasses must override this method; the base class provides no
        implementation.
        """
        # Fix: the original signature was missing `self`, so any instance
        # call `summarizer.summarize(results, summaries)` misbound arguments.
        # Raising instead of silently returning None makes the stub explicit.
        raise NotImplementedError
| 272 | + |
| 273 | + |
@dataclass
class EpisodeErrorSummarizer(EpisodeSummarizer):
    """Episode summarizer that classifies agent failures against the error
    taxonomy defined in ERROR_CLASSIFICATION_PROMPT."""

    change_summarizer: ChangeSummarizer = None  # same default as the parent, restated for clarity

    def make_prompt(self, current_observation, action_history, historical_summaries, goal, plan):
        """Fill ERROR_CLASSIFICATION_PROMPT with the episode's details."""
        return ERROR_CLASSIFICATION_PROMPT.format(
            goal=goal,
            plan=plan,
            current_observation=current_observation,
            historical_summaries=historical_summaries,
            action_history=action_history,
        )
0 commit comments