@@ -2,10 +2,9 @@
 import json
 import logging
 import os
-import tempfile
 from typing import Annotated
 
-from agent_framework import ChatAgent, tool
+from agent_framework import Agent, tool
 from agent_framework.openai import OpenAIChatClient
 from azure.ai.evaluation import (
     AzureOpenAIModelConfiguration,
@@ -14,7 +13,6 @@
     ResponseCompletenessEvaluator,
     TaskAdherenceEvaluator,
     ToolCallAccuracyEvaluator,
-    evaluate,
 )
 from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
 from dotenv import load_dotenv
@@ -68,10 +66,6 @@
     model=os.environ.get("OPENAI_MODEL", "gpt-5-mini"),
 )
 
-# Optional: Set AZURE_AI_PROJECT in .env to log results to Azure AI Foundry.
-# Example: https://your-account.services.ai.azure.com/api/projects/your-project
-AZURE_AI_PROJECT = os.getenv("AZURE_AI_PROJECT")
-
 
 @tool
 def get_weather(
@@ -183,9 +177,8 @@ def estimate_budget(
     "within the user's budget. Include weather information to help with packing."
 )
 
-agent = ChatAgent(
-    name="travel-planner",
-    chat_client=client,
+agent = Agent(
+    client=client,
     instructions=AGENT_INSTRUCTIONS,
     tools=tools,
 )
@@ -269,7 +262,7 @@ def display_evaluation_results(results: dict[str, dict]) -> None:
 
 
 async def main():
-    query = "Plan a 3-day trip from New York to Tokyo next month on a $2000 budget. I like hiking and museums."
+    query = "Plan a 3-day trip from New York (JFK) to Tokyo, departing March 15 and returning March 18, 2026. My budget is $2000 total. I like hiking and museums. Please search for flights, hotels under $150/night, check the weather, and suggest activities."
 
     logger.info("Running travel planner agent...")
     response = await agent.run(query)
@@ -298,94 +291,43 @@ async def main():
         "ToolCallAccuracy": "tool_call_accuracy",
     }
 
-    if AZURE_AI_PROJECT:
-        logger.info(f"Logging evaluation results to Azure AI project: {AZURE_AI_PROJECT}")
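+    # With the Azure AI project branch removed, each evaluator is created once
+    # and called directly in-process rather than through the batch evaluate() API.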
+    intent_evaluator = IntentResolutionEvaluator(**evaluator_kwargs)
+    completeness_evaluator = ResponseCompletenessEvaluator(**evaluator_kwargs)
+    adherence_evaluator = TaskAdherenceEvaluator(**evaluator_kwargs)
+    tool_accuracy_evaluator = ToolCallAccuracyEvaluator(**evaluator_kwargs)
 
-        eval_data_row = {
-            "query": eval_query,
-            "response": eval_response,
-            "response_text": response.text,
-            "ground_truth": ground_truth,
-            "tool_definitions": tool_definitions,
-        }
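+    # ResponseCompletenessEvaluator scores the plain response text against the
+    # ground truth; the other three evaluators take the full message list plus
+    # the tool definitions.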
+    intent_result = intent_evaluator(query=eval_query, response=eval_response, tool_definitions=tool_definitions)
+    completeness_result = completeness_evaluator(response=response.text, ground_truth=ground_truth)
+    adherence_result = adherence_evaluator(
+        query=eval_query, response=eval_response, tool_definitions=tool_definitions
+    )
+    tool_accuracy_result = tool_accuracy_evaluator(
+        query=eval_query, response=eval_response, tool_definitions=tool_definitions
+    )
 
-        with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False, encoding="utf-8") as f:
-            f.write(json.dumps(eval_data_row) + "\n")
-            eval_data_file = f.name
-
-        try:
-            eval_result = evaluate(
-                data=eval_data_file,
-                evaluation_name="travel-planner-agent-eval",
-                evaluators={
-                    "intent_resolution": IntentResolutionEvaluator(**evaluator_kwargs),
-                    "response_completeness": ResponseCompletenessEvaluator(**evaluator_kwargs),
-                    "task_adherence": TaskAdherenceEvaluator(**evaluator_kwargs),
-                    "tool_call_accuracy": ToolCallAccuracyEvaluator(**evaluator_kwargs),
-                },
-                # ResponseCompletenessEvaluator expects a plain text response, not a message list,
-                # so we override its column mapping to use response_text and ground_truth.
-                # Other evaluators auto-map correctly since data keys match param names.
-                evaluator_config={
-                    "response_completeness": {
-                        "column_mapping": {
-                            "response": "${data.response_text}",
-                            "ground_truth": "${data.ground_truth}",
-                        }
-                    },
-                },
-                azure_ai_project=AZURE_AI_PROJECT,
-            )
-
-            # Parse results from the batch evaluate() output
-            evaluation_results = {}
-            rows = eval_result.get("rows", [])
-            row = rows[0] if rows else {}
-
-            for display_name, key in result_keys.items():
-                evaluation_results[display_name] = {
-                    "score": row.get(f"outputs.{key}.{key}", "N/A"),
-                    "result": row.get(f"outputs.{key}.{key}_result", "N/A"),
-                    "reason": row.get(f"outputs.{key}.{key}_reason", "N/A"),
-                }
-
-            display_evaluation_results(evaluation_results)
-
-            studio_url = eval_result.get("studio_url")
-            if studio_url:
-                print(f"\n[bold blue]View results in Azure AI Foundry:[/bold blue] {studio_url}")
-        finally:
-            os.unlink(eval_data_file)
-    else:
-        intent_evaluator = IntentResolutionEvaluator(**evaluator_kwargs)
-        completeness_evaluator = ResponseCompletenessEvaluator(**evaluator_kwargs)
-        adherence_evaluator = TaskAdherenceEvaluator(**evaluator_kwargs)
-        tool_accuracy_evaluator = ToolCallAccuracyEvaluator(**evaluator_kwargs)
-
-        intent_result = intent_evaluator(query=eval_query, response=eval_response, tool_definitions=tool_definitions)
-        completeness_result = completeness_evaluator(response=response.text, ground_truth=ground_truth)
-        adherence_result = adherence_evaluator(
-            query=eval_query, response=eval_response, tool_definitions=tool_definitions
-        )
-        tool_accuracy_result = tool_accuracy_evaluator(
-            query=eval_query, response=eval_response, tool_definitions=tool_definitions
-        )
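+    # Each evaluator returns a flat dict: the score under the metric key, a
+    # pass/fail verdict under `<key>_result`, and an explanation under
+    # `<key>_reason` (error_message is used as a fallback reason).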
+    evaluation_results = {}
+    for name, result in [
+        ("IntentResolution", intent_result),
+        ("ResponseCompleteness", completeness_result),
+        ("TaskAdherence", adherence_result),
+        ("ToolCallAccuracy", tool_accuracy_result),
+    ]:
+        key = result_keys[name]
+        evaluation_results[name] = {
+            "score": result.get(key, "N/A"),
+            "result": result.get(f"{key}_result", "N/A"),
+            "reason": result.get(f"{key}_reason", result.get("error_message", "N/A")),
+        }
 
-        evaluation_results = {}
-        for name, result in [
-            ("IntentResolution", intent_result),
-            ("ResponseCompleteness", completeness_result),
-            ("TaskAdherence", adherence_result),
-            ("ToolCallAccuracy", tool_accuracy_result),
-        ]:
-            key = result_keys[name]
-            evaluation_results[name] = {
-                "score": result.get(key, "N/A"),
-                "result": result.get(f"{key}_result", "N/A"),
-                "reason": result.get(f"{key}_reason", result.get("error_message", "N/A")),
-            }
-
-        display_evaluation_results(evaluation_results)
+    display_evaluation_results(evaluation_results)
 
     if async_credential:
         await async_credential.close()