Commit 68de6c6

Add travel planning agent evaluation scripts in English and Spanish
- Implemented `agent_evaluation.py` for English, featuring tools for weather, flights, hotels, activities, and budget estimation.
- Created `agent_evaluation.py` for Spanish with localized tool descriptions and agent instructions.
- Integrated Azure and GitHub model configurations for evaluation.
- Added evaluation logic using IntentResolution, ResponseCompleteness, TaskAdherence, and ToolCallAccuracy evaluators.
- Enhanced logging and output formatting for evaluation results.
1 parent 980943f commit 68de6c6

6 files changed

Lines changed: 1208 additions & 7 deletions

README.md

Lines changed: 1 addition & 0 deletions

@@ -174,6 +174,7 @@ You can run the examples in this repository by executing the scripts in the `examples` directory
 | [openai_tool_calling.py](examples/openai_tool_calling.py) | Tool calling with the low-level OpenAI SDK, showing manual tool dispatch. |
 | [workflow_basic.py](examples/workflow_basic.py) | A workflow-based agent. |
 | [agent_otel_aspire.py](examples/agent_otel_aspire.py) | An agent with OpenTelemetry tracing, metrics, and structured logs exported to the [Aspire Dashboard](https://aspire.dev/dashboard/standalone/). |
+| [agent_evaluation.py](examples/agent_evaluation.py) | Evaluate a travel planner agent using [Azure AI Evaluation](https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators/agent-evaluators) agent evaluators (IntentResolution, ToolCallAccuracy, TaskAdherence, ResponseCompleteness). |

 ## Using the Aspire Dashboard for telemetry

examples/agent_evaluation.py

Lines changed: 320 additions & 0 deletions
@@ -0,0 +1,320 @@
import asyncio
import json
import logging
import os
from typing import Annotated

from agent_framework import ChatAgent, tool
from agent_framework.openai import OpenAIChatClient
from azure.ai.evaluation import (
    AzureOpenAIModelConfiguration,
    IntentResolutionEvaluator,
    OpenAIModelConfiguration,
    ResponseCompletenessEvaluator,
    TaskAdherenceEvaluator,
    ToolCallAccuracyEvaluator,
)
from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
from dotenv import load_dotenv
from pydantic import Field
from rich import print
from rich.logging import RichHandler
from rich.panel import Panel
from rich.table import Table

handler = RichHandler(show_path=False, rich_tracebacks=True, show_level=False)
logging.basicConfig(level=logging.WARNING, handlers=[handler], force=True, format="%(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

load_dotenv(override=True)
API_HOST = os.getenv("API_HOST", "github")

async_credential = None
if API_HOST == "azure":
    async_credential = DefaultAzureCredential()
    token_provider = get_bearer_token_provider(async_credential, "https://cognitiveservices.azure.com/.default")
    client = OpenAIChatClient(
        base_url=f"{os.environ['AZURE_OPENAI_ENDPOINT']}/openai/v1/",
        api_key=token_provider,
        model_id=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"],
    )
    eval_model_config = AzureOpenAIModelConfiguration(
        type="azure_openai",
        azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
        azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"],
    )
elif API_HOST == "github":
    client = OpenAIChatClient(
        base_url="https://models.github.ai/inference",
        api_key=os.environ["GITHUB_TOKEN"],
        model_id=os.getenv("GITHUB_MODEL", "openai/gpt-5-mini"),
    )
    eval_model_config = OpenAIModelConfiguration(
        type="openai",
        base_url="https://models.github.ai/inference",
        api_key=os.environ["GITHUB_TOKEN"],
        model="openai/gpt-5-mini",
    )
else:
    client = OpenAIChatClient(
        api_key=os.environ["OPENAI_API_KEY"], model_id=os.environ.get("OPENAI_MODEL", "gpt-5-mini")
    )
    eval_model_config = OpenAIModelConfiguration(
        type="openai",
        api_key=os.environ["OPENAI_API_KEY"],
        model=os.environ.get("OPENAI_MODEL", "gpt-5-mini"),
    )


@tool
def get_weather(
    city: Annotated[str, Field(description="The city to get the weather forecast for.")],
    date_range: Annotated[str, Field(description="Date range in format 'YYYY-MM-DD to YYYY-MM-DD'.")],
) -> dict:
    """Returns a weather forecast for a city over a date range, including temperature and conditions."""
    logger.info(f"Getting weather for {city} ({date_range})")
    return {
        "city": city,
        "date_range": date_range,
        "forecast": [
            {"date": "Day 1", "high_f": 65, "low_f": 52, "conditions": "Partly cloudy"},
            {"date": "Day 2", "high_f": 70, "low_f": 55, "conditions": "Sunny"},
            {"date": "Day 3", "high_f": 62, "low_f": 50, "conditions": "Light rain"},
        ],
    }


@tool
def search_flights(
    origin: Annotated[str, Field(description="Departure city or airport code.")],
    destination: Annotated[str, Field(description="Arrival city or airport code.")],
    departure_date: Annotated[str, Field(description="Departure date in YYYY-MM-DD format.")],
    return_date: Annotated[str, Field(description="Return date in YYYY-MM-DD format.")],
) -> list[dict]:
    """Searches for round-trip flights and returns options with prices."""
    logger.info(f"Searching flights {origin} -> {destination} ({departure_date} to {return_date})")
    return [
        {"airline": "SkyAir", "price_usd": 850, "duration": "14h 20m", "stops": 1},
        {"airline": "OceanWings", "price_usd": 720, "duration": "16h 45m", "stops": 2},
        {"airline": "DirectJet", "price_usd": 1100, "duration": "12h 30m", "stops": 0},
    ]


@tool
def search_hotels(
    city: Annotated[str, Field(description="The city to search hotels in.")],
    checkin: Annotated[str, Field(description="Check-in date in YYYY-MM-DD format.")],
    checkout: Annotated[str, Field(description="Check-out date in YYYY-MM-DD format.")],
    max_price_per_night: Annotated[int, Field(description="Maximum price per night in USD.")],
) -> list[dict]:
    """Searches for hotels within a nightly budget and returns options with ratings."""
    logger.info(f"Searching hotels in {city} ({checkin} to {checkout}, max ${max_price_per_night}/night)")
    return [
        {"name": "Budget Inn Tokyo", "price_per_night_usd": 80, "rating": 3.8, "neighborhood": "Asakusa"},
        {"name": "Sakura Hotel", "price_per_night_usd": 120, "rating": 4.2, "neighborhood": "Shinjuku"},
        {"name": "Tokyo Garden Suites", "price_per_night_usd": 200, "rating": 4.6, "neighborhood": "Ginza"},
    ]


@tool
def get_activities(
    city: Annotated[str, Field(description="The city to find activities in.")],
    interests: Annotated[list[str], Field(description="List of interests, e.g. ['hiking', 'museums'].")],
) -> list[dict]:
    """Returns activity suggestions for a city based on user interests."""
    logger.info(f"Getting activities in {city} for interests: {interests}")
    activities = []
    if "hiking" in [i.lower() for i in interests]:
        activities.extend(
            [
                {"name": "Mt. Takao Day Hike", "cost_usd": 15, "duration": "4-5 hours"},
                {"name": "Kamakura Trail Walk", "cost_usd": 25, "duration": "3 hours"},
            ]
        )
    if "museums" in [i.lower() for i in interests]:
        activities.extend(
            [
                {"name": "Tokyo National Museum", "cost_usd": 10, "duration": "2-3 hours"},
                {"name": "teamLab Borderless", "cost_usd": 30, "duration": "2 hours"},
            ]
        )
    if not activities:
        activities = [{"name": "City walking tour", "cost_usd": 0, "duration": "3 hours"}]
    return activities


@tool
def estimate_budget(
    total_budget: Annotated[int, Field(description="Total trip budget in USD.")],
    num_days: Annotated[int, Field(description="Number of days for the trip.")],
) -> dict:
    """Provides a recommended budget breakdown for flights, hotels, activities, and food."""
    logger.info(f"Estimating budget: ${total_budget} for {num_days} days")
    flight_pct = 0.40
    hotel_pct = 0.30
    activities_pct = 0.15
    food_pct = 0.15
    return {
        "total_budget_usd": total_budget,
        "flights_usd": int(total_budget * flight_pct),
        "hotels_usd": int(total_budget * hotel_pct),
        "hotels_per_night_usd": int(total_budget * hotel_pct / num_days),
        "activities_usd": int(total_budget * activities_pct),
        "food_usd": int(total_budget * food_pct),
        "food_per_day_usd": int(total_budget * food_pct / num_days),
    }


tools = [get_weather, search_flights, search_hotels, get_activities, estimate_budget]

tool_definitions = [t.to_json_schema_spec()["function"] for t in tools]

AGENT_INSTRUCTIONS = (
    "You are a travel planning assistant. Help users plan trips by checking weather, "
    "finding flights and hotels within budget, and suggesting activities based on their interests. "
    "Always provide a complete itinerary with costs for each component and ensure the total stays "
    "within the user's budget. Include weather information to help with packing."
)

agent = ChatAgent(
    name="travel-planner",
    chat_client=client,
    instructions=AGENT_INSTRUCTIONS,
    tools=tools,
)


def convert_to_evaluator_messages(messages) -> list[dict]:
    """Convert agent framework ChatMessages to the Azure AI Evaluation message schema.

    Remaps content types: function_call -> tool_call, function_result -> tool_result.
    See: https://learn.microsoft.com/azure/ai-foundry/how-to/develop/agent-evaluate-sdk#agent-message-schema
    """
    evaluator_messages = []
    for msg in messages:
        role = str(msg.role.value) if hasattr(msg.role, "value") else str(msg.role)
        content_items = []
        for c in msg.contents:
            if c.type == "function_call":
                content_items.append(
                    {
                        "type": "tool_call",
                        "tool_call_id": c.call_id,
                        "name": c.name,
                        "arguments": json.loads(c.arguments) if isinstance(c.arguments, str) else c.arguments,
                    }
                )
            elif c.type == "function_result":
                content_items.append(
                    {
                        "type": "tool_result",
                        "tool_result": c.result,
                    }
                )
                if c.call_id:
                    evaluator_messages.append({"role": role, "tool_call_id": c.call_id, "content": content_items})
                    content_items = []
                    continue
            elif c.type == "text" and c.text:
                content_items.append({"type": "text", "text": c.text})
        if content_items:
            evaluator_messages.append({"role": role, "content": content_items})
    return evaluator_messages


def display_evaluation_results(results: dict[str, dict]) -> None:
    """Display evaluation results in a formatted table using rich."""
    table = Table(title="Agent Evaluation Results", show_lines=True)
    table.add_column("Evaluator", style="cyan", width=28)
    table.add_column("Score", style="bold", justify="center", width=8)
    table.add_column("Result", justify="center", width=8)
    table.add_column("Reason", style="dim", width=70)

    for evaluator_name, result in results.items():
        score = str(result.get("score", "N/A"))
        pass_fail = result.get("result", "N/A")
        reason = result.get("reason", "N/A")

        if pass_fail == "pass":
            result_str = "[green]pass[/green]"
        elif pass_fail == "fail":
            result_str = "[red]fail[/red]"
        else:
            result_str = str(pass_fail)

        table.add_row(evaluator_name, score, result_str, reason)

    print()
    print(table)


async def main():
    query = "Plan a 3-day trip from New York to Tokyo next month on a $2000 budget. I like hiking and museums."

    logger.info("Running travel planner agent...")
    response = await agent.run(query)
    print(Panel(response.text, title="Agent Response", border_style="blue"))

    # See: https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators/agent-evaluators
    eval_query = [
        {"role": "system", "content": AGENT_INSTRUCTIONS},
        {"role": "user", "content": [{"type": "text", "text": query}]},
    ]
    eval_response = convert_to_evaluator_messages(response.messages)

    # ResponseCompletenessEvaluator compares the response against this ground truth
    ground_truth = (
        "A complete 3-day Tokyo trip itinerary from New York including: round-trip flight options with prices, "
        "hotel recommendations within nightly budget, hiking activities (e.g. Mt. Takao), museum visits "
        "(e.g. Tokyo National Museum, teamLab Borderless), weather forecast for the travel dates, "
        "a full cost breakdown showing total under $2000, and packing suggestions based on weather."
    )

    logger.info("Running agent evaluators...")

    # TODO: is_reasoning_model=True is needed because the GitHub Models endpoint rejects the max_tokens
    # parameter that the SDK's prompty templates hardcode. This flag swaps it to max_completion_tokens.
    # On Azure OpenAI this may not be necessary. Remove once the SDK updates its prompty templates.
    evaluator_kwargs = {"model_config": eval_model_config, "is_reasoning_model": True}
    intent_evaluator = IntentResolutionEvaluator(**evaluator_kwargs)
    completeness_evaluator = ResponseCompletenessEvaluator(**evaluator_kwargs)
    adherence_evaluator = TaskAdherenceEvaluator(**evaluator_kwargs)
    tool_accuracy_evaluator = ToolCallAccuracyEvaluator(**evaluator_kwargs)

    intent_result = intent_evaluator(query=eval_query, response=eval_response)
    completeness_result = completeness_evaluator(response=response.text, ground_truth=ground_truth)
    adherence_result = adherence_evaluator(query=eval_query, response=eval_response)
    tool_accuracy_result = tool_accuracy_evaluator(
        query=eval_query, response=eval_response, tool_definitions=tool_definitions
    )

    # Evaluator output keys follow the pattern: {key}, {key}_result, {key}_reason
    result_keys = {
        "IntentResolution": "intent_resolution",
        "ResponseCompleteness": "response_completeness",
        "TaskAdherence": "task_adherence",
        "ToolCallAccuracy": "tool_call_accuracy",
    }
    evaluation_results = {}
    for name, result in [
        ("IntentResolution", intent_result),
        ("ResponseCompleteness", completeness_result),
        ("TaskAdherence", adherence_result),
        ("ToolCallAccuracy", tool_accuracy_result),
    ]:
        key = result_keys[name]
        evaluation_results[name] = {
            "score": result.get(key, "N/A"),
            "result": result.get(f"{key}_result", "N/A"),
            "reason": result.get(f"{key}_reason", result.get("error_message", "N/A")),
        }

    display_evaluation_results(evaluation_results)

    if async_credential:
        await async_credential.close()


if __name__ == "__main__":
    asyncio.run(main())
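
For orientation, the sketch below (not part of the commit) shows the evaluator-message shape that `convert_to_evaluator_messages` produces for a single tool round-trip, and how one of the evaluators can be invoked standalone on it. The literal messages and the `call_1` id are illustrative, not captured output; the model configuration mirrors the GitHub Models branch above and assumes `GITHUB_TOKEN` is set.

    # Minimal sketch: the converted message schema fed to one evaluator directly.
    import os

    from azure.ai.evaluation import OpenAIModelConfiguration, TaskAdherenceEvaluator

    eval_query = [
        {"role": "system", "content": "You are a travel planning assistant."},
        {"role": "user", "content": [{"type": "text", "text": "What's the weather in Tokyo?"}]},
    ]
    # One tool round-trip in the converted schema: an assistant tool_call,
    # a tool message carrying the tool_result (keyed by tool_call_id),
    # then the final assistant text. Values here are illustrative.
    eval_response = [
        {
            "role": "assistant",
            "content": [
                {
                    "type": "tool_call",
                    "tool_call_id": "call_1",
                    "name": "get_weather",
                    "arguments": {"city": "Tokyo", "date_range": "2025-03-01 to 2025-03-03"},
                }
            ],
        },
        {
            "role": "tool",
            "tool_call_id": "call_1",
            "content": [{"type": "tool_result", "tool_result": {"forecast": "Sunny, 70F"}}],
        },
        {"role": "assistant", "content": [{"type": "text", "text": "Expect sunny days around 70F."}]},
    ]

    model_config = OpenAIModelConfiguration(
        type="openai",
        base_url="https://models.github.ai/inference",
        api_key=os.environ["GITHUB_TOKEN"],
        model="openai/gpt-5-mini",
    )
    evaluator = TaskAdherenceEvaluator(model_config=model_config, is_reasoning_model=True)
    result = evaluator(query=eval_query, response=eval_response)
    # Output keys follow the {key}, {key}_result, {key}_reason pattern used above.
    print(result["task_adherence"], result["task_adherence_result"], result["task_adherence_reason"])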

examples/spanish/README.md

Lines changed: 1 addition & 0 deletions

@@ -175,6 +175,7 @@ You can run the examples in this repository by executing the scripts in the directory
 | [openai_tool_calling.py](openai_tool_calling.py) | Tool calling with the low-level OpenAI SDK, showing manual tool dispatch. |
 | [workflow_basic.py](workflow_basic.py) | Uses Agent Framework to create a workflow-based agent. |
 | [agent_otel_aspire.py](agent_otel_aspire.py) | An agent with OpenTelemetry tracing, metrics, and structured logs exported to the [Aspire Dashboard](https://aspire.dev/dashboard/standalone/). |
+| [agent_evaluation.py](agent_evaluation.py) | Evaluates a travel planner agent using [Azure AI Evaluation](https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators/agent-evaluators) evaluators (IntentResolution, ToolCallAccuracy, TaskAdherence, ResponseCompleteness). |

 ## Using the Aspire Dashboard for telemetry
