Commit 68de6c6

Add travel planning agent evaluation scripts in English and Spanish
- Implemented `agent_evaluation.py` for English, featuring tools for weather, flights, hotels, activities, and budget estimation.
- Created `agent_evaluation.py` for Spanish with localized tool descriptions and agent instructions.
- Integrated Azure and GitHub model configurations for evaluation.
- Added evaluation logic using IntentResolution, ResponseCompleteness, TaskAdherence, and ToolCallAccuracy evaluators.
- Enhanced logging and output formatting for evaluation results.
1 parent 980943f commit 68de6c6

6 files changed

Lines changed: 1208 additions & 7 deletions

README.md

Lines changed: 1 addition & 0 deletions

@@ -174,6 +174,7 @@ You can run the examples in this repository by executing the scripts in the `examples` directory
 | [openai_tool_calling.py](examples/openai_tool_calling.py) | Tool calling with the low-level OpenAI SDK, showing manual tool dispatch. |
 | [workflow_basic.py](examples/workflow_basic.py) | A workflow-based agent. |
 | [agent_otel_aspire.py](examples/agent_otel_aspire.py) | An agent with OpenTelemetry tracing, metrics, and structured logs exported to the [Aspire Dashboard](https://aspire.dev/dashboard/standalone/). |
+| [agent_evaluation.py](examples/agent_evaluation.py) | Evaluate a travel planner agent using [Azure AI Evaluation](https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators/agent-evaluators) agent evaluators (IntentResolution, ToolCallAccuracy, TaskAdherence, ResponseCompleteness). |

 ## Using the Aspire Dashboard for telemetry

examples/agent_evaluation.py

Lines changed: 320 additions & 0 deletions
@@ -0,0 +1,320 @@
import asyncio
import json
import logging
import os
from typing import Annotated

from agent_framework import ChatAgent, tool
from agent_framework.openai import OpenAIChatClient
from azure.ai.evaluation import (
    AzureOpenAIModelConfiguration,
    IntentResolutionEvaluator,
    OpenAIModelConfiguration,
    ResponseCompletenessEvaluator,
    TaskAdherenceEvaluator,
    ToolCallAccuracyEvaluator,
)
from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
from dotenv import load_dotenv
from pydantic import Field
from rich import print
from rich.logging import RichHandler
from rich.panel import Panel
from rich.table import Table

handler = RichHandler(show_path=False, rich_tracebacks=True, show_level=False)
logging.basicConfig(level=logging.WARNING, handlers=[handler], force=True, format="%(message)s")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

load_dotenv(override=True)
API_HOST = os.getenv("API_HOST", "github")

async_credential = None
if API_HOST == "azure":
    async_credential = DefaultAzureCredential()
    token_provider = get_bearer_token_provider(async_credential, "https://cognitiveservices.azure.com/.default")
    client = OpenAIChatClient(
        base_url=f"{os.environ['AZURE_OPENAI_ENDPOINT']}/openai/v1/",
        api_key=token_provider,
        model_id=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"],
    )
    eval_model_config = AzureOpenAIModelConfiguration(
        type="azure_openai",
        azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
        azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"],
    )
elif API_HOST == "github":
    client = OpenAIChatClient(
        base_url="https://models.github.ai/inference",
        api_key=os.environ["GITHUB_TOKEN"],
        model_id=os.getenv("GITHUB_MODEL", "openai/gpt-5-mini"),
    )
    eval_model_config = OpenAIModelConfiguration(
        type="openai",
        base_url="https://models.github.ai/inference",
        api_key=os.environ["GITHUB_TOKEN"],
        model="openai/gpt-5-mini",
    )
else:
    client = OpenAIChatClient(
        api_key=os.environ["OPENAI_API_KEY"], model_id=os.environ.get("OPENAI_MODEL", "gpt-5-mini")
    )
    eval_model_config = OpenAIModelConfiguration(
        type="openai",
        api_key=os.environ["OPENAI_API_KEY"],
        model=os.environ.get("OPENAI_MODEL", "gpt-5-mini"),
    )


@tool
def get_weather(
    city: Annotated[str, Field(description="The city to get the weather forecast for.")],
    date_range: Annotated[str, Field(description="Date range in format 'YYYY-MM-DD to YYYY-MM-DD'.")],
) -> dict:
    """Returns a weather forecast for a city over a date range, including temperature and conditions."""
    logger.info(f"Getting weather for {city} ({date_range})")
    return {
        "city": city,
        "date_range": date_range,
        "forecast": [
            {"date": "Day 1", "high_f": 65, "low_f": 52, "conditions": "Partly cloudy"},
            {"date": "Day 2", "high_f": 70, "low_f": 55, "conditions": "Sunny"},
            {"date": "Day 3", "high_f": 62, "low_f": 50, "conditions": "Light rain"},
        ],
    }


@tool
def search_flights(
    origin: Annotated[str, Field(description="Departure city or airport code.")],
    destination: Annotated[str, Field(description="Arrival city or airport code.")],
    departure_date: Annotated[str, Field(description="Departure date in YYYY-MM-DD format.")],
    return_date: Annotated[str, Field(description="Return date in YYYY-MM-DD format.")],
) -> list[dict]:
    """Searches for round-trip flights and returns options with prices."""
    logger.info(f"Searching flights {origin} -> {destination} ({departure_date} to {return_date})")
    return [
        {"airline": "SkyAir", "price_usd": 850, "duration": "14h 20m", "stops": 1},
        {"airline": "OceanWings", "price_usd": 720, "duration": "16h 45m", "stops": 2},
        {"airline": "DirectJet", "price_usd": 1100, "duration": "12h 30m", "stops": 0},
    ]


@tool
def search_hotels(
    city: Annotated[str, Field(description="The city to search hotels in.")],
    checkin: Annotated[str, Field(description="Check-in date in YYYY-MM-DD format.")],
    checkout: Annotated[str, Field(description="Check-out date in YYYY-MM-DD format.")],
    max_price_per_night: Annotated[int, Field(description="Maximum price per night in USD.")],
) -> list[dict]:
    """Searches for hotels within a nightly budget and returns options with ratings."""
    logger.info(f"Searching hotels in {city} ({checkin} to {checkout}, max ${max_price_per_night}/night)")
    return [
        {"name": "Budget Inn Tokyo", "price_per_night_usd": 80, "rating": 3.8, "neighborhood": "Asakusa"},
        {"name": "Sakura Hotel", "price_per_night_usd": 120, "rating": 4.2, "neighborhood": "Shinjuku"},
        {"name": "Tokyo Garden Suites", "price_per_night_usd": 200, "rating": 4.6, "neighborhood": "Ginza"},
    ]


@tool
def get_activities(
    city: Annotated[str, Field(description="The city to find activities in.")],
    interests: Annotated[list[str], Field(description="List of interests, e.g. ['hiking', 'museums'].")],
) -> list[dict]:
    """Returns activity suggestions for a city based on user interests."""
    logger.info(f"Getting activities in {city} for interests: {interests}")
    activities = []
    if "hiking" in [i.lower() for i in interests]:
        activities.extend(
            [
                {"name": "Mt. Takao Day Hike", "cost_usd": 15, "duration": "4-5 hours"},
                {"name": "Kamakura Trail Walk", "cost_usd": 25, "duration": "3 hours"},
            ]
        )
    if "museums" in [i.lower() for i in interests]:
        activities.extend(
            [
                {"name": "Tokyo National Museum", "cost_usd": 10, "duration": "2-3 hours"},
                {"name": "teamLab Borderless", "cost_usd": 30, "duration": "2 hours"},
            ]
        )
    if not activities:
        activities = [{"name": "City walking tour", "cost_usd": 0, "duration": "3 hours"}]
    return activities


@tool
def estimate_budget(
    total_budget: Annotated[int, Field(description="Total trip budget in USD.")],
    num_days: Annotated[int, Field(description="Number of days for the trip.")],
) -> dict:
    """Provides a recommended budget breakdown for flights, hotels, activities, and food."""
    logger.info(f"Estimating budget: ${total_budget} for {num_days} days")
    flight_pct = 0.40
    hotel_pct = 0.30
    activities_pct = 0.15
    food_pct = 0.15
    return {
        "total_budget_usd": total_budget,
        "flights_usd": int(total_budget * flight_pct),
        "hotels_usd": int(total_budget * hotel_pct),
        "hotels_per_night_usd": int(total_budget * hotel_pct / num_days),
        "activities_usd": int(total_budget * activities_pct),
        "food_usd": int(total_budget * food_pct),
        "food_per_day_usd": int(total_budget * food_pct / num_days),
    }


tools = [get_weather, search_flights, search_hotels, get_activities, estimate_budget]

tool_definitions = [t.to_json_schema_spec()["function"] for t in tools]

AGENT_INSTRUCTIONS = (
    "You are a travel planning assistant. Help users plan trips by checking weather, "
    "finding flights and hotels within budget, and suggesting activities based on their interests. "
    "Always provide a complete itinerary with costs for each component and ensure the total stays "
    "within the user's budget. Include weather information to help with packing."
)

agent = ChatAgent(
    name="travel-planner",
    chat_client=client,
    instructions=AGENT_INSTRUCTIONS,
    tools=tools,
)


def convert_to_evaluator_messages(messages) -> list[dict]:
    """Convert agent framework ChatMessages to the Azure AI Evaluation message schema.

    Remaps content types: function_call -> tool_call, function_result -> tool_result.
    See: https://learn.microsoft.com/azure/ai-foundry/how-to/develop/agent-evaluate-sdk#agent-message-schema
    """
    evaluator_messages = []
    for msg in messages:
        role = str(msg.role.value) if hasattr(msg.role, "value") else str(msg.role)
        content_items = []
        for c in msg.contents:
            if c.type == "function_call":
                content_items.append(
                    {
                        "type": "tool_call",
                        "tool_call_id": c.call_id,
                        "name": c.name,
                        "arguments": json.loads(c.arguments) if isinstance(c.arguments, str) else c.arguments,
                    }
                )
            elif c.type == "function_result":
                content_items.append(
                    {
                        "type": "tool_result",
                        "tool_result": c.result,
                    }
                )
                if c.call_id:
                    evaluator_messages.append({"role": role, "tool_call_id": c.call_id, "content": content_items})
                    content_items = []
                    continue
            elif c.type == "text" and c.text:
                content_items.append({"type": "text", "text": c.text})
        if content_items:
            evaluator_messages.append({"role": role, "content": content_items})
    return evaluator_messages


def display_evaluation_results(results: dict[str, dict]) -> None:
    """Display evaluation results in a formatted table using rich."""
    table = Table(title="Agent Evaluation Results", show_lines=True)
    table.add_column("Evaluator", style="cyan", width=28)
    table.add_column("Score", style="bold", justify="center", width=8)
    table.add_column("Result", justify="center", width=8)
    table.add_column("Reason", style="dim", width=70)

    for evaluator_name, result in results.items():
        score = str(result.get("score", "N/A"))
        pass_fail = result.get("result", "N/A")
        reason = result.get("reason", "N/A")

        if pass_fail == "pass":
            result_str = "[green]pass[/green]"
        elif pass_fail == "fail":
            result_str = "[red]fail[/red]"
        else:
            result_str = str(pass_fail)

        table.add_row(evaluator_name, score, result_str, reason)

    print()
    print(table)


async def main():
    query = "Plan a 3-day trip from New York to Tokyo next month on a $2000 budget. I like hiking and museums."

    logger.info("Running travel planner agent...")
    response = await agent.run(query)
    print(Panel(response.text, title="Agent Response", border_style="blue"))

    # See: https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators/agent-evaluators
    eval_query = [
        {"role": "system", "content": AGENT_INSTRUCTIONS},
        {"role": "user", "content": [{"type": "text", "text": query}]},
    ]
    eval_response = convert_to_evaluator_messages(response.messages)

    # ResponseCompletenessEvaluator compares the response against this ground truth
    ground_truth = (
        "A complete 3-day Tokyo trip itinerary from New York including: round-trip flight options with prices, "
        "hotel recommendations within nightly budget, hiking activities (e.g. Mt. Takao), museum visits "
        "(e.g. Tokyo National Museum, teamLab Borderless), weather forecast for the travel dates, "
        "a full cost breakdown showing total under $2000, and packing suggestions based on weather."
    )

    logger.info("Running agent evaluators...")

    # TODO: is_reasoning_model=True is needed because the GitHub Models endpoint rejects the max_tokens
    # parameter that the SDK's prompty templates hardcode. This flag swaps it to max_completion_tokens.
    # On Azure OpenAI this may not be necessary. Remove once the SDK updates its prompty templates.
    evaluator_kwargs = {"model_config": eval_model_config, "is_reasoning_model": True}
    intent_evaluator = IntentResolutionEvaluator(**evaluator_kwargs)
    completeness_evaluator = ResponseCompletenessEvaluator(**evaluator_kwargs)
    adherence_evaluator = TaskAdherenceEvaluator(**evaluator_kwargs)
    tool_accuracy_evaluator = ToolCallAccuracyEvaluator(**evaluator_kwargs)

    intent_result = intent_evaluator(query=eval_query, response=eval_response)
    completeness_result = completeness_evaluator(response=response.text, ground_truth=ground_truth)
    adherence_result = adherence_evaluator(query=eval_query, response=eval_response)
    tool_accuracy_result = tool_accuracy_evaluator(
        query=eval_query, response=eval_response, tool_definitions=tool_definitions
    )

    # Evaluator output keys follow the pattern: {key}, {key}_result, {key}_reason
    result_keys = {
        "IntentResolution": "intent_resolution",
        "ResponseCompleteness": "response_completeness",
        "TaskAdherence": "task_adherence",
        "ToolCallAccuracy": "tool_call_accuracy",
    }
    evaluation_results = {}
    for name, result in [
        ("IntentResolution", intent_result),
        ("ResponseCompleteness", completeness_result),
        ("TaskAdherence", adherence_result),
        ("ToolCallAccuracy", tool_accuracy_result),
    ]:
        key = result_keys[name]
        evaluation_results[name] = {
            "score": result.get(key, "N/A"),
            "result": result.get(f"{key}_result", "N/A"),
            "reason": result.get(f"{key}_reason", result.get("error_message", "N/A")),
        }

    display_evaluation_results(evaluation_results)

    if async_credential:
        await async_credential.close()


if __name__ == "__main__":
    asyncio.run(main())
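
For orientation, the sketch below (not part of the commit) shows the evaluator-message shape that `convert_to_evaluator_messages` produces for a single tool round-trip, and how one of the evaluators can be invoked standalone on it. The literal messages and the `call_1` id are illustrative, not captured output; the model configuration mirrors the GitHub Models branch above and assumes `GITHUB_TOKEN` is set.

    # Minimal sketch: the converted message schema fed to one evaluator directly.
    import os

    from azure.ai.evaluation import OpenAIModelConfiguration, TaskAdherenceEvaluator

    eval_query = [
        {"role": "system", "content": "You are a travel planning assistant."},
        {"role": "user", "content": [{"type": "text", "text": "What's the weather in Tokyo?"}]},
    ]
    # One tool round-trip in the converted schema: an assistant tool_call,
    # a tool message carrying the tool_result (keyed by tool_call_id),
    # then the final assistant text. Values here are illustrative.
    eval_response = [
        {
            "role": "assistant",
            "content": [
                {
                    "type": "tool_call",
                    "tool_call_id": "call_1",
                    "name": "get_weather",
                    "arguments": {"city": "Tokyo", "date_range": "2025-03-01 to 2025-03-03"},
                }
            ],
        },
        {
            "role": "tool",
            "tool_call_id": "call_1",
            "content": [{"type": "tool_result", "tool_result": {"forecast": "Sunny, 70F"}}],
        },
        {"role": "assistant", "content": [{"type": "text", "text": "Expect sunny days around 70F."}]},
    ]

    model_config = OpenAIModelConfiguration(
        type="openai",
        base_url="https://models.github.ai/inference",
        api_key=os.environ["GITHUB_TOKEN"],
        model="openai/gpt-5-mini",
    )
    evaluator = TaskAdherenceEvaluator(model_config=model_config, is_reasoning_model=True)
    result = evaluator(query=eval_query, response=eval_response)
    # Output keys follow the {key}, {key}_result, {key}_reason pattern used above.
    print(result["task_adherence"], result["task_adherence_result"], result["task_adherence_reason"])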

examples/spanish/README.md

Lines changed: 1 addition & 0 deletions

@@ -175,6 +175,7 @@ You can run the examples in this repository by executing the scripts in the directory
 | [openai_tool_calling.py](openai_tool_calling.py) | Tool calling with the low-level OpenAI SDK, showing manual tool dispatch. |
 | [workflow_basic.py](workflow_basic.py) | Uses Agent Framework to create a workflow-based agent. |
 | [agent_otel_aspire.py](agent_otel_aspire.py) | An agent with OpenTelemetry tracing, metrics, and structured logs exported to the [Aspire Dashboard](https://aspire.dev/dashboard/standalone/). |
+| [agent_evaluation.py](agent_evaluation.py) | Evaluates a travel planner agent using [Azure AI Evaluation](https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators/agent-evaluators) evaluators (IntentResolution, ToolCallAccuracy, TaskAdherence, ResponseCompleteness). |

 ## Using the Aspire Dashboard for telemetry
