
Commit a6a4a46

Add Foundry Project setting
1 parent 68de6c6 commit a6a4a46

3 files changed

Lines changed: 226 additions & 74 deletions

.env.sample

Lines changed: 3 additions & 1 deletion
@@ -9,4 +9,6 @@ OPENAI_MODEL=gpt-3.5-turbo
 # Configure for GitHub models: (GITHUB_TOKEN already exists inside Codespaces)
 GITHUB_MODEL=gpt-5-mini
 GITHUB_TOKEN=YOUR-GITHUB-PERSONAL-ACCESS-TOKEN
-OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
+OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317
+# Optional: Set to log evaluation results to Azure AI Foundry for rich visualization
+AZURE_AI_PROJECT=https://YOUR-ACCOUNT.services.ai.azure.com/api/projects/YOUR-PROJECT
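For reference, a minimal sketch of how a script can pick up this optional setting; it assumes python-dotenv (which the example script already uses) and is illustrative rather than part of the commit:

# Minimal sketch (assumption: python-dotenv is installed, as in examples/agent_evaluation.py).
import os

from dotenv import load_dotenv

load_dotenv()
azure_ai_project = os.getenv("AZURE_AI_PROJECT")  # None when the variable is not set in .env
if azure_ai_project:
    print(f"Evaluation results will be logged to: {azure_ai_project}")
else:
    print("AZURE_AI_PROJECT not set; results are only displayed locally.")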

examples/agent_evaluation.py

Lines changed: 110 additions & 35 deletions
@@ -2,6 +2,7 @@
 import json
 import logging
 import os
+import tempfile
 from typing import Annotated

 from agent_framework import ChatAgent, tool
@@ -13,6 +14,7 @@
     ResponseCompletenessEvaluator,
     TaskAdherenceEvaluator,
     ToolCallAccuracyEvaluator,
+    evaluate,
 )
 from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
 from dotenv import load_dotenv
@@ -66,6 +68,10 @@
     model=os.environ.get("OPENAI_MODEL", "gpt-5-mini"),
 )

+# Optional: Set AZURE_AI_PROJECT in .env to log results to Azure AI Foundry.
+# Example: https://your-account.services.ai.azure.com/api/projects/your-project
+AZURE_AI_PROJECT = os.getenv("AZURE_AI_PROJECT")
+

 @tool
 def get_weather(
@@ -206,16 +212,29 @@ def convert_to_evaluator_messages(messages) -> list[dict]:
                     }
                 )
             elif c.type == "function_result":
+                if c.call_id:
+                    if content_items:
+                        evaluator_messages.append({"role": role, "content": content_items})
+                        content_items = []
+                    evaluator_messages.append(
+                        {
+                            "role": "tool",
+                            "tool_call_id": c.call_id,
+                            "content": [
+                                {
+                                    "type": "tool_result",
+                                    "tool_result": c.result,
+                                }
+                            ],
+                        }
+                    )
+                    continue
                 content_items.append(
                     {
                         "type": "tool_result",
                         "tool_result": c.result,
                     }
                 )
-                if c.call_id:
-                    evaluator_messages.append({"role": role, "tool_call_id": c.call_id, "content": content_items})
-                    content_items = []
-                    continue
             elif c.type == "text" and c.text:
                 content_items.append({"type": "text", "text": c.text})
         if content_items:
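For context, this change emits each function result that carries a call_id as its own message with role "tool", rather than folding it into the surrounding content list. A hypothetical converted entry (field values are illustrative, not taken from a real run) would look like:

# Illustrative shape of one converted tool-result message; values are hypothetical.
converted_tool_message = {
    "role": "tool",
    "tool_call_id": "call_abc123",
    "content": [
        {"type": "tool_result", "tool_result": '{"temperature_c": 22, "condition": "sunny"}'},
    ],
}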
@@ -256,14 +275,12 @@ async def main():
     response = await agent.run(query)
     print(Panel(response.text, title="Agent Response", border_style="blue"))

-    # See: https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators/agent-evaluators
     eval_query = [
         {"role": "system", "content": AGENT_INSTRUCTIONS},
         {"role": "user", "content": [{"type": "text", "text": query}]},
     ]
     eval_response = convert_to_evaluator_messages(response.messages)

-    # ResponseCompletenessEvaluator compares the response against this ground truth
     ground_truth = (
         "A complete 3-day Tokyo trip itinerary from New York including: round-trip flight options with prices, "
         "hotel recommendations within nightly budget, hiking activities (e.g. Mt. Takao), museum visits "
@@ -273,44 +290,102 @@ async def main():

     logger.info("Running agent evaluators...")

-    # TODO: is_reasoning_model=True is needed because the GitHub Models endpoint rejects the max_tokens
-    # parameter that the SDK's prompty templates hardcode. This flag swaps it to max_completion_tokens.
-    # On Azure OpenAI this may not be necessary. Remove once the SDK updates its prompty templates.
     evaluator_kwargs = {"model_config": eval_model_config, "is_reasoning_model": True}
-    intent_evaluator = IntentResolutionEvaluator(**evaluator_kwargs)
-    completeness_evaluator = ResponseCompletenessEvaluator(**evaluator_kwargs)
-    adherence_evaluator = TaskAdherenceEvaluator(**evaluator_kwargs)
-    tool_accuracy_evaluator = ToolCallAccuracyEvaluator(**evaluator_kwargs)
-
-    intent_result = intent_evaluator(query=eval_query, response=eval_response)
-    completeness_result = completeness_evaluator(response=response.text, ground_truth=ground_truth)
-    adherence_result = adherence_evaluator(query=eval_query, response=eval_response)
-    tool_accuracy_result = tool_accuracy_evaluator(
-        query=eval_query, response=eval_response, tool_definitions=tool_definitions
-    )
-
-    # Evaluator output keys follow the pattern: {key}, {key}_result, {key}_reason
     result_keys = {
         "IntentResolution": "intent_resolution",
         "ResponseCompleteness": "response_completeness",
         "TaskAdherence": "task_adherence",
         "ToolCallAccuracy": "tool_call_accuracy",
     }
-    evaluation_results = {}
-    for name, result in [
-        ("IntentResolution", intent_result),
-        ("ResponseCompleteness", completeness_result),
-        ("TaskAdherence", adherence_result),
-        ("ToolCallAccuracy", tool_accuracy_result),
-    ]:
-        key = result_keys[name]
-        evaluation_results[name] = {
-            "score": result.get(key, "N/A"),
-            "result": result.get(f"{key}_result", "N/A"),
-            "reason": result.get(f"{key}_reason", result.get("error_message", "N/A")),
+
+    if AZURE_AI_PROJECT:
+        logger.info(f"Logging evaluation results to Azure AI project: {AZURE_AI_PROJECT}")
+
+        eval_data_row = {
+            "query": eval_query,
+            "response": eval_response,
+            "response_text": response.text,
+            "ground_truth": ground_truth,
+            "tool_definitions": tool_definitions,
         }

-    display_evaluation_results(evaluation_results)
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False, encoding="utf-8") as f:
+            f.write(json.dumps(eval_data_row) + "\n")
+            eval_data_file = f.name
+
+        try:
+            eval_result = evaluate(
+                data=eval_data_file,
+                evaluation_name="travel-planner-agent-eval",
+                evaluators={
+                    "intent_resolution": IntentResolutionEvaluator(**evaluator_kwargs),
+                    "response_completeness": ResponseCompletenessEvaluator(**evaluator_kwargs),
+                    "task_adherence": TaskAdherenceEvaluator(**evaluator_kwargs),
+                    "tool_call_accuracy": ToolCallAccuracyEvaluator(**evaluator_kwargs),
+                },
+                # ResponseCompletenessEvaluator expects a plain text response, not a message list,
+                # so we override its column mapping to use response_text and ground_truth.
+                # Other evaluators auto-map correctly since data keys match param names.
+                evaluator_config={
+                    "response_completeness": {
+                        "column_mapping": {
+                            "response": "${data.response_text}",
+                            "ground_truth": "${data.ground_truth}",
+                        }
+                    },
+                },
+                azure_ai_project=AZURE_AI_PROJECT,
+            )
+
+            # Parse results from the batch evaluate() output
+            evaluation_results = {}
+            rows = eval_result.get("rows", [])
+            row = rows[0] if rows else {}
+
+            for display_name, key in result_keys.items():
+                evaluation_results[display_name] = {
+                    "score": row.get(f"outputs.{key}.{key}", "N/A"),
+                    "result": row.get(f"outputs.{key}.{key}_result", "N/A"),
+                    "reason": row.get(f"outputs.{key}.{key}_reason", "N/A"),
+                }
+
+            display_evaluation_results(evaluation_results)
+
+            studio_url = eval_result.get("studio_url")
+            if studio_url:
+                print(f"\n[bold blue]View results in Azure AI Foundry:[/bold blue] {studio_url}")
+        finally:
+            os.unlink(eval_data_file)
+    else:
+        intent_evaluator = IntentResolutionEvaluator(**evaluator_kwargs)
+        completeness_evaluator = ResponseCompletenessEvaluator(**evaluator_kwargs)
+        adherence_evaluator = TaskAdherenceEvaluator(**evaluator_kwargs)
+        tool_accuracy_evaluator = ToolCallAccuracyEvaluator(**evaluator_kwargs)
+
+        intent_result = intent_evaluator(query=eval_query, response=eval_response, tool_definitions=tool_definitions)
+        completeness_result = completeness_evaluator(response=response.text, ground_truth=ground_truth)
+        adherence_result = adherence_evaluator(
+            query=eval_query, response=eval_response, tool_definitions=tool_definitions
+        )
+        tool_accuracy_result = tool_accuracy_evaluator(
+            query=eval_query, response=eval_response, tool_definitions=tool_definitions
+        )
+
+        evaluation_results = {}
+        for name, result in [
+            ("IntentResolution", intent_result),
+            ("ResponseCompleteness", completeness_result),
+            ("TaskAdherence", adherence_result),
+            ("ToolCallAccuracy", tool_accuracy_result),
+        ]:
+            key = result_keys[name]
+            evaluation_results[name] = {
+                "score": result.get(key, "N/A"),
+                "result": result.get(f"{key}_result", "N/A"),
+                "reason": result.get(f"{key}_reason", result.get("error_message", "N/A")),
+            }
+
+        display_evaluation_results(evaluation_results)

     if async_credential:
         await async_credential.close()
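To make the data flow concrete: the script serializes one evaluation row to a temporary JSONL file, the row's keys become "${data.<column>}" references that evaluate() maps onto evaluator parameters, and the returned rows flatten each evaluator's metrics under "outputs.<evaluator_name>.<metric>", which is what the parsing loop above reads. A rough sketch with abbreviated, illustrative values (not actual output):

# Rough sketch of the single JSONL data row and the flattened result row from evaluate().
# All values are abbreviated and illustrative; a real run contains the full message lists.
data_row = {
    "query": [{"role": "system", "content": "..."}, {"role": "user", "content": [{"type": "text", "text": "..."}]}],
    "response": [{"role": "assistant", "content": [{"type": "text", "text": "..."}]}],
    "response_text": "Here is a 3-day Tokyo itinerary ...",
    "ground_truth": "A complete 3-day Tokyo trip itinerary from New York including ...",
    "tool_definitions": [{"name": "get_weather", "description": "...", "parameters": {"type": "object"}}],
}

# Each evaluator's metrics come back per row as outputs.<evaluator_name>.<metric>,
# matching the row.get(f"outputs.{key}.{key}") lookups in the code above.
result_row = {
    "outputs.intent_resolution.intent_resolution": 5,
    "outputs.intent_resolution.intent_resolution_result": "pass",
    "outputs.intent_resolution.intent_resolution_reason": "The agent resolved the user's travel-planning intent ...",
    # ... same key pattern for response_completeness, task_adherence, and tool_call_accuracy
}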
