@@ -2,6 +2,7 @@
 import json
 import logging
 import os
+import tempfile
 from typing import Annotated
 
 from agent_framework import ChatAgent, tool
@@ -13,6 +14,7 @@
     ResponseCompletenessEvaluator,
     TaskAdherenceEvaluator,
     ToolCallAccuracyEvaluator,
+    evaluate,
 )
 from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
 from dotenv import load_dotenv
@@ -66,6 +68,10 @@
     model=os.environ.get("OPENAI_MODEL", "gpt-5-mini"),
 )
 
+# Optional: Set AZURE_AI_PROJECT in .env to log results to Azure AI Foundry.
+# Example: https://your-account.services.ai.azure.com/api/projects/your-project
+AZURE_AI_PROJECT = os.getenv("AZURE_AI_PROJECT")
+
 
 @tool
 def get_weather(
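
The AZURE_AI_PROJECT switch added above is the only configuration the rest of this change keys off. A minimal sketch of the resulting behavior, assuming load_dotenv() has already run as it does in this sample (the endpoint value is a placeholder):

import os

from dotenv import load_dotenv

load_dotenv()  # picks up .env, e.g. AZURE_AI_PROJECT=https://your-account.services.ai.azure.com/api/projects/your-project

if os.getenv("AZURE_AI_PROJECT"):
    # Batch path: evaluate() runs all evaluators and uploads results to Foundry.
    print("Logging evaluation results to Azure AI Foundry")
else:
    # Local path: each evaluator is invoked directly; results stay in-process.
    print("Running evaluators locally only")
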
@@ -206,16 +212,29 @@ def convert_to_evaluator_messages(messages) -> list[dict]:
                     }
                 )
             elif c.type == "function_result":
+                if c.call_id:
+                    if content_items:
+                        evaluator_messages.append({"role": role, "content": content_items})
+                        content_items = []
+                    evaluator_messages.append(
+                        {
+                            "role": "tool",
+                            "tool_call_id": c.call_id,
+                            "content": [
+                                {
+                                    "type": "tool_result",
+                                    "tool_result": c.result,
+                                }
+                            ],
+                        }
+                    )
+                    continue
                 content_items.append(
                     {
                         "type": "tool_result",
                         "tool_result": c.result,
                     }
                 )
-                if c.call_id:
-                    evaluator_messages.append({"role": role, "tool_call_id": c.call_id, "content": content_items})
-                    content_items = []
-                    continue
             elif c.type == "text" and c.text:
                 content_items.append({"type": "text", "text": c.text})
         if content_items:
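
The effect of the hunk above: a tool result that carries a call_id now flushes any buffered content as its own message and is then emitted as a standalone "tool"-role message keyed by tool_call_id, instead of the old behavior of attaching tool_call_id to the accumulating role message. A sketch of the before/after shapes this converter produces (the call_id and result values are invented):

# Old shape: the tool result was folded into the enclosing role's message.
old_entry = {
    "role": "assistant",
    "tool_call_id": "call_abc123",  # invented id for illustration
    "content": [{"type": "tool_result", "tool_result": "Sunny, 22C"}],
}

# New shape: the tool result becomes its own "tool"-role message, which is the
# conversation format the agent evaluators consume in this sample.
new_entry = {
    "role": "tool",
    "tool_call_id": "call_abc123",
    "content": [{"type": "tool_result", "tool_result": "Sunny, 22C"}],
}
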
@@ -256,14 +275,12 @@ async def main():
     response = await agent.run(query)
     print(Panel(response.text, title="Agent Response", border_style="blue"))
 
-    # See: https://learn.microsoft.com/azure/ai-foundry/concepts/evaluation-evaluators/agent-evaluators
     eval_query = [
         {"role": "system", "content": AGENT_INSTRUCTIONS},
         {"role": "user", "content": [{"type": "text", "text": query}]},
     ]
     eval_response = convert_to_evaluator_messages(response.messages)
 
-    # ResponseCompletenessEvaluator compares the response against this ground truth
     ground_truth = (
         "A complete 3-day Tokyo trip itinerary from New York including: round-trip flight options with prices, "
         "hotel recommendations within nightly budget, hiking activities (e.g. Mt. Takao), museum visits "
@@ -273,44 +290,102 @@ async def main():
     )
 
     logger.info("Running agent evaluators...")
-    # TODO: is_reasoning_model=True is needed because the GitHub Models endpoint rejects the max_tokens
-    # parameter that the SDK's prompty templates hardcode. This flag swaps it to max_completion_tokens.
-    # On Azure OpenAI this may not be necessary. Remove once the SDK updates its prompty templates.
     evaluator_kwargs = {"model_config": eval_model_config, "is_reasoning_model": True}
-    intent_evaluator = IntentResolutionEvaluator(**evaluator_kwargs)
-    completeness_evaluator = ResponseCompletenessEvaluator(**evaluator_kwargs)
-    adherence_evaluator = TaskAdherenceEvaluator(**evaluator_kwargs)
-    tool_accuracy_evaluator = ToolCallAccuracyEvaluator(**evaluator_kwargs)
-
-    intent_result = intent_evaluator(query=eval_query, response=eval_response)
-    completeness_result = completeness_evaluator(response=response.text, ground_truth=ground_truth)
-    adherence_result = adherence_evaluator(query=eval_query, response=eval_response)
-    tool_accuracy_result = tool_accuracy_evaluator(
-        query=eval_query, response=eval_response, tool_definitions=tool_definitions
-    )
-
-    # Evaluator output keys follow the pattern: {key}, {key}_result, {key}_reason
     result_keys = {
         "IntentResolution": "intent_resolution",
         "ResponseCompleteness": "response_completeness",
         "TaskAdherence": "task_adherence",
         "ToolCallAccuracy": "tool_call_accuracy",
     }
-    evaluation_results = {}
-    for name, result in [
-        ("IntentResolution", intent_result),
-        ("ResponseCompleteness", completeness_result),
-        ("TaskAdherence", adherence_result),
-        ("ToolCallAccuracy", tool_accuracy_result),
-    ]:
-        key = result_keys[name]
-        evaluation_results[name] = {
-            "score": result.get(key, "N/A"),
-            "result": result.get(f"{key}_result", "N/A"),
-            "reason": result.get(f"{key}_reason", result.get("error_message", "N/A")),
+
+    if AZURE_AI_PROJECT:
+        logger.info(f"Logging evaluation results to Azure AI project: {AZURE_AI_PROJECT}")
+
+        eval_data_row = {
+            "query": eval_query,
+            "response": eval_response,
+            "response_text": response.text,
+            "ground_truth": ground_truth,
+            "tool_definitions": tool_definitions,
         }
 
-    display_evaluation_results(evaluation_results)
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False, encoding="utf-8") as f:
+            f.write(json.dumps(eval_data_row) + "\n")
+            eval_data_file = f.name
+
+        try:
+            eval_result = evaluate(
+                data=eval_data_file,
+                evaluation_name="travel-planner-agent-eval",
+                evaluators={
+                    "intent_resolution": IntentResolutionEvaluator(**evaluator_kwargs),
+                    "response_completeness": ResponseCompletenessEvaluator(**evaluator_kwargs),
+                    "task_adherence": TaskAdherenceEvaluator(**evaluator_kwargs),
+                    "tool_call_accuracy": ToolCallAccuracyEvaluator(**evaluator_kwargs),
+                },
+                # ResponseCompletenessEvaluator expects a plain-text response, not a message list,
+                # so we override its column mapping to use response_text and ground_truth.
+                # The other evaluators auto-map correctly since the data keys match their parameter names.
+                evaluator_config={
+                    "response_completeness": {
+                        "column_mapping": {
+                            "response": "${data.response_text}",
+                            "ground_truth": "${data.ground_truth}",
+                        }
+                    },
+                },
+                azure_ai_project=AZURE_AI_PROJECT,
+            )
+
+            # Parse results from the batch evaluate() output
+            evaluation_results = {}
+            rows = eval_result.get("rows", [])
+            row = rows[0] if rows else {}
+
+            for display_name, key in result_keys.items():
+                evaluation_results[display_name] = {
+                    "score": row.get(f"outputs.{key}.{key}", "N/A"),
+                    "result": row.get(f"outputs.{key}.{key}_result", "N/A"),
+                    "reason": row.get(f"outputs.{key}.{key}_reason", "N/A"),
+                }
+
+            display_evaluation_results(evaluation_results)
+
+            studio_url = eval_result.get("studio_url")
+            if studio_url:
+                print(f"\n[bold blue]View results in Azure AI Foundry:[/bold blue] {studio_url}")
+        finally:
+            os.unlink(eval_data_file)
+    else:
+        intent_evaluator = IntentResolutionEvaluator(**evaluator_kwargs)
+        completeness_evaluator = ResponseCompletenessEvaluator(**evaluator_kwargs)
+        adherence_evaluator = TaskAdherenceEvaluator(**evaluator_kwargs)
+        tool_accuracy_evaluator = ToolCallAccuracyEvaluator(**evaluator_kwargs)
+
+        intent_result = intent_evaluator(query=eval_query, response=eval_response, tool_definitions=tool_definitions)
+        completeness_result = completeness_evaluator(response=response.text, ground_truth=ground_truth)
+        adherence_result = adherence_evaluator(
+            query=eval_query, response=eval_response, tool_definitions=tool_definitions
+        )
+        tool_accuracy_result = tool_accuracy_evaluator(
+            query=eval_query, response=eval_response, tool_definitions=tool_definitions
+        )
+
+        evaluation_results = {}
+        for name, result in [
+            ("IntentResolution", intent_result),
+            ("ResponseCompleteness", completeness_result),
+            ("TaskAdherence", adherence_result),
+            ("ToolCallAccuracy", tool_accuracy_result),
+        ]:
+            key = result_keys[name]
+            evaluation_results[name] = {
+                "score": result.get(key, "N/A"),
+                "result": result.get(f"{key}_result", "N/A"),
+                "reason": result.get(f"{key}_reason", result.get("error_message", "N/A")),
+            }
+
+        display_evaluation_results(evaluation_results)
 
     if async_credential:
         await async_credential.close()
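
For reference, the row parsing in the evaluate() branch relies on the batch output flattening each evaluator's result dict into namespaced columns. A sketch of one plausible result row (all scores and reasons invented; only the key pattern is taken from the code above):

# evaluate() returns a dict with "rows" and an optional "studio_url"; each row
# flattens every evaluator's output into "outputs.<evaluator_name>.<field>" columns,
# which is why the code above reads f"outputs.{key}.{key}_result" and friends.
row = {
    "outputs.intent_resolution.intent_resolution": 5.0,
    "outputs.intent_resolution.intent_resolution_result": "pass",
    "outputs.intent_resolution.intent_resolution_reason": "The agent resolved the user's travel-planning intent.",
    "outputs.tool_call_accuracy.tool_call_accuracy": 4.0,
    "outputs.tool_call_accuracy.tool_call_accuracy_result": "pass",
    "outputs.tool_call_accuracy.tool_call_accuracy_reason": "All tool calls matched their definitions.",
}

score = row.get("outputs.intent_resolution.intent_resolution", "N/A")  # -> 5.0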