Skip to content

Commit 3ba1112

Browse files
committed
Add tool instruction following comparison tests across models
We noticed inconsistencies in how Grok models invoke (or don't invoke) tools depending on the tool's description. Right now, the 4.20 model randomly fails, whereas 4.1 passes every time (in the not-calling scenario). These tests will serve to monitor future model behavior changes, as well as to document the behavior delta between models (if any).
1 parent 13b4da7 commit 3ba1112

2 files changed

Lines changed: 139 additions & 4 deletions

File tree

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
using System.ComponentModel;
2+
using System.Text.Json;
3+
using Microsoft.Extensions.AI;
4+
using static ConfigurationExtensions;
5+
6+
namespace xAI.Tests;
7+
8+
/// <summary>
/// Live tests that check whether each model in <see cref="Models"/> follows the
/// instructions in the <c>emergency_alert</c> tool description: the tool is expected
/// to be called for distress phrases and not to be called for routine phrases.
/// </summary>
/// <remarks>
/// Every theory case issues a real request, so the tests only run when the
/// <c>XAI_API_KEY</c> secret is available (see <c>SecretsTheory</c>).
/// </remarks>
public class ToolCallingFollowing(ITestOutputHelper output)
{
    /// <summary>Name the alert tool is exposed under, and the name the assertions look for.</summary>
    const string AlertToolName = "emergency_alert";

    // Shared instance: creating JsonSerializerOptions per call is wasteful (CA1869),
    // and it is only needed to render diagnostics in a failure message.
    static readonly JsonSerializerOptions IndentedJson = new(JsonSerializerDefaults.Web)
    {
        WriteIndented = true
    };

    /// <summary>Verifies that a distress phrase makes the model call the alert tool.</summary>
    /// <param name="model">Model identifier passed to the chat client.</param>
    /// <param name="message">User message sent as the whole conversation.</param>
    [SecretsTheory("XAI_API_KEY")]
    [MemberData(nameof(AllDistressMessages))]
    public async Task InvokesDistress(string model, string message)
    {
        // The pipeline is IDisposable; release it when the test case ends.
        using var chat = CreateChatClient(model);

        var options = new ChatOptions
        {
            Tools = [AIFunctionFactory.Create(SendAlertAsync)]
        };

        var response = await chat.GetResponseAsync(message, options);

        var calledTools = GetFunctionCalls(response)
            .Select(call => call.Name)
            .ToList();

        Assert.True(
            calledTools.Contains(AlertToolName, StringComparer.OrdinalIgnoreCase),
            $"[{model}] LLM did not call {AlertToolName} for: \"{message}\". " +
            $"Tools called: [{string.Join(", ", calledTools)}]");
    }

    /// <summary>Verifies that a routine phrase does not make the model call the alert tool.</summary>
    /// <param name="model">Model identifier passed to the chat client.</param>
    /// <param name="message">User message sent as the whole conversation.</param>
    [SecretsTheory("XAI_API_KEY")]
    [MemberData(nameof(AllRoutineMessages))]
    public async Task DoesNotInvokeDistress(string model, string message)
    {
        // The pipeline is IDisposable; release it when the test case ends.
        using var chat = CreateChatClient(model);

        var options = new ChatOptions
        {
            Tools = [AIFunctionFactory.Create(SendAlertAsync)],
            ToolMode = ChatToolMode.Auto
        };

        var response = await chat.GetResponseAsync(message, options);

        // Materialized once: used both for the check and for the failure diagnostics.
        var calls = GetFunctionCalls(response);
        var calledTools = calls
            .Select(call => call.Name)
            .ToList();

        Assert.False(
            calledTools.Contains(AlertToolName, StringComparer.OrdinalIgnoreCase),
            $"""
            [{model}] LLM did call {AlertToolName} for: "{message}".
            Tool calls:
            {string.Join(Environment.NewLine, calls.Select(call => JsonSerializer.Serialize(call, IndentedJson)))}
            """);
    }

    /// <summary>Cross product of <see cref="Models"/> and <see cref="DistressPhrases"/> as theory rows.</summary>
    public static IEnumerable<object[]> AllDistressMessages =>
        from model in Models
        from message in DistressPhrases
        select new object[] { model, message };

    /// <summary>Cross product of <see cref="Models"/> and <see cref="RoutinePhrases"/> as theory rows.</summary>
    public static IEnumerable<object[]> AllRoutineMessages =>
        from model in Models
        from message in RoutinePhrases
        select new object[] { model, message };

    // Models whose tool-calling behavior is being compared.
    static readonly string[] Models = ["grok-4-1-fast-non-reasoning", "grok-4.20-non-reasoning"];

    // Everyday requests (Spanish and English) for which the alert tool must NOT be called.
    static readonly string[] RoutinePhrases =
    [
        "¿qué hora es?",
        "¿qué tiempo hace hoy?",
        "poneme música",
        "cuéntame un chiste",
        "buenas tardes",
        "thanks",
        "gracias",
        "what time is it?",
        "tell me a joke",
        "play some music",
        "good morning",
        "what's the weather like?",
        "remind me to take my pills at 8pm",
    ];

    // Distress statements (Spanish and English) for which the alert tool must be called.
    static readonly string[] DistressPhrases =
    [
        "necesito ayuda, me siento muy mal",
        "socorro",
        "me caí",
        "me caí de la cama",
        "me duele el pecho",
        "no puedo levantarme",
        "llamá a mi hijo, me caí",
        "please help me, I am feeling really bad",
        "I fell down",
        "I'm having chest pains",
        "I can't get up",
        "I think I'm having a heart attack",
    ];

    /// <summary>
    /// Stub tool handed to the model. It performs no work and always completes with
    /// <c>"done"</c>; the tests only observe whether the model chose to call it.
    /// </summary>
    /// <remarks>
    /// The snake_case parameter name is kept as-is since changing it would change the
    /// parameter name callers (and presumably the generated tool schema) see.
    /// </remarks>
    [DisplayName(AlertToolName)]
    [Description(
        """
        Use ONLY when the elder is expressing their own physical distress, inability to move, or calling for help
        right now — examples: a fall ('I fell', 'I can't get up', 'no puedo levantarme'), acute pain, difficulty
        breathing, chest pain, feeling very unwell, or an explicit plea for help ('help me', 'I feel terrible',
        'please help me', 'I am feeling really bad').
        """)]
    public Task<string> SendAlertAsync(
        [Description("The elder's exact words (verbatim transcription). Must be the original user message, not a summary or paraphrase.")]
        string elder_message,
        [Description("Explanation of why the tool is being invoked")]
        string reason,
        [Description("Severity level: 'high' for general distress, 'critical' for falls, chest pain, breathing difficulty")]
        string severity = "high",
        CancellationToken ct = default) => Task.FromResult("done");

    /// <summary>
    /// Builds the chat pipeline shared by both tests: a Grok client for
    /// <paramref name="model"/> wrapped with function invocation and logging to the test output.
    /// </summary>
    IChatClient CreateChatClient(string model) =>
        new GrokClient(Configuration["XAI_API_KEY"]!).AsIChatClient(model)
            .AsBuilder()
            // Cap function-invocation iterations at 3 per request.
            .UseFunctionInvocation(configure: client => client.MaximumIterationsPerRequest = 3)
            .UseLogging(output.AsLoggerFactory())
            .Build();

    /// <summary>Collects every function call found in any message of <paramref name="response"/>.</summary>
    static List<FunctionCallContent> GetFunctionCalls(ChatResponse response) =>
        response.Messages
            .SelectMany(chatMessage => chatMessage.Contents.OfType<FunctionCallContent>())
            .ToList();
}

src/xAI.Tests/xAI.Tests.csproj

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,8 @@
22

33
<PropertyGroup>
44
<TargetFramework>net10.0</TargetFramework>
5-
<ImplicitUsings>enable</ImplicitUsings>
6-
<Nullable>enable</Nullable>
7-
<IsPackable>false</IsPackable>
5+
<RootNamespace>xAI.Tests</RootNamespace>
86
<NoWarn>MEAI001;xAI001;$(NoWarn)</NoWarn>
9-
<LangVersion>latest</LangVersion>
107
</PropertyGroup>
118

129
<ItemGroup>

0 commit comments

Comments
 (0)