Skip to content

Commit 6d6cb84

Browse files
authored
.NET: Improve resilience of verify-samples by building separately and improving evaluation instructions (#5151)
* Improve resilience of verify-samples by building separately and improving evaluation instructions * Address PR comments * Address PR comment
1 parent 79afda1 commit 6d6cb84

File tree

7 files changed

+95
-17
lines changed

7 files changed

+95
-17
lines changed

.github/workflows/dotnet-verify-samples.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,11 @@ jobs:
6363
tenant-id: ${{ secrets.AZURE_TENANT_ID }}
6464
subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
6565

66+
- name: Build solution
67+
working-directory: dotnet
68+
shell: bash
69+
run: dotnet build agent-framework-dotnet.slnx -f net10.0 --warnaserror
70+
6671
- name: Run verify-samples
6772
id: verify
6873
working-directory: dotnet

dotnet/.github/skills/verify-samples-tool/SKILL.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,16 @@ The `verify-samples` project (`dotnet/eng/verify-samples/`) is an automated tool
99

1010
## Running verify-samples
1111

12+
**Important:** By default, verify-samples runs each sample with `--no-build`, so the samples must already be built. Either build the solution first (shown below), or pass `--build` to build samples during the run (see the options further down):
13+
1214
```bash
1315
cd dotnet
16+
dotnet build agent-framework-dotnet.slnx -f net10.0
17+
```
18+
19+
Then run verify-samples:
1420

21+
```bash
1522
# Run all samples across all categories
1623
dotnet run --project eng/verify-samples -- --log results.log --csv results.csv
1724

@@ -24,6 +31,10 @@ dotnet run --project eng/verify-samples -- Agent_Step02_StructuredOutput Agent_S
2431
# Control parallelism (default 8)
2532
dotnet run --project eng/verify-samples -- --parallel 8 --log results.log
2633

34+
# Build samples during run (skips the need for a prior build step)
35+
# This may cause build conflicts as multiple samples are built in parallel, so use with caution
36+
dotnet run --project eng/verify-samples -- --build --log results.log
37+
2738
# Combine options
2839
dotnet run --project eng/verify-samples -- --category 03-workflows --parallel 4 --log results.log --csv results.csv --md results.md
2940
```

dotnet/eng/verify-samples/Program.cs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
// dotnet run -- --log results.log # Write sequential log to file
1515
// dotnet run -- --csv results.csv # Write CSV summary to file
1616
// dotnet run -- --md results.md # Write Markdown summary to file
17+
// dotnet run -- --build # Build samples during the run (default: samples are run with --no-build)
18+
// Note: By default, this tool expects sample build outputs to already exist.
19+
// Pre-build the solution before running, or pass --build to avoid missing build output failures.
1720
//
1821
// Required environment variables (for AI-powered samples):
1922
// AZURE_OPENAI_ENDPOINT
@@ -63,7 +66,7 @@
6366
// Run all samples
6467
var reporter = new ConsoleReporter();
6568
var verifier = new SampleVerifier(chatClient);
66-
var orchestrator = new VerificationOrchestrator(verifier, reporter, dotnetRoot, TimeSpan.FromMinutes(3), logWriter);
69+
var orchestrator = new VerificationOrchestrator(verifier, reporter, dotnetRoot, TimeSpan.FromMinutes(3), logWriter, buildSamples: options.BuildSamples);
6770

6871
var run = await orchestrator.RunAllAsync(options.Samples, options.MaxParallelism);
6972

dotnet/eng/verify-samples/SampleRunner.cs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,23 +20,32 @@ internal static class SampleRunner
2020
{
2121
/// <summary>
2222
/// Runs <c>dotnet run --framework net10.0</c> in the given project directory.
23+
/// When <paramref name="build"/> is false (the default), <c>--no-build</c> is passed
24+
/// to skip building, assuming the project was pre-built.
2325
/// </summary>
2426
public static Task<SampleRunResult> RunAsync(
2527
string projectPath,
2628
TimeSpan timeout,
29+
bool build = false,
2730
CancellationToken cancellationToken = default)
28-
=> RunAsync(projectPath, "run --framework net10.0", timeout, inputs: null, inputDelayMs: 0, cancellationToken: cancellationToken);
31+
=> RunAsync(projectPath, DotnetRunArgs(build), timeout, inputs: null, inputDelayMs: 0, cancellationToken: cancellationToken);
2932

3033
/// <summary>
3134
/// Runs <c>dotnet run --framework net10.0</c> with stdin inputs.
35+
/// When <paramref name="build"/> is false (the default), <c>--no-build</c> is passed
36+
/// to skip building, assuming the project was pre-built.
3237
/// </summary>
3338
public static Task<SampleRunResult> RunAsync(
3439
string projectPath,
3540
TimeSpan timeout,
3641
string?[]? inputs,
3742
int inputDelayMs = 2000,
43+
bool build = false,
3844
CancellationToken cancellationToken = default)
39-
=> RunAsync(projectPath, "run --framework net10.0", timeout, inputs, inputDelayMs, cancellationToken);
45+
=> RunAsync(projectPath, DotnetRunArgs(build), timeout, inputs, inputDelayMs, cancellationToken);
46+
47+
/// <summary>
/// Produces the <c>dotnet</c> argument string used to run a sample targeting net10.0.
/// </summary>
/// <param name="build">
/// When <see langword="true"/>, <c>dotnet run</c> is allowed to build the project;
/// when <see langword="false"/>, <c>--no-build</c> is included so the run skips building.
/// </param>
/// <returns>The argument string to pass after the <c>dotnet</c> executable name.</returns>
private static string DotnetRunArgs(bool build) =>
    // Two explicit literals instead of interpolating an optional token: the interpolated
    // form left an empty slot (and therefore a double space) when build was true.
    build
        ? "run --framework net10.0"
        : "run --no-build --framework net10.0";
4049

4150
/// <summary>
4251
/// Runs an arbitrary <c>dotnet</c> command in the given working directory.

dotnet/eng/verify-samples/SampleVerifier.cs

Lines changed: 38 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
// Copyright (c) Microsoft. All rights reserved.
22

3+
using System.ComponentModel;
34
using System.Text.Json.Serialization;
45
using Microsoft.Agents.AI;
56
using Microsoft.Extensions.AI;
@@ -27,11 +28,19 @@ public SampleVerifier(ChatClient? chatClient = null)
2728
instructions: """
2829
You are a test output verifier. You will be given:
2930
1. The actual stdout output of a program
30-
2. A list of expectations about what the output should contain or demonstrate
31+
2. The stderr output (if any)
32+
3. A list of expectations about what the output should contain or demonstrate
3133
3234
Your job is to determine whether the actual output satisfies each expectation.
3335
Be reasonable — the output comes from an LLM so exact wording won't match, but the
3436
semantic intent should be clearly satisfied.
37+
38+
In your response, you MUST:
39+
- Always provide ai_reasoning with a brief overall assessment.
40+
- Always provide exactly one entry in expectation_results for each expectation,
41+
in the same order as the input list.
42+
- For each expectation_results entry, echo the expectation text in the expectation
43+
field and explain your assessment in the detail field, citing evidence from the output.
3544
""",
3645
name: "OutputVerifier");
3746
}
@@ -78,7 +87,7 @@ public async Task<VerificationResult> VerifyAsync(SampleDefinition sample, Sampl
7887
}
7988
else
8089
{
81-
var aiResult = await this.VerifyWithAIAsync(run.Stdout, sample.ExpectedOutputDescription);
90+
var aiResult = await this.VerifyWithAIAsync(run.Stdout, run.Stderr, sample.ExpectedOutputDescription);
8291
aiReasoning = aiResult.Reasoning;
8392

8493
foreach (var unmet in aiResult.UnmetExpectations)
@@ -100,16 +109,28 @@ public async Task<VerificationResult> VerifyAsync(SampleDefinition sample, Sampl
100109
}
101110

102111
private async Task<(string Reasoning, List<string> UnmetExpectations)> VerifyWithAIAsync(
103-
string actualOutput,
112+
string stdout,
113+
string stderr,
104114
string[] expectations)
105115
{
106116
var expectationList = string.Join("\n", expectations.Select((e, i) => $" {i + 1}. {e}"));
117+
118+
var stderrSection = string.IsNullOrWhiteSpace(stderr)
119+
? ""
120+
: $"""
121+
122+
Stderr output:
123+
---
124+
{Truncate(stderr, 2000)}
125+
---
126+
""";
127+
107128
var prompt = $"""
108129
Actual program output:
109130
---
110-
{Truncate(actualOutput, 4000)}
131+
{Truncate(stdout, 4000)}
111132
---
112-
133+
{stderrSection}
113134
Expectations to verify:
114135
{expectationList}
115136
@@ -126,7 +147,9 @@ Does the output satisfy all expectations?
126147
return ($"AI verification returned null result. Raw: {response.Text}", ["AI verification returned null result."]);
127148
}
128149

129-
var reasoning = result.Reasoning ?? "(no reasoning provided)";
150+
var reasoning = string.IsNullOrWhiteSpace(result.AIReasoning)
151+
? "(no reasoning provided)"
152+
: result.AIReasoning;
130153

131154
// Collect unmet expectations as individual failures
132155
var unmet = new List<string>();
@@ -174,12 +197,14 @@ internal sealed class AIVerificationResponse
174197
public bool Pass { get; set; }
175198

176199
/// <summary>Brief explanation of the overall assessment.</summary>
177-
[JsonPropertyName("reasoning")]
178-
public string? Reasoning { get; set; }
200+
[JsonPropertyName("ai_reasoning")]
201+
[Description("Always required. Brief explanation of the overall assessment, covering all expectations.")]
202+
public string AIReasoning { get; set; } = string.Empty;
179203

180204
/// <summary>Per-expectation results.</summary>
181205
[JsonPropertyName("expectation_results")]
182-
public List<ExpectationResult>? ExpectationResults { get; set; }
206+
[Description("Always required. One entry per expectation, in the same order as the input list.")]
207+
public List<ExpectationResult> ExpectationResults { get; set; } = [];
183208
}
184209

185210
/// <summary>
@@ -190,13 +215,15 @@ internal sealed class ExpectationResult
190215
{
191216
/// <summary>The expectation text that was evaluated.</summary>
192217
[JsonPropertyName("expectation")]
193-
public string? Expectation { get; set; }
218+
[Description("Echo back the expectation text being evaluated.")]
219+
public string Expectation { get; set; } = string.Empty;
194220

195221
/// <summary>Whether this expectation was met.</summary>
196222
[JsonPropertyName("met")]
197223
public bool Met { get; set; }
198224

199225
/// <summary>Detail about how the expectation was or was not met.</summary>
200226
[JsonPropertyName("detail")]
201-
public string? Detail { get; set; }
227+
[Description("Explain how the expectation was or was not met, citing specific evidence from the output.")]
228+
public string Detail { get; set; } = string.Empty;
202229
}

dotnet/eng/verify-samples/VerificationOrchestrator.cs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,22 @@ internal sealed class VerificationOrchestrator
1414
private readonly LogFileWriter? _logWriter;
1515
private readonly string _dotnetRoot;
1616
private readonly TimeSpan _timeout;
17+
private readonly bool _buildSamples;
1718

1819
public VerificationOrchestrator(
1920
SampleVerifier verifier,
2021
ConsoleReporter reporter,
2122
string dotnetRoot,
2223
TimeSpan timeout,
23-
LogFileWriter? logWriter = null)
24+
LogFileWriter? logWriter = null,
25+
bool buildSamples = false)
2426
{
2527
this._verifier = verifier;
2628
this._reporter = reporter;
2729
this._logWriter = logWriter;
2830
this._dotnetRoot = dotnetRoot;
2931
this._timeout = timeout;
32+
this._buildSamples = buildSamples;
3033
}
3134

3235
/// <summary>
@@ -136,8 +139,8 @@ private async Task RunSingleAsync(
136139

137140
var projectPath = Path.Combine(this._dotnetRoot, sample.ProjectPath);
138141
var run = sample.Inputs.Length > 0
139-
? await SampleRunner.RunAsync(projectPath, this._timeout, sample.Inputs, sample.InputDelayMs)
140-
: await SampleRunner.RunAsync(projectPath, this._timeout);
142+
? await SampleRunner.RunAsync(projectPath, this._timeout, sample.Inputs, sample.InputDelayMs, build: this._buildSamples)
143+
: await SampleRunner.RunAsync(projectPath, this._timeout, build: this._buildSamples);
141144

142145
log.Add($"[{sample.Name}] Completed ({run.Elapsed.TotalSeconds:F1}s, exit={run.ExitCode})");
143146
this._reporter.WriteLineWithPrefix(

dotnet/eng/verify-samples/VerifyOptions.cs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,12 @@ internal sealed class VerifyOptions
2727
/// </summary>
2828
public string? LogFilePath { get; init; }
2929

30+
/// <summary>
31+
/// When true, samples are built as part of <c>dotnet run</c>.
32+
/// When false (the default), <c>--no-build</c> is passed, assuming a prior build step.
33+
/// </summary>
34+
public bool BuildSamples { get; init; }
35+
3036
/// <summary>
3137
/// The filtered list of samples to process.
3238
/// </summary>
@@ -55,6 +61,7 @@ internal sealed class VerifyOptions
5561
var logFilePath = ExtractArg(argList, "--log");
5662
var csvFilePath = ExtractArg(argList, "--csv");
5763
var markdownFilePath = ExtractArg(argList, "--md");
64+
var buildSamples = ExtractFlag(argList, "--build");
5865

5966
int maxParallelism = 8;
6067
var parallelArg = ExtractArg(argList, "--parallel");
@@ -105,6 +112,7 @@ internal sealed class VerifyOptions
105112
LogFilePath = logFilePath,
106113
CsvFilePath = csvFilePath,
107114
MarkdownFilePath = markdownFilePath,
115+
BuildSamples = buildSamples,
108116
Samples = samples,
109117
};
110118
}
@@ -128,4 +136,16 @@ internal sealed class VerifyOptions
128136
list.RemoveRange(idx, 2);
129137
return value;
130138
}
139+
140+
/// <summary>
/// Removes a boolean switch from the argument list and reports whether it was present.
/// </summary>
/// <param name="list">The mutable argument list; every entry equal to <paramref name="flag"/> is removed.</param>
/// <param name="flag">The exact switch text to look for (for example <c>--build</c>).</param>
/// <returns><see langword="true"/> if at least one occurrence was found and removed; otherwise <see langword="false"/>.</returns>
private static bool ExtractFlag(List<string> list, string flag)
{
    // Remove every occurrence, not just the first: a switch passed twice would otherwise
    // leave a stray entry behind in the list.
    // NOTE(review): presumably the remaining entries are treated as positional arguments
    // by the caller — confirm against the parsing code.
    return list.RemoveAll(arg => arg == flag) > 0;
}
131151
}

0 commit comments

Comments
 (0)