.NET: Improve resilience of verify-samples by building separately and improving evaluation instructions (#5151)

westey-m · web-flow · commit 6d6cb840aec8 · 2026-04-09T11:25:00.000Z
* Improve resilience of verify-samples by building separately and improving evaluation instructions

* Address PR comments

* Address PR comment
diff --git a/.github/workflows/dotnet-verify-samples.yml b/.github/workflows/dotnet-verify-samples.yml
@@ -63,6 +63,11 @@ jobs:
           tenant-id: ${{ secrets.AZURE_TENANT_ID }}
           subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
 
+      - name: Build solution
+        working-directory: dotnet
+        shell: bash
+        run: dotnet build agent-framework-dotnet.slnx -f net10.0 --warnaserror
+
       - name: Run verify-samples
         id: verify
         working-directory: dotnet
diff --git a/dotnet/.github/skills/verify-samples-tool/SKILL.md b/dotnet/.github/skills/verify-samples-tool/SKILL.md
@@ -9,9 +9,16 @@ The `verify-samples` project (`dotnet/eng/verify-samples/`) is an automated tool
 
 ## Running verify-samples
 
+**Important:** By default, samples must be pre-built before running verify-samples. Build the solution first, or pass `--build` to build samples during the run:
+
 ```bash
 cd dotnet
+dotnet build agent-framework-dotnet.slnx -f net10.0
+```
+
+Then run verify-samples:
 
+```bash
 # Run all samples across all categories
 dotnet run --project eng/verify-samples -- --log results.log --csv results.csv
 
@@ -24,6 +31,10 @@ dotnet run --project eng/verify-samples -- Agent_Step02_StructuredOutput Agent_S
 # Control parallelism (default 8)
 dotnet run --project eng/verify-samples -- --parallel 8 --log results.log
 
+# Build samples during run (skips the need for a prior build step)
+# This may cause build conflicts as multiple samples are built in parallel, so use with caution
+dotnet run --project eng/verify-samples -- --build --log results.log
+
 # Combine options
 dotnet run --project eng/verify-samples -- --category 03-workflows --parallel 4 --log results.log --csv results.csv --md results.md
 ```
diff --git a/dotnet/eng/verify-samples/Program.cs b/dotnet/eng/verify-samples/Program.cs
@@ -14,6 +14,9 @@
 //   dotnet run -- --log results.log                     # Write sequential log to file
 //   dotnet run -- --csv results.csv                     # Write CSV summary to file
 //   dotnet run -- --md results.md                       # Write Markdown summary to file
+//   dotnet run -- --build                                # Build each sample as part of its run (default: samples run with --no-build)
+// Note: By default, this tool expects sample build outputs to already exist.
+// Pre-build the solution before running, or pass --build to avoid missing build output failures.
 //
 // Required environment variables (for AI-powered samples):
 //   AZURE_OPENAI_ENDPOINT
@@ -63,7 +66,7 @@
     // Run all samples
     var reporter = new ConsoleReporter();
     var verifier = new SampleVerifier(chatClient);
-    var orchestrator = new VerificationOrchestrator(verifier, reporter, dotnetRoot, TimeSpan.FromMinutes(3), logWriter);
+    var orchestrator = new VerificationOrchestrator(verifier, reporter, dotnetRoot, TimeSpan.FromMinutes(3), logWriter, buildSamples: options.BuildSamples);
 
     var run = await orchestrator.RunAllAsync(options.Samples, options.MaxParallelism);
 
diff --git a/dotnet/eng/verify-samples/SampleRunner.cs b/dotnet/eng/verify-samples/SampleRunner.cs
@@ -20,23 +20,32 @@ internal static class SampleRunner
 {
     /// <summary>
     /// Runs <c>dotnet run --framework net10.0</c> in the given project directory.
+    /// By default (<paramref name="build"/> is false) the command actually issued is
+    /// <c>dotnet run --no-build --framework net10.0</c>, so the project must already be pre-built.
     /// </summary>
     public static Task<SampleRunResult> RunAsync(
         string projectPath,
         TimeSpan timeout,
+        bool build = false,
         CancellationToken cancellationToken = default)
-        => RunAsync(projectPath, "run --framework net10.0", timeout, inputs: null, inputDelayMs: 0, cancellationToken: cancellationToken);
+        => RunAsync(projectPath, DotnetRunArgs(build), timeout, inputs: null, inputDelayMs: 0, cancellationToken: cancellationToken);
 
     /// <summary>
     /// Runs <c>dotnet run --framework net10.0</c> with stdin inputs.
+    /// By default (<paramref name="build"/> is false) the command actually issued is
+    /// <c>dotnet run --no-build --framework net10.0</c>, so the project must already be pre-built.
     /// </summary>
     public static Task<SampleRunResult> RunAsync(
         string projectPath,
         TimeSpan timeout,
         string?[]? inputs,
         int inputDelayMs = 2000,
+        bool build = false,
         CancellationToken cancellationToken = default)
-        => RunAsync(projectPath, "run --framework net10.0", timeout, inputs, inputDelayMs, cancellationToken);
+        => RunAsync(projectPath, DotnetRunArgs(build), timeout, inputs, inputDelayMs, cancellationToken);
+
+    // Arguments for "dotnet run"; --no-build is included unless a build was requested.
+    private static string DotnetRunArgs(bool build) => build ? "run --framework net10.0" : "run --no-build --framework net10.0";
 
     /// <summary>
     /// Runs an arbitrary <c>dotnet</c> command in the given working directory.
diff --git a/dotnet/eng/verify-samples/SampleVerifier.cs b/dotnet/eng/verify-samples/SampleVerifier.cs
@@ -1,5 +1,6 @@
 ﻿// Copyright (c) Microsoft. All rights reserved.
 
+using System.ComponentModel;
 using System.Text.Json.Serialization;
 using Microsoft.Agents.AI;
 using Microsoft.Extensions.AI;
@@ -27,11 +28,19 @@ public SampleVerifier(ChatClient? chatClient = null)
                 instructions: """
                     You are a test output verifier. You will be given:
                     1. The actual stdout output of a program
-                    2. A list of expectations about what the output should contain or demonstrate
+                    2. The stderr output (if any)
+                    3. A list of expectations about what the output should contain or demonstrate
 
                     Your job is to determine whether the actual output satisfies each expectation.
                     Be reasonable — the output comes from an LLM so exact wording won't match, but the
                     semantic intent should be clearly satisfied.
+
+                    In your response, you MUST:
+                    - Always provide ai_reasoning with a brief overall assessment.
+                    - Always provide exactly one entry in expectation_results for each expectation,
+                      in the same order as the input list.
+                    - For each expectation_results entry, echo the expectation text in the expectation
+                      field and explain your assessment in the detail field, citing evidence from the output.
                     """,
                 name: "OutputVerifier");
         }
@@ -78,7 +87,7 @@ public async Task<VerificationResult> VerifyAsync(SampleDefinition sample, Sampl
             }
             else
             {
-                var aiResult = await this.VerifyWithAIAsync(run.Stdout, sample.ExpectedOutputDescription);
+                var aiResult = await this.VerifyWithAIAsync(run.Stdout, run.Stderr, sample.ExpectedOutputDescription);
                 aiReasoning = aiResult.Reasoning;
 
                 foreach (var unmet in aiResult.UnmetExpectations)
@@ -100,16 +109,28 @@ public async Task<VerificationResult> VerifyAsync(SampleDefinition sample, Sampl
     }
 
     private async Task<(string Reasoning, List<string> UnmetExpectations)> VerifyWithAIAsync(
-        string actualOutput,
+        string stdout,
+        string stderr,
         string[] expectations)
     {
         var expectationList = string.Join("\n", expectations.Select((e, i) => $"  {i + 1}. {e}"));
+
+        var stderrSection = string.IsNullOrWhiteSpace(stderr)
+            ? ""
+            : $"""
+
+                Stderr output:
+                ---
+                {Truncate(stderr, 2000)}
+                ---
+                """;
+
         var prompt = $"""
             Actual program output:
             ---
-            {Truncate(actualOutput, 4000)}
+            {Truncate(stdout, 4000)}
             ---
-
+            {stderrSection}
             Expectations to verify:
             {expectationList}
 
@@ -126,7 +147,9 @@ Does the output satisfy all expectations?
                 return ($"AI verification returned null result. Raw: {response.Text}", ["AI verification returned null result."]);
             }
 
-            var reasoning = result.Reasoning ?? "(no reasoning provided)";
+            var reasoning = string.IsNullOrWhiteSpace(result.AIReasoning)
+                ? "(no reasoning provided)"
+                : result.AIReasoning;
 
             // Collect unmet expectations as individual failures
             var unmet = new List<string>();
@@ -174,12 +197,14 @@ internal sealed class AIVerificationResponse
     public bool Pass { get; set; }
 
     /// <summary>Brief explanation of the overall assessment.</summary>
-    [JsonPropertyName("reasoning")]
-    public string? Reasoning { get; set; }
+    [JsonPropertyName("ai_reasoning")]
+    [Description("Always required. Brief explanation of the overall assessment, covering all expectations.")]
+    public string AIReasoning { get; set; } = string.Empty;
 
     /// <summary>Per-expectation results.</summary>
     [JsonPropertyName("expectation_results")]
-    public List<ExpectationResult>? ExpectationResults { get; set; }
+    [Description("Always required. One entry per expectation, in the same order as the input list.")]
+    public List<ExpectationResult> ExpectationResults { get; set; } = [];
 }
 
 /// <summary>
@@ -190,13 +215,15 @@ internal sealed class ExpectationResult
 {
     /// <summary>The expectation text that was evaluated.</summary>
     [JsonPropertyName("expectation")]
-    public string? Expectation { get; set; }
+    [Description("Echo back the expectation text being evaluated.")]
+    public string Expectation { get; set; } = string.Empty;
 
     /// <summary>Whether this expectation was met.</summary>
     [JsonPropertyName("met")]
     public bool Met { get; set; }
 
     /// <summary>Detail about how the expectation was or was not met.</summary>
     [JsonPropertyName("detail")]
-    public string? Detail { get; set; }
+    [Description("Explain how the expectation was or was not met, citing specific evidence from the output.")]
+    public string Detail { get; set; } = string.Empty;
 }
diff --git a/dotnet/eng/verify-samples/VerificationOrchestrator.cs b/dotnet/eng/verify-samples/VerificationOrchestrator.cs
@@ -14,19 +14,22 @@ internal sealed class VerificationOrchestrator
     private readonly LogFileWriter? _logWriter;
     private readonly string _dotnetRoot;
     private readonly TimeSpan _timeout;
+    private readonly bool _buildSamples;
 
     public VerificationOrchestrator(
         SampleVerifier verifier,
         ConsoleReporter reporter,
         string dotnetRoot,
         TimeSpan timeout,
-        LogFileWriter? logWriter = null)
+        LogFileWriter? logWriter = null,
+        bool buildSamples = false)
     {
         this._verifier = verifier;
         this._reporter = reporter;
         this._logWriter = logWriter;
         this._dotnetRoot = dotnetRoot;
         this._timeout = timeout;
+        this._buildSamples = buildSamples; // forwarded as the build argument of SampleRunner.RunAsync
     }
 
     /// <summary>
@@ -136,8 +139,8 @@ private async Task RunSingleAsync(
 
             var projectPath = Path.Combine(this._dotnetRoot, sample.ProjectPath);
             var run = sample.Inputs.Length > 0
-                ? await SampleRunner.RunAsync(projectPath, this._timeout, sample.Inputs, sample.InputDelayMs)
-                : await SampleRunner.RunAsync(projectPath, this._timeout);
+                ? await SampleRunner.RunAsync(projectPath, this._timeout, sample.Inputs, sample.InputDelayMs, build: this._buildSamples)
+                : await SampleRunner.RunAsync(projectPath, this._timeout, build: this._buildSamples);
 
             log.Add($"[{sample.Name}] Completed ({run.Elapsed.TotalSeconds:F1}s, exit={run.ExitCode})");
             this._reporter.WriteLineWithPrefix(
diff --git a/dotnet/eng/verify-samples/VerifyOptions.cs b/dotnet/eng/verify-samples/VerifyOptions.cs
@@ -27,6 +27,12 @@ internal sealed class VerifyOptions
     /// </summary>
     public string? LogFilePath { get; init; }
 
+    /// <summary>
+    /// Set by the <c>--build</c> flag. When true, samples are built as part of <c>dotnet run</c>.
+    /// When false (the default), <c>--no-build</c> is passed, assuming a prior build step.
+    /// </summary>
+    public bool BuildSamples { get; init; }
+
     /// <summary>
     /// The filtered list of samples to process.
     /// </summary>
@@ -55,6 +61,7 @@ internal sealed class VerifyOptions
         var logFilePath = ExtractArg(argList, "--log");
         var csvFilePath = ExtractArg(argList, "--csv");
         var markdownFilePath = ExtractArg(argList, "--md");
+        var buildSamples = ExtractFlag(argList, "--build");
 
         int maxParallelism = 8;
         var parallelArg = ExtractArg(argList, "--parallel");
@@ -105,6 +112,7 @@ internal sealed class VerifyOptions
             LogFilePath = logFilePath,
             CsvFilePath = csvFilePath,
             MarkdownFilePath = markdownFilePath,
+            BuildSamples = buildSamples,
             Samples = samples,
         };
     }
@@ -128,4 +136,16 @@ internal sealed class VerifyOptions
         list.RemoveRange(idx, 2);
         return value;
     }
+
+    /// <summary>
+    /// Removes every occurrence of <paramref name="flag"/> from <paramref name="list"/>.
+    /// </summary>
+    /// <returns><c>true</c> if the flag was present at least once; otherwise <c>false</c>.</returns>
+    private static bool ExtractFlag(List<string> list, string flag)
+    {
+        // RemoveAll instead of IndexOf + RemoveAt: a repeated flag (e.g. "--build --build")
+        // would otherwise leave a stray copy in the list for later argument parsing.
+        // The comparison is ordinal and case-sensitive, as it was with IndexOf.
+        return list.RemoveAll(arg => arg == flag) > 0;
+    }
 }