Skip to content

Commit 86b49d8

Browse files
Fix and simplify ComputerUse sample (#5075)
* fix the computer use sample
* rollback changes to the search state enum
* address review comments
* address review comments
1 parent d73c06f commit 86b49d8

10 files changed

Lines changed: 161 additions & 177 deletions

File tree

dotnet/samples/02-agents/AgentsWithFoundry/Agent_Step15_ComputerUse/Agent_Step15_ComputerUse.csproj

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
<Nullable>enable</Nullable>
88
<ImplicitUsings>enable</ImplicitUsings>
9-
<NoWarn>$(NoWarn);OPENAICUA001</NoWarn>
9+
<NoWarn>$(NoWarn);OPENAICUA001;MEAI001</NoWarn>
1010
</PropertyGroup>
1111

1212
<ItemGroup>
@@ -19,13 +19,13 @@
1919
</ItemGroup>
2020

2121
<ItemGroup>
<!-- Simulated browser screenshots. They are copied to the output directory on every build ("Always") -->
<!-- because ComputerUseUtil resolves them under AppDomain.CurrentDomain.BaseDirectory\Assets at run time. -->
22-
<None Update="Assets\cua_browser_search.png">
22+
<None Update="Assets\cua_browser_search.jpg">
2323
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
2424
</None>
25-
<None Update="Assets\cua_search_results.png">
25+
<None Update="Assets\cua_search_results.jpg">
2626
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
2727
</None>
28-
<None Update="Assets\cua_search_typed.png">
28+
<None Update="Assets\cua_search_typed.jpg">
2929
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
3030
</None>
3131
</ItemGroup>
402 KB
Loading
Binary file not shown.
85.4 KB
Loading
Binary file not shown.
357 KB
Loading
Binary file not shown.
Lines changed: 51 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
// Copyright (c) Microsoft. All rights reserved.
22

3+
using Microsoft.Extensions.AI;
34
using OpenAI.Responses;
45

56
namespace Demo.ComputerUse;
@@ -16,83 +17,77 @@ internal enum SearchState
1617

1718
internal static class ComputerUseUtil
1819
{
19-
/// <summary>
20-
/// Load and convert screenshot images to base64 data URLs.
21-
/// </summary>
22-
internal static Dictionary<string, byte[]> LoadScreenshotAssets()
20+
/// <summary>
/// Uploads the three screenshot files found in the "Assets" folder under the application
/// base directory through <paramref name="fileClient"/> and maps each logical key to the
/// file ID returned by the upload.
/// </summary>
/// <param name="fileClient">Client used to upload the screenshot files.</param>
/// <returns>
/// A dictionary keyed by "browser_search", "search_typed" and "search_results" whose values
/// are the uploaded file IDs.
/// </returns>
internal static async Task<Dictionary<string, string>> UploadScreenshotAssetsAsync(IHostedFileClient fileClient)
2321
{
24-
string baseDir = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Assets");
22+
string assetsDir = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Assets");
23+
24+
// Logical key -> file name inside the Assets folder.
(string key, string fileName)[] files =
25+
[
26+
("browser_search", "cua_browser_search.jpg"),
27+
("search_typed", "cua_search_typed.jpg"),
28+
("search_results", "cua_search_results.jpg")
29+
];
2530

26-
ReadOnlySpan<(string key, string fileName)> screenshotFiles =
27-
[
28-
("browser_search", "cua_browser_search.png"),
29-
("search_typed", "cua_search_typed.png"),
30-
("search_results", "cua_search_results.png")
31-
];
31+
Dictionary<string, string> screenshots = [];
3232

33-
Dictionary<string, byte[]> screenshots = [];
34-
foreach (var (key, fileName) in screenshotFiles)
33+
// Files are uploaded one at a time, in the order listed above.
// NOTE(review): if one upload throws, the IDs already collected in the local dictionary are
// never returned, so the caller cannot delete the files uploaded before the failure.
foreach (var (key, fileName) in files)
3534
{
36-
string fullPath = Path.GetFullPath(Path.Combine(baseDir, fileName));
37-
screenshots[key] = File.ReadAllBytes(fullPath);
35+
HostedFileContent result = await fileClient.UploadAsync(
36+
Path.Combine(assetsDir, fileName), new HostedFileClientOptions() { Purpose = "assistants" });
37+
screenshots[key] = result.FileId;
3838
}
3939

4040
return screenshots;
4141
}
4242

43+
/// <summary>
/// Best-effort cleanup: deletes every uploaded screenshot file listed in <paramref name="screenshots"/>.
/// </summary>
/// <remarks>
/// A failed delete is reported on standard error and does not stop the remaining deletes,
/// so one bad file ID cannot leave the other uploads behind. The method itself does not
/// rethrow delete failures, which keeps it safe to call from a <c>finally</c> block.
/// </remarks>
/// <param name="fileClient">Client used to delete the hosted files.</param>
/// <param name="screenshots">Map of screenshot key to uploaded file ID; may be empty.</param>
internal static async Task EnsureDeleteScreenshotAssetsAsync(IHostedFileClient fileClient, Dictionary<string, string> screenshots)
{
    foreach (string fileId in screenshots.Values)
    {
        try
        {
            await fileClient.DeleteAsync(fileId);
        }
        catch (Exception ex)
        {
            // Keep going so the remaining files still get deleted, but surface the failure:
            // a silent swallow would hide files left behind in the project.
            Console.Error.WriteLine($"Failed to delete uploaded screenshot '{fileId}': {ex.Message}");
        }
    }
}
56+
4357
/// <summary>
44-
/// Process a computer action and simulate its execution.
58+
/// Simulates executing a computer action by advancing the state
59+
/// and returning the screenshot file ID for the new state.
4560
/// </summary>
46-
internal static (SearchState CurrentState, byte[] ImageBytes) HandleComputerActionAndTakeScreenshot(
61+
internal static async Task<(SearchState State, string FileId)> GetScreenshotAsync(
4762
ComputerCallAction action,
4863
SearchState currentState,
49-
Dictionary<string, byte[]> screenshots)
50-
{
51-
Console.WriteLine($"Simulating the execution of computer action: {action.Kind}");
52-
53-
SearchState newState = DetermineNextState(action, currentState);
54-
string imageKey = GetImageKey(newState);
55-
56-
return (newState, screenshots[imageKey]);
57-
}
58-
59-
private static SearchState DetermineNextState(ComputerCallAction action, SearchState currentState)
64+
Dictionary<string, string> screenshots)
6065
{
61-
string actionType = action.Kind.ToString();
62-
63-
if (actionType.Equals("type", StringComparison.OrdinalIgnoreCase) && action.TypeText is not null)
66+
// A Wait action is simulated with a real 5-second pause; it does not change the state below.
if (action.Kind == ComputerCallActionKind.Wait)
6467
{
65-
return SearchState.Typed;
68+
await Task.Delay(TimeSpan.FromSeconds(5));
6669
}
6770

68-
if (IsEnterKeyAction(action, actionType))
71+
// State transition: typing text moves to Typed; an Enter/Return key press, or a click made
// while in the Typed state, moves to PressedEnter; any other action keeps the current state.
SearchState nextState = action.Kind switch
6972
{
70-
Console.WriteLine(" -> Detected ENTER key press");
71-
return SearchState.PressedEnter;
72-
}
73+
ComputerCallActionKind.Click when currentState == SearchState.Typed => SearchState.PressedEnter,
74+
ComputerCallActionKind.Type when action.TypeText is not null => SearchState.Typed,
75+
ComputerCallActionKind.KeyPress when IsEnterKey(action) => SearchState.PressedEnter,
76+
_ => currentState
77+
};
7378

74-
if (actionType.Equals("click", StringComparison.OrdinalIgnoreCase) && currentState == SearchState.Typed)
79+
// Pick the screenshot key that represents the new state; every other state maps to the initial page.
string imageKey = nextState switch
7580
{
76-
Console.WriteLine(" -> Detected click after typing");
77-
return SearchState.PressedEnter;
78-
}
81+
SearchState.PressedEnter => "search_results",
82+
SearchState.Typed => "search_typed",
83+
_ => "browser_search"
84+
};
7985

80-
return currentState;
86+
// The dictionary indexer throws KeyNotFoundException if the key was never uploaded.
return (nextState, screenshots[imageKey]);
8187
}
8288

83-
private static bool IsEnterKeyAction(ComputerCallAction action, string actionType)
84-
{
85-
return (actionType.Equals("key", StringComparison.OrdinalIgnoreCase) ||
86-
actionType.Equals("keypress", StringComparison.OrdinalIgnoreCase)) &&
87-
action.KeyPressKeyCodes is not null &&
88-
(action.KeyPressKeyCodes.Contains("Return", StringComparer.OrdinalIgnoreCase) ||
89-
action.KeyPressKeyCodes.Contains("Enter", StringComparer.OrdinalIgnoreCase));
90-
}
91-
92-
private static string GetImageKey(SearchState state) => state switch
93-
{
94-
SearchState.PressedEnter => "search_results",
95-
SearchState.Typed => "search_typed",
96-
_ => "browser_search"
97-
};
89+
/// <summary>
/// Tells whether the action carries key codes and one of them is "Return" or "Enter",
/// compared case-insensitively.
/// </summary>
private static bool IsEnterKey(ComputerCallAction action)
{
    // No key codes at all means this cannot be an Enter press.
    if (action.KeyPressKeyCodes is null)
    {
        return false;
    }

    StringComparer ignoreCase = StringComparer.OrdinalIgnoreCase;

    return action.KeyPressKeyCodes.Contains("Return", ignoreCase)
        || action.KeyPressKeyCodes.Contains("Enter", ignoreCase);
}
9893
}
Lines changed: 75 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -1,146 +1,109 @@
11
// Copyright (c) Microsoft. All rights reserved.
22

3-
// This sample shows how to use Computer Use Tool with a ChatClientAgent.
3+
// This sample shows how to use the Computer Use tool with AIProjectClient.AsAIAgent(...).
44

55
using Azure.AI.Projects;
66
using Azure.Identity;
7+
using Demo.ComputerUse;
78
using Microsoft.Agents.AI;
89
using Microsoft.Agents.AI.Foundry;
910
using Microsoft.Extensions.AI;
1011
using OpenAI.Responses;
1112

12-
namespace Demo.ComputerUse;
13+
// Required setting: the project endpoint. Fail fast with a clear message when it is missing.
string? endpointSetting = Environment.GetEnvironmentVariable("AZURE_AI_PROJECT_ENDPOINT");
if (endpointSetting is null)
{
    throw new InvalidOperationException("AZURE_AI_PROJECT_ENDPOINT is not set.");
}

string endpoint = endpointSetting;

// Optional setting: the deployment name, falling back to "computer-use-preview" when unset.
string? deploymentSetting = Environment.GetEnvironmentVariable("AZURE_AI_COMPUTER_USE_DEPLOYMENT_NAME");
string deploymentName = deploymentSetting is null ? "computer-use-preview" : deploymentSetting;
1315

14-
internal sealed class Program
15-
{
16-
private static async Task Main(string[] args)
17-
{
18-
const string AgentInstructions = @"
19-
You are a computer automation assistant.
20-
21-
Be direct and efficient. When you reach the search results page, read and describe the actual search result titles and descriptions you can see.
22-
";
23-
24-
const string AgentName = "ComputerAgent-RAPI";
25-
26-
string endpoint = Environment.GetEnvironmentVariable("AZURE_AI_PROJECT_ENDPOINT") ?? throw new InvalidOperationException("AZURE_AI_PROJECT_ENDPOINT is not set.");
27-
string deploymentName = Environment.GetEnvironmentVariable("AZURE_AI_MODEL_DEPLOYMENT_NAME") ?? "computer-use-preview";
28-
29-
// WARNING: DefaultAzureCredential is convenient for development but requires careful consideration in production.
30-
// In production, consider using a specific credential (e.g., ManagedIdentityCredential) to avoid
31-
// latency issues, unintended credential probing, and potential security risks from fallback mechanisms.
32-
AIProjectClient aiProjectClient = new(new Uri(endpoint), new DefaultAzureCredential());
33-
34-
// Create a AIAgent with ComputerUseTool.
35-
AIAgent agent = aiProjectClient.AsAIAgent(deploymentName,
36-
instructions: AgentInstructions,
37-
name: AgentName,
38-
description: "Computer automation agent with screen interaction capabilities.",
39-
tools: [
40-
FoundryAITool.CreateComputerTool(ComputerToolEnvironment.Browser, 1026, 769),
41-
]);
42-
43-
await InvokeComputerUseAgentAsync(agent);
44-
}
16+
// NOTE: DefaultAzureCredential is convenient for development. In production, consider a specific
// credential (e.g., ManagedIdentityCredential) to avoid latency issues, unintended credential
// probing, and potential security risks from fallback mechanisms.
AIProjectClient projectClient = new(new Uri(endpoint), new DefaultAzureCredential());
17+
// File client used to upload the screenshots and delete them again; the using declaration disposes it.
using IHostedFileClient fileClient = projectClient.GetProjectOpenAIClient().AsIHostedFileClient();
4518

46-
private static async Task InvokeComputerUseAgentAsync(AIAgent agent)
47-
{
48-
// Load screenshot assets
49-
Dictionary<string, byte[]> screenshots = ComputerUseUtil.LoadScreenshotAssets();
19+
// Create the agent with a single Computer Use tool targeting the browser environment.
// NOTE(review): 1026 and 769 are presumably the display width and height — confirm against CreateComputerTool.
AIAgent agent = projectClient.AsAIAgent(
20+
model: deploymentName,
21+
name: "ComputerAgent",
22+
instructions: "You are a computer automation assistant.",
23+
tools: [FoundryAITool.CreateComputerTool(ComputerToolEnvironment.Browser, 1026, 769)]);
5024

51-
ChatOptions chatOptions = new();
52-
CreateResponseOptions responseCreationOptions = new()
53-
{
54-
TruncationMode = ResponseTruncationMode.Auto
55-
};
56-
chatOptions.RawRepresentationFactory = (_) => responseCreationOptions;
57-
ChatClientAgentRunOptions runOptions = new(chatOptions)
58-
{
59-
AllowBackgroundResponses = true,
60-
};
25+
// Declared before the try block so the finally block can see it. It stays empty until the
// upload call below returns, so nothing is deleted if the upload itself throws.
Dictionary<string, string> screenshots = [];
6126

62-
ChatMessage message = new(ChatRole.User, [
63-
new TextContent("I need you to help me search for 'OpenAI news'. Please type 'OpenAI news' and submit the search. Once you see search results, the task is complete."),
64-
new DataContent(new BinaryData(screenshots["browser_search"]), "image/png")
65-
]);
66-
67-
// Initial request with screenshot - start with Bing search page
68-
Console.WriteLine("Starting computer automation session (initial screenshot: cua_browser_search.png)...");
69-
70-
// We use PreviousResponseId to chain calls, sending only the new computer_call_output items
71-
// instead of re-sending the full context.
72-
AgentSession session = await agent.CreateSessionAsync();
73-
AgentResponse response = await agent.RunAsync(message, session: session, options: runOptions);
74-
75-
// Main interaction loop
76-
const int MaxIterations = 10;
77-
int iteration = 0;
78-
// Initialize state machine
79-
SearchState currentState = SearchState.Initial;
27+
try
28+
{
29+
// Upload pre-captured screenshots that simulate browser state transitions.
30+
screenshots = await ComputerUseUtil.UploadScreenshotAssetsAsync(fileClient);
8031

81-
while (true)
32+
// Enable auto-truncation for the Responses API.
33+
ChatClientAgentRunOptions runOptions = new()
34+
{
35+
ChatOptions = new ChatOptions
8236
{
83-
// Poll until the response is complete.
84-
while (response.ContinuationToken is { } token)
85-
{
86-
// Wait before polling again.
87-
await Task.Delay(TimeSpan.FromSeconds(2));
37+
RawRepresentationFactory = (_) => new CreateResponseOptions() { TruncationMode = ResponseTruncationMode.Auto },
38+
}
39+
};
8840

89-
// Continue with the token.
90-
runOptions.ContinuationToken = token;
41+
// Send the initial request with a screenshot of the browser.
// The screenshot is referenced by its uploaded file ID rather than sent inline.
42+
ChatMessage message = new(ChatRole.User, [
43+
new TextContent("Search for 'OpenAI news'. Type it and submit. Once you see results, the task is complete."),
44+
new AIContent() { RawRepresentation = ResponseContentPart.CreateInputImagePart(imageFileId: screenshots["browser_search"], imageDetailLevel: ResponseImageDetailLevel.High) }
45+
]);
9146

92-
response = await agent.RunAsync(session, runOptions);
93-
}
47+
Console.WriteLine("Starting computer use session...");
48+
49+
AgentSession session = await agent.CreateSessionAsync();
50+
AgentResponse response = await agent.RunAsync(message, session: session, options: runOptions);
9451

95-
// Clear the continuation token so the next RunAsync call is a fresh request.
96-
runOptions.ContinuationToken = null;
52+
// Simulated browser state; advanced by ComputerUseUtil.GetScreenshotAsync on every handled action.
SearchState currentState = SearchState.Initial;
9753

98-
Console.WriteLine($"Agent response received (ID: {response.ResponseId})");
54+
// At most 10 round-trips with the agent; the loop ends without a message if the cap is reached.
for (int i = 0; i < 10; i++)
55+
{
56+
// Find the next computer call action.
// Only the first computer call found in the response is handled per iteration.
57+
ComputerCallResponseItem? computerCall = response.Messages
58+
.SelectMany(m => m.Contents)
59+
.Select(c => c.RawRepresentation as ComputerCallResponseItem)
60+
.FirstOrDefault(item => item is not null);
9961

100-
if (iteration >= MaxIterations)
62+
if (computerCall is null)
63+
{
64+
// No further action after the search was submitted: the task is considered complete.
if (currentState == SearchState.PressedEnter)
10165
{
102-
Console.WriteLine($"\nReached maximum iterations ({MaxIterations}). Stopping.");
66+
Console.WriteLine("No more computer actions. Done.");
67+
Console.WriteLine(response);
10368
break;
10469
}
10570

106-
iteration++;
107-
Console.WriteLine($"\n--- Iteration {iteration} ---");
108-
109-
// Check for computer calls in the response
110-
IEnumerable<ComputerCallResponseItem> computerCallResponseItems = response.Messages
111-
.SelectMany(x => x.Contents)
112-
.Where(c => c.RawRepresentation is ComputerCallResponseItem and not null)
113-
.Select(c => (ComputerCallResponseItem)c.RawRepresentation!);
114-
115-
ComputerCallResponseItem? firstComputerCall = computerCallResponseItems.FirstOrDefault();
116-
if (firstComputerCall is null)
71+
// Check if the agent is asking for confirmation to proceed, and if so, respond affirmatively.
72+
TextContent? textContent = response.Messages
73+
.Where(m => m.Role == ChatRole.Assistant)
74+
.SelectMany(m => m.Contents.OfType<TextContent>())
75+
.FirstOrDefault();
76+
77+
// Heuristic on the first assistant text part only; the matches are case-sensitive.
// NOTE(review): "proceed" and '?' are broad — they also match statements such as "I will proceed".
if (textContent?.Text is { } text && (
78+
text.Contains("Would you like me") ||
79+
text.Contains("Should I") ||
80+
text.Contains("proceed") ||
81+
text.Contains('?')))
11782
{
118-
Console.WriteLine("No computer call actions found. Ending interaction.");
119-
Console.WriteLine($"Final Response: {response}");
120-
break;
83+
response = await agent.RunAsync("Please proceed.", session, runOptions);
84+
continue;
12185
}
12286

123-
// Process the first computer call response
124-
ComputerCallAction action = firstComputerCall.Action;
125-
string currentCallId = firstComputerCall.CallId;
126-
127-
Console.WriteLine($"Processing computer call (ID: {currentCallId})");
87+
// Neither a computer call nor something that looks like a question: stop without printing the response.
break;
88+
}
12889

129-
// Simulate executing the action and taking a screenshot
130-
(SearchState CurrentState, byte[] ImageBytes) screenInfo = ComputerUseUtil.HandleComputerActionAndTakeScreenshot(action, currentState, screenshots);
131-
currentState = screenInfo.CurrentState;
90+
Console.WriteLine($"[{i + 1}] Action: {computerCall!.Action.Kind}");
13291

133-
Console.WriteLine("Sending action result back to agent...");
92+
// Simulate the action and get the resulting screenshot.
93+
(currentState, string fileId) = await ComputerUseUtil.GetScreenshotAsync(computerCall.Action, currentState, screenshots);
13494

135-
// Send only the computer_call_output — the session carries PreviousResponseId for context continuity.
136-
AIContent callOutput = new()
137-
{
138-
RawRepresentation = new ComputerCallOutputResponseItem(
139-
currentCallId,
140-
output: ComputerCallOutput.CreateScreenshotOutput(new BinaryData(screenInfo.ImageBytes), "image/png"))
141-
};
95+
// Send the screenshot back as the computer call output.
96+
AIContent callOutput = new()
97+
{
98+
RawRepresentation = new ComputerCallOutputResponseItem(
99+
computerCall.CallId,
100+
output: ComputerCallOutput.CreateScreenshotOutput(screenshotImageFileId: fileId))
101+
};
142102

143-
response = await agent.RunAsync([new ChatMessage(ChatRole.User, [callOutput])], session: session, options: runOptions);
144-
}
103+
response = await agent.RunAsync([new ChatMessage(ChatRole.User, [callOutput])], session: session, options: runOptions);
145104
}
146105
}
106+
finally
107+
{
108+
// Runs whether the session completed or threw: delete every screenshot file recorded in the dictionary.
await ComputerUseUtil.EnsureDeleteScreenshotAssetsAsync(fileClient, screenshots);
109+
}

0 commit comments

Comments
 (0)