I wrote a script where I compared calling the same model through three different endpoints/clients. The difference between Azure AI Foundry and Azure OpenAI is marginal, but MAF is not even comparable. Am I doing something wrong here? Here are the results, with the full code below:
===============================================================================================
AGGREGATE RESULTS
===============================================================================================
Metric                             Foundry         MAF      OpenAI
──────────────────────────────  ──────────  ──────────  ──────────
Time to First Token (ms)            1775.1      2638.6      1401.0
 └─ Winner                                                       ✓
Total Time (s)                        1.89       12.86        1.52
 └─ Winner                                                       ✓
Throughput (tokens/s)                14.22        1.69       14.97
 └─ Winner                                                       ✓
───────────────────────────────────────────────────────────────────────────────────────────────
Errors:
Foundry: 0/9
MAF: 0/9
OpenAI: 0/9
===============================================================================================
WINNER BREAKDOWN
===============================================================================================
🏆 OpenAI wins 3/3 metrics
Metric Winners:
• Time to First Token: OpenAI (1401ms)
• Total Time: OpenAI (1.52s)
• Throughput: OpenAI (15.0 tok/s)
===============================================================================================
"""
Three-Way Latency Comparison: Azure AI Foundry vs MAF vs Azure OpenAI
Compares streaming latency across three different approaches:
1. Azure AI Foundry (Direct) - azure.ai.inference SDK for direct model inference
2. Microsoft Agent Framework (MAF) - MAF ChatAgent with Foundry project endpoint
3. Azure OpenAI (Direct) - OpenAI SDK with Azure OpenAI endpoint
All three use the same model deployment to isolate endpoint/framework differences.
"""
import os
import time
import asyncio
import statistics
from typing import List, Dict
from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
from azure.ai.inference.aio import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from agent_framework.azure import AzureAIClient
from agent_framework import ChatAgent
from openai import AsyncAzureOpenAI
from dotenv import load_dotenv
load_dotenv("XXX")
# Endpoint configurations
OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT") # https://cog-xxx.cognitiveservices.azure.com/
MODEL_DEPLOYMENT = "agents" # Use 'agents' deployment that works for all
# Azure AI Foundry endpoint for direct inference (azure.ai.inference SDK)
FOUNDRY_CHAT_ENDPOINT = f"{OPENAI_ENDPOINT}openai/deployments/{MODEL_DEPLOYMENT}"
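# NOTE: this f-string relies on OPENAI_ENDPOINT keeping its trailing slash
# (as in the sample value above); otherwise the joined URL is malformed.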
# Azure AI Foundry project endpoint for MAF
FOUNDRY_PROJECT_ENDPOINT = XXX
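# (Redacted. Foundry project endpoints typically look like
# https://<resource>.services.ai.azure.com/api/projects/<project-name>.)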
# Test configuration
TEST_PROMPTS = [
    "Tell me a joke about a pirate.",
    "Explain quantum computing in 2 sentences.",
    "Write a haiku about technology.",
]
NUM_RUNS = 3
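# 3 prompts x 3 runs = 9 measurements per approach (hence the 0/9 error counts above)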
print(f"\n{'='*90}")
print(f"THREE-WAY LATENCY COMPARISON")
print(f"{'='*90}")
print(f"Foundry Chat Endpoint: {FOUNDRY_CHAT_ENDPOINT}")
print(f"Foundry Project Endpoint: {FOUNDRY_PROJECT_ENDPOINT}")
print(f"OpenAI Endpoint: {OPENAI_ENDPOINT}")
print(f"Model Deployment: {MODEL_DEPLOYMENT}")
print(f"Test Prompts: {len(TEST_PROMPTS)}")
print(f"Runs per prompt: {NUM_RUNS}")
print(f"{'='*90}\n")
async def test_foundry_direct(credential: DefaultAzureCredential, query: str) -> Dict:
    """Test 1: Azure AI Foundry using azure.ai.inference SDK (Direct)"""
    start_time = time.perf_counter()
    first_token_time = None
    token_times = []
    last_token_time = start_time
    token_count = 0
    full_response = ""
    error = None
    try:
        # azure.ai.inference ChatCompletionsClient for direct Foundry inference
        client = ChatCompletionsClient(
            endpoint=FOUNDRY_CHAT_ENDPOINT,
            credential=credential,
            credential_scopes=["https://cognitiveservices.azure.com/.default"],
        )
        messages = [
            SystemMessage(content="You are a helpful assistant."),
            UserMessage(content=query),
        ]
        stream = await client.complete(messages=messages, stream=True)
        async for chunk in stream:
            if chunk.choices and chunk.choices[0].delta:
                delta = chunk.choices[0].delta
                content = delta.content if hasattr(delta, "content") else None
                if content:
                    current_time = time.perf_counter()
                    if first_token_time is None:
                        first_token_time = current_time
                    else:
                        token_times.append(current_time - last_token_time)
                    full_response += content
                    # NOTE: counts streamed chunks as "tokens" (fine for a relative comparison)
                    token_count += 1
                    last_token_time = current_time
        await client.close()
    except Exception as e:
        error = str(e)
    total_time = time.perf_counter() - start_time
    ttft = (first_token_time - start_time) if first_token_time else 0
    return {
        "approach": "foundry_direct",
        "ttft": ttft,
        "total_time": total_time,
        "token_count": token_count,
        "tokens_per_sec": token_count / total_time if total_time > 0 else 0,
        "avg_inter_token": statistics.mean(token_times) if token_times else 0,
        "content_length": len(full_response),
        "error": error,
    }
async def test_maf(credential: DefaultAzureCredential, query: str) -> Dict:
    """Test 2: Microsoft Agent Framework (MAF) with ChatAgent"""
    start_time = time.perf_counter()
    first_token_time = None
    token_times = []
    last_token_time = start_time
    token_count = 0
    full_response = ""
    error = None
    try:
        # NOTE: client and agent construction happens inside the timed window here,
        # so any per-call agent setup MAF performs counts toward TTFT/total time.
        client = AzureAIClient(
            project_endpoint=FOUNDRY_PROJECT_ENDPOINT,
            model_deployment_name=MODEL_DEPLOYMENT,
            async_credential=credential,
            agent_name="latency-test-maf",
        )
        agent = ChatAgent(
            chat_client=client,
            name="test-agent",
            instructions="You are a helpful assistant.",
        )
        async for chunk in agent.run_stream(query):
            if chunk.text:
                current_time = time.perf_counter()
                if first_token_time is None:
                    first_token_time = current_time
                else:
                    token_times.append(current_time - last_token_time)
                full_response += chunk.text
                token_count += 1
                last_token_time = current_time
        await client.close()
    except Exception as e:
        error = str(e)
    total_time = time.perf_counter() - start_time
    ttft = (first_token_time - start_time) if first_token_time else 0
    return {
        "approach": "maf",
        "ttft": ttft,
        "total_time": total_time,
        "token_count": token_count,
        "tokens_per_sec": token_count / total_time if total_time > 0 else 0,
        "avg_inter_token": statistics.mean(token_times) if token_times else 0,
        "content_length": len(full_response),
        "error": error,
    }
async def test_openai_direct(credential: DefaultAzureCredential, query: str) -> Dict:
    """Test 3: Azure OpenAI using OpenAI SDK (Direct)"""
    start_time = time.perf_counter()
    first_token_time = None
    token_times = []
    last_token_time = start_time
    token_count = 0
    full_response = ""
    error = None
    try:
        token_provider = get_bearer_token_provider(
            credential,
            "https://cognitiveservices.azure.com/.default"
        )
        client = AsyncAzureOpenAI(
            azure_endpoint=OPENAI_ENDPOINT,
            azure_ad_token_provider=token_provider,
            api_version="2024-10-21",
        )
        stream = await client.chat.completions.create(
            model=MODEL_DEPLOYMENT,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": query}
            ],
            stream=True,
        )
        async for chunk in stream:
            if chunk.choices and chunk.choices[0].delta.content:
                current_time = time.perf_counter()
                content = chunk.choices[0].delta.content
                if first_token_time is None:
                    first_token_time = current_time
                else:
                    token_times.append(current_time - last_token_time)
                full_response += content
                token_count += 1
                last_token_time = current_time
        await client.close()
    except Exception as e:
        error = str(e)
    total_time = time.perf_counter() - start_time
    ttft = (first_token_time - start_time) if first_token_time else 0
    return {
        "approach": "openai_direct",
        "ttft": ttft,
        "total_time": total_time,
        "token_count": token_count,
        "tokens_per_sec": token_count / total_time if total_time > 0 else 0,
        "avg_inter_token": statistics.mean(token_times) if token_times else 0,
        "content_length": len(full_response),
        "error": error,
    }
def print_result(result: Dict, run_num: int):
    """Print single test result."""
    labels = {
        "foundry_direct": "Foundry (Direct)",
        "maf": "MAF",
        "openai_direct": "OpenAI (Direct)",
    }
    label = labels.get(result["approach"], result["approach"])
    if result["error"]:
        print(f" {label:20} Run {run_num}: ❌ {result['error'][:70]}")
    else:
        print(f" {label:20} Run {run_num}: ✓ TTFT={result['ttft']*1000:>6.0f}ms, "
              f"Total={result['total_time']:>5.2f}s, Tokens={result['token_count']:>3}, "
              f"TPS={result['tokens_per_sec']:>5.1f}")
def print_comparison(foundry_results: List[Dict], maf_results: List[Dict], openai_results: List[Dict]):
    """Print three-way comparison."""
    print(f"\n{'='*95}")
    print(f"AGGREGATE RESULTS")
    print(f"{'='*95}\n")
    # Filter valid results
    foundry_valid = [r for r in foundry_results if not r["error"]]
    maf_valid = [r for r in maf_results if not r["error"]]
    openai_valid = [r for r in openai_results if not r["error"]]
    if not all([foundry_valid, maf_valid, openai_valid]):
        print(f"⚠️ Not enough valid results")
        print(f"Valid: Foundry={len(foundry_valid)}, MAF={len(maf_valid)}, OpenAI={len(openai_valid)}")
        return
    # Calculate means
    def mean(results, metric):
        return statistics.mean([r[metric] for r in results])
    foundry_ttft = mean(foundry_valid, "ttft")
    maf_ttft = mean(maf_valid, "ttft")
    openai_ttft = mean(openai_valid, "ttft")
    foundry_total = mean(foundry_valid, "total_time")
    maf_total = mean(maf_valid, "total_time")
    openai_total = mean(openai_valid, "total_time")
    foundry_tps = mean(foundry_valid, "tokens_per_sec")
    maf_tps = mean(maf_valid, "tokens_per_sec")
    openai_tps = mean(openai_valid, "tokens_per_sec")
    # Print table
    print(f"{'Metric':<30} {'Foundry':>20} {'MAF':>20} {'OpenAI':>20}")
    print(f"{'─'*30} {'─'*20} {'─'*20} {'─'*20}")
    print(f"{'Time to First Token (ms)':<30} {foundry_ttft*1000:>20.1f} {maf_ttft*1000:>20.1f} {openai_ttft*1000:>20.1f}")
    ttft_winner = min([("Foundry", foundry_ttft), ("MAF", maf_ttft), ("OpenAI", openai_ttft)], key=lambda x: x[1])
    print(f"{' └─ Winner':<30} {('✓' if ttft_winner[0]=='Foundry' else ''):>20} {('✓' if ttft_winner[0]=='MAF' else ''):>20} {('✓' if ttft_winner[0]=='OpenAI' else ''):>20}")
    print()
    print(f"{'Total Time (s)':<30} {foundry_total:>20.2f} {maf_total:>20.2f} {openai_total:>20.2f}")
    total_winner = min([("Foundry", foundry_total), ("MAF", maf_total), ("OpenAI", openai_total)], key=lambda x: x[1])
    print(f"{' └─ Winner':<30} {('✓' if total_winner[0]=='Foundry' else ''):>20} {('✓' if total_winner[0]=='MAF' else ''):>20} {('✓' if total_winner[0]=='OpenAI' else ''):>20}")
    print()
    print(f"{'Throughput (tokens/s)':<30} {foundry_tps:>20.2f} {maf_tps:>20.2f} {openai_tps:>20.2f}")
    tps_winner = max([("Foundry", foundry_tps), ("MAF", maf_tps), ("OpenAI", openai_tps)], key=lambda x: x[1])
    print(f"{' └─ Winner':<30} {('✓' if tps_winner[0]=='Foundry' else ''):>20} {('✓' if tps_winner[0]=='MAF' else ''):>20} {('✓' if tps_winner[0]=='OpenAI' else ''):>20}")
    print(f"\n{'─'*95}")
    # Error counts
    print(f"\nErrors:")
    print(f" Foundry: {len([r for r in foundry_results if r['error']])}/{len(foundry_results)}")
    print(f" MAF: {len([r for r in maf_results if r['error']])}/{len(maf_results)}")
    print(f" OpenAI: {len([r for r in openai_results if r['error']])}/{len(openai_results)}")
    # Overall winner
    print(f"\n{'='*95}")
    print(f"WINNER BREAKDOWN")
    print(f"{'='*95}\n")
    wins = {"Foundry": 0, "MAF": 0, "OpenAI": 0}
    wins[ttft_winner[0]] += 1
    wins[total_winner[0]] += 1
    wins[tps_winner[0]] += 1
    winner = max(wins.items(), key=lambda x: x[1])
    print(f"🏆 {winner[0]} wins {winner[1]}/3 metrics\n")
    print(f"Metric Winners:")
    print(f" • Time to First Token: {ttft_winner[0]} ({ttft_winner[1]*1000:.0f}ms)")
    print(f" • Total Time: {total_winner[0]} ({total_winner[1]:.2f}s)")
    print(f" • Throughput: {tps_winner[0]} ({tps_winner[1]:.1f} tok/s)")
    print(f"\n{'='*95}\n")
async def main():
    """Run three-way latency comparison."""
    credential = DefaultAzureCredential()
    foundry_results, maf_results, openai_results = [], [], []
    try:
        for prompt_idx, prompt in enumerate(TEST_PROMPTS, 1):
            print(f"\n{'─'*95}")
            print(f"PROMPT {prompt_idx}/{len(TEST_PROMPTS)}: {prompt}")
            print(f"{'─'*95}\n")
            for run in range(NUM_RUNS):
                # Test all three approaches
                foundry = await test_foundry_direct(credential, prompt)
                foundry_results.append(foundry)
                print_result(foundry, run+1)
                await asyncio.sleep(0.5)
                maf = await test_maf(credential, prompt)
                maf_results.append(maf)
                print_result(maf, run+1)
                await asyncio.sleep(0.5)
                openai = await test_openai_direct(credential, prompt)
                openai_results.append(openai)
                print_result(openai, run+1)
                await asyncio.sleep(0.5)
            print()
        # Print comparison
        print_comparison(foundry_results, maf_results, openai_results)
    except Exception as e:
        print(f"\n❌ Fatal error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        await credential.close()
if __name__ == "__main__":
    asyncio.run(main())
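One thing I notice re-reading the harness: every timed call constructs a fresh client, and the MAF path also builds a fresh ChatAgent, so client setup (plus whatever per-call agent provisioning MAF does) is counted inside TTFT and total time. Below is a rough sketch of the variant I could try, with the client and agent built once in main() and only the streaming loop timed. test_maf_reused is a placeholder name of mine, not a MAF API:

# Hypothetical variant of test_maf: the AzureAIClient/ChatAgent are constructed
# once by the caller (outside the timer), so only the streaming call is measured.
async def test_maf_reused(agent: ChatAgent, query: str) -> Dict:
    start_time = time.perf_counter()
    first_token_time = None
    token_count = 0
    full_response = ""
    error = None
    try:
        async for chunk in agent.run_stream(query):
            if chunk.text:
                if first_token_time is None:
                    first_token_time = time.perf_counter()
                full_response += chunk.text
                token_count += 1
    except Exception as e:
        error = str(e)
    total_time = time.perf_counter() - start_time
    return {
        "approach": "maf_reused",
        "ttft": (first_token_time - start_time) if first_token_time else 0,
        "total_time": total_time,
        "token_count": token_count,
        "tokens_per_sec": token_count / total_time if total_time > 0 else 0,
        "content_length": len(full_response),
        "error": error,
    }

If the gap is mostly setup cost, this version should bring MAF's numbers much closer to the direct clients; if it doesn't, the overhead is in the per-run agent orchestration itself.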