I wrote a script where I compared calling the same model through three different endpoints/clients. The difference between Azure AI Foundry and Azure OpenAI is marginal, but MAF is not even comparable. Am I doing something wrong here? Here are the results, with the full code below:
===============================================================================================
AGGREGATE RESULTS
===============================================================================================
Metric                             Foundry         MAF      OpenAI
──────────────────────────────  ──────────  ──────────  ──────────
Time to First Token (ms)            1775.1      2638.6      1401.0
 └─ Winner                                                       ✓
Total Time (s)                        1.89       12.86        1.52
 └─ Winner                                                       ✓
Throughput (tokens/s)                14.22        1.69       14.97
 └─ Winner                                                       ✓
───────────────────────────────────────────────────────────────────────────────────────────────
Errors:
Foundry: 0/9
MAF: 0/9
OpenAI: 0/9
===============================================================================================
WINNER BREAKDOWN
===============================================================================================
🏆 OpenAI wins 3/3 metrics
Metric Winners:
• Time to First Token: OpenAI (1401ms)
• Total Time: OpenAI (1.52s)
• Throughput: OpenAI (15.0 tok/s)
===============================================================================================
"""
Three-Way Latency Comparison: Azure AI Foundry vs MAF vs Azure OpenAI
Compares streaming latency across three different approaches:
1. Azure AI Foundry (Direct) - azure.ai.inference SDK for direct model inference
2. Microsoft Agent Framework (MAF) - MAF ChatAgent with Foundry project endpoint
3. Azure OpenAI (Direct) - OpenAI SDK with Azure OpenAI endpoint
All three use the same model deployment to isolate endpoint/framework differences.
"""
import os
import time
import asyncio
import statistics
from typing import List, Dict
from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
from azure.ai.inference.aio import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from agent_framework.azure import AzureAIClient
from agent_framework import ChatAgent
from openai import AsyncAzureOpenAI
from dotenv import load_dotenv
load_dotenv("XXX")
# Endpoint configurations
OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT") # https://cog-xxx.cognitiveservices.azure.com/
MODEL_DEPLOYMENT = "agents" # Use 'agents' deployment that works for all
# Azure AI Foundry endpoint for direct inference (azure.ai.inference SDK)
FOUNDRY_CHAT_ENDPOINT = f"{OPENAI_ENDPOINT}openai/deployments/{MODEL_DEPLOYMENT}"
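# NOTE: this f-string relies on OPENAI_ENDPOINT keeping its trailing slash
# (as in the sample value above); otherwise the joined URL is malformed.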
# Azure AI Foundry project endpoint for MAF
FOUNDRY_PROJECT_ENDPOINT = XXX
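# (Redacted. Foundry project endpoints typically look like
# https://<resource>.services.ai.azure.com/api/projects/<project-name>.)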
# Test configuration
TEST_PROMPTS = [
    "Tell me a joke about a pirate.",
    "Explain quantum computing in 2 sentences.",
    "Write a haiku about technology.",
]
NUM_RUNS = 3
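# 3 prompts x 3 runs = 9 measurements per approach (hence the 0/9 error counts above)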
print(f"\n{'='*90}")
print(f"THREE-WAY LATENCY COMPARISON")
print(f"{'='*90}")
print(f"Foundry Chat Endpoint: {FOUNDRY_CHAT_ENDPOINT}")
print(f"Foundry Project Endpoint: {FOUNDRY_PROJECT_ENDPOINT}")
print(f"OpenAI Endpoint: {OPENAI_ENDPOINT}")
print(f"Model Deployment: {MODEL_DEPLOYMENT}")
print(f"Test Prompts: {len(TEST_PROMPTS)}")
print(f"Runs per prompt: {NUM_RUNS}")
print(f"{'='*90}\n")
async def test_foundry_direct(credential: DefaultAzureCredential, query: str) -> Dict:
    """Test 1: Azure AI Foundry using azure.ai.inference SDK (Direct)"""
    start_time = time.perf_counter()
    first_token_time = None
    token_times = []
    last_token_time = start_time
    token_count = 0
    full_response = ""
    error = None
    try:
        # azure.ai.inference ChatCompletionsClient for direct Foundry inference
        client = ChatCompletionsClient(
            endpoint=FOUNDRY_CHAT_ENDPOINT,
            credential=credential,
            credential_scopes=["https://cognitiveservices.azure.com/.default"],
        )
        messages = [
            SystemMessage(content="You are a helpful assistant."),
            UserMessage(content=query),
        ]
        stream = await client.complete(messages=messages, stream=True)
        async for chunk in stream:
            if chunk.choices and chunk.choices[0].delta:
                delta = chunk.choices[0].delta
                content = delta.content if hasattr(delta, "content") else None
                if content:
                    current_time = time.perf_counter()
                    if first_token_time is None:
                        first_token_time = current_time
                    else:
                        token_times.append(current_time - last_token_time)
                    full_response += content
                    # NOTE: counts streamed chunks as "tokens" (fine for a relative comparison)
                    token_count += 1
                    last_token_time = current_time
        await client.close()
    except Exception as e:
        error = str(e)
    total_time = time.perf_counter() - start_time
    ttft = (first_token_time - start_time) if first_token_time else 0
    return {
        "approach": "foundry_direct",
        "ttft": ttft,
        "total_time": total_time,
        "token_count": token_count,
        "tokens_per_sec": token_count / total_time if total_time > 0 else 0,
        "avg_inter_token": statistics.mean(token_times) if token_times else 0,
        "content_length": len(full_response),
        "error": error,
    }
async def test_maf(credential: DefaultAzureCredential, query: str) -> Dict:
    """Test 2: Microsoft Agent Framework (MAF) with ChatAgent"""
    start_time = time.perf_counter()
    first_token_time = None
    token_times = []
    last_token_time = start_time
    token_count = 0
    full_response = ""
    error = None
    try:
        # NOTE: client and agent construction happens inside the timed window here,
        # so any per-call agent setup MAF performs counts toward TTFT/total time.
        client = AzureAIClient(
            project_endpoint=FOUNDRY_PROJECT_ENDPOINT,
            model_deployment_name=MODEL_DEPLOYMENT,
            async_credential=credential,
            agent_name="latency-test-maf",
        )
        agent = ChatAgent(
            chat_client=client,
            name="test-agent",
            instructions="You are a helpful assistant.",
        )
        async for chunk in agent.run_stream(query):
            if chunk.text:
                current_time = time.perf_counter()
                if first_token_time is None:
                    first_token_time = current_time
                else:
                    token_times.append(current_time - last_token_time)
                full_response += chunk.text
                token_count += 1
                last_token_time = current_time
        await client.close()
    except Exception as e:
        error = str(e)
    total_time = time.perf_counter() - start_time
    ttft = (first_token_time - start_time) if first_token_time else 0
    return {
        "approach": "maf",
        "ttft": ttft,
        "total_time": total_time,
        "token_count": token_count,
        "tokens_per_sec": token_count / total_time if total_time > 0 else 0,
        "avg_inter_token": statistics.mean(token_times) if token_times else 0,
        "content_length": len(full_response),
        "error": error,
    }
async def test_openai_direct(credential: DefaultAzureCredential, query: str) -> Dict:
    """Test 3: Azure OpenAI using OpenAI SDK (Direct)"""
    start_time = time.perf_counter()
    first_token_time = None
    token_times = []
    last_token_time = start_time
    token_count = 0
    full_response = ""
    error = None
    try:
        token_provider = get_bearer_token_provider(
            credential,
            "https://cognitiveservices.azure.com/.default"
        )
        client = AsyncAzureOpenAI(
            azure_endpoint=OPENAI_ENDPOINT,
            azure_ad_token_provider=token_provider,
            api_version="2024-10-21",
        )
        stream = await client.chat.completions.create(
            model=MODEL_DEPLOYMENT,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": query}
            ],
            stream=True,
        )
        async for chunk in stream:
            if chunk.choices and chunk.choices[0].delta.content:
                current_time = time.perf_counter()
                content = chunk.choices[0].delta.content
                if first_token_time is None:
                    first_token_time = current_time
                else:
                    token_times.append(current_time - last_token_time)
                full_response += content
                token_count += 1
                last_token_time = current_time
        await client.close()
    except Exception as e:
        error = str(e)
    total_time = time.perf_counter() - start_time
    ttft = (first_token_time - start_time) if first_token_time else 0
    return {
        "approach": "openai_direct",
        "ttft": ttft,
        "total_time": total_time,
        "token_count": token_count,
        "tokens_per_sec": token_count / total_time if total_time > 0 else 0,
        "avg_inter_token": statistics.mean(token_times) if token_times else 0,
        "content_length": len(full_response),
        "error": error,
    }
def print_result(result: Dict, run_num: int):
    """Print single test result."""
    labels = {
        "foundry_direct": "Foundry (Direct)",
        "maf": "MAF",
        "openai_direct": "OpenAI (Direct)",
    }
    label = labels.get(result["approach"], result["approach"])
    if result["error"]:
        print(f" {label:20} Run {run_num}: ❌ {result['error'][:70]}")
    else:
        print(f" {label:20} Run {run_num}: ✓ TTFT={result['ttft']*1000:>6.0f}ms, "
              f"Total={result['total_time']:>5.2f}s, Tokens={result['token_count']:>3}, "
              f"TPS={result['tokens_per_sec']:>5.1f}")
def print_comparison(foundry_results: List[Dict], maf_results: List[Dict], openai_results: List[Dict]):
    """Print three-way comparison."""
    print(f"\n{'='*95}")
    print(f"AGGREGATE RESULTS")
    print(f"{'='*95}\n")
    # Filter valid results
    foundry_valid = [r for r in foundry_results if not r["error"]]
    maf_valid = [r for r in maf_results if not r["error"]]
    openai_valid = [r for r in openai_results if not r["error"]]
    if not all([foundry_valid, maf_valid, openai_valid]):
        print(f"⚠️ Not enough valid results")
        print(f"Valid: Foundry={len(foundry_valid)}, MAF={len(maf_valid)}, OpenAI={len(openai_valid)}")
        return
    # Calculate means
    def mean(results, metric):
        return statistics.mean([r[metric] for r in results])
    foundry_ttft = mean(foundry_valid, "ttft")
    maf_ttft = mean(maf_valid, "ttft")
    openai_ttft = mean(openai_valid, "ttft")
    foundry_total = mean(foundry_valid, "total_time")
    maf_total = mean(maf_valid, "total_time")
    openai_total = mean(openai_valid, "total_time")
    foundry_tps = mean(foundry_valid, "tokens_per_sec")
    maf_tps = mean(maf_valid, "tokens_per_sec")
    openai_tps = mean(openai_valid, "tokens_per_sec")
    # Print table
    print(f"{'Metric':<30} {'Foundry':>20} {'MAF':>20} {'OpenAI':>20}")
    print(f"{'─'*30} {'─'*20} {'─'*20} {'─'*20}")
    print(f"{'Time to First Token (ms)':<30} {foundry_ttft*1000:>20.1f} {maf_ttft*1000:>20.1f} {openai_ttft*1000:>20.1f}")
    ttft_winner = min([("Foundry", foundry_ttft), ("MAF", maf_ttft), ("OpenAI", openai_ttft)], key=lambda x: x[1])
    print(f"{' └─ Winner':<30} {('✓' if ttft_winner[0]=='Foundry' else ''):>20} {('✓' if ttft_winner[0]=='MAF' else ''):>20} {('✓' if ttft_winner[0]=='OpenAI' else ''):>20}")
    print()
    print(f"{'Total Time (s)':<30} {foundry_total:>20.2f} {maf_total:>20.2f} {openai_total:>20.2f}")
    total_winner = min([("Foundry", foundry_total), ("MAF", maf_total), ("OpenAI", openai_total)], key=lambda x: x[1])
    print(f"{' └─ Winner':<30} {('✓' if total_winner[0]=='Foundry' else ''):>20} {('✓' if total_winner[0]=='MAF' else ''):>20} {('✓' if total_winner[0]=='OpenAI' else ''):>20}")
    print()
    print(f"{'Throughput (tokens/s)':<30} {foundry_tps:>20.2f} {maf_tps:>20.2f} {openai_tps:>20.2f}")
    tps_winner = max([("Foundry", foundry_tps), ("MAF", maf_tps), ("OpenAI", openai_tps)], key=lambda x: x[1])
    print(f"{' └─ Winner':<30} {('✓' if tps_winner[0]=='Foundry' else ''):>20} {('✓' if tps_winner[0]=='MAF' else ''):>20} {('✓' if tps_winner[0]=='OpenAI' else ''):>20}")
    print(f"\n{'─'*95}")
    # Error counts
    print(f"\nErrors:")
    print(f" Foundry: {len([r for r in foundry_results if r['error']])}/{len(foundry_results)}")
    print(f" MAF: {len([r for r in maf_results if r['error']])}/{len(maf_results)}")
    print(f" OpenAI: {len([r for r in openai_results if r['error']])}/{len(openai_results)}")
    # Overall winner
    print(f"\n{'='*95}")
    print(f"WINNER BREAKDOWN")
    print(f"{'='*95}\n")
    wins = {"Foundry": 0, "MAF": 0, "OpenAI": 0}
    wins[ttft_winner[0]] += 1
    wins[total_winner[0]] += 1
    wins[tps_winner[0]] += 1
    winner = max(wins.items(), key=lambda x: x[1])
    print(f"🏆 {winner[0]} wins {winner[1]}/3 metrics\n")
    print(f"Metric Winners:")
    print(f" • Time to First Token: {ttft_winner[0]} ({ttft_winner[1]*1000:.0f}ms)")
    print(f" • Total Time: {total_winner[0]} ({total_winner[1]:.2f}s)")
    print(f" • Throughput: {tps_winner[0]} ({tps_winner[1]:.1f} tok/s)")
    print(f"\n{'='*95}\n")
async def main():
    """Run three-way latency comparison."""
    credential = DefaultAzureCredential()
    foundry_results, maf_results, openai_results = [], [], []
    try:
        for prompt_idx, prompt in enumerate(TEST_PROMPTS, 1):
            print(f"\n{'─'*95}")
            print(f"PROMPT {prompt_idx}/{len(TEST_PROMPTS)}: {prompt}")
            print(f"{'─'*95}\n")
            for run in range(NUM_RUNS):
                # Test all three approaches
                foundry = await test_foundry_direct(credential, prompt)
                foundry_results.append(foundry)
                print_result(foundry, run+1)
                await asyncio.sleep(0.5)
                maf = await test_maf(credential, prompt)
                maf_results.append(maf)
                print_result(maf, run+1)
                await asyncio.sleep(0.5)
                openai = await test_openai_direct(credential, prompt)
                openai_results.append(openai)
                print_result(openai, run+1)
                await asyncio.sleep(0.5)
            print()
        # Print comparison
        print_comparison(foundry_results, maf_results, openai_results)
    except Exception as e:
        print(f"\n❌ Fatal error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        await credential.close()
if __name__ == "__main__":
    asyncio.run(main())
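One thing I notice re-reading the harness: every timed call constructs a fresh client, and the MAF path also builds a fresh ChatAgent, so client setup (plus whatever per-call agent provisioning MAF does) is counted inside TTFT and total time. Below is a rough sketch of the variant I could try, with the client and agent built once in main() and only the streaming loop timed. test_maf_reused is a placeholder name of mine, not a MAF API:

# Hypothetical variant of test_maf: the AzureAIClient/ChatAgent are constructed
# once by the caller (outside the timer), so only the streaming call is measured.
async def test_maf_reused(agent: ChatAgent, query: str) -> Dict:
    start_time = time.perf_counter()
    first_token_time = None
    token_count = 0
    full_response = ""
    error = None
    try:
        async for chunk in agent.run_stream(query):
            if chunk.text:
                if first_token_time is None:
                    first_token_time = time.perf_counter()
                full_response += chunk.text
                token_count += 1
    except Exception as e:
        error = str(e)
    total_time = time.perf_counter() - start_time
    return {
        "approach": "maf_reused",
        "ttft": (first_token_time - start_time) if first_token_time else 0,
        "total_time": total_time,
        "token_count": token_count,
        "tokens_per_sec": token_count / total_time if total_time > 0 else 0,
        "content_length": len(full_response),
        "error": error,
    }

If the gap is mostly setup cost, this version should bring MAF's numbers much closer to the direct clients; if it doesn't, the overhead is in the per-run agent orchestration itself.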