Skip to content

Commit b4c05c5

Browse files
committed
feat: cli support for OpenAI API tool calling with streaming
Signed-off-by: Mark Sturdevant <mark.sturdevant@ibm.com>
Assisted-by: IBM Bob
1 parent 313a497 commit b4c05c5

6 files changed

Lines changed: 544 additions & 33 deletions

File tree

cli/serve/app.py

Lines changed: 27 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import asyncio
44
import importlib.util
55
import inspect
6-
import json
76
import os
87
import sys
98
import time
@@ -23,7 +22,10 @@
2322
) from e
2423

2524
from mellea.backends.model_options import ModelOption
26-
from mellea.helpers.openai_compatible_helpers import build_completion_usage
25+
from mellea.helpers.openai_compatible_helpers import (
26+
build_completion_usage,
27+
build_tool_calls,
28+
)
2729

2830
from .models import (
2931
ChatCompletion,
@@ -176,34 +178,30 @@ async def endpoint(request: ChatCompletionRequest):
176178
)
177179

178180
# Extract tool calls from the ModelOutputThunk if available
179-
tool_calls = None
180-
finish_reason: Literal[
181-
"stop", "length", "content_filter", "tool_calls", "function_call"
182-
] = "stop"
183-
if (
184-
hasattr(output, "tool_calls")
185-
and output.tool_calls is not None
186-
and isinstance(output.tool_calls, dict)
187-
and output.tool_calls # Check dict is not empty
188-
):
189-
tool_calls = []
190-
for model_tool_call in output.tool_calls.values():
191-
# Generate a unique ID for this tool call
192-
tool_call_id = f"call_{uuid.uuid4().hex[:24]}"
193-
194-
# Serialize the arguments to JSON string
195-
args_json = json.dumps(model_tool_call.args)
196-
197-
tool_calls.append(
198-
ChatCompletionMessageToolCall(
199-
id=tool_call_id,
200-
type="function",
201-
function=ToolCallFunction(
202-
name=model_tool_call.name, arguments=args_json
203-
),
204-
)
181+
tool_calls_list = build_tool_calls(output)
182+
tool_calls = (
183+
[
184+
ChatCompletionMessageToolCall(
185+
id=tc["id"],
186+
type=tc["type"],
187+
function=ToolCallFunction(
188+
name=tc["function"]["name"],
189+
arguments=tc["function"]["arguments"],
190+
),
205191
)
206-
finish_reason = "tool_calls"
192+
for tc in tool_calls_list
193+
]
194+
if tool_calls_list
195+
else None
196+
)
197+
198+
# Determine finish_reason based on tool calls
199+
finish_reason: (
200+
Literal[
201+
"stop", "length", "content_filter", "tool_calls", "function_call"
202+
]
203+
| None
204+
) = "tool_calls" if tool_calls else "stop"
207205

208206
# system_fingerprint represents backend config hash, not model name
209207
# The model name is already in response.model (line 73)

cli/serve/models.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,9 @@ class ChatCompletionChunkDelta(BaseModel):
170170
refusal: str | None = None
171171
"""The refusal message fragment, if any."""
172172

173+
tool_calls: list[ChatCompletionMessageToolCall] | None = None
174+
"""The tool calls generated by the model (only in tool call chunks)."""
175+
173176

174177
class ChatCompletionChunkChoice(BaseModel):
175178
"""A choice in a streaming chunk."""

cli/serve/streaming.py

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,24 @@
11
"""Streaming utilities for OpenAI-compatible server responses."""
22

33
from collections.abc import AsyncGenerator
4+
from typing import Literal
45

56
from mellea.core.base import ModelOutputThunk
67
from mellea.core.utils import MelleaLogger
7-
from mellea.helpers.openai_compatible_helpers import build_completion_usage
8+
from mellea.helpers.openai_compatible_helpers import (
9+
build_completion_usage,
10+
build_tool_calls,
11+
)
812

913
from .models import (
1014
ChatCompletionChunk,
1115
ChatCompletionChunkChoice,
1216
ChatCompletionChunkDelta,
17+
ChatCompletionMessageToolCall,
1318
OpenAIError,
1419
OpenAIErrorResponse,
1520
StreamOptions,
21+
ToolCallFunction,
1622
)
1723

1824

@@ -98,6 +104,46 @@ async def stream_chat_completion_chunks(
98104
)
99105
yield f"data: {chunk.model_dump_json()}\n\n"
100106

107+
# Extract tool calls from the ModelOutputThunk if available
108+
tool_calls_list = build_tool_calls(output)
109+
110+
if tool_calls_list:
111+
# Convert to ChatCompletionMessageToolCall objects
112+
tool_calls = [
113+
ChatCompletionMessageToolCall(
114+
id=tc["id"],
115+
type=tc["type"],
116+
function=ToolCallFunction(
117+
name=tc["function"]["name"],
118+
arguments=tc["function"]["arguments"],
119+
),
120+
)
121+
for tc in tool_calls_list
122+
]
123+
124+
# Emit tool calls in a separate chunk before the final chunk
125+
tool_call_chunk = ChatCompletionChunk(
126+
id=completion_id,
127+
model=model,
128+
created=created,
129+
choices=[
130+
ChatCompletionChunkChoice(
131+
index=0,
132+
delta=ChatCompletionChunkDelta(tool_calls=tool_calls),
133+
finish_reason=None,
134+
)
135+
],
136+
object="chat.completion.chunk",
137+
system_fingerprint=system_fingerprint,
138+
)
139+
yield f"data: {tool_call_chunk.model_dump_json()}\n\n"
140+
141+
# Determine finish_reason based on tool calls
142+
finish_reason: (
143+
Literal["stop", "length", "content_filter", "tool_calls", "function_call"]
144+
| None
145+
) = "tool_calls" if tool_calls_list else "stop"
146+
101147
# Include usage in final chunk only if explicitly requested via stream_options
102148
# Per OpenAI spec: usage is only included when stream_options.include_usage=True
103149
include_usage = stream_options is not None and stream_options.include_usage
@@ -112,7 +158,7 @@ async def stream_chat_completion_chunks(
112158
ChatCompletionChunkChoice(
113159
index=0,
114160
delta=ChatCompletionChunkDelta(content=None),
115-
finish_reason="stop",
161+
finish_reason=finish_reason,
116162
)
117163
],
118164
object="chat.completion.chunk",

0 commit comments

Comments (0)