Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
c2113e3
feat: add tool calling support to m serve
markstur Apr 13, 2026
82a8fb7
fix: fixed the bug in m serve where finish_reason=tool_calls for empt…
markstur Apr 14, 2026
d897e41
fix: move message add to outside the loop in client_tool_calling.py e…
markstur Apr 14, 2026
128a9c6
fix: cli app.py loop variable tool_name is never used
markstur Apr 14, 2026
0e23d92
fix: fix test_mot_init_typing() hasattr was always true
markstur Apr 14, 2026
0f894d8
fix: update m_serve_example_tool_calling.py to use safer example tool
markstur Apr 14, 2026
7de99e4
fix: replace repeated hard-coded string with constant
markstur Apr 17, 2026
98ede72
fix: add TOOL_CHOICE to ModelOptions like TEMPERATURE not a sentinel
markstur Apr 17, 2026
6a812f2
fix: fix m serve tool-calling examples
markstur Apr 17, 2026
8bd74fd
fix: remove unused imports in example
markstur Apr 17, 2026
9a82f5f
feat: cli support for OpenAI API tool calling with streaming
markstur Apr 24, 2026
e12f2a7
fix: add required index field to streaming tool call deltas
markstur Apr 29, 2026
0177010
fix: move build_tool_calls invocation
markstur Apr 29, 2026
43cb8b8
test: add integration test for cli/serve using TestClient with stream…
markstur Apr 29, 2026
e68d50c
fix: use fallback for json.dumps in build_tool_calls
markstur Apr 29, 2026
f3c9d85
test: restore cli streaming tests to fix conflicts
markstur Apr 30, 2026
e46afd3
test: update output.usage -> output.generation.usage
markstur Apr 30, 2026
5b737ab
test: update tests usage -> generation.usage
markstur Apr 30, 2026
3aea0c3
refactor(serve): simplify tool call construction with Pydantic valida…
markstur May 5, 2026
2b6792e
fix: remove unused imports
markstur May 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 30 additions & 7 deletions cli/serve/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import sys
import time
import uuid
from typing import Literal

try:
import typer
Expand All @@ -21,11 +22,15 @@
) from e

from mellea.backends.model_options import ModelOption
from mellea.helpers.openai_compatible_helpers import build_completion_usage
from mellea.helpers.openai_compatible_helpers import (
build_completion_usage,
build_tool_calls,
)

from .models import (
ChatCompletion,
ChatCompletionMessage,
ChatCompletionMessageToolCall,
ChatCompletionRequest,
Choice,
OpenAIError,
Expand Down Expand Up @@ -111,14 +116,14 @@ def _build_model_options(request: ChatCompletionRequest) -> dict:
"response_format", # Response format (json_object) - not yet implemented
"functions", # Legacy function calling - not yet implemented
"function_call", # Legacy function calling - not yet implemented
"tools", # Tool calling - not yet implemented
"tool_choice", # Tool choice - not yet implemented
}
openai_to_model_option = {
"temperature": ModelOption.TEMPERATURE,
"max_tokens": ModelOption.MAX_NEW_TOKENS,
"seed": ModelOption.SEED,
"stream": ModelOption.STREAM,
"tools": ModelOption.TOOLS,
"tool_choice": ModelOption.TOOL_CHOICE,
}

# Get all non-None fields
Expand Down Expand Up @@ -171,8 +176,6 @@ async def endpoint(request: ChatCompletionRequest):
model_options=model_options,
)

# system_fingerprint represents backend config hash, not model name
# The model name is already in response.model (line 73)
# Leave as None since we don't track backend config fingerprints yet
system_fingerprint = None

Expand All @@ -190,6 +193,24 @@ async def endpoint(request: ChatCompletionRequest):
media_type="text/event-stream",
)

tool_calls_list = build_tool_calls(output)
tool_calls = (
Comment thread
markstur marked this conversation as resolved.
[
ChatCompletionMessageToolCall.model_validate(tc)
for tc in tool_calls_list
]
if tool_calls_list
else None
)
Comment thread
markstur marked this conversation as resolved.

# Determine finish_reason based on tool calls
finish_reason: (
Literal[
"stop", "length", "content_filter", "tool_calls", "function_call"
]
| None
) = "tool_calls" if tool_calls else "stop"

return ChatCompletion(
id=completion_id,
model=request.model,
Expand All @@ -198,9 +219,11 @@ async def endpoint(request: ChatCompletionRequest):
Choice(
index=0,
message=ChatCompletionMessage(
content=output.value, role="assistant"
content=output.value,
role="assistant",
tool_calls=tool_calls,
),
finish_reason="stop",
finish_reason=finish_reason,
)
],
object="chat.completion", # type: ignore
Expand Down
72 changes: 72 additions & 0 deletions cli/serve/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,67 @@ class ChatCompletionRequest(BaseModel):
extra: dict[str, Any] = Field(default_factory=dict)


class ToolCallFunction(BaseModel):
    """The function payload of a completed (non-streaming) tool call.

    Mirrors the OpenAI ``Function`` object nested inside a chat-completion
    tool call: both fields are required and fully populated.
    """

    name: str
    """The name of the function to call."""

    arguments: str
    """The arguments to call the function with, as a JSON string."""


class ChatCompletionMessageToolCall(BaseModel):
    """A single tool call emitted by the model in a non-streaming response.

    Mirrors the OpenAI ``ChatCompletionMessageToolCall`` schema: every field
    is required because a non-streaming response carries the complete call.
    """

    id: str
    """The ID of the tool call."""

    type: Literal["function"]
    """The type of the tool. Currently, only 'function' is supported."""

    function: ToolCallFunction
    """The function that the model called."""
"""The function that the model called."""


class ToolCallFunctionDelta(BaseModel):
    """The function fragment carried by one streaming tool-call delta.

    A streaming response may split a function's name and its JSON-encoded
    arguments across several chunks, so every field defaults to ``None``.
    """

    name: str | None = None
    """The name of the function to call (may be None in delta chunks)."""

    arguments: str | None = None
    """The arguments fragment for this delta (may be None in delta chunks)."""


class ChatCompletionMessageToolCallDelta(BaseModel):
    """One tool-call fragment inside a streaming chat-completion chunk.

    Per the OpenAI streaming spec, ``index`` is mandatory on every delta so
    clients can stitch fragments of the same tool call back together; the
    remaining fields arrive incrementally and are therefore optional.
    """

    index: int
    """The index of this tool call in the tool_calls array.

    Required for delta reassembly in OpenAI SDK and compatible clients.
    """

    id: str | None = None
    """The ID of the tool call (may be None in subsequent delta chunks)."""

    type: Literal["function"] | None = None
    """The type of the tool (may be None in subsequent delta chunks)."""

    function: ToolCallFunctionDelta | None = None
    """The function delta for this chunk (may be None in some chunks)."""


# Taking this from OpenAI types https://github.com/openai/openai-python/blob/main/src/openai/types/chat/chat_completion.py,
class ChatCompletionMessage(BaseModel):
content: str | None = None
Expand All @@ -91,6 +152,9 @@ class ChatCompletionMessage(BaseModel):
role: Literal["assistant"]
"""The role of the author of this message."""

tool_calls: list[ChatCompletionMessageToolCall] | None = None
"""The tool calls generated by the model, such as function calls."""


class Choice(BaseModel):
index: int
Expand Down Expand Up @@ -144,6 +208,14 @@ class ChatCompletionChunkDelta(BaseModel):
refusal: str | None = None
"""The refusal message fragment, if any."""

tool_calls: list[ChatCompletionMessageToolCallDelta] | None = None
"""The tool call deltas in this chunk.

Each delta includes a required index field for reassembly by OpenAI SDK
and compatible clients. The id, type, and function fields are optional
since they may arrive incrementally across multiple chunks.
"""


class ChatCompletionChunkChoice(BaseModel):
"""A choice in a streaming chunk."""
Expand Down
41 changes: 39 additions & 2 deletions cli/serve/streaming.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
"""Streaming utilities for OpenAI-compatible server responses."""

from collections.abc import AsyncGenerator
from typing import Literal

from mellea.core.base import ModelOutputThunk
from mellea.core.utils import MelleaLogger
from mellea.helpers.openai_compatible_helpers import build_completion_usage
from mellea.helpers.openai_compatible_helpers import (
build_completion_usage,
build_tool_calls,
)

from .models import (
ChatCompletionChunk,
ChatCompletionChunkChoice,
ChatCompletionChunkDelta,
ChatCompletionMessageToolCallDelta,
OpenAIError,
OpenAIErrorResponse,
StreamOptions,
Expand Down Expand Up @@ -98,6 +103,38 @@ async def stream_chat_completion_chunks(
)
yield f"data: {chunk.model_dump_json()}\n\n"

tool_calls_list = build_tool_calls(output)

if tool_calls_list:
# Convert to ChatCompletionMessageToolCallDelta objects with required index
tool_calls = [
ChatCompletionMessageToolCallDelta.model_validate({**tc, "index": idx})
for idx, tc in enumerate(tool_calls_list)
]
Comment thread
markstur marked this conversation as resolved.

# Emit tool calls in a separate chunk before the final chunk
tool_call_chunk = ChatCompletionChunk(
id=completion_id,
model=model,
created=created,
choices=[
ChatCompletionChunkChoice(
index=0,
delta=ChatCompletionChunkDelta(tool_calls=tool_calls),
finish_reason=None,
Comment thread
markstur marked this conversation as resolved.
)
],
object="chat.completion.chunk",
system_fingerprint=system_fingerprint,
)
yield f"data: {tool_call_chunk.model_dump_json()}\n\n"

# Determine finish_reason based on tool calls
finish_reason: (
Literal["stop", "length", "content_filter", "tool_calls", "function_call"]
| None
) = "tool_calls" if tool_calls_list else "stop"

# Include usage in final chunk only if explicitly requested via stream_options
# Per OpenAI spec: usage is only included when stream_options.include_usage=True
include_usage = stream_options is not None and stream_options.include_usage
Expand All @@ -112,7 +149,7 @@ async def stream_chat_completion_chunks(
ChatCompletionChunkChoice(
index=0,
delta=ChatCompletionChunkDelta(content=None),
finish_reason="stop",
finish_reason=finish_reason,
)
],
object="chat.completion.chunk",
Expand Down
Loading
Loading