Skip to content

Commit c1622f5

Browse files
authored
feat: add prompt cache token support to cost telemetry (#936)
* feat: add prompt cache token support to cost and token telemetry (#890)

Record cache token costs accurately for Anthropic and OpenAI models. Previously, cache reads and writes were excluded from cost estimates, and Anthropic cache_creation_input_tokens were excluded from the input token counter.

- TokenMetricsPlugin: adds cache_creation_input_tokens to prompt_tokens for Anthropic (additive; not included in prompt_tokens by the API)
- CostMetricsPlugin: extracts cached_tokens from prompt_tokens_details and prices cache reads and writes separately using the correct formula: (prompt_tokens - cached_tokens) * full_rate + cached_tokens * cache_read_rate + cache_creation_tokens * cache_write_rate
- builtin_pricing.json: adds cache_write_per_1m and cache_read_per_1m for all current Anthropic and OpenAI models
- pricing.py: extends compute_cost() with cached_tokens and cache_creation_tokens params

Assisted-by: Claude Code
Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com>

* fix: correct LiteLLM cache token double-counting in metrics plugins

LiteLLM normalises Anthropic usage so that prompt_tokens already includes cache_creation_input_tokens and cache_read_input_tokens. Both plugins were treating prompt_tokens as raw base input and adding cache fields on top, causing double-counting.

- TokenMetricsPlugin: drop the + cache_creation addition
- CostMetricsPlugin: subtract both cached_tokens and cache_creation from prompt_tokens so write tokens are not billed at full rate and write rate
- Update test_cost_plugin_cache_tokens_forwarded to use a realistic LiteLLM-normalised shape with the correct expected input_tokens value
- Remove the now-redundant with-cache-creation token metrics parametrize case
- Clarify pricing.py docs and validation warning around the replace-not-merge behaviour of custom pricing file entries

Assisted-by: Claude Code
Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com>

* fix: restore TokenMetricsPlugin and clarify custom pricing override scope in docs

Assisted-by: Claude Code
Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com>

* docs: revert usage docstring to provider-agnostic wording

Assisted-by: Claude Code
Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com>

---------

Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com>
1 parent 3c5876d commit c1622f5

6 files changed

Lines changed: 182 additions & 32 deletions

File tree

docs/docs/observability/metrics.md

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -209,14 +209,19 @@ The file format maps model IDs to per-million-token rates:
209209

210210
```json
211211
{
212-
"my-custom-model": {"input_per_1m": 1.0, "output_per_1m": 2.0},
213-
"gpt-5.4": {"input_per_1m": 2.5, "output_per_1m": 15.0}
212+
"my-custom-model": {"input_per_1m": 1.0, "output_per_1m": 2.0, "cache_write_per_1m": 1.25, "cache_read_per_1m": 0.10},
213+
"gpt-5.4": {"input_per_1m": 2.5, "output_per_1m": 15.0, "cache_read_per_1m": 1.25}
214214
}
215215
```
216216

217-
Custom entries override built-in prices. Errors loading the file are logged as
217+
Custom entries replace the entire built-in entry for that model. Errors loading the file are logged as
218218
warnings and built-in prices are used as a fallback.
219219

220+
> **Note:** Anthropic does not distinguish 5-minute from 1-hour cache writes in
221+
> `cache_creation_input_tokens`. Mellea uses the 5-minute rate for `cache_write_per_1m`
222+
> (1.25× base input). Override it in a custom pricing file if you primarily use 1-hour
223+
> writes (2× base input).
224+
220225
## Operational metrics
221226

222227
Mellea records metrics for its internal sampling, validation, and tool execution
Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
{
2-
"claude-opus-4-7": {"input_per_1m": 5.0, "output_per_1m": 25.0},
3-
"claude-opus-4-6": {"input_per_1m": 5.0, "output_per_1m": 25.0},
4-
"claude-opus-4-5": {"input_per_1m": 5.0, "output_per_1m": 25.0},
5-
"claude-opus-4-5-20251101": {"input_per_1m": 5.0, "output_per_1m": 25.0},
6-
"claude-opus-4-1": {"input_per_1m": 15.0, "output_per_1m": 75.0},
7-
"claude-opus-4-1-20250805": {"input_per_1m": 15.0, "output_per_1m": 75.0},
8-
"claude-sonnet-4-6": {"input_per_1m": 3.0, "output_per_1m": 15.0},
9-
"claude-sonnet-4-5": {"input_per_1m": 3.0, "output_per_1m": 15.0},
10-
"claude-sonnet-4-5-20250929": {"input_per_1m": 3.0, "output_per_1m": 15.0},
11-
"claude-haiku-4-5": {"input_per_1m": 1.0, "output_per_1m": 5.0},
12-
"claude-haiku-4-5-20251001": {"input_per_1m": 1.0, "output_per_1m": 5.0},
13-
"gpt-5.4": {"input_per_1m": 2.5, "output_per_1m": 15.0},
14-
"gpt-5.4-mini": {"input_per_1m": 0.75, "output_per_1m": 4.5},
15-
"gpt-5.4-nano": {"input_per_1m": 0.2, "output_per_1m": 1.25},
16-
"gpt-5.4-pro": {"input_per_1m": 30.0, "output_per_1m": 180.0}
2+
"claude-opus-4-7": {"input_per_1m": 5.0, "output_per_1m": 25.0, "cache_write_per_1m": 6.25, "cache_read_per_1m": 0.50},
3+
"claude-opus-4-6": {"input_per_1m": 5.0, "output_per_1m": 25.0, "cache_write_per_1m": 6.25, "cache_read_per_1m": 0.50},
4+
"claude-opus-4-5": {"input_per_1m": 5.0, "output_per_1m": 25.0, "cache_write_per_1m": 6.25, "cache_read_per_1m": 0.50},
5+
"claude-opus-4-5-20251101": {"input_per_1m": 5.0, "output_per_1m": 25.0, "cache_write_per_1m": 6.25, "cache_read_per_1m": 0.50},
6+
"claude-opus-4-1": {"input_per_1m": 15.0, "output_per_1m": 75.0, "cache_write_per_1m": 18.75, "cache_read_per_1m": 1.50},
7+
"claude-opus-4-1-20250805": {"input_per_1m": 15.0, "output_per_1m": 75.0, "cache_write_per_1m": 18.75, "cache_read_per_1m": 1.50},
8+
"claude-sonnet-4-6": {"input_per_1m": 3.0, "output_per_1m": 15.0, "cache_write_per_1m": 3.75, "cache_read_per_1m": 0.30},
9+
"claude-sonnet-4-5": {"input_per_1m": 3.0, "output_per_1m": 15.0, "cache_write_per_1m": 3.75, "cache_read_per_1m": 0.30},
10+
"claude-sonnet-4-5-20250929": {"input_per_1m": 3.0, "output_per_1m": 15.0, "cache_write_per_1m": 3.75, "cache_read_per_1m": 0.30},
11+
"claude-haiku-4-5": {"input_per_1m": 1.0, "output_per_1m": 5.0, "cache_write_per_1m": 1.25, "cache_read_per_1m": 0.10},
12+
"claude-haiku-4-5-20251001": {"input_per_1m": 1.0, "output_per_1m": 5.0, "cache_write_per_1m": 1.25, "cache_read_per_1m": 0.10},
13+
"gpt-5.4": {"input_per_1m": 2.5, "output_per_1m": 15.0, "cache_read_per_1m": 1.25},
14+
"gpt-5.4-mini": {"input_per_1m": 0.75, "output_per_1m": 4.5, "cache_read_per_1m": 0.375},
15+
"gpt-5.4-nano": {"input_per_1m": 0.2, "output_per_1m": 1.25, "cache_read_per_1m": 0.10},
16+
"gpt-5.4-pro": {"input_per_1m": 30.0, "output_per_1m": 180.0, "cache_read_per_1m": 15.0}
1717
}

mellea/telemetry/metrics_plugins.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,10 +167,18 @@ async def record_cost_metrics(
167167

168168
model = gen.model or "unknown"
169169
provider = gen.provider or "unknown"
170+
details = gen.usage.get("prompt_tokens_details")
171+
cached_tokens = (
172+
details.get("cached_tokens") if isinstance(details, dict) else 0
173+
) or 0
174+
cache_creation = gen.usage.get("cache_creation_input_tokens") or 0
175+
prompt_tokens = gen.usage.get("prompt_tokens") or 0
170176
cost = compute_cost(
171177
model=model,
172-
input_tokens=gen.usage.get("prompt_tokens"),
178+
input_tokens=prompt_tokens - cached_tokens - cache_creation,
173179
output_tokens=gen.usage.get("completion_tokens"),
180+
cached_tokens=cached_tokens,
181+
cache_creation_tokens=cache_creation,
174182
)
175183
if cost is not None:
176184
record_cost(cost=cost, model=model, provider=provider)

mellea/telemetry/pricing.py

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,37 @@
44
Built-in prices are loaded from ``builtin_pricing.json`` in this package directory.
55
Custom overrides are loaded from a path set via ``MELLEA_PRICING_FILE``.
66
7-
Pricing data sources (last verified 2026-04-17):
8-
- Anthropic (2026-04-17): https://platform.claude.com/docs/en/about-claude/pricing
7+
Pricing data sources:
8+
- Anthropic (2026-04-24): https://platform.claude.com/docs/en/about-claude/pricing
9+
``cache_write_per_1m`` = 5-minute write rate (1.25x base input). 1-hour writes cost
10+
2x base, but the API does not distinguish write duration in
11+
``cache_creation_input_tokens``, so cost will be underestimated for 1-hour writes.
912
- OpenAI (2026-04-17): https://platform.openai.com/docs/pricing
13+
``cache_read_per_1m`` = 50% of base input. OpenAI has no separate write cost.
1014
1115
Prices change over time. To override or supplement built-in prices, create a JSON
1216
file in the same format as ``builtin_pricing.json`` and point ``MELLEA_PRICING_FILE``
13-
to it. Custom entries take precedence over built-ins.
17+
to it. Custom entries take precedence over built-ins. Each custom entry replaces the
18+
entire built-in entry for that model — there is no field-level merging. To adjust only
19+
cache rates for a built-in model, copy its full entry from ``builtin_pricing.json`` and
20+
modify the relevant fields.
1421
1522
Environment variables:
1623
- MELLEA_PRICING_FILE: Path to a JSON file with custom model pricing overrides.
1724
1825
Custom pricing file format::
1926
2027
{
21-
"my-model": {"input_per_1m": 1.0, "output_per_1m": 2.0}
28+
"my-model": {
29+
"input_per_1m": 1.0,
30+
"output_per_1m": 2.0,
31+
"cache_write_per_1m": 1.25,
32+
"cache_read_per_1m": 0.10
33+
}
2234
}
35+
36+
``cache_write_per_1m`` and ``cache_read_per_1m`` are optional. Models without
37+
these fields report $0 for cache token costs.
2338
"""
2439

2540
import json
@@ -62,7 +77,9 @@ def _validate_pricing_entry(model: str, entry: Any) -> bool:
6277
return False
6378
if not _PRICING_KEYS & entry.keys():
6479
logger.warning(
65-
"Pricing entry for %r has no recognised keys (%s) — skipping.",
80+
"Pricing entry for %r is missing required keys (%s) — skipping. "
81+
"Custom entries must include at least one of these; they replace the full "
82+
"built-in entry and do not merge with it.",
6683
model,
6784
", ".join(sorted(_PRICING_KEYS)),
6885
)
@@ -100,14 +117,21 @@ def __init__(self, pricing_file: str | None = None) -> None:
100117
self._warned_models: set[str] = set()
101118

102119
def compute_cost(
103-
self, model: str, input_tokens: int | None, output_tokens: int | None
120+
self,
121+
model: str,
122+
input_tokens: int | None,
123+
output_tokens: int | None,
124+
cached_tokens: int | None = None,
125+
cache_creation_tokens: int | None = None,
104126
) -> float | None:
105127
"""Estimate request cost in USD.
106128
107129
Args:
108130
model: Model identifier (e.g. ``"gpt-5.4"``, ``"claude-sonnet-4-6"``).
109131
input_tokens: Number of input/prompt tokens, or ``None``.
110132
output_tokens: Number of output/completion tokens, or ``None``.
133+
cached_tokens: Tokens served from prompt cache, or ``None``.
134+
cache_creation_tokens: Tokens written to prompt cache, or ``None``.
111135
112136
Returns:
113137
Estimated cost in USD, or ``None`` if no pricing data exists for the model.
@@ -128,7 +152,13 @@ def compute_cost(
128152
output_cost = ((output_tokens or 0) / 1_000_000.0) * entry.get(
129153
"output_per_1m", 0.0
130154
)
131-
return input_cost + output_cost
155+
cache_read_cost = ((cached_tokens or 0) / 1_000_000.0) * entry.get(
156+
"cache_read_per_1m", 0.0
157+
)
158+
cache_creation_cost = ((cache_creation_tokens or 0) / 1_000_000.0) * entry.get(
159+
"cache_write_per_1m", 0.0
160+
)
161+
return input_cost + output_cost + cache_read_cost + cache_creation_cost
132162

133163

134164
_registry: PricingRegistry | None = None
@@ -142,16 +172,24 @@ def _get_registry() -> PricingRegistry:
142172

143173

144174
def compute_cost(
145-
model: str, input_tokens: int | None, output_tokens: int | None
175+
model: str,
176+
input_tokens: int | None,
177+
output_tokens: int | None,
178+
cached_tokens: int | None = None,
179+
cache_creation_tokens: int | None = None,
146180
) -> float | None:
147181
"""Estimate request cost in USD using the default pricing registry.
148182
149183
Args:
150184
model: Model identifier (e.g. ``"gpt-5.4"``, ``"claude-sonnet-4-6"``).
151185
input_tokens: Number of input/prompt tokens, or ``None``.
152186
output_tokens: Number of output/completion tokens, or ``None``.
187+
cached_tokens: Tokens served from prompt cache, or ``None``.
188+
cache_creation_tokens: Tokens written to prompt cache, or ``None``.
153189
154190
Returns:
155191
Estimated cost in USD, or ``None`` if no pricing data exists for the model.
156192
"""
157-
return _get_registry().compute_cost(model, input_tokens, output_tokens)
193+
return _get_registry().compute_cost(
194+
model, input_tokens, output_tokens, cached_tokens, cache_creation_tokens
195+
)

test/telemetry/test_metrics_plugins.py

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,13 +306,49 @@ async def test_cost_plugin_records_cost_for_known_model(cost_plugin):
306306
await cost_plugin.record_cost_metrics(payload, {})
307307

308308
mock_cost.assert_called_once_with(
309-
model="test-model", input_tokens=100, output_tokens=50
309+
model="test-model",
310+
input_tokens=100,
311+
output_tokens=50,
312+
cached_tokens=0,
313+
cache_creation_tokens=0,
310314
)
311315
mock_record.assert_called_once_with(
312316
cost=0.0042, model="test-model", provider="test-provider"
313317
)
314318

315319

320+
@pytest.mark.asyncio
321+
async def test_cost_plugin_cache_tokens_forwarded(cost_plugin):
322+
"""Cache token fields are extracted and forwarded to compute_cost correctly.
323+
324+
Simulates LiteLLM-normalised Anthropic usage where prompt_tokens already
325+
includes cache_creation and cache_read tokens (40 base + 50 read + 10 write = 100).
326+
"""
327+
payload = _make_cost_payload(
328+
usage={
329+
"prompt_tokens": 100, # LiteLLM-normalised: 40 base + 50 cache_read + 10 cache_creation
330+
"completion_tokens": 20,
331+
"total_tokens": 120,
332+
"prompt_tokens_details": {"cached_tokens": 50},
333+
"cache_creation_input_tokens": 10,
334+
}
335+
)
336+
337+
with (
338+
patch("mellea.telemetry.pricing.compute_cost", return_value=0.005) as mock_cost,
339+
patch("mellea.telemetry.metrics.record_cost"),
340+
):
341+
await cost_plugin.record_cost_metrics(payload, {})
342+
343+
mock_cost.assert_called_once_with(
344+
model="test-model",
345+
input_tokens=40, # prompt_tokens (100) - cached_tokens (50) - cache_creation (10)
346+
output_tokens=20,
347+
cached_tokens=50,
348+
cache_creation_tokens=10,
349+
)
350+
351+
316352
@pytest.mark.asyncio
317353
async def test_cost_plugin_skips_unknown_model(cost_plugin):
318354
"""Plugin does not call record_cost when compute_cost returns None."""
@@ -360,7 +396,11 @@ async def test_cost_plugin_unknown_model_provider_fallback(cost_plugin):
360396
await cost_plugin.record_cost_metrics(payload, {})
361397

362398
mock_cost.assert_called_once_with(
363-
model="unknown", input_tokens=10, output_tokens=5
399+
model="unknown",
400+
input_tokens=10,
401+
output_tokens=5,
402+
cached_tokens=0,
403+
cache_creation_tokens=0,
364404
)
365405
mock_record.assert_called_once_with(
366406
cost=0.001, model="unknown", provider="unknown"

test/telemetry/test_pricing.py

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,22 @@ def test_compute_cost_unknown_model(fresh_registry, caplog):
5252

5353

5454
def test_compute_cost_none_tokens(fresh_registry):
55-
"""None tokens are treated as zero without raising."""
55+
"""None tokens are treated as zero without raising; None cache args produce same cost as omitting them."""
5656
cost = fresh_registry.compute_cost("gpt-5.4", input_tokens=None, output_tokens=None)
5757
assert cost == 0.0
5858

59+
base = fresh_registry.compute_cost(
60+
"claude-sonnet-4-6", input_tokens=500, output_tokens=100
61+
)
62+
with_none = fresh_registry.compute_cost(
63+
"claude-sonnet-4-6",
64+
input_tokens=500,
65+
output_tokens=100,
66+
cached_tokens=None,
67+
cache_creation_tokens=None,
68+
)
69+
assert base == with_none
70+
5971

6072
def test_compute_cost_zero_tokens(fresh_registry):
6173
"""Zero tokens produce zero cost."""
@@ -195,4 +207,51 @@ def test_invalid_entry_no_recognised_keys(custom_pricing, caplog):
195207
cost = compute_cost("bad-model", 1000, 1000)
196208

197209
assert cost is None
198-
assert any("recognised keys" in record.message for record in caplog.records)
210+
assert any("required keys" in record.message for record in caplog.records)
211+
212+
213+
def test_compute_cost_with_cached_tokens(fresh_registry):
214+
"""Cache read tokens are priced at cache_read_per_1m rate."""
215+
# claude-sonnet-4-6: cache_read_per_1m = 0.30
216+
# 1000 cached_tokens * 0.30 / 1e6 = 0.0003
217+
cost = fresh_registry.compute_cost(
218+
"claude-sonnet-4-6", input_tokens=0, output_tokens=0, cached_tokens=1000
219+
)
220+
assert cost is not None
221+
assert abs(cost - 0.0003) < 1e-9
222+
223+
224+
def test_compute_cost_with_cache_creation_tokens(fresh_registry):
225+
"""Cache creation tokens are priced at cache_write_per_1m rate."""
226+
# claude-sonnet-4-6: cache_write_per_1m = 3.75
227+
# 1000 cache_creation_tokens * 3.75 / 1e6 = 0.00375
228+
cost = fresh_registry.compute_cost(
229+
"claude-sonnet-4-6", input_tokens=0, output_tokens=0, cache_creation_tokens=1000
230+
)
231+
assert cost is not None
232+
assert abs(cost - 0.00375) < 1e-9
233+
234+
235+
def test_compute_cost_cache_tokens_model_without_cache_pricing(fresh_registry):
236+
"""Cache token args are silently ignored for models with no cache pricing fields."""
237+
# gpt-5.4 has no cache_write_per_1m, only cache_read_per_1m
238+
base_cost = fresh_registry.compute_cost(
239+
"gpt-5.4", input_tokens=1000, output_tokens=0
240+
)
241+
cost_with_creation = fresh_registry.compute_cost(
242+
"gpt-5.4", input_tokens=1000, output_tokens=0, cache_creation_tokens=500
243+
)
244+
assert base_cost is not None
245+
assert cost_with_creation is not None
246+
assert abs(base_cost - cost_with_creation) < 1e-9
247+
248+
249+
def test_compute_cost_openai_cache_read(fresh_registry):
250+
"""OpenAI cache_read_per_1m (50% of input) is applied correctly."""
251+
# gpt-5.4: cache_read_per_1m = 1.25
252+
# 1000 cached_tokens * 1.25 / 1e6 = 0.00125
253+
cost = fresh_registry.compute_cost(
254+
"gpt-5.4", input_tokens=0, output_tokens=0, cached_tokens=1000
255+
)
256+
assert cost is not None
257+
assert abs(cost - 0.00125) < 1e-9

0 commit comments

Comments (0)