Skip to content

Commit c1622f5

Browse files
authored
feat: add prompt cache token support to cost telemetry (#936)
* feat: add prompt cache token support to cost and token telemetry (#890)

Record cache token costs accurately for Anthropic and OpenAI models. Previously, cache reads and writes were excluded from cost estimates, and Anthropic cache_creation_input_tokens were excluded from the input token counter.

- TokenMetricsPlugin: adds cache_creation_input_tokens to prompt_tokens for Anthropic (additive; not included in prompt_tokens by the API)
- CostMetricsPlugin: extracts cached_tokens from prompt_tokens_details and prices cache reads and writes separately using the correct formula: (prompt_tokens - cached_tokens) * full_rate + cached_tokens * cache_read_rate + cache_creation_tokens * cache_write_rate
- builtin_pricing.json: adds cache_write_per_1m and cache_read_per_1m for all current Anthropic and OpenAI models
- pricing.py: extends compute_cost() with cached_tokens and cache_creation_tokens params

Assisted-by: Claude Code
Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com>

* fix: correct LiteLLM cache token double-counting in metrics plugins

LiteLLM normalises Anthropic usage so that prompt_tokens already includes cache_creation_input_tokens and cache_read_input_tokens. Both plugins were treating prompt_tokens as raw base input and adding cache fields on top, causing double-counting.

- TokenMetricsPlugin: drop the + cache_creation addition
- CostMetricsPlugin: subtract both cached_tokens and cache_creation from prompt_tokens so write tokens are not billed at full rate and write rate
- Update test_cost_plugin_cache_tokens_forwarded to use a realistic LiteLLM-normalised shape with the correct expected input_tokens value
- Remove the now-redundant with-cache-creation token metrics parametrize case
- Clarify pricing.py docs and validation warning around the replace-not-merge behaviour of custom pricing file entries

Assisted-by: Claude Code
Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com>

* fix: restore TokenMetricsPlugin and clarify custom pricing override scope in docs

Assisted-by: Claude Code
Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com>

* docs: revert usage docstring to provider-agnostic wording

Assisted-by: Claude Code
Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com>

---------

Signed-off-by: Alex Bozarth <ajbozart@us.ibm.com>
1 parent 3c5876d commit c1622f5

6 files changed

Lines changed: 182 additions & 32 deletions

File tree

docs/docs/observability/metrics.md

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -209,14 +209,19 @@ The file format maps model IDs to per-million-token rates:
209209

210210
```json
211211
{
212-
"my-custom-model": {"input_per_1m": 1.0, "output_per_1m": 2.0},
213-
"gpt-5.4": {"input_per_1m": 2.5, "output_per_1m": 15.0}
212+
"my-custom-model": {"input_per_1m": 1.0, "output_per_1m": 2.0, "cache_write_per_1m": 1.25, "cache_read_per_1m": 0.10},
213+
"gpt-5.4": {"input_per_1m": 2.5, "output_per_1m": 15.0, "cache_read_per_1m": 1.25}
214214
}
215215
```
216216

217-
Custom entries override built-in prices. Errors loading the file are logged as
217+
Custom entries replace the entire built-in entry for that model. Errors loading the file are logged as
218218
warnings and built-in prices are used as a fallback.
219219

220+
> **Note:** Anthropic does not distinguish 5-minute from 1-hour cache writes in
221+
> `cache_creation_input_tokens`. Mellea uses the 5-minute rate for `cache_write_per_1m`
222+
> (1.25× base input). Override it in a custom pricing file if you primarily use 1-hour
223+
> writes (2× base input).
224+
220225
## Operational metrics
221226

222227
Mellea records metrics for its internal sampling, validation, and tool execution
Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
{
2-
"claude-opus-4-7": {"input_per_1m": 5.0, "output_per_1m": 25.0},
3-
"claude-opus-4-6": {"input_per_1m": 5.0, "output_per_1m": 25.0},
4-
"claude-opus-4-5": {"input_per_1m": 5.0, "output_per_1m": 25.0},
5-
"claude-opus-4-5-20251101": {"input_per_1m": 5.0, "output_per_1m": 25.0},
6-
"claude-opus-4-1": {"input_per_1m": 15.0, "output_per_1m": 75.0},
7-
"claude-opus-4-1-20250805": {"input_per_1m": 15.0, "output_per_1m": 75.0},
8-
"claude-sonnet-4-6": {"input_per_1m": 3.0, "output_per_1m": 15.0},
9-
"claude-sonnet-4-5": {"input_per_1m": 3.0, "output_per_1m": 15.0},
10-
"claude-sonnet-4-5-20250929": {"input_per_1m": 3.0, "output_per_1m": 15.0},
11-
"claude-haiku-4-5": {"input_per_1m": 1.0, "output_per_1m": 5.0},
12-
"claude-haiku-4-5-20251001": {"input_per_1m": 1.0, "output_per_1m": 5.0},
13-
"gpt-5.4": {"input_per_1m": 2.5, "output_per_1m": 15.0},
14-
"gpt-5.4-mini": {"input_per_1m": 0.75, "output_per_1m": 4.5},
15-
"gpt-5.4-nano": {"input_per_1m": 0.2, "output_per_1m": 1.25},
16-
"gpt-5.4-pro": {"input_per_1m": 30.0, "output_per_1m": 180.0}
2+
"claude-opus-4-7": {"input_per_1m": 5.0, "output_per_1m": 25.0, "cache_write_per_1m": 6.25, "cache_read_per_1m": 0.50},
3+
"claude-opus-4-6": {"input_per_1m": 5.0, "output_per_1m": 25.0, "cache_write_per_1m": 6.25, "cache_read_per_1m": 0.50},
4+
"claude-opus-4-5": {"input_per_1m": 5.0, "output_per_1m": 25.0, "cache_write_per_1m": 6.25, "cache_read_per_1m": 0.50},
5+
"claude-opus-4-5-20251101": {"input_per_1m": 5.0, "output_per_1m": 25.0, "cache_write_per_1m": 6.25, "cache_read_per_1m": 0.50},
6+
"claude-opus-4-1": {"input_per_1m": 15.0, "output_per_1m": 75.0, "cache_write_per_1m": 18.75, "cache_read_per_1m": 1.50},
7+
"claude-opus-4-1-20250805": {"input_per_1m": 15.0, "output_per_1m": 75.0, "cache_write_per_1m": 18.75, "cache_read_per_1m": 1.50},
8+
"claude-sonnet-4-6": {"input_per_1m": 3.0, "output_per_1m": 15.0, "cache_write_per_1m": 3.75, "cache_read_per_1m": 0.30},
9+
"claude-sonnet-4-5": {"input_per_1m": 3.0, "output_per_1m": 15.0, "cache_write_per_1m": 3.75, "cache_read_per_1m": 0.30},
10+
"claude-sonnet-4-5-20250929": {"input_per_1m": 3.0, "output_per_1m": 15.0, "cache_write_per_1m": 3.75, "cache_read_per_1m": 0.30},
11+
"claude-haiku-4-5": {"input_per_1m": 1.0, "output_per_1m": 5.0, "cache_write_per_1m": 1.25, "cache_read_per_1m": 0.10},
12+
"claude-haiku-4-5-20251001": {"input_per_1m": 1.0, "output_per_1m": 5.0, "cache_write_per_1m": 1.25, "cache_read_per_1m": 0.10},
13+
"gpt-5.4": {"input_per_1m": 2.5, "output_per_1m": 15.0, "cache_read_per_1m": 1.25},
14+
"gpt-5.4-mini": {"input_per_1m": 0.75, "output_per_1m": 4.5, "cache_read_per_1m": 0.375},
15+
"gpt-5.4-nano": {"input_per_1m": 0.2, "output_per_1m": 1.25, "cache_read_per_1m": 0.10},
16+
"gpt-5.4-pro": {"input_per_1m": 30.0, "output_per_1m": 180.0, "cache_read_per_1m": 15.0}
1717
}

mellea/telemetry/metrics_plugins.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,10 +167,18 @@ async def record_cost_metrics(
167167

168168
model = gen.model or "unknown"
169169
provider = gen.provider or "unknown"
170+
details = gen.usage.get("prompt_tokens_details")
171+
cached_tokens = (
172+
details.get("cached_tokens") if isinstance(details, dict) else 0
173+
) or 0
174+
cache_creation = gen.usage.get("cache_creation_input_tokens") or 0
175+
prompt_tokens = gen.usage.get("prompt_tokens") or 0
170176
cost = compute_cost(
171177
model=model,
172-
input_tokens=gen.usage.get("prompt_tokens"),
178+
input_tokens=prompt_tokens - cached_tokens - cache_creation,
173179
output_tokens=gen.usage.get("completion_tokens"),
180+
cached_tokens=cached_tokens,
181+
cache_creation_tokens=cache_creation,
174182
)
175183
if cost is not None:
176184
record_cost(cost=cost, model=model, provider=provider)

mellea/telemetry/pricing.py

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,37 @@
44
Built-in prices are loaded from ``builtin_pricing.json`` in this package directory.
55
Custom overrides are loaded from a path set via ``MELLEA_PRICING_FILE``.
66
7-
Pricing data sources (last verified 2026-04-17):
8-
- Anthropic (2026-04-17): https://platform.claude.com/docs/en/about-claude/pricing
7+
Pricing data sources:
8+
- Anthropic (2026-04-24): https://platform.claude.com/docs/en/about-claude/pricing
9+
``cache_write_per_1m`` = 5-minute write rate (1.25x base input). 1-hour writes cost
10+
2x base, but the API does not distinguish write duration in
11+
``cache_creation_input_tokens``, so cost will be underestimated for 1-hour writes.
912
- OpenAI (2026-04-17): https://platform.openai.com/docs/pricing
13+
``cache_read_per_1m`` = 50% of base input. OpenAI has no separate write cost.
1014
1115
Prices change over time. To override or supplement built-in prices, create a JSON
1216
file in the same format as ``builtin_pricing.json`` and point ``MELLEA_PRICING_FILE``
13-
to it. Custom entries take precedence over built-ins.
17+
to it. Custom entries take precedence over built-ins. Each custom entry replaces the
18+
entire built-in entry for that model — there is no field-level merging. To adjust only
19+
cache rates for a built-in model, copy its full entry from ``builtin_pricing.json`` and
20+
modify the relevant fields.
1421
1522
Environment variables:
1623
- MELLEA_PRICING_FILE: Path to a JSON file with custom model pricing overrides.
1724
1825
Custom pricing file format::
1926
2027
{
21-
"my-model": {"input_per_1m": 1.0, "output_per_1m": 2.0}
28+
"my-model": {
29+
"input_per_1m": 1.0,
30+
"output_per_1m": 2.0,
31+
"cache_write_per_1m": 1.25,
32+
"cache_read_per_1m": 0.10
33+
}
2234
}
35+
36+
``cache_write_per_1m`` and ``cache_read_per_1m`` are optional. Models without
37+
these fields report $0 for cache token costs.
2338
"""
2439

2540
import json
@@ -62,7 +77,9 @@ def _validate_pricing_entry(model: str, entry: Any) -> bool:
6277
return False
6378
if not _PRICING_KEYS & entry.keys():
6479
logger.warning(
65-
"Pricing entry for %r has no recognised keys (%s) — skipping.",
80+
"Pricing entry for %r is missing required keys (%s) — skipping. "
81+
"Custom entries must include at least one of these; they replace the full "
82+
"built-in entry and do not merge with it.",
6683
model,
6784
", ".join(sorted(_PRICING_KEYS)),
6885
)
@@ -100,14 +117,21 @@ def __init__(self, pricing_file: str | None = None) -> None:
100117
self._warned_models: set[str] = set()
101118

102119
def compute_cost(
103-
self, model: str, input_tokens: int | None, output_tokens: int | None
120+
self,
121+
model: str,
122+
input_tokens: int | None,
123+
output_tokens: int | None,
124+
cached_tokens: int | None = None,
125+
cache_creation_tokens: int | None = None,
104126
) -> float | None:
105127
"""Estimate request cost in USD.
106128
107129
Args:
108130
model: Model identifier (e.g. ``"gpt-5.4"``, ``"claude-sonnet-4-6"``).
109131
input_tokens: Number of input/prompt tokens, or ``None``.
110132
output_tokens: Number of output/completion tokens, or ``None``.
133+
cached_tokens: Tokens served from prompt cache, or ``None``.
134+
cache_creation_tokens: Tokens written to prompt cache, or ``None``.
111135
112136
Returns:
113137
Estimated cost in USD, or ``None`` if no pricing data exists for the model.
@@ -128,7 +152,13 @@ def compute_cost(
128152
output_cost = ((output_tokens or 0) / 1_000_000.0) * entry.get(
129153
"output_per_1m", 0.0
130154
)
131-
return input_cost + output_cost
155+
cache_read_cost = ((cached_tokens or 0) / 1_000_000.0) * entry.get(
156+
"cache_read_per_1m", 0.0
157+
)
158+
cache_creation_cost = ((cache_creation_tokens or 0) / 1_000_000.0) * entry.get(
159+
"cache_write_per_1m", 0.0
160+
)
161+
return input_cost + output_cost + cache_read_cost + cache_creation_cost
132162

133163

134164
_registry: PricingRegistry | None = None
@@ -142,16 +172,24 @@ def _get_registry() -> PricingRegistry:
142172

143173

144174
def compute_cost(
145-
model: str, input_tokens: int | None, output_tokens: int | None
175+
model: str,
176+
input_tokens: int | None,
177+
output_tokens: int | None,
178+
cached_tokens: int | None = None,
179+
cache_creation_tokens: int | None = None,
146180
) -> float | None:
147181
"""Estimate request cost in USD using the default pricing registry.
148182
149183
Args:
150184
model: Model identifier (e.g. ``"gpt-5.4"``, ``"claude-sonnet-4-6"``).
151185
input_tokens: Number of input/prompt tokens, or ``None``.
152186
output_tokens: Number of output/completion tokens, or ``None``.
187+
cached_tokens: Tokens served from prompt cache, or ``None``.
188+
cache_creation_tokens: Tokens written to prompt cache, or ``None``.
153189
154190
Returns:
155191
Estimated cost in USD, or ``None`` if no pricing data exists for the model.
156192
"""
157-
return _get_registry().compute_cost(model, input_tokens, output_tokens)
193+
return _get_registry().compute_cost(
194+
model, input_tokens, output_tokens, cached_tokens, cache_creation_tokens
195+
)

test/telemetry/test_metrics_plugins.py

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,13 +306,49 @@ async def test_cost_plugin_records_cost_for_known_model(cost_plugin):
306306
await cost_plugin.record_cost_metrics(payload, {})
307307

308308
mock_cost.assert_called_once_with(
309-
model="test-model", input_tokens=100, output_tokens=50
309+
model="test-model",
310+
input_tokens=100,
311+
output_tokens=50,
312+
cached_tokens=0,
313+
cache_creation_tokens=0,
310314
)
311315
mock_record.assert_called_once_with(
312316
cost=0.0042, model="test-model", provider="test-provider"
313317
)
314318

315319

320+
@pytest.mark.asyncio
321+
async def test_cost_plugin_cache_tokens_forwarded(cost_plugin):
322+
"""Cache token fields are extracted and forwarded to compute_cost correctly.
323+
324+
Simulates LiteLLM-normalised Anthropic usage where prompt_tokens already
325+
includes cache_creation and cache_read tokens (40 base + 50 read + 10 write = 100).
326+
"""
327+
payload = _make_cost_payload(
328+
usage={
329+
"prompt_tokens": 100, # LiteLLM-normalised: 40 base + 50 cache_read + 10 cache_creation
330+
"completion_tokens": 20,
331+
"total_tokens": 120,
332+
"prompt_tokens_details": {"cached_tokens": 50},
333+
"cache_creation_input_tokens": 10,
334+
}
335+
)
336+
337+
with (
338+
patch("mellea.telemetry.pricing.compute_cost", return_value=0.005) as mock_cost,
339+
patch("mellea.telemetry.metrics.record_cost"),
340+
):
341+
await cost_plugin.record_cost_metrics(payload, {})
342+
343+
mock_cost.assert_called_once_with(
344+
model="test-model",
345+
input_tokens=40, # prompt_tokens (100) - cached_tokens (50) - cache_creation (10)
346+
output_tokens=20,
347+
cached_tokens=50,
348+
cache_creation_tokens=10,
349+
)
350+
351+
316352
@pytest.mark.asyncio
317353
async def test_cost_plugin_skips_unknown_model(cost_plugin):
318354
"""Plugin does not call record_cost when compute_cost returns None."""
@@ -360,7 +396,11 @@ async def test_cost_plugin_unknown_model_provider_fallback(cost_plugin):
360396
await cost_plugin.record_cost_metrics(payload, {})
361397

362398
mock_cost.assert_called_once_with(
363-
model="unknown", input_tokens=10, output_tokens=5
399+
model="unknown",
400+
input_tokens=10,
401+
output_tokens=5,
402+
cached_tokens=0,
403+
cache_creation_tokens=0,
364404
)
365405
mock_record.assert_called_once_with(
366406
cost=0.001, model="unknown", provider="unknown"

test/telemetry/test_pricing.py

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,22 @@ def test_compute_cost_unknown_model(fresh_registry, caplog):
5252

5353

5454
def test_compute_cost_none_tokens(fresh_registry):
55-
"""None tokens are treated as zero without raising."""
55+
"""None tokens are treated as zero without raising; None cache args produce same cost as omitting them."""
5656
cost = fresh_registry.compute_cost("gpt-5.4", input_tokens=None, output_tokens=None)
5757
assert cost == 0.0
5858

59+
base = fresh_registry.compute_cost(
60+
"claude-sonnet-4-6", input_tokens=500, output_tokens=100
61+
)
62+
with_none = fresh_registry.compute_cost(
63+
"claude-sonnet-4-6",
64+
input_tokens=500,
65+
output_tokens=100,
66+
cached_tokens=None,
67+
cache_creation_tokens=None,
68+
)
69+
assert base == with_none
70+
5971

6072
def test_compute_cost_zero_tokens(fresh_registry):
6173
"""Zero tokens produce zero cost."""
@@ -195,4 +207,51 @@ def test_invalid_entry_no_recognised_keys(custom_pricing, caplog):
195207
cost = compute_cost("bad-model", 1000, 1000)
196208

197209
assert cost is None
198-
assert any("recognised keys" in record.message for record in caplog.records)
210+
assert any("required keys" in record.message for record in caplog.records)
211+
212+
213+
def test_compute_cost_with_cached_tokens(fresh_registry):
214+
"""Cache read tokens are priced at cache_read_per_1m rate."""
215+
# claude-sonnet-4-6: cache_read_per_1m = 0.30
216+
# 1000 cached_tokens * 0.30 / 1e6 = 0.0003
217+
cost = fresh_registry.compute_cost(
218+
"claude-sonnet-4-6", input_tokens=0, output_tokens=0, cached_tokens=1000
219+
)
220+
assert cost is not None
221+
assert abs(cost - 0.0003) < 1e-9
222+
223+
224+
def test_compute_cost_with_cache_creation_tokens(fresh_registry):
225+
"""Cache creation tokens are priced at cache_write_per_1m rate."""
226+
# claude-sonnet-4-6: cache_write_per_1m = 3.75
227+
# 1000 cache_creation_tokens * 3.75 / 1e6 = 0.00375
228+
cost = fresh_registry.compute_cost(
229+
"claude-sonnet-4-6", input_tokens=0, output_tokens=0, cache_creation_tokens=1000
230+
)
231+
assert cost is not None
232+
assert abs(cost - 0.00375) < 1e-9
233+
234+
235+
def test_compute_cost_cache_tokens_model_without_cache_pricing(fresh_registry):
236+
"""Cache token args are silently ignored for models with no cache pricing fields."""
237+
# gpt-5.4 has no cache_write_per_1m, only cache_read_per_1m
238+
base_cost = fresh_registry.compute_cost(
239+
"gpt-5.4", input_tokens=1000, output_tokens=0
240+
)
241+
cost_with_creation = fresh_registry.compute_cost(
242+
"gpt-5.4", input_tokens=1000, output_tokens=0, cache_creation_tokens=500
243+
)
244+
assert base_cost is not None
245+
assert cost_with_creation is not None
246+
assert abs(base_cost - cost_with_creation) < 1e-9
247+
248+
249+
def test_compute_cost_openai_cache_read(fresh_registry):
250+
"""OpenAI cache_read_per_1m (50% of input) is applied correctly."""
251+
# gpt-5.4: cache_read_per_1m = 1.25
252+
# 1000 cached_tokens * 1.25 / 1e6 = 0.00125
253+
cost = fresh_registry.compute_cost(
254+
"gpt-5.4", input_tokens=0, output_tokens=0, cached_tokens=1000
255+
)
256+
assert cost is not None
257+
assert abs(cost - 0.00125) < 1e-9

0 commit comments

Comments (0)