Commit def10b6
fix(stdlib): address review feedback on streaming validation
Addresses issues raised by independent review on top of PR #942.

Orchestrator (mellea/stdlib/streaming.py):
- except Exception now calls mot.cancel_generation() before surfacing the
  exception to the consumer — previously the backend producer was left
  running, eventually blocking on mot._async_queue (maxsize=20). Cleanup
  failures are logged via MelleaLogger.warning with a TODO(#902) marker;
  #902 replaces the log with a proper ErrorEvent.
- RuntimeError catch in the astream() loop now re-raises unless
  mot.is_computed() is true, so only the documented "already computed"
  race is swallowed.
- astream() docstring now states the single-consumer contract explicitly;
  a second iteration blocks on an empty queue with no sentinel to deliver.
- as_thunk docstring now flags the early-exit case: cancel_generation
  forces is_computed=True without running post_processing(), so
  generation.usage and related telemetry fields may be None.

Chunker (mellea/stdlib/chunking.py):
- SentenceChunker.flush switches from .strip() to .rstrip() with a comment
  explaining why: the loop's lstrip has already removed leading whitespace,
  and trailing whitespace on a sentence fragment is non-semantic
  (consistent with split() returning sentences without trailing whitespace).
- ParagraphChunker.flush adds a docstring noting the deliberate asymmetry:
  paragraph fragments are returned byte-for-byte because internal
  whitespace (e.g. trailing \n of a list item) can be semantically
  meaningful.

Tests (test/stdlib/test_streaming.py):
- test_stream_validate_receives_individual_chunks now uses exact match on
  the captured chunk list, which directly regresses if someone reverts to
  accumulated-text semantics.
- test_multiple_chunks_in_one_batch_with_mid_batch_fail: response fed as
  one large token so split() yields 4 sentences at once; verifies chunk 1
  emits, chunk 2 fails (not emitted), chunks 3 and 4 are neither validated
  nor emitted.
- test_cancel_generation_invoked_on_fail: spies on
  ModelOutputThunk.cancel_generation and asserts it was called on the
  "fail" early-exit path.
- test_exception_in_stream_validate_cancels_generation: a requirement that
  raises must cause cancel_generation to run and the exception to surface
  via astream()/acomplete() without hanging.

Telemetry observability (orchestrator-level spans, metrics, span events)
remains deferred to #902 per the epic, which now has the acceptance
criteria updated to cover event emission, the OTEL bridge, and the
ErrorEvent type that will replace the MelleaLogger stopgap.

Assisted-by: Claude Code
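For context on the queue-blocking failure mode described in the commit message, a standalone sketch of the generic hazard: a producer task parks forever on a bounded asyncio.Queue once the consumer stops draining, unless something cancels it. Illustrative only; the names are hypothetical and this is not mellea code.

import asyncio
import contextlib

async def demo() -> None:
    q: asyncio.Queue[int] = asyncio.Queue(maxsize=2)

    async def producer() -> None:
        for i in range(10):
            await q.put(i)  # blocks once the queue is full and nobody drains it

    task = asyncio.create_task(producer())
    print(await q.get())  # consume one item, then stop draining
    await asyncio.sleep(0.1)
    print(task.done())  # False: the producer is parked on q.put()
    task.cancel()  # the role mot.cancel_generation() plays in the orchestrator
    with contextlib.suppress(asyncio.CancelledError):
        await task

asyncio.run(demo())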
1 parent 61448a9 commit def10b6

3 files changed

Lines changed: 253 additions & 12 deletions


mellea/stdlib/chunking.py

Lines changed: 19 additions & 3 deletions
@@ -116,7 +116,15 @@ def split(self, accumulated_text: str) -> list[str]:
         return chunks

     def flush(self, accumulated_text: str) -> list[str]:
-        """Return the trailing sentence fragment (if any) as a final chunk."""
+        """Return the trailing sentence fragment (if any) as a final chunk.
+
+        Trailing whitespace on the fragment is non-semantic for sentence
+        boundaries and is dropped via ``rstrip``. Leading whitespace is
+        already removed by the loop's ``lstrip`` on each advance, so no
+        ``lstrip`` is needed here. The result is the fragment's content
+        only, consistent with how :meth:`split` returns sentences without
+        trailing whitespace.
+        """
         if not accumulated_text:
             return []
         remaining = accumulated_text
@@ -125,7 +133,7 @@ def flush(self, accumulated_text: str) -> list[str]:
             if match is None:
                 break
             remaining = remaining[match.end() :].lstrip()
-        trailing = remaining.strip()
+        trailing = remaining.rstrip()
         return [trailing] if trailing else []


@@ -216,7 +224,15 @@ def split(self, accumulated_text: str) -> list[str]:
         return [p for p in parts if p]

     def flush(self, accumulated_text: str) -> list[str]:
-        """Return the trailing paragraph fragment (if any) as a final chunk."""
+        r"""Return the trailing paragraph fragment (if any) as a final chunk.
+
+        Unlike :class:`SentenceChunker.flush`, the fragment is returned
+        byte-for-byte without stripping. Internal whitespace — including
+        a trailing single ``\n`` — can be semantically meaningful inside
+        a paragraph (e.g. a list item or a deliberate line break), and a
+        consumer validating paragraph content should see the fragment as
+        it was withheld.
+        """
         if not accumulated_text:
             return []
         if _PARA_BOUNDARY_END.search(accumulated_text):
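To make the flush asymmetry concrete, a minimal sketch of the intended behavior, assuming both chunkers construct without arguments and the fragments contain no chunk boundary (hypothetical inputs, not from the test suite):

from mellea.stdlib.chunking import ParagraphChunker, SentenceChunker

# Sentence fragments: trailing whitespace is non-semantic and rstripped away.
assert SentenceChunker().flush("a trailing fragment  ") == ["a trailing fragment"]

# Paragraph fragments: returned byte-for-byte, so the trailing newline of a
# withheld list item survives for the validator to see.
assert ParagraphChunker().flush("- a list item\n") == ["- a list item\n"]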

mellea/stdlib/streaming.py

Lines changed: 41 additions & 1 deletion
@@ -16,6 +16,7 @@
 from ..core.backend import Backend
 from ..core.base import CBlock, Component, Context, ModelOutputThunk
 from ..core.requirement import PartialValidationResult, Requirement, ValidationResult
+from ..core.utils import MelleaLogger
 from .chunking import ChunkingStrategy, ParagraphChunker, SentenceChunker, WordChunker

 _CHUNKING_ALIASES: dict[str, type[ChunkingStrategy]] = {
@@ -75,6 +76,14 @@ async def astream(self) -> AsyncIterator[str]:
         all chunks have been yielded, whether the stream completed normally or
         was cancelled early on a ``"fail"`` result.

+        **Single-consumer.** Chunks are delivered via an
+        :class:`asyncio.Queue` that this method drains; calling
+        ``astream()`` a second time on the same result blocks indefinitely
+        because the queue is empty and the terminating ``None`` sentinel
+        has already been consumed. If you need the chunks after
+        iteration, capture them into a list during the first pass or use
+        :attr:`full_text` after :meth:`acomplete`.
+
         Yields:
             str: A validated text chunk from the chunking strategy.
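A consumer-side sketch of the single-consumer contract documented above; action, backend, and ctx are hypothetical stand-ins, and the call shape follows the tests below rather than a definitive API reference:

result = await stream_with_chunking(action, backend, ctx, chunking="sentence")

chunks: list[str] = []
async for chunk in result.astream():  # the first and only pass over the queue
    chunks.append(chunk)  # capture now; a second astream() would block forever
await result.acomplete()

replayable = chunks  # reuse the captured list for any later pass...
joined = result.full_text  # ...or read the joined text after acomplete()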
@@ -116,6 +125,15 @@ def as_thunk(self) -> ModelOutputThunk:
         early-exit results; ``value`` will reflect whatever was accumulated
         before cancellation.

+        Note:
+            On early exit, ``cancel_generation()`` forces the MOT into a
+            computed state without running the backend's
+            ``post_processing()``. Telemetry fields on the returned thunk
+            (``generation.usage``, ``generation.ttfb_ms``, etc.) may
+            therefore be ``None`` or reflect the partial state at
+            cancellation time. ``value`` and ``streaming`` are reliable;
+            usage totals are not.
+
         Returns:
             ModelOutputThunk: A computed thunk containing the streamed output.
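A defensive-read sketch for the early-exit case this Note describes; the generation attribute shape is assumed from the docstring above and should be treated as illustrative:

thunk = result.as_thunk()
text = thunk.value  # reliable: whatever accumulated before cancellation

# Usage totals are best-effort after an early exit; guard before reading.
gen = getattr(thunk, "generation", None)
if gen is None or gen.usage is None:
    print("usage unavailable: early exit skipped post_processing()")
else:
    print(f"tokens used: {gen.usage}")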
@@ -178,7 +196,12 @@ async def _validate_and_emit(c: str) -> bool:
         try:
             delta = await mot.astream()
         except RuntimeError:
-            break
+            # Expected race: mot.is_computed() was False at the top of the
+            # loop but the stream finished before we re-entered astream().
+            # Any other RuntimeError is a real bug and must propagate.
+            if mot.is_computed():
+                break
+            raise

         accumulated += delta
         chunks = chunking.split(accumulated)
@@ -220,6 +243,23 @@ async def _validate_and_emit(c: str) -> bool:
             )

     except Exception as exc:
+        # Orchestrator is leaving — we must stop the backend producer too,
+        # otherwise mot._async_queue (maxsize=20) fills and the feeder task
+        # blocks indefinitely. The spec (#891, #901) calls this out for the
+        # "fail" path; the same reasoning applies to any unplanned exit.
+        try:
+            await mot.cancel_generation()
+        except Exception as cleanup_exc:
+            # Never let cleanup mask the original exception: log loudly and
+            # continue to surface `exc` to the consumer.
+            # TODO(#902): replace this log with an ErrorEvent emission.
+            MelleaLogger.get_logger().warning(
+                "stream_with_chunking: cancel_generation() raised during "
+                "exception cleanup (original: %r, cleanup: %r)",
+                exc,
+                cleanup_exc,
+            )
+        result.completed = False
         await result._chunk_queue.put(exc)
     finally:
         await result._chunk_queue.put(None)

test/stdlib/test_streaming.py

Lines changed: 193 additions & 8 deletions
@@ -433,14 +433,12 @@ def _capturing_copy(self: ChunkRecordingReq) -> ChunkRecordingReq:

     assert len(captured) == 1
     seen = captured[0].seen_chunks
-    # Three complete sentences → three separate stream_validate calls.
-    assert len(seen) == 3
-    # Each chunk is one sentence, not a prefix of accumulated text.
-    for chunk in seen:
-        assert chunk.endswith(".")
-    # Lengths must not be monotonically growing (which would indicate accumulated text).
-    # With per-chunk semantics, each chunk is roughly the same length as one sentence.
-    assert not all(len(seen[i]) < len(seen[i + 1]) for i in range(len(seen) - 1))
+    # Exact match: three separate calls, one per complete sentence,
+    # each call receiving that sentence and nothing more. Under the old
+    # accumulated-text semantics, seen would have been
+    # ["First sentence.", "First sentence. Second sentence.", ...] —
+    # exact match against the per-chunk list is the direct regression guard.
+    assert seen == ["First sentence.", "Second sentence.", "Third sentence."]


 @pytest.mark.asyncio
@@ -576,3 +574,190 @@ async def test_no_requirements_streams_without_validation() -> None:
     assert result.full_text == response
     assert result.final_validations == []
     assert result.streaming_failures == []
+
+
+@pytest.mark.asyncio
+async def test_multiple_chunks_in_one_batch_with_mid_batch_fail() -> None:
+    """When one astream() delta produces several complete chunks and one in
+    the middle fails, earlier chunks emit, failing chunk is recorded, later
+    chunks are neither validated nor emitted."""
+
+    captured: list[Any] = []
+
+    class FailOnNthChunk(Requirement):
+        def __init__(self, n: int) -> None:
+            self._n = n
+            self._calls = 0
+            self.seen: list[str] = []
+
+        def __copy__(self) -> "FailOnNthChunk":
+            clone = FailOnNthChunk(self._n)
+            captured.append(clone)
+            return clone
+
+        def format_for_llm(self) -> str:
+            return f"fail on chunk {self._n}"
+
+        async def stream_validate(
+            self, chunk: str, *, backend: Any, ctx: Any
+        ) -> PartialValidationResult:
+            _ = backend, ctx
+            self._calls += 1
+            self.seen.append(chunk)
+            if self._calls == self._n:
+                return PartialValidationResult("fail", reason=f"n={self._n}")
+            return PartialValidationResult("unknown")
+
+        async def validate(
+            self,
+            backend: Any,
+            ctx: Any,
+            *,
+            format: Any = None,
+            model_options: Any = None,
+        ) -> ValidationResult:
+            _ = backend, ctx, format, model_options
+            return ValidationResult(result=True)
+
+    # token_size larger than the whole response → one astream() delta delivers
+    # the full text, so chunking.split produces 4 sentences in a single batch.
+    response = "One. Two. Three. Four. "
+    backend = StreamingMockBackend(response, token_size=100)
+    req = FailOnNthChunk(n=2)
+
+    result = await stream_with_chunking(
+        _action(), backend, _ctx(), quick_check_requirements=[req], chunking="sentence"
+    )
+    yielded: list[str] = []
+    async for c in result.astream():
+        yielded.append(c)
+    await result.acomplete()
+
+    assert result.completed is False
+    assert len(result.streaming_failures) == 1
+    # Chunk 1 was validated and emitted; chunk 2 was validated and failed
+    # (NOT emitted); chunks 3 and 4 were NEITHER validated NOR emitted.
+    assert yielded == ["One."]
+    assert len(captured) == 1
+    assert captured[0].seen == ["One.", "Two."]
+    assert captured[0]._calls == 2
+
+
+@pytest.mark.asyncio
+async def test_cancel_generation_invoked_on_fail() -> None:
+    """Early exit on 'fail' must call mot.cancel_generation() — the spec reason
+    is that asyncio.Queue(maxsize=20) will block the producer if the consumer
+    stops without cancelling."""
+
+    from mellea.core.base import ModelOutputThunk
+
+    response = "word " * 50
+    backend = StreamingMockBackend(response, token_size=3)
+
+    class FailOnFirstChunk(Requirement):
+        def format_for_llm(self) -> str:
+            return "fail immediately"
+
+        async def stream_validate(
+            self, chunk: str, *, backend: Any, ctx: Any
+        ) -> PartialValidationResult:
+            _ = chunk, backend, ctx
+            return PartialValidationResult("fail", reason="nope")
+
+        async def validate(
+            self,
+            backend: Any,
+            ctx: Any,
+            *,
+            format: Any = None,
+            model_options: Any = None,
+        ) -> ValidationResult:
+            _ = backend, ctx, format, model_options
+            return ValidationResult(result=True)
+
+    call_count = 0
+    real_cancel = ModelOutputThunk.cancel_generation
+
+    async def spy_cancel(self: ModelOutputThunk) -> None:
+        nonlocal call_count
+        call_count += 1
+        await real_cancel(self)
+
+    ModelOutputThunk.cancel_generation = spy_cancel  # type: ignore[method-assign]
+    try:
+        result = await stream_with_chunking(
+            _action(),
+            backend,
+            _ctx(),
+            quick_check_requirements=[FailOnFirstChunk()],
+            chunking="word",
+        )
+        await asyncio.wait_for(result.acomplete(), timeout=5.0)
+    finally:
+        ModelOutputThunk.cancel_generation = real_cancel  # type: ignore[method-assign]

+    assert result.completed is False
+    assert call_count >= 1
+
+
+@pytest.mark.asyncio
+async def test_exception_in_stream_validate_cancels_generation() -> None:
+    """If stream_validate raises, the orchestrator must still call
+    cancel_generation() — otherwise the backend producer blocks on the
+    (maxsize=20) queue — and surface the exception to the consumer via
+    astream()/acomplete()."""
+
+    from mellea.core.base import ModelOutputThunk
+
+    class RaisingReq(Requirement):
+        def format_for_llm(self) -> str:
+            return "raises"
+
+        async def stream_validate(
+            self, chunk: str, *, backend: Any, ctx: Any
+        ) -> PartialValidationResult:
+            _ = chunk, backend, ctx
+            raise ValueError("boom")
+
+        async def validate(
+            self,
+            backend: Any,
+            ctx: Any,
+            *,
+            format: Any = None,
+            model_options: Any = None,
+        ) -> ValidationResult:
+            _ = backend, ctx, format, model_options
+            return ValidationResult(result=True)
+
+    response = "word " * 50  # enough to fill maxsize=20 queue without cleanup
+    backend = StreamingMockBackend(response, token_size=3)
+
+    call_count = 0
+    real_cancel = ModelOutputThunk.cancel_generation
+
+    async def spy_cancel(self: ModelOutputThunk) -> None:
+        nonlocal call_count
+        call_count += 1
+        await real_cancel(self)
+
+    ModelOutputThunk.cancel_generation = spy_cancel  # type: ignore[method-assign]
+    try:
+        result = await stream_with_chunking(
+            _action(),
+            backend,
+            _ctx(),
+            quick_check_requirements=[RaisingReq()],
+            chunking="word",
+        )
+        with pytest.raises(ValueError, match="boom"):
+            async for _chunk in result.astream():
+                pass
+        # acomplete must complete (not hang) even though the orchestration
+        # task raised, because cancel_generation was called in the except path.
+        await asyncio.wait_for(result.acomplete(), timeout=5.0)
+    finally:
+        ModelOutputThunk.cancel_generation = real_cancel  # type: ignore[method-assign]
+
+    assert result.completed is False
+    assert call_count >= 1
