 from __future__ import annotations

-import os
 import threading
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass, field


 _pool: ThreadPoolExecutor | None = None
+_pool_size: int = 0
 _pool_lock = threading.Lock()


-def _get_pool() -> ThreadPoolExecutor:
-    """Get or create the module-level thread pool for codec compute."""
-    global _pool
-    if _pool is None:
+def _resolve_max_workers() -> int:
+    """Resolve ``codec_pipeline.max_workers`` config to an effective worker count.
+
+    ``None`` means "auto" → ``os.cpu_count()`` (or 1 if unavailable).
+    Values < 1 are clamped to 1 (sequential).
+    """
+    import os as _os
+
+    cfg = config.get("codec_pipeline.max_workers", default=None)
+    if cfg is None:
+        return _os.cpu_count() or 1
+    return max(1, int(cfg))
+
+
+def _get_pool(max_workers: int) -> ThreadPoolExecutor:
+    """Get or create the module-level thread pool, sized to ``max_workers``.
+
+    The pool grows on demand — if a request arrives for more workers than
+    the current pool has, the existing pool is shut down and replaced.
+    Shrinking requests reuse the existing larger pool (it just leaves
+    workers idle).
+
+    Callers that want sequential execution should not call this — they
+    should run the task list inline. ``max_workers`` must be >= 1.
+    """
+    global _pool, _pool_size
+    if max_workers < 1:
+        raise ValueError(f"max_workers must be >= 1, got {max_workers}")
+    if _pool is None or _pool_size < max_workers:
         with _pool_lock:
-            if _pool is None:
-                max_workers = os.cpu_count() or 4
+            if _pool is None or _pool_size < max_workers:
+                if _pool is not None:
+                    _pool.shutdown(wait=False)
                 _pool = ThreadPoolExecutor(max_workers=max_workers)
+                _pool_size = max_workers
     return _pool

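A quick sketch of how the config resolution above is intended to behave (illustrative only, not part of the diff): a plain dict stands in for zarr's donfig-backed `config` object so the snippet runs standalone.

```python
# Standalone sketch of the _resolve_max_workers rules: None means "auto"
# (one worker per CPU), anything below 1 is clamped to sequential.
import os

settings = {"codec_pipeline.max_workers": None}  # stand-in for the real config

def resolve_max_workers() -> int:
    cfg = settings["codec_pipeline.max_workers"]
    if cfg is None:
        return os.cpu_count() or 1   # auto
    return max(1, int(cfg))          # clamp < 1 to 1 (sequential)

print(resolve_max_workers())         # e.g. 8 on an 8-core machine

settings["codec_pipeline.max_workers"] = 0
print(resolve_max_workers())         # 1 -> run chunks inline, no pool
```

Because `_get_pool` only ever grows the executor, later calls with the same or a smaller `max_workers` reuse it rather than churning threads.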
@@ -897,15 +924,19 @@ def read_sync(
         batch_info: Iterable[tuple[ByteGetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
         out: NDBuffer,
         drop_axes: tuple[int, ...] = (),
-        n_workers: int = 0,
+        max_workers: int = 1,
     ) -> tuple[GetResult, ...]:
         """Synchronous read: fetch -> decode -> scatter, per chunk.

-        When ``n_workers > 0`` and there are multiple chunks, each
+        When ``max_workers > 1`` and there are multiple chunks, each
         chunk's full lifecycle (fetch + decode + scatter) runs as one
-        task on the module-level thread pool — overlapping IO of one
-        chunk with decode/scatter of another. Scatter is thread-safe
-        because the chunks have non-overlapping output selections.
+        task on a thread pool sized to ``max_workers`` — overlapping IO
+        of one chunk with decode/scatter of another. Scatter is
+        thread-safe because the chunks have non-overlapping output
+        selections.
+
+        ``max_workers=1`` runs everything sequentially in the calling
+        thread (no pool involvement).

         Mirrors ``BatchedCodecPipeline.read_batch``: when the AB codec
         supports partial decoding (e.g. sharding), the codec handles its
@@ -943,8 +974,8 @@ def _read_one_partial(
             out[out_selection] = decoded
             return GetResult(status="present")

-        if n_workers > 0 and len(batch) > 1:
-            pool = _get_pool()
+        if max_workers > 1 and len(batch) > 1:
+            pool = _get_pool(max_workers)
             return tuple(pool.map(_read_one_partial, batch))
         return tuple(_read_one_partial(item) for item in batch)

@@ -964,8 +995,8 @@ def _read_one(
             out[out_selection] = selected
             return GetResult(status="present")

-        if n_workers > 0 and len(batch) > 1:
-            pool = _get_pool()
+        if max_workers > 1 and len(batch) > 1:
+            pool = _get_pool(max_workers)
             return tuple(pool.map(_read_one, batch))
         return tuple(_read_one(item) for item in batch)

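For readers skimming the dispatch logic above: each pool task owns one chunk end to end, and the concurrent scatters are safe because the chunks target disjoint slices of `out`. A toy illustration of that model (not zarr code; the byte "decoding" here is made up):

```python
# Two "chunks" scatter into non-overlapping slices of a shared output array,
# so running them on a thread pool needs no locking around the writes.
from concurrent.futures import ThreadPoolExecutor
import numpy as np

out = np.zeros(8, dtype=np.int64)
batch = [(slice(0, 4), bytes([1, 2, 3, 4])), (slice(4, 8), bytes([5, 6, 7, 8]))]

def read_one(item):
    out_selection, raw = item
    decoded = np.frombuffer(raw, dtype=np.uint8).astype(np.int64)  # "fetch + decode"
    out[out_selection] = decoded                                   # "scatter"

with ThreadPoolExecutor(max_workers=2) as pool:
    list(pool.map(read_one, batch))

print(out)  # [1 2 3 4 5 6 7 8]
```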
@@ -974,14 +1005,17 @@ def write_sync(
         batch_info: Iterable[tuple[ByteSetter, ArraySpec, SelectorTuple, SelectorTuple, bool]],
         value: NDBuffer,
         drop_axes: tuple[int, ...] = (),
-        n_workers: int = 0,
+        max_workers: int = 1,
     ) -> None:
         """Synchronous write: fetch existing -> merge+encode -> store.

-        When ``n_workers > 0`` and there are multiple chunks, each
+        When ``max_workers > 1`` and there are multiple chunks, each
         chunk's full lifecycle (get-existing + merge + encode + set/delete)
-        runs as one task on the module-level thread pool — overlapping
-        IO of one chunk with compute of another.
+        runs as one task on a thread pool sized to ``max_workers`` —
+        overlapping IO of one chunk with compute of another.
+
+        ``max_workers=1`` runs everything sequentially in the calling
+        thread (no pool involvement).

         When the codec pipeline supports partial encoding (e.g. a
         sharding codec with no outer AA/BB codecs), the AB codec handles
@@ -1010,8 +1044,8 @@ def _encode_one_partial(
             chunk_value = value if scalar else value[out_selection]
             codec._encode_partial_sync(bs, chunk_value, chunk_selection, chunk_spec)

-        if n_workers > 0 and len(batch) > 1:
-            pool = _get_pool()
+        if max_workers > 1 and len(batch) > 1:
+            pool = _get_pool(max_workers)
             # consume the iterator to surface exceptions
             list(pool.map(_encode_one_partial, batch))
         else:
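The `list(pool.map(...))` pattern above exists because `Executor.map` is lazy about results: a worker's exception only propagates to the caller when the corresponding result is consumed. A standalone demonstration (not zarr code):

```python
from concurrent.futures import ThreadPoolExecutor

def encode_one(_):
    raise RuntimeError("encode failed")

with ThreadPoolExecutor(max_workers=2) as pool:
    results = pool.map(encode_one, range(3))  # tasks run, but nothing is raised yet
    try:
        list(results)                          # consuming the iterator surfaces the error
    except RuntimeError as exc:
        print("caught:", exc)                  # caught: encode failed
```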
@@ -1054,8 +1088,8 @@ def _write_one(
             else:
                 bs.set_sync(encoded)

-        if n_workers > 0 and len(batch) > 1:
-            pool = _get_pool()
+        if max_workers > 1 and len(batch) > 1:
+            pool = _get_pool(max_workers)
             list(pool.map(_write_one, batch))
         else:
             for item in batch:
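And a toy version of the per-chunk write lifecycle that each `_write_one` task owns: get the existing chunk, merge the new selection into it, re-encode, and store it back. A sketch with a dict standing in for the store; the names are illustrative, not zarr's.

```python
# Read-modify-write for one chunk: decode existing bytes, merge, re-encode, set.
import numpy as np

store = {"c/0": np.arange(4, dtype=np.int64).tobytes()}  # existing encoded chunk

def write_one(key, chunk_selection, new_values):
    existing = np.frombuffer(store[key], dtype=np.int64).copy()  # "get + decode"
    existing[chunk_selection] = new_values                       # "merge"
    store[key] = existing.tobytes()                              # "encode + set"

write_one("c/0", slice(1, 3), np.array([10, 11], dtype=np.int64))
print(np.frombuffer(store["c/0"], dtype=np.int64))  # [ 0 10 11  3]
```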
@@ -1083,9 +1117,7 @@ async def read(
             and isinstance(first_bg, StorePath)
             and isinstance(first_bg.store, SupportsGetSync)
         ):
-            return self.read_sync(
-                batch, out, drop_axes, n_workers=int(config.get("async.concurrency") or 0)
-            )
+            return self.read_sync(batch, out, drop_axes, max_workers=_resolve_max_workers())

         # Async fallback: fetch all chunks, decode via async codec API, scatter
         chunk_bytes_batch = await concurrent_map(
@@ -1134,9 +1166,7 @@ async def write(
             and isinstance(first_bs, StorePath)
             and isinstance(first_bs.store, SupportsSetSync)
         ):
-            self.write_sync(
-                batch, value, drop_axes, n_workers=int(config.get("async.concurrency") or 0)
-            )
+            self.write_sync(batch, value, drop_axes, max_workers=_resolve_max_workers())
             return

         # Async fallback: same pattern as BatchedCodecPipeline.write_batch
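End to end, the knob that drives all of this is the new `codec_pipeline.max_workers` config key. Assuming it is exposed through zarr's usual donfig-based `zarr.config` like the other `codec_pipeline.*` settings, usage would look roughly like:

```python
import zarr

# Hypothetical usage of the new setting; the exact entry point depends on
# how the config key is registered, but donfig accepts plain dicts like this.
zarr.config.set({"codec_pipeline.max_workers": 4})     # cap the chunk pool at 4 threads
zarr.config.set({"codec_pipeline.max_workers": 0})     # clamped to 1 -> sequential
zarr.config.set({"codec_pipeline.max_workers": None})  # auto: os.cpu_count()
```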