From d727208560f017e40ae8200a9e3a146708f8b74f Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Tue, 21 Apr 2026 13:26:30 -0700
Subject: [PATCH 01/19] fix: tag tritonfrontend wheel with build platform
 (TRI-983)

The tritonfrontend wheel ships an arch-specific CPython extension
(tritonfrontend/_c/<pybind>.so) but is produced with the default
"py3-none-any" tag (platform tag "any"), which violates PEP 425 and
breaks hash-locked package managers (Poetry, pip-tools, uv) that see
two wheels with the same filename but different SHA256 across arches.

setup.py already honors a --plat-name flag and sets
root_is_pure = False, but build_wheel.py never passed one. Derive the
platform via sysconfig.get_platform() and forward it so the wheel is
tagged e.g. linux_x86_64 / linux_aarch64.

Refs: NVBug 6098081, JIRA DLIS-8648, Linear TRI-983
---
 src/python/build_wheel.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/src/python/build_wheel.py b/src/python/build_wheel.py
index 875dd32a70..fd1fde7459 100755
--- a/src/python/build_wheel.py
+++ b/src/python/build_wheel.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -32,6 +32,7 @@
 import shutil
 import subprocess
 import sys
+import sysconfig
 from distutils.dir_util import copy_tree
 from tempfile import mkstemp
 
@@ -115,8 +116,13 @@ def main():
     shutil.copyfile("setup.py", os.path.join(FLAGS.whl_dir, "setup.py"))
 
     os.chdir(FLAGS.whl_dir)
+    # The wheel ships an arch-specific CPython extension
+    # (tritonfrontend/_c/<pybind>.so). Pass --plat-name so the wheel is
+    # tagged with the current platform (e.g. linux_x86_64 / linux_aarch64)
+    # instead of the misleading "none-any".
+    plat_name = sysconfig.get_platform().replace("-", "_").replace(".", "_")
     print("=== Building wheel")
-    args = ["python3", "setup.py", "bdist_wheel"]
+    args = ["python3", "setup.py", "bdist_wheel", "--plat-name", plat_name]
 
     wenv = os.environ.copy()
     wenv["VERSION"] = FLAGS.triton_version

From af8a6f1db57fe7e69a9f49fd447d3b79dbde0291 Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Tue, 21 Apr 2026 13:47:48 -0700
Subject: [PATCH 02/19] fix: run auditwheel on tritonfrontend wheel (TRI-983)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Raw linux_x86_64 / linux_aarch64 wheels are not accepted by canonical
PyPI — the platform tag must be manylinux_2_X_<arch>. Port the pattern
established for tritonclient in TRI-286: after bdist_wheel emits a
linux_<arch> wheel, run `auditwheel repair` to auto-discover the
minimum manylinux tag from the embedded .so's glibc symbol
dependencies, with a `python -m wheel tags --platform-tag
manylinux_2_28_<arch>` fallback for the "no ELF" pure-Python case
(documented in TRI-286 follow-up).

When auditwheel is not available on PATH (e.g. local non-container
builds), keep the linux_<arch> wheel and log a warning so builds do
not regress; the Poetry / pip-tools lock-file problem is already
solved by the distinct filename.

Also install `auditwheel` in the buildbase stage via build.py so the
container build image has the tool the wheel script expects.

Leaves a NOTE in setup.py.get_tag: the embedded binding .so is
CPython-ABI-specific, so the wheel will need cp<XY>-cp<XY> python+abi
tags once consumers are ready to gate installs on the exact
interpreter version.

Refs: NVBug 6098081, JIRA DLIS-8648, Linear TRI-983, TRI-286
---
 build.py                  |  3 +-
 src/python/build_wheel.py | 76 ++++++++++++++++++++++++++++++++++++++-
 src/python/setup.py       | 10 +++++-
 3 files changed, 86 insertions(+), 3 deletions(-)

diff --git a/build.py b/build.py
index 774728f189..f8777f7e30 100755
--- a/build.py
+++ b/build.py
@@ -1002,7 +1002,8 @@ def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap):
           docker \\
           virtualenv \\
           patchelf==0.17.2 \\
-          cmake==4.0.3
+          cmake==4.0.3 \\
+          auditwheel
 """
     df += f"""
 # Install boost version >= 1.78 for boost::span
diff --git a/src/python/build_wheel.py b/src/python/build_wheel.py
index fd1fde7459..6e0789fe8c 100755
--- a/src/python/build_wheel.py
+++ b/src/python/build_wheel.py
@@ -131,11 +131,85 @@ def main():
     p.wait()
     fail_if(p.returncode != 0, "setup.py failed")
 
-    cpdir("dist", FLAGS.dest_dir)
+    # Post-process with auditwheel so the wheel is tagged with a proper
+    # manylinux_2_X_<arch> platform (required by canonical PyPI). When
+    # auditwheel is unavailable in the build image we keep the
+    # linux_<arch> wheel and emit a warning; the Poetry/pip lock-file
+    # problem is already solved by the distinct filename, and the tag can
+    # be fixed up in a follow-up publish step if needed.
+    _repair_wheel_with_auditwheel(FLAGS.whl_dir, FLAGS.dest_dir)
 
     print(f"=== Output wheel file is in: {FLAGS.dest_dir}")
     touch(os.path.join(FLAGS.dest_dir, "stamp.whl"))
 
 
+def _repair_wheel_with_auditwheel(whl_dir, dest_dir):
+    """Upgrade a linux_<arch> wheel to manylinux_2_X_<arch>.
+
+    Ports the pattern established for tritonclient in TRI-286:
+      1. auditwheel repair   — auto-discovers the minimum manylinux tag
+         by inspecting glibc symbol requirements of the embedded .so.
+      2. python -m wheel tags fallback — used when auditwheel reports
+         "no ELF" (the wheel has no native extension, e.g. a downstream
+         build disabled bindings). Mirrors the documented fallback.
+      3. No-op with warning — when auditwheel is not installed in the
+         build image, keep the linux_<arch> wheel as-is so the build
+         does not regress.
+    """
+    if shutil.which("auditwheel") is None:
+        print(
+            "=== WARNING: auditwheel not found on PATH; keeping linux_<arch> "
+            "wheel as-is. Install auditwheel in the build image to produce "
+            "PyPI-acceptable manylinux_2_X_<arch> wheels.",
+            file=sys.stderr,
+        )
+        cpdir("dist", dest_dir)
+        return
+
+    dist_dir = os.path.join(whl_dir, "dist")
+    wheels = [
+        os.path.join(dist_dir, w) for w in os.listdir(dist_dir) if w.endswith(".whl")
+    ]
+    fail_if(not wheels, "no wheel produced by setup.py")
+
+    for wheel_path in wheels:
+        print(f"=== Running auditwheel repair on {wheel_path}")
+        r = subprocess.run(
+            ["auditwheel", "repair", wheel_path, "--wheel-dir", dest_dir],
+            capture_output=True,
+            text=True,
+        )
+        # `auditwheel` logs via Python's logging module, which writes to
+        # stderr — the "no ELF" sentinel only appears there, not in
+        # stdout. See TRI-286 root-cause write-up.
+        if r.returncode != 0 and "no ELF" in r.stderr:
+            arch = os.uname().machine
+            manylinux_tag = f"manylinux_2_28_{arch}"
+            print(
+                f"=== Pure-Python wheel detected; falling back to wheel tags "
+                f"({manylinux_tag})"
+            )
+            copied = os.path.join(dest_dir, os.path.basename(wheel_path))
+            shutil.copy(wheel_path, copied)
+            # `wheel tags --remove` replaces the linux_<arch> wheel in
+            # dest_dir with the correctly-tagged manylinux one.
+            r2 = subprocess.run(
+                [
+                    "python3",
+                    "-m",
+                    "wheel",
+                    "tags",
+                    "--platform-tag",
+                    manylinux_tag,
+                    "--remove",
+                    copied,
+                ]
+            )
+            fail_if(r2.returncode != 0, "wheel tags fallback failed")
+        elif r.returncode != 0:
+            sys.stderr.write(r.stderr)
+            fail_if(True, "auditwheel repair failed")
+
+
 if __name__ == "__main__":
     main()
diff --git a/src/python/setup.py b/src/python/setup.py
index 2c7c12a9ee..d04c5aeac8 100755
--- a/src/python/setup.py
+++ b/src/python/setup.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -49,6 +49,14 @@ def finalize_options(self):
             self.root_is_pure = False
 
         def get_tag(self):
+            # NOTE: the wheel bundles a CPython-ABI-specific binding
+            # (tritonfrontend/_c/<pybind>.so, e.g. "cpython-312-..."),
+            # which means it is only loadable under the matching
+            # interpreter. We currently emit a "py3-none-<plat>" tag
+            # for backwards compatibility with consumers that expect
+            # the existing filename shape; promote to "cp<XY>-cp<XY>"
+            # when we are ready to gate installs on the exact CPython
+            # version (see TRI-983).
             pyver, abi, plat = "py3", "none", PLATFORM_FLAG
             return pyver, abi, plat
 

From 863010965d6cd9a62c6016ed0423f0af4d3c04df Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Tue, 21 Apr 2026 13:59:47 -0700
Subject: [PATCH 03/19] fix: tag tritonfrontend wheel with CI build number
 (TRI-983)

Adopt PEP 427's optional build-tag slot so two wheels of the same
version (e.g. successive reruns of a CI pipeline) can coexist in the
same index without filename collision. Preferred source is GitLab's
CI_PIPELINE_ID with a BUILD_NUMBER fallback for other CI systems;
both are guaranteed to start with a digit as required by PEP 427.

Matches the build-tag slot already used by the RHEL .zip artifact
naming convention in .gitlab-ci.yml. Build-arg handoff through
build.py is a separate follow-up; this change is a no-op in local
non-CI builds since neither env var is set.

Refs: NVBug 6098081, JIRA DLIS-8648, Linear TRI-983
---
 src/python/build_wheel.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/python/build_wheel.py b/src/python/build_wheel.py
index 6e0789fe8c..d700e9dcea 100755
--- a/src/python/build_wheel.py
+++ b/src/python/build_wheel.py
@@ -123,6 +123,14 @@ def main():
     plat_name = sysconfig.get_platform().replace("-", "_").replace(".", "_")
     print("=== Building wheel")
     args = ["python3", "setup.py", "bdist_wheel", "--plat-name", plat_name]
+    # PEP 427 "build tag": an optional numeric segment between version
+    # and python-tag that lets two wheels of the same version coexist
+    # (e.g. reruns of the same CI pipeline). Preferred source is
+    # CI_PIPELINE_ID (GitLab) with a BUILD_NUMBER fallback — both are
+    # guaranteed to start with a digit as required by PEP 427.
+    build_number = os.environ.get("CI_PIPELINE_ID") or os.environ.get("BUILD_NUMBER")
+    if build_number:
+        args += ["--build-number", build_number]
 
     wenv = os.environ.copy()
     wenv["VERSION"] = FLAGS.triton_version

From e5bf24619a8f39eff158b262ada8c26864970fbe Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Tue, 21 Apr 2026 14:49:38 -0700
Subject: [PATCH 04/19] fix: auto-derive python/ABI tags, compose local
 version, propagate env (TRI-983)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three related fixes to make the tritonfrontend wheel filename match
the existing RHEL zip artifact convention and match the tritonserver
wheel's auto-derivation behaviour:

1. Remove the get_tag override in src/python/setup.py so setuptools
   derives cp<XY>-cp<XY>-<plat> automatically from root_is_pure=False.
   The override was hard-coding "py3-none-<PLATFORM_FLAG>", which
   under-specifies the wheel: the embedded binding .so is
   CPython-ABI-specific and fails to load on any other interpreter.
   Drop the stale --plat-name argv parsing along with it — bdist_wheel's
   stock finalize_options already picks up the flag forwarded by
   build_wheel.py.

2. Compose a PEP 440 local-version segment in build_wheel.py via a new
   _compose_version() helper. Appends "+nv<NVIDIA_UPSTREAM_VERSION>"
   and ".cu<MAJORMINOR>" when the corresponding env vars are set, so
   the wheel filename carries the same nv<X>.cu<Y> identifiers already
   used by the RHEL .zip artifact naming in .gitlab-ci.yml.

3. Propagate the wheel-naming env vars from the host into the build
   container via "-e NAME" on the docker-run invocation in build.py.
   CI_PIPELINE_ID and BUILD_NUMBER feed the PEP 427 build-tag slot;
   NVIDIA_UPSTREAM_VERSION and CUDA_VERSION feed the local-version
   segment.

Also adds wheel, setuptools, and auditwheel to the Ubuntu buildbase
pip install list (they were missing from the non-RHEL path, which is
why the first pipeline produced linux_<arch> instead of the expected
manylinux_2_28_<arch> tag).

Expected wheel filename under full CI with auditwheel present:
  tritonfrontend-<TRITON_VERSION>+nv<NV>.cu<CUDA>-<CI_PIPELINE_ID>-cp<XY>-cp<XY>-manylinux_2_28_<arch>.whl

Refs: NVBug 6098081, JIRA DLIS-8648, Linear TRI-983, TRI-286
---
 build.py                  | 22 ++++++++++++++++++++++
 src/python/build_wheel.py | 34 +++++++++++++++++++++++++++++++++-
 src/python/setup.py       | 25 +++++++------------------
 3 files changed, 62 insertions(+), 19 deletions(-)

diff --git a/build.py b/build.py
index f8777f7e30..177b886b94 100755
--- a/build.py
+++ b/build.py
@@ -1113,10 +1113,13 @@ def create_dockerfile_buildbase(ddir, dockerfile_name, argmap):
 
 RUN pip3 install --upgrade \\
           build \\
+          wheel \\
+          setuptools \\
           docker \\
           virtualenv \\
           patchelf==0.17.2 \\
           cmake==4.0.3 \\
+          auditwheel \\
           pybind11[global]
 """
 
@@ -1807,6 +1810,25 @@ def create_docker_build_script(script_name, container_install_dir, container_ci_
             "tritonserver_builder",
         ]
 
+        # Propagate wheel-naming env vars from the host (CI runner or
+        # local shell) into the build container so build_wheel.py can
+        # compose the full wheel filename: CI_PIPELINE_ID feeds the
+        # PEP 427 build-tag slot; NVIDIA_UPSTREAM_VERSION and
+        # CUDA_VERSION feed the PEP 440 local-version segment
+        # (+nv<X>.cu<Y>). See TRI-983. The "-e NAME" form inherits
+        # the value from the host env without naming the value, so
+        # unset vars simply propagate as unset.
+        runargs += [
+            "-e",
+            "CI_PIPELINE_ID",
+            "-e",
+            "BUILD_NUMBER",
+            "-e",
+            "NVIDIA_UPSTREAM_VERSION",
+            "-e",
+            "CUDA_VERSION",
+        ]
+
         if not FLAGS.no_container_interactive:
             runargs += ["-it"]
 
diff --git a/src/python/build_wheel.py b/src/python/build_wheel.py
index d700e9dcea..0dd7b7813e 100755
--- a/src/python/build_wheel.py
+++ b/src/python/build_wheel.py
@@ -55,6 +55,38 @@ def cpdir(src, dest):
     copy_tree(src, dest, preserve_symlinks=1)
 
 
+def _compose_version(base_version):
+    """Compose the full wheel version string.
+
+    The base version comes from TRITON_VERSION and may already include a
+    PEP 440 pre-release suffix (e.g. "2.69.0.dev0"). Append a PEP 440
+    local-version segment describing the NVIDIA container release and
+    CUDA toolkit the wheel was built against, so consumers can tell an
+    nv26.04 wheel from an nv26.05 wheel (same upstream Triton version)
+    and a cu132 wheel from a cu128 wheel. Per PEP 440, pip ignores the local
+    segment for specifiers such as "==2.69.0" but still uses it for ordering.
+
+    Sources:
+      NVIDIA_UPSTREAM_VERSION  - set by GitLab CI (e.g. "26.04")
+      CUDA_VERSION             - set by the CUDA base image (e.g. "13.2")
+    Both are optional; if neither is present the version is returned
+    unchanged so local non-CI builds stay stable.
+    """
+    local = []
+    nv = os.environ.get("NVIDIA_UPSTREAM_VERSION")
+    if nv:
+        local.append(f"nv{nv}")
+    cuda = os.environ.get("CUDA_VERSION")
+    if cuda:
+        # "13.2" / "13.2.0" / "13.2.1" -> "cu132"
+        parts = cuda.split(".")
+        if len(parts) >= 2 and parts[0].isdigit() and parts[1].isdigit():
+            local.append(f"cu{parts[0]}{parts[1]}")
+    if local:
+        return f"{base_version}+{'.'.join(local)}"
+    return base_version
+
+
 def sed(pattern, replace, source, dest=None):
     name = None
     if dest:
@@ -133,7 +165,7 @@ def main():
         args += ["--build-number", build_number]
 
     wenv = os.environ.copy()
-    wenv["VERSION"] = FLAGS.triton_version
+    wenv["VERSION"] = _compose_version(FLAGS.triton_version)
     wenv["TRITON_PYBIND"] = PYBIND_LIB
     p = subprocess.Popen(args, env=wenv)
     p.wait()
diff --git a/src/python/setup.py b/src/python/setup.py
index d04c5aeac8..6dd3ce1dae 100755
--- a/src/python/setup.py
+++ b/src/python/setup.py
@@ -26,20 +26,21 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import os
-import sys
 
 from setuptools import find_packages, setup
 
-if "--plat-name" in sys.argv:
-    PLATFORM_FLAG = sys.argv[sys.argv.index("--plat-name") + 1]
-else:
-    PLATFORM_FLAG = "any"
-
 if "VERSION" not in os.environ:
     raise Exception("envvar VERSION must be specified")
 
 VERSION = os.environ["VERSION"]
 
+# The wheel bundles a CPython-ABI-specific binding
+# (tritonfrontend/_c/<pybind>.so, filename encodes e.g. "cpython-312-..."),
+# so the wheel is only loadable under the matching interpreter and arch.
+# Marking the root impure lets setuptools/wheel auto-derive the correct
+# python/ABI/platform tags (e.g. "cp312-cp312-linux_x86_64") instead of
+# the misleading "py3-none-any" fallback. The --plat-name flag forwarded
+# by build_wheel.py is picked up by bdist_wheel's stock finalize_options.
 try:
     from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
 
@@ -48,18 +49,6 @@ def finalize_options(self):
             _bdist_wheel.finalize_options(self)
             self.root_is_pure = False
 
-        def get_tag(self):
-            # NOTE: the wheel bundles a CPython-ABI-specific binding
-            # (tritonfrontend/_c/<pybind>.so, e.g. "cpython-312-..."),
-            # which means it is only loadable under the matching
-            # interpreter. We currently emit a "py3-none-<plat>" tag
-            # for backwards compatibility with consumers that expect
-            # the existing filename shape; promote to "cp<XY>-cp<XY>"
-            # when we are ready to gate installs on the exact CPython
-            # version (see TRI-983).
-            pyver, abi, plat = "py3", "none", PLATFORM_FLAG
-            return pyver, abi, plat
-
 except ImportError:
     bdist_wheel = None
 

From 34e599e4d7bd120a51e99e6c139347b9a3b0c2e0 Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Tue, 21 Apr 2026 14:55:00 -0700
Subject: [PATCH 05/19] fix: source wheel build-tag from --build-id and detect
 CUDA locally (TRI-983)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two refinements on the env-propagation path for wheel naming:

1. Use NVIDIA_BUILD_ID (from --build-id) instead of a separate
   CI_PIPELINE_ID / BUILD_NUMBER env var. .gitlab-ci.yml already
   passes `--build-id=${CI_JOB_ID}` to build.py per the existing
   Triton convention, so the wheel build-tag slot now aligns with the
   same identifier used elsewhere in the Triton build system instead
   of introducing a parallel env var. build.py forwards FLAGS.build_id
   into the build container via `-e NVIDIA_BUILD_ID=<value>` only
   when --build-id was supplied; build_wheel.py skips the build-tag
   slot when NVIDIA_BUILD_ID is unset or does not start with a digit
   (e.g. the "<unknown>" default), as PEP 427 requires a leading digit.

2. Stop propagating CUDA_VERSION via docker-run. The CUDA base image
   already exports CUDA_VERSION as an ENV inside the container, while
   the host / CI runner does not — `-e CUDA_VERSION` with an empty
   host value would override (and erase) the container value.
   build_wheel.py now reads CUDA_VERSION from the container-local env
   with a /usr/local/cuda/version.json fallback (canonical location
   for the installed toolkit).

Refs: NVBug 6098081, JIRA DLIS-8648, Linear TRI-983
---
 build.py                  | 42 +++++++++++++++++++--------------
 src/python/build_wheel.py | 49 +++++++++++++++++++++++++++++++--------
 2 files changed, 63 insertions(+), 28 deletions(-)

diff --git a/build.py b/build.py
index 177b886b94..b16aae5a2d 100755
--- a/build.py
+++ b/build.py
@@ -1810,24 +1810,30 @@ def create_docker_build_script(script_name, container_install_dir, container_ci_
             "tritonserver_builder",
         ]
 
-        # Propagate wheel-naming env vars from the host (CI runner or
-        # local shell) into the build container so build_wheel.py can
-        # compose the full wheel filename: CI_PIPELINE_ID feeds the
-        # PEP 427 build-tag slot; NVIDIA_UPSTREAM_VERSION and
-        # CUDA_VERSION feed the PEP 440 local-version segment
-        # (+nv<X>.cu<Y>). See TRI-983. The "-e NAME" form inherits
-        # the value from the host env without naming the value, so
-        # unset vars simply propagate as unset.
-        runargs += [
-            "-e",
-            "CI_PIPELINE_ID",
-            "-e",
-            "BUILD_NUMBER",
-            "-e",
-            "NVIDIA_UPSTREAM_VERSION",
-            "-e",
-            "CUDA_VERSION",
-        ]
+        # Propagate wheel-naming context from the host / CLI flags into
+        # the build container so build_wheel.py can compose the full
+        # wheel filename. See TRI-983.
+        #
+        # * NVIDIA_BUILD_ID — from --build-id (fed in CI by
+        #   `--build-id=${CI_JOB_ID}` per the existing Triton
+        #   convention). Feeds the PEP 427 build-tag slot between
+        #   version and python-tag.
+        # * NVIDIA_UPSTREAM_VERSION — set by GitLab CI as a top-level
+        #   pipeline variable. Feeds the PEP 440 local-version segment
+        #   "+nv<X>". The "-e NAME" form inherits the value from the
+        #   host env without naming it, so unset vars propagate as
+        #   unset.
+        #
+        # CUDA_VERSION is intentionally NOT propagated: the CUDA base
+        # image already sets it as an ENV inside the container, and
+        # the host/CI runner does not. Passing "-e CUDA_VERSION" with
+        # an empty host value would override (and erase) the
+        # container's value. build_wheel.py reads CUDA_VERSION from
+        # the container-local env (with a /usr/local/cuda/version.json
+        # fallback), which is where it is reliably set.
+        if FLAGS.build_id is not None:
+            runargs += ["-e", f"NVIDIA_BUILD_ID={FLAGS.build_id}"]
+        runargs += ["-e", "NVIDIA_UPSTREAM_VERSION"]
 
         if not FLAGS.no_container_interactive:
             runargs += ["-it"]
diff --git a/src/python/build_wheel.py b/src/python/build_wheel.py
index 0dd7b7813e..5bb32347ff 100755
--- a/src/python/build_wheel.py
+++ b/src/python/build_wheel.py
@@ -55,6 +55,32 @@ def cpdir(src, dest):
     copy_tree(src, dest, preserve_symlinks=1)
 
 
+def _detect_cuda_version():
+    """Detect the CUDA toolkit version visible to the build.
+
+    Prefers the CUDA_VERSION env var (set by official NVIDIA base
+    images); falls back to parsing /usr/local/cuda/version.json which
+    is the canonical location for the installed toolkit. Returns the
+    raw string (e.g. "13.2.1") or None when CUDA is not available.
+
+    CUDA_VERSION is only reliably set inside the build container (the
+    CUDA base image exports it) and must not be propagated from the
+    host — see the matching comment in build.py's docker-run
+    invocation.
+    """
+    v = os.environ.get("CUDA_VERSION")
+    if v:
+        return v
+    try:
+        import json as _json
+
+        with open("/usr/local/cuda/version.json") as f:
+            data = _json.load(f)
+        return data.get("cuda", {}).get("version")
+    except (OSError, ValueError, KeyError):
+        return None
+
+
 def _compose_version(base_version):
     """Compose the full wheel version string.
 
@@ -68,7 +94,7 @@ def _compose_version(base_version):
 
     Sources:
       NVIDIA_UPSTREAM_VERSION  - set by GitLab CI (e.g. "26.04")
-      CUDA_VERSION             - set by the CUDA base image (e.g. "13.2")
+      CUDA_VERSION / toolkit   - discovered by _detect_cuda_version()
     Both are optional; if neither is present the version is returned
     unchanged so local non-CI builds stay stable.
     """
@@ -76,7 +102,7 @@ def _compose_version(base_version):
     nv = os.environ.get("NVIDIA_UPSTREAM_VERSION")
     if nv:
         local.append(f"nv{nv}")
-    cuda = os.environ.get("CUDA_VERSION")
+    cuda = _detect_cuda_version()
     if cuda:
         # "13.2" / "13.2.0" / "13.2.1" -> "cu132"
         parts = cuda.split(".")
@@ -155,14 +181,17 @@ def main():
     plat_name = sysconfig.get_platform().replace("-", "_").replace(".", "_")
     print("=== Building wheel")
     args = ["python3", "setup.py", "bdist_wheel", "--plat-name", plat_name]
-    # PEP 427 "build tag": an optional numeric segment between version
-    # and python-tag that lets two wheels of the same version coexist
-    # (e.g. reruns of the same CI pipeline). Preferred source is
-    # CI_PIPELINE_ID (GitLab) with a BUILD_NUMBER fallback — both are
-    # guaranteed to start with a digit as required by PEP 427.
-    build_number = os.environ.get("CI_PIPELINE_ID") or os.environ.get("BUILD_NUMBER")
-    if build_number:
-        args += ["--build-number", build_number]
+    # PEP 427 "build tag": an optional segment between version and
+    # python-tag that lets two wheels of the same version coexist
+    # (e.g. reruns of the same CI job). Source is NVIDIA_BUILD_ID,
+    # which is set on the build container from --build-id; in CI
+    # .gitlab-ci.yml already passes `--build-id=${CI_JOB_ID}` so the
+    # value is a monotonic numeric ID. Skip the slot when the value
+    # does not start with a digit (required by PEP 427) or is the
+    # "<unknown>" default emitted for local builds without --build-id.
+    build_tag = os.environ.get("NVIDIA_BUILD_ID")
+    if build_tag and build_tag[:1].isdigit():
+        args += ["--build-number", build_tag]
 
     wenv = os.environ.copy()
     wenv["VERSION"] = _compose_version(FLAGS.triton_version)

From 24cf5e08aa8fbe5baaa6ac5ca94d746c0d39d1ca Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Tue, 21 Apr 2026 15:00:13 -0700
Subject: [PATCH 06/19] fix: source NVIDIA_UPSTREAM_VERSION from
 --upstream-container-version (TRI-983)

Switch the NVIDIA_UPSTREAM_VERSION passthrough to an explicit
-e NAME=VALUE form sourced from FLAGS.upstream_container_version,
matching the NVIDIA_BUILD_ID pattern introduced in the previous
commit. The value is well-defined in both CI (.gitlab-ci.yml passes
--upstream-container-version=${NVIDIA_UPSTREAM_VERSION}) and local
builds (falls back to DEFAULT_TRITON_VERSION_MAP's upstream default),
so the wheel's +nv<X> local-version segment is never empty by
accident. The previous "-e NAME" (inherit-from-host) form would have
left the variable unset in local builds where the env var is not
exported on the host, erasing any value set inside the container.

Refs: NVBug 6098081, JIRA DLIS-8648, Linear TRI-983
---
 build.py | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/build.py b/build.py
index b16aae5a2d..55bef073b6 100755
--- a/build.py
+++ b/build.py
@@ -1810,19 +1810,21 @@ def create_docker_build_script(script_name, container_install_dir, container_ci_
             "tritonserver_builder",
         ]
 
-        # Propagate wheel-naming context from the host / CLI flags into
-        # the build container so build_wheel.py can compose the full
-        # wheel filename. See TRI-983.
+        # Propagate wheel-naming context from build.py's CLI flags
+        # into the build container so build_wheel.py can compose the
+        # full wheel filename. See TRI-983. Both values come from
+        # FLAGS (the canonical source), not inherited from the host
+        # env, so they are defined in both CI and local builds:
         #
-        # * NVIDIA_BUILD_ID — from --build-id (fed in CI by
-        #   `--build-id=${CI_JOB_ID}` per the existing Triton
-        #   convention). Feeds the PEP 427 build-tag slot between
-        #   version and python-tag.
-        # * NVIDIA_UPSTREAM_VERSION — set by GitLab CI as a top-level
-        #   pipeline variable. Feeds the PEP 440 local-version segment
-        #   "+nv<X>". The "-e NAME" form inherits the value from the
-        #   host env without naming it, so unset vars propagate as
-        #   unset.
+        # * NVIDIA_BUILD_ID — from --build-id. In CI, .gitlab-ci.yml
+        #   passes `--build-id=${CI_JOB_ID}` per the existing Triton
+        #   convention. Feeds the PEP 427 build-tag slot between
+        #   version and python-tag. Skipped when --build-id was not
+        #   supplied (local builds).
+        # * NVIDIA_UPSTREAM_VERSION — from --upstream-container-version
+        #   (CI: `--upstream-container-version=${NVIDIA_UPSTREAM_VERSION}`;
+        #   local: defaults to the value in DEFAULT_TRITON_VERSION_MAP).
+        #   Feeds the PEP 440 local-version segment "+nv<X>".
         #
         # CUDA_VERSION is intentionally NOT propagated: the CUDA base
         # image already sets it as an ENV inside the container, and
@@ -1833,7 +1835,11 @@ def create_docker_build_script(script_name, container_install_dir, container_ci_
         # fallback), which is where it is reliably set.
         if FLAGS.build_id is not None:
             runargs += ["-e", f"NVIDIA_BUILD_ID={FLAGS.build_id}"]
-        runargs += ["-e", "NVIDIA_UPSTREAM_VERSION"]
+        if FLAGS.upstream_container_version:
+            runargs += [
+                "-e",
+                f"NVIDIA_UPSTREAM_VERSION={FLAGS.upstream_container_version}",
+            ]
 
         if not FLAGS.no_container_interactive:
             runargs += ["-it"]

From 63bbec0588cc4dadb4ce21763e4fbb655fee5d0d Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Tue, 21 Apr 2026 15:01:50 -0700
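Subject: [PATCH 07/19] Remove system package installation

Drop the apt-installed python3-wheel and python3-setuptools packages
from the Ubuntu buildbase image; wheel and setuptools are installed
there with pip3 instead.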
---
 build.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/build.py b/build.py
index 55bef073b6..2c59ac057a 100755
--- a/build.py
+++ b/build.py
@@ -1096,8 +1096,6 @@ def create_dockerfile_buildbase(ddir, dockerfile_name, argmap):
             libgoogle-perftools-dev \\
             python3-dev \\
             python3-pip \\
-            python3-wheel \\
-            python3-setuptools \\
             rapidjson-dev \\
             scons \\
             software-properties-common \\

From 64e2c8233082063db5d349526ddf7ff0f04bc9df Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Tue, 21 Apr 2026 15:39:36 -0700
Subject: [PATCH 08/19] build: use /opt/venv-tritonserver virtualenv for pip
 installs

Replace the distro-managed system Python + PIP_BREAK_SYSTEM_PACKAGES=1
escape hatch with a dedicated virtualenv at /opt/venv-tritonserver in
each Dockerfile stage. Subsequent RUN steps pick up the venv's pip /
python / cmake / auditwheel / etc. via PATH without further wiring.

Stages converted:
- RHEL buildbase (create_dockerfile_buildbase_rhel): venv created
  after change_default_python_version_rhel so the venv inherits the
  pyenv-installed interpreter rather than the distro system Python.
- Ubuntu buildbase (create_dockerfile_buildbase): venv created after
  apt-get installs python3-related packages; pybind11[global] is kept
  only in this stage (it is only needed during wheel builds).
- Linux runtime RHEL python-backend branch (dockerfile_prepare_container_linux):
  venv created after pyenv, same pattern as RHEL buildbase.
- Linux runtime Ubuntu python-backend branch: split the combined
  apt-get + pip install into apt-get (adding python3-venv), venv
  creation, pip install.

PIP_BREAK_SYSTEM_PACKAGES=1 is retained only in the production-stage
Dockerfile (create_dockerfile_linux) where an early `pip3 install
patchelf==0.17.2` runs in the RHEL common-deps section before the
venv is created; removing it would break that install. The cibase
Dockerfile (create_dockerfile_cibase) inherits the buildbase image
and therefore the buildbase venv via PATH, so PIP_BREAK_SYSTEM_PACKAGES
is dropped there.

Windows build is left untouched (uses python3 -m pip directly; no
venv conversion needed for the Windows pip site).
---
 build.py | 47 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 36 insertions(+), 11 deletions(-)

diff --git a/build.py b/build.py
index 2c59ac057a..e75357731a 100755
--- a/build.py
+++ b/build.py
@@ -930,7 +930,7 @@ def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap):
 
 ARG TRITON_VERSION
 ARG TRITON_CONTAINER_VERSION
-ENV PIP_BREAK_SYSTEM_PACKAGES=1 CMAKE_POLICY_VERSION_MINIMUM=3.5
+ENV CMAKE_POLICY_VERSION_MINIMUM=3.5
 """
     df += """
 # Install docker docker buildx
@@ -994,8 +994,14 @@ def create_dockerfile_buildbase_rhel(ddir, dockerfile_name, argmap):
     df += change_default_python_version_rhel(FLAGS.rhel_py_version)
     df += """
 
-RUN pip3 install --upgrade pip \\
-      && pip3 install --upgrade \\
+# Create a dedicated virtualenv so pip installs are isolated from the
+# distro-managed system Python. Subsequent RUN steps pick up the
+# venv's pip/python via PATH.
+RUN python3 -m venv /opt/venv-tritonserver
+ENV PATH="/opt/venv-tritonserver/bin:${PATH}"
+
+RUN pip install --upgrade pip \\
+      && pip install --upgrade \\
           build \\
           wheel \\
           setuptools \\
@@ -1048,7 +1054,7 @@ def create_dockerfile_buildbase(ddir, dockerfile_name, argmap):
 
 ARG TRITON_VERSION
 ARG TRITON_CONTAINER_VERSION
-ENV PIP_BREAK_SYSTEM_PACKAGES=1 CMAKE_POLICY_VERSION_MINIMUM=3.5
+ENV CMAKE_POLICY_VERSION_MINIMUM=3.5
 """
     # Install the windows- or linux-specific buildbase dependencies
     if target_platform() == "windows":
@@ -1109,7 +1115,13 @@ def create_dockerfile_buildbase(ddir, dockerfile_name, argmap):
             wget \\
       && rm -rf /var/lib/apt/lists/*
 
-RUN pip3 install --upgrade \\
+# Create a dedicated virtualenv so pip installs are isolated from the
+# distro-managed system Python. Subsequent RUN steps pick up the
+# venv's pip/python via PATH.
+RUN python3 -m venv /opt/venv-tritonserver
+ENV PATH="/opt/venv-tritonserver/bin:${PATH}"
+
+RUN pip install --upgrade \\
           build \\
           wheel \\
           setuptools \\
@@ -1196,7 +1208,6 @@ def create_dockerfile_cibase(ddir, dockerfile_name, argmap):
 
 ENV TRITON_SERVER_VERSION ${TRITON_VERSION}
 ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION}
-ENV PIP_BREAK_SYSTEM_PACKAGES=1
 """
 
     with open(os.path.join(ddir, dockerfile_name), "w") as dfile:
@@ -1440,8 +1451,14 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
             # Requires openssl-devel to be installed first for pyenv build to be successful
             df += change_default_python_version_rhel(FLAGS.rhel_py_version)
             df += """
-RUN pip3 install --upgrade pip \\
-    && pip3 install --upgrade \\
+# Create a dedicated virtualenv so pip installs are isolated from the
+# distro-managed system Python. Built after pyenv has provided the
+# desired Python version so the venv inherits that interpreter.
+RUN python3 -m venv /opt/venv-tritonserver
+ENV PATH="/opt/venv-tritonserver/bin:${PATH}"
+
+RUN pip install --upgrade pip \\
+    && pip install --upgrade \\
         wheel \\
         setuptools \\
         \"numpy<2\" \\
@@ -1453,15 +1470,23 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
 RUN apt-get update \\
       && apt-get install -y --no-install-recommends \\
             python3 \\
+            python3-venv \\
             libarchive-dev \\
             python3-pip \\
             python3-wheel \\
             python3-setuptools \\
             libpython3-dev \\
-      && pip3 install --upgrade \\
-            \"numpy<2\" \\
-            virtualenv \\
       && rm -rf /var/lib/apt/lists/*
+
+# Create a dedicated virtualenv so pip installs are isolated from the
+# distro-managed system Python. Subsequent RUN steps pick up the
+# venv's pip/python via PATH.
+RUN python3 -m venv /opt/venv-tritonserver
+ENV PATH="/opt/venv-tritonserver/bin:${PATH}"
+
+RUN pip install --upgrade \\
+        \"numpy<2\" \\
+        virtualenv
 """
     if "tensorrtllm" in backends or "vllm" in backends:
         df += """

From e808b042098da95411b1272bb4fe80ac743357cc Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Tue, 21 Apr 2026 15:46:01 -0700
Subject: [PATCH 09/19] build: drop PIP_BREAK_SYSTEM_PACKAGES, isolate patchelf
 via throwaway venv

Finishes the migration from system-Python + PIP_BREAK_SYSTEM_PACKAGES
to a clean venv model by addressing the two remaining pip3 installs
that previously ran against the distro system Python:

* create_dockerfile_linux (production stage): drop the
  `ENV PIP_BREAK_SYSTEM_PACKAGES=1` that was only needed as an escape
  hatch for the two patchelf installs handled below.
* dockerfile_prepare_container_linux / RHEL common deps: replace
  `pip3 install patchelf==0.17.2` with an ephemeral venv at
  /tmp/patchelf-venv, install patchelf into it, copy the binary to
  /usr/local/bin/patchelf, then remove the venv. Runs before the
  python-backend branch (where pyenv may later recreate the main
  /opt/venv-tritonserver venv), so this pattern survives the venv
  swap without needing to reinstall patchelf.
* add_cpu_libs_to_linux_dockerfile / pytorch CPU path: same
  ephemeral-venv pattern. The Ubuntu apt install now also pulls in
  python3-venv so `python3 -m venv` works in the runtime image.

The main /opt/venv-tritonserver venv is still only created inside the
`if "python" in backends:` branches, because the pyenv interpreter is
only required when Triton's python backend is built. The ephemeral
patchelf venv is independent of that gating and always runs when the
corresponding platform branch (RHEL common deps / CPU+pytorch) fires.
---
 build.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/build.py b/build.py
index e75357731a..b37e9bf8d2 100755
--- a/build.py
+++ b/build.py
@@ -1254,8 +1254,6 @@ def create_dockerfile_linux(
 ##  Production stage: Create container with just inference server executable
 ############################################################################
 FROM ${BASE_IMAGE}
-
-ENV PIP_BREAK_SYSTEM_PACKAGES=1
 """
 
     df += dockerfile_prepare_container_linux(
@@ -1375,7 +1373,15 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
         python3.12-pip \\
         numactl-devel
 
-RUN pip3 install patchelf==0.17.2
+# patchelf is distributed as a Python wheel but is a standalone CLI
+# tool. Install it into a throwaway venv, copy the binary to
+# /usr/local/bin, then remove the venv — this avoids polluting the
+# main /opt/venv-tritonserver venv and survives any later venv
+# recreation (e.g. when pyenv provides a different Python).
+RUN python3 -m venv /tmp/patchelf-venv \\
+    && /tmp/patchelf-venv/bin/pip install patchelf==0.17.2 \\
+    && cp /tmp/patchelf-venv/bin/patchelf /usr/local/bin/patchelf \\
+    && rm -rf /tmp/patchelf-venv
 
 """
     else:
@@ -1579,10 +1585,16 @@ def add_cpu_libs_to_linux_dockerfile(backends, target_machine):
 
 COPY --from=min_container /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.9 /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.9
 
-# patchelf is needed to add deps of libcublasLt.so.12 to libtorch_cuda.so
+# patchelf is needed to add deps of libcublasLt.so.12 to libtorch_cuda.so.
+# Install into a throwaway venv, copy the binary to /usr/local/bin,
+# then remove the venv — keeps the main /opt/venv-tritonserver clean
+# and avoids dependency on python3-venv in the runtime image.
 RUN apt-get update \\
-      && apt-get install -y --no-install-recommends openmpi-bin
-RUN pip3 install patchelf==0.17.2
+      && apt-get install -y --no-install-recommends openmpi-bin python3-venv
+RUN python3 -m venv /tmp/patchelf-venv \\
+    && /tmp/patchelf-venv/bin/pip install patchelf==0.17.2 \\
+    && cp /tmp/patchelf-venv/bin/patchelf /usr/local/bin/patchelf \\
+    && rm -rf /tmp/patchelf-venv
 
 ENV LD_LIBRARY_PATH /usr/local/cuda/targets/{cuda_arch}-linux/lib:/usr/local/cuda/lib64/stubs:${{LD_LIBRARY_PATH}}
 """.format(

From f1f035df97259129054b5ac33afe206bdd6cd988 Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Tue, 21 Apr 2026 16:11:22 -0700
Subject: [PATCH 10/19] build: install python3-venv instead of python3-pip

---
 build.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build.py b/build.py
index b37e9bf8d2..dd400ff88b 100755
--- a/build.py
+++ b/build.py
@@ -1101,7 +1101,7 @@ def create_dockerfile_buildbase(ddir, dockerfile_name, argmap):
             libb64-dev \\
             libgoogle-perftools-dev \\
             python3-dev \\
-            python3-pip \\
+            python3-venv \\
             rapidjson-dev \\
             scons \\
             software-properties-common \\

From 5ed69f2244101fa4d0ccf11508527c9ad9ab7410 Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Tue, 21 Apr 2026 16:47:13 -0700
Subject: [PATCH 11/19] fix: signal has_ext_modules() to make wheel
 platlib-compliant (TRI-983)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mirror the tritonserver setup.py fix: replace the ineffective
bdist_wheel.root_is_pure=False override with a Distribution subclass
whose has_ext_modules() returns True. Modern setuptools (>=70) ignores
overrides registered against wheel.bdist_wheel, so Root-Is-Purelib
stayed "true" in the WHEEL metadata despite our override. That caused
auditwheel to reject the repair of the paired tritonserver wheel, and
tritonfrontend would hit the same failure. has_ext_modules() is the
canonical setuptools hook for declaring a wheel as binary.

Drop the --plat-name argv parsing (superseded by setuptools's auto
platform derivation) and the wheel.bdist_wheel override block.
build_wheel.py still forwards --plat-name via bdist_wheel's stock
flag, which is honored by setuptools's own bdist_wheel command.

Refs: NVBug 6098081, JIRA DLIS-8648, Linear TRI-983
---
 src/python/setup.py | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/python/setup.py b/src/python/setup.py
index 6dd3ce1dae..c7536c9d63 100755
--- a/src/python/setup.py
+++ b/src/python/setup.py
@@ -27,30 +27,34 @@
 
 import os
 
-from setuptools import find_packages, setup
+from setuptools import Distribution, find_packages, setup
 
 if "VERSION" not in os.environ:
     raise Exception("envvar VERSION must be specified")
 
 VERSION = os.environ["VERSION"]
 
+
 # The wheel bundles a CPython-ABI-specific binding
 # (tritonfrontend/_c/<pybind>.so, filename encodes e.g. "cpython-312-..."),
 # so the wheel is only loadable under the matching interpreter and arch.
-# Marking the root impure lets setuptools/wheel auto-derive the correct
-# python/ABI/platform tags (e.g. "cp312-cp312-linux_x86_64") instead of
-# the misleading "py3-none-any" fallback. The --plat-name flag forwarded
-# by build_wheel.py is picked up by bdist_wheel's stock finalize_options.
-try:
-    from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
-
-    class bdist_wheel(_bdist_wheel):
-        def finalize_options(self):
-            _bdist_wheel.finalize_options(self)
-            self.root_is_pure = False
+# The binding is copied into package_data at build time rather than
+# declared via setup(ext_modules=...), so setuptools would otherwise
+# treat the distribution as pure-Python and emit "Root-Is-Purelib: true"
+# in the WHEEL metadata — which auditwheel rejects when it finds the
+# .so embedded in the purelib tree.
+#
+# Signaling has_ext_modules()=True via a custom Distribution subclass
+# is the canonical way to tell setuptools the wheel is binary without
+# triggering a fake compilation step. setuptools then:
+#   - sets Root-Is-Purelib to false (required for auditwheel repair),
+#   - auto-derives the correct cp<XY>-cp<XY>-linux_<arch> tag from
+#     the current interpreter and sysconfig.get_platform().
+# See TRI-983.
+class BinaryDistribution(Distribution):
+    def has_ext_modules(self):
+        return True
 
-except ImportError:
-    bdist_wheel = None
 
 this_directory = os.path.abspath(os.path.dirname(__file__))
 
@@ -102,7 +106,7 @@ def finalize_options(self):
         "": platform_package_data,
     },
     zip_safe=False,
-    cmdclass={"bdist_wheel": bdist_wheel},
+    distclass=BinaryDistribution,
     data_files=data_files,
     install_requires=["tritonserver", "pydantic==2.10.6"],
     extras_require={"GPU": gpu_extras, "test": test_extras, "all": all_extras},

From 0a5913e8d82d07f022e50f6499b2d31ea44df9ee Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Tue, 21 Apr 2026 17:41:33 -0700
Subject: [PATCH 12/19] build: venv in production stage, symlink patchelf
 (TRI-983)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four related refinements on top of the existing venv-based pip model:

1. create_dockerfile_linux production stage: add an unconditional
   `python3 -m venv /opt/venv-tritonserver` + PATH block right after
   `dockerfile_prepare_container_linux()` returns. The subsequent
   wheel installs (`pip install tritonserver-*.whl` /
   `tritonfrontend-*.whl`) and `pip install -r openai/requirements.txt`
   now use the venv's pip, removing the last place that relied on the
   legacy PIP_BREAK_SYSTEM_PACKAGES=1 escape hatch. Derived images
   (Dockerfile.QA) inherit the venv transparently via PATH — no
   changes needed there. Re-running `python3 -m venv` when the
   python-backend branch already created the venv is a safe no-op.

2. dockerfile_prepare_container_linux Ubuntu common-deps: add
   python3-venv to the apt install list so `python3 -m venv` works on
   minimal builds that omit the python backend (where python3-venv
   was previously only added inside the python-backend-specific
   branch).

3. Both patchelf install blocks (RHEL common-deps + CPU-only pytorch
   path): switch from cp-and-discard (/tmp/patchelf-venv) to
   symlink-from-persistent-venv (/opt/patchelf-venv). Makes future
   patchelf upgrades idempotent (`pip install -U patchelf` in the
   venv, symlink already points at the right place), and avoids the
   recreate-venv-just-to-copy-a-binary dance.

4. Change `pip3 install -r python/openai/requirements.txt` to
   `pip install` to match the rest of the venv-aware invocations in
   the same stage (both resolve to /opt/venv-tritonserver/bin/pip
   via PATH, but the style is now consistent).

Refs: Linear TRI-983
---
 build.py | 46 ++++++++++++++++++++++++++++++----------------
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/build.py b/build.py
index dd400ff88b..f75b368417 100755
--- a/build.py
+++ b/build.py
@@ -1260,6 +1260,20 @@ def create_dockerfile_linux(
         argmap, backends, FLAGS.enable_gpu, target_machine()
     )
 
+    # Create a dedicated virtualenv so the wheel + openai-requirements
+    # pip installs below run in isolation from the distro-managed
+    # system Python (replaces the legacy PIP_BREAK_SYSTEM_PACKAGES=1
+    # escape hatch). If the python-backend branch above already
+    # created /opt/venv-tritonserver (on top of pyenv / Ubuntu
+    # python3), re-running `python3 -m venv` is a safe no-op; on
+    # minimal builds without the python backend this is the first
+    # creation. Derived images (Dockerfile.QA) inherit the venv via
+    # PATH.
+    df += """
+RUN python3 -m venv /opt/venv-tritonserver
+ENV PATH="/opt/venv-tritonserver/bin:${PATH}"
+"""
+
     df += f"""
 WORKDIR /opt
 COPY --chown=1000:1000 build/install tritonserver
@@ -1271,7 +1285,7 @@ def create_dockerfile_linux(
     find /opt/tritonserver/python -maxdepth 1 -type f -name \\
     "tritonfrontend-*.whl" | xargs -I {{}} pip install --upgrade {{}}[{FLAGS.triton_wheels_dependencies_group}]
 
-RUN pip3 install -r python/openai/requirements.txt
+RUN pip install -r python/openai/requirements.txt
 
 """
     if not FLAGS.no_core_build:
@@ -1374,14 +1388,13 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
         numactl-devel
 
 # patchelf is distributed as a Python wheel but is a standalone CLI
-# tool. Install it into a throwaway venv, copy the binary to
-# /usr/local/bin, then remove the venv — this avoids polluting the
-# main /opt/venv-tritonserver venv and survives any later venv
-# recreation (e.g. when pyenv provides a different Python).
-RUN python3 -m venv /tmp/patchelf-venv \\
-    && /tmp/patchelf-venv/bin/pip install patchelf==0.17.2 \\
-    && cp /tmp/patchelf-venv/bin/patchelf /usr/local/bin/patchelf \\
-    && rm -rf /tmp/patchelf-venv
+# tool. Install it into a dedicated venv and symlink the binary into
+# /usr/local/bin. Keeping the venv around (vs cp-and-discard) makes
+# future upgrades idempotent (`pip install -U patchelf` in the venv),
+# and avoids polluting the main /opt/venv-tritonserver venv.
+RUN python3 -m venv /opt/patchelf-venv \\
+    && /opt/patchelf-venv/bin/pip install patchelf==0.17.2 \\
+    && ln -s /opt/patchelf-venv/bin/patchelf /usr/local/bin/patchelf
 
 """
     else:
@@ -1406,6 +1419,7 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
               wget \\
               {backend_dependencies} \\
               python3-pip \\
+              python3-venv \\
       && rm -rf /var/lib/apt/lists/*
 """.format(
             backend_dependencies=backend_dependencies
@@ -1586,15 +1600,15 @@ def add_cpu_libs_to_linux_dockerfile(backends, target_machine):
 COPY --from=min_container /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.9 /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.9
 
 # patchelf is needed to add deps of libcublasLt.so.12 to libtorch_cuda.so.
-# Install into a throwaway venv, copy the binary to /usr/local/bin,
-# then remove the venv — keeps the main /opt/venv-tritonserver clean
-# and avoids dependency on python3-venv in the runtime image.
+# Install into a dedicated venv and symlink the binary into
+# /usr/local/bin. Keeping the venv around (vs cp-and-discard) makes
+# future upgrades idempotent and avoids polluting the main
+# /opt/venv-tritonserver venv.
 RUN apt-get update \\
       && apt-get install -y --no-install-recommends openmpi-bin python3-venv
-RUN python3 -m venv /tmp/patchelf-venv \\
-    && /tmp/patchelf-venv/bin/pip install patchelf==0.17.2 \\
-    && cp /tmp/patchelf-venv/bin/patchelf /usr/local/bin/patchelf \\
-    && rm -rf /tmp/patchelf-venv
+RUN python3 -m venv /opt/patchelf-venv \\
+    && /opt/patchelf-venv/bin/pip install patchelf==0.17.2 \\
+    && ln -s /opt/patchelf-venv/bin/patchelf /usr/local/bin/patchelf
 
 ENV LD_LIBRARY_PATH /usr/local/cuda/targets/{cuda_arch}-linux/lib:/usr/local/cuda/lib64/stubs:${{LD_LIBRARY_PATH}}
 """.format(

From dd6ca0d24ceeb2584e97eca03dbc8cad47035ded Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Tue, 21 Apr 2026 18:11:32 -0700
Subject: [PATCH 13/19] fix: robust NVIDIA_UPSTREAM_VERSION lookup for wheel
 naming (TRI-983)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wheels produced in pipeline 49141836 came out as
`tritonserver-2.69.0.dev0-<build>-cp312-cp312-linux_x86_64.whl` — no
`+nv26.04.cu132` local-version segment. The missing `+nv<X>` is
because the docker-run `-e NVIDIA_UPSTREAM_VERSION=<value>` path is
fragile: if build.py's `FLAGS.upstream_container_version` evaluates
to empty (e.g. `--upstream-container-version=` passed with no RHS)
the `-e` arg is skipped and the container-local env var is unset
when `python3 build_wheel.py` runs.

Two-pronged hardening:

1. `_compose_version()` now consults three env sources, first
   non-empty wins:
     - NVIDIA_UPSTREAM_VERSION (docker-run -e; fragile)
     - NVIDIA_TRITON_SERVER_VERSION (ENV in the buildbase image via
       the `TRITON_CONTAINER_VERSION` ARG -> ENV wiring; baked in at
       image-build time, survives even when -e forwarding fails)
     - TRITON_CONTAINER_VERSION (same value in CI, extra safety)
   In CI all three carry the same value so the effective output is
   unchanged when things work; when the -e hop fails, the image-ENV
   fallback keeps the `+nv<X>` suffix intact.

2. Diagnostic logging:
     - build.py logs wheel-naming inputs (build-id and
       upstream-container-version) alongside the existing container-
       version log line, so a missing value is obvious on the host
       before docker run even starts.
     - _compose_version() prints the env-var dict it saw plus the
       resolved nv/cuda tuple to stderr. Any future gap in the chain
       now surfaces in the wheel-build log rather than silently
       losing a version suffix.

Refs: Linear TRI-983
---
 build.py                  |  9 +++++++++
 src/python/build_wheel.py | 40 ++++++++++++++++++++++++++++++++-------
 2 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/build.py b/build.py
index f75b368417..86e4b4ac3d 100755
--- a/build.py
+++ b/build.py
@@ -2933,6 +2933,15 @@ def enable_all():
 
     log("container version {}".format(FLAGS.container_version))
     log("upstream container version {}".format(FLAGS.upstream_container_version))
+    # Explicit visibility for wheel-naming inputs (see TRI-983). If
+    # these are empty here, the wheel filename will lack the expected
+    # build-tag / local-version segments and the log below tells us
+    # which link in the chain dropped the value.
+    log(
+        "wheel-naming inputs: --build-id={!r}, --upstream-container-version={!r}".format(
+            FLAGS.build_id, FLAGS.upstream_container_version
+        )
+    )
 
     for ep in FLAGS.endpoint:
         log(f'endpoint "{ep}"')
diff --git a/src/python/build_wheel.py b/src/python/build_wheel.py
index 5bb32347ff..faf0366867 100755
--- a/src/python/build_wheel.py
+++ b/src/python/build_wheel.py
@@ -92,17 +92,43 @@ def _compose_version(base_version):
     and a cu132 wheel from a cu128 wheel. The local-version segment is
     purely informational and does not affect pip's version comparison.
 
-    Sources:
-      NVIDIA_UPSTREAM_VERSION  - set by GitLab CI (e.g. "26.04")
-      CUDA_VERSION / toolkit   - discovered by _detect_cuda_version()
-    Both are optional; if neither is present the version is returned
-    unchanged so local non-CI builds stay stable.
+    Sources for NVIDIA upstream version (first non-empty wins):
+      NVIDIA_UPSTREAM_VERSION        - propagated by build.py via
+                                       `docker run -e` from
+                                       FLAGS.upstream_container_version.
+      NVIDIA_TRITON_SERVER_VERSION   - set as ENV in the buildbase image
+                                       at image-build time from the
+                                       TRITON_CONTAINER_VERSION ARG
+                                       (survives even if the docker-run
+                                       `-e` forwarding is not applied).
+      TRITON_CONTAINER_VERSION       - set as ENV in some downstream
+                                       images; same value as above in CI.
+    Source for CUDA toolkit version:
+      CUDA_VERSION / toolkit         - discovered by _detect_cuda_version()
+
+    All sources are optional; if none is present the version is returned
+    unchanged so local non-CI builds stay stable. Each detection
+    outcome is logged to stderr so any future gap is self-announcing
+    in the build log rather than surfacing only as a missing suffix in
+    the wheel filename.
     """
+    nv = (
+        os.environ.get("NVIDIA_UPSTREAM_VERSION")
+        or os.environ.get("NVIDIA_TRITON_SERVER_VERSION")
+        or os.environ.get("TRITON_CONTAINER_VERSION")
+    )
+    cuda = _detect_cuda_version()
+    print(
+        f"=== Wheel local-version inputs: "
+        f"NVIDIA_UPSTREAM_VERSION={os.environ.get('NVIDIA_UPSTREAM_VERSION')!r} "
+        f"NVIDIA_TRITON_SERVER_VERSION={os.environ.get('NVIDIA_TRITON_SERVER_VERSION')!r} "
+        f"TRITON_CONTAINER_VERSION={os.environ.get('TRITON_CONTAINER_VERSION')!r} "
+        f"-> nv={nv!r}, cuda={cuda!r}",
+        file=sys.stderr,
+    )
     local = []
-    nv = os.environ.get("NVIDIA_UPSTREAM_VERSION")
     if nv:
         local.append(f"nv{nv}")
-    cuda = _detect_cuda_version()
     if cuda:
         # "13.2" / "13.2.0" / "13.2.1" -> "cu132"
         parts = cuda.split(".")

From 046ad94b93b6bf580cae6224023e57f651f170e6 Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Tue, 21 Apr 2026 19:05:45 -0700
Subject: [PATCH 14/19] fix: propagate CI_PIPELINE_ID and prefer it as wheel
 build tag (TRI-983)

Two changes so the produced wheel carries a deterministic PEP 427
build tag matching the GitLab pipeline convention, with parity
between the tritonfrontend (this repo) and tritonserver (core repo)
wheels:

1. build.py's docker-run invocation now forwards CI_PIPELINE_ID
   from the host env into the build container via `-e`. Separately,
   the forwarded NVIDIA_UPSTREAM_VERSION value now falls back to the
   host env var of the same name when the CLI flag
   --upstream-container-version is empty.

2. build_wheel.py's build-tag resolution now prefers CI_PIPELINE_ID
   over NVIDIA_BUILD_ID (falls back further to BUILD_NUMBER for
   generic CI systems). CI_PIPELINE_ID is pipeline-scoped, matches
   the identifier used in the RHEL .zip artifact naming convention,
   and keeps all wheels in a pipeline sharing one build tag.

Also filters the "<unknown>" default build.py emits for local
builds without --build-id and adds a stderr diagnostic so any gap
in build-tag propagation is self-announcing.

Refs: Linear TRI-983
---
 build.py                  | 48 ++++++++++++++++++++++++++-------------
 src/python/build_wheel.py | 33 +++++++++++++++++++++------
 2 files changed, 58 insertions(+), 23 deletions(-)

diff --git a/build.py b/build.py
index 86e4b4ac3d..519f94aaf5 100755
--- a/build.py
+++ b/build.py
@@ -1859,21 +1859,27 @@ def create_docker_build_script(script_name, container_install_dir, container_ci_
             "tritonserver_builder",
         ]
 
-        # Propagate wheel-naming context from build.py's CLI flags
-        # into the build container so build_wheel.py can compose the
-        # full wheel filename. See TRI-983. Both values come from
-        # FLAGS (the canonical source), not inherited from the host
-        # env, so they are defined in both CI and local builds:
+        # Propagate wheel-naming context into the build container so
+        # build_wheel.py can compose the full wheel filename. See
+        # TRI-983. Both CLI flags and host env vars are checked so the
+        # value is defined in CI and local builds alike:
         #
         # * NVIDIA_BUILD_ID — from --build-id. In CI, .gitlab-ci.yml
         #   passes `--build-id=${CI_JOB_ID}` per the existing Triton
-        #   convention. Feeds the PEP 427 build-tag slot between
-        #   version and python-tag. Skipped when --build-id was not
-        #   supplied (local builds).
-        # * NVIDIA_UPSTREAM_VERSION — from --upstream-container-version
-        #   (CI: `--upstream-container-version=${NVIDIA_UPSTREAM_VERSION}`;
-        #   local: defaults to the value in DEFAULT_TRITON_VERSION_MAP).
-        #   Feeds the PEP 440 local-version segment "+nv<X>".
+        #   convention. One of several candidates build_wheel.py uses
+        #   for the PEP 427 build-tag slot.
+        # * CI_PIPELINE_ID — inherited from the GitLab runner's env
+        #   when set; build_wheel.py prefers this over NVIDIA_BUILD_ID
+        #   because it matches the identifier used in the RHEL .zip
+        #   artifact naming convention (.gitlab-ci.yml) and makes all
+        #   wheels in a pipeline share one build tag.
+        # * NVIDIA_UPSTREAM_VERSION — primarily from
+        #   --upstream-container-version (CI:
+        #   `--upstream-container-version=${NVIDIA_UPSTREAM_VERSION}`;
+        #   local: DEFAULT_TRITON_VERSION_MAP default). Falls back to
+        #   the host env var when the CLI flag is empty so the
+        #   +nv<X> local-version segment is still applied even if
+        #   someone invokes build.py with `--upstream-container-version=`.
         #
         # CUDA_VERSION is intentionally NOT propagated: the CUDA base
         # image already sets it as an ENV inside the container, and
@@ -1884,10 +1890,16 @@ def create_docker_build_script(script_name, container_install_dir, container_ci_
         # fallback), which is where it is reliably set.
         if FLAGS.build_id is not None:
             runargs += ["-e", f"NVIDIA_BUILD_ID={FLAGS.build_id}"]
-        if FLAGS.upstream_container_version:
+        ci_pipeline_id = os.environ.get("CI_PIPELINE_ID")
+        if ci_pipeline_id:
+            runargs += ["-e", f"CI_PIPELINE_ID={ci_pipeline_id}"]
+        upstream_version = FLAGS.upstream_container_version or os.environ.get(
+            "NVIDIA_UPSTREAM_VERSION"
+        )
+        if upstream_version:
             runargs += [
                 "-e",
-                f"NVIDIA_UPSTREAM_VERSION={FLAGS.upstream_container_version}",
+                f"NVIDIA_UPSTREAM_VERSION={upstream_version}",
             ]
 
         if not FLAGS.no_container_interactive:
@@ -2938,8 +2950,12 @@ def enable_all():
     # build-tag / local-version segments and the log below tells us
     # which link in the chain dropped the value.
     log(
-        "wheel-naming inputs: --build-id={!r}, --upstream-container-version={!r}".format(
-            FLAGS.build_id, FLAGS.upstream_container_version
+        "wheel-naming inputs: --build-id={!r}, --upstream-container-version={!r}, "
+        "CI_PIPELINE_ID={!r}, env NVIDIA_UPSTREAM_VERSION={!r}".format(
+            FLAGS.build_id,
+            FLAGS.upstream_container_version,
+            os.environ.get("CI_PIPELINE_ID"),
+            os.environ.get("NVIDIA_UPSTREAM_VERSION"),
         )
     )
 
diff --git a/src/python/build_wheel.py b/src/python/build_wheel.py
index faf0366867..e93f8d5adc 100755
--- a/src/python/build_wheel.py
+++ b/src/python/build_wheel.py
@@ -209,14 +209,33 @@ def main():
     args = ["python3", "setup.py", "bdist_wheel", "--plat-name", plat_name]
     # PEP 427 "build tag": an optional segment between version and
     # python-tag that lets two wheels of the same version coexist
-    # (e.g. reruns of the same CI job). Source is NVIDIA_BUILD_ID,
-    # which is set on the build container from --build-id; in CI
-    # .gitlab-ci.yml already passes `--build-id=${CI_JOB_ID}` so the
-    # value is a monotonic numeric ID. Skip the slot when the value
-    # does not start with a digit (required by PEP 427) or is the
+    # (e.g. reruns of the same CI pipeline). Sources, first non-empty
+    # and usable wins:
+    #   CI_PIPELINE_ID   - GitLab pipeline-scoped ID, matches the
+    #                      identifier used in RHEL .zip artifact
+    #                      naming (.gitlab-ci.yml). Preferred so all
+    #                      wheels in a pipeline share one build tag.
+    #   NVIDIA_BUILD_ID  - set from build.py's --build-id flag
+    #                      (CI feeds ${CI_JOB_ID}); falls back for
+    #                      non-CI builds that pass --build-id.
+    #   BUILD_NUMBER     - generic CI systems that set this instead.
+    # PEP 427 requires the build tag to start with a digit. Skip the
+    # slot when the value does not satisfy that constraint or is the
     # "<unknown>" default emitted for local builds without --build-id.
-    build_tag = os.environ.get("NVIDIA_BUILD_ID")
-    if build_tag and build_tag[:1].isdigit():
+    build_tag = (
+        os.environ.get("CI_PIPELINE_ID")
+        or os.environ.get("NVIDIA_BUILD_ID")
+        or os.environ.get("BUILD_NUMBER")
+    )
+    print(
+        f"=== Wheel build-tag inputs: "
+        f"CI_PIPELINE_ID={os.environ.get('CI_PIPELINE_ID')!r} "
+        f"NVIDIA_BUILD_ID={os.environ.get('NVIDIA_BUILD_ID')!r} "
+        f"BUILD_NUMBER={os.environ.get('BUILD_NUMBER')!r} "
+        f"-> build-tag={build_tag!r}",
+        file=sys.stderr,
+    )
+    if build_tag and build_tag != "<unknown>" and build_tag[:1].isdigit():
         args += ["--build-number", build_tag]
 
     wenv = os.environ.copy()

From 2ebd7d104b9efcab881a7ec8e01dadd27c1a4fce Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Wed, 22 Apr 2026 03:41:16 +0000
Subject: [PATCH 15/19] feat(wheel): use CI_JOB_ID as build tag, add
 PYPI_RELEASE support

- build.py now propagates CI_JOB_ID (preferred build tag) and
  PYPI_RELEASE into the build container via docker run -e; removes
  CI_PIPELINE_ID propagation (replaced by CI_JOB_ID).
- build_wheel.py build tag source updated: CI_JOB_ID -> NVIDIA_BUILD_ID
  -> BUILD_NUMBER.
- _compose_version() returns the bare base_version when PYPI_RELEASE is
  set to a truthy value ("1", "true" or "yes", case-insensitive),
  dropping the +nv<X>.cu<Y> local-version segment that PyPI rejects on
  upload.
---
 build.py                  | 30 ++++++++++++++++++------------
 src/python/build_wheel.py | 31 ++++++++++++++++++++-----------
 2 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/build.py b/build.py
index 519f94aaf5..3fda3d1d8e 100755
--- a/build.py
+++ b/build.py
@@ -1864,15 +1864,14 @@ def create_docker_build_script(script_name, container_install_dir, container_ci_
         # TRI-983. Both CLI flags and host env vars are checked so the
         # value is defined in CI and local builds alike:
         #
+        # * CI_JOB_ID — GitLab job ID; used by build_wheel.py as the
+        #   PEP 427 build-tag (the numeric segment between version and
+        #   python-tag in the wheel filename). Unique per wheel-build
+        #   job, matching the identifier in .gitlab-ci.yml artifact
+        #   naming. Preferred over NVIDIA_BUILD_ID.
         # * NVIDIA_BUILD_ID — from --build-id. In CI, .gitlab-ci.yml
         #   passes `--build-id=${CI_JOB_ID}` per the existing Triton
-        #   convention. One of several candidates build_wheel.py uses
-        #   for the PEP 427 build-tag slot.
-        # * CI_PIPELINE_ID — inherited from the GitLab runner's env
-        #   when set; build_wheel.py prefers this over NVIDIA_BUILD_ID
-        #   because it matches the identifier used in the RHEL .zip
-        #   artifact naming convention (.gitlab-ci.yml) and makes all
-        #   wheels in a pipeline share one build tag.
+        #   convention; serves as fallback when CI_JOB_ID is not set.
         # * NVIDIA_UPSTREAM_VERSION — primarily from
         #   --upstream-container-version (CI:
         #   `--upstream-container-version=${NVIDIA_UPSTREAM_VERSION}`;
@@ -1880,6 +1879,9 @@ def create_docker_build_script(script_name, container_install_dir, container_ci_
         #   the host env var when the CLI flag is empty so the
         #   +nv<X> local-version segment is still applied even if
         #   someone invokes build.py with `--upstream-container-version=`.
+        # * PYPI_RELEASE — when "true", build_wheel.py omits the
+        #   +nv<X>.cu<Y> local-version suffix so the resulting wheel
+        #   can be uploaded to PyPI (which rejects local versions).
         #
         # CUDA_VERSION is intentionally NOT propagated: the CUDA base
         # image already sets it as an ENV inside the container, and
@@ -1888,11 +1890,11 @@ def create_docker_build_script(script_name, container_install_dir, container_ci_
         # container's value. build_wheel.py reads CUDA_VERSION from
         # the container-local env (with a /usr/local/cuda/version.json
         # fallback), which is where it is reliably set.
+        ci_job_id = os.environ.get("CI_JOB_ID")
+        if ci_job_id:
+            runargs += ["-e", f"CI_JOB_ID={ci_job_id}"]
         if FLAGS.build_id is not None:
             runargs += ["-e", f"NVIDIA_BUILD_ID={FLAGS.build_id}"]
-        ci_pipeline_id = os.environ.get("CI_PIPELINE_ID")
-        if ci_pipeline_id:
-            runargs += ["-e", f"CI_PIPELINE_ID={ci_pipeline_id}"]
         upstream_version = FLAGS.upstream_container_version or os.environ.get(
             "NVIDIA_UPSTREAM_VERSION"
         )
@@ -1901,6 +1903,9 @@ def create_docker_build_script(script_name, container_install_dir, container_ci_
                 "-e",
                 f"NVIDIA_UPSTREAM_VERSION={upstream_version}",
             ]
+        pypi_release = os.environ.get("PYPI_RELEASE")
+        if pypi_release:
+            runargs += ["-e", f"PYPI_RELEASE={pypi_release}"]
 
         if not FLAGS.no_container_interactive:
             runargs += ["-it"]
@@ -2951,11 +2956,12 @@ def enable_all():
     # which link in the chain dropped the value.
     log(
         "wheel-naming inputs: --build-id={!r}, --upstream-container-version={!r}, "
-        "CI_PIPELINE_ID={!r}, env NVIDIA_UPSTREAM_VERSION={!r}".format(
+        "CI_JOB_ID={!r}, env NVIDIA_UPSTREAM_VERSION={!r}, PYPI_RELEASE={!r}".format(
             FLAGS.build_id,
             FLAGS.upstream_container_version,
-            os.environ.get("CI_PIPELINE_ID"),
+            os.environ.get("CI_JOB_ID"),
             os.environ.get("NVIDIA_UPSTREAM_VERSION"),
+            os.environ.get("PYPI_RELEASE"),
         )
     )
 
diff --git a/src/python/build_wheel.py b/src/python/build_wheel.py
index e93f8d5adc..8a89918d80 100755
--- a/src/python/build_wheel.py
+++ b/src/python/build_wheel.py
@@ -92,6 +92,10 @@ def _compose_version(base_version):
     and a cu132 wheel from a cu128 wheel. The local-version segment is
     purely informational and does not affect pip's version comparison.
 
+    When PYPI_RELEASE=true the local-version suffix is omitted entirely:
+    PyPI rejects uploads whose version contains a '+' local segment, so
+    public release builds must use the bare version.
+
     Sources for NVIDIA upstream version (first non-empty wins):
       NVIDIA_UPSTREAM_VERSION        - propagated by build.py via
                                        `docker run -e` from
@@ -112,6 +116,12 @@ def _compose_version(base_version):
     in the build log rather than surfacing only as a missing suffix in
     the wheel filename.
     """
+    if os.environ.get("PYPI_RELEASE", "").lower() in ("1", "true", "yes"):
+        print(
+            "=== PYPI_RELEASE set: omitting local-version suffix for PyPI compatibility",
+            file=sys.stderr,
+        )
+        return base_version
     nv = (
         os.environ.get("NVIDIA_UPSTREAM_VERSION")
         or os.environ.get("NVIDIA_TRITON_SERVER_VERSION")
@@ -209,27 +219,26 @@ def main():
     args = ["python3", "setup.py", "bdist_wheel", "--plat-name", plat_name]
     # PEP 427 "build tag": an optional segment between version and
     # python-tag that lets two wheels of the same version coexist
-    # (e.g. reruns of the same CI pipeline). Sources, first non-empty
+    # (e.g. re-runs of the same pipeline). Sources, first non-empty
     # and usable wins:
-    #   CI_PIPELINE_ID   - GitLab pipeline-scoped ID, matches the
-    #                      identifier used in RHEL .zip artifact
-    #                      naming (.gitlab-ci.yml). Preferred so all
-    #                      wheels in a pipeline share one build tag.
-    #   NVIDIA_BUILD_ID  - set from build.py's --build-id flag
-    #                      (CI feeds ${CI_JOB_ID}); falls back for
-    #                      non-CI builds that pass --build-id.
-    #   BUILD_NUMBER     - generic CI systems that set this instead.
+    #   CI_JOB_ID       - GitLab job ID; unique per wheel-build job and
+    #                     matches the identifier stamped into the wheel
+    #                     filename in .gitlab-ci.yml artifact naming.
+    #   NVIDIA_BUILD_ID - set from build.py's --build-id flag (CI feeds
+    #                     ${CI_JOB_ID} there too); fallback for builds
+    #                     that do not export CI_JOB_ID directly.
+    #   BUILD_NUMBER    - generic CI systems that use this instead.
     # PEP 427 requires the build tag to start with a digit. Skip the
     # slot when the value does not satisfy that constraint or is the
     # "<unknown>" default emitted for local builds without --build-id.
     build_tag = (
-        os.environ.get("CI_PIPELINE_ID")
+        os.environ.get("CI_JOB_ID")
         or os.environ.get("NVIDIA_BUILD_ID")
         or os.environ.get("BUILD_NUMBER")
     )
     print(
         f"=== Wheel build-tag inputs: "
-        f"CI_PIPELINE_ID={os.environ.get('CI_PIPELINE_ID')!r} "
+        f"CI_JOB_ID={os.environ.get('CI_JOB_ID')!r} "
         f"NVIDIA_BUILD_ID={os.environ.get('NVIDIA_BUILD_ID')!r} "
         f"BUILD_NUMBER={os.environ.get('BUILD_NUMBER')!r} "
         f"-> build-tag={build_tag!r}",

From d9115b8551cfdce3217179ef4883ae9b8b2308fd Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Wed, 22 Apr 2026 03:55:15 +0000
Subject: [PATCH 16/19] fix(wheel): revert build tag to CI_PIPELINE_ID for
 consistency

All wheels from one pipeline share the same build tag so tritonserver
and tritonfrontend filenames stay consistent. CI should pass
--build-id=${CI_PIPELINE_ID} to build.py.
---
 build.py                  | 25 ++++++++++++-------------
 src/python/build_wheel.py | 17 +++++++++--------
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/build.py b/build.py
index 3fda3d1d8e..c383608882 100755
--- a/build.py
+++ b/build.py
@@ -1864,14 +1864,13 @@ def create_docker_build_script(script_name, container_install_dir, container_ci_
         # TRI-983. Both CLI flags and host env vars are checked so the
         # value is defined in CI and local builds alike:
         #
-        # * CI_JOB_ID — GitLab job ID; used by build_wheel.py as the
-        #   PEP 427 build-tag (the numeric segment between version and
-        #   python-tag in the wheel filename). Unique per wheel-build
-        #   job, matching the identifier in .gitlab-ci.yml artifact
-        #   naming. Preferred over NVIDIA_BUILD_ID.
-        # * NVIDIA_BUILD_ID — from --build-id. In CI, .gitlab-ci.yml
-        #   passes `--build-id=${CI_JOB_ID}` per the existing Triton
-        #   convention; serves as fallback when CI_JOB_ID is not set.
+        # * CI_PIPELINE_ID — GitLab pipeline ID; shared across all jobs
+        #   in one pipeline so tritonserver and tritonfrontend wheels
+        #   from the same release carry the same PEP 427 build tag.
+        #   In CI, pass `--build-id=${CI_PIPELINE_ID}` to build.py.
+        # * NVIDIA_BUILD_ID — from --build-id; the primary vehicle for
+        #   CI_PIPELINE_ID into the container. build_wheel.py falls back
+        #   to this when CI_PIPELINE_ID is not exported directly.
         # * NVIDIA_UPSTREAM_VERSION — primarily from
         #   --upstream-container-version (CI:
         #   `--upstream-container-version=${NVIDIA_UPSTREAM_VERSION}`;
@@ -1890,9 +1889,9 @@ def create_docker_build_script(script_name, container_install_dir, container_ci_
         # container's value. build_wheel.py reads CUDA_VERSION from
         # the container-local env (with a /usr/local/cuda/version.json
         # fallback), which is where it is reliably set.
-        ci_job_id = os.environ.get("CI_JOB_ID")
-        if ci_job_id:
-            runargs += ["-e", f"CI_JOB_ID={ci_job_id}"]
+        ci_pipeline_id = os.environ.get("CI_PIPELINE_ID")
+        if ci_pipeline_id:
+            runargs += ["-e", f"CI_PIPELINE_ID={ci_pipeline_id}"]
         if FLAGS.build_id is not None:
             runargs += ["-e", f"NVIDIA_BUILD_ID={FLAGS.build_id}"]
         upstream_version = FLAGS.upstream_container_version or os.environ.get(
@@ -2956,10 +2955,10 @@ def enable_all():
     # which link in the chain dropped the value.
     log(
         "wheel-naming inputs: --build-id={!r}, --upstream-container-version={!r}, "
-        "CI_JOB_ID={!r}, env NVIDIA_UPSTREAM_VERSION={!r}, PYPI_RELEASE={!r}".format(
+        "CI_PIPELINE_ID={!r}, env NVIDIA_UPSTREAM_VERSION={!r}, PYPI_RELEASE={!r}".format(
             FLAGS.build_id,
             FLAGS.upstream_container_version,
-            os.environ.get("CI_JOB_ID"),
+            os.environ.get("CI_PIPELINE_ID"),
             os.environ.get("NVIDIA_UPSTREAM_VERSION"),
             os.environ.get("PYPI_RELEASE"),
         )
diff --git a/src/python/build_wheel.py b/src/python/build_wheel.py
index 8a89918d80..5fc4790e89 100755
--- a/src/python/build_wheel.py
+++ b/src/python/build_wheel.py
@@ -221,24 +221,25 @@ def main():
     # python-tag that lets two wheels of the same version coexist
     # (e.g. re-runs of the same pipeline). Sources, first non-empty
     # and usable wins:
-    #   CI_JOB_ID       - GitLab job ID; unique per wheel-build job and
-    #                     matches the identifier stamped into the wheel
-    #                     filename in .gitlab-ci.yml artifact naming.
-    #   NVIDIA_BUILD_ID - set from build.py's --build-id flag (CI feeds
-    #                     ${CI_JOB_ID} there too); fallback for builds
-    #                     that do not export CI_JOB_ID directly.
+    #   CI_PIPELINE_ID  - GitLab pipeline ID; shared by all jobs in one
+    #                     pipeline so tritonserver and tritonfrontend
+    #                     wheels from the same release carry the same
+    #                     tag. In CI, build.py is invoked with
+    #                     `--build-id=${CI_PIPELINE_ID}`.
+    #   NVIDIA_BUILD_ID - set from build.py's --build-id flag; primary
+    #                     vehicle for CI_PIPELINE_ID into the container.
     #   BUILD_NUMBER    - generic CI systems that use this instead.
     # PEP 427 requires the build tag to start with a digit. Skip the
     # slot when the value does not satisfy that constraint or is the
     # "<unknown>" default emitted for local builds without --build-id.
     build_tag = (
-        os.environ.get("CI_JOB_ID")
+        os.environ.get("CI_PIPELINE_ID")
         or os.environ.get("NVIDIA_BUILD_ID")
         or os.environ.get("BUILD_NUMBER")
     )
     print(
         f"=== Wheel build-tag inputs: "
-        f"CI_JOB_ID={os.environ.get('CI_JOB_ID')!r} "
+        f"CI_PIPELINE_ID={os.environ.get('CI_PIPELINE_ID')!r} "
         f"NVIDIA_BUILD_ID={os.environ.get('NVIDIA_BUILD_ID')!r} "
         f"BUILD_NUMBER={os.environ.get('BUILD_NUMBER')!r} "
         f"-> build-tag={build_tag!r}",

From 16e5446254ce720d5f869a119570098dc4e94a6f Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Wed, 22 Apr 2026 04:17:06 +0000
Subject: [PATCH 17/19] fix(wheel): omit build tag on PYPI_RELEASE=true

PyPI release wheels must have no build-tag segment so the filename is
the canonical bare form:
  tritonfrontend-2.69.0-cp312-cp312-manylinux_2_28_x86_64.whl
---
 src/python/build_wheel.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/python/build_wheel.py b/src/python/build_wheel.py
index 5fc4790e89..ef358d0689 100755
--- a/src/python/build_wheel.py
+++ b/src/python/build_wheel.py
@@ -232,13 +232,17 @@ def main():
     # PEP 427 requires the build tag to start with a digit. Skip the
     # slot when the value does not satisfy that constraint or is the
     # "<unknown>" default emitted for local builds without --build-id.
-    build_tag = (
-        os.environ.get("CI_PIPELINE_ID")
-        or os.environ.get("NVIDIA_BUILD_ID")
-        or os.environ.get("BUILD_NUMBER")
-    )
+    if os.environ.get("PYPI_RELEASE", "").lower() in ("1", "true", "yes"):
+        build_tag = None
+    else:
+        build_tag = (
+            os.environ.get("CI_PIPELINE_ID")
+            or os.environ.get("NVIDIA_BUILD_ID")
+            or os.environ.get("BUILD_NUMBER")
+        )
     print(
         f"=== Wheel build-tag inputs: "
+        f"PYPI_RELEASE={os.environ.get('PYPI_RELEASE')!r} "
         f"CI_PIPELINE_ID={os.environ.get('CI_PIPELINE_ID')!r} "
         f"NVIDIA_BUILD_ID={os.environ.get('NVIDIA_BUILD_ID')!r} "
         f"BUILD_NUMBER={os.environ.get('BUILD_NUMBER')!r} "

From 0e47abbf67896fb611567d156d5c71280168cfb0 Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Wed, 22 Apr 2026 04:46:03 +0000
Subject: [PATCH 18/19] fix(rhel): use ln -sf to overwrite existing patchelf
 symlink

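The RHEL base image already ships patchelf at /usr/local/bin/patchelf,
so `ln -s` fails with 'File exists'. Use `ln -sf` to force-replace it
with a symlink to the venv-managed binary. The same change is applied to
the apt-based patchelf install in add_cpu_libs_to_linux_dockerfile.

The RHEL dependency install is also switched from yum to dnf, its
package list is sorted alphabetically, and python3.12-pip is replaced
by python3.12-venv.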
---
 build.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/build.py b/build.py
index c383608882..3d1df62f37 100755
--- a/build.py
+++ b/build.py
@@ -1374,18 +1374,18 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
     if target_platform() == "rhel":
         df += """
 # Common dependencies.
-RUN yum install -y \\
+RUN dnf install -y \\
         git \\
         gperf \\
-        re2-devel \\
-        openssl-devel \\
-        libtool \\
-        libcurl-devel \\
-        libb64-devel \\
         gperftools-devel \\
-        wget \\
-        python3.12-pip \\
-        numactl-devel
+        libb64-devel \\
+        libcurl-devel \\
+        libtool \\
+        numactl-devel \\
+        openssl-devel \\
+        python3.12-venv \\
+        re2-devel \\
+        wget
 
 # patchelf is distributed as a Python wheel but is a standalone CLI
 # tool. Install it into a dedicated venv and symlink the binary into
@@ -1394,7 +1394,7 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
 # and avoids polluting the main /opt/venv-tritonserver venv.
 RUN python3 -m venv /opt/patchelf-venv \\
     && /opt/patchelf-venv/bin/pip install patchelf==0.17.2 \\
-    && ln -s /opt/patchelf-venv/bin/patchelf /usr/local/bin/patchelf
+    && ln -sf /opt/patchelf-venv/bin/patchelf /usr/local/bin/patchelf
 
 """
     else:
@@ -1608,7 +1608,7 @@ def add_cpu_libs_to_linux_dockerfile(backends, target_machine):
       && apt-get install -y --no-install-recommends openmpi-bin python3-venv
 RUN python3 -m venv /opt/patchelf-venv \\
     && /opt/patchelf-venv/bin/pip install patchelf==0.17.2 \\
-    && ln -s /opt/patchelf-venv/bin/patchelf /usr/local/bin/patchelf
+    && ln -sf /opt/patchelf-venv/bin/patchelf /usr/local/bin/patchelf
 
 ENV LD_LIBRARY_PATH /usr/local/cuda/targets/{cuda_arch}-linux/lib:/usr/local/cuda/lib64/stubs:${{LD_LIBRARY_PATH}}
 """.format(

From ad5530317caf773e56faade3bcd43eee72f3b1d5 Mon Sep 17 00:00:00 2001
From: Misha Chornyi <mchornyi@nvidia.com>
Date: Wed, 22 Apr 2026 05:02:37 +0000
Subject: [PATCH 19/19] fix: consolidate tool venvs into /opt/venv-tritonserver

Replace bare pip3 install and per-tool dedicated venvs (patchelf-venv,
cmake-venv) with a single /opt/venv-tritonserver whose bin/ is added to
PATH via ENV. This fixes PEP 668 externally-managed-environment errors on
Ubuntu Noble base images and provides a consistent install pattern across
Dockerfile.QA (cibase + main stages) and build.py (RHEL and cpu-pytorch
final-image stages).
---
 Dockerfile.QA |  8 +++++---
 build.py      | 21 ++++++---------------
 2 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/Dockerfile.QA b/Dockerfile.QA
index a8ffbd8a19..d605b12c13 100644
--- a/Dockerfile.QA
+++ b/Dockerfile.QA
@@ -66,7 +66,9 @@ RUN apt-get update && \
             software-properties-common && \
     rm -rf /var/lib/apt/lists/*
 
-RUN pip3 install cmake==4.0.3
+RUN python3 -m venv /opt/venv-tritonserver
+ENV PATH="/opt/venv-tritonserver/bin:${PATH}"
+RUN pip install cmake==4.0.3
 ENV CMAKE_POLICY_VERSION_MINIMUM=3.5
 
 # Add densenet_onnx model to example repo
@@ -348,7 +350,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 RUN rm -f /usr/bin/python && \
     ln -s /usr/bin/python3 /usr/bin/python
 
-RUN pip3 install --upgrade "numpy<2" pillow attrdict future "grpcio<1.68" requests gsutil \
+RUN pip install --upgrade "numpy<2" pillow attrdict future "grpcio<1.68" requests gsutil \
                            "awscli<=1.36.40" six "grpcio-channelz<1.68" prettytable virtualenv \
                            check-jsonschema
 
@@ -377,7 +379,7 @@ COPY --chown=1000:1000 --from=sdk /workspace/qa/ qa/
 RUN rm -fr qa/L0_copyrights qa/L0_build_variants && \
     find qa/pkgs/ -maxdepth 1 -type f -name \
     "tritonclient-*-py3-none-any.whl" | xargs printf -- '%s[all]' | \
-    xargs pip3 install --upgrade
+    xargs pip install --upgrade
 
 ENV LD_LIBRARY_PATH /opt/tritonserver/qa/clients:${LD_LIBRARY_PATH}
 
diff --git a/build.py b/build.py
index 3d1df62f37..a60525e6a5 100755
--- a/build.py
+++ b/build.py
@@ -1387,14 +1387,9 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
         re2-devel \\
         wget
 
-# patchelf is distributed as a Python wheel but is a standalone CLI
-# tool. Install it into a dedicated venv and symlink the binary into
-# /usr/local/bin. Keeping the venv around (vs cp-and-discard) makes
-# future upgrades idempotent (`pip install -U patchelf` in the venv),
-# and avoids polluting the main /opt/venv-tritonserver venv.
-RUN python3 -m venv /opt/patchelf-venv \\
-    && /opt/patchelf-venv/bin/pip install patchelf==0.17.2 \\
-    && ln -sf /opt/patchelf-venv/bin/patchelf /usr/local/bin/patchelf
+RUN python3 -m venv /opt/venv-tritonserver
+ENV PATH="/opt/venv-tritonserver/bin:${PATH}"
+RUN pip install patchelf==0.17.2
 
 """
     else:
@@ -1600,15 +1595,11 @@ def add_cpu_libs_to_linux_dockerfile(backends, target_machine):
 COPY --from=min_container /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.9 /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.9
 
 # patchelf is needed to add deps of libcublasLt.so.12 to libtorch_cuda.so.
-# Install into a dedicated venv and symlink the binary into
-# /usr/local/bin. Keeping the venv around (vs cp-and-discard) makes
-# future upgrades idempotent and avoids polluting the main
-# /opt/venv-tritonserver venv.
 RUN apt-get update \\
       && apt-get install -y --no-install-recommends openmpi-bin python3-venv
-RUN python3 -m venv /opt/patchelf-venv \\
-    && /opt/patchelf-venv/bin/pip install patchelf==0.17.2 \\
-    && ln -sf /opt/patchelf-venv/bin/patchelf /usr/local/bin/patchelf
+RUN python3 -m venv /opt/venv-tritonserver
+ENV PATH="/opt/venv-tritonserver/bin:${{PATH}}"
+RUN pip install patchelf==0.17.2
 
 ENV LD_LIBRARY_PATH /usr/local/cuda/targets/{cuda_arch}-linux/lib:/usr/local/cuda/lib64/stubs:${{LD_LIBRARY_PATH}}
 """.format(