
Commit 2314ced

Merge branch 'main' into buildpy_fixes
2 parents f940474 + b593182 commit 2314ced

211 files changed

Lines changed: 12102 additions & 6241 deletions

(Large commit: some file diffs are hidden by default; only a subset of the 211 changed files appears below.)
Lines changed: 5 additions & 5 deletions
@@ -1,4 +1,4 @@
-# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -31,15 +31,15 @@ on:
 
 jobs:
   pre-commit:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v5.0.0
         with:
           fetch-depth: 2
       - name: Get modified files
        id: modified-files
         run: echo "modified_files=$(git diff --name-only -r HEAD^1 HEAD | xargs)" >> $GITHUB_OUTPUT
-      - uses: actions/setup-python@v3
-      - uses: pre-commit/action@v3.0.0
+      - uses: actions/setup-python@v6.0.0
+      - uses: pre-commit/action@v3.0.1
         with:
           extra_args: --files ${{ steps.modified-files.outputs.modified_files }}
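
The updated job still checks only the files touched by the commit. For a rough local equivalent of this CI step (a sketch, assuming `pre-commit` is installed and the checkout has a parent commit so `HEAD^1` resolves):

```bash
# Collect the files changed by the last commit, as the workflow does,
# and run the repository's pre-commit hooks against just those files.
modified_files=$(git diff --name-only -r HEAD^1 HEAD | xargs)
pre-commit run --files ${modified_files}
```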

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -16,4 +16,5 @@ cprofile
 # Test exclusions
 qa/L0_openai/openai
 tensorrtllm_models
+tensorrtllm_mistral_models/
 custom_tokenizer

.pre-commit-config.yaml

Lines changed: 4 additions & 4 deletions
@@ -1,4 +1,4 @@
-# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -25,7 +25,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 repos:
-  - repo: https://github.com/timothycrosley/isort
+  - repo: https://github.com/PyCQA/isort
     rev: 5.12.0
     hooks:
       - id: isort
@@ -36,7 +36,7 @@ repos:
       - id: black
         types_or: [python, cython]
   - repo: https://github.com/PyCQA/flake8
-    rev: 5.0.4
+    rev: 7.3.0
     hooks:
       - id: flake8
         args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501]
@@ -57,7 +57,7 @@ repos:
 # More details about these pre-commit hooks here:
 # https://pre-commit.com/hooks.html
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v6.0.0
     hooks:
       - id: check-case-conflict
       - id: check-executables-have-shebangs
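
Revision bumps like the ones above can be generated mechanically rather than edited by hand; a sketch assuming a local `pre-commit` install (one way to produce such a change, not necessarily how this commit was made):

```bash
# Move every hook in .pre-commit-config.yaml to its latest tagged release,
# then re-run all hooks to confirm the tree still passes.
pre-commit autoupdate
pre-commit run --all-files
```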

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ proposed change so that the Triton team can provide feedback.
   documentation for instructions on running these tests.
 
 - Triton Inference Server's default build assumes recent versions of
-  dependencies (CUDA, TensorFlow, PyTorch, TensorRT,
+  dependencies (CUDA, PyTorch, TensorRT,
   etc.). Contributions that add compatibility with older versions of
   those dependencies will be considered, but NVIDIA cannot guarantee
   that all possible build configurations work, are not broken by

Dockerfile.QA

Lines changed: 4 additions & 2 deletions
@@ -1,4 +1,4 @@
-# Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2018-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -139,7 +139,7 @@ RUN mkdir -p qa/common && \
     mkdir qa/L0_data_compression/models && \
     cp -r docs/examples/model_repository/simple qa/L0_data_compression/models && \
     cp bin/data_compressor_test qa/L0_data_compression/. && \
-    cp bin/backend_tensor_size_test qa/L0_input_validation/. && \
+    cp bin/tensor_size_test qa/L0_input_validation/. && \
     cp bin/metrics_api_test qa/L0_metrics/. && \
     cp bin/response_cache_test qa/L0_response_cache/. && \
     cp bin/request_cancellation_test qa/L0_request_cancellation/. && \
@@ -186,12 +186,14 @@ RUN cd tritonbuild/identity && \
     make -j16 install
 
 # L0_backend_python test require triton_shm_monitor
+ARG TRITON_BOOST_URL="https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz"
 RUN cd tritonbuild/python && \
     rm -rf install build && mkdir build && cd build && \
     cmake -DCMAKE_INSTALL_PREFIX:PATH=/workspace/tritonbuild/python/install \
           -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \
           -DTRITON_COMMON_REPO_TAG:STRING=${TRITON_COMMON_REPO_TAG} \
           -DTRITON_CORE_REPO_TAG:STRING=${TRITON_CORE_REPO_TAG} \
+          -DTRITON_BOOST_URL:STRING=${TRITON_BOOST_URL} \
          -DTRITON_BACKEND_REPO_TAG:STRING=${TRITON_BACKEND_REPO_TAG} .. && \
     make -j16 triton-shm-monitor install
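
The new `TRITON_BOOST_URL` build argument defaults to the Boost 1.80.0 source archive and can be overridden at build time. A minimal sketch (the mirror URL and image tag are hypothetical, and other build arguments that Dockerfile.QA normally requires are omitted):

```bash
# Point the triton_shm_monitor build at an internal Boost mirror.
docker build -f Dockerfile.QA \
    --build-arg TRITON_BOOST_URL="https://mirror.example.com/boost_1_80_0.tar.gz" \
    -t tritonserver_qa .
```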

Dockerfile.sdk

Lines changed: 13 additions & 68 deletions
@@ -1,4 +1,4 @@
-# Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2019-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -29,10 +29,9 @@
 #
 
 # Base image on the minimum Triton container
-ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.07-py3-min
+ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:26.02-py3-min
 
 ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
-ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo
 ARG TRITON_REPO_ORGANIZATION=http://github.com/triton-inference-server
 ARG TRITON_COMMON_REPO_TAG=main
 ARG TRITON_CORE_REPO_TAG=main
@@ -41,9 +40,8 @@ ARG TRITON_THIRD_PARTY_REPO_TAG=main
 ARG TRITON_ENABLE_GPU=ON
 ARG JAVA_BINDINGS_MAVEN_VERSION=3.8.4
 ARG JAVA_BINDINGS_JAVACPP_PRESETS_TAG=1.5.8
-ARG TRITON_PERF_ANALYZER_BUILD=1
 # DCGM version to install for Model Analyzer
-ARG DCGM_VERSION=4.2.3-2
+ARG DCGM_VERSION=4.5.2-1
 
 ARG NVIDIA_TRITON_SERVER_SDK_VERSION=unknown
 ARG NVIDIA_BUILD_ID=unknown
@@ -97,7 +95,6 @@ RUN rm -f /usr/bin/python && \
 # Build the client library and examples
 ARG TRITON_REPO_ORGANIZATION
 ARG TRITON_CLIENT_REPO_SUBDIR
-ARG TRITON_PA_REPO_SUBDIR
 ARG TRITON_COMMON_REPO_TAG
 ARG TRITON_CORE_REPO_TAG
 ARG TRITON_CLIENT_REPO_TAG
@@ -106,14 +103,10 @@ ARG TRITON_ENABLE_GPU
 ARG JAVA_BINDINGS_MAVEN_VERSION
 ARG JAVA_BINDINGS_JAVACPP_PRESETS_TAG
 ARG TARGETPLATFORM
-ARG TRITON_PERF_ANALYZER_BUILD
-
-ENV TRITON_PERF_ANALYZER_BUILD=${TRITON_PERF_ANALYZER_BUILD}
 
 WORKDIR /workspace
 COPY TRITON_VERSION .
 COPY ${TRITON_CLIENT_REPO_SUBDIR} client
-COPY ${TRITON_PA_REPO_SUBDIR} perf_analyzer
 
 WORKDIR /workspace/client_build
 RUN cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
@@ -124,63 +117,11 @@ RUN cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
       -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \
       -DTRITON_ENABLE_PERF_ANALYZER=OFF \
       -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON \
-      -DTRITON_ENABLE_PYTHON_HTTP=OFF -DTRITON_ENABLE_PYTHON_GRPC=OFF \
+      -DTRITON_ENABLE_PYTHON_HTTP=ON -DTRITON_ENABLE_PYTHON_GRPC=ON \
       -DTRITON_ENABLE_JAVA_HTTP=ON \
       -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON \
       -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} /workspace/client
-RUN cmake --build . -v --parallel --target cc-clients java-clients
-
-# TODO: PA will rebuild the CC clients since it depends on it.
-# This should be optimized so that we do not have to build
-# the CC clients twice. Similarly, because the SDK expectation is
-# that PA is packaged with the python client, we hold off on building
-# the python client until now. Post-migration we should focus
-# effort on de-tangling these flows.
-WORKDIR /workspace/pa_build
-# NOTE: If TRITON_PERF_ANALYZER_BUILD=0, the Performance Analyzer (PA) binaries must already exist
-# in the path specified by the ARG TRITON_PA_REPO_SUBDIR.
-RUN if [ "$TRITON_PERF_ANALYZER_BUILD" = "1" ]; then \
-        cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
-              -DTRITON_VERSION=`cat /workspace/TRITON_VERSION` \
-              -DTRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION} \
-              -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
-              -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \
-              -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \
-              -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \
-              -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \
-              -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \
-              -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \
-              -DTRITON_ENABLE_PERF_ANALYZER_OPENAI=ON \
-              -DTRITON_ENABLE_CC_HTTP=ON \
-              -DTRITON_ENABLE_CC_GRPC=ON \
-              -DTRITON_ENABLE_PYTHON_HTTP=ON \
-              -DTRITON_ENABLE_PYTHON_GRPC=ON \
-              -DTRITON_PACKAGE_PERF_ANALYZER=ON \
-              -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} \
-              /workspace/perf_analyzer && \
-        cmake --build . -v --parallel --target perf-analyzer python-clients && \
-        pip3 install build && \
-        cd /workspace/perf_analyzer/genai-perf && \
-        python3 -m build --wheel --outdir /workspace/install/python; \
-    else \
-        ls /workspace/perf_analyzer/ && \
-        tar -xzf /workspace/perf_analyzer/perf_analyzer*.tar.gz -C /workspace/install/bin && \
-        echo "Perf Analyzer binaries was extracted and not build" && \
-        cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
-              -DTRITON_VERSION=`cat /workspace/TRITON_VERSION` \
-              -DTRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION} \
-              -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
-              -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \
-              -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \
-              -DTRITON_ENABLE_PYTHON_HTTP=ON \
-              -DTRITON_ENABLE_PYTHON_GRPC=ON \
-              -DTRITON_PACKAGE_PERF_ANALYZER=ON \
-              -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} \
-              /workspace/perf_analyzer && \
-        cmake --build . -v --parallel --target python-clients && \
-        mkdir -p /workspace/install/python && \
-        cp /workspace/perf_analyzer/genai_perf-*.whl /workspace/install/python/; \
-    fi
+RUN cmake --build . -v --parallel --target cc-clients java-clients python-clients
 
 # Install Java API Bindings
 RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
@@ -226,7 +167,6 @@ RUN apt-get update && \
         python3-pip \
         python3-setuptools \
         python3-wheel \
-        software-properties-common \
         vim \
         wget && \
     pip3 install "grpcio<1.68" "grpcio-tools<1.68"
@@ -235,7 +175,6 @@ WORKDIR /workspace
 COPY TRITON_VERSION .
 COPY NVIDIA_Deep_Learning_Container_License.pdf .
 COPY --from=sdk_build /workspace/client/ client/
-COPY --from=sdk_build /workspace/perf_analyzer/ perf_analyzer/
 COPY --from=sdk_build /workspace/install/ install/
 RUN cd install && \
     export VERSION=`cat /workspace/TRITON_VERSION` && \
@@ -253,8 +192,6 @@ COPY --from=sdk_build /workspace/client/src/python/library/tests/* qa/python_cli
 # Install an image needed by the quickstart and other documentation.
 COPY qa/images/mug.jpg images/mug.jpg
 
-RUN pip3 install install/python/genai_perf-*.whl
-
 # Install the dependencies needed to run the client examples. These
 # are not needed for building but including them allows this image to
 # be used to run the client examples.
@@ -263,6 +200,9 @@ RUN pip3 install --upgrade "numpy<2" pillow attrdict && \
     "tritonclient-*linux*.whl" | xargs printf -- '%s[all]' | \
     xargs pip3 install --upgrade
 
+# Install GenAI-Perf
+RUN pip3 install genai-perf
+
 # Install DCGM
 RUN if [ "$TRITON_ENABLE_GPU" = "ON" ]; then \
     [ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" && \
@@ -279,6 +219,11 @@ RUN if [ "$TRITON_ENABLE_GPU" = "ON" ]; then \
 RUN rm -f /usr/bin/python && \
     ln -s /usr/bin/python3 /usr/bin/python
 
+# Install Model Analyzer
+ARG TRITON_MODEL_ANALYZER_REPO_TAG
+ARG TRITON_MODEL_ANALYZER_REPO="${TRITON_REPO_ORGANIZATION}/model_analyzer@${TRITON_MODEL_ANALYZER_REPO_TAG}"
+RUN pip3 install "git+${TRITON_MODEL_ANALYZER_REPO}"
+
 # Entrypoint Banner
 ENV NVIDIA_PRODUCT_NAME="Triton Server SDK"
 COPY docker/entrypoint.d/ /opt/nvidia/entrypoint.d/
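
The net effect in this file: Perf Analyzer is no longer built from source in the SDK image, GenAI-Perf now comes from PyPI, and Model Analyzer is installed from a Git tag. A quick smoke test of the repackaged tools (a sketch: the image tag follows the README quickstart, and `--help` support on both CLIs is assumed):

```bash
# Confirm the pip-installed tools are on PATH inside the SDK container.
docker run -it --rm nvcr.io/nvidia/tritonserver:26.02-py3-sdk \
    bash -c "genai-perf --help && model-analyzer --help"
```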

LICENSE

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+Copyright (c) 2018-2026, NVIDIA CORPORATION. All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions

README.md

Lines changed: 11 additions & 12 deletions
@@ -1,5 +1,5 @@
 <!--
-# Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2018-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -29,15 +29,15 @@
 
 >[!WARNING]
 >You are currently on the `main` branch which tracks under-development progress
->towards the next release. The current release is version [2.59.1](https://github.com/triton-inference-server/server/releases/latest)
->and corresponds to the 25.07 container release on NVIDIA GPU Cloud (NGC).
+>towards the next release. The current release is version [2.66.0](https://github.com/triton-inference-server/server/releases/latest)
+>and corresponds to the 26.02 container release on NVIDIA GPU Cloud (NGC).
 
 # Triton Inference Server
 
 Triton Inference Server is an open source inference serving software that
 streamlines AI inferencing. Triton enables teams to deploy any AI model from
 multiple deep learning and machine learning frameworks, including TensorRT,
-TensorFlow, PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton
+PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton
 Inference Server supports inference across cloud, data center, edge and embedded
 devices on NVIDIA GPUs, x86 and ARM CPU, or AWS Inferentia. Triton Inference
 Server delivers optimized performance for many query types, including real time,
@@ -54,8 +54,8 @@ Major features include:
   frameworks](https://github.com/triton-inference-server/fil_backend)
 - [Concurrent model
   execution](docs/user_guide/architecture.md#concurrent-model-execution)
-- [Dynamic batching](docs/user_guide/model_configuration.md#dynamic-batcher)
-- [Sequence batching](docs/user_guide/model_configuration.md#sequence-batcher) and
+- [Dynamic batching](docs/user_guide/batcher.md#dynamic-batcher)
+- [Sequence batching](docs/user_guide/batcher.md#sequence-batcher) and
   [implicit state management](docs/user_guide/architecture.md#implicit-state-management)
   for stateful models
 - Provides [Backend API](https://github.com/triton-inference-server/backend) that
@@ -70,8 +70,8 @@ Major features include:
   protocols](docs/customization_guide/inference_protocols.md) based on the community
   developed [KServe
   protocol](https://github.com/kserve/kserve/tree/master/docs/predict-api/v2)
-- A [C API](docs/customization_guide/inference_protocols.md#in-process-triton-server-api) and
-  [Java API](docs/customization_guide/inference_protocols.md#java-bindings-for-in-process-triton-server-api)
+- A [C API](docs/customization_guide/inprocess_c_api.md) and
+  [Java API](docs/customization_guide/inprocess_java_api.md)
   allow Triton to link directly into your application for edge and other in-process use cases
 - [Metrics](docs/user_guide/metrics.md) indicating GPU utilization, server
   throughput, server latency, and more
@@ -90,16 +90,16 @@ Inference Server with the
 
 ```bash
 # Step 1: Create the example model repository
-git clone -b r25.07 https://github.com/triton-inference-server/server.git
+git clone -b r26.02 https://github.com/triton-inference-server/server.git
 cd server/docs/examples
 ./fetch_models.sh
 
 # Step 2: Launch triton from the NGC Triton container
-docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:25.07-py3 tritonserver --model-repository=/models --model-control-mode explicit --load-model densenet_onnx
+docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:26.02-py3 tritonserver --model-repository=/models --model-control-mode explicit --load-model densenet_onnx
 
 # Step 3: Sending an Inference Request
 # In a separate console, launch the image_client example from the NGC Triton SDK container
-docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:25.07-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg
+docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:26.02-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg
 
 # Inference should return the following
 Image '/workspace/images/mug.jpg':
@@ -166,7 +166,6 @@ configuration](docs/user_guide/model_configuration.md) for the model.
 - Triton supports multiple execution engines, called
   [backends](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton), including
   [TensorRT](https://github.com/triton-inference-server/tensorrt_backend),
-  [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend),
   [PyTorch](https://github.com/triton-inference-server/pytorch_backend),
   [ONNX](https://github.com/triton-inference-server/onnxruntime_backend),
   [OpenVINO](https://github.com/triton-inference-server/openvino_backend),
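
Between Steps 2 and 3 of the quickstart, server readiness can be checked over Triton's KServe-v2 HTTP endpoint (a supplemental check, not part of this diff; 8000 is Triton's default HTTP port):

```bash
# Returns HTTP 200 once the server is ready to serve inference requests.
curl -v localhost:8000/v2/health/ready
```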

TRITON_VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.60.0dev
+2.67.0dev
