
Commit 2314ced

Merge branch 'main' into buildpy_fixes
2 parents f940474 + b593182 commit 2314ced

211 files changed

Lines changed: 12102 additions & 6241 deletions

(Large commit: some file diffs are hidden by default; only a subset of the 211 changed files appears below.)
Lines changed: 5 additions & 5 deletions
@@ -1,4 +1,4 @@
-# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -31,15 +31,15 @@ on:
 
 jobs:
   pre-commit:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v5.0.0
         with:
           fetch-depth: 2
       - name: Get modified files
        id: modified-files
         run: echo "modified_files=$(git diff --name-only -r HEAD^1 HEAD | xargs)" >> $GITHUB_OUTPUT
-      - uses: actions/setup-python@v3
-      - uses: pre-commit/action@v3.0.0
+      - uses: actions/setup-python@v6.0.0
+      - uses: pre-commit/action@v3.0.1
         with:
           extra_args: --files ${{ steps.modified-files.outputs.modified_files }}
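
The updated job still checks only the files touched by the commit. For a rough local equivalent of this CI step (a sketch, assuming `pre-commit` is installed and the checkout has a parent commit so `HEAD^1` resolves):

```bash
# Collect the files changed by the last commit, as the workflow does,
# and run the repository's pre-commit hooks against just those files.
modified_files=$(git diff --name-only -r HEAD^1 HEAD | xargs)
pre-commit run --files ${modified_files}
```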

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -16,4 +16,5 @@ cprofile
 # Test exclusions
 qa/L0_openai/openai
 tensorrtllm_models
+tensorrtllm_mistral_models/
 custom_tokenizer

.pre-commit-config.yaml

Lines changed: 4 additions & 4 deletions
@@ -1,4 +1,4 @@
-# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -25,7 +25,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 repos:
-  - repo: https://github.com/timothycrosley/isort
+  - repo: https://github.com/PyCQA/isort
     rev: 5.12.0
     hooks:
       - id: isort
@@ -36,7 +36,7 @@ repos:
       - id: black
         types_or: [python, cython]
   - repo: https://github.com/PyCQA/flake8
-    rev: 5.0.4
+    rev: 7.3.0
     hooks:
       - id: flake8
         args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501]
@@ -57,7 +57,7 @@ repos:
 # More details about these pre-commit hooks here:
 # https://pre-commit.com/hooks.html
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v6.0.0
     hooks:
       - id: check-case-conflict
       - id: check-executables-have-shebangs
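
Revision bumps like the ones above can be generated mechanically rather than edited by hand; a sketch assuming a local `pre-commit` install (one way to produce such a change, not necessarily how this commit was made):

```bash
# Move every hook in .pre-commit-config.yaml to its latest tagged release,
# then re-run all hooks to confirm the tree still passes.
pre-commit autoupdate
pre-commit run --all-files
```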

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ proposed change so that the Triton team can provide feedback.
   documentation for instructions on running these tests.
 
 - Triton Inference Server's default build assumes recent versions of
-  dependencies (CUDA, TensorFlow, PyTorch, TensorRT,
+  dependencies (CUDA, PyTorch, TensorRT,
   etc.). Contributions that add compatibility with older versions of
   those dependencies will be considered, but NVIDIA cannot guarantee
   that all possible build configurations work, are not broken by

Dockerfile.QA

Lines changed: 4 additions & 2 deletions
@@ -1,4 +1,4 @@
-# Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2018-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -139,7 +139,7 @@ RUN mkdir -p qa/common && \
     mkdir qa/L0_data_compression/models && \
     cp -r docs/examples/model_repository/simple qa/L0_data_compression/models && \
     cp bin/data_compressor_test qa/L0_data_compression/. && \
-    cp bin/backend_tensor_size_test qa/L0_input_validation/. && \
+    cp bin/tensor_size_test qa/L0_input_validation/. && \
     cp bin/metrics_api_test qa/L0_metrics/. && \
     cp bin/response_cache_test qa/L0_response_cache/. && \
     cp bin/request_cancellation_test qa/L0_request_cancellation/. && \
@@ -186,12 +186,14 @@ RUN cd tritonbuild/identity && \
     make -j16 install
 
 # L0_backend_python test require triton_shm_monitor
+ARG TRITON_BOOST_URL="https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz"
 RUN cd tritonbuild/python && \
     rm -rf install build && mkdir build && cd build && \
     cmake -DCMAKE_INSTALL_PREFIX:PATH=/workspace/tritonbuild/python/install \
           -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \
           -DTRITON_COMMON_REPO_TAG:STRING=${TRITON_COMMON_REPO_TAG} \
           -DTRITON_CORE_REPO_TAG:STRING=${TRITON_CORE_REPO_TAG} \
+          -DTRITON_BOOST_URL:STRING=${TRITON_BOOST_URL} \
          -DTRITON_BACKEND_REPO_TAG:STRING=${TRITON_BACKEND_REPO_TAG} .. && \
     make -j16 triton-shm-monitor install
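
The new `TRITON_BOOST_URL` build argument defaults to the Boost 1.80.0 source archive and can be overridden at build time. A minimal sketch (the mirror URL and image tag are hypothetical, and other build arguments that Dockerfile.QA normally requires are omitted):

```bash
# Point the triton_shm_monitor build at an internal Boost mirror.
docker build -f Dockerfile.QA \
    --build-arg TRITON_BOOST_URL="https://mirror.example.com/boost_1_80_0.tar.gz" \
    -t tritonserver_qa .
```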

Dockerfile.sdk

Lines changed: 13 additions & 68 deletions
@@ -1,4 +1,4 @@
-# Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2019-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -29,10 +29,9 @@
 #
 
 # Base image on the minimum Triton container
-ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:25.07-py3-min
+ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:26.02-py3-min
 
 ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
-ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo
 ARG TRITON_REPO_ORGANIZATION=http://github.com/triton-inference-server
 ARG TRITON_COMMON_REPO_TAG=main
 ARG TRITON_CORE_REPO_TAG=main
@@ -41,9 +40,8 @@ ARG TRITON_THIRD_PARTY_REPO_TAG=main
 ARG TRITON_ENABLE_GPU=ON
 ARG JAVA_BINDINGS_MAVEN_VERSION=3.8.4
 ARG JAVA_BINDINGS_JAVACPP_PRESETS_TAG=1.5.8
-ARG TRITON_PERF_ANALYZER_BUILD=1
 # DCGM version to install for Model Analyzer
-ARG DCGM_VERSION=4.2.3-2
+ARG DCGM_VERSION=4.5.2-1
 
 ARG NVIDIA_TRITON_SERVER_SDK_VERSION=unknown
 ARG NVIDIA_BUILD_ID=unknown
@@ -97,7 +95,6 @@ RUN rm -f /usr/bin/python && \
 # Build the client library and examples
 ARG TRITON_REPO_ORGANIZATION
 ARG TRITON_CLIENT_REPO_SUBDIR
-ARG TRITON_PA_REPO_SUBDIR
 ARG TRITON_COMMON_REPO_TAG
 ARG TRITON_CORE_REPO_TAG
 ARG TRITON_CLIENT_REPO_TAG
@@ -106,14 +103,10 @@ ARG TRITON_ENABLE_GPU
 ARG JAVA_BINDINGS_MAVEN_VERSION
 ARG JAVA_BINDINGS_JAVACPP_PRESETS_TAG
 ARG TARGETPLATFORM
-ARG TRITON_PERF_ANALYZER_BUILD
-
-ENV TRITON_PERF_ANALYZER_BUILD=${TRITON_PERF_ANALYZER_BUILD}
 
 WORKDIR /workspace
 COPY TRITON_VERSION .
 COPY ${TRITON_CLIENT_REPO_SUBDIR} client
-COPY ${TRITON_PA_REPO_SUBDIR} perf_analyzer
 
 WORKDIR /workspace/client_build
 RUN cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
@@ -124,63 +117,11 @@ RUN cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
       -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \
       -DTRITON_ENABLE_PERF_ANALYZER=OFF \
       -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON \
-      -DTRITON_ENABLE_PYTHON_HTTP=OFF -DTRITON_ENABLE_PYTHON_GRPC=OFF \
+      -DTRITON_ENABLE_PYTHON_HTTP=ON -DTRITON_ENABLE_PYTHON_GRPC=ON \
       -DTRITON_ENABLE_JAVA_HTTP=ON \
       -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON \
       -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} /workspace/client
-RUN cmake --build . -v --parallel --target cc-clients java-clients
-
-# TODO: PA will rebuild the CC clients since it depends on it.
-# This should be optimized so that we do not have to build
-# the CC clients twice. Similarly, because the SDK expectation is
-# that PA is packaged with the python client, we hold off on building
-# the python client until now. Post-migration we should focus
-# effort on de-tangling these flows.
-WORKDIR /workspace/pa_build
-# NOTE: If TRITON_PERF_ANALYZER_BUILD=0, the Performance Analyzer (PA) binaries must already exist
-# in the path specified by the ARG TRITON_PA_REPO_SUBDIR.
-RUN if [ "$TRITON_PERF_ANALYZER_BUILD" = "1" ]; then \
-        cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
-              -DTRITON_VERSION=`cat /workspace/TRITON_VERSION` \
-              -DTRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION} \
-              -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
-              -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \
-              -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \
-              -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \
-              -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \
-              -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \
-              -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \
-              -DTRITON_ENABLE_PERF_ANALYZER_OPENAI=ON \
-              -DTRITON_ENABLE_CC_HTTP=ON \
-              -DTRITON_ENABLE_CC_GRPC=ON \
-              -DTRITON_ENABLE_PYTHON_HTTP=ON \
-              -DTRITON_ENABLE_PYTHON_GRPC=ON \
-              -DTRITON_PACKAGE_PERF_ANALYZER=ON \
-              -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} \
-              /workspace/perf_analyzer && \
-        cmake --build . -v --parallel --target perf-analyzer python-clients && \
-        pip3 install build && \
-        cd /workspace/perf_analyzer/genai-perf && \
-        python3 -m build --wheel --outdir /workspace/install/python; \
-    else \
-        ls /workspace/perf_analyzer/ && \
-        tar -xzf /workspace/perf_analyzer/perf_analyzer*.tar.gz -C /workspace/install/bin && \
-        echo "Perf Analyzer binaries was extracted and not build" && \
-        cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
-              -DTRITON_VERSION=`cat /workspace/TRITON_VERSION` \
-              -DTRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION} \
-              -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
-              -DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \
-              -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \
-              -DTRITON_ENABLE_PYTHON_HTTP=ON \
-              -DTRITON_ENABLE_PYTHON_GRPC=ON \
-              -DTRITON_PACKAGE_PERF_ANALYZER=ON \
-              -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} \
-              /workspace/perf_analyzer && \
-        cmake --build . -v --parallel --target python-clients && \
-        mkdir -p /workspace/install/python && \
-        cp /workspace/perf_analyzer/genai_perf-*.whl /workspace/install/python/; \
-    fi
+RUN cmake --build . -v --parallel --target cc-clients java-clients python-clients
 
 # Install Java API Bindings
 RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
@@ -226,7 +167,6 @@ RUN apt-get update && \
         python3-pip \
         python3-setuptools \
         python3-wheel \
-        software-properties-common \
         vim \
         wget && \
     pip3 install "grpcio<1.68" "grpcio-tools<1.68"
@@ -235,7 +175,6 @@ WORKDIR /workspace
 COPY TRITON_VERSION .
 COPY NVIDIA_Deep_Learning_Container_License.pdf .
 COPY --from=sdk_build /workspace/client/ client/
-COPY --from=sdk_build /workspace/perf_analyzer/ perf_analyzer/
 COPY --from=sdk_build /workspace/install/ install/
 RUN cd install && \
     export VERSION=`cat /workspace/TRITON_VERSION` && \
@@ -253,8 +192,6 @@ COPY --from=sdk_build /workspace/client/src/python/library/tests/* qa/python_cli
 # Install an image needed by the quickstart and other documentation.
 COPY qa/images/mug.jpg images/mug.jpg
 
-RUN pip3 install install/python/genai_perf-*.whl
-
 # Install the dependencies needed to run the client examples. These
 # are not needed for building but including them allows this image to
 # be used to run the client examples.
@@ -263,6 +200,9 @@ RUN pip3 install --upgrade "numpy<2" pillow attrdict && \
     "tritonclient-*linux*.whl" | xargs printf -- '%s[all]' | \
     xargs pip3 install --upgrade
 
+# Install GenAI-Perf
+RUN pip3 install genai-perf
+
 # Install DCGM
 RUN if [ "$TRITON_ENABLE_GPU" = "ON" ]; then \
     [ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" && \
@@ -279,6 +219,11 @@ RUN if [ "$TRITON_ENABLE_GPU" = "ON" ]; then \
 RUN rm -f /usr/bin/python && \
     ln -s /usr/bin/python3 /usr/bin/python
 
+# Install Model Analyzer
+ARG TRITON_MODEL_ANALYZER_REPO_TAG
+ARG TRITON_MODEL_ANALYZER_REPO="${TRITON_REPO_ORGANIZATION}/model_analyzer@${TRITON_MODEL_ANALYZER_REPO_TAG}"
+RUN pip3 install "git+${TRITON_MODEL_ANALYZER_REPO}"
+
 # Entrypoint Banner
 ENV NVIDIA_PRODUCT_NAME="Triton Server SDK"
 COPY docker/entrypoint.d/ /opt/nvidia/entrypoint.d/
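
The net effect in this file: Perf Analyzer is no longer built from source in the SDK image, GenAI-Perf now comes from PyPI, and Model Analyzer is installed from a Git tag. A quick smoke test of the repackaged tools (a sketch: the image tag follows the README quickstart, and `--help` support on both CLIs is assumed):

```bash
# Confirm the pip-installed tools are on PATH inside the SDK container.
docker run -it --rm nvcr.io/nvidia/tritonserver:26.02-py3-sdk \
    bash -c "genai-perf --help && model-analyzer --help"
```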

LICENSE

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+Copyright (c) 2018-2026, NVIDIA CORPORATION. All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions

README.md

Lines changed: 11 additions & 12 deletions
@@ -1,5 +1,5 @@
 <!--
-# Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2018-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -29,15 +29,15 @@
 
 >[!WARNING]
 >You are currently on the `main` branch which tracks under-development progress
->towards the next release. The current release is version [2.59.1](https://github.com/triton-inference-server/server/releases/latest)
->and corresponds to the 25.07 container release on NVIDIA GPU Cloud (NGC).
+>towards the next release. The current release is version [2.66.0](https://github.com/triton-inference-server/server/releases/latest)
+>and corresponds to the 26.02 container release on NVIDIA GPU Cloud (NGC).
 
 # Triton Inference Server
 
 Triton Inference Server is an open source inference serving software that
 streamlines AI inferencing. Triton enables teams to deploy any AI model from
 multiple deep learning and machine learning frameworks, including TensorRT,
-TensorFlow, PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton
+PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton
 Inference Server supports inference across cloud, data center, edge and embedded
 devices on NVIDIA GPUs, x86 and ARM CPU, or AWS Inferentia. Triton Inference
 Server delivers optimized performance for many query types, including real time,
@@ -54,8 +54,8 @@ Major features include:
   frameworks](https://github.com/triton-inference-server/fil_backend)
 - [Concurrent model
   execution](docs/user_guide/architecture.md#concurrent-model-execution)
-- [Dynamic batching](docs/user_guide/model_configuration.md#dynamic-batcher)
-- [Sequence batching](docs/user_guide/model_configuration.md#sequence-batcher) and
+- [Dynamic batching](docs/user_guide/batcher.md#dynamic-batcher)
+- [Sequence batching](docs/user_guide/batcher.md#sequence-batcher) and
   [implicit state management](docs/user_guide/architecture.md#implicit-state-management)
   for stateful models
 - Provides [Backend API](https://github.com/triton-inference-server/backend) that
@@ -70,8 +70,8 @@ Major features include:
   protocols](docs/customization_guide/inference_protocols.md) based on the community
   developed [KServe
   protocol](https://github.com/kserve/kserve/tree/master/docs/predict-api/v2)
-- A [C API](docs/customization_guide/inference_protocols.md#in-process-triton-server-api) and
-  [Java API](docs/customization_guide/inference_protocols.md#java-bindings-for-in-process-triton-server-api)
+- A [C API](docs/customization_guide/inprocess_c_api.md) and
+  [Java API](docs/customization_guide/inprocess_java_api.md)
   allow Triton to link directly into your application for edge and other in-process use cases
 - [Metrics](docs/user_guide/metrics.md) indicating GPU utilization, server
   throughput, server latency, and more
@@ -90,16 +90,16 @@ Inference Server with the
 
 ```bash
 # Step 1: Create the example model repository
-git clone -b r25.07 https://github.com/triton-inference-server/server.git
+git clone -b r26.02 https://github.com/triton-inference-server/server.git
 cd server/docs/examples
 ./fetch_models.sh
 
 # Step 2: Launch triton from the NGC Triton container
-docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:25.07-py3 tritonserver --model-repository=/models --model-control-mode explicit --load-model densenet_onnx
+docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:26.02-py3 tritonserver --model-repository=/models --model-control-mode explicit --load-model densenet_onnx
 
 # Step 3: Sending an Inference Request
 # In a separate console, launch the image_client example from the NGC Triton SDK container
-docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:25.07-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg
+docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:26.02-py3-sdk /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg
 
 # Inference should return the following
 Image '/workspace/images/mug.jpg':
@@ -166,7 +166,6 @@ configuration](docs/user_guide/model_configuration.md) for the model.
 - Triton supports multiple execution engines, called
   [backends](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton), including
   [TensorRT](https://github.com/triton-inference-server/tensorrt_backend),
-  [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend),
   [PyTorch](https://github.com/triton-inference-server/pytorch_backend),
   [ONNX](https://github.com/triton-inference-server/onnxruntime_backend),
   [OpenVINO](https://github.com/triton-inference-server/openvino_backend),
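
Between Steps 2 and 3 of the quickstart, server readiness can be checked over Triton's KServe-v2 HTTP endpoint (a supplemental check, not part of this diff; 8000 is Triton's default HTTP port):

```bash
# Returns HTTP 200 once the server is ready to serve inference requests.
curl -v localhost:8000/v2/health/ready
```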

TRITON_VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.60.0dev
+2.67.0dev
