From 69ae8e3d008320dfdb5460845ff6905c3362d667 Mon Sep 17 00:00:00 2001 From: itsnothuy Date: Tue, 14 Apr 2026 15:08:20 +0700 Subject: [PATCH 1/3] fix: async model load/unload to prevent evhtp thread starvation Fixes #8635 HandleRepositoryControl() was calling TRITONSERVER_ServerLoadModelWithParameters() and TRITONSERVER_ServerUnloadModel() synchronously on evhtp worker threads. Under concurrent model load/unload traffic all http-thread-count evhtp workers could become blocked, starving inference requests, health probes, and metadata queries. Apply the same async pattern used by InferRequestClass for inference: - New ControlRequestClass captures the evhtp thread and calls evhtp_request_pause(req) to free the worker immediately. - A detached std::thread executes the blocking TRITONSERVER API call. - evthr_defer posts ReplyCallback back onto the original evhtp thread where evhtp_send_reply() + evhtp_request_resume() execute safely. - A std::atomic<int> concurrency gate (sourced from --model-load-thread-count, default 4) caps concurrent detached threads and returns HTTP 503 when the limit is exceeded, preventing unbounded thread creation. - When --model-load-thread-count=0 the handler falls back to the original synchronous behavior for backward compatibility. 
--- src/http_server.cc | 285 ++++++++++++++++++++++++++++++--------------- src/http_server.h | 76 +++++++++++- src/main.cc | 4 +- 3 files changed, 267 insertions(+), 98 deletions(-) diff --git a/src/http_server.cc b/src/http_server.cc index cce57ee254..3f77e05846 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -1119,7 +1119,8 @@ HTTPAPIServer::HTTPAPIServer( const std::shared_ptr& shm_manager, const int32_t port, const bool reuse_port, const std::string& address, const std::string& header_forward_pattern, const int thread_cnt, - const size_t max_input_size, const RestrictedFeatures& restricted_apis) + const int control_request_concurrency, const size_t max_input_size, + const RestrictedFeatures& restricted_apis) : HTTPServer(port, reuse_port, address, header_forward_pattern, thread_cnt), server_(server), trace_manager_(trace_manager), shm_manager_(shm_manager), allocator_(nullptr), server_regex_(R"(/v2(?:/health/(live|ready))?)"), @@ -1166,6 +1167,12 @@ HTTPAPIServer::HTTPAPIServer( "setting allocator's buffer attributes function"); ConfigureGenerateMappingSchema(); + + max_control_requests_ = control_request_concurrency; + if (max_control_requests_ > 0) { + LOG_INFO << "Model-control async enabled, max concurrent requests: " + << max_control_requests_; + } } HTTPAPIServer::~HTTPAPIServer() @@ -1440,118 +1447,200 @@ HTTPAPIServer::HandleRepositoryControl( req, EVHTP_RES_METHNALLOWED, "Method Not Allowed"); } - TRITONSERVER_Error* err = nullptr; + // Validate repository_name synchronously (lightweight) if (!repository_name.empty()) { - err = TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_UNSUPPORTED, - "'repository_name' specification is not supported"); - } else { - if (action == "load") { - static auto param_deleter = - [](std::vector* params) { - if (params != nullptr) { - for (auto& param : *params) { - TRITONSERVER_ParameterDelete(param); - } - delete params; + RETURN_AND_RESPOND_IF_ERR( + req, TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_UNSUPPORTED, + 
"'repository_name' specification is not supported")); + } + + // --- Load path: parse parameters synchronously, then offload blocking call + if (action == "load") { + static auto param_deleter = + [](std::vector* params) { + if (params != nullptr) { + for (auto& param : *params) { + TRITONSERVER_ParameterDelete(param); } - }; - std::unique_ptr< - std::vector, decltype(param_deleter)> - params(new std::vector(), param_deleter); - // local variables to store the decoded file content, the data must - // be valid until TRITONSERVER_ServerLoadModelWithParameters returns. - std::list> binary_files; - // WAR for the const-ness check - std::vector const_params; - triton::common::TritonJson::Value load_request; - size_t buffer_len = 0; - RETURN_AND_RESPOND_IF_ERR( - req, EVRequestToJson(req, "load model", &load_request, &buffer_len)); - - if (buffer_len > 0) { - // Parse request body for parameters - triton::common::TritonJson::Value param_json; - if (load_request.Find("parameters", ¶m_json)) { - // Iterate over each member in 'param_json' - std::vector members; - RETURN_AND_RESPOND_IF_ERR(req, param_json.Members(&members)); - for (const auto& m : members) { - const char* param_str = nullptr; - size_t param_len = 0; + delete params; + } + }; + // Use shared_ptr so the lambda closure can be copyable (required by + // std::function) while still using a custom deleter. + std::shared_ptr> params( + new std::vector(), param_deleter); + // local variables to store the decoded file content, the data must + // be valid until TRITONSERVER_ServerLoadModelWithParameters returns. 
+ auto binary_files = std::make_shared>>(); + // WAR for the const-ness check + auto const_params = + std::make_shared>(); + triton::common::TritonJson::Value load_request; + size_t buffer_len = 0; + RETURN_AND_RESPOND_IF_ERR( + req, EVRequestToJson(req, "load model", &load_request, &buffer_len)); + + if (buffer_len > 0) { + // Parse request body for parameters + triton::common::TritonJson::Value param_json; + if (load_request.Find("parameters", ¶m_json)) { + // Iterate over each member in 'param_json' + std::vector members; + RETURN_AND_RESPOND_IF_ERR(req, param_json.Members(&members)); + for (const auto& m : members) { + const char* param_str = nullptr; + size_t param_len = 0; + RETURN_AND_RESPOND_IF_ERR( + req, + param_json.MemberAsString(m.c_str(), ¶m_str, ¶m_len)); + + TRITONSERVER_Parameter* param = nullptr; + if (m == "config") { + param = TRITONSERVER_ParameterNew( + m.c_str(), TRITONSERVER_PARAMETER_STRING, param_str); + } else if (m.rfind("file:", 0) == 0) { + size_t decoded_size; + binary_files->emplace_back(std::vector()); RETURN_AND_RESPOND_IF_ERR( - req, - param_json.MemberAsString(m.c_str(), ¶m_str, ¶m_len)); - - TRITONSERVER_Parameter* param = nullptr; - if (m == "config") { - param = TRITONSERVER_ParameterNew( - m.c_str(), TRITONSERVER_PARAMETER_STRING, param_str); - } else if (m.rfind("file:", 0) == 0) { - size_t decoded_size; - binary_files.emplace_back(std::vector()); - RETURN_AND_RESPOND_IF_ERR( - req, DecodeBase64( - param_str, param_len, binary_files.back(), - decoded_size, m)); - param = TRITONSERVER_ParameterBytesNew( - m.c_str(), binary_files.back().data(), decoded_size); - } + req, DecodeBase64( + param_str, param_len, binary_files->back(), + decoded_size, m)); + param = TRITONSERVER_ParameterBytesNew( + m.c_str(), binary_files->back().data(), decoded_size); + } - if (param != nullptr) { - params->emplace_back(param); - const_params.emplace_back(param); - } else { - RETURN_AND_RESPOND_IF_ERR( - req, TRITONSERVER_ErrorNew( - 
TRITONSERVER_ERROR_INTERNAL, - "unexpected error on creating Triton parameter")); - } + if (param != nullptr) { + params->emplace_back(param); + const_params->emplace_back(param); + } else { + RETURN_AND_RESPOND_IF_ERR( + req, TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + "unexpected error on creating Triton parameter")); } } } + } + + // If async disabled, fall back to synchronous (original behavior) + if (max_control_requests_ <= 0) { RETURN_AND_RESPOND_IF_ERR( req, TRITONSERVER_ServerLoadModelWithParameters( - server_.get(), model_name.c_str(), const_params.data(), - const_params.size())); - } else if (action == "unload") { - // Check if the dependent models should be removed - bool unload_dependents = false; - { - triton::common::TritonJson::Value control_request; - size_t buffer_len = 0; - RETURN_AND_RESPOND_IF_ERR( - req, EVRequestToJson( - req, "unload model", &control_request, &buffer_len)); - - if (buffer_len > 0) { - triton::common::TritonJson::Value params_json; - if (control_request.Find("parameters", ¶ms_json)) { - triton::common::TritonJson::Value ud_json; - if (params_json.Find("unload_dependents", &ud_json)) { - auto parse_err = ud_json.AsBool(&unload_dependents); - if (parse_err != nullptr) { - err = TRITONSERVER_ErrorNew( - TRITONSERVER_ErrorCode(parse_err), - (std::string("Unable to parse 'unload_dependents': ") + - TRITONSERVER_ErrorMessage(parse_err)) - .c_str()); - TRITONSERVER_ErrorDelete(parse_err); - } + server_.get(), model_name.c_str(), const_params->data(), + const_params->size())); + evhtp_send_reply(req, EVHTP_RES_OK); + return; + } + + // Check concurrency limit before pausing the request. 
+ int current = control_request_cnt_.fetch_add(1, std::memory_order_acq_rel); + if (current >= max_control_requests_) { + control_request_cnt_.fetch_sub(1, std::memory_order_acq_rel); + RETURN_AND_RESPOND_WITH_ERR( + req, EVHTP_RES_SERVUNAVAIL, + "Model control request rejected: too many concurrent " + "load/unload requests"); + return; + } + + // Create the async request object. This captures the evhtp thread + // and pauses the request so the evhtp worker is freed immediately. + auto* ctrl_req = new ControlRequestClass(req); + TRITONSERVER_Server* raw_server = server_.get(); + auto* cnt_ptr = &control_request_cnt_; + + // Spawn a detached thread for the blocking call. Thread creation + // overhead is acceptable for model load/unload operations. + std::thread([ctrl_req, raw_server, model_name, params, binary_files, + const_params, cnt_ptr]() { + ctrl_req->err_ = TRITONSERVER_ServerLoadModelWithParameters( + raw_server, model_name.c_str(), const_params->data(), + const_params->size()); + cnt_ptr->fetch_sub(1, std::memory_order_acq_rel); + evthr_defer( + ctrl_req->thread_, ControlRequestClass::ReplyCallback, ctrl_req); + }).detach(); + return; + } + + // --- Unload path: parse parameters synchronously, then offload blocking call + if (action == "unload") { + bool unload_dependents = false; + { + triton::common::TritonJson::Value control_request; + size_t buffer_len = 0; + RETURN_AND_RESPOND_IF_ERR( + req, + EVRequestToJson(req, "unload model", &control_request, &buffer_len)); + + if (buffer_len > 0) { + triton::common::TritonJson::Value params_json; + if (control_request.Find("parameters", ¶ms_json)) { + triton::common::TritonJson::Value ud_json; + if (params_json.Find("unload_dependents", &ud_json)) { + TRITONSERVER_Error* parse_err = ud_json.AsBool(&unload_dependents); + if (parse_err != nullptr) { + TRITONSERVER_Error* err = TRITONSERVER_ErrorNew( + TRITONSERVER_ErrorCode(parse_err), + (std::string("Unable to parse 'unload_dependents': ") + + 
TRITONSERVER_ErrorMessage(parse_err)) + .c_str()); + TRITONSERVER_ErrorDelete(parse_err); + RETURN_AND_RESPOND_IF_ERR(req, err); } } } } + } + + // If async disabled, fall back to synchronous (original behavior) + if (max_control_requests_ <= 0) { + TRITONSERVER_Error* err = nullptr; if (unload_dependents) { err = TRITONSERVER_ServerUnloadModelAndDependents( server_.get(), model_name.c_str()); } else { err = TRITONSERVER_ServerUnloadModel(server_.get(), model_name.c_str()); } + RETURN_AND_RESPOND_IF_ERR(req, err); + evhtp_send_reply(req, EVHTP_RES_OK); + return; } + + // Check concurrency limit. + int current = control_request_cnt_.fetch_add(1, std::memory_order_acq_rel); + if (current >= max_control_requests_) { + control_request_cnt_.fetch_sub(1, std::memory_order_acq_rel); + RETURN_AND_RESPOND_WITH_ERR( + req, EVHTP_RES_SERVUNAVAIL, + "Model control request rejected: too many concurrent " + "load/unload requests"); + return; + } + + // Create the async request object. + auto* ctrl_req = new ControlRequestClass(req); + TRITONSERVER_Server* raw_server = server_.get(); + auto* cnt_ptr = &control_request_cnt_; + + std::thread([ctrl_req, raw_server, model_name, unload_dependents, + cnt_ptr]() { + if (unload_dependents) { + ctrl_req->err_ = TRITONSERVER_ServerUnloadModelAndDependents( + raw_server, model_name.c_str()); + } else { + ctrl_req->err_ = + TRITONSERVER_ServerUnloadModel(raw_server, model_name.c_str()); + } + cnt_ptr->fetch_sub(1, std::memory_order_acq_rel); + evthr_defer( + ctrl_req->thread_, ControlRequestClass::ReplyCallback, ctrl_req); + }).detach(); + return; } - RETURN_AND_RESPOND_IF_ERR(req, err); + // Unknown action — should not normally reach here evhtp_send_reply(req, EVHTP_RES_OK); } @@ -4812,12 +4901,14 @@ HTTPAPIServer::Create( const std::shared_ptr& shm_manager, const int32_t port, const bool reuse_port, const std::string& address, const std::string& header_forward_pattern, const int thread_cnt, - const size_t max_input_size, const 
RestrictedFeatures& restricted_features, + const int control_request_concurrency, const size_t max_input_size, + const RestrictedFeatures& restricted_features, std::unique_ptr* http_server) { http_server->reset(new HTTPAPIServer( server, trace_manager, shm_manager, port, reuse_port, address, - header_forward_pattern, thread_cnt, max_input_size, restricted_features)); + header_forward_pattern, thread_cnt, control_request_concurrency, + max_input_size, restricted_features)); const std::string addr = address + ":" + std::to_string(port); LOG_INFO << "Started HTTPService at " << addr; @@ -4848,10 +4939,14 @@ HTTPAPIServer::Create( GetValue(options, "header_forward_pattern", &header_forward_pattern)); RETURN_IF_ERR(GetValue(options, "thread_count", &thread_count)); + // Use model_load_thread_count as concurrency limit if available + int control_concurrency = 4; // default + GetValue(options, "control_request_concurrency", &control_concurrency); + return Create( server, trace_manager, shm_manager, port, reuse_port, address, - header_forward_pattern, thread_count, HTTP_DEFAULT_MAX_INPUT_SIZE, - restricted_features, service); + header_forward_pattern, thread_count, control_concurrency, + HTTP_DEFAULT_MAX_INPUT_SIZE, restricted_features, service); } diff --git a/src/http_server.h b/src/http_server.h index c5ed0eb6de..9a5ce243c1 100644 --- a/src/http_server.h +++ b/src/http_server.h @@ -1,4 +1,4 @@ -// Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -28,6 +28,8 @@ #include #include +#include +#include #include #include #include @@ -36,6 +38,7 @@ #include #include #include +#include #include "common.h" #include "data_compressor.h" @@ -198,7 +201,8 @@ class HTTPAPIServer : public HTTPServer { const std::shared_ptr& smb_manager, const int32_t port, const bool reuse_port, const std::string& address, const std::string& header_forward_pattern, const int thread_cnt, - const size_t max_input_size, const RestrictedFeatures& restricted_apis, + const int control_request_concurrency, const size_t max_input_size, + const RestrictedFeatures& restricted_apis, std::unique_ptr* http_server); static TRITONSERVER_Error* Create( @@ -380,6 +384,66 @@ class HTTPAPIServer : public HTTPServer { evhtp_res response_code_{EVHTP_RES_OK}; }; + // Lightweight request class for model repository control operations + // (load/unload). Follows the same async pattern as InferRequestClass: + // capture the evhtp thread, pause the request, do blocking work off-thread, + // then defer the reply back via evthr_defer. + class ControlRequestClass { + public: + explicit ControlRequestClass(evhtp_request_t* req) + : req_(req), thread_(nullptr), err_(nullptr) + { + evhtp_connection_t* htpconn = evhtp_request_get_connection(req); + thread_ = htpconn->thread; + evhtp_request_pause(req); + evhtp_request_set_hook( + req_, evhtp_hook_on_request_fini, + (evhtp_hook)(void*)ControlRequestFiniHook, + reinterpret_cast(this)); + } + + ~ControlRequestClass() + { + if (req_ != nullptr) { + evhtp_request_unset_hook(req_, evhtp_hook_on_request_fini); + } + if (err_ != nullptr) { + TRITONSERVER_ErrorDelete(err_); + } + } + + // Called by evhtp when the connection is closed before we reply. + // Nulls req_ so ReplyCallback will skip the reply safely. 
+ static evhtp_res ControlRequestFiniHook(evhtp_request* req, void* arg) + { + auto* ctrl_req = reinterpret_cast(arg); + ctrl_req->req_ = nullptr; + return EVHTP_RES_OK; + } + + // Deferred onto the evhtp thread via evthr_defer. Sends the reply + // and resumes the paused request. Deletes the ControlRequestClass. + static void ReplyCallback(evthr_t* thr, void* arg, void* shared) + { + auto* ctrl_req = reinterpret_cast(arg); + evhtp_request_t* req = ctrl_req->req_; + if (req != nullptr) { + if (ctrl_req->err_ != nullptr) { + EVBufferAddErrorJson(req->buffer_out, ctrl_req->err_); + evhtp_send_reply(req, HttpCodeFromError(ctrl_req->err_)); + } else { + evhtp_send_reply(req, EVHTP_RES_OK); + } + evhtp_request_resume(req); + } + delete ctrl_req; + } + + evhtp_request_t* req_; + evthr_t* thread_; + TRITONSERVER_Error* err_; + }; + class GenerateRequestClass : public InferRequestClass { public: explicit GenerateRequestClass( @@ -500,6 +564,7 @@ class HTTPAPIServer : public HTTPServer { const std::shared_ptr& shm_manager, const int32_t port, const bool reuse_port, const std::string& address, const std::string& header_forward_pattern, const int thread_cnt, + const int control_request_concurrency, const size_t max_input_size = HTTP_DEFAULT_MAX_INPUT_SIZE, const RestrictedFeatures& restricted_apis = {}); @@ -717,6 +782,13 @@ class HTTPAPIServer : public HTTPServer { RestrictedFeatures restricted_apis_{}; bool RespondIfRestricted( evhtp_request_t* req, const Restriction& restriction); + + // Maximum number of concurrent model repository control operations + // (load/unload) that can run off the evhtp worker threads. Each request + // spawns a detached thread; this limit prevents thread explosion. + // Sourced from --model-load-thread-count. 0 disables async (synchronous). 
+ int max_control_requests_; + std::atomic control_request_cnt_{0}; }; }} // namespace triton::server diff --git a/src/main.cc b/src/main.cc index 597a97f17c..8dedd42fca 100644 --- a/src/main.cc +++ b/src/main.cc @@ -138,7 +138,9 @@ StartHttpService( server, trace_manager, shm_manager, g_triton_params.http_port_, g_triton_params.reuse_http_port_, g_triton_params.http_address_, g_triton_params.http_forward_header_pattern_, - g_triton_params.http_thread_cnt_, g_triton_params.http_max_input_size_, + g_triton_params.http_thread_cnt_, + g_triton_params.model_load_thread_count_, + g_triton_params.http_max_input_size_, g_triton_params.http_restricted_apis_, service); if (err == nullptr) { err = (*service)->Start(); From a1b98617c939dc57bcbdc3110383412f75356c37 Mon Sep 17 00:00:00 2001 From: Huy Tran <158990013+itsnothuy@users.noreply.github.com> Date: Tue, 14 Apr 2026 02:40:49 -0700 Subject: [PATCH 2/3] Update src/http_server.h Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/http_server.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/http_server.h b/src/http_server.h index 9f6e124c27..60a4cc92bb 100644 --- a/src/http_server.h +++ b/src/http_server.h @@ -789,7 +789,8 @@ class HTTPAPIServer : public HTTPServer { // Maximum number of concurrent model repository control operations // (load/unload) that can run off the evhtp worker threads. Each request // spawns a detached thread; this limit prevents thread explosion. - // Sourced from --model-load-thread-count. 0 disables async (synchronous). + // Configured via control_request_concurrency; main.cc wires it from + // --model-load-thread-count. 0 disables async (synchronous). 
int max_control_requests_; std::atomic control_request_cnt_{0}; }; From 2a2c3f766f39983190ea9d9fd8ebd6af0970021e Mon Sep 17 00:00:00 2001 From: itsnothuy Date: Wed, 15 Apr 2026 00:30:58 +0700 Subject: [PATCH 3/3] fix: address Copilot review comments - Move ControlRequestClass::ReplyCallback definition to .cc file (was using anonymous namespace symbols from header - wouldn't compile) - Remove unused include from http_server.h - Fix inaccurate comment about std::function copyability - Wrap std::thread creation in try/catch for both load and unload paths to handle std::system_error if thread creation fails under resource exhaustion (decrement counter, resume request, cleanup ctrl_req) Fixes #8635 --- src/http_server.cc | 94 ++++++++++++++++++++++++++++++++++------------ src/http_server.h | 18 +-------- 2 files changed, 72 insertions(+), 40 deletions(-) diff --git a/src/http_server.cc b/src/http_server.cc index 44d1268822..d1ca890137 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -1481,8 +1481,9 @@ HTTPAPIServer::HandleRepositoryControl( delete params; } }; - // Use shared_ptr so the lambda closure can be copyable (required by - // std::function) while still using a custom deleter. + // Use shared_ptr for these containers because they are captured by value + // in the lambda passed to std::thread, ensuring the data outlives the + // thread and is properly cleaned up when all references are released. std::shared_ptr> params( new std::vector(), param_deleter); // local variables to store the decoded file content, the data must @@ -1567,15 +1568,29 @@ HTTPAPIServer::HandleRepositoryControl( // Spawn a detached thread for the blocking call. Thread creation // overhead is acceptable for model load/unload operations. 
- std::thread([ctrl_req, raw_server, model_name, params, binary_files, - const_params, cnt_ptr]() { - ctrl_req->err_ = TRITONSERVER_ServerLoadModelWithParameters( - raw_server, model_name.c_str(), const_params->data(), - const_params->size()); - cnt_ptr->fetch_sub(1, std::memory_order_acq_rel); - evthr_defer( - ctrl_req->thread_, ControlRequestClass::ReplyCallback, ctrl_req); - }).detach(); + // Wrap in try/catch to handle std::system_error if thread creation fails. + try { + std::thread([ctrl_req, raw_server, model_name, params, binary_files, + const_params, cnt_ptr]() { + ctrl_req->err_ = TRITONSERVER_ServerLoadModelWithParameters( + raw_server, model_name.c_str(), const_params->data(), + const_params->size()); + cnt_ptr->fetch_sub(1, std::memory_order_acq_rel); + evthr_defer( + ctrl_req->thread_, ControlRequestClass::ReplyCallback, ctrl_req); + }).detach(); + } + catch (const std::system_error& e) { + // Thread creation failed — clean up and return error synchronously. + control_request_cnt_.fetch_sub(1, std::memory_order_acq_rel); + // ctrl_req has already paused the request; we must resume it. 
+ evhtp_request_resume(req); + delete ctrl_req; + RETURN_AND_RESPOND_WITH_ERR( + req, EVHTP_RES_SERVERR, + (std::string("Failed to spawn load thread: ") + e.what()).c_str()); + return; + } return; } @@ -1639,19 +1654,32 @@ HTTPAPIServer::HandleRepositoryControl( TRITONSERVER_Server* raw_server = server_.get(); auto* cnt_ptr = &control_request_cnt_; - std::thread([ctrl_req, raw_server, model_name, unload_dependents, - cnt_ptr]() { - if (unload_dependents) { - ctrl_req->err_ = TRITONSERVER_ServerUnloadModelAndDependents( - raw_server, model_name.c_str()); - } else { - ctrl_req->err_ = - TRITONSERVER_ServerUnloadModel(raw_server, model_name.c_str()); - } - cnt_ptr->fetch_sub(1, std::memory_order_acq_rel); - evthr_defer( - ctrl_req->thread_, ControlRequestClass::ReplyCallback, ctrl_req); - }).detach(); + // Wrap in try/catch to handle std::system_error if thread creation fails. + try { + std::thread([ctrl_req, raw_server, model_name, unload_dependents, + cnt_ptr]() { + if (unload_dependents) { + ctrl_req->err_ = TRITONSERVER_ServerUnloadModelAndDependents( + raw_server, model_name.c_str()); + } else { + ctrl_req->err_ = + TRITONSERVER_ServerUnloadModel(raw_server, model_name.c_str()); + } + cnt_ptr->fetch_sub(1, std::memory_order_acq_rel); + evthr_defer( + ctrl_req->thread_, ControlRequestClass::ReplyCallback, ctrl_req); + }).detach(); + } + catch (const std::system_error& e) { + // Thread creation failed — clean up and return error synchronously. 
+ control_request_cnt_.fetch_sub(1, std::memory_order_acq_rel); + evhtp_request_resume(req); + delete ctrl_req; + RETURN_AND_RESPOND_WITH_ERR( + req, EVHTP_RES_SERVERR, + (std::string("Failed to spawn unload thread: ") + e.what()).c_str()); + return; + } return; } @@ -3951,6 +3979,24 @@ HTTPAPIServer::HandleInfer( request_release_payload.release(); } +void +HTTPAPIServer::ControlRequestClass::ReplyCallback( + evthr_t* thr, void* arg, void* shared) +{ + auto* ctrl_req = reinterpret_cast(arg); + evhtp_request_t* req = ctrl_req->req_; + if (req != nullptr) { + if (ctrl_req->err_ != nullptr) { + EVBufferAddErrorJson(req->buffer_out, ctrl_req->err_); + evhtp_send_reply(req, HttpCodeFromError(ctrl_req->err_)); + } else { + evhtp_send_reply(req, EVHTP_RES_OK); + } + evhtp_request_resume(req); + } + delete ctrl_req; +} + void HTTPAPIServer::InferRequestClass::ReplyCallback( evthr_t* thr, void* arg, void* shared) diff --git a/src/http_server.h b/src/http_server.h index 60a4cc92bb..6be968fdd3 100644 --- a/src/http_server.h +++ b/src/http_server.h @@ -29,7 +29,6 @@ #include #include -#include #include #include #include @@ -423,21 +422,8 @@ class HTTPAPIServer : public HTTPServer { // Deferred onto the evhtp thread via evthr_defer. Sends the reply // and resumes the paused request. Deletes the ControlRequestClass. - static void ReplyCallback(evthr_t* thr, void* arg, void* shared) - { - auto* ctrl_req = reinterpret_cast(arg); - evhtp_request_t* req = ctrl_req->req_; - if (req != nullptr) { - if (ctrl_req->err_ != nullptr) { - EVBufferAddErrorJson(req->buffer_out, ctrl_req->err_); - evhtp_send_reply(req, HttpCodeFromError(ctrl_req->err_)); - } else { - evhtp_send_reply(req, EVHTP_RES_OK); - } - evhtp_request_resume(req); - } - delete ctrl_req; - } + // Defined in http_server.cc because it uses file-local helper functions. + static void ReplyCallback(evthr_t* thr, void* arg, void* shared); evhtp_request_t* req_; evthr_t* thread_;