From acdaac104db5d3b647b1220be8ff60d1477a4965 Mon Sep 17 00:00:00 2001
From: nnshah1 <neelays@nvidia.com>
Date: Tue, 5 Mar 2024 13:43:34 -0800
Subject: [PATCH 1/3] updating with torch compile option for baseline

---
 .../rayserve/tritonserver_deployment.py       | 23 +++++++++++++++++--
 Triton_Inference_Server_Python_API/run.sh     |  2 +-
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py
index 1610b583..ec5f8a56 100644
--- a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py
+++ b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py
@@ -31,6 +31,7 @@
 import numpy
 import requests
 import torch
+import torch_tensorrt
 import tritonserver
 from fastapi import FastAPI
 from PIL import Image
@@ -54,7 +55,9 @@ def _print_heading(message):
 @serve.deployment(ray_actor_options={"num_gpus": 1})
 @serve.ingress(app)
 class BaseDeployment:
-    def __init__(self):
+    def __init__(self, use_torch_compile=True):
+        import torch_tensorrt
+
         self._image_size = 512
         self._model_id = "runwayml/stable-diffusion-v1-5"
         from diffusers import StableDiffusionPipeline
@@ -63,6 +66,19 @@ def __init__(self):
             self._model_id, revision="fp16", torch_dtype=torch.float16
         )
         self._pipeline = self._pipeline.to("cuda")
+        if use_torch_compile:
+            backend = "torch_tensorrt"
+            print("compiling")
+            print(torch._dynamo.list_backends())
+            self._pipeline.unet = torch.compile(
+                self._pipeline.unet,
+                backend=backend,
+                options={
+                    "truncate_long_and_double": True,
+                    "precision": torch.float16,
+                },
+                dynamic=False,
+            )
 
     @app.get("/generate")
     def generate(self, prompt: str, filename: Optional[str] = None) -> None:
@@ -153,7 +169,10 @@ def tritonserver_deployment(_args):
 
 
 def base_deployment(_args):
-    return BaseDeployment.bind()
+    # torch.compile is enabled whenever "use_torch_compile" is in _args; the
+    # associated value (if any) is never inspected -- NOTE(review): confirm.
+    compile_requested = "use_torch_compile" in _args
+    return BaseDeployment.bind(use_torch_compile=compile_requested)
 
 
 if __name__ == "__main__":
diff --git a/Triton_Inference_Server_Python_API/run.sh b/Triton_Inference_Server_Python_API/run.sh
index c465e7f5..040e1610 100755
--- a/Triton_Inference_Server_Python_API/run.sh
+++ b/Triton_Inference_Server_Python_API/run.sh
@@ -137,7 +137,7 @@ fi
 
 $RUN_PREFIX mkdir -p backend/diffusion
 
-$RUN_PREFIX docker run --gpus all -it --rm --network host --shm-size=10G --ulimit memlock=-1 --ulimit stack=67108864 -eHF_TOKEN -eGITHUB_TOKEN -eAWS_DEFAULT_REGION -eAWS_ACCESS_KEY_ID -eAWS_SECRET_ACCESS_KEY -eS3_BUCKET_URL -v ${SOURCE_DIR}:/workspace -v${SOURCE_DIR}/.cache/huggingface:/root/.cache/huggingface -w /workspace -v${SOURCE_DIR}/../Popular_Models_Guide/StableDiffusion/backend/diffusion:/opt/tritonserver/backends/diffusion $IMAGE
+$RUN_PREFIX docker run --gpus all -it --rm --network host --shm-size=10G --ulimit memlock=-1 --ulimit stack=67108864 -eHF_TOKEN -eGITHUB_TOKEN -eAWS_DEFAULT_REGION -eAWS_ACCESS_KEY_ID -eAWS_SECRET_ACCESS_KEY -eS3_BUCKET_URL -v ${SOURCE_DIR}:/workspace -v${SOURCE_DIR}/.cache/huggingface:/root/.cache/huggingface -w /workspace -v${SOURCE_DIR}/../Popular_Models_Guide/StableDiffusion/backend/diffusion:/opt/tritonserver/backends/diffusion -v/tmp:/tmp $IMAGE
 
 { set +x; } 2>/dev/null
 

From 452c70f76727a2a8bff204bda941b904df95deda Mon Sep 17 00:00:00 2001
From: nnshah1 <neelays@nvidia.com>
Date: Tue, 5 Mar 2024 14:07:43 -0800
Subject: [PATCH 2/3] updated to build model on initialization

---
 .../examples/rayserve/tritonserver_deployment.py                 | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py
index ec5f8a56..86d8b77a 100644
--- a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py
+++ b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py
@@ -79,6 +79,7 @@ def __init__(self, use_torch_compile=True):
                 },
                 dynamic=False,
             )
+            self.generate("temp")
 
     @app.get("/generate")
     def generate(self, prompt: str, filename: Optional[str] = None) -> None:

From e5189236e743c95e3266c5846f65cf6edeff64e4 Mon Sep 17 00:00:00 2001
From: nnshah1 <neelays@nvidia.com>
Date: Tue, 5 Mar 2024 16:38:48 -0800
Subject: [PATCH 3/3] moving torch tensorrt import

---
 .../examples/rayserve/tritonserver_deployment.py             | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py
index 86d8b77a..77bcdb32 100644
--- a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py
+++ b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py
@@ -31,7 +31,6 @@
 import numpy
 import requests
 import torch
-import torch_tensorrt
 import tritonserver
 from fastapi import FastAPI
 from PIL import Image
@@ -56,8 +55,6 @@ def _print_heading(message):
 @serve.ingress(app)
 class BaseDeployment:
     def __init__(self, use_torch_compile=True):
-        import torch_tensorrt
-
         self._image_size = 512
         self._model_id = "runwayml/stable-diffusion-v1-5"
         from diffusers import StableDiffusionPipeline
@@ -67,6 +64,8 @@ def __init__(self, use_torch_compile=True):
         )
         self._pipeline = self._pipeline.to("cuda")
         if use_torch_compile:
+            import torch_tensorrt
+
             backend = "torch_tensorrt"
             print("compiling")
             print(torch._dynamo.list_backends())