From acdaac104db5d3b647b1220be8ff60d1477a4965 Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Tue, 5 Mar 2024 13:43:34 -0800 Subject: [PATCH 1/3] updating with torch compile option for baseline --- .../rayserve/tritonserver_deployment.py | 23 +++++++++++++++++-- Triton_Inference_Server_Python_API/run.sh | 2 +- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py index 1610b583..ec5f8a56 100644 --- a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py +++ b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py @@ -31,6 +31,7 @@ import numpy import requests import torch +import torch_tensorrt import tritonserver from fastapi import FastAPI from PIL import Image @@ -54,7 +55,9 @@ def _print_heading(message): @serve.deployment(ray_actor_options={"num_gpus": 1}) @serve.ingress(app) class BaseDeployment: - def __init__(self): + def __init__(self, use_torch_compile=True): + import torch_tensorrt + self._image_size = 512 self._model_id = "runwayml/stable-diffusion-v1-5" from diffusers import StableDiffusionPipeline @@ -63,6 +66,19 @@ def __init__(self): self._model_id, revision="fp16", torch_dtype=torch.float16 ) self._pipeline = self._pipeline.to("cuda") + if use_torch_compile: + backend = "torch_tensorrt" + print("compiling") + print(torch._dynamo.list_backends()) + self._pipeline.unet = torch.compile( + self._pipeline.unet, + backend=backend, + options={ + "truncate_long_and_double": True, + "precision": torch.float16, + }, + dynamic=False, + ) @app.get("/generate") def generate(self, prompt: str, filename: Optional[str] = None) -> None: @@ -153,7 +169,10 @@ def tritonserver_deployment(_args): def base_deployment(_args): - return BaseDeployment.bind() + if "use_torch_compile" in _args: + return BaseDeployment.bind(use_torch_compile=True) + else: + return BaseDeployment.bind(use_torch_compile=False) if __name__ == "__main__": diff --git a/Triton_Inference_Server_Python_API/run.sh b/Triton_Inference_Server_Python_API/run.sh index c465e7f5..040e1610 100755 --- a/Triton_Inference_Server_Python_API/run.sh +++ b/Triton_Inference_Server_Python_API/run.sh @@ -137,7 +137,7 @@ fi $RUN_PREFIX mkdir -p backend/diffusion -$RUN_PREFIX docker run --gpus all -it --rm --network host --shm-size=10G --ulimit memlock=-1 --ulimit stack=67108864 -eHF_TOKEN -eGITHUB_TOKEN -eAWS_DEFAULT_REGION -eAWS_ACCESS_KEY_ID -eAWS_SECRET_ACCESS_KEY -eS3_BUCKET_URL -v ${SOURCE_DIR}:/workspace -v${SOURCE_DIR}/.cache/huggingface:/root/.cache/huggingface -w /workspace -v${SOURCE_DIR}/../Popular_Models_Guide/StableDiffusion/backend/diffusion:/opt/tritonserver/backends/diffusion $IMAGE +$RUN_PREFIX docker run --gpus all -it --rm --network host --shm-size=10G --ulimit memlock=-1 --ulimit stack=67108864 -eHF_TOKEN -eGITHUB_TOKEN -eAWS_DEFAULT_REGION -eAWS_ACCESS_KEY_ID -eAWS_SECRET_ACCESS_KEY -eS3_BUCKET_URL -v ${SOURCE_DIR}:/workspace -v${SOURCE_DIR}/.cache/huggingface:/root/.cache/huggingface -w /workspace -v${SOURCE_DIR}/../Popular_Models_Guide/StableDiffusion/backend/diffusion:/opt/tritonserver/backends/diffusion -v/tmp:/tmp $IMAGE { set +x; } 2>/dev/null From 452c70f76727a2a8bff204bda941b904df95deda Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Tue, 5 Mar 2024 14:07:43 -0800 Subject: [PATCH 2/3] updated to build model on initialization --- .../examples/rayserve/tritonserver_deployment.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py index ec5f8a56..86d8b77a 100644 --- a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py +++ b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py @@ -79,6 +79,7 @@ def __init__(self, use_torch_compile=True): }, dynamic=False, ) + self.generate("temp") @app.get("/generate") def generate(self, prompt: str, filename: Optional[str] = None) -> None: From e5189236e743c95e3266c5846f65cf6edeff64e4 Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Tue, 5 Mar 2024 16:38:48 -0800 Subject: [PATCH 3/3] moving torch tensorrt import --- .../examples/rayserve/tritonserver_deployment.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py index 86d8b77a..77bcdb32 100644 --- a/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py +++ b/Triton_Inference_Server_Python_API/examples/rayserve/tritonserver_deployment.py @@ -31,7 +31,6 @@ import numpy import requests import torch -import torch_tensorrt import tritonserver from fastapi import FastAPI from PIL import Image @@ -56,8 +55,6 @@ def _print_heading(message): @serve.ingress(app) class BaseDeployment: def __init__(self, use_torch_compile=True): - import torch_tensorrt - self._image_size = 512 self._model_id = "runwayml/stable-diffusion-v1-5" from diffusers import StableDiffusionPipeline @@ -67,6 +64,8 @@ def __init__(self, use_torch_compile=True): ) self._pipeline = self._pipeline.to("cuda") if use_torch_compile: + import torch_tensorrt + backend = "torch_tensorrt" print("compiling") print(torch._dynamo.list_backends())