@@ -1,4 +1,3 @@
-from argparse import ArgumentParser
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import deepspeed
 import math
@@ -7,23 +6,8 @@
 import time
 from utils import DSPipeline, Performance
 from deepspeed.runtime.utils import see_memory_usage
+from arguments import parser
 
-parser = ArgumentParser()
-
-parser.add_argument("--model", required=True, type=str, help="model name")
-parser.add_argument("--checkpoint_path", required=False, default=None, type=str, help="model checkpoint path")
-parser.add_argument("--save_mp_checkpoint_path", required=False, default=None, type=str, help="save path to store the new model checkpoint")
-parser.add_argument("--batch_size", default=1, type=int, help="batch size")
-parser.add_argument("--dtype", default="float16", type=str, choices=["float32", "float16", "int8"], help="data type")
-parser.add_argument("--hf_baseline", action='store_true', help="disable DeepSpeed inference")
-parser.add_argument("--use_kernel", action='store_true', help="enable kernel injection")
-parser.add_argument("--max_tokens", default=1024, type=int, help="maximum tokens used for the text-generation KV-cache")
-parser.add_argument("--max_new_tokens", default=50, type=int, help="maximum new tokens to generate")
-parser.add_argument("--greedy", action='store_true', help="greedy generation mode")
-parser.add_argument("--use_meta_tensor", action='store_true', help="use meta tensors to initialize the model")
-parser.add_argument("--test_performance", action='store_true', help="enable latency, bandwidth, and throughput testing")
-parser.add_argument("--local_rank", type=int, default=int(os.getenv("LOCAL_RANK", "0")), help="local rank")
-parser.add_argument("--world_size", type=int, default=int(os.getenv("WORLD_SIZE", "1")), help="world size")
 args = parser.parse_args()
 
 if args.hf_baseline and args.world_size > 1:
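The new `arguments` module is not shown in this diff. A minimal sketch of what it presumably contains, reconstructed from the flags the removed inline parser defined plus a `--test_hybrid_engine` flag that the code in the next hunk reads (the exact module contents are an assumption):

```python
# arguments.py -- assumed contents, reconstructed from the removed inline parser.
# The --test_hybrid_engine flag is inferred from its use in the diff below.
import os
from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument("--model", required=True, type=str, help="model name")
parser.add_argument("--checkpoint_path", default=None, type=str, help="model checkpoint path")
parser.add_argument("--save_mp_checkpoint_path", default=None, type=str, help="save path to store the new model checkpoint")
parser.add_argument("--batch_size", default=1, type=int, help="batch size")
parser.add_argument("--dtype", default="float16", type=str, choices=["float32", "float16", "int8"], help="data type")
parser.add_argument("--hf_baseline", action="store_true", help="disable DeepSpeed inference")
parser.add_argument("--use_kernel", action="store_true", help="enable kernel injection")
parser.add_argument("--max_tokens", default=1024, type=int, help="maximum tokens used for the text-generation KV-cache")
parser.add_argument("--max_new_tokens", default=50, type=int, help="maximum new tokens to generate")
parser.add_argument("--greedy", action="store_true", help="greedy generation mode")
parser.add_argument("--use_meta_tensor", action="store_true", help="use meta tensors to initialize the model")
parser.add_argument("--test_performance", action="store_true", help="enable latency, bandwidth, and throughput testing")
parser.add_argument("--test_hybrid_engine", action="store_true", help="use the DeepSpeed Hybrid Engine instead of init_inference")
parser.add_argument("--local_rank", type=int, default=int(os.getenv("LOCAL_RANK", "0")), help="local rank")
parser.add_argument("--world_size", type=int, default=int(os.getenv("WORLD_SIZE", "1")), help="world size")
```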
@@ -51,16 +35,23 @@
 else:
     ds_kwargs = dict()
 
-if not args.hf_baseline:
-    pipe.model = deepspeed.init_inference(pipe.model,
+# Use the DeepSpeed Hybrid Engine for inference
+if args.test_hybrid_engine:
+    ds_config = {"train_batch_size": args.batch_size, "fp16": {"enabled": data_type == torch.half}, "hybrid_engine": {"enabled": True}}
+    pipe.model, *_ = deepspeed.initialize(model=pipe.model, config=ds_config)
+    pipe.model.eval()
+# Otherwise, if not running the HuggingFace baseline, use the DeepSpeed Inference Engine
+else:
+    if not args.hf_baseline:
+        pipe.model = deepspeed.init_inference(pipe.model,
                                               dtype=data_type,
                                               mp_size=args.world_size,
                                               replace_with_kernel_inject=args.use_kernel,
                                               max_tokens=args.max_tokens,
                                               save_mp_checkpoint_path=args.save_mp_checkpoint_path,
                                               **ds_kwargs
                                               )
-
+
 if args.local_rank == 0:
     see_memory_usage("after init_inference", True)
 
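For context on the new Hybrid Engine path: `deepspeed.initialize` returns an `(engine, optimizer, dataloader, lr_scheduler)` tuple, which is why the added line unpacks it as `pipe.model, *_`. Below is a minimal standalone sketch of the same setup; the model name and batch size are placeholder assumptions, not values from this commit:

```python
# Minimal sketch of the Hybrid Engine path above, outside the DSPipeline wrapper.
# Model name and batch size are placeholder assumptions.
import torch
import deepspeed
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype=torch.half)
ds_config = {
    "train_batch_size": 1,               # required by initialize(), even for inference-only use
    "fp16": {"enabled": True},           # mirrors data_type == torch.half in the diff
    "hybrid_engine": {"enabled": True},  # switch on the inference-optimized Hybrid Engine
}
# deepspeed.initialize returns (engine, optimizer, dataloader, lr_scheduler);
# only the engine is needed here, hence the *_ unpacking in the diff.
engine, *_ = deepspeed.initialize(model=model, config=ds_config)
engine.eval()
```

Under the DeepSpeed launcher, this path would be selected with something like `deepspeed --num_gpus 1 inference-test.py --model facebook/opt-1.3b --test_hybrid_engine` (the script name here is assumed, as it is not visible in the diff).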