
Commit 010b708

Feature(MInference): update experiments details
1 parent abc5aba commit 010b708

4 files changed: +32 -7 lines changed


experiments/README.md

Lines changed: 20 additions & 3 deletions
````diff
@@ -1,3 +1,15 @@
+# Experimemts
+
+- [Offline Kernel-Aware Sparse Pattern Search](#Offline-Kernel-Aware-Sparse-Pattern-Search)
+- [MInference Benchmark Experiments](#MInference-Benchmark-Experiments)
+  - [End-to-End Benchmark](#End-to-End-Benchmark)
+  - [Micro-Benchmark](#Micro-Benchmark)
+- [MInference Downstream Tasks Experiments](#MInference-Downstream-Tasks-Experiments)
+  - [InfiniteBench](#InfiniteBench)
+  - [RULER](#RULER)
+  - [PPL](#PPL)
+  - [Needle in A Haystack](#Needle-in-A-Haystack)
+
 ## Offline Kernel-Aware Sparse Pattern Search

 You can use the following scripts to search for the optimal head sparse pattern:
@@ -19,7 +31,8 @@ python run_infinitebench.py \

 ## MInference Benchmark Experiments

-Note: All experiments were run on a single A100 GPU with 80GB of VRAM.
+> [!NOTE]
+> All experiments were run on a single A100 GPU with 80GB of VRAM.

 Environment parameters:
 - CUDA 12.3
@@ -62,12 +75,16 @@ python experiments/benchmarks/benchmark_e2e.py --run_benchmark
 1000K 1765.56387 107.85639 328.58551 179.12031
 ```

+> [!TIP]
+> Based on our tests, **a single A100 can support up to 1.8M** context prompts during the pre-filling stage using LLaMA-3-8B-4M with **bf16**.
+
 ### Micro-Benchmark


 ## MInference Downstream Tasks Experiments

-Note: All of these experiments were run on one A100 GPUs with 80GB of VRAM. You may need to modify commands to fit your own computing environment (e.g., changing the batch size, the max memory per GPU, the number of GPUs, etc)
+> [!NOTE]
+> All of these experiments were run on one A100 GPUs with 80GB of VRAM. You may need to modify commands to fit your own computing environment (e.g., changing the batch size, the max memory per GPU, the number of GPUs, etc)


 ### InfiniteBench

@@ -78,7 +95,7 @@ InfiniteBench consists of the following tasks: `kv_retrieval`, `longbook_choice_
 1. Run InfiniteBench with `MInference`:

 ```bash
-bash experiments/infinite_bench/run_infinitebench.sh gradientai/Llama-3-8B-Instruct-262k 128000 -1 minference
+bash experiments/infinite_bench/run_infinitebench.sh gradientai/Llama-3-8B-Instruct-262k 160000 -1 minference
 ```

 2. Experimental results
````
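
For context on the end-to-end latency numbers quoted in this hunk, here is a minimal, hedged sketch of how a pre-filling (time-to-first-token) latency measurement is typically taken; `benchmark_e2e.py`'s actual implementation is not part of this commit, and the warm-up and run counts below are illustrative assumptions.

```python
# Hedged sketch only: benchmark_e2e.py's real logic is not shown in this diff.
import time

import torch


@torch.no_grad()
def prefill_latency_ms(model, input_ids, warmup: int = 1, runs: int = 3) -> float:
    """Average pre-filling (prompt-processing) latency in milliseconds."""
    for _ in range(warmup):  # warm-up runs to trigger kernel compilation/caching
        model(input_ids)
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(runs):
        model(input_ids)
    torch.cuda.synchronize()
    return (time.time() - start) / runs * 1000
```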

experiments/infinite_bench/args.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -70,6 +70,7 @@ def parse_args() -> Namespace:
             "inf_llm",
             "flash_attn",
             "minference",
+            "minference_with_dense",
             "dilated1",
             "dilated2",
         ],
```
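
The hunk only shows the tail of the choices list, so the flag it belongs to is not visible here. Below is a hedged reconstruction of how such an argparse option is commonly declared; the `--attn_type` name and the default value are assumptions, not taken from this commit.

```python
# Hedged reconstruction: flag name and default are assumptions.
from argparse import ArgumentParser, Namespace


def parse_args() -> Namespace:
    parser = ArgumentParser()
    parser.add_argument(
        "--attn_type",
        type=str,
        choices=[
            "inf_llm",
            "flash_attn",
            "minference",
            "minference_with_dense",  # dense-fallback baseline added in this commit
            "dilated1",
            "dilated2",
        ],
        default="minference",
    )
    return parser.parse_args()
```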

minference/minference_configuration.py

Lines changed: 7 additions & 4 deletions
```diff
@@ -7,15 +7,18 @@


 class MInferenceConfig:
-    ATTENTION_TYPES = [
+    MINFERENCE_ATTENTION_TYPES = [
         "minference",
+        "vllm",
+    ]
+    STASTIC_ATTENTION_TYPES = [
         "minference_with_dense",
         "static",
         "dilated1",
         "dilated2",
         "streaming",
         "inf_llm",
-        "vllm",
+        "hf",
     ]

     def __init__(
@@ -33,7 +36,7 @@ def __init__(
     ):
         super(MInferenceConfig, self).__init__()
         assert (
-            attn_type in self.ATTENTION_TYPES
+            attn_type in self.MINFERENCE_ATTENTION_TYPES + self.STASTIC_ATTENTION_TYPES
         ), f"The attention_type {attn_type} you specified is not supported."
         self.attn_type = attn_type
         self.config_path = self.update_config_path(config_path, model_name)
@@ -46,7 +49,7 @@ def __init__(
         self.attn_kwargs = attn_kwargs

     def update_config_path(self, config_path: str, model_name: str):
-        if config_path is not None:
+        if config_path is not None or self.attn_type in self.STASTIC_ATTENTION_TYPES:
             return config_path
         assert (
             model_name in MODEL2PATH
```
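
Taken together, these changes mean that attention types in `STASTIC_ATTENTION_TYPES` (the dense and static baselines) no longer need a sparse-pattern config file: `update_config_path` now returns the given `config_path` (possibly `None`) for them instead of asserting that the model is listed in `MODEL2PATH`. A hedged usage sketch follows; the constructor keyword names beyond `attn_type`, `config_path`, and `model_name` are assumed from the lines visible above, and the model name is only an example.

```python
# Hedged sketch based only on the lines visible in this diff.
from minference.minference_configuration import MInferenceConfig

# "minference" is in MINFERENCE_ATTENTION_TYPES: with config_path=None the
# sparse-pattern config is resolved via MODEL2PATH, so the model must be listed there.
sparse_cfg = MInferenceConfig(
    attn_type="minference",
    model_name="gradientai/Llama-3-8B-Instruct-262k",  # example model name
)

# "hf" (like "minference_with_dense", "static", ...) is in STASTIC_ATTENTION_TYPES:
# update_config_path now short-circuits, so no pattern config is required.
dense_cfg = MInferenceConfig(
    attn_type="hf",
    model_name="any-model",  # not consulted for static/dense baselines
)
```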

minference/models_patch.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -6,6 +6,8 @@
 from .minference_configuration import MInferenceConfig
 from .patch import minference_patch, minference_patch_vllm, patch_hf

+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+

 class MInference:
     def __init__(
@@ -76,6 +78,8 @@ def patch_model(self, model):
                 attn_type="streaming",
                 attn_kwargs={"n_local": 3968, "n_init": 128, **self.config.attn_kwargs},
             )
+        elif self.config.attn_type == "hf":
+            pass
+        elif self.config.attn_type == "inf_llm":
             model = patch_hf(
                 model,
```
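
The new `"hf"` branch is deliberately a no-op, so that attention type yields an unpatched, dense HuggingFace baseline while sharing the same entry point as the sparse variants. A hedged end-to-end sketch is below; only the `patch_model` name is visible in this hunk, so the keyword arguments and the assumption that it returns the (possibly unmodified) model are illustrative, not confirmed by this commit.

```python
# Hedged sketch: patch_model's signature and return value are assumed here.
import torch
from transformers import AutoModelForCausalLM

from minference.models_patch import MInference

model_name = "gradientai/Llama-3-8B-Instruct-262k"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

# attn_type="hf" hits the new `pass` branch: the model is left untouched and
# serves as the dense baseline; "minference" would rewrite its attention instead.
baseline = MInference(attn_type="hf", model_name=model_name).patch_model(model)
```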
