
Commit d04b1b8

Merge pull request #69 from krai/llm_compressor_quantization
Llm compressor quantization
2 parents 74c39a8 + eb1c69b commit d04b1b8

File tree

6 files changed: 257 additions & 1 deletion


data_axs.json

Lines changed: 4 additions & 1 deletion
@@ -78,7 +78,10 @@
     "quantize_quark_recipe": "quantize_quark_recipe",
     "base_small_llm_loadgen_experiment": "base_small_llm_loadgen_experiment",
     "mlc_r2_downloader_recipe": "mlc_r2_downloader_recipe",
-    "mlc_r2_downloader_tool_detector": "mlc_r2_downloader_tool_detector"
+    "mlc_r2_downloader_tool_detector": "mlc_r2_downloader_tool_detector",
+    "model_bf16_to_fp16_converter": "model_bf16_to_fp16_converter",
+    "quantize_llm_compressor_recipe": "quantize_llm_compressor_recipe",
+    "dataset_small_llm_cnndm_mlperf_calibration_recipe": "dataset_small_llm_cnndm_mlperf_calibration_recipe"
     },
     "repo_name": "axs2mlperf",
     "checkout": null,
Lines changed: 10 additions & 0 deletions (new file; judging by the collection entry added above, likely the dataset_small_llm_cnndm_mlperf_calibration_recipe's data_axs.json)
@@ -0,0 +1,10 @@
{
    "_producer_rules": [
        [ [ "downloaded", "dataset_name=cnndm", "model_family=llama3_1", "variant=8b", "type=calibration" ], [["get_kernel"],["byname","downloader"],["download"]], {
            "downloading_tool_query": "shell_tool,can_download_url,tool_name=mlc_r2_downloader",
            "url": "https://inference.mlcommons-storage.org/metadata/llama3-1-8b-cnn-dailymail-calibration.uri",
            "newborn_entry_name": "downloaded_mlc_cnndm_llama3_1_8b_calibration",
            "file_path": "llama3_1_8b"
        }, [] ]
    ]
}
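
For orientation (not in the diff): the downloaded calibration file is a JSON dataset whose records carry a pre-tokenized tok_input field, which quantize.py below relies on. A minimal inspection sketch, assuming the local layout implied by file_path above and the file name appended by the quantize recipe:

    from datasets import load_dataset

    # Path assumed from "file_path" above plus the file name used in the
    # quantize recipe below (hypothetical local layout).
    calib = load_dataset(
        "json",
        data_files="llama3_1_8b/cnn_dailymail_calibration.json",
        split="train"
    )

    print(calib.column_names)          # expect "tok_input" among the columns
    print(len(calib[0]["tok_input"]))  # pre-tokenized input ids per record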
Lines changed: 24 additions & 0 deletions (new file; the converter recipe below sets rel_script_path to convert.py, so likely model_bf16_to_fp16_converter/convert.py)
@@ -0,0 +1,24 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import sys

source_model_path = sys.argv[1]
target_model_path = sys.argv[2]

# Load the source checkpoint on CPU in its stored dtype (BF16 here).
model = AutoModelForCausalLM.from_pretrained(
    source_model_path,
    torch_dtype="auto",
    device_map="cpu"
)

print("Casting to fp16...")
model = model.to(torch.float16)

print(f"Saving FP16 model to {target_model_path} ...")
model.save_pretrained(
    target_model_path,
    safe_serialization=True
)

# Copy the tokenizer files alongside the converted weights.
tokenizer = AutoTokenizer.from_pretrained(source_model_path)
tokenizer.save_pretrained(target_model_path)
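
Run outside of axs, the script takes the same two positional arguments that script_extra_params supplies in the recipe below (paths here are hypothetical):

    python convert.py /models/llama3_1-8b-bf16 /models/llama3_1-8b-fp16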
Lines changed: 53 additions & 0 deletions (new file; likely model_bf16_to_fp16_converter/data_axs.json, the recipe that drives convert.py above)
@@ -0,0 +1,53 @@
{
    "_parent_entries": [ [ "^", "byname", "python_script" ], [ "^", "byname", "entry_creator" ] ],
    "_producer_rules": [
        [ [ "converted", "method=transformers" ], [[ "get", "pipeline" ]] ]
    ],

    "pipeline": [ "^^", "execute", [[
        [ "run" ],
        [ ],
        [ "get", "stored_newborn_entry" ]
    ]] ],

    "desired_python_version": "3.11",

    "model_name": "llama3_1",
    "model_variant": "8b",

    "source_model_query": [ "^^", "substitute", [[
        "downloaded",
        "hf_model",
        [ "model_family", "#{model_name}#" ],
        [ "variant", "#{model_variant}#" ]
    ]] ],
    "source_model_entry": [ "^", "byquery",
        [[ "^^", "get", "source_model_query" ]], {}, [ "source_model_query" ]
    ],
    "source_model_path": [ "^^", "execute", [[
        [ "get", "source_model_entry" ],
        [ "get_path" ]
    ]] ],

    "target_model_path": [ "^^", "get", "newborn_entry_path" ],

    "python_deps": [
        [ "^^", "python_sync_pip_package", [[
            "python_package",
            "package_name=llm-compressor",
            "installable=git+https://github.com/vllm-project/llm-compressor.git@sa/big_model_support",
            ["desired_python_version", [ "^^", "get", "desired_python_version" ] ]
        ]] ]
    ],

    "newborn_entry_tags": [ "converted", "method=transformers" ],
    "newborn_name_template": [ "converted_#{model_name}#_#{model_variant}#_from_bf16_to_fp16_using_#{method}#" ],
    "newborn_entry_param_names": [
        "model_name",
        "model_variant"
    ],

    "rel_script_path": "convert.py",

    "script_extra_params": [ "^^", "substitute", "#{source_model_path}# #{target_model_path}#" ]
}
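
In effect, a query for converted,method=transformers resolves the downloaded HF checkpoint (downloaded,hf_model,model_family=llama3_1,variant=8b), then runs the script with both paths substituted in, roughly:

    python convert.py <source_model_path> <newborn_entry_path>

and stores the FP16 output as a new entry named by newborn_name_template. The quantization recipe below consumes that entry as its source.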
Lines changed: 83 additions & 0 deletions (new file; likely quantize_llm_compressor_recipe/data_axs.json)
@@ -0,0 +1,83 @@
{
    "_parent_entries": [ [ "^", "byname", "python_script" ], [ "^", "byname", "entry_creator" ] ],
    "_producer_rules": [
        [ [ "quantized", "method=llm_compressor" ], [[ "get", "pipeline" ]] ]
    ],

    "pipeline": [ "^^", "execute", [[
        [ "run" ],
        [ ],
        [ "get", "stored_newborn_entry" ]
    ]] ],

    "desired_python_version": "3.11",

    "model_name": "llama3_1",
    "model_variant": "8b",
    "conversion_method": "transformers",

    "source_model_query": [ "^^", "substitute", [[
        "converted",
        [ "method", "#{conversion_method}#" ],
        [ "model_name", "#{model_name}#" ],
        [ "model_variant", "#{model_variant}#" ]
    ]] ],
    "source_model_entry": [ "^", "byquery",
        [[ "^^", "get", "source_model_query" ]], {}, [ "source_model_query" ]
    ],
    "source_model_path": [ "^^", "execute", [[
        [ "get", "source_model_entry" ],
        [ "get_path" ]
    ]] ],

    "target_model_path": [ "^^", "get", "newborn_entry_path" ],

    "dataset_name": "cnndm",
    "calib_dataset_query": [ "^^", "substitute", [[
        "downloaded",
        [ "dataset_name", "#{dataset_name}#" ],
        [ "model_family", "#{model_name}#" ],
        [ "variant", "#{model_variant}#" ],
        [ "type", "calibration" ]
    ]] ],
    "calib_dataset_entry": [ "^", "byquery",
        [[ "^^", "get", "calib_dataset_query" ]], {}, [ "calib_dataset_query" ]
    ],
    "calib_dataset_path": [ "^^", "execute", [[
        [ "get", "calib_dataset_entry" ],
        [ "get_path" ],
        [ "__add__", "/cnn_dailymail_calibration.json" ]
    ]] ],

    "num_gpus": 1,
    "max_sequence_length": 4096,
    "num_calibration_samples": 1000,

    "python_deps": [
        [ "^^", "python_sync_pip_package", [[
            "python_package",
            "package_name=llm-compressor",
            "installable=git+https://github.com/vllm-project/llm-compressor.git@sa/big_model_support",
            ["desired_python_version", [ "^^", "get", "desired_python_version" ] ]
        ]] ],
        [ "^^", "python_sync_pip_package", [[
            "python_package",
            "package_name=compressed-tensors",
            "package_version=0.5.0",
            ["desired_python_version", [ "^^", "get", "desired_python_version" ] ]
        ]] ]
    ],

    "newborn_entry_tags": [ "quantized", "method=llm_compressor" ],
    "newborn_name_template": [ "quantized_#{model_name}#_#{model_variant}#_using_#{method}#" ],
    "newborn_entry_param_names": [
        "model_name",
        "model_variant"
    ],

    "rel_script_path": "quantize.py",

    "script_extra_params": [ "^^", "substitute",
        "#{source_model_path}# #{target_model_path}# #{calib_dataset_path}# #{num_gpus}# #{max_sequence_length}# #{num_calibration_samples}#"
    ]
}
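
Tracing script_extra_params, quantize.py below receives six positional arguments: source model path, target (newborn entry) path, calibration JSON path, num_gpus=1, max_sequence_length=4096, and num_calibration_samples=1000. A standalone equivalent would look roughly like this (paths hypothetical):

    python quantize.py /models/llama3_1-8b-fp16 /models/llama3_1-8b-fp8 \
        llama3_1_8b/cnn_dailymail_calibration.json 1 4096 1000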
Lines changed: 83 additions & 0 deletions (new file; likely quantize_llm_compressor_recipe/quantize.py, per rel_script_path above)
@@ -0,0 +1,83 @@
from datasets import load_dataset
from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

import sys

args = iter(sys.argv[1:])

source_model_path = next(args)
target_model_path = next(args)
calib_dataset_path = next(args)
num_gpus = int(next(args))
max_sequence_length = int(next(args))
num_calibration_samples = int(next(args))

# Static FP8 quantization of all Linear layers (except the output head),
# applied to both weights and input activations, with per-tensor scales.
recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            ignore: ["lm_head"]
            config_groups:
                group_0:
                    weights:
                        num_bits: 8
                        type: float
                        strategy: tensor
                        dynamic: false
                        symmetric: true
                    input_activations:
                        num_bits: 8
                        type: float
                        strategy: tensor
                        dynamic: false
                        symmetric: true
                    targets: ["Linear"]
"""

# Spread the model across the available GPUs, offloading any remainder.
device_map = calculate_offload_device_map(
    source_model_path,
    reserve_for_hessians=False,
    num_gpus=num_gpus,
    torch_dtype="auto"
)

model = SparseAutoModelForCausalLM.from_pretrained(
    source_model_path,
    torch_dtype="auto",
    device_map=device_map
)

tokenizer = AutoTokenizer.from_pretrained(source_model_path)

calib_dataset = load_dataset(
    "json",
    data_files=calib_dataset_path,
    split="train"
)

# The MLPerf calibration file is pre-tokenized: reuse its "tok_input" ids
# directly and attend to every token.
def convert_calib_dataset(sample):
    input_ids = sample["tok_input"]
    return {
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids)
    }

calib_dataset = calib_dataset.map(
    convert_calib_dataset,
    remove_columns=calib_dataset.column_names
)

num_calibration_samples = min(num_calibration_samples, len(calib_dataset))

# Calibrate and quantize in one shot, saving in compressed-tensors format.
oneshot(
    model=model,
    output_dir=target_model_path,
    dataset=calib_dataset,
    recipe=recipe,
    max_seq_length=max_sequence_length,
    num_calibration_samples=num_calibration_samples,
    save_compressed=True,
)
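
Not part of the diff: a quick sanity check on the saved output. compressed-tensors normally records the quantization scheme in the model's config.json, though the exact key has varied across versions, so treat this as a sketch:

    import json, os, sys

    target_model_path = sys.argv[1]

    with open(os.path.join(target_model_path, "config.json")) as f:
        cfg = json.load(f)

    # Key name is version-dependent (assumption): newer compressed-tensors
    # writes "quantization_config", some older versions "compression_config".
    print(cfg.get("quantization_config") or cfg.get("compression_config"))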
