
Commit d04b1b8

Merge pull request #69 from krai/llm_compressor_quantization
Llm compressor quantization
2 parents 74c39a8 + eb1c69b commit d04b1b8

File tree

6 files changed: 257 additions & 1 deletion


data_axs.json

Lines changed: 4 additions & 1 deletion
@@ -78,7 +78,10 @@
     "quantize_quark_recipe": "quantize_quark_recipe",
     "base_small_llm_loadgen_experiment": "base_small_llm_loadgen_experiment",
     "mlc_r2_downloader_recipe": "mlc_r2_downloader_recipe",
-    "mlc_r2_downloader_tool_detector": "mlc_r2_downloader_tool_detector"
+    "mlc_r2_downloader_tool_detector": "mlc_r2_downloader_tool_detector",
+    "model_bf16_to_fp16_converter": "model_bf16_to_fp16_converter",
+    "quantize_llm_compressor_recipe": "quantize_llm_compressor_recipe",
+    "dataset_small_llm_cnndm_mlperf_calibration_recipe": "dataset_small_llm_cnndm_mlperf_calibration_recipe"
     },
     "repo_name": "axs2mlperf",
     "checkout": null,
Lines changed: 10 additions & 0 deletions (new file; judging by the collection entry added above, likely the dataset_small_llm_cnndm_mlperf_calibration_recipe's data_axs.json)
@@ -0,0 +1,10 @@
{
    "_producer_rules": [
        [ [ "downloaded", "dataset_name=cnndm", "model_family=llama3_1", "variant=8b", "type=calibration" ], [["get_kernel"],["byname","downloader"],["download"]], {
            "downloading_tool_query": "shell_tool,can_download_url,tool_name=mlc_r2_downloader",
            "url": "https://inference.mlcommons-storage.org/metadata/llama3-1-8b-cnn-dailymail-calibration.uri",
            "newborn_entry_name": "downloaded_mlc_cnndm_llama3_1_8b_calibration",
            "file_path": "llama3_1_8b"
        }, [] ]
    ]
}
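
For orientation (not in the diff): the downloaded calibration file is a JSON dataset whose records carry a pre-tokenized tok_input field, which quantize.py below relies on. A minimal inspection sketch, assuming the local layout implied by file_path above and the file name appended by the quantize recipe:

    from datasets import load_dataset

    # Path assumed from "file_path" above plus the file name used in the
    # quantize recipe below (hypothetical local layout).
    calib = load_dataset(
        "json",
        data_files="llama3_1_8b/cnn_dailymail_calibration.json",
        split="train"
    )

    print(calib.column_names)          # expect "tok_input" among the columns
    print(len(calib[0]["tok_input"]))  # pre-tokenized input ids per record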
Lines changed: 24 additions & 0 deletions (new file; the converter recipe below sets rel_script_path to convert.py, so likely model_bf16_to_fp16_converter/convert.py)
@@ -0,0 +1,24 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import sys

source_model_path = sys.argv[1]
target_model_path = sys.argv[2]

# Load the source checkpoint on CPU in its stored dtype (BF16 here).
model = AutoModelForCausalLM.from_pretrained(
    source_model_path,
    torch_dtype="auto",
    device_map="cpu"
)

print("Casting to fp16...")
model = model.to(torch.float16)

print(f"Saving FP16 model to {target_model_path} ...")
model.save_pretrained(
    target_model_path,
    safe_serialization=True
)

# Copy the tokenizer files alongside the converted weights.
tokenizer = AutoTokenizer.from_pretrained(source_model_path)
tokenizer.save_pretrained(target_model_path)
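
Run outside of axs, the script takes the same two positional arguments that script_extra_params supplies in the recipe below (paths here are hypothetical):

    python convert.py /models/llama3_1-8b-bf16 /models/llama3_1-8b-fp16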
Lines changed: 53 additions & 0 deletions (new file; likely model_bf16_to_fp16_converter/data_axs.json, the recipe that drives convert.py above)
@@ -0,0 +1,53 @@
{
    "_parent_entries": [ [ "^", "byname", "python_script" ], [ "^", "byname", "entry_creator" ] ],
    "_producer_rules": [
        [ [ "converted", "method=transformers" ], [[ "get", "pipeline" ]] ]
    ],

    "pipeline": [ "^^", "execute", [[
        [ "run" ],
        [ ],
        [ "get", "stored_newborn_entry" ]
    ]] ],

    "desired_python_version": "3.11",

    "model_name": "llama3_1",
    "model_variant": "8b",

    "source_model_query": [ "^^", "substitute", [[
        "downloaded",
        "hf_model",
        [ "model_family", "#{model_name}#" ],
        [ "variant", "#{model_variant}#" ]
    ]] ],
    "source_model_entry": [ "^", "byquery",
        [[ "^^", "get", "source_model_query" ]], {}, [ "source_model_query" ]
    ],
    "source_model_path": [ "^^", "execute", [[
        [ "get", "source_model_entry" ],
        [ "get_path" ]
    ]] ],

    "target_model_path": [ "^^", "get", "newborn_entry_path" ],

    "python_deps": [
        [ "^^", "python_sync_pip_package", [[
            "python_package",
            "package_name=llm-compressor",
            "installable=git+https://github.com/vllm-project/llm-compressor.git@sa/big_model_support",
            ["desired_python_version", [ "^^", "get", "desired_python_version" ] ]
        ]] ]
    ],

    "newborn_entry_tags": [ "converted", "method=transformers" ],
    "newborn_name_template": [ "converted_#{model_name}#_#{model_variant}#_from_bf16_to_fp16_using_#{method}#" ],
    "newborn_entry_param_names": [
        "model_name",
        "model_variant"
    ],

    "rel_script_path": "convert.py",

    "script_extra_params": [ "^^", "substitute", "#{source_model_path}# #{target_model_path}#" ]
}
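
In effect, a query for converted,method=transformers resolves the downloaded HF checkpoint (downloaded,hf_model,model_family=llama3_1,variant=8b), then runs the script with both paths substituted in, roughly:

    python convert.py <source_model_path> <newborn_entry_path>

and stores the FP16 output as a new entry named by newborn_name_template. The quantization recipe below consumes that entry as its source.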
Lines changed: 83 additions & 0 deletions (new file; likely quantize_llm_compressor_recipe/data_axs.json)
@@ -0,0 +1,83 @@
{
    "_parent_entries": [ [ "^", "byname", "python_script" ], [ "^", "byname", "entry_creator" ] ],
    "_producer_rules": [
        [ [ "quantized", "method=llm_compressor" ], [[ "get", "pipeline" ]] ]
    ],

    "pipeline": [ "^^", "execute", [[
        [ "run" ],
        [ ],
        [ "get", "stored_newborn_entry" ]
    ]] ],

    "desired_python_version": "3.11",

    "model_name": "llama3_1",
    "model_variant": "8b",
    "conversion_method": "transformers",

    "source_model_query": [ "^^", "substitute", [[
        "converted",
        [ "method", "#{conversion_method}#" ],
        [ "model_name", "#{model_name}#" ],
        [ "model_variant", "#{model_variant}#" ]
    ]] ],
    "source_model_entry": [ "^", "byquery",
        [[ "^^", "get", "source_model_query" ]], {}, [ "source_model_query" ]
    ],
    "source_model_path": [ "^^", "execute", [[
        [ "get", "source_model_entry" ],
        [ "get_path" ]
    ]] ],

    "target_model_path": [ "^^", "get", "newborn_entry_path" ],

    "dataset_name": "cnndm",
    "calib_dataset_query": [ "^^", "substitute", [[
        "downloaded",
        [ "dataset_name", "#{dataset_name}#" ],
        [ "model_family", "#{model_name}#" ],
        [ "variant", "#{model_variant}#" ],
        [ "type", "calibration" ]
    ]] ],
    "calib_dataset_entry": [ "^", "byquery",
        [[ "^^", "get", "calib_dataset_query" ]], {}, [ "calib_dataset_query" ]
    ],
    "calib_dataset_path": [ "^^", "execute", [[
        [ "get", "calib_dataset_entry" ],
        [ "get_path" ],
        [ "__add__", "/cnn_dailymail_calibration.json" ]
    ]] ],

    "num_gpus": 1,
    "max_sequence_length": 4096,
    "num_calibration_samples": 1000,

    "python_deps": [
        [ "^^", "python_sync_pip_package", [[
            "python_package",
            "package_name=llm-compressor",
            "installable=git+https://github.com/vllm-project/llm-compressor.git@sa/big_model_support",
            ["desired_python_version", [ "^^", "get", "desired_python_version" ] ]
        ]] ],
        [ "^^", "python_sync_pip_package", [[
            "python_package",
            "package_name=compressed-tensors",
            "package_version=0.5.0",
            ["desired_python_version", [ "^^", "get", "desired_python_version" ] ]
        ]] ]
    ],

    "newborn_entry_tags": [ "quantized", "method=llm_compressor" ],
    "newborn_name_template": [ "quantized_#{model_name}#_#{model_variant}#_using_#{method}#" ],
    "newborn_entry_param_names": [
        "model_name",
        "model_variant"
    ],

    "rel_script_path": "quantize.py",

    "script_extra_params": [ "^^", "substitute",
        "#{source_model_path}# #{target_model_path}# #{calib_dataset_path}# #{num_gpus}# #{max_sequence_length}# #{num_calibration_samples}#"
    ]
}
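
Tracing script_extra_params, quantize.py below receives six positional arguments: source model path, target (newborn entry) path, calibration JSON path, num_gpus=1, max_sequence_length=4096, and num_calibration_samples=1000. A standalone equivalent would look roughly like this (paths hypothetical):

    python quantize.py /models/llama3_1-8b-fp16 /models/llama3_1-8b-fp8 \
        llama3_1_8b/cnn_dailymail_calibration.json 1 4096 1000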
Lines changed: 83 additions & 0 deletions (new file; likely quantize_llm_compressor_recipe/quantize.py, per rel_script_path above)
@@ -0,0 +1,83 @@
from datasets import load_dataset
from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

import sys

args = iter(sys.argv[1:])

source_model_path = next(args)
target_model_path = next(args)
calib_dataset_path = next(args)
num_gpus = int(next(args))
max_sequence_length = int(next(args))
num_calibration_samples = int(next(args))

# Static FP8 quantization of all Linear layers (except the output head),
# applied to both weights and input activations, with per-tensor scales.
recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            ignore: ["lm_head"]
            config_groups:
                group_0:
                    weights:
                        num_bits: 8
                        type: float
                        strategy: tensor
                        dynamic: false
                        symmetric: true
                    input_activations:
                        num_bits: 8
                        type: float
                        strategy: tensor
                        dynamic: false
                        symmetric: true
                    targets: ["Linear"]
"""

# Spread the model across the available GPUs, offloading any remainder.
device_map = calculate_offload_device_map(
    source_model_path,
    reserve_for_hessians=False,
    num_gpus=num_gpus,
    torch_dtype="auto"
)

model = SparseAutoModelForCausalLM.from_pretrained(
    source_model_path,
    torch_dtype="auto",
    device_map=device_map
)

tokenizer = AutoTokenizer.from_pretrained(source_model_path)

calib_dataset = load_dataset(
    "json",
    data_files=calib_dataset_path,
    split="train"
)

# The MLPerf calibration file is pre-tokenized: reuse its "tok_input" ids
# directly and attend to every token.
def convert_calib_dataset(sample):
    input_ids = sample["tok_input"]
    return {
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids)
    }

calib_dataset = calib_dataset.map(
    convert_calib_dataset,
    remove_columns=calib_dataset.column_names
)

num_calibration_samples = min(num_calibration_samples, len(calib_dataset))

# Calibrate and quantize in one shot, saving in compressed-tensors format.
oneshot(
    model=model,
    output_dir=target_model_path,
    dataset=calib_dataset,
    recipe=recipe,
    max_seq_length=max_sequence_length,
    num_calibration_samples=num_calibration_samples,
    save_compressed=True,
)
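
Not part of the diff: a quick sanity check on the saved output. compressed-tensors normally records the quantization scheme in the model's config.json, though the exact key has varied across versions, so treat this as a sketch:

    import json, os, sys

    target_model_path = sys.argv[1]

    with open(os.path.join(target_model_path, "config.json")) as f:
        cfg = json.load(f)

    # Key name is version-dependent (assumption): newer compressed-tensors
    # writes "quantization_config", some older versions "compression_config".
    print(cfg.get("quantization_config") or cfg.get("compression_config"))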
