
feat: pipeline-level quantization config #11130


Merged 39 commits into `main` from `feat/pipeline-quant-config` on May 9, 2025.

Commits (39):
316ff46  feat: pipeline-level quant config. (sayakpaul, Mar 10, 2025)
eec5b98  Revert "feat: pipeline-level quant config." (sayakpaul, Mar 20, 2025)
c94d85a  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, Mar 20, 2025)
4d3dede  feat: implement pipeline-level quantization config (sayakpaul, Mar 21, 2025)
dc79f32  update (sayakpaul, Mar 21, 2025)
df749e4  fixes (sayakpaul, Mar 21, 2025)
d0ad15e  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, Mar 21, 2025)
f8b514b  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, Mar 26, 2025)
9250941  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, Mar 27, 2025)
13d5589  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, Apr 2, 2025)
f678437  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, Apr 10, 2025)
5a85871  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, Apr 11, 2025)
557136d  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, Apr 17, 2025)
0d9814f  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, Apr 21, 2025)
f8d1bd1  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, Apr 24, 2025)
c7e0774  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, Apr 25, 2025)
6861da1  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, Apr 28, 2025)
82bcce0  fix validation. (sayakpaul, Apr 29, 2025)
78f134b  add tests and other improvements. (sayakpaul, Apr 29, 2025)
f2b39e0  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, Apr 29, 2025)
3b76e0a  add tests (sayakpaul, Apr 29, 2025)
695061b  import quality (sayakpaul, Apr 29, 2025)
9693251  remove prints. (sayakpaul, Apr 29, 2025)
73f1ad1  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, May 1, 2025)
dc90b06  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, May 2, 2025)
872c91e  add docs. (sayakpaul, May 2, 2025)
fbdf4c6  fixes to docs. (sayakpaul, May 2, 2025)
da6df86  doc fixes. (sayakpaul, May 2, 2025)
9a418a9  doc fixes. (sayakpaul, May 2, 2025)
5b6ee10  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, May 3, 2025)
478a353  add validation to the input quantization_config. (sayakpaul, May 6, 2025)
f96bcc7  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, May 6, 2025)
0ae2a9a  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, May 8, 2025)
d6b48ea  clarify recommendations. (sayakpaul, May 8, 2025)
ca2e116  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, May 8, 2025)
ffb974f  docs (sayakpaul, May 8, 2025)
86ee773  add to ci. (sayakpaul, May 8, 2025)
037a68b  Merge branch 'main' into feat/pipeline-quant-config (sayakpaul, May 9, 2025)
7b8a73d  todo. (sayakpaul, May 9, 2025)
54 changes: 54 additions & 0 deletions .github/workflows/nightly_tests.yml
@@ -525,118 +525,172 @@
pip install slack_sdk tabulate
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

run_nightly_pipeline_level_quantization_tests:
name: Torch quantization nightly tests
strategy:
fail-fast: false
max-parallel: 2
runs-on:
group: aws-g6e-xlarge-plus
container:
image: diffusers/diffusers-pytorch-cuda
options: --shm-size "20gb" --ipc host --gpus 0
steps:
- name: Checkout diffusers
uses: actions/checkout@v3
with:
fetch-depth: 2
- name: NVIDIA-SMI
run: nvidia-smi
- name: Install dependencies
run: |
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
python -m uv pip install -e [quality,test]
python -m uv pip install -U bitsandbytes optimum_quanto
python -m uv pip install pytest-reportlog
- name: Environment
run: |
python utils/print_env.py
- name: Pipeline-level quantization tests on GPU
env:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
CUBLAS_WORKSPACE_CONFIG: :16:8
BIG_GPU_MEMORY: 40
run: |
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
--make-reports=tests_pipeline_level_quant_torch_cuda \
--report-log=tests_pipeline_level_quant_torch_cuda.log \
tests/quantization/test_pipeline_level_quantization.py
- name: Failure short reports
if: ${{ failure() }}
run: |
cat reports/tests_pipeline_level_quant_torch_cuda_stats.txt
cat reports/tests_pipeline_level_quant_torch_cuda_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: torch_cuda_pipeline_level_quant_reports
path: reports
- name: Generate Report and Notify Channel
if: always()
run: |
pip install slack_sdk tabulate
python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

# M1 runner currently not well supported
# TODO: (Dhruv) add these back when we setup better testing for Apple Silicon
# run_nightly_tests_apple_m1:
# name: Nightly PyTorch MPS tests on MacOS
# runs-on: [ self-hosted, apple-m1 ]
# if: github.event_name == 'schedule'
#
# steps:
# - name: Checkout diffusers
# uses: actions/checkout@v3
# with:
# fetch-depth: 2
#
# - name: Clean checkout
# shell: arch -arch arm64 bash {0}
# run: |
# git clean -fxd
# - name: Setup miniconda
# uses: ./.github/actions/setup-miniconda
# with:
# python-version: 3.9
#
# - name: Install dependencies
# shell: arch -arch arm64 bash {0}
# run: |
# ${CONDA_RUN} python -m pip install --upgrade pip uv
# ${CONDA_RUN} python -m uv pip install -e [quality,test]
# ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
# ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
# ${CONDA_RUN} python -m uv pip install pytest-reportlog
# - name: Environment
# shell: arch -arch arm64 bash {0}
# run: |
# ${CONDA_RUN} python utils/print_env.py
# - name: Run nightly PyTorch tests on M1 (MPS)
# shell: arch -arch arm64 bash {0}
# env:
# HF_HOME: /System/Volumes/Data/mnt/cache
# HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
# run: |
# ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
# --report-log=tests_torch_mps.log \
# tests/
# - name: Failure short reports
# if: ${{ failure() }}
# run: cat reports/tests_torch_mps_failures_short.txt
#
# - name: Test suite reports artifacts
# if: ${{ always() }}
# uses: actions/upload-artifact@v4
# with:
# name: torch_mps_test_reports
# path: reports
#
# - name: Generate Report and Notify Channel
# if: always()
# run: |
# pip install slack_sdk tabulate
# python utils/log_reports.py >> $GITHUB_STEP_SUMMARY run_nightly_tests_apple_m1:
# name: Nightly PyTorch MPS tests on MacOS
# runs-on: [ self-hosted, apple-m1 ]
# if: github.event_name == 'schedule'
#
# steps:
# - name: Checkout diffusers
# uses: actions/checkout@v3
# with:
# fetch-depth: 2
#
# - name: Clean checkout
# shell: arch -arch arm64 bash {0}
# run: |
# git clean -fxd
# - name: Setup miniconda
# uses: ./.github/actions/setup-miniconda
# with:
# python-version: 3.9
#
# - name: Install dependencies
# shell: arch -arch arm64 bash {0}
# run: |
# ${CONDA_RUN} python -m pip install --upgrade pip uv
# ${CONDA_RUN} python -m uv pip install -e [quality,test]
# ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
# ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate
# ${CONDA_RUN} python -m uv pip install pytest-reportlog
# - name: Environment
# shell: arch -arch arm64 bash {0}
# run: |
# ${CONDA_RUN} python utils/print_env.py
# - name: Run nightly PyTorch tests on M1 (MPS)
# shell: arch -arch arm64 bash {0}
# env:
# HF_HOME: /System/Volumes/Data/mnt/cache
# HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
# run: |
# ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
# --report-log=tests_torch_mps.log \
# tests/
# - name: Failure short reports
# if: ${{ failure() }}
# run: cat reports/tests_torch_mps_failures_short.txt
#
# - name: Test suite reports artifacts
# if: ${{ always() }}
# uses: actions/upload-artifact@v4
# with:
# name: torch_mps_test_reports
# path: reports
#
# - name: Generate Report and Notify Channel
# if: always()
# run: |
# pip install slack_sdk tabulate
# python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

Check warning (Code scanning / CodeQL): Workflow does not contain permissions (Medium)

Actions job or workflow does not limit the permissions of the GITHUB_TOKEN. Consider setting an explicit permissions block, using the following as a minimal starting point: {contents: read}
7 changes: 4 additions & 3 deletions docs/source/en/api/quantization.md
@@ -13,16 +13,17 @@ specific language governing permissions and limitations under the License.

# Quantization

Quantization techniques reduce memory and computational costs by representing weights and activations with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn't be able to fit into memory, and speeding up inference. Diffusers supports 8-bit and 4-bit quantization with [bitsandbytes](https://huggingface.co/docs/bitsandbytes/en/index).

Quantization techniques that aren't supported in Transformers can be added with the [`DiffusersQuantizer`] class.
Quantization techniques reduce memory and computational costs by representing weights and activations with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn't be able to fit into memory, and speeding up inference.

<Tip>

Learn how to quantize models in the [Quantization](../quantization/overview) guide.

</Tip>

## PipelineQuantizationConfig

[[autodoc]] quantizers.PipelineQuantizationConfig

## BitsAndBytesConfig

87 changes: 87 additions & 0 deletions docs/source/en/quantization/overview.md
@@ -39,3 +39,90 @@ Diffusers currently supports the following quantization methods.
- [Quanto](./quanto.md)

[This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques.

## Pipeline-level quantization

Diffusers allows you to directly initialize pipelines from checkpoints that already contain quantized models ([example](https://huggingface.co/hf-internal-testing/flux.1-dev-nf4-pkg)). However, you may also want to apply
quantization on the fly when initializing a pipeline from a pre-trained, non-quantized checkpoint. You can
do this with [`~quantizers.PipelineQuantizationConfig`].

Start by defining a `PipelineQuantizationConfig`:

```py
import torch
from diffusers import DiffusionPipeline
from diffusers.quantizers.quantization_config import QuantoConfig
from diffusers.quantizers import PipelineQuantizationConfig
from transformers import BitsAndBytesConfig

pipeline_quant_config = PipelineQuantizationConfig(
quant_mapping={
"transformer": QuantoConfig(weights_dtype="int8"),
"text_encoder_2": BitsAndBytesConfig(
load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
),
}
)
```

Then pass it to [`~DiffusionPipeline.from_pretrained`] and run inference:

```py
pipe = DiffusionPipeline.from_pretrained(
"black-forest-labs/FLUX.1-dev",
quantization_config=pipeline_quant_config,
torch_dtype=torch.bfloat16,
).to("cuda")

image = pipe("photo of a cute dog").images[0]
```

This method allows more granular control over the quantization of individual
model-level components of a pipeline, and it allows different quantization backends for
different components. In the above example, you used a combination of Quanto and bitsandbytes. However,
one caveat of this method is that you need to know which components come from `transformers`
in order to import the right quantization config class.

The other method is simpler but less flexible. Start by defining a `PipelineQuantizationConfig`, this time in a different way:

```py
pipeline_quant_config = PipelineQuantizationConfig(
quant_backend="bitsandbytes_4bit",
quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
components_to_quantize=["transformer", "text_encoder_2"],
)
```

This `pipeline_quant_config` can now be passed to [`~DiffusionPipeline.from_pretrained`] similar to the above example.
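
For example, reusing the same checkpoint and dtype as before:

```py
pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    quantization_config=pipeline_quant_config,
    torch_dtype=torch.bfloat16,
).to("cuda")

image = pipe("photo of a cute dog").images[0]
```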

In this case, `quant_kwargs` is used to initialize the quantization configuration class
associated with `quant_backend`, and `components_to_quantize` lists the components that will be
quantized. For most pipelines, you will want to keep `transformer` in the list, as it is often
the most compute- and memory-intensive component.

The config below will work for most diffusion pipelines that have a `transformer` component.
In most cases, you will want to quantize the `transformer` component because it is often the most
compute-intensive part of a diffusion pipeline.

```py
pipeline_quant_config = PipelineQuantizationConfig(
quant_backend="bitsandbytes_4bit",
quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
components_to_quantize=["transformer"],
)
```

Below is a list of the supported quantization backends available in both `diffusers` and `transformers`:

* `bitsandbytes_4bit`
* `bitsandbytes_8bit`
* `gguf`
* `quanto`
* `torchao`
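
For example, switching the simpler API to the Quanto backend only requires changing `quant_backend` and `quant_kwargs`. This is a sketch that assumes the `quanto` backend accepts the same `weights_dtype` argument as the `QuantoConfig` used earlier:

```py
pipeline_quant_config = PipelineQuantizationConfig(
    quant_backend="quanto",
    # Assumed to mirror QuantoConfig(weights_dtype="int8") from the first example.
    quant_kwargs={"weights_dtype": "int8"},
    components_to_quantize=["transformer", "text_encoder_2"],
)
```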


Diffusion pipelines can have multiple text encoders; [`FluxPipeline`] has two, for example. It's
recommended to quantize the memory-intensive text encoders, such as T5, Llama, or Gemma. In the
above example, you quantized the T5 model of [`FluxPipeline`] through `text_encoder_2` while
keeping the CLIP model intact (accessible through `text_encoder`).
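
Conversely, if you only want to quantize a memory-heavy text encoder and keep the rest of the pipeline in `torch_dtype`, list just that component. A minimal sketch for [`FluxPipeline`]:

```py
pipeline_quant_config = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
    # Only the T5 encoder (`text_encoder_2`) is quantized; the `transformer` and the
    # CLIP `text_encoder` are loaded normally.
    components_to_quantize=["text_encoder_2"],
)
```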
13 changes: 13 additions & 0 deletions src/diffusers/pipelines/pipeline_loading_utils.py
@@ -675,8 +675,10 @@ def load_sub_model(
use_safetensors: bool,
dduf_entries: Optional[Dict[str, DDUFEntry]],
provider_options: Any,
quantization_config: Optional[Any] = None,
):
"""Helper method to load the module `name` from `library_name` and `class_name`"""
from ..quantizers import PipelineQuantizationConfig

# retrieve class candidates

@@ -769,6 +771,17 @@ def load_sub_model(
else:
loading_kwargs["low_cpu_mem_usage"] = False

if (
quantization_config is not None
and isinstance(quantization_config, PipelineQuantizationConfig)
and issubclass(class_obj, torch.nn.Module)
):
model_quant_config = quantization_config._resolve_quant_config(
is_diffusers=is_diffusers_model, module_name=name
)
if model_quant_config is not None:
loading_kwargs["quantization_config"] = model_quant_config

# check if the module is in a subdirectory
if dduf_entries:
loading_kwargs["dduf_entries"] = dduf_entries
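
To make the new branch in `load_sub_model` easier to follow, here is a rough, hypothetical sketch of how per-component resolution could behave. The real `PipelineQuantizationConfig._resolve_quant_config` in `diffusers` may differ in names and details; bitsandbytes is used purely for illustration:

```py
from typing import Optional


def resolve_quant_config(pipeline_config, is_diffusers: bool, module_name: str) -> Optional[object]:
    """Hypothetical sketch of per-component quantization config resolution."""
    # An explicit `quant_mapping` takes precedence: return whatever the user
    # supplied for this component (or None, leaving it unquantized).
    quant_mapping = getattr(pipeline_config, "quant_mapping", None)
    if quant_mapping:
        return quant_mapping.get(module_name)

    # Shared-backend mode: only components listed in `components_to_quantize`
    # get a config, built from `quant_backend` and `quant_kwargs`.
    if module_name not in (getattr(pipeline_config, "components_to_quantize", None) or []):
        return None

    # Whether the sub-model is a diffusers or a transformers model decides which
    # library's config class is instantiated (bitsandbytes shown as an example).
    if is_diffusers:
        from diffusers.quantizers.quantization_config import BitsAndBytesConfig
    else:
        from transformers import BitsAndBytesConfig
    return BitsAndBytesConfig(**pipeline_config.quant_kwargs)
```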
6 changes: 6 additions & 0 deletions src/diffusers/pipelines/pipeline_utils.py
@@ -47,6 +47,7 @@
from ..models import AutoencoderKL
from ..models.attention_processor import FusedAttnProcessor2_0
from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, ModelMixin
from ..quantizers import PipelineQuantizationConfig
from ..quantizers.bitsandbytes.utils import _check_bnb_status
from ..schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
from ..utils import (
@@ -725,6 +726,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
use_safetensors = kwargs.pop("use_safetensors", None)
use_onnx = kwargs.pop("use_onnx", None)
load_connected_pipeline = kwargs.pop("load_connected_pipeline", False)
quantization_config = kwargs.pop("quantization_config", None)

if torch_dtype is not None and not isinstance(torch_dtype, dict) and not isinstance(torch_dtype, torch.dtype):
torch_dtype = torch.float32
@@ -741,6 +743,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
" install accelerate\n```\n."
)

if quantization_config is not None and not isinstance(quantization_config, PipelineQuantizationConfig):
raise ValueError("`quantization_config` must be an instance of `PipelineQuantizationConfig`.")

if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
raise NotImplementedError(
"Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
@@ -1001,6 +1006,7 @@ def load_module(name, value):
use_safetensors=use_safetensors,
dduf_entries=dduf_entries,
provider_options=provider_options,
quantization_config=quantization_config,
)
logger.info(
f"Loaded {name} as {class_name} from `{name}` subfolder of {pretrained_model_name_or_path}."