
Commit 45b42d1

Disty0 and sayakpaul authored
Add device arg to offloading with combined pipelines (huggingface#7471)
Co-authored-by: Sayak Paul <[email protected]>
1 parent 5199ee4 commit 45b42d1
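
In short: `enable_model_cpu_offload` and `enable_sequential_cpu_offload` on the combined pipelines now accept a `device` argument (default `"cuda"`) alongside `gpu_id` (whose default becomes `None`), and forward both to the underlying prior and decoder pipelines. A minimal usage sketch, assuming the usual Auto pipeline entry point; the checkpoint and dtype are illustrative, not part of this commit:

import torch
from diffusers import AutoPipelineForText2Image

# Checkpoint and dtype are illustrative placeholders.
pipe = AutoPipelineForText2Image.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
)

# New in this commit: the offload target device can be passed explicitly and is
# forwarded to both pipe.prior_pipe and pipe.decoder_pipe.
pipe.enable_model_cpu_offload(device="cuda")

image = pipe("a photo of an astronaut riding a horse").images[0]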

File tree: 3 files changed, +24 −24 lines changed

src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py

Lines changed: 12 additions & 12 deletions
@@ -178,16 +178,16 @@ def __init__(
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
 
-    def enable_sequential_cpu_offload(self, gpu_id=0):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
         `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
         Note that offloading happens on a submodule basis. Memory savings are higher than with
         `enable_model_cpu_offload`, but performance is lower.
         """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
 
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
@@ -405,26 +405,26 @@ def __init__(
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
 
-    def enable_model_cpu_offload(self, gpu_id=0):
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
         to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
         method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
         `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
         """
-        self.prior_pipe.enable_model_cpu_offload()
-        self.decoder_pipe.enable_model_cpu_offload()
+        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
 
-    def enable_sequential_cpu_offload(self, gpu_id=0):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
         text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
         `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
         Note that offloading happens on a submodule basis. Memory savings are higher than with
         `enable_model_cpu_offload`, but performance is lower.
         """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
 
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
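
Because the device type is no longer hard-coded in the combined wrapper, sequential offload can target a non-CUDA backend. A hedged sketch, assuming a PyTorch build that exposes the backend in question (Intel XPU here) and that the checkpoint below is the desired one:

import torch
from diffusers import KandinskyV22CombinedPipeline

# Checkpoint name and the "xpu" backend are assumptions for illustration.
pipe = KandinskyV22CombinedPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
)

# gpu_id now defaults to None; the device string is forwarded to prior_pipe and decoder_pipe.
pipe.enable_sequential_cpu_offload(device="xpu")

image = pipe("a watercolor fox in a snowy forest").images[0]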

src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py

Lines changed: 6 additions & 6 deletions
@@ -117,25 +117,25 @@ def __init__(
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
 
-    def enable_model_cpu_offload(self, gpu_id=0):
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
         to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
         method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
         `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
         """
-        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
 
-    def enable_sequential_cpu_offload(self, gpu_id=0):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
         Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
         GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
         Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
         """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
 
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
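
The Stable Cascade wrapper behaves the same way; `device` also accepts a `torch.device`, so the card index can be encoded there instead of via `gpu_id`. A sketch, assuming the checkpoint name below:

import torch
from diffusers import StableCascadeCombinedPipeline

# Checkpoint name is an assumption for illustration.
pipe = StableCascadeCombinedPipeline.from_pretrained(
    "stabilityai/stable-cascade", torch_dtype=torch.bfloat16
)

# A torch.device works as well as a string; both prior_pipe and decoder_pipe receive it.
pipe.enable_model_cpu_offload(device=torch.device("cuda", 1))

image = pipe(prompt="an origami crane on a desk").images[0]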

src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py

Lines changed: 6 additions & 6 deletions
@@ -112,25 +112,25 @@ def __init__(
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)
 
-    def enable_model_cpu_offload(self, gpu_id=0):
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
         to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
         method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
         `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
         """
-        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
 
-    def enable_sequential_cpu_offload(self, gpu_id=0):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
         r"""
         Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
         Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
         GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
         Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
         """
-        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
-        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id)
+        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
+        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
 
     def progress_bar(self, iterable=None, total=None):
         self.prior_pipe.progress_bar(iterable=iterable, total=total)
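
As in the other two files, the Wuerstchen wrapper keeps `device="cuda"` and `gpu_id=None` as defaults, so existing argument-free calls behave as before and the new argument is opt-in. A short sketch with an illustrative checkpoint:

import torch
from diffusers import WuerstchenCombinedPipeline

# Checkpoint name is illustrative.
pipe = WuerstchenCombinedPipeline.from_pretrained("warp-ai/wuerstchen", torch_dtype=torch.float16)

pipe.enable_sequential_cpu_offload()  # unchanged behaviour: defaults to the "cuda" device
# or opt in to an explicit target, e.g.:
# pipe.enable_sequential_cpu_offload(device="cuda:1")

image = pipe("an anthropomorphic cat dressed as a firefighter").images[0]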
