Skip to content

Commit dd5a362

Browse files
New Pipeline Slow Test runners (huggingface#5131)
* pipeline fetcher * update script * clean up * clean up * clean up * new pipeline runner * rename tests to match modules * test actions in pr * change runner to gpu * clean up * clean up * clean up * fix report * fix reporting * clean up * show test stats in failure reports * give names to jobs * add lora tests * split torch cuda tests and add compile tests * clean up * fix tests * change push to run only on main --------- Co-authored-by: Patrick von Platen <[email protected]>
1 parent 7271f8b commit dd5a362

20 files changed

+294
-58
lines changed

.github/workflows/push_tests.yml

Lines changed: 184 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,127 @@
1-
name: Slow tests on main
1+
name: Slow Tests on main
22

33
on:
44
push:
55
branches:
66
- main
77

8+
89
env:
910
DIFFUSERS_IS_CI: yes
1011
HF_HOME: /mnt/cache
1112
OMP_NUM_THREADS: 8
1213
MKL_NUM_THREADS: 8
1314
PYTEST_TIMEOUT: 600
1415
RUN_SLOW: yes
16+
PIPELINE_USAGE_CUTOFF: 50000
1517

1618
jobs:
17-
run_slow_tests:
19+
setup_torch_cuda_pipeline_matrix:
20+
name: Setup Torch Pipelines CUDA Slow Tests Matrix
21+
runs-on: docker-gpu
22+
container:
23+
image: diffusers/diffusers-pytorch-cpu # this is a CPU image, but we need it to fetch the matrix
24+
options: --shm-size "16gb" --ipc host
25+
outputs:
26+
pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
27+
steps:
28+
- name: Checkout diffusers
29+
uses: actions/checkout@v3
30+
with:
31+
fetch-depth: 2
32+
- name: Install dependencies
33+
run: |
34+
apt-get update && apt-get install libsndfile1-dev libgl1 -y
35+
python -m pip install -e .[quality,test]
36+
python -m pip install git+https://github.com/huggingface/accelerate.git
37+
38+
- name: Environment
39+
run: |
40+
python utils/print_env.py
41+
42+
- name: Fetch Pipeline Matrix
43+
id: fetch_pipeline_matrix
44+
run: |
45+
matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
46+
echo $matrix
47+
echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
48+
49+
- name: Pipeline Tests Artifacts
50+
if: ${{ always() }}
51+
uses: actions/upload-artifact@v2
52+
with:
53+
name: test-pipelines.json
54+
path: reports
55+
56+
torch_pipelines_cuda_tests:
57+
name: Torch Pipelines CUDA Slow Tests
58+
needs: setup_torch_cuda_pipeline_matrix
1859
strategy:
1960
fail-fast: false
2061
max-parallel: 1
2162
matrix:
22-
config:
23-
- name: Slow PyTorch CUDA tests on Ubuntu
24-
framework: pytorch
25-
runner: docker-gpu
26-
image: diffusers/diffusers-pytorch-cuda
27-
report: torch_cuda
28-
- name: Slow Flax TPU tests on Ubuntu
29-
framework: flax
30-
runner: docker-tpu
31-
image: diffusers/diffusers-flax-tpu
32-
report: flax_tpu
33-
- name: Slow ONNXRuntime CUDA tests on Ubuntu
34-
framework: onnxruntime
35-
runner: docker-gpu
36-
image: diffusers/diffusers-onnxruntime-cuda
37-
report: onnx_cuda
38-
39-
name: ${{ matrix.config.name }}
40-
41-
runs-on: ${{ matrix.config.runner }}
42-
63+
module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
64+
runs-on: docker-gpu
4365
container:
44-
image: ${{ matrix.config.image }}
45-
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ ${{ matrix.config.runner == 'docker-tpu' && '--privileged' || '--gpus 0'}}
46-
66+
image: diffusers/diffusers-pytorch-cuda
67+
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
68+
steps:
69+
- name: Checkout diffusers
70+
uses: actions/checkout@v3
71+
with:
72+
fetch-depth: 2
73+
- name: NVIDIA-SMI
74+
run: |
75+
nvidia-smi
76+
- name: Install dependencies
77+
run: |
78+
apt-get update && apt-get install libsndfile1-dev libgl1 -y
79+
python -m pip install -e .[quality,test]
80+
python -m pip install git+https://github.com/huggingface/accelerate.git
81+
- name: Environment
82+
run: |
83+
python utils/print_env.py
84+
- name: Slow PyTorch CUDA checkpoint tests on Ubuntu
85+
env:
86+
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
87+
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
88+
CUBLAS_WORKSPACE_CONFIG: :16:8
89+
run: |
90+
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
91+
-s -v -k "not Flax and not Onnx" \
92+
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
93+
tests/pipelines/${{ matrix.module }}
94+
- name: Failure short reports
95+
if: ${{ failure() }}
96+
run: |
97+
cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
98+
cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
99+
100+
- name: Test suite reports artifacts
101+
if: ${{ always() }}
102+
uses: actions/upload-artifact@v2
103+
with:
104+
name: pipeline_${{ matrix.module }}_test_reports
105+
path: reports
106+
107+
torch_cuda_tests:
108+
name: Torch CUDA Tests
109+
runs-on: docker-gpu
110+
container:
111+
image: diffusers/diffusers-pytorch-cuda
112+
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
47113
defaults:
48114
run:
49115
shell: bash
50-
116+
strategy:
117+
matrix:
118+
module: [models, schedulers, lora, others]
51119
steps:
52120
- name: Checkout diffusers
53121
uses: actions/checkout@v3
54122
with:
55123
fetch-depth: 2
56124

57-
- name: NVIDIA-SMI
58-
if : ${{ matrix.config.runner == 'docker-gpu' }}
59-
run: |
60-
nvidia-smi
61-
62125
- name: Install dependencies
63126
run: |
64127
apt-get update && apt-get install libsndfile1-dev libgl1 -y
@@ -70,47 +133,121 @@ jobs:
70133
python utils/print_env.py
71134
72135
- name: Run slow PyTorch CUDA tests
73-
if: ${{ matrix.config.framework == 'pytorch' }}
74136
env:
75137
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
76138
# https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
77139
CUBLAS_WORKSPACE_CONFIG: :16:8
78-
79140
run: |
80141
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
81-
-s -v -k "not Flax and not Onnx and not compile" \
82-
--make-reports=tests_${{ matrix.config.report }} \
83-
tests/
142+
-s -v -k "not Flax and not Onnx" \
143+
--make-reports=tests_torch_cuda \
144+
tests/${{ matrix.module }}
145+
146+
- name: Failure short reports
147+
if: ${{ failure() }}
148+
run: |
149+
cat reports/tests_torch_cuda_stats.txt
150+
cat reports/tests_torch_cuda_failures_short.txt
151+
152+
- name: Test suite reports artifacts
153+
if: ${{ always() }}
154+
uses: actions/upload-artifact@v2
155+
with:
156+
name: torch_cuda_test_reports
157+
path: reports
158+
159+
flax_tpu_tests:
160+
name: Flax TPU Tests
161+
runs-on: docker-tpu
162+
container:
163+
image: diffusers/diffusers-flax-tpu
164+
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
165+
defaults:
166+
run:
167+
shell: bash
168+
steps:
169+
- name: Checkout diffusers
170+
uses: actions/checkout@v3
171+
with:
172+
fetch-depth: 2
173+
174+
- name: Install dependencies
175+
run: |
176+
apt-get update && apt-get install libsndfile1-dev libgl1 -y
177+
python -m pip install -e .[quality,test]
178+
python -m pip install git+https://github.com/huggingface/accelerate.git
179+
180+
- name: Environment
181+
run: |
182+
python utils/print_env.py
84183
85184
- name: Run slow Flax TPU tests
86-
if: ${{ matrix.config.framework == 'flax' }}
87185
env:
88186
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
89187
run: |
90188
python -m pytest -n 0 \
91189
-s -v -k "Flax" \
92-
--make-reports=tests_${{ matrix.config.report }} \
190+
--make-reports=tests_flax_tpu \
93191
tests/
94192
193+
- name: Failure short reports
194+
if: ${{ failure() }}
195+
run: |
196+
cat reports/tests_flax_tpu_stats.txt
197+
cat reports/tests_flax_tpu_failures_short.txt
198+
199+
- name: Test suite reports artifacts
200+
if: ${{ always() }}
201+
uses: actions/upload-artifact@v2
202+
with:
203+
name: flax_tpu_test_reports
204+
path: reports
205+
206+
onnx_cuda_tests:
207+
name: ONNX CUDA Tests
208+
runs-on: docker-gpu
209+
container:
210+
image: diffusers/diffusers-onnxruntime-cuda
211+
options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
212+
defaults:
213+
run:
214+
shell: bash
215+
steps:
216+
- name: Checkout diffusers
217+
uses: actions/checkout@v3
218+
with:
219+
fetch-depth: 2
220+
221+
- name: Install dependencies
222+
run: |
223+
apt-get update && apt-get install libsndfile1-dev libgl1 -y
224+
python -m pip install -e .[quality,test]
225+
python -m pip install git+https://github.com/huggingface/accelerate.git
226+
227+
- name: Environment
228+
run: |
229+
python utils/print_env.py
230+
95231
- name: Run slow ONNXRuntime CUDA tests
96-
if: ${{ matrix.config.framework == 'onnxruntime' }}
97232
env:
98233
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
99234
run: |
100235
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
101236
-s -v -k "Onnx" \
102-
--make-reports=tests_${{ matrix.config.report }} \
237+
--make-reports=tests_onnx_cuda \
103238
tests/
104239
105240
- name: Failure short reports
106241
if: ${{ failure() }}
107-
run: cat reports/tests_${{ matrix.config.report }}_failures_short.txt
242+
run: |
243+
cat reports/tests_onnx_cuda_stats.txt
244+
cat reports/tests_onnx_cuda_failures_short.txt
108245
109246
- name: Test suite reports artifacts
110247
if: ${{ always() }}
111248
uses: actions/upload-artifact@v2
112249
with:
113-
name: ${{ matrix.config.report }}_test_reports
250+
name: onnx_cuda_test_reports
114251
path: reports
115252

116253
run_torch_compile_tests:
@@ -131,21 +268,17 @@ jobs:
131268
- name: NVIDIA-SMI
132269
run: |
133270
nvidia-smi
134-
135271
- name: Install dependencies
136272
run: |
137273
python -m pip install -e .[quality,test,training]
138-
139274
- name: Environment
140275
run: |
141276
python utils/print_env.py
142-
143277
- name: Run example tests on GPU
144278
env:
145279
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
146280
run: |
147281
python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
148-
149282
- name: Failure short reports
150283
if: ${{ failure() }}
151284
run: cat reports/tests_torch_compile_cuda_failures_short.txt
@@ -192,11 +325,13 @@ jobs:
192325
193326
- name: Failure short reports
194327
if: ${{ failure() }}
195-
run: cat reports/examples_torch_cuda_failures_short.txt
328+
run: |
329+
cat reports/examples_torch_cuda_stats.txt
330+
cat reports/examples_torch_cuda_failures_short.txt
196331
197332
- name: Test suite reports artifacts
198333
if: ${{ always() }}
199334
uses: actions/upload-artifact@v2
200335
with:
201336
name: examples_test_reports
202-
path: reports
337+
path: reports

src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ def prepare_control_image(
213213
do_center_crop=False,
214214
do_normalize=False,
215215
return_tensors="pt",
216-
)["pixel_values"].to(self.device)
216+
)["pixel_values"].to(device)
217217
image_batch_size = image.shape[0]
218218

219219
if image_batch_size == 1:
@@ -365,7 +365,7 @@ def __call__(
365365
height=height,
366366
batch_size=batch_size,
367367
num_images_per_prompt=1,
368-
device=self.device,
368+
device=device,
369369
dtype=self.controlnet.dtype,
370370
do_classifier_free_guidance=do_classifier_free_guidance,
371371
)

src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -765,8 +765,9 @@ def __call__(
765765

766766
if needs_upcasting:
767767
self.upcast_vae()
768-
latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
769768

769+
# Ensure latents are always the same type as the VAE
770+
latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
770771
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
771772

772773
# cast back to fp16 if needed

tests/lora/test_lora_layers_old_backend.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1554,7 +1554,7 @@ def test_lora_on_off(self, expected_max_diff=1e-3):
15541554
torch_device != "cuda" or not is_xformers_available(),
15551555
reason="XFormers attention is only available with CUDA and `xformers` installed",
15561556
)
1557-
def test_lora_xformers_on_off(self, expected_max_diff=1e-4):
1557+
def test_lora_xformers_on_off(self, expected_max_diff=6e-4):
15581558
# enable deterministic behavior for gradient checkpointing
15591559
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
15601560

tests/pipelines/controlnet/test_controlnet_inpaint.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
enable_full_determinism,
4040
floats_tensor,
4141
load_numpy,
42+
numpy_cosine_similarity_distance,
4243
require_torch_gpu,
4344
slow,
4445
torch_device,
@@ -550,7 +551,7 @@ def make_inpaint_condition(image, image_mask):
550551
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/boy_ray_ban.npy"
551552
)
552553

553-
assert np.abs(expected_image - image).max() < 0.9e-1
554+
assert numpy_cosine_similarity_distance(expected_image.flatten(), image.flatten()) < 1e-2
554555

555556
def test_load_local(self):
556557
controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")

tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py renamed to tests/pipelines/kandinsky2_2/test_kandinsky_controlnet.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,9 @@ def test_kandinsky_controlnet(self):
221221
def test_float16_inference(self):
222222
super().test_float16_inference(expected_max_diff=1e-1)
223223

224+
def test_inference_batch_single_identical(self):
225+
super().test_inference_batch_single_identical(expected_max_diff=5e-4)
226+
224227

225228
@nightly
226229
@require_torch_gpu

0 commit comments

Comments
 (0)