LTX 0.9.5 #10968

Merged

merged 13 commits into main from integrations/ltx-0.9.5 on Mar 18, 2025

Conversation

@a-r-r-o-w (Member) commented Mar 5, 2025

See https://huggingface.slack.com/archives/C08275HSG8J/p1741091747532049?thread_ts=1738246363.413529&cid=C08275HSG8J

testing scripts

test1: image condition
# test ltx image conditioning
import torch
from diffusers import LTXConditionPipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
from diffusers.utils import export_to_video, load_image


device = "cuda:2"
dtype = torch.bfloat16
repo = "YiYiXu/ltx-95"

# Initialize the pipeline
pipe = LTXConditionPipeline.from_pretrained(repo, torch_dtype=dtype)
pipe.to(device)


prompt = "The video depicts a long, straight highway stretching into the distance, flanked by metal guardrails. The road is divided into multiple lanes, with a few vehicles visible in the far distance. The surrounding landscape features dry, grassy fields on one side and rolling hills on the other. The sky is mostly clear with a few scattered clouds, suggesting a bright, sunny day."
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input.jpg"
)

# use `conditions` input
condition = LTXVideoCondition(
    image=image,
)

generator = torch.Generator(device=device).manual_seed(0)
video = pipe(
    conditions=[condition],
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=768,
    height=512,
    num_frames=161,
    num_inference_steps=40,
    generator=generator,
).frames[0]

# Export the video
export_to_video(video, "yiyi_test_1_output_1.mp4", fps=24)


#  pass `image` input directly
generator = torch.Generator(device=device).manual_seed(0)
video = pipe(
    image=image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=768,
    height=512,
    num_frames=161,
    num_inference_steps=40,
    generator=generator,
).frames[0]

# Export the video
export_to_video(video, "yiyi_test_1_output_2.mp4", fps=24)
test2: video condition
# ltx video conditioning
import torch
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXConditionPipeline, LTXVideoCondition
from diffusers.utils import export_to_video, load_video


device = "cuda:2"
dtype = torch.bfloat16
repo = "YiYiXu/ltx-95"

# Initialize the pipeline
pipe = LTXConditionPipeline.from_pretrained(repo, torch_dtype=dtype)
pipe.to(device)


video = load_video(
    "/raid/yiyi/LTX-Video/outputs/2025-03-11/video_output_0_a-woman-with-long-brown-hair-and_42_512x768x40_0.mp4"
)

condition = LTXVideoCondition(
    video=video,
    frame_index=0
)

# Define prompts
prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
# Generate the video (use a separate name so the conditioning video is not overwritten)
output = pipe(
    conditions=[condition],
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=768,
    height=512,
    num_frames=161,
    num_inference_steps=40,
).frames[0]

# Export the video
export_to_video(video, "yiyi_test_2_output_1.mp4", fps=24)



# sub-test 2: condition on the same video from frame index 8
condition = LTXVideoCondition(
    video=video,
    frame_index=8
)

# Generate the video
output = pipe(
    conditions=[condition],
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=768,
    height=512,
    num_frames=161,
    num_inference_steps=40,
).frames[0]

# Export the video
export_to_video(video, "yiyi_test_2_output_2.mp4", fps=24)


# sub-test 3: pass `video` and `frame_index` inputs directly

# Generate the video
output = pipe(
    video=video,
    frame_index=8,
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=768,
    height=512,
    num_frames=161,
    num_inference_steps=40,
).frames[0]

# Export the video
export_to_video(video, "yiyi_test_2_output_3.mp4", fps=24)
test3: video + image
# image + video
import torch
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXConditionPipeline, LTXVideoCondition
from diffusers.utils import export_to_video, load_video, load_image


device = "cuda:2"
dtype = torch.bfloat16
repo = "YiYiXu/ltx-95"

# Initialize the pipeline
pipe = LTXConditionPipeline.from_pretrained(repo, torch_dtype=dtype)
pipe.to(device)


video = load_video(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
)

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input.jpg"
)

condition1 = LTXVideoCondition(
    image=image,
    frame_index=0,
)

condition2 = LTXVideoCondition(
    video=video,
    frame_index=80,
)

# Define prompts
prompt = "The video depicts a long, straight highway stretching into the distance, flanked by metal guardrails. The road is divided into multiple lanes, with a few vehicles visible in the far distance. The surrounding landscape features dry, grassy fields on one side and rolling hills on the other. The sky is mostly clear with a few scattered clouds, suggesting a bright, sunny day. And then the camera switch to a inding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
# Generate the video
generator = torch.Generator(device=device).manual_seed(0)
video = pipe(
    conditions=[condition1, condition2],
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=768,
    height=512,
    num_frames=161,
    num_inference_steps=40,
    generator=generator,
).frames[0]

# Export the video
export_to_video(video, "yiyi_test_4_output.mp4", fps=24)

@a-r-r-o-w requested a review from yiyixuxu on March 5, 2025 00:37
@HuggingFaceDocBuilderDev

The docs for this PR live here. All of your documentation changes will be reflected on that endpoint. The docs are available until 30 days after the last update.

@a-r-r-o-w changed the title from "Fix documentation" to "LTX 0.9.5" on Mar 5, 2025
@a-r-r-o-w (Member Author) commented Mar 5, 2025

Code for matching the converted VAE against the original Lightricks CausalVideoAutoencoder:

import sys
sys.path.append("/raid/aryan/ltx-code")

import json
from typing import Any, Dict

import torch
from safetensors.torch import load_file
from safetensors import safe_open

from ltx_video.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder


def remove_keys_(key: str, state_dict: Dict[str, Any]):
    state_dict.pop(key)
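
# The original checkpoint flattens nested sub-modules (resnets, up/downsamplers,
# intermediate convs) into sequential block indices; the rename tables below map
# each flat index back to the corresponding nested diffusers module path.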

VAE_KEYS_RENAME_DICT = {
    # decoder
    "up_blocks.0": "mid_block",
    "up_blocks.1": "up_blocks.0",
    "up_blocks.2": "up_blocks.1.upsamplers.0",
    "up_blocks.3": "up_blocks.1",
    "up_blocks.4": "up_blocks.2.conv_in",
    "up_blocks.5": "up_blocks.2.upsamplers.0",
    "up_blocks.6": "up_blocks.2",
    "up_blocks.7": "up_blocks.3.conv_in",
    "up_blocks.8": "up_blocks.3.upsamplers.0",
    "up_blocks.9": "up_blocks.3",
    # encoder
    "down_blocks.0": "down_blocks.0",
    "down_blocks.1": "down_blocks.0.downsamplers.0",
    "down_blocks.2": "down_blocks.0.conv_out",
    "down_blocks.3": "down_blocks.1",
    "down_blocks.4": "down_blocks.1.downsamplers.0",
    "down_blocks.5": "down_blocks.1.conv_out",
    "down_blocks.6": "down_blocks.2",
    "down_blocks.7": "down_blocks.2.downsamplers.0",
    "down_blocks.8": "down_blocks.3",
    "down_blocks.9": "mid_block",
    # common
    "conv_shortcut": "conv_shortcut.conv",
    "res_blocks": "resnets",
    "norm3.norm": "norm3",
    "per_channel_statistics.mean-of-means": "latents_mean",
    "per_channel_statistics.std-of-means": "latents_std",
}

VAE_091_RENAME_DICT = {
    # decoder
    "up_blocks.0": "mid_block",
    "up_blocks.1": "up_blocks.0.upsamplers.0",
    "up_blocks.2": "up_blocks.0",
    "up_blocks.3": "up_blocks.1.upsamplers.0",
    "up_blocks.4": "up_blocks.1",
    "up_blocks.5": "up_blocks.2.upsamplers.0",
    "up_blocks.6": "up_blocks.2",
    "up_blocks.7": "up_blocks.3.upsamplers.0",
    "up_blocks.8": "up_blocks.3",
    # common
    "last_time_embedder": "time_embedder",
    "last_scale_shift_table": "scale_shift_table",
}

VAE_095_RENAME_DICT = {
    # decoder
    "up_blocks.0": "mid_block",
    "up_blocks.1": "up_blocks.0.upsamplers.0",
    "up_blocks.2": "up_blocks.0",
    "up_blocks.3": "up_blocks.1.upsamplers.0",
    "up_blocks.4": "up_blocks.1",
    "up_blocks.5": "up_blocks.2.upsamplers.0",
    "up_blocks.6": "up_blocks.2",
    "up_blocks.7": "up_blocks.3.upsamplers.0",
    "up_blocks.8": "up_blocks.3",
    # encoder
    "down_blocks.0": "down_blocks.0",
    "down_blocks.1": "down_blocks.0.downsamplers.0",
    "down_blocks.2": "down_blocks.1",
    "down_blocks.3": "down_blocks.1.downsamplers.0",
    "down_blocks.4": "down_blocks.2",
    "down_blocks.5": "down_blocks.2.downsamplers.0",
    "down_blocks.6": "down_blocks.3",
    "down_blocks.7": "down_blocks.3.downsamplers.0",
    "down_blocks.8": "mid_block",
    # common
    "last_time_embedder": "time_embedder",
    "last_scale_shift_table": "scale_shift_table",
}

VAE_SPECIAL_KEYS_REMAP = {
    "per_channel_statistics.channel": remove_keys_,
    "per_channel_statistics.mean-of-means": remove_keys_,
    "per_channel_statistics.mean-of-stds": remove_keys_,
    "model.diffusion_model": remove_keys_,
}

VAE_091_SPECIAL_KEYS_REMAP = {
    "timestep_scale_multiplier": remove_keys_,
}

VAE_095_SPECIAL_KEYS_REMAP = {}


def update_state_dict_inplace(state_dict: Dict[str, Any], old_key: str, new_key: str) -> None:
    state_dict[new_key] = state_dict.pop(old_key)


def convert_vae(original_state_dict):
    PREFIX_KEY = "vae."

    for key in list(original_state_dict.keys()):
        new_key = key[:]
        if new_key.startswith(PREFIX_KEY):
            new_key = key[len(PREFIX_KEY) :]
        for replace_key, rename_key in VAE_KEYS_RENAME_DICT.items():
            new_key = new_key.replace(replace_key, rename_key)
        update_state_dict_inplace(original_state_dict, key, new_key)

    for key in list(original_state_dict.keys()):
        for special_key, handler_fn_inplace in VAE_SPECIAL_KEYS_REMAP.items():
            if special_key not in key:
                continue
            handler_fn_inplace(key, original_state_dict)

    return original_state_dict


@torch.no_grad()
def match_vae():
    from diffusers import AutoencoderKLLTXVideo

    original_model_path = "/raid/aryan/ltx-new/ltx-video-2b-v0.9.5rc1.safetensors"
    theirs_config = json.loads(safe_open(original_model_path, "pt").metadata()["config"])
    theirs_model = CausalVideoAutoencoder.from_config(theirs_config["vae"])
    theirs_state_dict = load_file(original_model_path)
    theirs_model.load_state_dict(theirs_state_dict)

    ours_config = {
        "in_channels": 3,
        "out_channels": 3,
        "latent_channels": 128,
        "block_out_channels": (128, 256, 512, 1024, 2048),
        "down_block_types": (
            "LTXVideo095DownBlock3D",
            "LTXVideo095DownBlock3D",
            "LTXVideo095DownBlock3D",
            "LTXVideo095DownBlock3D",
        ),
        "decoder_block_out_channels": (256, 512, 1024),
        "layers_per_block": (4, 6, 6, 2, 2),
        "decoder_layers_per_block": (5, 5, 5, 5),
        "spatio_temporal_scaling": (True, True, True, True),
        "decoder_spatio_temporal_scaling": (True, True, True),
        "decoder_inject_noise": (False, False, False, False),
        "downsample_type": ("spatial", "temporal", "spatiotemporal", "spatiotemporal"),
        "upsample_residual": (True, True, True),
        "upsample_factor": (2, 2, 2),
        "timestep_conditioning": True,
        "patch_size": 4,
        "patch_size_t": 1,
        "resnet_norm_eps": 1e-6,
        "scaling_factor": 1.0,
        "encoder_causal": True,
        "decoder_causal": False,
    }
    ours_model = AutoencoderKLLTXVideo.from_config(ours_config)

    VAE_KEYS_RENAME_DICT.update(VAE_095_RENAME_DICT)
    VAE_SPECIAL_KEYS_REMAP.update(VAE_095_SPECIAL_KEYS_REMAP)
    ours_state_dict = convert_vae(theirs_state_dict)
    ours_model.load_state_dict(ours_state_dict)

    state_dict_params = sum(p.numel() for p in ours_state_dict.values())
    print(f"State dict params: {state_dict_params}")

    device = torch.device("cuda")
    dtype = torch.float32

    theirs_model.to(device=device, dtype=dtype)
    ours_model.to(device=device, dtype=dtype)

    theirs_model.disable_z_tiling()
    theirs_model.disable_hw_tiling()

    print(sum(p.numel() for p in theirs_model.parameters()))
    print(sum(p.numel() for p in ours_model.parameters()))

    batch_size = 1
    num_channels = 3
    num_frames = 49
    height = 128
    width = 128

    torch.manual_seed(0)
    input = torch.randn(batch_size, num_channels, num_frames, height, width, device=device, dtype=dtype)
    decode_timestep = 0.025

    print("theirs_encoding")
    theirs_encoder_output = theirs_model.encode(input).latent_dist.mode()
    print("theirs_decoding")
    theirs_decoder_output = theirs_model.decode(theirs_encoder_output, timestep=decode_timestep, target_shape=(batch_size, num_channels, num_frames, height, width)).sample
    print("theirs:", theirs_encoder_output.shape, theirs_decoder_output.shape)

    print("ours_encoding")
    ours_encoder_output = ours_model.encode(input).latent_dist.mode()
    print("ours_decoding")
    ours_decoder_output = ours_model.decode(ours_encoder_output, temb=decode_timestep).sample
    print("ours:", ours_encoder_output.shape, ours_decoder_output.shape)

    diff_encoder = theirs_encoder_output - ours_encoder_output
    diff_decoder = theirs_decoder_output - ours_decoder_output

    absmax_encoder, absmean_encoder = torch.max(diff_encoder.abs()), torch.mean(diff_encoder.abs())
    absmax_decoder, absmean_decoder = torch.max(diff_decoder.abs()), torch.mean(diff_decoder.abs())

    print(f"Encoder: absmax={absmax_encoder}, absmean={absmean_encoder}")
    print(f"Decoder: absmax={absmax_decoder}, absmean={absmean_decoder}")


match_vae()
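
If the conversion is correct, the encoder and decoder absmax/absmean differences printed at the end should be close to zero (up to floating-point accumulation noise).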

@hlky linked an issue Mar 13, 2025 that may be closed by this pull request
yiyixuxu and others added 3 commits March 13, 2025 23:24
* up

* Update src/diffusers/pipelines/ltx/pipeline_ltx_condition.py

Co-authored-by: hlky <[email protected]>

* make it work

* update conversion script

* Apply suggestions from code review

Co-authored-by: Aryan <[email protected]>

* add docs tests + more refactor

---------

Co-authored-by: hlky <[email protected]>
Co-authored-by: Aryan <[email protected]>
def __call__(
    self,
    conditions: Union[LTXVideoCondition, List[LTXVideoCondition]] = None,
    image: Union[PipelineImageInput, List[PipelineImageInput]] = None,
Collaborator

I like the LTXVideoCondition class!
But let's still support the plain image/video inputs, so that users can do simple image-to-video and video-to-video with the same API they use for the other img2video/video2video pipelines.
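
For illustration, a minimal sketch of the two equivalent call styles (assuming a loaded pipeline as in the testing scripts above; pipe, image, and prompt are placeholders):

# style 1: explicit conditioning via LTXVideoCondition
video = pipe(conditions=[LTXVideoCondition(image=image)], prompt=prompt).frames[0]

# style 2: plain image-to-video, same API as other pipelines
video = pipe(image=image, prompt=prompt).frames[0]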

Member Author

Sounds good!

@a-r-r-o-w (Member Author) left a comment

Thanks @yiyixuxu! Just had a question, but otherwise looks good to merge


@yiyixuxu merged commit 2e83cbb into main on Mar 18, 2025
14 of 15 checks passed
@yiyixuxu deleted the integrations/ltx-0.9.5 branch on March 18, 2025 02:43
@DN6 added the "roadmap" (Add to current release roadmap) label on Mar 20, 2025
@DN6 moved this from In Progress to Done in Diffusers Roadmap 0.34 on Mar 20, 2025
sayakpaul added a commit that referenced this pull request Mar 20, 2025

@SHYuanBest (Contributor)

Hi, it seems that LTXPipeline has a bug when loading LTX-0.9.5 (moving the pipeline to a device)?

import torch
from diffusers import LTXPipeline
from diffusers.utils import export_to_video

base_model_path = "YiYiXu/ltx-95"

pipe = LTXPipeline.from_pretrained(base_model_path, torch_dtype=torch.bfloat16)
pipe.to("cuda")
The config attributes {'timestep_scale_multiplier': 1000.0} were passed to AutoencoderKLLTXVideo, but are not expected and will be ignored. Please verify your config.json configuration file.
Some weights of AutoencoderKLLTXVideo were not initialized from the model checkpoint at LTX-Video-0.9.5-diffusers/vae and are newly initialized: ['decoder.timestep_scale_multiplier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading pipeline components...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.07it/s]
Traceback (most recent call last):
  File "inference.py", line 8, in <module>
    pipe.to("cuda")
  File "/storage/miniconda3/envs/wan_train/lib/python3.11/site-packages/diffusers/pipelines/pipeline_utils.py", line 482, in to
    module.to(device, dtype)
  File "/storage/miniconda3/envs/wan_train/lib/python3.11/site-packages/diffusers/models/modeling_utils.py", line 1351, in to
    return super().to(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/storage/miniconda3/envs/wan_train/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1340, in to
    return self._apply(convert)
           ^^^^^^^^^^^^^^^^^^^^
  File "/storage/miniconda3/envs/wan_train/lib/python3.11/site-packages/torch/nn/modules/module.py", line 900, in _apply
    module._apply(fn)
  File "/storage/miniconda3/envs/wan_train/lib/python3.11/site-packages/torch/nn/modules/module.py", line 927, in _apply
    param_applied = fn(param)
                    ^^^^^^^^^
  File "/storage/miniconda3/envs/wan_train/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1333, in convert
    raise NotImplementedError(
NotImplementedError: Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device.

@a-r-r-o-w (Member Author)

Hi, please use the official Lightricks repository: https://huggingface.co/Lightricks/LTX-Video-0.9.5
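
A minimal sketch of the suggested fix, assuming the diffusers-format weights in the linked official repository (repo id taken from the URL above):

import torch
from diffusers import LTXPipeline

# load from the official Lightricks repository instead of the WIP test repo
pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video-0.9.5", torch_dtype=torch.bfloat16)
pipe.to("cuda")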

sayakpaul added a commit that referenced this pull request May 9, 2025
feat: pipeline-level quant config.

Co-authored-by: SunMarc <[email protected]>
Labels
roadmap Add to current release roadmap
Projects
None yet
Development

Successfully merging this pull request may close these issues.

add LTX-Video 0.9.5 diffusers support
5 participants