antonibigata
diff --git a/‎configs/example_training/svd_interpolation_w_lip.yaml‎
Lines changed: 374 additions & 0 deletions b/‎configs/example_training/svd_interpolation_w_lip.yaml‎
Lines changed: 374 additions & 0 deletions
@@ -0,0 +1,374 @@
+model:
+  base_learning_rate: 3.e-5
+  target: sgm.models.diffusion.DiffusionEngine
+  params:
+    input_key: latents
+    no_log_keys: [audio_emb, fps_id, motion_bucket_id, cond_aug]
+    scale_factor: 0.18215
+    disable_first_stage_autocast: True
+    ckpt_path: logs/2024-05-28T11-10-27_example_training-svd_interpolation_no_emb/checkpoints/last.ckpt/checkpoint/mp_rank_00_model_states.pt
+    remove_keys_from_weights: []
+    compile_model: False
+    en_and_decode_n_samples_a_time: 1
+    # optimizer_config: 
+    #   target: deepspeed.ops.adam.DeepSpeedCPUAdam
+
+    scheduler_config:
+      target: sgm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [1000]
+        cycle_lengths: [10000000000000]
+        f_start: [1.e-6]
+        f_max: [1.]
+        f_min: [1.]
+
+    to_freeze: []
+    to_unfreeze: []
+
+    # LoRA
+    use_lora: False
+    lora_config:
+      search_class_str: Linear
+      target_replace_module: null
+      r_linear: 16
+      r_conv: 16
+      loras: null  # path to lora .pt
+      # verbose: False
+      # dropout_p: 0.0
+      # scale: 1.0
+      # search_class: both
+
+    denoiser_config:
+      target: sgm.modules.diffusionmodules.denoiser.Denoiser
+      params:
+        scaling_config:
+          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
+
+    # network_wrapper: sgm.modules.diffusionmodules.wrappers.IdentityWrapper
+    network_wrapper: 
+      target: sgm.modules.diffusionmodules.wrappers.InterpolationWrapper
+      params:
+        im_size: [512, 512] # USER: adapt this to your dataset
+        n_channels: 4
+        starting_mask_method: zeros
+        add_mask: True
+
+    network_config:
+      target: sgm.modules.diffusionmodules.video_model.VideoUNet
+      params:
+        adm_in_channels: 0
+        num_classes: sequential
+        use_checkpoint: True
+        in_channels: 9
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [4, 2, 1]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4, 4]
+        num_head_channels: 64
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        spatial_transformer_attn_type: softmax-xformers
+        extra_ff_mix_layer: True
+        use_spatial_context: True
+        merge_strategy: learned_with_images
+        video_kernel_size: [3, 1, 1]
+        fine_tuning_method: null
+        audio_cond_method: to_time_emb
+        additional_audio_frames: 0
+        audio_dim: 768
+        unfreeze_blocks: ["input"] # Because we changed the input block
+        # adapter_kwargs:
+        #   # down_ratio: 1
+        #   # adapter_type: null
+        #   # adapter_weight: null
+        #   # act_layer: gelu
+        #   # zero_init_last: True
+        #   # use_bias: True
+        #   # adapt_on_time: True
+        #   # condition_on: space
+        #   # condition_dim: 1280
+        #   target_replace_module: ["SpatialVideoTransformer"]
+        #   r: 16
+        #   loras: null  # path to lora .pt
+        #   verbose: False
+        #   dropout_p: 0.0
+        #   scale: 1.0
+
+    conditioner_config:
+      target: sgm.modules.GeneralConditioner
+      params:
+        emb_models:
+        - is_trainable: False
+          input_key: cond_frames_without_noise
+          ucg_rate: 0.1
+          target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
+          params:
+            n_cond_frames: 2
+            n_copies: 1
+            open_clip_embedding_config:
+              target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
+              params:
+                freeze: True
+
+        # - input_key: fps_id
+        #   is_trainable: False
+        #   target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+        #   params:
+        #     outdim: 256
+
+        # - input_key: motion_bucket_id
+        #   is_trainable: False
+        #   target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+        #   params:
+        #     outdim: 256
+
+        - input_key: cond_frames
+          is_trainable: False
+          ucg_rate: 0.1
+          target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
+          params:
+            disable_encoder_autocast: True
+            n_cond_frames: 2
+            n_copies: 1
+            is_ae: True
+            load_encoder: False
+            encoder_config:
+              target: sgm.models.autoencoder.AutoencoderKLModeOnly
+              params:
+                embed_dim: 4
+                monitor: val/rec_loss
+                ddconfig:
+                  attn_type: vanilla-xformers
+                  double_z: True
+                  z_channels: 4
+                  resolution: 256
+                  in_channels: 3
+                  out_ch: 3
+                  ch: 128
+                  ch_mult: [1, 2, 4, 4]
+                  num_res_blocks: 2
+                  attn_resolutions: []
+                  dropout: 0.0
+                lossconfig:
+                  target: torch.nn.Identity
+
+        # - input_key: cond_aug
+        #   is_trainable: False
+        #   target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+        #   params:
+        #     outdim: 256
+
+        - input_key: audio_emb
+          is_trainable: True
+          ucg_rate: 0.2
+          target: sgm.modules.encoders.modules.WhisperAudioEmbedder
+          params:
+            merge_method: mean 
+            linear_dim: null
+
+    first_stage_config:
+      target: sgm.models.autoencoder.AutoencodingEngine
+      params:
+        loss_config:
+          target: torch.nn.Identity
+        regularizer_config:
+          target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
+        encoder_config: 
+          target: sgm.modules.diffusionmodules.model.Encoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [1, 2, 4, 4]
+            num_res_blocks: 2
+            attn_resolutions: []
+            dropout: 0.0
+        decoder_config:
+          target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
+          params:
+            attn_type: vanilla
+            double_z: True
+            z_channels: 4
+            resolution: 256
+            in_channels: 3
+            out_ch: 3
+            ch: 128
+            ch_mult: [1, 2, 4, 4]
+            num_res_blocks: 2
+            attn_resolutions: []
+            dropout: 0.0
+            video_kernel_size: [3, 1, 1]
+
+    sampler_config:
+      target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
+      params:
+        num_steps: 10
+        discretization_config:
+          target: sgm.modules.diffusionmodules.discretizer.AYSDiscretization
+          # params:
+          #   # sigma_max: 700.0
+
+        guider_config:
+          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
+          params:
+            max_scale: 2.5
+            min_scale: 1.0
+            num_frames: 14
+
+    loss_fn_config:
+      target: sgm.modules.diffusionmodules.loss.StandardWithLipLoss
+      params:  
+        lambda_lower: 1.    
+        weight_path: /data/home/antoni/code/generative-models/checkpoints/vsr_trlrs3_base.max400.pth
+        batch2model_keys:
+          - image_only_indicator
+          - num_video_frames
+        loss_weighting_config:
+          # target: sgm.modules.diffusionmodules.loss_weighting.EpsWeighting
+          target: sgm.modules.diffusionmodules.loss_weighting.VWeighting
+        sigma_sampler_config:
+          target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
+          params:
+            # p_mean: 0.7
+            # p_std: 1.6
+            p_mean: 1.
+            p_std: 1.2
+
+data:
+  target: sgm.data.video_datamodule_latent.VideoDataModule
+  params:
+    train:
+      datapipeline:
+        # urls:
+        #   # USER: adapt this path the root of your custom dataset
+        #   - /data2/Datasets/LRW/webdata/train/out-{000000..000004}.tar
+        # pipeline_config:
+        #   shardshuffle: 10000
+        #   sample_shuffle: 100 # USER: you might wanna adapt depending on your available RAM
+
+        # decoders:
+        #   - custom
+        # postprocessors:
+        #   - target: sdata.mappers.SelectTuple
+        #     params:
+        #       key: 'mp4' # USER: you might wanna adapt this for your custom dataset
+        #       index: 0
+        #   - target: sdata.mappers.ToSVDFormat
+        #     params:
+        #       key: mp4
+        #       audio_key: pt
+        #       n_frames: 14
+        #       resize_size: 320
+        #       motion_id: 60
+        #       fps: 24 # FPS - 1 See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188
+        #       cond_noise: [-3.0, 0.5]
+        #       mode: interpolation
+        # filelist: /vol/paramonos2/projects/antoni/datasets/HDTF/filelist_videos_train.txt
+        filelist: /fsx/rs2517/data/lists/HDTF/filelist_videos_train.txt
+        resize_size: 512
+        audio_folder: /fsx/rs2517/data/HDTF/audio
+        video_folder: /fsx/rs2517/data/HDTF/cropped_videos_original
+        lip_emb_folder: /fsx/antoni/data/HDTF/lipemb
+        landmarks_folder: null
+        video_extension: .mp4
+        audio_extension: .wav
+        latent_folder: null
+        audio_in_video: False
+        audio_rate: 16000
+        num_frames: 14
+        need_cond: True
+        mode: interpolation
+        use_latent: True
+        latent_type: video
+        latent_scale: 1  # For backwards compatibility
+        from_audio_embedding: True
+        load_all_possible_indexes: True
+        audio_emb_type: wav2vec2
+        # cond_noise: [-3.0, 0.5]
+        cond_noise: 0.
+        motion_id: 125
+        data_mean: null
+        data_std: null
+        use_latent_condition: True
+        get_lip_emb: True
+        get_landmarks: True
+
+      loader:
+        batch_size: 1
+        num_workers: 6
+        drop_last: True
+        pin_memory: True
+        persistent_workers: True
+        # collation_fn:
+        #   target: sgm.data.collates.collate_video
+        #   params:
+        #     merge_keys: [frames]
+            
+    # validation:
+
+    #   datapipeline:
+    #     urls:
+    #       # USER: adapt this path the root of your custom dataset
+    #       - /data/122-2/Datasets/CREMA/webdataset/val/out-{000000..000001}.tar
+    #     pipeline_config:
+    #       shardshuffle: 10000
+    #       sample_shuffle: 1000 # USER: you might wanna adapt depending on your available RAM
+
+    #     decoders:
+    #       - video
+    #     postprocessors:
+    #       - target: sdata.mappers.SelectTuple
+    #         params:
+    #           key: 'mp4' # USER: you might wanna adapt this for your custom dataset
+    #           index: 0
+    #       - target: sdata.mappers.ToSVDFormat
+    #         params:
+    #           key: mp4
+    #           n_frames: 14
+    #           resize_size: 256
+    #           cond_noise: [-3.0, 0.5]
+
+    #   loader:
+    #     batch_size: 2
+    #     num_workers: 6
+
+lightning:
+  modelcheckpoint:
+    params:
+      every_n_train_steps: 5000
+      save_top_k: 1
+
+  callbacks:
+    metrics_over_trainsteps_checkpoint:
+      params:
+        every_n_train_steps: 25000
+
+    video_logger:
+      target: sgm.callbacks.video_logger.VideoLogger
+      params:
+        disabled: False
+        enable_autocast: False
+        batch_frequency: 1000
+        max_videos: 1
+        increase_log_steps: False
+        log_first_step: True
+        log_videos_kwargs:
+          ucg_keys: [cond_frames, cond_frames_without_noise, audio_emb]
+          use_ema_scope: False
+          N: 1
+          n_rows: 1
+
+  trainer:
+    devices: -1
+    benchmark: False
+    num_sanity_val_steps: 1
+    accumulate_grad_batches: 1
+    max_epochs: 1000
+    precision: bf16-mixed
+    num_nodes: 1