@@ -1,5 +1,3 @@
-import os
-os.environ['HF_HUB_CACHE'] = './checkpoints/hf_cache'
 import gradio as gr
 import torch
 import torchaudio
@@ -30,8 +28,6 @@
     model[key].to(device)
 model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)

-print(f"cfm has {sum(p.numel() for p in model.cfm.parameters() if p.requires_grad)} trainable parameters")
-
 # Load additional modules
 from modules.campplus.DTDNN import CAMPPlus

@@ -49,19 +45,6 @@
 bigvgan_model.remove_weight_norm()
 bigvgan_model = bigvgan_model.eval().to(device)

-ckpt_path, config_path = load_custom_model_from_hf("Plachta/FAcodec", 'pytorch_model.bin', 'config.yml')
-
-codec_config = yaml.safe_load(open(config_path))
-codec_model_params = recursive_munch(codec_config['model_params'])
-codec_encoder = build_model(codec_model_params, stage="codec")
-
-ckpt_params = torch.load(ckpt_path, map_location="cpu")
-
-for key in codec_encoder:
-    codec_encoder[key].load_state_dict(ckpt_params[key], strict=False)
-_ = [codec_encoder[key].eval() for key in codec_encoder]
-_ = [codec_encoder[key].to(device) for key in codec_encoder]
-
 # whisper
 from transformers import AutoFeatureExtractor, WhisperModel

@@ -79,7 +62,7 @@
     "num_mels": config['preprocess_params']['spect_params']['n_mels'],
     "sampling_rate": sr,
     "fmin": 0,
-    "fmax": None if config['preprocess_params']['spect_params'].get('fmax') == "None" else config['preprocess_params']['spect_params']['fmax'],
+    "fmax": None,
     "center": False
 }
 from modules.audio import mel_spectrogram
@@ -88,7 +71,7 @@

 # f0 conditioned model
 dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
-                                                                 "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema_v2.pth",
+                                                                 "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema.pth",
                                                                  "config_dit_mel_seed_uvit_whisper_base_f0_44k.yml")

 config = yaml.safe_load(open(dit_config_path, 'r'))
@@ -118,7 +101,7 @@
     "num_mels": config['preprocess_params']['spect_params']['n_mels'],
     "sampling_rate": sr,
     "fmin": 0,
-    "fmax": None if config['preprocess_params']['spect_params'].get('fmax') == "None" else config['preprocess_params']['spect_params']['fmax'],
+    "fmax": None,
     "center": False
 }
 to_mel_f0 = lambda x: mel_spectrogram(x, **mel_fn_args_f0)
@@ -128,15 +111,6 @@
 bigvgan_44k_model.remove_weight_norm()
 bigvgan_44k_model = bigvgan_44k_model.eval().to(device)

-from modules.hifigan.generator import HiFTGenerator
-from modules.hifigan.f0_predictor import ConvRNNF0Predictor
-
-hift_config = yaml.safe_load(open('configs/hifigan.yml', 'r'))
-hift_gen = HiFTGenerator(**hift_config['hift'], f0_predictor=ConvRNNF0Predictor(**hift_config['f0_predictor']))
-hift_gen.load_state_dict(torch.load(hift_config['pretrained_model_path'], map_location='cpu'))
-hift_gen.eval()
-hift_gen.to(device)
-
 def adjust_f0_semitones(f0_sequence, n_semitones):
     factor = 2 ** (n_semitones / 12)
     return f0_sequence * factor
@@ -161,7 +135,7 @@ def crossfade(chunk1, chunk2, overlap):
 def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, f0_condition, auto_f0_adjust, pitch_shift):
     inference_module = model if not f0_condition else model_f0
     mel_fn = to_mel if not f0_condition else to_mel_f0
-    bigvgan_fn = hift_gen if not f0_condition else bigvgan_44k_model
+    bigvgan_fn = bigvgan_model if not f0_condition else bigvgan_44k_model
     sr = 22050 if not f0_condition else 44100
     # Load audio
     source_audio = librosa.load(source, sr=sr)[0]
@@ -302,8 +276,6 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
                                                    inference_cfg_rate=inference_cfg_rate)
         vc_target = vc_target[:, :, mel2.size(-1):]
         vc_wave = bigvgan_fn(vc_target)[0]
-        if vc_wave.ndim == 1:
-            vc_wave = vc_wave.unsqueeze(0)
         if processed_frames == 0:
             if is_last_chunk:
                 output_wave = vc_wave[0].cpu().numpy()
@@ -385,4 +357,4 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
     title="Seed Voice Conversion",
     examples=examples,
     cache_examples=False,
-).launch()
+).launch()