Skip to content

Commit 6901140

Browse files
committed
fix vocoder issue in app.py
1 parent: f7c4796 · commit: 6901140

File tree

1 file changed

+5 additions, -33 deletions (lines changed)

1 file changed

+5
-33
lines changed

app.py

Lines changed: 5 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,3 @@
1-
import os
2-
os.environ['HF_HUB_CACHE'] = './checkpoints/hf_cache'
31
import gradio as gr
42
import torch
53
import torchaudio
@@ -30,8 +28,6 @@
3028
model[key].to(device)
3129
model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
3230

33-
print(f"cfm has {sum(p.numel() for p in model.cfm.parameters() if p.requires_grad)} trainable parameters")
34-
3531
# Load additional modules
3632
from modules.campplus.DTDNN import CAMPPlus
3733

@@ -49,19 +45,6 @@
4945
bigvgan_model.remove_weight_norm()
5046
bigvgan_model = bigvgan_model.eval().to(device)
5147

52-
ckpt_path, config_path = load_custom_model_from_hf("Plachta/FAcodec", 'pytorch_model.bin', 'config.yml')
53-
54-
codec_config = yaml.safe_load(open(config_path))
55-
codec_model_params = recursive_munch(codec_config['model_params'])
56-
codec_encoder = build_model(codec_model_params, stage="codec")
57-
58-
ckpt_params = torch.load(ckpt_path, map_location="cpu")
59-
60-
for key in codec_encoder:
61-
codec_encoder[key].load_state_dict(ckpt_params[key], strict=False)
62-
_ = [codec_encoder[key].eval() for key in codec_encoder]
63-
_ = [codec_encoder[key].to(device) for key in codec_encoder]
64-
6548
# whisper
6649
from transformers import AutoFeatureExtractor, WhisperModel
6750

@@ -79,7 +62,7 @@
7962
"num_mels": config['preprocess_params']['spect_params']['n_mels'],
8063
"sampling_rate": sr,
8164
"fmin": 0,
82-
"fmax": None if config['preprocess_params']['spect_params'].get('fmax') == "None" else config['preprocess_params']['spect_params']['fmax'],
65+
"fmax": None,
8366
"center": False
8467
}
8568
from modules.audio import mel_spectrogram
@@ -88,7 +71,7 @@
8871

8972
# f0 conditioned model
9073
dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
91-
"DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema_v2.pth",
74+
"DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema.pth",
9275
"config_dit_mel_seed_uvit_whisper_base_f0_44k.yml")
9376

9477
config = yaml.safe_load(open(dit_config_path, 'r'))
@@ -118,7 +101,7 @@
118101
"num_mels": config['preprocess_params']['spect_params']['n_mels'],
119102
"sampling_rate": sr,
120103
"fmin": 0,
121-
"fmax": None if config['preprocess_params']['spect_params'].get('fmax') == "None" else config['preprocess_params']['spect_params']['fmax'],
104+
"fmax": None,
122105
"center": False
123106
}
124107
to_mel_f0 = lambda x: mel_spectrogram(x, **mel_fn_args_f0)
@@ -128,15 +111,6 @@
128111
bigvgan_44k_model.remove_weight_norm()
129112
bigvgan_44k_model = bigvgan_44k_model.eval().to(device)
130113

131-
from modules.hifigan.generator import HiFTGenerator
132-
from modules.hifigan.f0_predictor import ConvRNNF0Predictor
133-
134-
hift_config = yaml.safe_load(open('configs/hifigan.yml', 'r'))
135-
hift_gen = HiFTGenerator(**hift_config['hift'], f0_predictor=ConvRNNF0Predictor(**hift_config['f0_predictor']))
136-
hift_gen.load_state_dict(torch.load(hift_config['pretrained_model_path'], map_location='cpu'))
137-
hift_gen.eval()
138-
hift_gen.to(device)
139-
140114
def adjust_f0_semitones(f0_sequence, n_semitones):
141115
factor = 2 ** (n_semitones / 12)
142116
return f0_sequence * factor
@@ -161,7 +135,7 @@ def crossfade(chunk1, chunk2, overlap):
161135
def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, f0_condition, auto_f0_adjust, pitch_shift):
162136
inference_module = model if not f0_condition else model_f0
163137
mel_fn = to_mel if not f0_condition else to_mel_f0
164-
bigvgan_fn = hift_gen if not f0_condition else bigvgan_44k_model
138+
bigvgan_fn = bigvgan_model if not f0_condition else bigvgan_44k_model
165139
sr = 22050 if not f0_condition else 44100
166140
# Load audio
167141
source_audio = librosa.load(source, sr=sr)[0]
@@ -302,8 +276,6 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
302276
inference_cfg_rate=inference_cfg_rate)
303277
vc_target = vc_target[:, :, mel2.size(-1):]
304278
vc_wave = bigvgan_fn(vc_target)[0]
305-
if vc_wave.ndim == 1:
306-
vc_wave = vc_wave.unsqueeze(0)
307279
if processed_frames == 0:
308280
if is_last_chunk:
309281
output_wave = vc_wave[0].cpu().numpy()
@@ -385,4 +357,4 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
385357
title="Seed Voice Conversion",
386358
examples=examples,
387359
cache_examples=False,
388-
).launch()
360+
).launch()

0 commit comments

Comments
 (0)