
Commit 2830a42

improve real time vc stability by expanding left context window for content encoder

1 parent 6901140 commit 2830a42

File tree

3 files changed: +17 −7 lines changed

README-ZH.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -87,7 +87,7 @@ python real-time-gui.py --checkpoint <path-to-checkpoint> --config <path-to-conf
 
 | Model Configuration | Diffusion Steps | Inference CFG Rate | Max Prompt Length | Block Time (s) | Crossfade Length (s) | Extra context (left) (s) | Extra context (right) (s) | Latency (ms) | Inference Time per Chunk (ms) |
 |---------------------|-----------------|--------------------|-------------------|----------------|----------------------|--------------------------|---------------------------|--------------|-------------------------------|
-| seed-uvit-xlsr-tiny | 10              | 0.7                | 3.0               | 0.18s          | 0.04s                | 0.5s                     | 0.02s                     | 430ms        | 150ms                         |
+| seed-uvit-xlsr-tiny | 10              | 0.7                | 3.0               | 0.18s          | 0.04s                | 2.5s                     | 0.02s                     | 430ms        | 150ms                         |
 
 You can adjust the parameters in the GUI according to your device's performance; the voice conversion stream will work as long as the inference time is less than the block time. Note that inference speed may drop if you are running other GPU-intensive tasks (e.g. gaming, watching videos).
 
```

README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -94,7 +94,7 @@ Some performance testing has been done on a NVIDIA RTX 3060 Laptop GPU, results
 
 | Model Configuration | Diffusion Steps | Inference CFG Rate | Max Prompt Length | Block Time (s) | Crossfade Length (s) | Extra context (left) (s) | Extra context (right) (s) | Latency (ms) | Inference Time per Chunk (ms) |
 |---------------------------------|-----------------|--------------------|-------------------|----------------|----------------------|--------------------------|---------------------------|--------------|-------------------------------|
-| seed-uvit-xlsr-tiny | 10 | 0.7 | 3.0 | 0.18s | 0.04s | 0.5s | 0.02s | 430ms | 150ms |
+| seed-uvit-xlsr-tiny | 10 | 0.7 | 3.0 | 0.18s | 0.04s | 2.5s | 0.02s | 430ms | 150ms |
 
 You can adjust the parameters in the GUI according to your own device performance, the voice conversion stream should work well as long as Inference Time is less than Block Time.
 Note that inference speed may drop if you are running other GPU intensive tasks (e.g. gaming, watching videos)
```
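
The stability condition stated in the README (conversion keeps up with real time only while per-chunk inference time stays below the block time) can be sketched as a quick check. The figures below are the seed-uvit-xlsr-tiny row from the table; the helper name is illustrative, not part of real-time-gui.py:

```python
# Quick feasibility check for the streaming condition described above:
# the stream keeps up only if each chunk is processed faster than it arrives.

def keeps_up(block_time_s: float, inference_time_ms: float) -> bool:
    """True if per-chunk inference time is below the block time."""
    return inference_time_ms / 1000.0 < block_time_s

# seed-uvit-xlsr-tiny figures from the table above
print(keeps_up(block_time_s=0.18, inference_time_ms=150.0))  # True: 150 ms < 180 ms
```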

real-time-gui.py

Lines changed: 15 additions & 5 deletions

```diff
@@ -45,6 +45,7 @@
 reference_wav_name = ""
 
 prompt_len = 3  # in seconds
+ce_dit_difference = 2  # 2 seconds
 @torch.no_grad()
 def custom_infer(model_set,
                  reference_wav,
@@ -61,6 +62,7 @@ def custom_infer(model_set,
     global prompt_condition, mel2, style2
     global reference_wav_name
     global prompt_len
+    global ce_dit_difference
     (
         model,
         semantic_fn,
@@ -94,12 +96,20 @@ def custom_infer(model_set,
         reference_wav_name = new_reference_wav_name
 
     converted_waves_16k = input_wav_res
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    torch.cuda.synchronize()
+    start_event.record()
     S_alt = semantic_fn(converted_waves_16k.unsqueeze(0))
+    end_event.record()
+    torch.cuda.synchronize()  # Wait for the events to be recorded!
+    elapsed_time_ms = start_event.elapsed_time(end_event)
+    print(f"Time taken for semantic_fn: {elapsed_time_ms}ms")
 
-    target_lengths = torch.LongTensor([(skip_head + return_length + skip_tail) / 50 * sr // hop_length]).to(S_alt.device)
-
+    S_alt = S_alt[:, ce_dit_difference * 50:]
+    target_lengths = torch.LongTensor([(skip_head + return_length + skip_tail - ce_dit_difference * 50) / 50 * sr // hop_length]).to(S_alt.device)
     cond = model.length_regulator(
         S_alt, ylens=target_lengths, n_quantizers=3, f0=None
     )[0]
     cat_condition = torch.cat([prompt_condition, cond], dim=1)
     vc_target = model.cfm.inference(
@@ -420,7 +430,7 @@ def load(self):
             "sr_type": "sr_model",
             "block_time": 0.5,
             "crossfade_length": 0.04,
-            "extra_time": 0.5,
+            "extra_time": 2.5,
             "extra_time_right": 0.02,
             "diffusion_steps": 10,
             "inference_cfg_rate": 0.7,
@@ -594,7 +604,7 @@ def launcher(self):
             [
                 sg.Text("Extra context time (left)"),
                 sg.Slider(
-                    range=(0.5, 10.0),
+                    range=(2.5, 10.0),
                     key="extra_time",
                     resolution=0.1,
                     orientation="h",
```
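
The core of this change is that the content encoder now sees 2.5 s of left context, but the first `ce_dit_difference = 2` seconds of its output frames are dropped again before the length regulator and DiT stage. A minimal sketch of that arithmetic, assuming the 50 Hz semantic frame rate implied by the `* 50` factors in the diff (the function name is illustrative, not part of the repository):

```python
FRAME_RATE = 50  # semantic frames per second (assumed from the `* 50` factors)

def frames_after_trim(total_frames: int, ce_dit_difference: int) -> int:
    """Frames remaining after dropping ce_dit_difference seconds of left
    context, mirroring `S_alt = S_alt[:, ce_dit_difference * 50:]`."""
    return total_frames - ce_dit_difference * FRAME_RATE

# With extra_time = 2.5 s of left context, the encoder output gains
# 2.5 * 50 = 125 extra frames; 2 * 50 = 100 of them are trimmed again,
# leaving 0.5 s of effective left context for the downstream stages.
extra_frames = int(2.5 * FRAME_RATE)        # 125
print(frames_after_trim(extra_frames, 2))   # 25 frames, i.e. 0.5 s
```

The encoder thus gets a much longer receptive field for stability, while the diffusion model still conditions on the same effective window as before.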
