20 changes: 17 additions & 3 deletions README.md
@@ -18,11 +18,17 @@ We are keeping on improving the model quality and adding more features.
## Evaluation📊
See [EVAL.md](EVAL.md) for objective evaluation results and comparisons with other baselines.
## Installation📥
Suggested python 3.10 on Windows or Linux.
Python 3.10 is suggested on Windows, Mac M Series (Apple Silicon), or Linux.
Windows and Linux:
```bash
pip install -r requirements.txt
```

Mac M Series:
```bash
pip install -r requirements-mac.txt
```
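
To confirm the install can actually reach the Metal backend, a quick check like this helps (illustrative, not part of this PR):

```python
import torch

# Both should print True on Apple Silicon with a recent PyTorch build;
# False means the wheels above did not come with MPS support.
print(torch.backends.mps.is_built())
print(torch.backends.mps.is_available())
```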

## Usage🛠️
We have released 3 models for different purposes:

@@ -93,8 +99,9 @@ python real-time-gui.py --checkpoint <path-to-checkpoint> --config <path-to-conf
- `checkpoint` is the path to the model checkpoint if you have trained or fine-tuned your own model; leave blank to auto-download the default model (`seed-uvit-tat-xlsr-tiny`) from Hugging Face
- `config` is the path to the model config if you have trained or fine-tuned your own model; leave blank to auto-download the default config from Hugging Face

IMPORTANT: It is strongly recommended to use a GPU for real-time voice conversion.
Some performance testing has been done on a NVIDIA RTX 3060 Laptop GPU, results and recommended parameter settings are listed below:
> [!IMPORTANT]
> It is strongly recommended to use a GPU for real-time voice conversion.
> Some performance testing has been done on an NVIDIA RTX 3060 Laptop GPU; results and recommended parameter settings are listed below:

| Model Configuration | Diffusion Steps | Inference CFG Rate | Max Prompt Length | Block Time (s) | Crossfade Length (s) | Extra context (left) (s) | Extra context (right) (s) | Latency (ms) | Inference Time per Chunk (ms) |
|---------------------------------|-----------------|--------------------|-------------------|----------------|----------------------|--------------------------|---------------------------|--------------|-------------------------------|
@@ -186,8 +193,15 @@ where:
- [x] Colab Notebook for fine-tuning example
- [ ] Replace whisper with more advanced linguistic content extractor
- [ ] More to be added
- [x] Add Apple Silicon support

## Known Issues
- On Mac, running `real-time-gui.py` may raise `ModuleNotFoundError: No module named '_tkinter'`. In this case, install a Python build **with Tkinter support**. Refer to [this guide on Stack Overflow](https://stackoverflow.com/questions/76105218/why-does-tkinter-or-turtle-seem-to-be-missing-or-broken-shouldnt-it-be-part) for an explanation of the problem and a detailed fix.
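
A quick way to check whether the current interpreter was built with Tk support (illustrative, not part of this diff):

```python
# Succeeds only on Python builds compiled with Tk support; otherwise this
# import raises ModuleNotFoundError: No module named '_tkinter'.
import tkinter
print(tkinter.TkVersion)
```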


## CHANGELOGS🗒️
- 2025-03-03:
- Added Mac M Series (Apple Silicon) support
- 2024-11-26:
- Updated v1.0 tiny version pretrained model, optimized for real-time voice conversion
- Support one-shot/few-shot single/multi speaker fine-tuning
16 changes: 13 additions & 3 deletions app.py
@@ -9,7 +9,13 @@
from pydub import AudioSegment

# Load model and configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
"DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
@@ -233,8 +239,12 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
F0_ori = rmvpe.infer_from_audio(ref_waves_16k[0], thred=0.03)
F0_alt = rmvpe.infer_from_audio(converted_waves_16k[0], thred=0.03)

F0_ori = torch.from_numpy(F0_ori).to(device)[None]
F0_alt = torch.from_numpy(F0_alt).to(device)[None]
if device == "mps":
F0_ori = torch.from_numpy(F0_ori).float().to(device)[None]
F0_alt = torch.from_numpy(F0_alt).float().to(device)[None]
else:
F0_ori = torch.from_numpy(F0_ori).to(device)[None]
F0_alt = torch.from_numpy(F0_alt).to(device)[None]

voiced_F0_ori = F0_ori[F0_ori > 1]
voiced_F0_alt = F0_alt[F0_alt > 1]
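
The `.float()` cast in the MPS branch matters because the MPS backend has no float64 support, and `torch.from_numpy` preserves the source dtype, so a float64 pitch array would fail to move to the device. A minimal standalone illustration (stand-in array, not real RMVPE output):

```python
import numpy as np
import torch

f0 = np.zeros(100)                  # NumPy arrays default to float64
t64 = torch.from_numpy(f0)          # dtype stays torch.float64
t32 = torch.from_numpy(f0).float()  # cast down to float32 first
# On Apple Silicon, t64.to("mps") raises a TypeError (no float64 on MPS),
# while t32.to("mps") transfers cleanly.
```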
16 changes: 13 additions & 3 deletions app_svc.py
@@ -294,8 +294,12 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
F0_ori = f0_fn(ref_waves_16k[0], thred=0.03)
F0_alt = f0_fn(converted_waves_16k[0], thred=0.03)

F0_ori = torch.from_numpy(F0_ori).to(device)[None]
F0_alt = torch.from_numpy(F0_alt).to(device)[None]
if device.type == "mps":
F0_ori = torch.from_numpy(F0_ori).float().to(device)[None]
F0_alt = torch.from_numpy(F0_alt).float().to(device)[None]
else:
F0_ori = torch.from_numpy(F0_ori).to(device)[None]
F0_alt = torch.from_numpy(F0_alt).to(device)[None]

voiced_F0_ori = F0_ori[F0_ori > 1]
voiced_F0_alt = F0_alt[F0_alt > 1]
@@ -436,5 +440,11 @@ def main(args):
parser.add_argument("--gpu", type=int, help="Which GPU id to use", default=0)
args = parser.parse_args()
cuda_target = f"cuda:{args.gpu}" if args.gpu else "cuda"
device = torch.device(cuda_target if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    device = torch.device(cuda_target)
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
main(args)
8 changes: 7 additions & 1 deletion app_vc.py
@@ -389,5 +389,11 @@ def main(args):
parser.add_argument("--gpu", type=int, help="Which GPU id to use", default=0)
args = parser.parse_args()
cuda_target = f"cuda:{args.gpu}" if args.gpu else "cuda"
device = torch.device(cuda_target if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    device = torch.device(cuda_target)
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
main(args)
2 changes: 1 addition & 1 deletion dac/utils/decode.py
@@ -43,7 +43,7 @@ def decode(
model_bitrate: str
Bitrate of the model. Must be one of "8kbps", or "16kbps". Defaults to "8kbps".
device : str, optional
Device to use, by default "cuda". If "cpu", the model will be loaded on the CPU.
Device to use, by default "cuda". Use "mps" on Apple Silicon devices or if "cpu", the model will be loaded on the CPU.
model_type : str, optional
The type of model to use. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz". Ignored if `weights_path` is specified.
"""
2 changes: 1 addition & 1 deletion dac/utils/encode.py
@@ -47,7 +47,7 @@ def encode(
n_quantizers : int, optional
Number of quantizers to use, by default None. If not specified, all the quantizers will be used and the model will compress at maximum bitrate.
device : str, optional
Device to use, by default "cuda"
Device to use, by default "cuda". Use "mps" on Apple Silicon devices.
model_type : str, optional
The type of model to use. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz". Ignored if `weights_path` is specified.
"""
8 changes: 7 additions & 1 deletion eval.py
@@ -23,7 +23,13 @@
from resemblyzer import preprocess_wav, VoiceEncoder

# Load model and configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector
from transformers import Wav2Vec2Processor, HubertForCTC
9 changes: 8 additions & 1 deletion inference.py
@@ -25,7 +25,14 @@


# Load model and configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

fp16 = False
def load_models(args):
global fp16
8 changes: 7 additions & 1 deletion modules/rmvpe.py
@@ -486,7 +486,13 @@ def __init__(self, model_path: str, is_half, device=None, use_jit=False):
self.resample_kernel = {}
self.is_half = is_half
if device is None:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
#device = "cuda:0" if torch.cuda.is_available() else "cpu"
if torch.cuda.is_available():
device = "cuda:0"
elif torch.backends.mps.is_available():
device = "mps"
else:
device = "cpu"
self.device = device
self.mel_extractor = MelSpectrogram(
is_half, 128, 16000, 1024, 160, None, 30, 8000
85 changes: 65 additions & 20 deletions real-time-gui.py
@@ -94,13 +94,22 @@ def custom_infer(model_set,
reference_wav_name = new_reference_wav_name

converted_waves_16k = input_wav_res
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
if device.type == "mps":
start_event = torch.mps.event.Event(enable_timing=True)
end_event = torch.mps.event.Event(enable_timing=True)
torch.mps.synchronize()
else:
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()

start_event.record()
S_alt = semantic_fn(converted_waves_16k.unsqueeze(0))
end_event.record()
torch.cuda.synchronize() # Wait for the events to be recorded!
if device.type == "mps":
torch.mps.synchronize() # MPS - Wait for the events to be recorded!
else:
torch.cuda.synchronize() # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(f"Time taken for semantic_fn: {elapsed_time_ms}ms")

@@ -466,7 +475,14 @@ def launcher(self):
initial_folder=os.path.join(
os.getcwd(), "examples/reference"
),
file_types=((". wav"), (". mp3"), (". flac"), (". m4a"), (". ogg"), (". opus")),
file_types=[
    ("WAV Files", "*.wav"),
    ("MP3 Files", "*.mp3"),
    ("FLAC Files", "*.flac"),
    ("M4A Files", "*.m4a"),
    ("OGG Files", "*.ogg"),
    ("Opus Files", "*.opus"),
],
),
],
],
@@ -786,7 +802,10 @@ def set_values(self, values):
return True

def start_vc(self):
torch.cuda.empty_cache()
if device.type == "mps":
torch.mps.empty_cache()
else:
torch.cuda.empty_cache()
self.reference_wav, _ = librosa.load(
self.gui_config.reference_audio_path, sr=self.model_set[-1]["sampling_rate"]
)
@@ -942,9 +961,14 @@ def audio_callback(
indata = librosa.to_mono(indata.T)

# VAD first
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
if device.type == "mps":
start_event = torch.mps.event.Event(enable_timing=True)
end_event = torch.mps.event.Event(enable_timing=True)
torch.mps.synchronize()
else:
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
start_event.record()
indata_16k = librosa.resample(indata, orig_sr=self.gui_config.samplerate, target_sr=16000)
res = self.vad_model.generate(input=indata_16k, cache=self.vad_cache, is_final=False, chunk_size=self.vad_chunk_size)
@@ -955,7 +979,10 @@
elif len(res_value) % 2 == 1 and self.vad_speech_detected:
self.set_speech_detected_false_at_end_flag = True
end_event.record()
torch.cuda.synchronize() # Wait for the events to be recorded!
if device.type == "mps":
torch.mps.synchronize() # MPS - Wait for the events to be recorded!
else:
torch.cuda.synchronize() # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(f"Time taken for VAD: {elapsed_time_ms}ms")

@@ -993,9 +1020,14 @@ def audio_callback(
if self.function == "vc":
if self.gui_config.extra_time_ce - self.gui_config.extra_time < 0:
raise ValueError("Content encoder extra context must be greater than DiT extra context!")
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
if device.type == "mps":
start_event = torch.mps.event.Event(enable_timing=True)
end_event = torch.mps.event.Event(enable_timing=True)
torch.mps.synchronize()
else:
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
start_event.record()
infer_wav = custom_infer(
self.model_set,
@@ -1014,7 +1046,10 @@
if self.resampler2 is not None:
infer_wav = self.resampler2(infer_wav)
end_event.record()
torch.cuda.synchronize() # Wait for the events to be recorded!
if device.type == "mps":
torch.mps.synchronize() # MPS - Wait for the events to be recorded!
else:
torch.cuda.synchronize() # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(f"Time taken for VC: {elapsed_time_ms}ms")
if not self.vad_speech_detected:
@@ -1037,12 +1072,16 @@
)
+ 1e-8
)
if sys.platform == "darwin":
_, sola_offset = torch.max(cor_nom[0, 0] / cor_den[0, 0])
sola_offset = sola_offset.item()
else:

sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
tensor = cor_nom[0, 0] / cor_den[0, 0]
if tensor.numel() > 1:  # If tensor has multiple elements
    if sys.platform == "darwin":
        _, sola_offset = torch.max(tensor, dim=0)
        sola_offset = sola_offset.item()
    else:
        sola_offset = torch.argmax(tensor, dim=0).item()
else:
    sola_offset = tensor.item()

print(f"sola_offset = {int(sola_offset)}")

@@ -1141,5 +1180,11 @@ def get_device_channels(self):
parser.add_argument("--gpu", type=int, help="Which GPU id to use", default=0)
args = parser.parse_args()
cuda_target = f"cuda:{args.gpu}" if args.gpu else "cuda"
device = torch.device(cuda_target if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    device = torch.device(cuda_target)
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
gui = GUI(args)
23 changes: 23 additions & 0 deletions requirements-mac.txt
@@ -0,0 +1,23 @@
--pre
--extra-index-url https://download.pytorch.org/whl/nightly/cpu
torch
torchvision
torchaudio
scipy==1.13.1
librosa==0.10.2
huggingface-hub==0.23.4
munch==4.0.0
einops==0.8.0
descript-audio-codec==1.0.0
gradio==4.44.0
pydub==0.25.1
resemblyzer
jiwer==3.0.3
transformers==4.46.3
FreeSimpleGUI==5.1.1
soundfile==0.12.1
sounddevice==0.5.0
modelscope==1.18.1
funasr==1.1.5
numpy==1.26.4
pyyaml
python-dotenv
5 changes: 4 additions & 1 deletion train.py
@@ -430,5 +430,8 @@ def main(args):
parser.add_argument('--num-workers', type=int, default=0)
parser.add_argument("--gpu", type=int, help="Which GPU id to use", default=0)
args = parser.parse_args()
args.device = f"cuda:{args.gpu}" if args.gpu else "cuda:0"
if torch.backends.mps.is_available():
    args.device = "mps"
else:
    args.device = f"cuda:{args.gpu}" if args.gpu else "cuda:0"
main(args)