20 changes: 17 additions & 3 deletions README.md
@@ -18,11 +18,17 @@ We are keeping on improving the model quality and adding more features.
## Evaluation📊
See [EVAL.md](EVAL.md) for objective evaluation results and comparisons with other baselines.
## Installation📥
Suggested python 3.10 on Windows or Linux.
Python 3.10 is suggested on Windows, Mac M Series (Apple Silicon), or Linux.
Windows and Linux:
```bash
pip install -r requirements.txt
```

Mac M Series:
```bash
pip install -r requirements-mac.txt
```
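
To confirm the install can actually reach the Metal backend, a quick check like this helps (illustrative, not part of this PR):

```python
import torch

# Both should print True on Apple Silicon with a recent PyTorch build;
# False means the wheels above did not come with MPS support.
print(torch.backends.mps.is_built())
print(torch.backends.mps.is_available())
```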

## Usage🛠️
We have released 3 models for different purposes:

@@ -93,8 +99,9 @@ python real-time-gui.py --checkpoint <path-to-checkpoint> --config <path-to-conf
- `checkpoint` is the path to the model checkpoint if you have trained or fine-tuned your own model; leave blank to auto-download the default model (`seed-uvit-tat-xlsr-tiny`) from Hugging Face
- `config` is the path to the model config if you have trained or fine-tuned your own model; leave blank to auto-download the default config from Hugging Face

IMPORTANT: It is strongly recommended to use a GPU for real-time voice conversion.
Some performance testing has been done on a NVIDIA RTX 3060 Laptop GPU, results and recommended parameter settings are listed below:
> [!IMPORTANT]
> It is strongly recommended to use a GPU for real-time voice conversion.
> Some performance testing has been done on an NVIDIA RTX 3060 Laptop GPU; results and recommended parameter settings are listed below:

| Model Configuration | Diffusion Steps | Inference CFG Rate | Max Prompt Length | Block Time (s) | Crossfade Length (s) | Extra context (left) (s) | Extra context (right) (s) | Latency (ms) | Inference Time per Chunk (ms) |
|---------------------------------|-----------------|--------------------|-------------------|----------------|----------------------|--------------------------|---------------------------|--------------|-------------------------------|
@@ -186,8 +193,15 @@ where:
- [x] Colab Notebook for fine-tuning example
- [ ] Replace whisper with more advanced linguistic content extractor
- [ ] More to be added
- [x] Add Apple Silicon support

## Known Issues
- On Mac, running `real-time-gui.py` may raise `ModuleNotFoundError: No module named '_tkinter'`. In this case, install a Python build **with Tkinter support**. Refer to [this guide on Stack Overflow](https://stackoverflow.com/questions/76105218/why-does-tkinter-or-turtle-seem-to-be-missing-or-broken-shouldnt-it-be-part) for an explanation of the problem and a detailed fix.
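
A quick way to check whether the current interpreter was built with Tk support (illustrative, not part of this diff):

```python
# Succeeds only on Python builds compiled with Tk support; otherwise this
# import raises ModuleNotFoundError: No module named '_tkinter'.
import tkinter
print(tkinter.TkVersion)
```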


## CHANGELOGS🗒️
- 2025-03-03:
- Added Mac M Series (Apple Silicon) support
- 2024-11-26:
- Updated v1.0 tiny version pretrained model, optimized for real-time voice conversion
- Support one-shot/few-shot single/multi speaker fine-tuning
16 changes: 13 additions & 3 deletions app.py
@@ -9,7 +9,13 @@
from pydub import AudioSegment

# Load model and configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
"DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
@@ -233,8 +239,12 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
F0_ori = rmvpe.infer_from_audio(ref_waves_16k[0], thred=0.03)
F0_alt = rmvpe.infer_from_audio(converted_waves_16k[0], thred=0.03)

F0_ori = torch.from_numpy(F0_ori).to(device)[None]
F0_alt = torch.from_numpy(F0_alt).to(device)[None]
if device == "mps":
F0_ori = torch.from_numpy(F0_ori).float().to(device)[None]
F0_alt = torch.from_numpy(F0_alt).float().to(device)[None]
else:
F0_ori = torch.from_numpy(F0_ori).to(device)[None]
F0_alt = torch.from_numpy(F0_alt).to(device)[None]

voiced_F0_ori = F0_ori[F0_ori > 1]
voiced_F0_alt = F0_alt[F0_alt > 1]
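
The `.float()` cast in the MPS branch matters because the MPS backend has no float64 support, and `torch.from_numpy` preserves the source dtype, so a float64 pitch array would fail to move to the device. A minimal standalone illustration (stand-in array, not real RMVPE output):

```python
import numpy as np
import torch

f0 = np.zeros(100)                  # NumPy arrays default to float64
t64 = torch.from_numpy(f0)          # dtype stays torch.float64
t32 = torch.from_numpy(f0).float()  # cast down to float32 first
# On Apple Silicon, t64.to("mps") raises a TypeError (no float64 on MPS),
# while t32.to("mps") transfers cleanly.
```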
16 changes: 13 additions & 3 deletions app_svc.py
@@ -294,8 +294,12 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
F0_ori = f0_fn(ref_waves_16k[0], thred=0.03)
F0_alt = f0_fn(converted_waves_16k[0], thred=0.03)

F0_ori = torch.from_numpy(F0_ori).to(device)[None]
F0_alt = torch.from_numpy(F0_alt).to(device)[None]
if device.type == "mps":
F0_ori = torch.from_numpy(F0_ori).float().to(device)[None]
F0_alt = torch.from_numpy(F0_alt).float().to(device)[None]
else:
F0_ori = torch.from_numpy(F0_ori).to(device)[None]
F0_alt = torch.from_numpy(F0_alt).to(device)[None]

voiced_F0_ori = F0_ori[F0_ori > 1]
voiced_F0_alt = F0_alt[F0_alt > 1]
@@ -436,5 +440,11 @@ def main(args):
parser.add_argument("--gpu", type=int, help="Which GPU id to use", default=0)
args = parser.parse_args()
cuda_target = f"cuda:{args.gpu}" if args.gpu else "cuda"
device = torch.device(cuda_target if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    device = torch.device(cuda_target)
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
main(args)
8 changes: 7 additions & 1 deletion app_vc.py
@@ -389,5 +389,11 @@ def main(args):
parser.add_argument("--gpu", type=int, help="Which GPU id to use", default=0)
args = parser.parse_args()
cuda_target = f"cuda:{args.gpu}" if args.gpu else "cuda"
device = torch.device(cuda_target if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    device = torch.device(cuda_target)
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
main(args)
2 changes: 1 addition & 1 deletion dac/utils/decode.py
@@ -43,7 +43,7 @@ def decode(
model_bitrate: str
Bitrate of the model. Must be one of "8kbps", or "16kbps". Defaults to "8kbps".
device : str, optional
Device to use, by default "cuda". If "cpu", the model will be loaded on the CPU.
Device to use, by default "cuda". Use "mps" on Apple Silicon devices or if "cpu", the model will be loaded on the CPU.
model_type : str, optional
The type of model to use. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz". Ignored if `weights_path` is specified.
"""
2 changes: 1 addition & 1 deletion dac/utils/encode.py
@@ -47,7 +47,7 @@ def encode(
n_quantizers : int, optional
Number of quantizers to use, by default None. If not specified, all the quantizers will be used and the model will compress at maximum bitrate.
device : str, optional
Device to use, by default "cuda"
Device to use, by default "cuda". Use "mps" on Apple Silicon devices.
model_type : str, optional
The type of model to use. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz". Ignored if `weights_path` is specified.
"""
8 changes: 7 additions & 1 deletion eval.py
@@ -23,7 +23,13 @@
from resemblyzer import preprocess_wav, VoiceEncoder

# Load model and configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector
from transformers import Wav2Vec2Processor, HubertForCTC
9 changes: 8 additions & 1 deletion inference.py
@@ -25,7 +25,14 @@


# Load model and configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

fp16 = False
def load_models(args):
global fp16
8 changes: 7 additions & 1 deletion modules/rmvpe.py
@@ -486,7 +486,13 @@ def __init__(self, model_path: str, is_half, device=None, use_jit=False):
self.resample_kernel = {}
self.is_half = is_half
if device is None:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
#device = "cuda:0" if torch.cuda.is_available() else "cpu"
if torch.cuda.is_available():
device = "cuda:0"
elif torch.backends.mps.is_available():
device = "mps"
else:
device = "cpu"
self.device = device
self.mel_extractor = MelSpectrogram(
is_half, 128, 16000, 1024, 160, None, 30, 8000
85 changes: 65 additions & 20 deletions real-time-gui.py
@@ -94,13 +94,22 @@ def custom_infer(model_set,
reference_wav_name = new_reference_wav_name

converted_waves_16k = input_wav_res
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
if device.type == "mps":
start_event = torch.mps.event.Event(enable_timing=True)
end_event = torch.mps.event.Event(enable_timing=True)
torch.mps.synchronize()
else:
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()

start_event.record()
S_alt = semantic_fn(converted_waves_16k.unsqueeze(0))
end_event.record()
torch.cuda.synchronize() # Wait for the events to be recorded!
if device.type == "mps":
torch.mps.synchronize() # MPS - Wait for the events to be recorded!
else:
torch.cuda.synchronize() # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(f"Time taken for semantic_fn: {elapsed_time_ms}ms")

@@ -466,7 +475,14 @@ def launcher(self):
initial_folder=os.path.join(
os.getcwd(), "examples/reference"
),
file_types=((". wav"), (". mp3"), (". flac"), (". m4a"), (". ogg"), (". opus")),
file_types=[
    ("WAV Files", "*.wav"),
    ("MP3 Files", "*.mp3"),
    ("FLAC Files", "*.flac"),
    ("M4A Files", "*.m4a"),
    ("OGG Files", "*.ogg"),
    ("Opus Files", "*.opus"),
],
),
],
],
@@ -786,7 +802,10 @@ def set_values(self, values):
return True

def start_vc(self):
torch.cuda.empty_cache()
if device.type == "mps":
torch.mps.empty_cache()
else:
torch.cuda.empty_cache()
self.reference_wav, _ = librosa.load(
self.gui_config.reference_audio_path, sr=self.model_set[-1]["sampling_rate"]
)
@@ -942,9 +961,14 @@ def audio_callback(
indata = librosa.to_mono(indata.T)

# VAD first
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
if device.type == "mps":
start_event = torch.mps.event.Event(enable_timing=True)
end_event = torch.mps.event.Event(enable_timing=True)
torch.mps.synchronize()
else:
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
start_event.record()
indata_16k = librosa.resample(indata, orig_sr=self.gui_config.samplerate, target_sr=16000)
res = self.vad_model.generate(input=indata_16k, cache=self.vad_cache, is_final=False, chunk_size=self.vad_chunk_size)
@@ -955,7 +979,10 @@
elif len(res_value) % 2 == 1 and self.vad_speech_detected:
self.set_speech_detected_false_at_end_flag = True
end_event.record()
torch.cuda.synchronize() # Wait for the events to be recorded!
if device.type == "mps":
torch.mps.synchronize() # MPS - Wait for the events to be recorded!
else:
torch.cuda.synchronize() # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(f"Time taken for VAD: {elapsed_time_ms}ms")

@@ -993,9 +1020,14 @@ def audio_callback(
if self.function == "vc":
if self.gui_config.extra_time_ce - self.gui_config.extra_time < 0:
raise ValueError("Content encoder extra context must be greater than DiT extra context!")
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
if device.type == "mps":
start_event = torch.mps.event.Event(enable_timing=True)
end_event = torch.mps.event.Event(enable_timing=True)
torch.mps.synchronize()
else:
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
torch.cuda.synchronize()
start_event.record()
infer_wav = custom_infer(
self.model_set,
@@ -1014,7 +1046,10 @@
if self.resampler2 is not None:
infer_wav = self.resampler2(infer_wav)
end_event.record()
torch.cuda.synchronize() # Wait for the events to be recorded!
if device.type == "mps":
torch.mps.synchronize() # MPS - Wait for the events to be recorded!
else:
torch.cuda.synchronize() # Wait for the events to be recorded!
elapsed_time_ms = start_event.elapsed_time(end_event)
print(f"Time taken for VC: {elapsed_time_ms}ms")
if not self.vad_speech_detected:
@@ -1037,12 +1072,16 @@
)
+ 1e-8
)
if sys.platform == "darwin":
_, sola_offset = torch.max(cor_nom[0, 0] / cor_den[0, 0])
sola_offset = sola_offset.item()
else:

sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
tensor = cor_nom[0, 0] / cor_den[0, 0]
if tensor.numel() > 1:  # If tensor has multiple elements
    if sys.platform == "darwin":
        _, sola_offset = torch.max(tensor, dim=0)
        sola_offset = sola_offset.item()
    else:
        sola_offset = torch.argmax(tensor, dim=0).item()
else:
    sola_offset = tensor.item()

print(f"sola_offset = {int(sola_offset)}")

@@ -1141,5 +1180,11 @@ def get_device_channels(self):
parser.add_argument("--gpu", type=int, help="Which GPU id to use", default=0)
args = parser.parse_args()
cuda_target = f"cuda:{args.gpu}" if args.gpu else "cuda"
device = torch.device(cuda_target if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    device = torch.device(cuda_target)
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
gui = GUI(args)
23 changes: 23 additions & 0 deletions requirements-mac.txt
@@ -0,0 +1,23 @@
--pre
--extra-index-url https://download.pytorch.org/whl/nightly/cpu
torch
torchvision
torchaudio
scipy==1.13.1
librosa==0.10.2
huggingface-hub==0.23.4
munch==4.0.0
einops==0.8.0
descript-audio-codec==1.0.0
gradio==4.44.0
pydub==0.25.1
resemblyzer
jiwer==3.0.3
transformers==4.46.3
FreeSimpleGUI==5.1.1
soundfile==0.12.1
sounddevice==0.5.0
modelscope==1.18.1
funasr==1.1.5
numpy==1.26.4
pyyaml
python-dotenv
5 changes: 4 additions & 1 deletion train.py
@@ -430,5 +430,8 @@ def main(args):
parser.add_argument('--num-workers', type=int, default=0)
parser.add_argument("--gpu", type=int, help="Which GPU id to use", default=0)
args = parser.parse_args()
args.device = f"cuda:{args.gpu}" if args.gpu else "cuda:0"
if torch.backends.mps.is_available():
    args.device = "mps"
else:
    args.device = f"cuda:{args.gpu}" if args.gpu else "cuda:0"
main(args)