Merge branch 'main' of https://github.com/Plachtaa/seed-vc

Plachtaa · Plachtaa · commit 800af657c2d2 · 2025-02-11T01:58:59.000+08:00
diff --git a/app_svc.py b/app_svc.py
@@ -11,9 +11,9 @@
 from pydub import AudioSegment
 import argparse
 # Load model and configuration
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 fp16 = False
+device = None
 def load_models(args):
     global sr, hop_length, fp16
     fp16 = args.fp16
@@ -433,5 +433,8 @@ def main(args):
     parser.add_argument("--config-path", type=str, help="Path to the config file", default=None)
     parser.add_argument("--share", type=str2bool, nargs="?", const=True, default=False, help="Whether to share the app")
     parser.add_argument("--fp16", type=str2bool, nargs="?", const=True, help="Whether to use fp16", default=True)
+    parser.add_argument("--gpu", type=int, help="Which GPU id to use", default=0)
     args = parser.parse_args()
+    cuda_target = f"cuda:{args.gpu}" if args.gpu else "cuda" 
+    device = torch.device(cuda_target if torch.cuda.is_available() else "cpu")
     main(args)
diff --git a/app_vc.py b/app_vc.py
@@ -12,8 +12,8 @@
 import argparse
 
 # Load model and configuration
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 fp16 = False
+device = None
 def load_models(args):
     global sr, hop_length, fp16
     fp16 = args.fp16
@@ -386,5 +386,8 @@ def main(args):
     parser.add_argument("--config-path", type=str, help="Path to the config file", default=None)
     parser.add_argument("--share", type=str2bool, nargs="?", const=True, default=False, help="Whether to share the app")
     parser.add_argument("--fp16", type=str2bool, nargs="?", const=True, help="Whether to use fp16", default=True)
+    parser.add_argument("--gpu", type=int, help="Which GPU id to use", default=0)
     args = parser.parse_args()
-    main(args)
+    cuda_target = f"cuda:{args.gpu}" if args.gpu else "cuda" 
+    device = torch.device(cuda_target if torch.cuda.is_available() else "cpu")
+    main(args)
diff --git a/data/ft_dataset.py b/data/ft_dataset.py
@@ -12,27 +12,26 @@
     "max": 30.0,
 }
 # assume single speaker
+def to_mel_fn(wave, mel_fn_args):
+    return mel_spectrogram(wave, **mel_fn_args)
+
 class FT_Dataset(torch.utils.data.Dataset):
-    def __init__(self,
-                 data_path,
-                 spect_params,
-                 sr=22050,
-                 batch_size=1,
-                 ):
+    def __init__(
+        self,
+        data_path,
+        spect_params,
+        sr=22050,
+        batch_size=1,
+    ):
         self.data_path = data_path
-        # recursively find all files in data_path
         self.data = []
         for root, _, files in os.walk(data_path):
             for file in files:
-                if (file.endswith(".wav") or
-                        file.endswith(".mp3") or
-                        file.endswith(".flac") or
-                        file.endswith(".ogg") or
-                        file.endswith(".m4a") or
-                        file.endswith(".opus")):
+                if file.endswith((".wav", ".mp3", ".flac", ".ogg", ".m4a", ".opus")):
                     self.data.append(os.path.join(root, file))
 
-        mel_fn_args = {
+        self.sr = sr
+        self.mel_fn_args = {
             "n_fft": spect_params['n_fft'],
             "win_size": spect_params['win_length'],
             "hop_size": spect_params['hop_length'],
@@ -42,11 +41,8 @@ def __init__(self,
             "fmax": None if spect_params['fmax'] == "None" else spect_params['fmax'],
             "center": False
         }
-        self.to_mel = lambda x: mel_spectrogram(x, **mel_fn_args)
-        self.sr = sr
 
         assert len(self.data) != 0
-        # if dataset length is less than batch size, repeat the dataset
         while len(self.data) < batch_size:
             self.data += self.data
 
@@ -64,17 +60,14 @@ def __getitem__(self, idx):
         if len(speech) < self.sr * duration_setting["min"] or len(speech) > self.sr * duration_setting["max"]:
             print(f"Audio {wav_path} is too short or too long, skipping")
             return self.__getitem__(random.randint(0, len(self)))
-        return_dict = {
-            'audio': speech,
-            'sr': orig_sr
-        }
-        wave, orig_sr = return_dict['audio'], return_dict['sr']
         if orig_sr != self.sr:
-            wave = librosa.resample(wave, orig_sr, self.sr)
-        wave = torch.from_numpy(wave).float()
-        mel = self.to_mel(wave.unsqueeze(0)).squeeze(0)
+            speech = librosa.resample(speech, orig_sr=orig_sr, target_sr=self.sr)  # orig_sr/target_sr are keyword-only in librosa>=0.10
+
+        wave = torch.from_numpy(speech).float().unsqueeze(0)
+        mel = to_mel_fn(wave, self.mel_fn_args).squeeze(0)
+
+        return wave.squeeze(0), mel
 
-        return wave, mel
 
 def build_ft_dataloader(data_path, spect_params, sr, batch_size=1, num_workers=0):
     dataset = FT_Dataset(data_path, spect_params, sr, batch_size)
@@ -130,4 +123,4 @@ def collate(batch):
         wave, mel, wave_lengths, mel_lengths = batch
         print(wave.shape, mel.shape)
         if idx == 10:
-            break
+            break
diff --git a/real-time-gui.py b/real-time-gui.py
@@ -30,7 +30,7 @@
 import torch
 from modules.commons import str2bool
 # Load model and configuration
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+device = None
 
 flag_vc = False
 
@@ -328,7 +328,7 @@ def printt(strr, *args):
 
 class Config:
     def __init__(self):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.device = device
 
 
 if __name__ == "__main__":
@@ -1144,5 +1144,8 @@ def get_device_channels(self):
     parser.add_argument("--checkpoint-path", type=str, default=None, help="Path to the model checkpoint")
     parser.add_argument("--config-path", type=str, default=None, help="Path to the vocoder checkpoint")
     parser.add_argument("--fp16", type=str2bool, nargs="?", const=True, help="Whether to use fp16", default=True)
+    parser.add_argument("--gpu", type=int, help="Which GPU id to use", default=0)
     args = parser.parse_args()
+    cuda_target = f"cuda:{args.gpu}" if args.gpu else "cuda" 
+    device = torch.device(cuda_target if torch.cuda.is_available() else "cpu")
     gui = GUI(args)
diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,7 @@
---extra-index-url https://download.pytorch.org/whl/cu113
-torch==2.1.0
-torchvision==0.16.0
-torchaudio==2.1.0
+--extra-index-url https://download.pytorch.org/whl/cu121
+torch==2.4.0
+torchvision==0.19.0
+torchaudio==2.4.0
 scipy==1.13.1
 librosa==0.10.2
 huggingface-hub==0.23.4
@@ -17,4 +17,4 @@ FreeSimpleGUI==5.1.1
 soundfile==0.12.1
 sounddevice==0.5.0
 modelscope==1.18.1
-funasr==1.1.5
+funasr==1.1.5
diff --git a/train.py b/train.py