Skip to content

Commit 3962cf5

Browse files
Merge pull request #149 from huggingface/multilingual-parler
multilingual improvements for parler
2 parents aee3481 + 8174439 commit 3962cf5

File tree

2 files changed

+34
-6
lines changed

2 files changed

+34
-6
lines changed

TTS/parler_handler.py

Lines changed: 25 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,18 @@
3131
)
3232

3333

34+
WHISPER_LANGUAGE_TO_PARLER_SPEAKER = {
35+
"en": "Jason",
36+
"fr": "Christine",
37+
"es": "Steven",
38+
"de": "Nicole",
39+
"pt": "Sophia",
40+
"pl": "Alex",
41+
"it": "Richard",
42+
"nl": "Mark",
43+
}
44+
45+
3446
class ParlerTTSHandler(BaseHandler):
3547
def setup(
3648
self,
@@ -46,13 +58,19 @@ def setup(
4658
),
4759
play_steps_s=1,
4860
blocksize=512,
61+
use_default_speakers_list=True,
4962
):
5063
self.should_listen = should_listen
5164
self.device = device
5265
self.torch_dtype = getattr(torch, torch_dtype)
5366
self.gen_kwargs = gen_kwargs
5467
self.compile_mode = compile_mode
5568
self.max_prompt_pad_length = max_prompt_pad_length
69+
self.use_default_speakers_list = use_default_speakers_list
70+
if self.use_default_speakers_list:
71+
description = description.replace("Jenny", "")
72+
73+
self.speaker = "Jason"
5674
self.description = description
5775

5876
self.model = ParlerTTSForConditionalGeneration.from_pretrained(
@@ -91,8 +109,12 @@ def prepare_model_inputs(
91109
{"padding": "max_length", "max_length": max_length_prompt} if pad else {}
92110
)
93111

112+
description = self.description
113+
if self.use_default_speakers_list:
114+
description = self.speaker + " " + self.description
115+
94116
tokenized_description = self.description_tokenizer(
95-
self.description, return_tensors="pt"
117+
description, return_tensors="pt"
96118
).to(self.device)
97119
input_ids = tokenized_description.input_ids
98120
attention_mask = tokenized_description.attention_mask
@@ -149,7 +171,8 @@ def warmup(self):
149171

150172
def process(self, llm_sentence):
151173
if isinstance(llm_sentence, tuple):
152-
llm_sentence, _ = llm_sentence
174+
llm_sentence, language_code = llm_sentence
175+
self.speaker = WHISPER_LANGUAGE_TO_PARLER_SPEAKER.get(language_code, "Jason")
153176

154177
console.print(f"[green]ASSISTANT: {llm_sentence}")
155178
nb_tokens = len(self.prompt_tokenizer(llm_sentence).input_ids)

arguments_classes/parler_tts_arguments.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44
@dataclass
55
class ParlerTTSHandlerArguments:
66
tts_model_name: str = field(
7-
default="ylacombe/parler-tts-mini-jenny-30H",
7+
default="parler-tts/parler-mini-v1-jenny",
88
metadata={
9-
"help": "The pretrained TTS model to use. Default is 'ylacombe/parler-tts-mini-jenny-30H'."
9+
"help": "The pretrained TTS model to use. Default is 'parler-tts/parler-mini-v1-jenny'."
1010
},
1111
)
1212
tts_device: str = field(
@@ -41,8 +41,7 @@ class ParlerTTSHandlerArguments:
4141
)
4242
description: str = field(
4343
default=(
44-
"A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. "
45-
"She speaks very fast."
44+
"Jenny speaks at a slightly slow pace with an animated delivery with clear audio quality."
4645
),
4746
metadata={
4847
"help": "Description of the speaker's voice and speaking style to guide the TTS model."
@@ -60,3 +59,9 @@ class ParlerTTSHandlerArguments:
6059
"help": "When using compilation, the prompt as to be padded to closest power of 2. This parameters sets the maximun power of 2 possible."
6160
},
6261
)
62+
use_default_speakers_list: bool = field(
63+
default=False,
64+
metadata={
65+
"help": "Whether to use the default list of speakers or not."
66+
},
67+
)

0 commit comments

Comments
 (0)