@@ -329,14 +329,14 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_c
         chunk_f0 = interpolated_shifted_f0_alt[:, processed_frames:processed_frames + max_source_window]
         is_last_chunk = processed_frames + max_source_window >= cond.size(1)
         cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1)
-        # with torch.autocast(device_type=device.type, dtype=torch.float16 if fp16 else torch.float32):
-        # Voice Conversion
-        vc_target = inference_module.cfm.inference(cat_condition,
-                                                   torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
-                                                   mel2, style2, None, diffusion_steps,
-                                                   inference_cfg_rate=inference_cfg_rate)
-        vc_target = vc_target[:, :, mel2.size(-1):]
-        vc_wave = vocoder_fn(vc_target).squeeze().cpu()
+        with torch.autocast(device_type=device.type, dtype=torch.float16 if fp16 else torch.float32):
+            # Voice Conversion
+            vc_target = inference_module.cfm.inference(cat_condition,
+                                                       torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
+                                                       mel2, style2, None, diffusion_steps,
+                                                       inference_cfg_rate=inference_cfg_rate)
+            vc_target = vc_target[:, :, mel2.size(-1):]
+            vc_wave = vocoder_fn(vc_target).squeeze().cpu()
         if vc_wave.ndim == 1:
             vc_wave = vc_wave.unsqueeze(0)
         if processed_frames == 0:
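
The change re-enables mixed-precision inference by uncommenting the torch.autocast context and indenting the conversion and vocoding steps under it. For context, a minimal, self-contained sketch of that autocast pattern; the tensors and the fp16 flag below are illustrative stand-ins, not names from this repository:

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fp16 = device.type == "cuda"  # illustrative: use half precision only on GPU

a = torch.randn(4, 8, device=device)
b = torch.randn(8, 4, device=device)

# Eligible ops inside the context run in float16; passing torch.float32
# (the fp16=False branch) makes PyTorch warn and disable autocast, so the
# computation stays in full precision.
with torch.autocast(device_type=device.type, dtype=torch.float16 if fp16 else torch.float32):
    c = a @ b

print(c.dtype)  # torch.float16 under CUDA autocast, torch.float32 otherwise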