Skip to content

GRPO训练结果异常 #4800

Closed
Closed
@JuntongWang

Description

@JuntongWang

我现在正在使用GRPO进行全参微调,不过速度有点慢,现在主要是我对当前输出的结果感到疑问,因为我发现训练的loss在上升,后面直接上升到0,因为输出不是想要的数字了,且reward没有上升的迹象,甚至在下降,如下所示:

{"loss": 4.76e-06, "grad_norm": 16.80255157, "learning_rate": 1e-05, "memory(GiB)": 44.32, "train_speed(iter/s)": 0.002302, "completions/mean_length": 5.54166698, "completions/min_length": 5.0, "completions/max_length": 6.0, "completions/clipped_ratio": 0.0, "rewards/VideoQualityRewardORM/mean": 0.60864663, "rewards/VideoQualityRewardORM/std": 0.32342851, "reward": 0.60864663, "reward_std": 0.03382213, "kl": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00262123, "global_step/max_steps": "1/3820", "percentage": "0.03%", "elapsed_time": "4m 52s", "remaining_time": "12d 22h 12m 9s"}
{"loss": 0.00544535, "grad_norm": 12.94114975, "learning_rate": 1e-05, "memory(GiB)": 44.32, "train_speed(iter/s)": 0.003536, "completions/mean_length": 5.79629649, "completions/min_length": 5.11111111, "completions/max_length": 6.0, "completions/clipped_ratio": 0.0, "rewards/VideoQualityRewardORM/mean": 0.54207034, "rewards/VideoQualityRewardORM/std": 0.28235812, "reward": 0.54207035, "reward_std": 0.04684165, "kl": 0.53551398, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02621232, "global_step/max_steps": "10/3820", "percentage": "0.26%", "elapsed_time": "44m 45s", "remaining_time": "11d 20h 15m 33s"}
{"loss": 0.01552085, "grad_norm": 10.53615692, "learning_rate": 1e-05, "memory(GiB)": 44.32, "train_speed(iter/s)": 0.003653, "completions/mean_length": 5.78750014, "completions/min_length": 5.1, "completions/max_length": 6.0, "completions/clipped_ratio": 0.0, "rewards/VideoQualityRewardORM/mean": 0.45943401, "rewards/VideoQualityRewardORM/std": 0.28749434, "reward": 0.45943403, "reward_std": 0.03159176, "kl": 1.55068817, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.05242464, "global_step/max_steps": "20/3820", "percentage": "0.52%", "elapsed_time": "1h 28m 52s", "remaining_time": "11d 17h 26m 34s"}
{"loss": 0.02426582, "grad_norm": 21.23141336, "learning_rate": 1e-05, "memory(GiB)": 44.32, "train_speed(iter/s)": 0.003679, "completions/mean_length": 5.70833344, "completions/min_length": 5.0, "completions/max_length": 6.0, "completions/clipped_ratio": 0.0, "rewards/VideoQualityRewardORM/mean": 0.45706776, "rewards/VideoQualityRewardORM/std": 0.28923585, "reward": 0.45706777, "reward_std": 0.01115019, "kl": 2.41513672, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07863696, "global_step/max_steps": "30/3820", "percentage": "0.79%", "elapsed_time": "2h 13m 31s", "remaining_time": "11d 17h 8m 44s"}
{"loss": 93.04180908, "grad_norm": 1.73205081, "learning_rate": 1e-05, "memory(GiB)": 44.32, "train_speed(iter/s)": 0.003702, "completions/mean_length": 4.91666675, "completions/min_length": 4.5, "completions/max_length": 5.1, "completions/clipped_ratio": 0.3, "rewards/VideoQualityRewardORM/mean": 0.26713088, "rewards/VideoQualityRewardORM/std": 0.1459536, "reward": 0.26713089, "reward_std": 0.01870803, "kl": NaN, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.10484928, "global_step/max_steps": "40/3820", "percentage": "1.05%", "elapsed_time": "2h 57m 44s", "remaining_time": "11d 15h 56m 23s"}
{"loss": 0.0, "grad_norm": 1.73205081, "learning_rate": 1e-05, "memory(GiB)": 44.32, "train_speed(iter/s)": 0.003714, "completions/mean_length": 3.0, "completions/min_length": 3.0, "completions/max_length": 3.0, "completions/clipped_ratio": 1.0, "rewards/VideoQualityRewardORM/mean": 0.0, "rewards/VideoQualityRewardORM/std": 0.0, "reward": 0.0, "reward_std": 0.0, "kl": NaN, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1310616, "global_step/max_steps": "50/3820", "percentage": "1.31%", "elapsed_time": "3h 41m 59s", "remaining_time": "11d 14h 58m 39s"}


{"step": ["1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"], "prompt": ["<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n"], "completion": ["33.33", "33.33", "33.33", "32.0", "32.0", "32.0", "52.0", "52.0", "52.0", "58.67", "58.67", "57.33", "46.67", "46.67", "46.67", "28.0", "28.0", "28.0", "46.67", "46.67", "46.67", "36.0", "36.0", "34.67"], "VideoQualityRewardORM": [0.5862552523612976, 0.5862552523612976, 0.5862552523612976, 0.2018965184688568, 0.2018965184688568, 0.2018965184688568, 0.04076220467686653, 0.04076220467686653, 0.04076220467686653, 0.7649077773094177, 0.7649077773094177, 1.0, 1.0, 1.0, 1.0, 0.7664391398429871, 0.7664391398429871, 0.7664391398429871, 0.5862552523612976, 0.5862552523612976, 0.5862552523612976, 0.7664391398429871, 0.7664391398429871, 1.0]}
{"step": ["10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10"], "prompt": ["<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n"], "completion": ["50.67", "50.67", "50.67", "30.67", "30.67", "30.67", "77.33", "77.33", "77.33", "81.33", "81.33", "81.33", "57.33", "57.33", "57.33", "69.33", "69.33", "69.33", "74.67", "70.67", "72.0", "74.67", "77.33", "77.33"], "VideoQualityRewardORM": [0.5862552523612976, 0.5862552523612976, 0.5862552523612976, 1.0, 1.0, 1.0, 0.4090164005756378, 0.4090164005756378, 0.4090164005756378, 0.4493289589881897, 0.4493289589881897, 0.4493289589881897, 0.3443832993507385, 0.3443832993507385, 0.3443832993507385, 0.06952978670597076, 0.06952978670597076, 0.06952978670597076, 0.5862552523612976, 0.7664391398429871, 1.0, 0.5874289274215698, 1.0, 1.0]}
{"step": ["20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20"], "prompt": ["<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n"], "completion": ["57.33", "57.33", "57.33", "49.33", "49.33", "49.33", "49.33", "49.33", "49.33", "42.67", "42.67", "42.67", "64.0", "64.0", "64.0", "53.33", "53.33", "53.33", "60.0", "60.0", "60.0", "49.33", "46.67", "49.33"], "VideoQualityRewardORM": [0.15443222224712372, 0.15443222224712372, 0.15443222224712372, 0.7649077773094177, 0.7649077773094177, 0.7649077773094177, 0.15474139153957367, 0.15474139153957367, 0.15474139153957367, 0.7664391398429871, 0.7664391398429871, 0.7664391398429871, 0.3443832993507385, 0.3443832993507385, 0.3443832993507385, 0.5874289274215698, 0.5874289274215698, 0.5874289274215698, 0.3443832993507385, 0.3443832993507385, 0.3443832993507385, 0.2018965184688568, 0.3436952233314514, 0.2018965184688568]}
{"step": ["30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30"], "prompt": ["<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n"], "completion": ["64.0", "64.0", "64.0", "46.67", "46.67", "46.67", "57.33", "57.33", "57.33", "77.33", "77.33", "77.33", "42.67", "42.67", "42.67", "44.0", "42.67", "44.0", "82.67", "82.67", "82.67", "54.67", "54.67", "54.67"], "VideoQualityRewardORM": [0.3443832993507385, 0.3443832993507385, 0.3443832993507385, 0.4493289589881897, 0.4493289589881897, 0.4493289589881897, 0.06952978670597076, 0.06952978670597076, 0.06952978670597076, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7664391398429871, 1.0, 0.7664391398429871, 0.7664391398429871, 0.7664391398429871, 0.4493289589881897, 0.4493289589881897, 0.4493289589881897]}
{"step": ["40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40"], "prompt": ["<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n"], "completion": ["!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!"], "VideoQualityRewardORM": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
{"step": ["50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50"], "prompt": ["<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n"], "completion": ["!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!"], "VideoQualityRewardORM": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}

提供一下我的运行脚本文件:

#!/bin/bash

# --- 用户配置 ---
# SFT-LoRA微调后的模型checkpoint路径
SFT_CHECKPOINT_PATH="/DATA/DATA3/wjt/AIGV-qwen2.5/SFT+RL/SFT/v2-20250701-130656/checkpoint-12000-merged"
# 预处理后的GRPO数据集路径
GRPO_DATASET_PATH="/DATA/DATA3/wjt/AIGV-qwen2.5/RL_JSON/sft_002_tv_cleaned.jsonl" 
# 自定义plugin.py的路径
PLUGIN_PATH="/DATA/DATA3/wjt/AIGV-qwen2.5/AIGV-main/GPRO/plugin2.py"
# GRPO训练的输出目录
OUTPUT_DIR="/DATA/DATA3/wjt/AIGV-qwen2.5/SFT+RL/RL"

# --- 分布式训练配置 ---
# 使用的GPU ID (根据您的实际情况修改)
GPU_IDS="1,2,3"
# 每个节点的进程数 (必须等于GPU_IDS中的GPU数量)
NPROC_PER_NODE=3

# 设置环境变量,使其对swift命令可见
export MKL_SERVICE_FORCE_INTEL=1
export SWIFT_SKIP_BAD_DATA="true"
# --- GRPO 训练命令 ---
CUDA_VISIBLE_DEVICES=${GPU_IDS} \
NPROC_PER_NODE=${NPROC_PER_NODE} \
swift rlhf \
    --rlhf_type grpo \
    --model_type qwen2_5_vl \
    --model ${SFT_CHECKPOINT_PATH} \
    --train_type full \
    --dataset ${GRPO_DATASET_PATH} \
    --external_plugins ${PLUGIN_PATH} \
    --reward_funcs 'video_quality_reward' \
    --use_vllm true \
    --vllm_mode colocate \
    --vllm_gpu_memory_utilization 0.4 \
    --torch_dtype bfloat16 \
    --num_train_epochs 10 \
    --max_completion_length 16 \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 4 \
    --learning_rate 1e-5 \
    --beta 0.01 \
    --num_generations 3 \
    --temperature 0.1 \
    --save_strategy 'steps' \
    --save_steps 500 \
    --eval_steps 500 \
    --save_total_limit 3 \
    --logging_steps 10 \
    --output_dir ${OUTPUT_DIR} \
    --dataloader_num_workers 2 \
    --report_to wandb \
    --deepspeed zero3 \
    --log_completions true

下面是奖励相关的设置:

# ====================================================================
#          plugin2.py (最终修复版 - 提取 solution 中的数字)
# ====================================================================
import math
import re  # 导入正则表达式模块
from typing import List

from swift.plugin.orm import ORM, orms

class VideoQualityRewardORM(ORM):
    """
    自定义视频质量评估的奖励模型。
    这个版本修复了对包含XML标签的 solution 字符串的处理。
    """
    def __call__(self, completions: List[str], solution: List[str], **kwargs) -> List[float]:
        rewards = []
        
        # 数据对齐逻辑保持不变
        num_generations = kwargs.get('num_generations', 1)
        if len(completions) != len(solution) * num_generations:
            # 这个错误检查现在变得非常重要
            print(f"[FATAL ERROR] Length mismatch! len(completions)={len(completions)}, len(solution)={len(solution)}, num_gen={num_generations}")
            return [0.0] * len(completions)
        aligned_solutions = []
        for s in solution:
            aligned_solutions.extend([s] * int(num_generations))
        
        for pred_str, gt_str_with_tags in zip(completions, aligned_solutions):
            reward = 0.0
            try:
                # 1. 处理模型预测值 (逻辑不变)
                predicted_score = float(pred_str.strip())

                # 2. 【关键修复】从带标签的真实值字符串中提取数字
                # 使用正则表达式查找<answer>和</answer>之间的浮点数或整数
                match = re.search(r'<answer>(.*?)</answer>', gt_str_with_tags)
                
                if match:
                    # 如果找到了匹配项,提取括号内的内容并转换为浮点数
                    ground_truth_score_str = match.group(1).strip()
                    ground_truth_score = float(ground_truth_score_str)
                else:
                    # 如果没有找到匹配的标签,尝试直接转换,以防万一有纯数字格式
                    ground_truth_score = float(gt_str_with_tags.strip())

                # 3. 计算奖励 (逻辑不变)
                error = abs(predicted_score - ground_truth_score)
                reward = math.exp(-0.2 * error)

            except (ValueError, TypeError, AttributeError):
                # 增加了AttributeError以防match为None时出错
                # 如果任何一步转换失败,奖励保持为0
                pass
            
            rewards.append(reward)
            
        return rewards

orms['video_quality_reward'] = VideoQualityRewardORM

请问这个现象是正常的吗,我昨天使用lora微调是没有任何问题的,今天改成了全参+deepspeed。同时我也想知道这个GRPO训练好的权重应该如何合并啊

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions