Closed
Description
我现在正在使用GRPO进行全参微调,不过速度有点慢,现在主要是我对当前输出的结果感到疑问,因为我发现训练的loss在上升,后面直接上升到0,因为输出不是想要的数字了,且reward没有上升的迹象,甚至在下降,如下所示:
{"loss": 4.76e-06, "grad_norm": 16.80255157, "learning_rate": 1e-05, "memory(GiB)": 44.32, "train_speed(iter/s)": 0.002302, "completions/mean_length": 5.54166698, "completions/min_length": 5.0, "completions/max_length": 6.0, "completions/clipped_ratio": 0.0, "rewards/VideoQualityRewardORM/mean": 0.60864663, "rewards/VideoQualityRewardORM/std": 0.32342851, "reward": 0.60864663, "reward_std": 0.03382213, "kl": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.00262123, "global_step/max_steps": "1/3820", "percentage": "0.03%", "elapsed_time": "4m 52s", "remaining_time": "12d 22h 12m 9s"}
{"loss": 0.00544535, "grad_norm": 12.94114975, "learning_rate": 1e-05, "memory(GiB)": 44.32, "train_speed(iter/s)": 0.003536, "completions/mean_length": 5.79629649, "completions/min_length": 5.11111111, "completions/max_length": 6.0, "completions/clipped_ratio": 0.0, "rewards/VideoQualityRewardORM/mean": 0.54207034, "rewards/VideoQualityRewardORM/std": 0.28235812, "reward": 0.54207035, "reward_std": 0.04684165, "kl": 0.53551398, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.02621232, "global_step/max_steps": "10/3820", "percentage": "0.26%", "elapsed_time": "44m 45s", "remaining_time": "11d 20h 15m 33s"}
{"loss": 0.01552085, "grad_norm": 10.53615692, "learning_rate": 1e-05, "memory(GiB)": 44.32, "train_speed(iter/s)": 0.003653, "completions/mean_length": 5.78750014, "completions/min_length": 5.1, "completions/max_length": 6.0, "completions/clipped_ratio": 0.0, "rewards/VideoQualityRewardORM/mean": 0.45943401, "rewards/VideoQualityRewardORM/std": 0.28749434, "reward": 0.45943403, "reward_std": 0.03159176, "kl": 1.55068817, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.05242464, "global_step/max_steps": "20/3820", "percentage": "0.52%", "elapsed_time": "1h 28m 52s", "remaining_time": "11d 17h 26m 34s"}
{"loss": 0.02426582, "grad_norm": 21.23141336, "learning_rate": 1e-05, "memory(GiB)": 44.32, "train_speed(iter/s)": 0.003679, "completions/mean_length": 5.70833344, "completions/min_length": 5.0, "completions/max_length": 6.0, "completions/clipped_ratio": 0.0, "rewards/VideoQualityRewardORM/mean": 0.45706776, "rewards/VideoQualityRewardORM/std": 0.28923585, "reward": 0.45706777, "reward_std": 0.01115019, "kl": 2.41513672, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.07863696, "global_step/max_steps": "30/3820", "percentage": "0.79%", "elapsed_time": "2h 13m 31s", "remaining_time": "11d 17h 8m 44s"}
{"loss": 93.04180908, "grad_norm": 1.73205081, "learning_rate": 1e-05, "memory(GiB)": 44.32, "train_speed(iter/s)": 0.003702, "completions/mean_length": 4.91666675, "completions/min_length": 4.5, "completions/max_length": 5.1, "completions/clipped_ratio": 0.3, "rewards/VideoQualityRewardORM/mean": 0.26713088, "rewards/VideoQualityRewardORM/std": 0.1459536, "reward": 0.26713089, "reward_std": 0.01870803, "kl": NaN, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.10484928, "global_step/max_steps": "40/3820", "percentage": "1.05%", "elapsed_time": "2h 57m 44s", "remaining_time": "11d 15h 56m 23s"}
{"loss": 0.0, "grad_norm": 1.73205081, "learning_rate": 1e-05, "memory(GiB)": 44.32, "train_speed(iter/s)": 0.003714, "completions/mean_length": 3.0, "completions/min_length": 3.0, "completions/max_length": 3.0, "completions/clipped_ratio": 1.0, "rewards/VideoQualityRewardORM/mean": 0.0, "rewards/VideoQualityRewardORM/std": 0.0, "reward": 0.0, "reward_std": 0.0, "kl": NaN, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/high_max": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.1310616, "global_step/max_steps": "50/3820", "percentage": "1.31%", "elapsed_time": "3h 41m 59s", "remaining_time": "11d 14h 58m 39s"}
{"step": ["1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"], "prompt": ["<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n"], "completion": ["33.33", "33.33", "33.33", "32.0", "32.0", "32.0", "52.0", "52.0", "52.0", "58.67", "58.67", "57.33", "46.67", "46.67", "46.67", "28.0", "28.0", "28.0", "46.67", "46.67", "46.67", "36.0", "36.0", "34.67"], "VideoQualityRewardORM": [0.5862552523612976, 0.5862552523612976, 0.5862552523612976, 0.2018965184688568, 0.2018965184688568, 0.2018965184688568, 0.04076220467686653, 0.04076220467686653, 0.04076220467686653, 0.7649077773094177, 0.7649077773094177, 1.0, 1.0, 1.0, 1.0, 0.7664391398429871, 0.7664391398429871, 0.7664391398429871, 0.5862552523612976, 0.5862552523612976, 0.5862552523612976, 0.7664391398429871, 0.7664391398429871, 1.0]}
{"step": ["10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10"], "prompt": ["<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n"], "completion": ["50.67", "50.67", "50.67", "30.67", "30.67", "30.67", "77.33", "77.33", "77.33", "81.33", "81.33", "81.33", "57.33", "57.33", "57.33", "69.33", "69.33", "69.33", "74.67", "70.67", "72.0", "74.67", "77.33", "77.33"], "VideoQualityRewardORM": [0.5862552523612976, 0.5862552523612976, 0.5862552523612976, 1.0, 1.0, 1.0, 0.4090164005756378, 0.4090164005756378, 0.4090164005756378, 0.4493289589881897, 0.4493289589881897, 0.4493289589881897, 0.3443832993507385, 0.3443832993507385, 0.3443832993507385, 0.06952978670597076, 0.06952978670597076, 0.06952978670597076, 0.5862552523612976, 0.7664391398429871, 1.0, 0.5874289274215698, 1.0, 1.0]}
{"step": ["20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20"], "prompt": ["<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n"], "completion": ["57.33", "57.33", "57.33", "49.33", "49.33", "49.33", "49.33", "49.33", "49.33", "42.67", "42.67", "42.67", "64.0", "64.0", "64.0", "53.33", "53.33", "53.33", "60.0", "60.0", "60.0", "49.33", "46.67", "49.33"], "VideoQualityRewardORM": [0.15443222224712372, 0.15443222224712372, 0.15443222224712372, 0.7649077773094177, 0.7649077773094177, 0.7649077773094177, 0.15474139153957367, 0.15474139153957367, 0.15474139153957367, 0.7664391398429871, 0.7664391398429871, 0.7664391398429871, 0.3443832993507385, 0.3443832993507385, 0.3443832993507385, 0.5874289274215698, 0.5874289274215698, 0.5874289274215698, 0.3443832993507385, 0.3443832993507385, 0.3443832993507385, 0.2018965184688568, 0.3436952233314514, 0.2018965184688568]}
{"step": ["30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30", "30"], "prompt": ["<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n"], "completion": ["64.0", "64.0", "64.0", "46.67", "46.67", "46.67", "57.33", "57.33", "57.33", "77.33", "77.33", "77.33", "42.67", "42.67", "42.67", "44.0", "42.67", "44.0", "82.67", "82.67", "82.67", "54.67", "54.67", "54.67"], "VideoQualityRewardORM": [0.3443832993507385, 0.3443832993507385, 0.3443832993507385, 0.4493289589881897, 0.4493289589881897, 0.4493289589881897, 0.06952978670597076, 0.06952978670597076, 0.06952978670597076, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7664391398429871, 1.0, 0.7664391398429871, 0.7664391398429871, 0.7664391398429871, 0.4493289589881897, 0.4493289589881897, 0.4493289589881897]}
{"step": ["40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40", "40"], "prompt": ["<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n"], "completion": ["!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!"], "VideoQualityRewardORM": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
{"step": ["50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50", "50"], "prompt": ["<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n", "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<video> How would you rate the video quality of this video? Please only answer with a specific score.<|im_end|>\n<|im_start|>assistant\n"], "completion": ["!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!", "!!!!!!!!!!!!!!!!"], "VideoQualityRewardORM": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
提供一下我的运行脚本文件:
#!/bin/bash
# --- 用户配置 ---
# SFT-LoRA微调后的模型checkpoint路径
SFT_CHECKPOINT_PATH="/DATA/DATA3/wjt/AIGV-qwen2.5/SFT+RL/SFT/v2-20250701-130656/checkpoint-12000-merged"
# 预处理后的GRPO数据集路径
GRPO_DATASET_PATH="/DATA/DATA3/wjt/AIGV-qwen2.5/RL_JSON/sft_002_tv_cleaned.jsonl"
# 自定义plugin.py的路径
PLUGIN_PATH="/DATA/DATA3/wjt/AIGV-qwen2.5/AIGV-main/GPRO/plugin2.py"
# GRPO训练的输出目录
OUTPUT_DIR="/DATA/DATA3/wjt/AIGV-qwen2.5/SFT+RL/RL"
# --- 分布式训练配置 ---
# 使用的GPU ID (根据您的实际情况修改)
GPU_IDS="1,2,3"
# 每个节点的进程数 (必须等于GPU_IDS中的GPU数量)
NPROC_PER_NODE=3
# 设置环境变量,使其对swift命令可见
export MKL_SERVICE_FORCE_INTEL=1
export SWIFT_SKIP_BAD_DATA="true"
# --- GRPO 训练命令 ---
CUDA_VISIBLE_DEVICES=${GPU_IDS} \
NPROC_PER_NODE=${NPROC_PER_NODE} \
swift rlhf \
--rlhf_type grpo \
--model_type qwen2_5_vl \
--model ${SFT_CHECKPOINT_PATH} \
--train_type full \
--dataset ${GRPO_DATASET_PATH} \
--external_plugins ${PLUGIN_PATH} \
--reward_funcs 'video_quality_reward' \
--use_vllm true \
--vllm_mode colocate \
--vllm_gpu_memory_utilization 0.4 \
--torch_dtype bfloat16 \
--num_train_epochs 10 \
--max_completion_length 16 \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 4 \
--learning_rate 1e-5 \
--beta 0.01 \
--num_generations 3 \
--temperature 0.1 \
--save_strategy 'steps' \
--save_steps 500 \
--eval_steps 500 \
--save_total_limit 3 \
--logging_steps 10 \
--output_dir ${OUTPUT_DIR} \
--dataloader_num_workers 2 \
--report_to wandb \
--deepspeed zero3 \
--log_completions true
下面是奖励相关的设置:
# ====================================================================
# plugin2.py (最终修复版 - 提取 solution 中的数字)
# ====================================================================
import math
import re # 导入正则表达式模块
from typing import List
from swift.plugin.orm import ORM, orms
class VideoQualityRewardORM(ORM):
"""
自定义视频质量评估的奖励模型。
这个版本修复了对包含XML标签的 solution 字符串的处理。
"""
def __call__(self, completions: List[str], solution: List[str], **kwargs) -> List[float]:
rewards = []
# 数据对齐逻辑保持不变
num_generations = kwargs.get('num_generations', 1)
if len(completions) != len(solution) * num_generations:
# 这个错误检查现在变得非常重要
print(f"[FATAL ERROR] Length mismatch! len(completions)={len(completions)}, len(solution)={len(solution)}, num_gen={num_generations}")
return [0.0] * len(completions)
aligned_solutions = []
for s in solution:
aligned_solutions.extend([s] * int(num_generations))
for pred_str, gt_str_with_tags in zip(completions, aligned_solutions):
reward = 0.0
try:
# 1. 处理模型预测值 (逻辑不变)
predicted_score = float(pred_str.strip())
# 2. 【关键修复】从带标签的真实值字符串中提取数字
# 使用正则表达式查找<answer>和</answer>之间的浮点数或整数
match = re.search(r'<answer>(.*?)</answer>', gt_str_with_tags)
if match:
# 如果找到了匹配项,提取括号内的内容并转换为浮点数
ground_truth_score_str = match.group(1).strip()
ground_truth_score = float(ground_truth_score_str)
else:
# 如果没有找到匹配的标签,尝试直接转换,以防万一有纯数字格式
ground_truth_score = float(gt_str_with_tags.strip())
# 3. 计算奖励 (逻辑不变)
error = abs(predicted_score - ground_truth_score)
reward = math.exp(-0.2 * error)
except (ValueError, TypeError, AttributeError):
# 增加了AttributeError以防match为None时出错
# 如果任何一步转换失败,奖励保持为0
pass
rewards.append(reward)
return rewards
orms['video_quality_reward'] = VideoQualityRewardORM
请问这个现象是正常的吗,我昨天使用lora微调是没有任何问题的,今天改成了全参+deepspeed。同时我也想知道这个GRPO训练好的权重应该如何合并啊
Metadata
Metadata
Assignees
Labels
No labels