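fix(tests/llm): launch the GRPO and REINFORCE++ tests with config/qwen/reinforce_plus_plus_argument.yaml plus CLI overrides, and change back to the repo directory after the run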

PaddlePaddle · Liujie0926 · May 13, 2025 · May 7, 2025 · May 7, 2025 · May 7, 2025
commit f673a7b44578b485397bc37e46c971a30d03dc4e
diff --git a/tests/fixtures/llm/grpo.yaml b/tests/fixtures/llm/grpo.yaml
diff --git a/tests/fixtures/llm/reinforce_plus_plus.yaml b/tests/fixtures/llm/reinforce_plus_plus.yaml
diff --git a/tests/llm/test_grpo.py b/tests/llm/test_grpo.py
@@ -34,7 +34,7 @@
     [["qwen"]],
 )
 class FinetuneTest(LLMTest, unittest.TestCase):
-    config_path: str = "./tests/fixtures/llm/grpo.yaml"
+    config_path: str = None
     model_dir: str = None
 
     def setUp(self) -> None:
@@ -88,17 +88,26 @@ def test_finetune(self):
             time.sleep(30)
 
             # 运行主逻辑
+            repo_path = os.getcwd()
             rl_dir = os.path.join(os.getcwd(), "./llm/alignment/rl")
             os.chdir(rl_dir)
-            cmd = "python -u -m paddle.distributed.launch --devices \"$CUDA_VISIBLE_DEVICES\" run_rl.py ./tests/fixtures/llm/grpo.yaml"
+            cmd = "python -u -m paddle.distributed.launch \
+                    --devices \"$CUDA_VISIBLE_DEVICES\" run_rl.py \
+                    ../../config/qwen/reinforce_plus_plus_argument.yaml \
+                    --actor_model_name_or_path \"Qwen/Qwen2-1.5B\" \
+                    --max_dec_len 128 \
+                    --max_steps 3 \
+                    --kl_coeff 0.000 \
+                    --kl_loss_coeff 0.000 \
+                    --use_fused_rms_norm true "
             pro = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             out, err = pro.communicate()
             print(out)
             pro.wait()
             pro.returncode == 0
             assert str(out).find("Error") == -1
             assert str(err).find("Error") == -1
-
+            os.chdir(repo_path)
         finally:
             # main 执行完毕，关闭 reward server
             if reward_proc.poll() is None:  # 确保进程还在

diff --git a/tests/llm/test_reinforce_plus_plus.py b/tests/llm/test_reinforce_plus_plus.py
@@ -21,7 +21,6 @@
 import subprocess
 import time
 import signal
-from unittest import skip
 
 from parameterized import parameterized_class
 
@@ -35,7 +34,7 @@
     [["qwen"]],
 )
 class FinetuneTest(LLMTest, unittest.TestCase):
-    config_path: str = "./tests/fixtures/llm/reinforce_plus_plus.yaml"
+    config_path: str = None
     model_dir: str = None
 
     def setUp(self) -> None:
@@ -59,7 +58,6 @@ def test_finetune(self):
             "FLAGS_mla_use_tensorcore": "0",
             "FLAGS_cascade_attention_max_partition_size": "2048",
         }
-
         case_env = os.environ.copy()
         case_env.update(env_vars)
 
@@ -70,7 +68,7 @@ def test_finetune(self):
                 shell=True,
                 check=True
             )
-        
+
         # 启动 reward server
         reward_dir = os.path.join(os.getcwd(), "./llm/alignment/rl/reward")
         reward_log = os.path.join(reward_dir, "reward_server.log")
@@ -90,17 +88,27 @@ def test_finetune(self):
             time.sleep(30)
 
             # 运行主逻辑
+            repo_path = os.getcwd()
             rl_dir = os.path.join(os.getcwd(), "./llm/alignment/rl")
             os.chdir(rl_dir)
-            cmd = "python -u -m paddle.distributed.launch --devices \"$CUDA_VISIBLE_DEVICES\" run_rl.py ./tests/fixtures/llm/reinforce_plus_plus.yaml"
+            cmd = "python -u -m paddle.distributed.launch \
+                    --devices \"$CUDA_VISIBLE_DEVICES\" run_rl.py \
+                    ../../config/qwen/reinforce_plus_plus_argument.yaml \
+                    --rl_algorithm \"reinforce_plus_plus\" \
+                    --actor_model_name_or_path \"Qwen/Qwen2-1.5B\" \
+                    --max_dec_len 128 \
+                    --max_steps 3 \
+                    --kl_coeff 0.000 \
+                    --kl_loss_coeff 0.000 \
+                    --use_fused_rms_norm true "
             pro = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             out, err = pro.communicate()
             print(out)
             pro.wait()
             pro.returncode == 0
             assert str(out).find("Error") == -1
             assert str(err).find("Error") == -1
-
+            os.chdir(repo_path)
         finally:
             # main 执行完毕，关闭 reward server
             if reward_proc.poll() is None:  # 确保进程还在