modelscope
diff --git a/‎swift/ui/llm_grpo/external_rollout.py
Lines changed: 1 addition & 2 deletions b/‎swift/ui/llm_grpo/external_rollout.py
Lines changed: 1 addition & 2 deletions
diff --git a/‎swift/ui/llm_grpo/external_runtime.py
Lines changed: 3 additions & 2 deletions b/‎swift/ui/llm_grpo/external_runtime.py
Lines changed: 3 additions & 2 deletions
diff --git a/‎swift/ui/llm_grpo/grpo_advanced.py
Lines changed: 159 additions & 9 deletions b/‎swift/ui/llm_grpo/grpo_advanced.py
Lines changed: 159 additions & 9 deletions
diff --git a/‎swift/ui/llm_grpo/llm_grpo.py
Lines changed: 35 additions & 33 deletions b/‎swift/ui/llm_grpo/llm_grpo.py
Lines changed: 35 additions & 33 deletions
@@ -110,7 +110,7 @@ class LLMRollout(BaseUI):
 
     @classmethod
     def do_build_ui(cls, base_tab: Type['BaseUI']):
-        with gr.Accordion(elem_id='llm_rollout', visible=False):
+        with gr.Accordion(elem_id='llm_rollout', open=False, visible=False):
             default_device = 'cpu'
             device_count = get_device_count()
             if device_count > 0:
@@ -119,7 +119,6 @@ def do_build_ui(cls, base_tab: Type['BaseUI']):
                 with gr.Row():
                     gr.Textbox(elem_id='tensor_parallel_size', lines=1, value='1', scale=4)
                     gr.Textbox(elem_id='data_parallel_size', lines=1, value='1', scale=4)
-                    gr.Textbox(elem_id='max_model_len', lines=1, value='', scale=4)
                     gr.Slider(elem_id='gpu_memory_utilization', minimum=0.0, maximum=1.0, step=0.05, value=0.9, scale=4)
                 with gr.Row(equal_height=True):
                     gr.Dropdown(
 
@@ -56,8 +56,8 @@ class RolloutRuntime(Runtime):
                 'en': 'Logging content'
             },
             'info': {
-                'zh': '如果日志无更新请再次点击"展示日志内容"',
-                'en': 'Please press "Show log" if the log content is not updating'
+                'zh': '如果日志无更新请再次点击"展示rollout状态"',
+                'en': 'Please press "Show running status" if the log content is not updating'
             }
         },
         'rollout_running_tasks': {
@@ -90,6 +90,7 @@ def do_build_ui(cls, base_tab: Type['BaseUI']):
             with gr.Blocks():
                 with gr.Row(equal_height=True):
                     gr.Dropdown(elem_id='rollout_running_tasks', scale=10, allow_custom_value=True)
+                with gr.Row(equal_height=True):
                     gr.Button(elem_id='rollout_refresh_tasks', scale=1, variant='primary')
                     gr.Button(elem_id='rollout_show_log', scale=1, variant='primary')
                     gr.Button(elem_id='rollout_stop_show_log', scale=1)
 
@@ -1,8 +1,11 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from functools import partial
 from typing import Type
 
 import gradio as gr
 
+from swift.llm import BaseArguments, ModelType
+from swift.llm.model.register import get_all_models
 from swift.ui.base import BaseUI
 
 
@@ -92,20 +95,167 @@ class GrpoAdvanced(BaseUI):
                 'en': 'Skip overlong truncated samples and exclude them from loss calculation'
             }
         },
+        'beta': {
+            'label': {
+                'zh': 'KL正则项系数',
+                'en': 'KL regularization coefficient'
+            }
+        },
+        'vllm_enable_prefix_caching': {
+            'label': {
+                'zh': '开启前缀缓存',
+                'en': 'Enable prefix cache'
+            },
+            'info': {
+                'zh': 'Colocate模式中vLLM透传参数',
+                'en': 'vLLM transparent transmission parameters in colocate mode'
+            }
+        },
+        'log_completions': {
+            'label': {
+                'zh': '记录生成内容',
+                'en': 'Record generated content'
+            },
+            'info': {
+                'zh': '是否记录训练中的模型生成内容',
+                'en': 'Whether to record the model generation content during training'
+            }
+        },
+        'num_iterations': {
+            'label': {
+                'zh': '每个批次更新次数',
+                'en': 'Num of updates per batch'
+            }
+        },
+        'reward_model': {
+            'label': {
+                'zh': '奖励模型id或路径',
+                'en': 'Reward Model id or path'
+            },
+            'info': {
+                'zh': '实际的模型id',
+                'en': 'The actual model id or model path'
+            }
+        },
+        'reward_model_type': {
+            'label': {
+                'zh': '奖励模型类型',
+                'en': 'Select Reward Model Type'
+            },
+            'info': {
+                'zh': 'SWIFT已支持的模型类型',
+                'en': 'Base model type supported by SWIFT'
+            }
+        },
+        'reward_model_plugin': {
+            'label': {
+                'zh': '奖励模型逻辑',
+                'en': 'Reward model logic'
+            },
+            'info': {
+                'zh': '利用reward_model_plugin自定义奖励模型的处理逻辑',
+                'en': 'Use reward_model_plugin to customize the processing logic of the reward model'
+            }
+        },
+        'external_plugins': {
+            'label': {
+                'zh': '外部插件文件',
+                'en': 'External plugin file'
+            },
+            'info': {
+                'zh': '外部插件文件列表，将被注册进插件模块中',
+                'en': 'List of external plugin files that will be registered into the plugin module'
+            }
+        },
+        'ref_model_type': {
+            'label': {
+                'zh': 'Ref模型类型',
+                'en': 'Ref model type'
+            },
+            'info': {
+                'zh': 'SWIFT已支持的模型类型',
+                'en': 'Model type supported by SWIFT'
+            }
+        },
+        'ref_model': {
+            'label': {
+                'zh': 'Ref模型id或路径',
+                'en': 'Ref model id or path'
+            },
+            'info': {
+                'zh': '实际的模型id或路径',
+                'en': 'The actual model id or path'
+            }
+        },
     }
 
     @classmethod
     def do_build_ui(cls, base_tab: Type['BaseUI']):
+        """Build the widgets of the ``grpo_advanced_tab`` tab item.
+
+        The tab holds rows of GRPO option widgets (loss type, epsilon, beta,
+        resampling, multi-turn and plugin fields), followed by a row for the
+        reward model and a row for the reference model.
+
+        Args:
+            base_tab: The owning UI class; not referenced inside this method.
+        """
         with gr.TabItem(elem_id='grpo_advanced_tab'):
             with gr.Blocks():
                 with gr.Row():
-                    gr.Dropdown(elem_id='loss_type', choices=['grpo', 'bnpo', 'dr_grpo'], value='grpo', scale=20)
-                    gr.Textbox(elem_id='epsilon', value=0.2, lines=1, scale=20)
-                    gr.Textbox(elem_id='epsilon_high', value=None, lines=1, scale=20)
-                    gr.Textbox(elem_id='move_model_batches', lines=1, scale=20)
+                    gr.Dropdown(elem_id='loss_type', choices=['grpo', 'bnpo', 'dr_grpo'], value='grpo', scale=4)
+                    gr.Textbox(elem_id='epsilon', value=0.2, lines=1, scale=4)
+                    gr.Textbox(elem_id='epsilon_high', value=None, lines=1, scale=4)
+                    gr.Textbox(elem_id='beta', value=0.04, lines=1, scale=4)
+                    gr.Textbox(elem_id='num_iterations', lines=1, scale=4)
                 with gr.Row():
-                    gr.Textbox(elem_id='multi_turn_scheduler', lines=1, scale=20)
-                    gr.Textbox(elem_id='max_turns', lines=1, scale=20)
-                    gr.Checkbox(elem_id='dynamic_sample', scale=20)
-                    gr.Slider(elem_id='max_resample_times', minimum=1, maximum=16, step=1, value=3, scale=20)
-                    gr.Checkbox(elem_id='overlong_filter', scale=20)
+                    gr.Textbox(elem_id='move_model_batches', lines=1, scale=4)
+                    gr.Checkbox(elem_id='dynamic_sample', scale=4)
+                    gr.Slider(elem_id='max_resample_times', minimum=1, maximum=16, step=1, value=3, scale=4)
+                    gr.Checkbox(elem_id='overlong_filter', scale=4)
+                    gr.Checkbox(elem_id='vllm_enable_prefix_caching', scale=4)
+                with gr.Row():
+                    gr.Checkbox(elem_id='log_completions', scale=4)
+                    gr.Textbox(elem_id='multi_turn_scheduler', lines=1, scale=4)
+                    gr.Textbox(elem_id='max_turns', lines=1, scale=4)
+                    gr.Textbox(elem_id='external_plugins', lines=1, scale=8)
+
+            # Reward-model row: plugin name, one or more reward models, and their model types.
+            # NOTE(review): unlike 'ref_model', the 'reward_model' dropdown does not set
+            # allow_custom_value although its label mentions a path -- confirm this is intended.
+            with gr.Row():
+                gr.Textbox(elem_id='reward_model_plugin', lines=1, scale=8)
+                gr.Dropdown(elem_id='reward_model', multiselect=True, choices=get_all_models(), scale=8)
+                gr.Dropdown(
+                    elem_id='reward_model_type',
+                    multiselect=True,
+                    choices=ModelType.get_model_name_list(),
+                    allow_custom_value=True,
+                    scale=4)
+            # Reference-model row: a single model (custom values allowed) and its model type.
+            with gr.Blocks():
+                with gr.Row():
+                    gr.Dropdown(
+                        elem_id='ref_model', scale=12, value=None, choices=get_all_models(), allow_custom_value=True)
+                    gr.Dropdown(elem_id='ref_model_type', choices=ModelType.get_model_name_list(), value=None, scale=8)
+
+    @classmethod
+    def after_build_ui(cls, base_tab: Type['BaseUI']):
+        """Register change handlers linking the model dropdowns to their type fields.
+
+        A change of ``ref_model`` invokes ``cls.update_input_model`` and writes the
+        result to ``ref_model_type``; a change of ``reward_model`` invokes
+        ``cls.update_input_models`` and writes the result to ``reward_model_type``.
+
+        Args:
+            base_tab: The owning UI class; not referenced inside this method.
+        """
+        # Single reference model -> single model type.
+        cls.element('ref_model').change(
+            partial(cls.update_input_model, allow_keys=['ref_model_type'], has_record=False, is_ref_model=True),
+            inputs=[cls.element('ref_model')],
+            outputs=[cls.element('ref_model_type')])
+        # Multi-select reward models -> combined model-type value.
+        cls.element('reward_model').change(
+            partial(cls.update_input_models, allow_keys=['reward_model_type'], is_reward_model=True, has_record=False),
+            inputs=[cls.element('reward_model')],
+            outputs=[cls.element('reward_model_type')])
+
+    @classmethod
+    def update_input_models(cls,
+                            models,
+                            allow_keys=None,
+                            has_record=False,
+                            arg_cls=BaseArguments,
+                            is_reward_model=False):
+        """Resolve every selected model and merge the results into one update.
+
+        Each entry of ``models`` is passed to ``cls.update_input_model`` and the
+        ``'value'`` of each returned update is collected. The collected values
+        are joined with single spaces and surrounding whitespace is stripped.
+
+        Args:
+            models: Iterable of selected models, or ``None``.
+            allow_keys: Forwarded to ``cls.update_input_model``.
+            has_record: Forwarded to ``cls.update_input_model``.
+            arg_cls: Forwarded to ``cls.update_input_model``.
+            is_reward_model: Forwarded to ``cls.update_input_model``.
+
+        Returns:
+            An empty ``gr.update()`` when ``models`` is ``None``; otherwise a
+            ``gr.update`` whose value is the space-separated string.
+        """
+        if models is None:
+            return gr.update()
+
+        resolved_values = [
+            cls.update_input_model(
+                model_name,
+                allow_keys=allow_keys,
+                has_record=has_record,
+                arg_cls=arg_cls,
+                is_reward_model=is_reward_model)['value'] for model_name in models
+        ]
+        # strip() keeps the result free of leading/trailing separators.
+        joined = ' '.join(resolved_values)
+        return gr.update(value=joined.strip())
@@ -3,6 +3,7 @@
 from typing import Dict, Type
 
 import gradio as gr
+from packaging import version
 
 from swift.llm.argument.base_args.base_args import get_supported_tuners
 from swift.ui.base import BaseUI
@@ -14,14 +15,14 @@
 from swift.ui.llm_grpo.model import GRPOModel
 from swift.ui.llm_grpo.optimizer import GRPOOptimizer
 from swift.ui.llm_grpo.quantization import GRPOQuantization
-from swift.ui.llm_grpo.ref_model import RefModel
 from swift.ui.llm_grpo.report_to import GRPOReportTo
 from swift.ui.llm_grpo.reward import Reward
 from swift.ui.llm_grpo.rollout import Rollout
 from swift.ui.llm_grpo.runtime import GRPORuntime
 from swift.ui.llm_grpo.save import GRPOSave
 from swift.ui.llm_grpo.tuner import GRPOTuner
 from swift.ui.llm_train.llm_train import LLMTrain
+from swift.ui.llm_train.runtime import Runtime
 from swift.utils import get_device_count, get_logger
 
 logger = get_logger()
@@ -32,7 +33,7 @@ class LLMGRPO(LLMTrain):
 
     sub_ui = [
         GRPOModel, GRPODataset, Reward, GRPORuntime, Rollout, GRPOSave, GRPOTuner, GRPOOptimizer, GRPOHyper,
-        GRPOQuantization, GRPOAdvanced, RefModel, GrpoAdvanced, GRPOReportTo, LLMRollout
+        GRPOQuantization, GRPOAdvanced, GrpoAdvanced, GRPOReportTo, LLMRollout
     ]
 
     locale_dict: Dict[str, Dict] = {
@@ -146,16 +147,6 @@ class LLMGRPO(LLMTrain):
                 'en': 'The data parallel size of DDP'
             }
         },
-        'tuner_backend': {
-            'label': {
-                'zh': 'Tuner backend',
-                'en': 'Tuner backend'
-            },
-            'info': {
-                'zh': 'Tuner实现框架',
-                'en': 'The tuner backend'
-            }
-        },
         'use_liger_kernel': {
             'label': {
                 'zh': '使用Liger kernel',
@@ -239,11 +230,17 @@ def do_build_ui(cls, base_tab: Type['BaseUI']):
                 with gr.Accordion(elem_id='train_param', open=True):
                     with gr.Row():
                         gr.Dropdown(elem_id='train_type', scale=4, choices=list(get_supported_tuners()))
-                        gr.Dropdown(elem_id='tuner_backend', scale=4)
                         gr.Textbox(elem_id='seed', scale=4)
                         gr.Dropdown(elem_id='torch_dtype', scale=4)
-                    with gr.Row():
                         gr.Checkbox(elem_id='use_liger_kernel', scale=4)
+                        gr.Textbox(elem_id='sequence_parallel_size', lines=1, scale=4)
+                    with gr.Row():
+                        gr.Dropdown(
+                            elem_id='gpu_id',
+                            multiselect=True,
+                            choices=[str(i) for i in range(device_count)] + ['cpu'],
+                            value=default_device,
+                            scale=8)
                         gr.Checkbox(elem_id='use_ddp', value=False, scale=4)
                         gr.Textbox(elem_id='ddp_num', value='1', scale=4)
                         gr.Dropdown(
@@ -252,25 +249,17 @@ def do_build_ui(cls, base_tab: Type['BaseUI']):
                             allow_custom_value=True,
                             value=None,
                             choices=['zero0', 'zero1', 'zero2', 'zero3', 'zero2_offload', 'zero3_offload'])
-                        gr.Textbox(elem_id='sequence_parallel_size', lines=1, scale=4)
                 GRPOHyper.build_ui(base_tab)
                 GRPORuntime.build_ui(base_tab)
                 with gr.Row(equal_height=True):
-                    gr.Dropdown(
-                        elem_id='gpu_id',
-                        multiselect=True,
-                        choices=[str(i) for i in range(device_count)] + ['cpu'],
-                        value=default_device,
-                        scale=8)
-                    gr.Textbox(elem_id='envs', scale=8)
+                    gr.Textbox(elem_id='envs', scale=12)
                     gr.Checkbox(elem_id='dry_run', value=False, scale=4)
                     submit = gr.Button(elem_id='submit', scale=4, variant='primary')
 
                 Rollout.build_ui(base_tab)
                 LLMRollout.set_lang(cls.lang)
                 LLMRollout.build_ui(LLMRollout)
                 GRPOTuner.build_ui(base_tab)
-                RefModel.build_ui(base_tab)
                 with gr.Accordion(elem_id='extra_params', open=True):
                     with gr.Tabs():
                         GrpoAdvanced.build_ui(base_tab)
@@ -286,13 +275,6 @@ def do_build_ui(cls, base_tab: Type['BaseUI']):
                     inputs=[base_tab.element('train_type')],
                     outputs=[cls.element('learning_rate')])
 
-                base_tab.element('gpu_id').change(
-                    cls.update_ddp_num,
-                    [base_tab.element('gpu_id'), base_tab.element('use_ddp')], base_tab.element('ddp_num'))
-                base_tab.element('use_ddp').change(
-                    cls.update_ddp_num,
-                    [base_tab.element('gpu_id'), base_tab.element('use_ddp')], base_tab.element('ddp_num'))
-
                 submit.click(
                     cls.train_local,
                     list(cls.valid_elements().values()), [
@@ -312,15 +294,35 @@ def do_build_ui(cls, base_tab: Type['BaseUI']):
                        cls.element('template')],
                     [LLMRollout.element('rollout_runtime_tab'),
                      LLMRollout.element('rollout_running_tasks')])
-                base_tab.element('running_tasks').change(
-                    partial(GRPORuntime.task_changed, base_tab=base_tab), [base_tab.element('running_tasks')],
-                    list(base_tab.valid_elements().values()) + [cls.element('log')] + GRPORuntime.all_plots)
+
                 GRPORuntime.element('kill_task').click(
                     GRPORuntime.kill_task,
                     [GRPORuntime.element('running_tasks')],
                     [GRPORuntime.element('running_tasks')] + [GRPORuntime.element('log')] + GRPORuntime.all_plots,
                 ).then(GRPORuntime.reset, [], [GRPORuntime.element('logging_dir')] + [GRPOHyper.element('output_dir')])
 
+                base_tab.element('gpu_id').change(
+                    cls.update_ddp_num,
+                    [base_tab.element('gpu_id'), base_tab.element('use_ddp')], base_tab.element('ddp_num'))
+                base_tab.element('use_ddp').change(
+                    cls.update_ddp_num,
+                    [base_tab.element('gpu_id'), base_tab.element('use_ddp')], base_tab.element('ddp_num'))
+                base_tab.element('ddp_num').change(Rollout.update_num_gen, [
+                    GRPOHyper.element('per_device_train_batch_size'),
+                    GRPOHyper.element('gradient_accumulation_steps'),
+                    cls.element('ddp_num')
+                ], [Rollout.element('num_generations')])
+                GRPOHyper.element('gradient_accumulation_steps').change(Rollout.update_num_gen, [
+                    GRPOHyper.element('per_device_train_batch_size'),
+                    GRPOHyper.element('gradient_accumulation_steps'),
+                    cls.element('ddp_num')
+                ], [Rollout.element('num_generations')])
+                GRPOHyper.element('per_device_train_batch_size').change(Rollout.update_num_gen, [
+                    GRPOHyper.element('per_device_train_batch_size'),
+                    GRPOHyper.element('gradient_accumulation_steps'),
+                    cls.element('ddp_num')
+                ], [Rollout.element('num_generations')])
+
     @classmethod
     def prepare_sub_to_filter(cls):
         tabs_relation_dict = {