Skip to content

Commit 04e226d

Browse files
authored
update qwen2_5_omni (#3908)
1 parent 5fe84bb commit 04e226d

File tree

5 files changed

+7
-7
lines changed

5 files changed

+7
-7
lines changed

examples/train/grpo/qwen2_5_omni/grpo.sh

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # 4 * 50GiB
 pip uninstall transformers
-pip install git+https://github.com/huggingface/transformers@f742a644ca32e65758c3adb36225aef1731bd2a8
+pip install git+https://github.com/huggingface/transformers
 pip install math_verify trl -U

 MAX_PIXELS=1003520 \

examples/train/multimodal/omni/sft.sh

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # 4*35GB
 # A demo for four modalities that can be run directly
 pip uninstall transformers
-pip install git+https://github.com/huggingface/transformers@f742a644ca32e65758c3adb36225aef1731bd2a8
+pip install git+https://github.com/huggingface/transformers

 nproc_per_node=4

examples/train/packing/qwen2_5_omni.sh

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 # A demo for four modalities that can be run directly
 # For local datasets, it is recommended to use streaming: `--streaming true` (save memory)
 pip uninstall transformers
-pip install git+https://github.com/huggingface/transformers@f742a644ca32e65758c3adb36225aef1731bd2a8
+pip install git+https://github.com/huggingface/transformers

 NPROC_PER_NODE=4 \
 CUDA_VISIBLE_DEVICES=0,1,2,3 \

swift/llm/model/model/qwen.py

Lines changed: 2 additions & 2 deletions
@@ -614,9 +614,9 @@ def get_model_tokenizer_qwen2_5_vl(*args, **kwargs):


 def get_model_tokenizer_qwen2_5_omni(model_dir, *args, **kwargs):
-    from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor, Qwen2_5OmniConfig
+    from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor, Qwen2_5OmniConfig
     from qwen_omni_utils import vision_process
-    kwargs['automodel_class'] = kwargs['automodel_class'] or Qwen2_5OmniModel
+    kwargs['automodel_class'] = kwargs['automodel_class'] or Qwen2_5OmniForConditionalGeneration
     processor = Qwen2_5OmniProcessor.from_pretrained(model_dir, trust_remote_code=True)
     kwargs['tokenizer'] = processor.tokenizer
     kwargs['model_config'] = Qwen2_5OmniConfig.from_pretrained(model_dir, trust_remote_code=True)

swift/llm/template/template/qwen.py

Lines changed: 2 additions & 2 deletions
@@ -410,7 +410,7 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
         encoded = Template._encode(self, inputs)
         media_inputs = self.processor(
             text='',
-            audios=inputs.audios or None,
+            audio=inputs.audios or None,
             images=inputs.images or None,
             videos=inputs.videos or None,
             return_tensors='pt')
@@ -424,7 +424,7 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
             token_id = self._tokenize(token)
             idx_list = findall(input_ids, token_id)
             if idx_list:
-                merge_length = self.processor.omni_processor.merge_size**2
+                merge_length = self.processor.image_processor.merge_size**2
                 media_grid_thw = media_inputs.get(f'{media_type}_grid_thw')

                 def _get_new_tokens(i):

0 commit comments

Comments (0)