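update: log attn_impl, tokenize the gemma image sequence via _tokenize, keep image loss_scale at 1 for non-swift template backends, and refresh the gemma3 vision test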

modelscope · Jintao-Huang · Apr 24, 2025
commit 9329648f29524190a6d0c651a1eceb83a0ba5a09
diff --git a/swift/llm/model/utils.py b/swift/llm/model/utils.py
@@ -39,6 +39,7 @@ def update_attn_impl(config: PretrainedConfig,
                          attn_impl_keys: Optional[List[str]] = None) -> None:
         if attn_impl is None:
             return
+        logger.info(f'attn_impl: {attn_impl}')
         use_flash_attn = AttnImpl.to_use_flash_attn(attn_impl)
         if use_flash_attn:
             attn_impl = 'flash_attention_2'

diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py
@@ -739,7 +739,7 @@ def _pre_tokenize_images(self, context_list: List[Context], loss_scale_list: Lis
             if context == '<image>' and inputs.is_multimodal and inputs.image_idx < len(inputs.images):
                 c_list = self.replace_tag('image', inputs.image_idx, inputs)
                 inputs.image_idx += 1
-                loss_scale = 0.
+                loss_scale = 0. if self.template_backend == 'swift' else 1.
             else:
                 c_list = [context]
             res += c_list

diff --git a/swift/llm/template/template/gemma.py b/swift/llm/template/template/gemma.py
@@ -109,7 +109,7 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
             input_ids = encoded['input_ids']
             labels = encoded['labels']
             idx_list = findall(input_ids, self.boi_token_id)
-            img_tokens = self.tokenizer.encode(self.processor.full_image_sequence)
+            img_tokens = self._tokenize(self.processor.full_image_sequence)
             input_ids, labels = self._extend_tokens(input_ids, labels, idx_list, lambda _: img_tokens)
 
             # TODO: customize

diff --git a/tests/test_align/test_template/test_vision.py b/tests/test_align/test_template/test_vision.py
@@ -509,10 +509,11 @@ def test_phi4_vision():
 
 def test_gemma3_vision():
     pt_engine = PtEngine('LLM-Research/gemma-3-4b-it')
-    response = _infer_model(pt_engine)
+    response = _infer_model(pt_engine, messages=[{'role': 'user', 'content': '<image>Describe this image in detail.'}])
     pt_engine.default_template.template_backend = 'jinja'
-    response2 = _infer_model(pt_engine)
-    assert response == response2
+    response2 = _infer_model(pt_engine, messages=[{'role': 'user', 'content': '<image>Describe this image in detail.'}])
+    assert response[:80] == response2[:80] == (
+        "Here's a detailed description of the image:\n\n**Overall Impression:**\n\nThe image ")
 
 
 def test_mistral_2503():
@@ -596,9 +597,9 @@ def test_kimi_vl():
     # test_minicpmo()
     # test_valley()
     # test_ui_tars()
-    # test_gemma3_vision()
+    test_gemma3_vision()
     # test_mistral_2503()
     # test_llama4()
     # test_internvl3_8b()
     # test_internvl3_9b()
-    test_kimi_vl()
+    # test_kimi_vl()