
fix loss_scale bug when meeting <image>,<audio>,<video> #4922


Open · wants to merge 2 commits into main

Conversation


@CrownStar7 CrownStar7 commented Jul 11, 2025

PR type

  • [✓] Bug Fix
  • [ ] New Feature
  • [ ] Document Updates
  • [ ] More Models or Datasets Support

PR information

While computing custom loss weights, I ran into a loss_scale dimension-mismatch problem. A search of the PR history showed that this issue had been reported before but never resolved, so this PR is proposed to improve the Swift framework.

Based on #3036 and the latest 3.7.0.dev0 code, this PR does the following:

  • reproduces the problem
  • adds the missing code
  • verifies the results of the affected modules

With this work, the Swift framework should offer more reliable support for custom loss-weight computation.

Bug description:

For multimodal models with Input: media + text and Output: text, when the user content of the training data contains media (e.g. an image) and both

  • --loss_scale <custom LossScale class>
  • --loss_type loss_scale

are enabled, the following happens:

Only the input_ids and labels tokens are expanded, after which one of two cases occurs:

  1. Case 1: loss_scale is not expanded accordingly, so when loss_scale_func(output, labels) is called, labels and loss_scale have mismatched dimensions and the custom weighted loss cannot be computed.
  2. Case 2: loss_scale is neither expanded nor passed along, so it is lost entirely; when loss_scale_func(output, labels) is called, loss_scale is None. Training still runs, but the custom weighted loss is silently not applied.

The custom LossScale class is shown below:

import re  
from swift.plugin.loss_scale.loss_scale import LossScale, loss_scale_map
from swift.llm.template.utils import ContextType  
  
class ColspanRowspanLossScale(LossScale):    
    def get_loss_scale(self, context: str, context_type: ContextType, is_last_round: bool, **kwargs):    
        if context_type == ContextType.RESPONSE:   
            pattern = r'(colspan="[^"]*"|rowspan="[^"]*")'  
            matches = list(re.finditer(pattern, context))  
              
            if not matches:  
                return [context], [1.0]  
    
            parts = []  
            weights = []  
            last_end = 0

            for match in matches:  
                start, end = match.span()    
                if start > last_end:  
                    parts.append(context[last_end:start])  
                    weights.append(1.0)  
 
                parts.append(context[start:end])  
                weights.append(3.0)  
                  
                last_end = end  
            if last_end < len(context):  
                parts.append(context[last_end:])  
                weights.append(1.0)
            return parts, weights  
            
        return super().get_loss_scale(context, context_type, is_last_round)  
  
loss_scale_map['colspan_rowspan_loss_scale'] = ColspanRowspanLossScale
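As a quick sanity check, the RESPONSE-branch splitting logic above can be exercised in isolation. This is a standalone sketch: split_with_weights is a hypothetical helper that mirrors get_loss_scale and is not part of Swift.

```python
import re

def split_with_weights(context):
    # Split the response text around colspan/rowspan attributes and
    # assign weight 3.0 to the attributes, 1.0 to everything else.
    pattern = r'(colspan="[^"]*"|rowspan="[^"]*")'
    parts, weights, last_end = [], [], 0
    for m in re.finditer(pattern, context):
        start, end = m.span()
        if start > last_end:
            parts.append(context[last_end:start])
            weights.append(1.0)
        parts.append(context[start:end])
        weights.append(3.0)
        last_end = end
    if last_end < len(context):
        parts.append(context[last_end:])
        weights.append(1.0)
    return (parts, weights) if parts else ([context], [1.0])

parts, weights = split_with_weights('<td colspan="2">x</td>')
print(parts)    # ['<td ', 'colspan="2"', '>x</td>']
print(weights)  # [1.0, 3.0, 1.0]
```

Each returned part is later tokenized, and its weight is repeated per token, which is exactly why loss_scale must stay length-aligned with input_ids.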

The token-extension function that is missing the corresponding loss_scale expansion:

#  swift/llm/template/base.py  
    def _extend_tokens(input_ids: List[int], labels: Optional[List[int]], replace_idx_list: List[int],
                       get_new_tokens: Callable[[int], List[int]]) -> Tuple[List[int], Optional[List[int]]]:
        added_tokens_len = 0
        for i, idx in enumerate(replace_idx_list):
            new_tokens = get_new_tokens(i)
            token_len = len(new_tokens)
            input_ids = input_ids[:idx + added_tokens_len] + new_tokens + input_ids[added_tokens_len + idx + 1:]
            if labels:
                labels = labels[:idx + added_tokens_len] + [-100] * token_len + labels[added_tokens_len + idx + 1:]
            
            # loss_scale is not expanded to match here
            added_tokens_len += token_len - 1
        return input_ids, labels
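The resulting mismatch can be reproduced with a minimal standalone sketch (extend_tokens below copies the logic of _extend_tokens; the token ids and weights are made-up example values):

```python
def extend_tokens(input_ids, labels, replace_idx_list, get_new_tokens):
    # Same logic as _extend_tokens: each placeholder token is replaced
    # by several new tokens, so input_ids and labels grow in length.
    added = 0
    for i, idx in enumerate(replace_idx_list):
        new_tokens = get_new_tokens(i)
        input_ids = input_ids[:idx + added] + new_tokens + input_ids[idx + added + 1:]
        if labels:
            labels = labels[:idx + added] + [-100] * len(new_tokens) + labels[idx + added + 1:]
        added += len(new_tokens) - 1
    return input_ids, labels

input_ids = [1, 999, 2, 3]           # 999 stands in for an <image> placeholder
labels = [-100, -100, 2, 3]
loss_scale = [0.0, 0.0, 1.0, 3.0]    # one weight per original token

input_ids, labels = extend_tokens(input_ids, labels, [1], lambda i: [50, 51, 52])
print(len(input_ids), len(labels), len(loss_scale))  # 6 6 4
```

input_ids and labels end up with 6 positions while loss_scale is still 4, which is exactly the mismatch hit later in loss_scale_func.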

A template function that fails to pass loss_scale through (excerpt):

class mPlugOwl3Template(Template):
    version = None

    def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
        encoded = super()._encode(inputs)
        images = inputs.images
        videos = inputs.videos
        cut_enable = not videos
        input_ids = encoded['input_ids']
        labels = encoded['labels']
        .....
        # loss_scale is lost here: encoded is rebuilt without it
        encoded = {}
        if images:
            .....

            input_ids, labels = self._extend_tokens(input_ids, labels, idx_list, _get_new_tokens)
            image_token_idx = torch.tensor(findall(input_ids, image_token_list))
            if self.version == '241101':
                media_offset = image_token_idx
            else:
                _range = torch.arange(len(input_ids))[:, None]
                matrix = (_range > image_token_idx[None]).sum(dim=1)
                media_offset = torch.stack([torch.zeros(matrix.shape[0], dtype=torch.long), matrix], dim=-1)[None]
            encoded.update({
                'pixel_values': image_inputs['pixel_values'],
                'media_offset': media_offset,
            })

        # loss_scale is lost here: it is never written back into encoded
        encoded['input_ids'] = input_ids
        encoded['labels'] = labels
        return encoded

The loss function where the custom weighting fails:

@register_loss_func(LossType.loss_scale)
def loss_scale_func(outputs, labels, loss_scale=None, num_items_in_batch=None) -> torch.Tensor:
    """Loss func

    Args:
        outputs: The model outputs
        labels: The labels
        loss_scale: The loss scale
        num_items_in_batch: Number of tokens in the labels of gradient accumulation round that are not -100.

    Returns:
        The computed loss tensor.
    """
    loss, masks = ce_loss_func(outputs, labels)
    if loss_scale is not None:
        shift_scale = loss_scale[..., 1:].to(masks.device)
        
        # the error occurs here: shift_scale and masks have mismatched shapes
        shift_scale = shift_scale[masks]
        loss = (shift_scale * loss)
    if num_items_in_batch is None:
        loss = loss.mean()
    else:
        # compat transformers>=4.46
        loss = loss.sum() / num_items_in_batch
    return loss
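The shape requirement can be illustrated without torch. Below is a plain-Python analog of the shift-and-mask step above; the lists are made-up example values:

```python
# Both labels and loss_scale drop one position when shifted, so they must
# have the same length *after* multimodal token expansion for the boolean
# mask to line up.
labels = [-100, -100, -100, -100, 2, 3]       # expanded to 6 positions
loss_scale = [0.0, 0.0, 0.0, 0.0, 1.0, 3.0]   # must also have 6 positions

shift_labels = labels[1:]        # analog of labels[..., 1:]
shift_scale = loss_scale[1:]     # analog of loss_scale[..., 1:]
mask = [lab != -100 for lab in shift_labels]
kept = [s for s, m in zip(shift_scale, mask) if m]
print(kept)  # [1.0, 3.0]
```

If loss_scale still had its unexpanded length, the mask (built from labels) would be longer than shift_scale and the indexing would fail; in torch this surfaces as the shape-mismatch error above.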

Solution

Add a _extend_loss_scale() function:

#  swift/llm/template/base.py  
    @staticmethod
    def _extend_loss_scale(loss_scale: Optional[List[float]], replace_idx_list: List[int],
                           get_new_tokens: Callable[[int], List[int]]) -> Optional[List[float]]:
        if loss_scale:
            added_tokens_len = 0
            for i, idx in enumerate(replace_idx_list):
                new_tokens = get_new_tokens(i)
                token_len = len(new_tokens)

                # repeat the weight of the replaced placeholder token for every
                # new token, mirroring the expansion done in _extend_tokens
                scale_idx = loss_scale[idx + added_tokens_len]
                loss_scale = (loss_scale[:idx + added_tokens_len] + [scale_idx] * token_len
                              + loss_scale[added_tokens_len + idx + 1:])

                added_tokens_len += token_len - 1

        return loss_scale
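The behavior of the new function can be checked with a standalone sketch (extend_loss_scale below copies the logic of _extend_loss_scale; the weights are made-up example values):

```python
def extend_loss_scale(loss_scale, replace_idx_list, get_new_tokens):
    # Same logic as _extend_loss_scale: repeat the placeholder's weight
    # once per replacement token, keeping loss_scale length-aligned with
    # the expanded input_ids.
    if loss_scale:
        added = 0
        for i, idx in enumerate(replace_idx_list):
            token_len = len(get_new_tokens(i))
            scale = loss_scale[idx + added]
            loss_scale = loss_scale[:idx + added] + [scale] * token_len + loss_scale[idx + added + 1:]
            added += token_len - 1
    return loss_scale

loss_scale = [0.0, 0.0, 1.0, 3.0]  # weight of the placeholder at index 1 is 0.0
extended = extend_loss_scale(loss_scale, [1], lambda i: [50, 51, 52])
print(extended)  # [0.0, 0.0, 0.0, 0.0, 1.0, 3.0]
```

After the same 3-token image expansion as before, loss_scale now has 6 positions, matching input_ids and labels, so loss_scale_func can apply the custom weights.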

Experiment results

Modules affected by the added code

A keyword search across the whole project found 15 multimodal template classes whose _encode() functions are involved, as shown below:
image-1

They are:

  • class Qwen2_5OmniTemplate(Qwen2_5VLTemplate)
  • class Qwen2VLTemplate(Template)
  • class PixtralTemplate(Template)
  • class mPlugOwl3Template(Template)
  • class KimiVLTemplate(Template)
  • class Mistral2503Template(Template)
  • class MiniCPMV2_6Template(MiniCPMVTemplate)
  • class Phi4MMTemplate(Template)
  • class MegrezOmniTemplate(Template)
  • class Llama4Template(Template)
  • class Internvl2Template(InternvlTemplate)
  • class Gemma3VisionTemplate(Gemma3Template)
  • class Gemma3nTemplate(Gemma3Template)
  • class Emu3ChatTemplate(Template)
  • class KeyeVLTemplate(Template)

Reproducing the problem and verifying the fix

Experiment setup

  1. To simplify verification, the smallest-parameter model available for each template was downloaded.
  2. Models were fine-tuned via the CLI.
  3. The custom LossScale class was loaded via --external_plugins.
  4. Only image inputs were verified; videos and audio were not, on the assumption that the expansion principle is the same. Corrections are welcome if this understanding is wrong.

Summary of results

① The following templates could not be verified due to hardware limits, but the expansion logic is believed to be the same (corrections welcome):

  • Emu3ChatTemplate: Emu3-Chat
  • KimiVLTemplate: Kimi-VL-A3B-Thinking
  • Llama4Template: Llama-Guard-4-12B
  • Phi4MMTemplate: microsoft-Phi-4-multimodal-instruct

② The following templates neither expand nor pass loss_scale, so the custom weighted loss cannot be computed:

  • MiniCPMV2_6Template
  • mPlugOwl3Template

③ All other templates pass loss_scale but do not expand it, so the custom weighted loss cannot be computed either.

Detailed results:

1. Qwen2VLTemplate

Model:
allenai/olmOCR-7B-0225-preview

Missing expansion

Problem reproduction

image-2 image-3 image-4 image-5 image-6 image-7 image-8

Verification after the fix

image-9

2. Qwen2_5OmniTemplate

Model:
Qwen2.5-Omni-3B

Missing expansion

Problem reproduction

image-10 image-11

Verification after the fix

image-12

3. PixtralTemplate

Model:
Pixtral-12B-2409-bnb-4bit

loss_scale dimension mismatch

Problem reproduction

image-41 image-42 image-44 image-45

Verification after the fix

image-46

4. mPlugOwl3Template

Model:
mPLUG-Owl3-2B-241014

loss_scale lost

Problem reproduction

image-29 image-30

Verification after the fix

image-28

5. KimiVLTemplate

Model:

On ModelScope, the model.safetensors.index.json of mlx-community/Kimi-VL-A3B-Thinking-4bit contains the configuration of the unquantized model, not the quantized version.

The official model could not be verified due to hardware limits, but the expansion logic is the same.

Problem reproduction

pass

Verification after the fix

pass

6. Mistral2503Template

Model:
Mistral-Small-3.1-24B-Base-2503-bnb-4bit

loss_scale dimension mismatch

Problem reproduction

image-31 image-32 image-33 image-34 image-35

Verification after the fix

image-36

7. MiniCPMV2_6Template

Model:
MiniCPM-V_2_6

loss_scale lost

Problem reproduction

image-22 image-23 image-24 image-25

Verification after the fix

image-26 image-27

8. Phi4MMTemplate

Model:

Problem reproduction

pass

Verification after the fix

pass

9. MegrezOmniTemplate

Model:
AI-ModelScope/Megrez-3B-Omni

Problem reproduction

image-18 image-19 image-20

Verification after the fix

image-21

10. Llama4Template

Model:
ResearchLlama-4-Scout-Dense-12B

Problem reproduction

pass

Verification after the fix

pass

11. Internvl2Template

Model:
InternVL2_5-1B

Missing expansion

Problem reproduction

image-17 image-14 image-15 image-16

Verification after the fix

image-13

12. Gemma3VisionTemplate

Model:
gemma-3-4b-it

Problem reproduction

image-47 image-48 image-49 image-50

Verification after the fix

image-51

13. Gemma3nTemplate

Model:
gemma-3n-E4B

Problem reproduction

image-52 image-53 image-54 image-55 image-56

Verification after the fix

image-57

14. KeyeVLTemplate

Model:
Keye-VL-8B-Preview

Problem reproduction

image-60 image-61 image-62

Verification after the fix

image-58 image-59

15. Emu3ChatTemplate

Model:

Problem reproduction

pass

Verification after the fix

pass
