Commit fb96386

unify code.
1 parent 644107c commit fb96386

4 files changed: 0 additions and 50 deletions

lmdeploy/pytorch/kernels/maca/pagedattention.py

Lines changed: 0 additions & 2 deletions
@@ -30,9 +30,7 @@ def prefill_attention(
     value_states,
     q_start_loc,
     q_seq_len,
-    kv_seq_len,
     max_q_seq_len,
-    max_kv_seq_len,
     num_q_heads,
     num_kv_heads,
     attn_mask,

lmdeploy/pytorch/models/internlm2.py

Lines changed: 0 additions & 12 deletions
@@ -40,7 +40,6 @@ def __init__(self,
         )

         # rotary embedding
-        # import pdb; pdb.set_trace()
         self.apply_rotary_pos_emb = ApplyRotaryEmb()

         # attention
@@ -75,8 +74,6 @@ def forward(
             qkv_states)

         # apply rotary embedding
-        # import pdb; pdb.set_trace()
-
         cos, sin = rotary_pos_emb
         query_states, key_states = self.apply_rotary_pos_emb(
             query_states,
@@ -85,7 +82,6 @@ def forward(
             sin,
             inplace=True,
         )
-        # import pdb; pdb.set_trace()

         # attention
         attn_output = self.attn_fwd(
@@ -97,13 +93,8 @@ def forward(
             attn_metadata,
             inplace=True,
         )
-        # import pdb; pdb.set_trace()
-
         attn_output = attn_output.reshape(*hidden_states.shape[:-1], -1)

-        is_decoding = query_states.shape[0] == 1
-        # if is_decoding:
-        # import pdb; pdb.set_trace()
         # o proj
         attn_output = self.wo(attn_output)
         return attn_output
@@ -159,7 +150,6 @@ def __init__(self,
         super().__init__()
         self.layer_idx = layer_idx
         quantization_config = getattr(config, 'quantization_config', None)
-        # import pdb; pdb.set_trace()

         # build attention layer
         self.attention = InternLM2Attention(config, dtype=dtype, device=device)
@@ -197,8 +187,6 @@ def forward(
         hidden_states, residual = self.attention_norm(
             hidden_states, residual)

-        # import pdb; pdb.set_trace()
-
         # Self Attention
         hidden_states = self.attention(
             hidden_states=hidden_states,

lmdeploy/pytorch/models/patch.py

Lines changed: 0 additions & 1 deletion
@@ -188,7 +188,6 @@ def build_model_from_hf_config(model_config: PretrainedConfig,
     if device is None:
         device = torch.device('cuda')
     model_cls = _get_model_class(model_config, module_map)
-    # import pdb; pdb.set_trace()
     model = model_cls(model_config, ctx_mgr, dtype=dtype, device=device)
     return model.eval()

test_qwen.py

Lines changed: 0 additions & 35 deletions
This file was deleted.
