vllm-project · vllm-bot · Apr 6, 2025 · Apr 2, 2025
diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@@ -1785,7 +1785,7 @@ __global__ void Marlin(
             <<<blocks, NUM_THREADS, max_shared_mem, stream>>>(                 \
                 A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr,      \
                 num_groups, prob_m, prob_n, prob_k, lda, locks,                \
-                use_atomic_add, use_fp32_reduce);                              \
+                part_use_atomic_add, use_fp32_reduce);                         \
       }                                                                        \
     }
 
@@ -2215,6 +2215,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
       thread_m_blocks = exec_cfg.max_m_blocks;
     }
 
+    // atomic add reduce has better performance only when m * n is small
+    bool part_use_atomic_add =
+        use_atomic_add && div_ceil(prob_m, 64) * prob_n <= 2048;
+
     if (false) {
     }
     GPTQ_CALL_IF(vllm::kU4B8, 16, 4, 256)

@@ -305,7 +305,7 @@ def should_use_atomic_add_reduce(m: int, n: int, k: int, device: torch.device,
 
     # the performance of atomicAdd is better than global reduce
     # only when m*n is small and k is large
-    return max(m, 64) * n < 64 * 2048 and k >= 2048
+    return n < 2048 and k >= 2048
 
 
 def apply_gptq_marlin_linear(