[BACKEND] Hotfix for perf regression (triton-lang#2822)

Jokeren · web-flow · commit 969c5bb1b828 · 2023-12-19T09:05:38.000-08:00
When the target directive of a PTX file carries the `debug` option (e.g.
`.target sm_80, debug`), `ptxas` is not able to apply compiler optimizations.
To verify this, add `-O3` to the `ptxas` command: it reports a conflict
between the `debug` constraint and the optimization flag.
To fix the problem, this PR converts `.target sm_<arch>, debug` to
`.target sm_<arch>` before invoking `ptxas`.
diff --git a/python/triton/compiler/backends/cuda.py b/python/triton/compiler/backends/cuda.py
@@ -184,6 +184,8 @@ def make_ptx(src, metadata, opt, capability):
             ptx_version = ptx_get_version(cuda_version)
         ptx_version = f'{ptx_version//10}.{ptx_version%10}'
         ret = re.sub(r'\.version \d+\.\d+', f'.version {ptx_version}', ret, flags=re.MULTILINE)
+        # Remove the debug flag that prevents ptxas from optimizing the code
+        ret = re.sub(r",\s*debug|debug,\s*", "", ret)
         return ret
 
     @staticmethod