
Commit 48642c5

finally capture build cache errors!
1 parent d4bb484 commit 48642c5

2 files changed (+107, -20 lines)

src/eval.py

Lines changed: 87 additions & 2 deletions
@@ -5,13 +5,14 @@
 import requests
 import torch
 import torch.nn as nn
-import os
+import os, subprocess
 from pydantic import BaseModel
 import numpy as np
 import random
 import json
 from contextlib import redirect_stdout, redirect_stderr
 from io import StringIO
+import sys
 
 from . import utils
 
@@ -168,6 +169,45 @@ def graceful_eval_cleanup(curr_context: dict, device: torch.device):
 
     # _cleanup_cuda_extensions() # SIMON NOTE: is this necessary?
 
+def build_compile_cache_legacy(
+    custom_model_src: str,
+    verbose: bool = False,
+    build_dir: os.PathLike = None,
+) -> tuple[bool, str, str]:
+    """
+    Try to build the compiled cuda code for a sample and store it in the cache directory
+    Should be able to run on CPUs to do this massively in parallel
+
+    Don't limit ninja to set default number of workers, let it use all the cpu cores possible
+
+    NOTE: currently stdout_buffer does not capture all the compiler warning and failure messages
+    Returns:
+        tuple[bool, str, str]: whether compilation is successful, stdout content, error message
+    """
+    context = {}
+    stdout_buffer = StringIO()
+
+    if verbose:
+        print("[Compilation] Pre-compile custom cuda binaries")
+
+    try:
+        os.environ["TORCH_USE_CUDA_DSA"] = "1"  # compile with device side assertion
+        # sys.stdout.flush()
+
+        # Capture stdout during compilation
+        with redirect_stdout(stdout_buffer), redirect_stderr(stdout_buffer):
+            load_custom_model(custom_model_src, context, build_dir)
+        # sys.stdout.flush()
+
+        if verbose:
+            print(f"[Compilation] Compilation Successful, saved cache at: {build_dir}")
+    except Exception as e:
+        print(f"[Compilation] Failed to compile custom CUDA kernel. Unable to cache, \nError: {e}")
+        return False, stdout_buffer.getvalue(), str(e)
+
+    return True, stdout_buffer.getvalue(), None
+
+
 
 def build_compile_cache(
     custom_model_src: str,
@@ -179,7 +219,7 @@ def build_compile_cache(
     Should be able to run on CPUs to do this massively in parallel
 
     Don't limit ninja to set default number of workers, let it use all the cpu cores possible
-
+    # try to do this with a subprocess
     NOTE: currently stdout_buffer does not capture all the compiler warning and failure messages
     Returns:
         tuple[bool, str]: whether compilation is successful, stdout content as string
@@ -192,10 +232,12 @@ def build_compile_cache(
 
     try:
         os.environ["TORCH_USE_CUDA_DSA"] = "1"  # compile with device side assertion
+        # sys.stdout.flush()
 
         # Capture stdout during compilation
        with redirect_stdout(stdout_buffer), redirect_stderr(stdout_buffer):
            load_custom_model(custom_model_src, context, build_dir)
+        # sys.stdout.flush()
 
         if verbose:
             print(f"[Compilation] Compilation Successful, saved cache at: {build_dir}")
@@ -206,6 +248,49 @@ def build_compile_cache(
     return True, stdout_buffer.getvalue(), None
 
 
+def build_compile_cache_with_capturing(
+    custom_model_src: str,
+    verbose: bool = False,
+    build_dir: os.PathLike = None
+) -> tuple[int, str, str]:
+    """
+    Write a temporary python file to compile the custom model on CPU
+    Captures the return code, stdout, and stderr
+    This works for capturing; build_compile_cache does not
+    """
+    if build_dir:
+        # Add import at the start of the source code
+        custom_model_src = (
+            "import os\n" f"os.environ['TORCH_EXTENSIONS_DIR'] = '{build_dir}'\n"
+        ) + custom_model_src
+
+    kernel_hash = hash(custom_model_src)
+    # tmp is a temp python file we write to for compilation
+    tmp = os.path.join(build_dir, f"tmp_{kernel_hash}.py")
+    os.makedirs(os.path.dirname(tmp), exist_ok=True)
+
+    with open(tmp, "w", encoding="utf-8") as f:
+        f.write(custom_model_src)
+
+    # Execute the temporary Python file and capture output
+    process = subprocess.Popen(['python', tmp], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    stdout, stderr = process.communicate()
+    returncode = process.returncode
+
+    # Clean up temporary file
+    os.remove(tmp)
+
+    if verbose:
+        print("[CPU Precompile] return code: ", returncode)
+        print("[CPU Precompile] stdout: \n", stdout.decode('utf-8'))
+        print("[CPU Precompile] stderr: \n", stderr.decode('utf-8'))
+
+    return returncode, stdout.decode('utf-8'), stderr.decode('utf-8')
+
+
 def eval_kernel_against_ref(
     original_model_src: str,
     custom_model_src: str,
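
Why the subprocess-based variant is what "finally" captures the errors: redirect_stdout/redirect_stderr only swap out the Python-level sys.stdout and sys.stderr objects, while torch's extension build spawns ninja/nvcc as child processes that write directly to the OS-level file descriptors, so their warnings and errors never pass through the redirected Python objects. Running the whole compile in a separate python process and piping its descriptors captures everything. A minimal usage sketch follows; the import path, file names, and cache directory are assumptions rather than part of this commit, and it assumes the model source triggers its CUDA extension build at import time:

from src.eval import build_compile_cache_with_capturing  # assumed import path

# Hypothetical sample: a generated model file that calls
# torch.utils.cpp_extension.load_inline() at import time.
custom_model_src = open("sample_kernel.py").read()

returncode, stdout, stderr = build_compile_cache_with_capturing(
    custom_model_src,
    verbose=True,
    build_dir="/tmp/kernel_build_cache",  # assumed cache location
)
if returncode != 0:
    # nvcc/ninja errors now actually land in stderr instead of vanishing
    print(f"[Precompile] build failed:\n{stderr}")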

src/prompt_constructor.py

Lines changed: 20 additions & 18 deletions
@@ -83,7 +83,7 @@ def prompt_generate_custom_cuda_fewshot_and_template(ref_arch_src: str, shots: l
     Avaliable few shot options to start with:
     - ex_add: pointwise addition
     - ex_fuse_gelu: fused gelu
-    - ex_fuse_mnist2: fused convolutions and relus
+    - ex_mnist2: fused convolutions and relus
     - ex_tiled_matmul: tiled matrix multiplication
     """
     prompt = PROBLEM_STATEMENT_CLEANED
@@ -107,13 +107,13 @@ def prompt_generate_custom_cuda_fewshot_and_template(ref_arch_src: str, shots: l
     example_fuse_gelu_desc = "This given architecture is for a fused gelu: "
 
     # k = 3
-    example_fuse_mnist2 = read_file(
+    example_mnist2 = read_file(
         os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_mnist2.py")
     )
-    example_fuse_mnist2_new = read_file(
+    example_mnist2_new = read_file(
         os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_mnist2.py")
     )
-    exmaple_fuse_mnist2_desc = "This given architecture is for a model with fused convolutions and relus: "
+    exmaple_mnist2_desc = "This given architecture is for a model with fused convolutions and relus: "
 
     # k = 4
     example_tiled_matmul = read_file(
@@ -127,14 +127,14 @@ def prompt_generate_custom_cuda_fewshot_and_template(ref_arch_src: str, shots: l
 
     examples = []
     for s in shots:
-        if s not in ["ex_add", "ex_fuse_gelu", "ex_fuse_mnist2", "ex_tiled_matmul"]:
+        if s not in ["ex_add", "ex_fuse_gelu", "ex_mnist2", "ex_tiled_matmul"]:
             raise ValueError(f"Invalid shot: {s}")
         elif s == "ex_add":
             examples.append((example_add, example_add_new, example_add_desc))
         elif s == "ex_fuse_gelu":
             examples.append((example_fuse_gelu, example_fuse_gelu_new, example_fuse_gelu_desc))
-        elif s == "ex_fuse_mnist2":
-            examples.append((example_fuse_mnist2, example_fuse_mnist2_new, exmaple_fuse_mnist2_desc))
+        elif s == "ex_mnist2":
+            examples.append((example_mnist2, example_mnist2_new, exmaple_mnist2_desc))
         elif s == "ex_tiled_matmul":
             examples.append((example_tiled_matmul, example_tiled_matmul_new, example_tiled_matmul_desc))
 
@@ -171,7 +171,7 @@ def prompt_generate_ex_with_CoT_template(ref_arch_src: str, cot_example: str) ->
     Generate a prompt with a CoT example following a template
     Avaliable CoT examples:
     - ex_fuse_gelu: fused gelu
-    - ex_fuse_mnist2: fused convolutions and relus
+    - ex_mnist2: fused convolutions and relus
     - ex_tiled_matmul: tiled matrix multiplication
     """
@@ -184,7 +184,7 @@ def prompt_generate_ex_with_CoT_template(ref_arch_src: str, cot_example: str) ->
 
     prompt = PROBLEM_STATEMENT_CLEANED
 
-    assert cot_example in ["ex_fuse_gelu", "ex_fuse_mnist2", "ex_tiled_matmul"]
+    assert cot_example in ["ex_fuse_gelu", "ex_mnist2", "ex_tiled_matmul"]
 
     # k = 2
     example_fuse_gelu = read_file(
@@ -199,16 +199,16 @@ def prompt_generate_ex_with_CoT_template(ref_arch_src: str, cot_example: str) ->
     example_fuse_gelu_desc = "This given architecture is for a fused gelu: "
 
     # k = 3
-    example_fuse_mnist2 = read_file(
+    example_mnist2 = read_file(
         os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_mnist2.py")
     )
-    example_fuse_mnist2_cot = read_file(
+    example_mnist2_cot = read_file(
         os.path.join(REPO_TOP_PATH, "src/prompts/cot/model_cot_mnist2.py")
     )
-    example_fuse_mnist2_new = read_file(
+    example_mnist2_new = read_file(
         os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_mnist2.py")
     )
-    exmaple_fuse_mnist2_desc = "This given architecture is for a model with fused convolutions and relus: "
+    exmaple_mnist2_desc = "This given architecture is for a model with fused convolutions and relus: "
 
     # k = 4
     example_tiled_matmul = read_file(
@@ -228,16 +228,18 @@ def prompt_generate_ex_with_CoT_template(ref_arch_src: str, cot_example: str) ->
             cot = example_fuse_gelu_cot
             kernel = example_fuse_gelu_new
             desc = example_fuse_gelu_desc
-        case "ex_fuse_mnist2":
-            base = example_fuse_mnist2
-            cot = example_fuse_mnist2_cot
-            kernel = example_fuse_mnist2_new
-            desc = exmaple_fuse_mnist2_desc
+        case "ex_mnist2":
+            base = example_mnist2
+            cot = example_mnist2_cot
+            kernel = example_mnist2_new
+            desc = exmaple_mnist2_desc
         case "ex_tiled_matmul":
             base = example_tiled_matmul
             cot = example_tiled_matmul_cot
             kernel = example_tiled_matmul_new
             desc = example_tiled_matmul_desc
+        case _:
+            raise ValueError(f"Invalid CoT example: {cot_example} not found in CoT examples")
 
     # construct example with
     # NOTE: we only do one example with CoT for now
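
With the rename, callers now select the mnist2 few-shot and CoT examples via the "ex_mnist2" key; the old "ex_fuse_mnist2" key raises ValueError, and an unknown CoT key now raises a clear ValueError via the new case _ branch instead of leaving base/cot/kernel/desc unbound. A minimal usage sketch; the import path and reference-architecture file are hypothetical:

from src.prompt_constructor import (  # assumed import path
    prompt_generate_custom_cuda_fewshot_and_template,
    prompt_generate_ex_with_CoT_template,
)

ref_arch_src = open("ref_model.py").read()  # hypothetical reference architecture

# Few-shot prompt using the renamed key
prompt = prompt_generate_custom_cuda_fewshot_and_template(
    ref_arch_src, shots=["ex_add", "ex_mnist2"]
)

# CoT prompt; passing "ex_fuse_mnist2" here would now fail the assert
cot_prompt = prompt_generate_ex_with_CoT_template(ref_arch_src, cot_example="ex_mnist2")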
