[llava] Remove torch.jit.save in llava example #10794

Merged: 1 commit, May 12, 2025

27 changes: 9 additions & 18 deletions .ci/scripts/test_llava.sh
@@ -93,8 +93,7 @@ cmake_build_llava_runner_for_android() {
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
${LLAVA_COMMON_CMAKE_ARGS} \
-DCMAKE_PREFIX_PATH="$python_lib" \
-DLLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE=ON \
-DCMAKE_PREFIX_PATH="$python_lib" \
-B${BUILD_DIR}/${dir} \
${dir}

@@ -107,11 +106,10 @@ export_llava() {
$PYTHON_EXECUTABLE -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts
}

# Download a new image with different size, to test if the model can handle different image sizes
prepare_image_tensor() {
# Download a new image
download_image() {
echo "Downloading image"
curl -o basketball.jpg https://upload.wikimedia.org/wikipedia/commons/7/73/Chicago_Bulls_and_New_Jersey_Nets%2C_March_28%2C_1991.jpg
$PYTHON_EXECUTABLE -m executorch.examples.models.llava.image_util --image-path basketball.jpg --output-path image.pt
}

run_and_verify() {
@@ -121,20 +119,18 @@ run_and_verify() {
echo "Export failed. Abort"
exit 1
fi
if [[ ! -f "image.pt" ]]; then
echo "image.pt is missing."
if [[ ! -f "basketball.jpg" ]]; then
echo "basketball.jpg is missing."
exit 1
fi
if [[ ! -f "tokenizer.bin" ]]; then
echo "tokenizer.bin is missing."
exit 1
fi



RUNTIME_ARGS="--model_path=llava.pte \
--tokenizer_path=tokenizer.bin \
--image_path=image.pt \
--image_path=basketball.jpg \
--prompt=ASSISTANT: \
--temperature=0 \
--seq_len=650"
Expand All @@ -149,13 +145,8 @@ run_and_verify() {

# verify result.txt
RESULT=$(cat result.txt)
# set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes <unk> tokens.
if [[ "$(uname)" == "Darwin" ]]; then
EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with several players on the court. One of the players is dribbling the ball, while the others are in various"
else
# set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes <unk> tokens.
EXPECTED_PREFIX="ASSISTANT: image"
fi
EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with several players on the court. "

if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then
echo "Expected result prefix: ${EXPECTED_PREFIX}"
echo "Actual result: ${RESULT}"
@@ -184,5 +175,5 @@ fi
export_llava

# Step3. Run
prepare_image_tensor
download_image
run_and_verify
29 changes: 14 additions & 15 deletions examples/models/llava/CMakeLists.txt
@@ -15,14 +15,12 @@
# ~~~
# It should also be cmake-lint clean.
#
cmake_minimum_required(VERSION 3.24) # 3.24 is required for WHOLE_ARCHIVE
cmake_minimum_required(VERSION 3.24) # 3.24 is required for WHOLE_ARCHIVE
project(llava)

# Duplicating options as root CMakeLists.txt
option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF)

# This is a temporary hack to get around Torch dep so we can test this on android
option(LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE "Hack option to feed dummy image to remove torch.load dep" OFF)

include(CMakeDependentOption)
#
@@ -73,15 +71,6 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..)
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
find_package(gflags REQUIRED)

# Avoid torch dep from torch.load()-ing the image.
# This is a temporary hack.
if(LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE)
add_definitions(-DLLAVA_NO_TORCH_DUMMY_IMAGE=1)
message("Buidling the runner without Torch, feeding a dummy image!")
else()
find_package_torch()
endif()

#
# llava_main: test binary to run llava, with tokenizer and sampler integrated
#
@@ -95,9 +84,6 @@ target_link_options_shared_lib(executorch)
add_subdirectory(runner)

set(LINK_LIBS executorch gflags)
if(NOT LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE)
list(APPEND LINK_LIBS torch)
endif()
set(link_libraries ${LINK_LIBS})
set(_srcs main.cpp)

@@ -197,6 +183,19 @@ if(ANDROID)
list(APPEND link_libraries log)
endif()

# stb_image: a lightweight library to load images
include(FetchContent)
FetchContent_Declare(
stb
GIT_REPOSITORY https://github.com/nothings/stb.git
GIT_TAG f0569113c93ad095470c54bf34a17b36646bbbb5
)
FetchContent_MakeAvailable(stb)
# Add deprecated/ to use stb_image_resize.h for internal compatibility
list(APPEND _common_include_directories ${stb_SOURCE_DIR}
${stb_SOURCE_DIR}/deprecated
)

add_executable(llava_main ${_srcs})
if(CMAKE_BUILD_TYPE STREQUAL "Release")
if(APPLE)
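Side note on the stb integration above: FetchContent only clones the repository and adds its directories to the include path, since stb ships as single-header libraries. Exactly one translation unit in the target therefore has to define the *_IMPLEMENTATION macros before including the headers (in this PR that unit is main.cpp, shown further down). A minimal sketch of the pattern, using a hypothetical file name:

```cpp
// stb_impl.cpp (hypothetical): the one translation unit that compiles the
// stb function bodies. Every other file includes the same headers without
// the macros and only sees the declarations.
#define STB_IMAGE_IMPLEMENTATION
#include <stb_image.h>        // stbi_load / stbi_image_free
#define STB_IMAGE_RESIZE_IMPLEMENTATION
#include <stb_image_resize.h> // stbir_resize_uint8, from stb's deprecated/ directory
```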
10 changes: 0 additions & 10 deletions examples/models/llava/export_llava.py
@@ -30,7 +30,6 @@
from executorch.examples.models.llama.source_transformation.sdpa import (
replace_sdpa_with_custom_op,
)
from executorch.examples.models.llava.image_util import serialize_image
from executorch.examples.models.llava.model import LlavaModel
from executorch.exir import (
EdgeCompileConfig,
@@ -44,7 +43,6 @@
ConstraintBasedSymShapeEvalPass,
HintBasedSymShapeEvalPass,
)

from executorch.extension.llm.export.builder import DType, LLMEdgeManager
from executorch.util.activation_memory_profiler import generate_memory_trace
from pytorch_tokenizers.llama2c import Llama2cTokenizer as Tokenizer
@@ -265,13 +263,6 @@ def export_all(llava_model: LlavaModel):
return executorch_program


def get_image_tensor_for_llava_runner(llava_model):
# llava runner doesn't have image reader so an image tensor is needed.
(resized,) = llava_model.get_example_inputs()

serialize_image(resized, "image.pt")


def get_tokenizer_for_llava_runner(llava_model):
# serialize tokenizer into tokenizer.bin
llava_model.tokenizer.save_vocabulary("./")
@@ -336,7 +327,6 @@ def main():

# artifacts
if args.with_artifacts:
get_image_tensor_for_llava_runner(llava_model)
get_tokenizer_for_llava_runner(llava_model)


79 changes: 0 additions & 79 deletions examples/models/llava/image_util.py

This file was deleted.

101 changes: 58 additions & 43 deletions examples/models/llava/main.cpp
@@ -8,11 +8,10 @@

#include <executorch/examples/models/llava/runner/llava_runner.h>
#include <gflags/gflags.h>
#ifndef LLAVA_NO_TORCH_DUMMY_IMAGE
#include <torch/torch.h>
#else
#include <algorithm> // std::fill
#endif
#define STB_IMAGE_IMPLEMENTATION
#include <stb_image.h>
#define STB_IMAGE_RESIZE_IMPLEMENTATION
#include <stb_image_resize.h>

#if defined(ET_USE_THREADPOOL)
#include <executorch/extension/threadpool/cpuinfo_utils.h>
@@ -28,10 +27,7 @@ DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff.");

DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt.");

DEFINE_string(
image_path,
"",
"The path to a .pt file, a serialized torch tensor for an image, longest edge resized to 336.");
DEFINE_string(image_path, "", "The path to a .jpg file.");

DEFINE_double(
temperature,
@@ -50,6 +46,56 @@ DEFINE_int32(

using executorch::extension::llm::Image;

void load_image(const std::string& image_path, Image& image) {
int width, height, channels;
unsigned char* data =
stbi_load(image_path.c_str(), &width, &height, &channels, 0);
if (!data) {
ET_LOG(Fatal, "Failed to load image: %s", image_path.c_str());
exit(1);
}
// resize the longest edge to 336
int new_width = width;
int new_height = height;
if (width > height) {
new_width = 336;
new_height = static_cast<int>(height * 336.0 / width);
} else {
new_height = 336;
new_width = static_cast<int>(width * 336.0 / height);
}
std::vector<uint8_t> resized_data(new_width * new_height * channels);
stbir_resize_uint8(
data,
width,
height,
0,
resized_data.data(),
new_width,
new_height,
0,
channels);
// transpose to CHW
image.data.resize(channels * new_width * new_height);
for (int i = 0; i < new_width * new_height; ++i) {
for (int c = 0; c < channels; ++c) {
image.data[c * new_width * new_height + i] =
resized_data[i * channels + c];
}
}
image.width = new_width;
image.height = new_height;
image.channels = channels;
// convert to tensor
ET_LOG(
Info,
"image Channels: %" PRId32 ", Height: %" PRId32 ", Width: %" PRId32,
image.channels,
image.height,
image.width);
stbi_image_free(data);
}

int32_t main(int32_t argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);

@@ -84,40 +130,9 @@ int32_t main(int32_t argc, char** argv) {
// create llama runner
example::LlavaRunner runner(model_path, tokenizer_path, temperature);

// read image and resize the longest edge to 336
std::vector<uint8_t> image_data;

#ifdef LLAVA_NO_TORCH_DUMMY_IMAGE
// Work without torch using a random data
image_data.resize(3 * 240 * 336);
std::fill(image_data.begin(), image_data.end(), 0); // black
std::array<int32_t, 3> image_shape = {3, 240, 336};
std::vector<Image> images = {
{.data = image_data, .width = image_shape[2], .height = image_shape[1]}};
#else // LLAVA_NO_TORCH_DUMMY_IMAGE
// cv::Mat image = cv::imread(image_path, cv::IMREAD_COLOR);
// int longest_edge = std::max(image.rows, image.cols);
// float scale_factor = 336.0f / longest_edge;
// cv::Size new_size(image.cols * scale_factor, image.rows * scale_factor);
// cv::Mat resized_image;
// cv::resize(image, resized_image, new_size);
// image_data.assign(resized_image.datastart, resized_image.dataend);
torch::Tensor image_tensor;
torch::load(image_tensor, image_path); // CHW
ET_LOG(
Info,
"image size(0): %" PRId64 ", size(1): %" PRId64 ", size(2): %" PRId64,
image_tensor.size(0),
image_tensor.size(1),
image_tensor.size(2));
image_data.assign(
image_tensor.data_ptr<uint8_t>(),
image_tensor.data_ptr<uint8_t>() + image_tensor.numel());
std::vector<Image> images = {
{.data = image_data,
.width = static_cast<int32_t>(image_tensor.size(2)),
.height = static_cast<int32_t>(image_tensor.size(1))}};
#endif // LLAVA_NO_TORCH_DUMMY_IMAGE
Image image;
load_image(image_path, image);
std::vector<Image> images = {image};

// generate
runner.generate(std::move(images), prompt, seq_len);
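The least obvious part of the new load_image above is the post-processing after decoding: the longest edge is scaled to 336 (the other edge proportionally), and the interleaved HWC bytes returned by stb are repacked into the planar CHW layout the runner's Image struct expects. Below is a minimal, stb-free sketch of just that arithmetic; the 640x480 input size and the fill values are made-up illustration data, not part of the PR.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Pretend stbi_load returned a 640x480 RGB image (interleaved HWC bytes).
  int width = 640, height = 480, channels = 3;
  std::vector<uint8_t> hwc(static_cast<size_t>(width) * height * channels, 127);

  // Longest edge -> 336, shorter edge scaled proportionally (same math as load_image).
  int new_width = width, new_height = height;
  if (width > height) {
    new_width = 336;
    new_height = static_cast<int>(height * 336.0 / width);  // 480 * 336 / 640 = 252
  } else {
    new_height = 336;
    new_width = static_cast<int>(width * 336.0 / height);
  }

  // stbir_resize_uint8 would fill this buffer; here it is only sized and zero-filled.
  std::vector<uint8_t> resized(static_cast<size_t>(new_width) * new_height * channels, 0);

  // HWC -> CHW: the byte for pixel i, channel c moves from index i * channels + c
  // to index c * (new_width * new_height) + i.
  std::vector<uint8_t> chw(resized.size());
  for (int i = 0; i < new_width * new_height; ++i) {
    for (int c = 0; c < channels; ++c) {
      chw[static_cast<size_t>(c) * new_width * new_height + i] =
          resized[static_cast<size_t>(i) * channels + c];
    }
  }

  std::printf("resized to %dx%d, CHW buffer holds %zu bytes\n",
              new_width, new_height, chw.size());
  return 0;
}
```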