[llava] Remove torch.jit.save in llava example #10794

Merged: 1 commit, May 12, 2025

27 changes: 9 additions & 18 deletions .ci/scripts/test_llava.sh
@@ -93,8 +93,7 @@ cmake_build_llava_runner_for_android() {
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
${LLAVA_COMMON_CMAKE_ARGS} \
-DCMAKE_PREFIX_PATH="$python_lib" \
-DLLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE=ON \
-DCMAKE_PREFIX_PATH="$python_lib" \
-B${BUILD_DIR}/${dir} \
${dir}

@@ -107,11 +106,10 @@ export_llava() {
$PYTHON_EXECUTABLE -m executorch.examples.models.llava.export_llava --pte-name llava.pte --with-artifacts
}

# Download a new image with different size, to test if the model can handle different image sizes
prepare_image_tensor() {
# Download a new image
download_image() {
echo "Downloading image"
curl -o basketball.jpg https://upload.wikimedia.org/wikipedia/commons/7/73/Chicago_Bulls_and_New_Jersey_Nets%2C_March_28%2C_1991.jpg
$PYTHON_EXECUTABLE -m executorch.examples.models.llava.image_util --image-path basketball.jpg --output-path image.pt
}

run_and_verify() {
@@ -121,20 +119,18 @@ run_and_verify() {
echo "Export failed. Abort"
exit 1
fi
if [[ ! -f "image.pt" ]]; then
echo "image.pt is missing."
if [[ ! -f "basketball.jpg" ]]; then
echo "basketball.jpg is missing."
exit 1
fi
if [[ ! -f "tokenizer.bin" ]]; then
echo "tokenizer.bin is missing."
exit 1
fi



RUNTIME_ARGS="--model_path=llava.pte \
--tokenizer_path=tokenizer.bin \
--image_path=image.pt \
--image_path=basketball.jpg \
--prompt=ASSISTANT: \
--temperature=0 \
--seq_len=650"
Expand All @@ -149,13 +145,8 @@ run_and_verify() {

# verify result.txt
RESULT=$(cat result.txt)
# set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes <unk> tokens.
if [[ "$(uname)" == "Darwin" ]]; then
EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with several players on the court. One of the players is dribbling the ball, while the others are in various"
else
# set the expected prefix to be the same as prompt because there's a bug in sdpa_with_kv_cache that causes <unk> tokens.
EXPECTED_PREFIX="ASSISTANT: image"
fi
EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with several players on the court. "

if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then
echo "Expected result prefix: ${EXPECTED_PREFIX}"
echo "Actual result: ${RESULT}"
@@ -184,5 +175,5 @@ fi
export_llava

# Step3. Run
prepare_image_tensor
download_image
run_and_verify
29 changes: 14 additions & 15 deletions examples/models/llava/CMakeLists.txt
@@ -15,14 +15,12 @@
# ~~~
# It should also be cmake-lint clean.
#
cmake_minimum_required(VERSION 3.24) # 3.24 is required for WHOLE_ARCHIVE
cmake_minimum_required(VERSION 3.24) # 3.24 is required for WHOLE_ARCHIVE
project(llava)

# Duplicating options as root CMakeLists.txt
option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF)

# This is a temporary hack to get around Torch dep so we can test this on android
option(LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE "Hack option to feed dummy image to remove torch.load dep" OFF)

include(CMakeDependentOption)
#
@@ -73,15 +71,6 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..)
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
find_package(gflags REQUIRED)

# Avoid torch dep from torch.load()-ing the image.
# This is a temporary hack.
if(LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE)
add_definitions(-DLLAVA_NO_TORCH_DUMMY_IMAGE=1)
message("Buidling the runner without Torch, feeding a dummy image!")
else()
find_package_torch()
endif()

#
# llava_main: test binary to run llava, with tokenizer and sampler integrated
#
@@ -95,9 +84,6 @@ target_link_options_shared_lib(executorch)
add_subdirectory(runner)

set(LINK_LIBS executorch gflags)
if(NOT LLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE)
list(APPEND LINK_LIBS torch)
endif()
set(link_libraries ${LINK_LIBS})
set(_srcs main.cpp)

@@ -197,6 +183,19 @@ if(ANDROID)
list(APPEND link_libraries log)
endif()

# stb_image: a lightweight library to load images
include(FetchContent)
FetchContent_Declare(
stb
GIT_REPOSITORY https://github.com/nothings/stb.git
GIT_TAG f0569113c93ad095470c54bf34a17b36646bbbb5
)
FetchContent_MakeAvailable(stb)
# Add deprecated/ to use stb_image_resize.h for internal compatibility
list(APPEND _common_include_directories ${stb_SOURCE_DIR}
${stb_SOURCE_DIR}/deprecated
)

add_executable(llava_main ${_srcs})
if(CMAKE_BUILD_TYPE STREQUAL "Release")
if(APPLE)
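Side note on the stb integration above: FetchContent only clones the repository and adds its directories to the include path, since stb ships as single-header libraries. Exactly one translation unit in the target therefore has to define the *_IMPLEMENTATION macros before including the headers (in this PR that unit is main.cpp, shown further down). A minimal sketch of the pattern, using a hypothetical file name:

```cpp
// stb_impl.cpp (hypothetical): the one translation unit that compiles the
// stb function bodies. Every other file includes the same headers without
// the macros and only sees the declarations.
#define STB_IMAGE_IMPLEMENTATION
#include <stb_image.h>        // stbi_load / stbi_image_free
#define STB_IMAGE_RESIZE_IMPLEMENTATION
#include <stb_image_resize.h> // stbir_resize_uint8, from stb's deprecated/ directory
```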
10 changes: 0 additions & 10 deletions examples/models/llava/export_llava.py
@@ -30,7 +30,6 @@
from executorch.examples.models.llama.source_transformation.sdpa import (
replace_sdpa_with_custom_op,
)
from executorch.examples.models.llava.image_util import serialize_image
from executorch.examples.models.llava.model import LlavaModel
from executorch.exir import (
EdgeCompileConfig,
@@ -44,7 +43,6 @@
ConstraintBasedSymShapeEvalPass,
HintBasedSymShapeEvalPass,
)

from executorch.extension.llm.export.builder import DType, LLMEdgeManager
from executorch.util.activation_memory_profiler import generate_memory_trace
from pytorch_tokenizers.llama2c import Llama2cTokenizer as Tokenizer
@@ -265,13 +263,6 @@ def export_all(llava_model: LlavaModel):
return executorch_program


def get_image_tensor_for_llava_runner(llava_model):
# llava runner doesn't have image reader so an image tensor is needed.
(resized,) = llava_model.get_example_inputs()

serialize_image(resized, "image.pt")


def get_tokenizer_for_llava_runner(llava_model):
# serialize tokenizer into tokenizer.bin
llava_model.tokenizer.save_vocabulary("./")
@@ -336,7 +327,6 @@ def main():

# artifacts
if args.with_artifacts:
get_image_tensor_for_llava_runner(llava_model)
get_tokenizer_for_llava_runner(llava_model)


79 changes: 0 additions & 79 deletions examples/models/llava/image_util.py

This file was deleted.

101 changes: 58 additions & 43 deletions examples/models/llava/main.cpp
@@ -8,11 +8,10 @@

#include <executorch/examples/models/llava/runner/llava_runner.h>
#include <gflags/gflags.h>
#ifndef LLAVA_NO_TORCH_DUMMY_IMAGE
#include <torch/torch.h>
#else
#include <algorithm> // std::fill
#endif
#define STB_IMAGE_IMPLEMENTATION
#include <stb_image.h>
#define STB_IMAGE_RESIZE_IMPLEMENTATION
#include <stb_image_resize.h>

#if defined(ET_USE_THREADPOOL)
#include <executorch/extension/threadpool/cpuinfo_utils.h>
@@ -28,10 +27,7 @@ DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff.");

DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt.");

DEFINE_string(
image_path,
"",
"The path to a .pt file, a serialized torch tensor for an image, longest edge resized to 336.");
DEFINE_string(image_path, "", "The path to a .jpg file.");

DEFINE_double(
temperature,
@@ -50,6 +46,56 @@ DEFINE_int32(

using executorch::extension::llm::Image;

void load_image(const std::string& image_path, Image& image) {
int width, height, channels;
unsigned char* data =
stbi_load(image_path.c_str(), &width, &height, &channels, 0);
if (!data) {
ET_LOG(Fatal, "Failed to load image: %s", image_path.c_str());
exit(1);
}
// resize the longest edge to 336
int new_width = width;
int new_height = height;
if (width > height) {
new_width = 336;
new_height = static_cast<int>(height * 336.0 / width);
} else {
new_height = 336;
new_width = static_cast<int>(width * 336.0 / height);
}
std::vector<uint8_t> resized_data(new_width * new_height * channels);
stbir_resize_uint8(
data,
width,
height,
0,
resized_data.data(),
new_width,
new_height,
0,
channels);
// transpose to CHW
image.data.resize(channels * new_width * new_height);
for (int i = 0; i < new_width * new_height; ++i) {
for (int c = 0; c < channels; ++c) {
image.data[c * new_width * new_height + i] =
resized_data[i * channels + c];
}
}
image.width = new_width;
image.height = new_height;
image.channels = channels;
// convert to tensor
ET_LOG(
Info,
"image Channels: %" PRId32 ", Height: %" PRId32 ", Width: %" PRId32,
image.channels,
image.height,
image.width);
stbi_image_free(data);
}

int32_t main(int32_t argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);

@@ -84,40 +130,9 @@ int32_t main(int32_t argc, char** argv) {
// create llama runner
example::LlavaRunner runner(model_path, tokenizer_path, temperature);

// read image and resize the longest edge to 336
std::vector<uint8_t> image_data;

#ifdef LLAVA_NO_TORCH_DUMMY_IMAGE
// Work without torch using a random data
image_data.resize(3 * 240 * 336);
std::fill(image_data.begin(), image_data.end(), 0); // black
std::array<int32_t, 3> image_shape = {3, 240, 336};
std::vector<Image> images = {
{.data = image_data, .width = image_shape[2], .height = image_shape[1]}};
#else // LLAVA_NO_TORCH_DUMMY_IMAGE
// cv::Mat image = cv::imread(image_path, cv::IMREAD_COLOR);
// int longest_edge = std::max(image.rows, image.cols);
// float scale_factor = 336.0f / longest_edge;
// cv::Size new_size(image.cols * scale_factor, image.rows * scale_factor);
// cv::Mat resized_image;
// cv::resize(image, resized_image, new_size);
// image_data.assign(resized_image.datastart, resized_image.dataend);
torch::Tensor image_tensor;
torch::load(image_tensor, image_path); // CHW
ET_LOG(
Info,
"image size(0): %" PRId64 ", size(1): %" PRId64 ", size(2): %" PRId64,
image_tensor.size(0),
image_tensor.size(1),
image_tensor.size(2));
image_data.assign(
image_tensor.data_ptr<uint8_t>(),
image_tensor.data_ptr<uint8_t>() + image_tensor.numel());
std::vector<Image> images = {
{.data = image_data,
.width = static_cast<int32_t>(image_tensor.size(2)),
.height = static_cast<int32_t>(image_tensor.size(1))}};
#endif // LLAVA_NO_TORCH_DUMMY_IMAGE
Image image;
load_image(image_path, image);
std::vector<Image> images = {image};

// generate
runner.generate(std::move(images), prompt, seq_len);
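The least obvious part of the new load_image above is the post-processing after decoding: the longest edge is scaled to 336 (the other edge proportionally), and the interleaved HWC bytes returned by stb are repacked into the planar CHW layout the runner's Image struct expects. Below is a minimal, stb-free sketch of just that arithmetic; the 640x480 input size and the fill values are made-up illustration data, not part of the PR.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Pretend stbi_load returned a 640x480 RGB image (interleaved HWC bytes).
  int width = 640, height = 480, channels = 3;
  std::vector<uint8_t> hwc(static_cast<size_t>(width) * height * channels, 127);

  // Longest edge -> 336, shorter edge scaled proportionally (same math as load_image).
  int new_width = width, new_height = height;
  if (width > height) {
    new_width = 336;
    new_height = static_cast<int>(height * 336.0 / width);  // 480 * 336 / 640 = 252
  } else {
    new_height = 336;
    new_width = static_cast<int>(width * 336.0 / height);
  }

  // stbir_resize_uint8 would fill this buffer; here it is only sized and zero-filled.
  std::vector<uint8_t> resized(static_cast<size_t>(new_width) * new_height * channels, 0);

  // HWC -> CHW: the byte for pixel i, channel c moves from index i * channels + c
  // to index c * (new_width * new_height) + i.
  std::vector<uint8_t> chw(resized.size());
  for (int i = 0; i < new_width * new_height; ++i) {
    for (int c = 0; c < channels; ++c) {
      chw[static_cast<size_t>(c) * new_width * new_height + i] =
          resized[static_cast<size_t>(i) * channels + c];
    }
  }

  std::printf("resized to %dx%d, CHW buffer holds %zu bytes\n",
              new_width, new_height, chw.size());
  return 0;
}
```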