Skip to content

Llava 1.6 support #5267

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Feb 14, 2024
Merged
Changes from 1 commit
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
10c830c
Create llava-survery-v2.py
cmp-nct Feb 1, 2024
97dda1e
Update convert-image-encoder-to-gguf.py
cmp-nct Feb 1, 2024
8ebdaec
Update convert-image-encoder-to-gguf.py
cmp-nct Feb 1, 2024
1f9367c
Rename llava-survery-v2.py to llava-surgery-v2.py
cmp-nct Feb 1, 2024
a27b9a4
Update convert-image-encoder-to-gguf.py
cmp-nct Feb 2, 2024
440b2ae
Update convert-image-encoder-to-gguf.py
cmp-nct Feb 2, 2024
35b7a7a
Update llava-surgery-v2.py
cmp-nct Feb 2, 2024
37a147e
Clip: Bugfix for normalization (it did not loat the 3 std and mean va…
cmp-nct Feb 8, 2024
7dcadb4
whitespace corrections
Feb 11, 2024
7107b90
ws
Feb 11, 2024
51e60c9
Tensors are now properly permuted.
Feb 12, 2024
60c5f46
ws
Feb 12, 2024
0dd6c9d
added verbose_prompt support into cli
Feb 12, 2024
3a72267
moved llava functions to llava.cpp, made clip.h C compatible API, rep…
Feb 12, 2024
07f5cd7
ws
Feb 12, 2024
6b8d69b
convert : skip unknown tensors (need for LLaVA)
ggerganov Feb 13, 2024
a284885
llava : update readme
ggerganov Feb 13, 2024
65ec518
llava : fix compile warnings
ggerganov Feb 13, 2024
a20c071
Merge remote-tracking branch 'origin/master' into HEAD
ggerganov Feb 13, 2024
997dd1f
llava : style
ggerganov Feb 13, 2024
9d166b0
convert : add --skip-unknown CLI arg
ggerganov Feb 13, 2024
c92431a
server : remove clip structs
ggerganov Feb 13, 2024
c9874dd
bugfix for non llava-1.6
cmp-nct Feb 14, 2024
7974ff7
clip : minor code rearrange
ggerganov Feb 14, 2024
6727cfd
llava : update readme a bit
ggerganov Feb 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
clip : minor code rearrange
  • Loading branch information
ggerganov committed Feb 14, 2024
commit 7974ff7f027739b108927acc1eb540076fadfb6d
221 changes: 110 additions & 111 deletions examples/llava/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
#include <cinttypes>
#include <limits>

// #define CLIP_DEBUG_FUNCTIONS
//#define CLIP_DEBUG_FUNCTIONS

// RGB uint8 image
struct clip_image_u8 {
Expand Down Expand Up @@ -258,6 +258,114 @@ static projector_type clip_projector_type_from_string(const std::string & name)
return PROJECTOR_TYPE_UNKNOWN;
}

#ifdef CLIP_DEBUG_FUNCTIONS
// Debug helper: writes an RGB uint8 image to `filename` as a binary PPM (P6) file.
//   img      - image to dump; nx/ny go into the header as width/height, buf is written as the pixel data
//   filename - output path; the file is created/truncated
// If the file cannot be opened, an error is printed to stderr and nothing is written.
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
        std::cerr << "Failed to open file for writing: " << filename << std::endl;
        return;
    }

    // PPM header: P6 format, width, height, and max color value
    file << "P6\n" << img.nx << " " << img.ny << "\n255\n";

    // PPM expects binary data in RGB format, which matches our image buffer, so the
    // whole buffer can be written in one call. Only complete 3-byte pixels are written:
    // writing 3 bytes at every offset i < size would read past the end of the buffer
    // whenever its size is not a multiple of 3.
    const size_t n_bytes = img.buf.size() - img.buf.size() % 3;
    file.write(reinterpret_cast<const char*>(img.buf.data()), static_cast<std::streamsize>(n_bytes));

    file.close();
}

// Debug helper: writes an RGB uint8 image to `filename` as an uncompressed 24-bit BMP.
//   img      - image to dump; buf is indexed as (y * nx + x) * 3 with R, G, B byte order
//   filename - output path; the file is created/truncated
// Errors (pixel buffer too small, file cannot be opened) are reported on stderr and
// the function returns without writing pixel data.
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
    // Bail out instead of reading past the end of the pixel buffer when it holds
    // fewer than 3 * nx * ny bytes.
    if (img.nx > 0 && img.ny > 0 &&
        img.buf.size() < static_cast<size_t>(3) * static_cast<size_t>(img.nx) * static_cast<size_t>(img.ny)) {
        std::cerr << "Image buffer is smaller than 3 * nx * ny bytes, not writing: " << filename << std::endl;
        return;
    }

    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
        std::cerr << "Failed to open file for writing: " << filename << std::endl;
        return;
    }

    const int bytesPerPixel = 3;
    const int widthInBytes  = img.nx * bytesPerPixel;
    const int paddingAmount = (4 - (widthInBytes % 4)) % 4; // each row is padded to a multiple of 4 bytes
    const int stride        = widthInBytes + paddingAmount;

    // Total file size: 14-byte file header + 40-byte info header + padded pixel rows
    const int fileSize = 54 + (stride * img.ny);

    // Stores a 32-bit value least-significant byte first, as the BMP headers require
    const auto put_le32 = [](unsigned char * dst, int value) {
        dst[0] = (unsigned char)(value);
        dst[1] = (unsigned char)(value >> 8);
        dst[2] = (unsigned char)(value >> 16);
        dst[3] = (unsigned char)(value >> 24);
    };

    // Bitmap file header
    unsigned char fileHeader[14] = {
        'B','M',     // Signature
        0,0,0,0,     // Image file size in bytes
        0,0,0,0,     // Reserved
        54,0,0,0     // Start of pixel array
    };
    put_le32(&fileHeader[2], fileSize);

    // Bitmap information header (BITMAPINFOHEADER)
    unsigned char infoHeader[40] = {
        40,0,0,0,    // Size of this header (40 bytes)
        0,0,0,0,     // Image width
        0,0,0,0,     // Image height
        1,0,         // Number of color planes
        24,0,        // Bits per pixel
        0,0,0,0,     // No compression
        0,0,0,0,     // Image size (can be 0 for no compression)
        0,0,0,0,     // X pixels per meter (not specified)
        0,0,0,0,     // Y pixels per meter (not specified)
        0,0,0,0,     // Total colors (color table not used)
        0,0,0,0      // Important colors (all are important)
    };

    // Width and height in the information header
    put_le32(&infoHeader[4], img.nx);
    put_le32(&infoHeader[8], img.ny);

    // Write file headers
    file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
    file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader));

    // Pixel data
    std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
    for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
        for (int x = 0; x < img.nx; ++x) {
            // Index arithmetic is done in size_t so large images cannot overflow int
            const size_t pixelIndex = (static_cast<size_t>(y) * static_cast<size_t>(img.nx) + static_cast<size_t>(x)) * 3;
            unsigned char pixel[3] = {
                img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
                img.buf[pixelIndex + 1],
                img.buf[pixelIndex]
            };
            file.write(reinterpret_cast<char*>(pixel), 3);
        }
        // Write padding for the row
        file.write(reinterpret_cast<char*>(padding.data()), paddingAmount);
    }

    file.close();
}

// debug function to convert f32 to u8
//   src - float image; each element is scaled by 255 and clamped to [0, 255]
//   dst - receives src's dimensions and a buffer resized to 3 * nx * ny bytes
// Elements of src.buf beyond dst's capacity are ignored, and any dst bytes without a
// matching source element keep whatever resize() left in them.
static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
    dst.nx = src.nx;
    dst.ny = src.ny;
    dst.buf.resize(3 * src.nx * src.ny);

    // Never write past dst.buf, even if src.buf holds more than 3 * nx * ny elements
    const size_t n = std::min<size_t>(src.buf.size(), dst.buf.size());
    for (size_t i = 0; i < n; ++i) {
        const float scaled = src.buf[i] * 255.0f;
        // Clamp before converting: casting NaN or an out-of-range float to an integer
        // is undefined behavior. NaN fails the `> 0` test and therefore maps to 0.
        uint8_t value;
        if (!(scaled > 0.0f)) {
            value = 0;
        } else if (scaled >= 255.0f) {
            value = 255;
        } else {
            value = static_cast<uint8_t>(scaled); // truncates toward zero
        }
        dst.buf[i] = value;
    }
}
#endif


//
// clip layers
Expand All @@ -274,7 +382,7 @@ struct clip_hparams {

float eps;

char mm_patch_merge_type[32]="flat"; // spatial_unpad or flat (default)
char mm_patch_merge_type[32] = "flat"; // spatial_unpad or flat (default)

int32_t image_grid_pinpoints[32];
int32_t image_crop_resolution;
Expand Down Expand Up @@ -1156,103 +1264,6 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
return true;
}

#ifdef CLIP_DEBUG_FUNCTIONS
// Debug helper: writes an RGB uint8 image to `filename` as a binary PPM (P6) file.
//   img      - image to dump; nx/ny go into the header as width/height, buf is written as the pixel data
//   filename - output path; the file is created/truncated
// If the file cannot be opened, an error is printed to stderr and nothing is written.
void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
        std::cerr << "Failed to open file for writing: " << filename << std::endl;
        return;
    }

    // PPM header: P6 format, width, height, and max color value
    file << "P6\n" << img.nx << " " << img.ny << "\n255\n";

    // PPM expects binary data in RGB format, which matches our image buffer, so the
    // whole buffer can be written in one call. Only complete 3-byte pixels are written:
    // writing 3 bytes at every offset i < size would read past the end of the buffer
    // whenever its size is not a multiple of 3.
    const size_t n_bytes = img.buf.size() - img.buf.size() % 3;
    file.write(reinterpret_cast<const char*>(img.buf.data()), static_cast<std::streamsize>(n_bytes));

    file.close();
}
// Debug helper: writes an RGB uint8 image to `filename` as an uncompressed 24-bit BMP.
//   img      - image to dump; buf is indexed as (y * nx + x) * 3 with R, G, B byte order
//   filename - output path; the file is created/truncated
// Errors (pixel buffer too small, file cannot be opened) are reported on stderr and
// the function returns without writing pixel data.
void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
    // Bail out instead of reading past the end of the pixel buffer when it holds
    // fewer than 3 * nx * ny bytes.
    if (img.nx > 0 && img.ny > 0 &&
        img.buf.size() < static_cast<size_t>(3) * static_cast<size_t>(img.nx) * static_cast<size_t>(img.ny)) {
        std::cerr << "Image buffer is smaller than 3 * nx * ny bytes, not writing: " << filename << std::endl;
        return;
    }

    std::ofstream file(filename, std::ios::binary);
    if (!file.is_open()) {
        std::cerr << "Failed to open file for writing: " << filename << std::endl;
        return;
    }

    const int bytesPerPixel = 3;
    const int widthInBytes  = img.nx * bytesPerPixel;
    const int paddingAmount = (4 - (widthInBytes % 4)) % 4; // each row is padded to a multiple of 4 bytes
    const int stride        = widthInBytes + paddingAmount;

    // Total file size: 14-byte file header + 40-byte info header + padded pixel rows
    const int fileSize = 54 + (stride * img.ny);

    // Stores a 32-bit value least-significant byte first, as the BMP headers require
    const auto put_le32 = [](unsigned char * dst, int value) {
        dst[0] = (unsigned char)(value);
        dst[1] = (unsigned char)(value >> 8);
        dst[2] = (unsigned char)(value >> 16);
        dst[3] = (unsigned char)(value >> 24);
    };

    // Bitmap file header
    unsigned char fileHeader[14] = {
        'B','M',     // Signature
        0,0,0,0,     // Image file size in bytes
        0,0,0,0,     // Reserved
        54,0,0,0     // Start of pixel array
    };
    put_le32(&fileHeader[2], fileSize);

    // Bitmap information header (BITMAPINFOHEADER)
    unsigned char infoHeader[40] = {
        40,0,0,0,    // Size of this header (40 bytes)
        0,0,0,0,     // Image width
        0,0,0,0,     // Image height
        1,0,         // Number of color planes
        24,0,        // Bits per pixel
        0,0,0,0,     // No compression
        0,0,0,0,     // Image size (can be 0 for no compression)
        0,0,0,0,     // X pixels per meter (not specified)
        0,0,0,0,     // Y pixels per meter (not specified)
        0,0,0,0,     // Total colors (color table not used)
        0,0,0,0      // Important colors (all are important)
    };

    // Width and height in the information header
    put_le32(&infoHeader[4], img.nx);
    put_le32(&infoHeader[8], img.ny);

    // Write file headers
    file.write(reinterpret_cast<char*>(fileHeader), sizeof(fileHeader));
    file.write(reinterpret_cast<char*>(infoHeader), sizeof(infoHeader));

    // Pixel data
    std::vector<unsigned char> padding(3, 0); // Max padding size to be added to each row
    for (int y = img.ny - 1; y >= 0; --y) { // BMP files are stored bottom-to-top
        for (int x = 0; x < img.nx; ++x) {
            // Index arithmetic is done in size_t so large images cannot overflow int
            const size_t pixelIndex = (static_cast<size_t>(y) * static_cast<size_t>(img.nx) + static_cast<size_t>(x)) * 3;
            unsigned char pixel[3] = {
                img.buf[pixelIndex + 2], // BMP stores pixels in BGR format
                img.buf[pixelIndex + 1],
                img.buf[pixelIndex]
            };
            file.write(reinterpret_cast<char*>(pixel), 3);
        }
        // Write padding for the row
        file.write(reinterpret_cast<char*>(padding.data()), paddingAmount);
    }

    file.close();
}
#endif

// Linear interpolation between two points
inline float lerp(float s, float e, float t) {
return s + (e - s) * t;
Expand Down Expand Up @@ -1469,18 +1480,6 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
return patches;
}

#ifdef CLIP_DEBUG_FUNCTIONS
// debug function to convert f32 to u8
//   src - float image; each element is scaled by 255 and clamped to [0, 255]
//   dst - receives src's dimensions and a buffer resized to 3 * nx * ny bytes
// Elements of src.buf beyond dst's capacity are ignored, and any dst bytes without a
// matching source element keep whatever resize() left in them.
static void clip_image_convert_f32_to_u8(const clip_image_f32& src, clip_image_u8& dst) {
    dst.nx = src.nx;
    dst.ny = src.ny;
    dst.buf.resize(3 * src.nx * src.ny);

    // Never write past dst.buf, even if src.buf holds more than 3 * nx * ny elements
    const size_t n = std::min<size_t>(src.buf.size(), dst.buf.size());
    for (size_t i = 0; i < n; ++i) {
        const float scaled = src.buf[i] * 255.0f;
        // Clamp before converting: casting NaN or an out-of-range float to an integer
        // is undefined behavior. NaN fails the `> 0` test and therefore maps to 0.
        uint8_t value;
        if (!(scaled > 0.0f)) {
            value = 0;
        } else if (scaled >= 255.0f) {
            value = 255;
        } else {
            value = static_cast<uint8_t>(scaled); // truncates toward zero
        }
        dst.buf[i] = value;
    }
}
#endif

// returns the normalized float tensor for llava-1.5; for llava-1.6 (spatial_unpad with anyres processing) it returns the normalized image patch tensors as a vector
// res_imgs memory is allocated here; previous allocations are freed if found
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) {
Expand Down