Skip to content

Llava API #563

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Mar 13, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Llava API + LlavaTest
Preliminary
  • Loading branch information
SignalRT committed Mar 6, 2024
commit 6307a2f635153c82da05d30a0e4c21f0b74695af
2 changes: 2 additions & 0 deletions LLama.Unittest/Constants.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,7 @@
internal static class Constants
{
    /// <summary>Path to the base LLaMA chat model used by most unit tests.</summary>
    public const string ModelPath = "Models/llama-2-7b-chat.Q3_K_S.gguf";

    /// <summary>Path to the LLaVa language-model weights used by the LLaVa tests.</summary>
    public const string LLavaModelPath = "Models/llava-v1.6-mistral-7b.Q3_K_XS.gguf";

    /// <summary>Path to the LLaVa multi-modal projection (mmproj / CLIP) weights.</summary>
    public const string LLavaMmpPath = "Models/mmproj-model-f16.gguf";
}
}
5 changes: 3 additions & 2 deletions LLama.Unittest/LLama.Unittest.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,9 @@
</ItemGroup>

<Target Name="DownloadContentFiles" BeforeTargets="Build">
  <!-- NOTE: Hugging Face URLs must use "/resolve/main/" (raw file download).
       "/blob/main/" serves the HTML viewer page, so the downloaded .gguf would be junk. -->
  <DownloadFile SourceUrl="https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q3_K_S.gguf" DestinationFolder="Models" DestinationFileName="llama-2-7b-chat.Q3_K_S.gguf" SkipUnchangedFiles="true"></DownloadFile>
  <DownloadFile SourceUrl="https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/llava-v1.6-mistral-7b.Q3_K_XS.gguf" DestinationFolder="Models" DestinationFileName="llava-v1.6-mistral-7b.Q3_K_XS.gguf" SkipUnchangedFiles="true"></DownloadFile>
  <DownloadFile SourceUrl="https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/mmproj-model-f16.gguf" DestinationFolder="Models" DestinationFileName="mmproj-model-f16.gguf" SkipUnchangedFiles="true"></DownloadFile>
</Target>

<ItemGroup>
Expand Down
96 changes: 96 additions & 0 deletions LLama.Unittest/LLavaWeigthsTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
using LLama.Common;
using LLama.Native;

namespace LLama.Unittest
{
/// <summary>
/// Tests loading LLaVa weights (language model + multi-modal projection) and
/// exercising the resulting context's tokenizer.
/// </summary>
public sealed class LLavaWeightTests
    : IDisposable
{
    // Language-model weights (the LLaVa LLM itself).
    private readonly LLamaWeights _llamaWeights;
    // Multi-modal projection (mmproj / CLIP) weights.
    private readonly LLavaWeights _lLavaWeights;
    private readonly LLamaContext _context;

    public LLavaWeightTests()
    {
        // FIX: the language model must be loaded from the LLaVa *model* file.
        // The original loaded it from the multi-modal projection (mmproj/CLIP)
        // file, which is not a language model.
        var @params = new ModelParams(Constants.LLavaModelPath)
        {
            // Llava models requires big context
            ContextSize = 4096,
        };
        _llamaWeights = LLamaWeights.LoadFromFile(@params);
        _lLavaWeights = LLavaWeights.LoadFromFile(Constants.LLavaMmpPath);

        _context = _llamaWeights.CreateContext(@params);
    }

    public void Dispose()
    {
        // Dispose the context before the weights it was created from.
        _context.Dispose();
        _llamaWeights.Dispose();
        _lLavaWeights.Dispose();
    }

    [Fact]
    public void CheckProperties()
    {
        // FIX: the context was created with ContextSize = 4096 (see constructor),
        // so that is the value the property must report. 768u was copied from a
        // different test fixture.
        Assert.Equal(4096u, _context.ContextSize);
        Assert.Equal(4096, _context.EmbeddingSize);
        Assert.Equal(32000, _context.VocabCount);
    }

    [Fact]
    public void Tokenize()
    {
        // NOTE(review): token ids assume the model's 32000-entry SentencePiece
        // vocabulary — confirm against the llava-1.6-mistral tokenizer.
        var tokens = _context.Tokenize("The quick brown fox", true);

        Assert.Equal(new LLamaToken[] { 1, 450, 4996, 17354, 1701, 29916 }, tokens);
    }

    [Fact]
    public void TokenizeNewline()
    {
        var tokens = _context.Tokenize("\n", false, false);

        Assert.Equal(new LLamaToken[] { 29871, 13 }, tokens);
    }

    [Fact]
    public void TokenizeRoundtripSpecialStrings()
    {
        var strings = new[]
        {
            "\t", "\t\t", "\t\t\t",
            "\n\n", "\n\n\n", "\n\n\n\n",
            "\t\n", "\t\n\t\n\n\n\n\t\t",
            "\b", "\v", "\0"
        };

        foreach (var s in strings)
        {
            var tokens = _context.Tokenize(s, false, false);
            var decoder = new StreamingTokenDecoder(_context);
            decoder.AddRange(tokens);

            var str = decoder.Read();

            // SentencePiece may prepend a space marker; strip it for comparison.
            Assert.Equal(s, str.TrimStart(' '));
        }
    }

    [Fact]
    public void TokenizeWithoutBOS()
    {
        var tokens = _context.Tokenize("The quick brown fox", false);

        Assert.Equal(new LLamaToken[] { 450, 4996, 17354, 1701, 29916 }, tokens);
    }

    [Fact]
    public void TokenizeEmpty()
    {
        var tokens = _context.Tokenize("", false);

        Assert.Equal(Array.Empty<LLamaToken>(), tokens);
    }
}
}
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
51 changes: 51 additions & 0 deletions LLama/LLavaWeights.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@

using System;
using LLama.Native;

namespace LLama;

/// <summary>
/// A set of LLaVa multi-modal projection (CLIP) weights, used to embed images
/// into a llama context.
/// </summary>
public sealed class LLavaWeights : IDisposable
{
    /// <summary>
    /// The native handle to the underlying CLIP model.
    /// </summary>
    public SafeLlavaModelHandle NativeClipHandle { get; }

    internal LLavaWeights(SafeLlavaModelHandle weights)
    {
        NativeClipHandle = weights;
    }

    /// <summary>
    /// Load multi-modal projection weights from a file.
    /// </summary>
    /// <param name="mmProject">Path to the multi-modal projection (mmproj) file</param>
    /// <param name="verbosity">Native logging verbosity. Defaults to 1, matching the previously hard-coded value.</param>
    /// <returns>The loaded weights</returns>
    public static LLavaWeights LoadFromFile(string mmProject, int verbosity = 1)
    {
        var weights = SafeLlavaModelHandle.LoadFromFile(mmProject, verbosity);
        return new LLavaWeights(weights);
    }

    /// <summary>
    /// Embed the image from file into llama context
    /// </summary>
    /// <param name="ctxLlama">The llama context to write the image embedding into</param>
    /// <param name="Image">Path to the image file</param>
    /// <param name="n_past">Set to the next context position after the image embed</param>
    /// <returns>True if the image was successfully embedded</returns>
    public bool EmbedImage(LLamaContext ctxLlama, string Image, out int n_past)
    {
        return NativeClipHandle.EmbedImage(ctxLlama, Image, out n_past);
    }

    /// <summary>
    /// Embed the image from binary into llama context.
    /// </summary>
    /// <param name="ctxLlama">The llama context to write the image embedding into</param>
    /// <param name="Image">Raw image file bytes</param>
    /// <param name="n_past">Set to the next context position after the image embed</param>
    /// <returns>True if the image was successfully embedded</returns>
    public bool EmbedImage(LLamaContext ctxLlama, byte[] Image, out int n_past)
    {
        return NativeClipHandle.EmbedImage(ctxLlama, Image, out n_past);
    }

    /// <inheritdoc />
    public void Dispose()
    {
        NativeClipHandle.Dispose();
    }
}
103 changes: 103 additions & 0 deletions LLama/Native/NativeApi.LLava.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
using System;
using System.Runtime.InteropServices;

namespace LLama.Native;

using clip_ctx = IntPtr;
public static unsafe partial class NativeApi
{
    /// <summary>
    /// LLaVa Image embeddings. Mirrors the native llava_image_embed struct.
    /// </summary>
    [StructLayout(LayoutKind.Sequential)]
    public struct llava_image_embed
    {
        /// <summary>Pointer to the native embedding buffer (owned by the native side).</summary>
        public float* embed;
        /// <summary>Number of image positions occupied by the embedding.</summary>
        public int n_image_pos;
    }

    /// <summary>
    /// Load MULTI MODAL PROJECTIONS model / Clip Model
    /// </summary>
    /// <param name="mmProj">Model path/file</param>
    /// <param name="verbosity">Verbosity level</param>
    /// <returns>Native clip context pointer, or IntPtr.Zero on failure</returns>
    [DllImport(llavaLibraryName, EntryPoint = "clip_model_load", CallingConvention = CallingConvention.Cdecl)]
    public static extern clip_ctx clip_model_load(string mmProj, int verbosity);

    /// <summary>
    /// Frees MULTI MODAL PROJECTIONS model / Clip Model
    /// </summary>
    /// <param name="ctx">Clip context pointer returned by clip_model_load</param>
    [DllImport(llavaLibraryName, EntryPoint = "clip_free", CallingConvention = CallingConvention.Cdecl)]
    public static extern void clip_free(clip_ctx ctx);

    /// <summary>
    /// Sanity check for clip &lt;-&gt; llava embed size match
    /// </summary>
    /// <param name="ctxLlama">Llama context</param>
    /// <param name="ctxClip">Clip context</param>
    /// <returns>True if the embedding sizes match</returns>
    // NOTE(review): native C `bool` is 1 byte; the default .NET marshaling treats
    // bool as a 4-byte BOOL, so mark the return explicitly — confirm against the
    // native build.
    [DllImport(llavaLibraryName, EntryPoint = "llava_validate_embed_size", CallingConvention = CallingConvention.Cdecl)]
    [return: MarshalAs(UnmanagedType.U1)]
    public static extern bool llava_validate_embed_size(SafeLLamaContextHandle ctxLlama, clip_ctx ctxClip);

    /// <summary>
    /// Build an image embed from image file bytes
    /// </summary>
    /// <param name="ctx_clip">Clip context</param>
    /// <param name="n_threads">Number of threads to use</param>
    /// <param name="image_bytes">Raw image file bytes (e.g. a JPEG/PNG file read into memory)</param>
    /// <param name="image_bytes_length">Length of <paramref name="image_bytes"/></param>
    /// <returns>Pointer to a native embedding; free with llava_image_embed_free</returns>
    [DllImport(llavaLibraryName, EntryPoint = "llava_image_embed_make_with_bytes",
        CallingConvention = CallingConvention.Cdecl)]
    public static extern llava_image_embed* llava_image_embed_make_with_bytes(clip_ctx ctx_clip, int n_threads,
        byte[] image_bytes, int image_bytes_length);

    /// <summary>
    /// Build an image embed from a path to an image filename
    /// </summary>
    /// <param name="ctx_clip">Clip context</param>
    /// <param name="n_threads">Number of threads to use</param>
    /// <param name="image_path">Path to the image file</param>
    /// <returns>Pointer to a native embedding; free with llava_image_embed_free</returns>
    [DllImport(llavaLibraryName, EntryPoint = "llava_image_embed_make_with_filename",
        CallingConvention = CallingConvention.Cdecl)]
    public static extern llava_image_embed* llava_image_embed_make_with_filename(clip_ctx ctx_clip, int n_threads,
        [MarshalAs(UnmanagedType.LPStr)] string image_path);

    /// <summary>
    /// Free an embedding made with llava_image_embed_make_*
    /// </summary>
    /// <param name="embed">Embedding to free</param>
    // FIX: the native function returns void; the previous declaration returned a
    // pointer, which reads an undefined value back from the native call.
    [DllImport(llavaLibraryName, EntryPoint = "llava_image_embed_free", CallingConvention = CallingConvention.Cdecl)]
    public static extern void llava_image_embed_free(llava_image_embed* embed);

    /// <summary>
    /// Write the image represented by embed into the llama context with batch size n_batch, starting at context
    /// pos n_past. on completion, n_past points to the next position in the context after the image embed.
    /// </summary>
    /// <param name="ctx_llama">Llama context to write the embedding into</param>
    /// <param name="embed">Image embedding to evaluate</param>
    /// <param name="n_batch">Batch size</param>
    /// <param name="n_past">Updated to the next context position after the image embed</param>
    /// <returns>True on success</returns>
    [DllImport(llavaLibraryName, EntryPoint = "llava_eval_image_embed", CallingConvention = CallingConvention.Cdecl)]
    [return: MarshalAs(UnmanagedType.U1)]
    public static extern bool llava_eval_image_embed(SafeLLamaContextHandle ctx_llama, llava_image_embed* embed,
        int n_batch, out int n_past);
}
1 change: 1 addition & 0 deletions LLama/Native/NativeApi.Load.cs
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,7 @@ string TryFindPath(string filename)
}

// Base name of the native llama library; resolved to a platform-specific file at load time.
internal const string libraryName = "llama";
// Base name of the native llava shared library (multi-modal / CLIP entry points).
internal const string llavaLibraryName = "llava_shared";
// File read to determine which CUDA version the bundled binaries target.
private const string cudaVersionFile = "version.json";
// Prefix applied to native-loader log messages.
private const string loggingPrefix = "[LLamaSharp Native]";
// Whether native-loader logging is enabled; off by default.
private static bool enableLogging = false;
Expand Down
92 changes: 92 additions & 0 deletions LLama/Native/SafeLlavaModelHandle.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using LLama;
using LLama.Exceptions;


namespace LLama.Native
{
/// <summary>
/// A reference to a set of llava model weights
/// </summary>
/// <summary>
/// A reference to a set of llava model weights
/// </summary>
public sealed class SafeLlavaModelHandle
    : SafeLLamaHandleBase
{
    // `protected` is meaningless on a sealed class, so `internal` alone suffices
    // (the original `internal protected` triggers a compiler warning).
    internal SafeLlavaModelHandle(IntPtr handle)
        : base(handle, true)
    {
    }

    /// <inheritdoc />
    protected override bool ReleaseHandle()
    {
        NativeApi.clip_free(DangerousGetHandle());
        SetHandle(IntPtr.Zero);
        return true;
    }

    /// <summary>
    /// Load a model from the given file path into memory
    /// </summary>
    /// <param name="modelPath">Path to the multi-modal projection (mmproj/CLIP) model file</param>
    /// <param name="verbosity">Native logging verbosity</param>
    /// <returns>A safe handle owning the loaded model</returns>
    /// <exception cref="RuntimeError">Thrown when the native loader returns a null pointer</exception>
    public static SafeLlavaModelHandle LoadFromFile(string modelPath, int verbosity)
    {
        var ctxContext = NativeApi.clip_model_load(modelPath, verbosity);
        if (ctxContext == IntPtr.Zero)
            throw new RuntimeError($"Failed to load LLaVa model {modelPath}.");

        return new SafeLlavaModelHandle(ctxContext);
    }

    /// <summary>
    /// Build an image embedding from a file and release it immediately.
    /// </summary>
    /// <param name="imagePath">Path to the image file</param>
    /// <param name="threads">Number of threads to use for the embedding</param>
    public void LoadImage(string imagePath, int threads)
    {
        unsafe
        {
            // FIX: the original discarded the returned native pointer, leaking the
            // embedding. Free it here since this method does not hand it to a caller.
            var embed = NativeApi.llava_image_embed_make_with_filename(this.handle, threads, imagePath);
            if (embed != null)
                NativeApi.llava_image_embed_free(embed);
        }
    }

    /// <summary>
    /// Embed the image from file in llama context
    /// </summary>
    /// <param name="ctxLlama">The llama context to write the image embedding into</param>
    /// <param name="image">Path to the image file</param>
    /// <param name="n_past">Set to the next context position after the image embed</param>
    /// <returns>True if the image was successfully embedded</returns>
    public bool EmbedImage(LLamaContext ctxLlama, string image, out int n_past)
    {
        unsafe
        {
            var ptrImageEmbed = NativeApi.llava_image_embed_make_with_filename(this.handle, (int)ctxLlama.Params.Threads, image);
            // Guard against a failed native decode rather than passing null to eval.
            if (ptrImageEmbed == null)
            {
                n_past = 0;
                return false;
            }
            bool result = NativeApi.llava_eval_image_embed(ctxLlama.NativeHandle, ptrImageEmbed, (int)ctxLlama.Params.BatchSize, out n_past);
            NativeApi.llava_image_embed_free(ptrImageEmbed);
            return result;
        }
    }

    /// <summary>
    /// Embed the image from binary in llama context
    /// </summary>
    /// <param name="ctxLlama">The llama context to write the image embedding into</param>
    /// <param name="image">Raw image file bytes</param>
    /// <param name="n_past">Set to the next context position after the image embed</param>
    /// <returns>True if the image was successfully embedded</returns>
    public bool EmbedImage(LLamaContext ctxLlama, byte[] image, out int n_past)
    {
        unsafe
        {
            // FIX: pass the byte[] directly — the original's image.ToArray() made a
            // pointless full copy of the buffer.
            var ptrImageEmbed = NativeApi.llava_image_embed_make_with_bytes(this.handle, (int)ctxLlama.Params.Threads, image, image.Length);
            if (ptrImageEmbed == null)
            {
                n_past = 0;
                return false;
            }
            bool result = NativeApi.llava_eval_image_embed(ctxLlama.NativeHandle, ptrImageEmbed, (int)ctxLlama.Params.BatchSize, out n_past);
            NativeApi.llava_image_embed_free(ptrImageEmbed);
            return result;
        }
    }
}
}