feat(vllm): add support for video-to-text

mudler · mudler · commit f3f9d1df81e4 · 2024-10-04T19:27:57.000+02:00
Closes: #2318 Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
@@ -18,6 +18,7 @@
 from vllm.utils import random_uuid
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.multimodal.utils import fetch_image
+from vllm.assets.video import VideoAsset
 
 _ONE_DAY_IN_SECONDS = 60 * 60 * 24
 
@@ -202,19 +203,27 @@ async def _predict(self, request, context, streaming=False):
 
         # Extract image paths and process images
         prompt = request.Prompt
+
         image_paths = request.Images
         image_data = [self.load_image(img_path) for img_path in image_paths]
 
+        videos_path = request.Videos
+        video_data = [self.load_video(video_path) for video_path in videos_path]
+
         # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
         if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
             prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
 
         # Generate text using the LLM engine
         request_id = random_uuid()
+        print(f"Generating text with request_id: {request_id}", file=sys.stderr)
         outputs = self.llm.generate(
             {
                 "prompt": prompt,
-                "multi_modal_data": {"image": image_data} if image_data else None,
+                "multi_modal_data": {
+                    "image": image_data if image_data else None,
+                    "video": video_data if video_data else None,
+                } if image_data or video_data else None,
             },
             sampling_params=sampling_params,
             request_id=request_id,
@@ -251,7 +260,7 @@ async def _predict(self, request, context, streaming=False):
         # Sending the final generated text
         yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
 
-    def load_image(self, image_path: str) -> Image:
+    def load_image(self, image_path: str):
         """
         Load an image from the given file path.
         
@@ -265,6 +274,23 @@ def load_image(self, image_path: str) -> Image:
             return Image.open(image_path)
         except Exception as e:
             print(f"Error loading image {image_path}: {e}", file=sys.stderr)
+            return self.load_video(image_path)
+
+    def load_video(self, video_path: str):
+        """
+        Load a video from the given file path.
+
+        Args:
+            video_path (str): The path to the video file.
+
+        Returns:
+            The value of VideoAsset(...).np_ndarrays, or None if loading fails.
+        """
+        try:
+            # NOTE(review): VideoAsset lives in vllm.assets; confirm it accepts arbitrary paths.
+            return VideoAsset(name=video_path).np_ndarrays
+        except Exception as e:
+            print(f"Error loading video {video_path}: {e}", file=sys.stderr)
             return None
 
 async def serve(address):