Skip to content

Commit f3f9d1d

Browse files
committed
feat(vllm): add support for video-to-text
Closes: #2318 Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent 536434e commit f3f9d1d

File tree

1 file changed

+28
-2
lines changed

1 file changed

+28
-2
lines changed

backend/python/vllm/backend.py

+28-2
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from vllm.utils import random_uuid
1919
from vllm.transformers_utils.tokenizer import get_tokenizer
2020
from vllm.multimodal.utils import fetch_image
21+
from vllm.assets.video import VideoAsset
2122

2223
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
2324

@@ -202,19 +203,27 @@ async def _predict(self, request, context, streaming=False):
202203

203204
# Extract image paths and process images
204205
prompt = request.Prompt
206+
205207
image_paths = request.Images
206208
image_data = [self.load_image(img_path) for img_path in image_paths]
207209

210+
videos_path = request.Videos
211+
video_data = [self.load_video(video_path) for video_path in videos_path]
212+
208213
# If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
209214
if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
210215
prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)
211216

212217
# Generate text using the LLM engine
213218
request_id = random_uuid()
219+
print(f"Generating text with request_id: {request_id}", file=sys.stderr)
214220
outputs = self.llm.generate(
215221
{
216222
"prompt": prompt,
217-
"multi_modal_data": {"image": image_data} if image_data else None,
223+
"multi_modal_data": {
224+
"image": image_data if image_data else None,
225+
"video": video_data if video_data else None,
226+
} if image_data or video_data else None,
218227
},
219228
sampling_params=sampling_params,
220229
request_id=request_id,
@@ -251,7 +260,7 @@ async def _predict(self, request, context, streaming=False):
251260
# Sending the final generated text
252261
yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))
253262

254-
def load_image(self, image_path: str) -> Image:
263+
def load_image(self, image_path: str):
255264
"""
256265
Load an image from the given file path.
257266
@@ -265,6 +274,23 @@ def load_image(self, image_path: str) -> Image:
265274
return Image.open(image_path)
266275
except Exception as e:
267276
print(f"Error loading image {image_path}: {e}", file=sys.stderr)
277+
return self.load_video(image_path)
278+
279+
def load_video(self, video_path: str):
    """
    Load a video from the given file path.

    Args:
        video_path (str): The path (or asset name) of the video to load;
            it is passed unchanged as ``name`` to ``VideoAsset``.

    Returns:
        The value of ``VideoAsset(name=video_path).np_ndarrays``
        (presumably the decoded frames as a NumPy array -- confirm against
        the vLLM version in use), or ``None`` if loading fails.
    """
    try:
        return VideoAsset(name=video_path).np_ndarrays
    except Exception as e:
        # Best-effort loader: report the failure and let the caller decide
        # what to do with a missing video. The message must reference
        # ``video_path``; the previous ``image_path`` is not defined in this
        # method and turned every load failure into a NameError.
        print(f"Error loading video {video_path}: {e}", file=sys.stderr)
        return None
269295

270296
async def serve(address):

0 commit comments

Comments
 (0)