18
18
from vllm .utils import random_uuid
19
19
from vllm .transformers_utils .tokenizer import get_tokenizer
20
20
from vllm .multimodal .utils import fetch_image
21
+ from vllm .assets .video import VideoAsset
21
22
22
23
# Number of seconds in one day (60 s * 60 min * 24 h).
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
23
24
@@ -202,19 +203,27 @@ async def _predict(self, request, context, streaming=False):
202
203
203
204
# Extract image paths and process images
204
205
prompt = request .Prompt
206
+
205
207
image_paths = request .Images
206
208
image_data = [self .load_image (img_path ) for img_path in image_paths ]
207
209
210
+ videos_path = request .Videos
211
+ video_data = [self .load_video (video_path ) for video_path in videos_path ]
212
+
208
213
# If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
209
214
if not request .Prompt and request .UseTokenizerTemplate and request .Messages :
210
215
prompt = self .tokenizer .apply_chat_template (request .Messages , tokenize = False , add_generation_prompt = True )
211
216
212
217
# Generate text using the LLM engine
213
218
request_id = random_uuid ()
219
+ print (f"Generating text with request_id: { request_id } " , file = sys .stderr )
214
220
outputs = self .llm .generate (
215
221
{
216
222
"prompt" : prompt ,
217
- "multi_modal_data" : {"image" : image_data } if image_data else None ,
223
+ "multi_modal_data" : {
224
+ "image" : image_data if image_data else None ,
225
+ "video" : video_data if video_data else None ,
226
+ } if image_data or video_data else None ,
218
227
},
219
228
sampling_params = sampling_params ,
220
229
request_id = request_id ,
@@ -251,7 +260,7 @@ async def _predict(self, request, context, streaming=False):
251
260
# Sending the final generated text
252
261
yield backend_pb2 .Reply (message = bytes (generated_text , encoding = 'utf-8' ))
253
262
254
- def load_image (self , image_path : str ) -> Image :
263
+ def load_image (self , image_path : str ):
255
264
"""
256
265
Load an image from the given file path.
257
266
@@ -265,6 +274,23 @@ def load_image(self, image_path: str) -> Image:
265
274
return Image .open (image_path )
266
275
except Exception as e :
267
276
print (f"Error loading image { image_path } : { e } " , file = sys .stderr )
277
+ return self .load_video (image_path )
278
+
279
def load_video(self, video_path: str):
    """
    Load a video from the given file path.

    Args:
        video_path (str): The path to the video file.

    Returns:
        The ``np_ndarrays`` value of the ``VideoAsset`` built from
        ``video_path``, or None if loading fails for any reason.
    """
    try:
        # NOTE(review): video_path is passed as VideoAsset's ``name``;
        # confirm the installed vLLM version accepts arbitrary paths there.
        return VideoAsset(name=video_path).np_ndarrays
    except Exception as e:
        # Best-effort load: report the failure and hand None to the caller.
        # The message uses video_path; the earlier reference to the
        # undefined name image_path raised NameError inside this handler
        # and hid the real error.
        print(f"Error loading video {video_path}: {e}", file=sys.stderr)
        return None
269
295
270
296
async def serve (address ):
0 commit comments