add dtype support

mandalsouvik3333 · mandalsouvik3333 · commit cc84eb235228 · 2025-06-12T08:48:43.000Z
diff --git a/docext/app/app.py b/docext/app/app.py
@@ -270,6 +270,7 @@ def main(
     max_img_size: int,
     concurrency_limit: int,
     share: bool,
+    dtype: str,
 ):
     vllm_server = None
     if model_name.startswith("hosted_vllm/") and (
@@ -290,6 +291,7 @@ def main(
                 gpu_memory_utilization=gpu_memory_utilization,
                 max_num_imgs=max_num_imgs,
                 vllm_start_timeout=vllm_start_timeout,
+                dtype=dtype,
             )
             vllm_server.run_in_background()
 
@@ -356,6 +358,7 @@ def docext_app():
         args.max_img_size,
         args.concurrency_limit,
         args.share,
+        args.dtype,
     )
 
 
diff --git a/docext/app/args.py b/docext/app/args.py
@@ -79,4 +79,10 @@ def parse_args():
         default=1,
         help="Maximum number of concurrent PDF to markdown conversion requests. Higher values allow more users to process documents simultaneously but require more memory and compute resources.",
     )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="bfloat16",
+        help="Data type for the model. Can be 'bfloat16' or 'float16'.",
+    )
     return parser.parse_args()
diff --git a/docext/core/vllm.py b/docext/core/vllm.py
@@ -20,6 +20,7 @@ def __init__(
         gpu_memory_utilization: float = 0.98,
         max_num_imgs: int = 5,
         vllm_start_timeout: int = 300,
+        dtype: str = "bfloat16",
     ):
         self.host = host
         self.port = port
@@ -30,12 +31,18 @@ def __init__(
         self.server_process = None
         self.url = f"http://{self.host}:{self.port}/v1/models"
         self.vllm_start_timeout = vllm_start_timeout
+        self.dtype = dtype
+        if self.dtype not in ("bfloat16", "float16"):  # explicit raise: asserts vanish under -O
+            raise ValueError(
+                f"Invalid dtype {self.dtype!r}. Must be 'bfloat16' or 'float16'."
+            )
 
     def start_server(self):
         """Start the vLLM server in a background thread."""
         logger.info("Starting vLLM server...")
         # Command to start the vLLM server
         is_awq = "awq" in self.model_name.lower()
+        dtype = self.dtype if not is_awq else "float16"  # AWQ forces float16
         command = [
             "vllm",
             "serve",
@@ -45,7 +52,7 @@ def start_server(self):
             "--port",
             str(self.port),
             "--dtype",
-            "bfloat16" if not is_awq else "float16",
+            dtype,
             "--limit-mm-per-prompt",
             f"image={self.max_num_imgs},video=0",
             "--served-model-name",