Merge pull request #1442 from roboflow/florence-2-lora

hansent · web-flow · commit 0616b1111621 · 2025-07-28T14:04:36.000-05:00
inference-exp:  Florence 2 LoRA
diff --git a/inference_experimental/inference_exp/models/auto_loaders/models_registry.py b/inference_experimental/inference_exp/models/auto_loaders/models_registry.py
@@ -121,6 +121,10 @@
         module_name="inference_exp.models.paligemma.paligemma_hf",
         class_name="PaliGemmaHF",
     ),
+    ("florence-2", VLM_TASK, BackendType.HF): LazyClass(
+        module_name="inference_exp.models.florence2.florence2_hf",
+        class_name="Florence2HF",
+    ),
     ("clip", EMBEDDING_TASK, BackendType.TORCH): LazyClass(
         module_name="inference_exp.models.clip.clip_pytorch",
         class_name="ClipTorch",
diff --git a/inference_experimental/inference_exp/models/florence2/florence2_hf.py b/inference_experimental/inference_exp/models/florence2/florence2_hf.py
@@ -1,8 +1,10 @@
 from typing import List, Literal, Optional, Tuple, Union
+import os
 
 import cv2
 import numpy as np
 import torch
+from peft import LoraConfig, PeftModel
 from inference_exp import Detections, InstanceDetections
 from inference_exp.configuration import DEFAULT_DEVICE
 from inference_exp.entities import ImageDimensions
@@ -18,9 +20,9 @@
     "very_detailed": "<MORE_DETAILED_CAPTION>",
 }
 LABEL_MODE2TASK = {
-    "roi": "<REGION_PROPOSAL>",
-    "class": "<OD>",
-    "caption": "<DENSE_REGION_CAPTION>",
+    "rois": "<REGION_PROPOSAL>",
+    "classes": "<OD>",
+    "captions": "<DENSE_REGION_CAPTION>",
 }
 LOC_BINS = 1000
 
@@ -35,15 +37,36 @@ def from_pretrained(
         **kwargs,
     ) -> "Florence2HF":
         torch_dtype = torch.float16 if device.type == "cuda" else torch.float32
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name_or_path,
-            torch_dtype=torch_dtype,
-            trust_remote_code=True,
-        ).to(device)
-        processor = AutoProcessor.from_pretrained(
-            model_name_or_path,
-            trust_remote_code=True,
-        )
+
+        # A package that ships "adapter_config.json" is treated as a LoRA
+        # adapter; its base weights are read from the "base" subdirectory.
+        adapter_config_path = os.path.join(model_name_or_path, "adapter_config.json")
+        if os.path.exists(adapter_config_path):
+            base_model_path = os.path.join(model_name_or_path, "base")
+            model = AutoModelForCausalLM.from_pretrained(
+                base_model_path,
+                torch_dtype=torch_dtype,
+                trust_remote_code=True,
+                local_files_only=True,
+            )
+            model = PeftModel.from_pretrained(model, model_name_or_path)
+            # merge_and_unload() returns the base model with the adapter weights
+            # folded in; rebind it so the PEFT wrapper is dropped.
+            model = model.merge_and_unload()
+            model.to(device)
+
+            processor = AutoProcessor.from_pretrained(
+                base_model_path, trust_remote_code=True, local_files_only=True
+            )
+        else:
+            model = AutoModelForCausalLM.from_pretrained(
+                model_name_or_path,
+                torch_dtype=torch_dtype,
+                trust_remote_code=True,
+                local_files_only=True,
+            ).to(device)
+            processor = AutoProcessor.from_pretrained(
+                model_name_or_path,
+                trust_remote_code=True,
+                local_files_only=True,
+            )
+
         return cls(
             model=model, processor=processor, device=device, torch_dtype=torch_dtype
         )
diff --git a/inference_experimental/tests/integration_tests/conftest.py b/inference_experimental/tests/integration_tests/conftest.py
@@ -0,0 +1,51 @@
+import os.path
+import zipfile
+
+import cv2
+import numpy as np
+import pytest
+import requests
+import torch
+import torchvision.io
+from filelock import FileLock
+from PIL import Image
+
+ASSETS_DIR = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), "models", "assets")
+)
+DOG_IMAGE_PATH = os.path.join(ASSETS_DIR, "dog.jpeg")
+DOG_IMAGE_URL = "https://media.roboflow.com/dog.jpeg"
+
+
+def _download_if_not_exists(file_path: str, url: str, lock_timeout: int = 120) -> None:
+    """Download ``url`` to ``file_path`` unless the file already exists.
+
+    The existence check and the download both run while holding a ``FileLock``
+    on ``<file_path>.lock``. The payload is streamed into ``<file_path>.part``
+    and moved into place only after it has been written completely, so an
+    interrupted download is not mistaken for a finished one on the next call.
+
+    Args:
+        file_path: Destination path; its parent directory is created if needed.
+        url: Location to fetch with ``requests.get``.
+        lock_timeout: Timeout, in seconds, passed to ``FileLock``.
+
+    Raises:
+        requests.HTTPError: If the response carries an error status.
+    """
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+    lock_path = f"{file_path}.lock"
+    with FileLock(lock_file=lock_path, timeout=lock_timeout):
+        if os.path.exists(file_path):
+            return None
+        partial_path = f"{file_path}.part"
+        with requests.get(url, stream=True) as response:
+            response.raise_for_status()
+            with open(partial_path, "wb") as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    if chunk:
+                        f.write(chunk)
+        # Publish the file only once every chunk has been written.
+        os.replace(partial_path, file_path)
+
+
+@pytest.fixture(scope="function")
+def dog_image_numpy() -> np.ndarray:
+    """Return the dog test image as read by ``cv2.imread``, downloading it first if missing."""
+    _download_if_not_exists(file_path=DOG_IMAGE_PATH, url=DOG_IMAGE_URL)
+    loaded = cv2.imread(DOG_IMAGE_PATH)
+    # cv2.imread signals an unreadable file by returning None instead of raising.
+    assert loaded is not None, "Could not load test image"
+    return loaded
+
+
+@pytest.fixture(scope="function")
+def dog_image_torch() -> torch.Tensor:
+    """Return the dog test image as read by ``torchvision.io.read_image``, downloading it first if missing."""
+    _download_if_not_exists(file_path=DOG_IMAGE_PATH, url=DOG_IMAGE_URL)
+    image_tensor = torchvision.io.read_image(DOG_IMAGE_PATH)
+    return image_tensor
+
+
+@pytest.fixture(scope="function")
+def dog_image_pil() -> Image.Image:
+    """Return the dog test image opened with ``PIL.Image.open``, downloading it first if missing."""
+    _download_if_not_exists(file_path=DOG_IMAGE_PATH, url=DOG_IMAGE_URL)
+    pil_image = Image.open(DOG_IMAGE_PATH)
+    return pil_image
diff --git a/inference_experimental/tests/integration_tests/e2e/test_florence2_e2e.py b/inference_experimental/tests/integration_tests/e2e/test_florence2_e2e.py
@@ -0,0 +1,36 @@
+import numpy as np
+import pytest
+import torch
+from inference_exp import AutoModel
+
+
+@pytest.mark.e2e_model_inference
+def test_florence2_base_model(dog_image_numpy: np.ndarray):
+    """Caption the dog image with "florence-2-base" and check the exact caption text."""
+    # GIVEN
+    florence2 = AutoModel.from_pretrained("florence-2-base")
+
+    # WHEN
+    result = florence2.caption_image(dog_image_numpy)
+
+    # THEN
+    assert isinstance(result, list)
+    assert len(result) == 1
+    caption = result[0]
+    assert isinstance(caption, str)
+    assert caption == "A man carrying a blue dog on his back."
+
+
+@pytest.mark.e2e_model_inference
+def test_florence2_lora_model(
+    dog_image_numpy: np.ndarray, dog_image_torch: torch.Tensor
+):
+    """Caption the dog image with "florence-2-lora-test" and check the exact caption text."""
+    # NOTE(review): dog_image_torch is requested but never used below - confirm
+    # whether a tensor-input assertion was intended.
+    # GIVEN
+    florence2 = AutoModel.from_pretrained("florence-2-lora-test")
+
+    # WHEN
+    result = florence2.caption_image(dog_image_numpy)
+
+    # THEN
+    assert isinstance(result, list)
+    assert len(result) == 1
+    caption = result[0]
+    assert isinstance(caption, str)
+    assert caption == "Disease"
diff --git a/inference_experimental/tests/integration_tests/models/conftest.py b/inference_experimental/tests/integration_tests/models/conftest.py
@@ -1,4 +1,5 @@
 import os.path
+import zipfile
 
 import cv2
 import numpy as np
@@ -11,13 +12,18 @@
 
 ASSETS_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "assets"))
 MODELS_DIR = os.path.join(ASSETS_DIR, "models")
-DOG_IMAGE_PATH = os.path.join(ASSETS_DIR, "dog.jpeg")
-DOG_IMAGE_URL = "https://media.roboflow.com/dog.jpeg"
 CLIP_RN50_TORCH_URL = "https://storage.googleapis.com/roboflow-tests-assets/clip_packages/RN50/torch/model.pt"
 CLIP_RN50_ONNX_VISUAL = "https://storage.googleapis.com/roboflow-tests-assets/clip_packages/RN50/onnx/visual.onnx"
 CLIP_RN50_ONNX_TEXTUAL = "https://storage.googleapis.com/roboflow-tests-assets/clip_packages/RN50/onnx/textual.onnx"
 PE_MODEL_URL = "https://storage.googleapis.com/roboflow-tests-assets/perception-encoder/pe-core-b16-224/model.pt"
 PE_CONFIG_URL = "https://storage.googleapis.com/roboflow-tests-assets/perception-encoder/pe-core-b16-224/config.json"
+FLORENCE2_BASE_FT_URL = (
+    "https://storage.googleapis.com/roboflow-tests-assets/florence2/base-ft.zip"
+)
+FLORENCE2_LARGE_FT_URL = (
+    "https://storage.googleapis.com/roboflow-tests-assets/florence2/large-ft.zip"
+)
+OCR_TEST_IMAGE_PATH = os.path.join(ASSETS_DIR, "ocr_test_image.png")
 
 
 @pytest.fixture(scope="module")
@@ -59,25 +65,13 @@ def perception_encoder_path() -> str:
 
 
 @pytest.fixture(scope="function")
-def dog_image_numpy() -> np.ndarray:
-    _download_if_not_exists(file_path=DOG_IMAGE_PATH, url=DOG_IMAGE_URL)
-    image = cv2.imread(DOG_IMAGE_PATH)
-    assert image is not None, "Could not load test image"
+def ocr_test_image_numpy() -> np.ndarray:
+    """Returns the OCR test image as a numpy array."""
+    image = cv2.imread(OCR_TEST_IMAGE_PATH)
+    assert image is not None, "Could not load OCR test image"
     return image
 
 
-@pytest.fixture(scope="function")
-def dog_image_torch() -> torch.Tensor:
-    _download_if_not_exists(file_path=DOG_IMAGE_PATH, url=DOG_IMAGE_URL)
-    return torchvision.io.read_image(DOG_IMAGE_PATH)
-
-
-@pytest.fixture(scope="function")
-def dog_image_pil() -> Image.Image:
-    _download_if_not_exists(file_path=DOG_IMAGE_PATH, url=DOG_IMAGE_URL)
-    return Image.open(DOG_IMAGE_PATH)
-
-
 def _download_if_not_exists(file_path: str, url: str, lock_timeout: int = 120) -> None:
     os.makedirs(os.path.dirname(file_path), exist_ok=True)
     lock_path = f"{file_path}.lock"
@@ -90,3 +84,33 @@ def _download_if_not_exists(file_path: str, url: str, lock_timeout: int = 120) -
                 for chunk in response.iter_content(chunk_size=8192):
                     if chunk:
                         f.write(chunk)
+
+
+def _florence2_package_path(package_name: str, url: str) -> str:
+    """Download and unpack one Florence-2 test package, returning its directory.
+
+    The archive is stored as ``<MODELS_DIR>/florence2/<package_name>.zip`` and
+    extracted into ``<MODELS_DIR>/florence2``. Extraction is skipped when
+    ``<MODELS_DIR>/florence2/<package_name>`` already exists, and it runs under
+    a ``FileLock`` on ``<that path>.lock``.
+
+    Args:
+        package_name: Archive stem, e.g. ``"base-ft"``.
+        url: Location of the zip archive.
+
+    Returns:
+        ``<MODELS_DIR>/florence2/<package_name>``.
+    """
+    package_dir = os.path.join(MODELS_DIR, "florence2")
+    # Assumes the archive contains a top-level directory named package_name
+    # - TODO confirm against the published zip files.
+    unzipped_package_path = os.path.join(package_dir, package_name)
+    os.makedirs(package_dir, exist_ok=True)
+    zip_path = os.path.join(package_dir, f"{package_name}.zip")
+    _download_if_not_exists(file_path=zip_path, url=url)
+    lock_path = f"{unzipped_package_path}.lock"
+    with FileLock(lock_path, timeout=120):
+        if not os.path.exists(unzipped_package_path):
+            with zipfile.ZipFile(zip_path, "r") as zip_ref:
+                zip_ref.extractall(package_dir)
+    return unzipped_package_path
+
+
+@pytest.fixture(scope="module")
+def florence2_base_ft_path() -> str:
+    """Return the directory of the unpacked "base-ft" Florence-2 package."""
+    return _florence2_package_path("base-ft", FLORENCE2_BASE_FT_URL)
+
+
+@pytest.fixture(scope="module")
+def florence2_large_ft_path() -> str:
+    """Return the directory of the unpacked "large-ft" Florence-2 package."""
+    return _florence2_package_path("large-ft", FLORENCE2_LARGE_FT_URL)
diff --git a/inference_experimental/tests/integration_tests/models/test_florence2_predictions.py b/inference_experimental/tests/integration_tests/models/test_florence2_predictions.py
@@ -0,0 +1,125 @@
+import numpy as np
+import pytest
+import torch
+
+from inference_exp.models.florence2.florence2_hf import Florence2HF
+
+
+@pytest.fixture(scope="module")
+def florence2_model(florence2_base_ft_path: str) -> Florence2HF:
+    """Load ``Florence2HF`` once per module from the path given by ``florence2_base_ft_path``."""
+    model = Florence2HF.from_pretrained(florence2_base_ft_path)
+    return model
+
+
+@pytest.mark.slow
+def test_classify_image_region(
+    florence2_model: Florence2HF, dog_image_numpy: np.ndarray
+):
+    """Classifying a fixed box of the dog image yields the expected label."""
+    # given
+    region = [100, 100, 300, 300]
+    # when
+    labels = florence2_model.classify_image_region(
+        images=dog_image_numpy, xyxy=region
+    )
+    # then
+    assert labels == ["human face"]
+
+
+@pytest.mark.slow
+def test_caption_image_region(
+    florence2_model: Florence2HF, dog_image_numpy: np.ndarray
+):
+    """Captioning a fixed box of the dog image yields the expected caption."""
+    # given
+    region = [100, 100, 300, 300]
+    # when
+    region_captions = florence2_model.caption_image_region(
+        images=dog_image_numpy, xyxy=region
+    )
+    # then
+    assert region_captions == ["human face"]
+
+
+@pytest.mark.slow
+def test_ocr_image_region(
+    florence2_model: Florence2HF, ocr_test_image_numpy: np.ndarray
+):
+    """Region OCR on the OCR test image returns the expected sentence."""
+    # TODO: figure out whether this is an implementation error - region OCR
+    # doesn't really seem to work; it appears to return the text of the whole image.
+    # given
+    region = [0, 0, 100, 150]
+    # when
+    texts = florence2_model.ocr_image_region(
+        images=ocr_test_image_numpy, xyxy=region
+    )
+    # then
+    assert texts == ["This is a test image for OCR."]
+
+
+@pytest.mark.slow
+def test_segment_region(florence2_model: Florence2HF, dog_image_numpy: np.ndarray):
+    """Segmenting a fixed box returns one instance whose box and mask shape match expectations."""
+    # given
+    region = [100, 100, 300, 300]
+    expected_xyxy = torch.tensor([[100, 100, 302, 303]], dtype=torch.int32)
+    # when
+    predictions = florence2_model.segment_region(images=dog_image_numpy, xyxy=region)
+    # then
+    assert isinstance(predictions, list)
+    assert len(predictions) == 1
+    prediction = predictions[0]
+    assert prediction.xyxy.shape == (1, 4)
+    # The box is compared with an absolute tolerance of 2.
+    assert torch.allclose(prediction.xyxy, expected_xyxy, atol=2)
+    assert prediction.mask.shape == (1, 1280, 720)
+
+
+@pytest.mark.slow
+def test_segment_phrase(florence2_model: Florence2HF, dog_image_numpy: np.ndarray):
+    """Segmenting the phrase "dog" returns one instance whose box and mask shape match expectations."""
+    # given
+    expected_xyxy = torch.tensor([[71, 249, 649, 926]], dtype=torch.int32)
+    # when
+    predictions = florence2_model.segment_phrase(images=dog_image_numpy, phrase="dog")
+    # then
+    assert isinstance(predictions, list)
+    assert len(predictions) == 1
+    prediction = predictions[0]
+    assert prediction.xyxy.shape == (1, 4)
+    # The box is compared with an absolute tolerance of 5.
+    assert torch.allclose(prediction.xyxy, expected_xyxy, atol=5)
+    assert prediction.mask.shape == (1, 1280, 720)
+
+
+@pytest.mark.slow
+def test_detect_objects(florence2_model: Florence2HF, dog_image_numpy: np.ndarray):
+    """Object detection on the dog image returns four boxes with the expected class names."""
+    # given
+    expected_class_names = ["backpack", "dog", "hat", "person"]
+    expected_bboxes_metadata = [{"class_name": name} for name in expected_class_names]
+    # when
+    detections = florence2_model.detect_objects(images=dog_image_numpy)
+    # then
+    assert isinstance(detections, list)
+    assert len(detections) == 1
+    detection = detections[0]
+    assert detection.xyxy.shape == (4, 4)
+    assert detection.bboxes_metadata == expected_bboxes_metadata
+
+
+@pytest.mark.slow
+def test_caption_image(florence2_model: Florence2HF, dog_image_numpy: np.ndarray):
+    """Captioning the dog image returns the expected single caption."""
+    # when
+    captions = florence2_model.caption_image(images=dog_image_numpy)
+    # then
+    assert captions == ["A man carrying a blue dog on his back."]
+
+
+@pytest.mark.slow
+def test_parse_document(florence2_model: Florence2HF, ocr_test_image_numpy: np.ndarray):
+    """Document parsing on the OCR test image returns boxes whose joined text is the expected sentence."""
+    # when
+    result = florence2_model.parse_document(images=ocr_test_image_numpy)
+    # then
+    assert isinstance(result, list)
+    assert len(result) == 1
+    assert result[0].xyxy.shape[0] >= 1
+    assert result[0].xyxy.shape[1] == 4
+    full_text = "".join(meta["class_name"] for meta in result[0].bboxes_metadata)
+    # Drop leading "</s>" tokens as whole prefixes. str.lstrip("</s>") would
+    # strip any leading '<', '/', 's' or '>' characters and could eat real text.
+    eos_token = "</s>"
+    while full_text.startswith(eos_token):
+        full_text = full_text[len(eos_token) :]
+    assert full_text == "This is a test image for OCR."
+
+
+@pytest.mark.slow
+def test_ocr_image(florence2_model: Florence2HF, ocr_test_image_numpy: np.ndarray):
+    """Whole-image OCR on the OCR test image returns the expected sentence."""
+    # when
+    texts = florence2_model.ocr_image(images=ocr_test_image_numpy)
+    # then
+    assert texts == ["This is a test image for OCR."]
diff --git a/inference_experimental/tests/integration_tests/models/test_florence2_preprocessing.py b/inference_experimental/tests/integration_tests/models/test_florence2_preprocessing.py