Move matching and ocr to individual packages

kvithayathil · kvithayathil · commit 4a9aa38eedd4 · 2025-10-06T22:43:17.000-04:00
- Hide non-public functions in the OCR helper
- Update Python version requirement
- Move matching and ocr code into respective package folders
diff --git a/backend/app/api.py b/backend/app/api.py
@@ -2,12 +2,13 @@
 
 from fastapi import FastAPI, Response
 from fastapi.middleware.cors import CORSMiddleware
-from fuzzy_match_helper import create_ocr_matched_df, create_select_voter_records
-from ocr_helper import create_ocr_df
 from routers import file
 from settings.settings_repo import config
 from utils import logger
 
+from .matcher import create_ocr_matched_df
+from .ocr import create_ocr_df
+
 app = FastAPI(root_path="/api")
 app.state.voter_records_df = None
 
diff --git a/backend/app/matcher/__init__.py b/backend/app/matcher/__init__.py
@@ -0,0 +1,3 @@
+from .voter_matcher import create_ocr_matched_df
+
+__all__ = ["create_ocr_matched_df"]
diff --git a/backend/app/matcher/voter_matcher.py b/backend/app/matcher/voter_matcher.py
@@ -120,7 +120,7 @@ def score_fuzzy_match_slim(
     return results
 
 
-def get_matched_name_address(
+def _get_matched_name_address(
     ocr_name: str, ocr_address: str, select_voter_records: pd.DataFrame
 ) -> List[Tuple[str, str, float, int]]:
     """
@@ -208,7 +208,7 @@ def create_ocr_matched_df(
         with ThreadPoolExecutor() as executor:
             batch_results = list(
                 executor.map(
-                    lambda row: get_matched_name_address(
+                    lambda row: _get_matched_name_address(
                         row["OCR Name"], row["OCR Address"], select_voter_records
                     ),
                     [row for _, row in batch.iterrows()],
diff --git a/backend/app/ocr/__init__.py b/backend/app/ocr/__init__.py
@@ -1,3 +1,3 @@
-from .ocr_client_factory import extract_from_encoding_async
+from .ocr_helper import create_ocr_df
 
-__all__ = ["extract_from_encoding_async"]
+__all__ = ["create_ocr_df"]
diff --git a/backend/app/ocr/ocr_helper.py b/backend/app/ocr/ocr_helper.py
@@ -1,17 +1,17 @@
-from typing import List
+import asyncio
 import base64
-import os
 import json
-from tqdm.notebook import tqdm
-from dotenv import load_dotenv
-import pandas as pd
-import asyncio
-import fitz  # Add this import at the top with other imports
-
 import logging
+import os
 from datetime import datetime
+from typing import List
+
+import fitz  # PyMuPDF
+import pandas as pd
+from dotenv import load_dotenv
+from tqdm.notebook import tqdm
 
-from ocr import extract_from_encoding_async
+from .ocr_client_factory import extract_from_encoding_async
 
 # Set up logging
 log_directory = "logs"
@@ -50,7 +50,7 @@
     config = json.load(f)
 
 
-def collecting_pdf_encoded_images(file_path: str) -> List[str]:
+def _collecting_pdf_encoded_images(file_path: str) -> List[str]:
     """Convert PDF pages to encoded images, cropping to target area.
     Returns list of base64 encoded image strings."""
 
@@ -97,7 +97,7 @@ def collecting_pdf_encoded_images(file_path: str) -> List[str]:
 
 
 # function for adding data
-def add_metadata(initial_data: List[dict], page_no: int, filename: str) -> List[dict]:
+def _add_metadata(initial_data: List[dict], page_no: int, filename: str) -> List[dict]:
     """
     Adds page number, row number, and filename metadata to the recognized signatures
 
@@ -121,7 +121,7 @@ def add_metadata(initial_data: List[dict], page_no: int, filename: str) -> List[
     return final_data
 
 
-async def process_batch_async(encodings: List[str]) -> List[List[dict]]:
+async def _process_batch_async(encodings: List[str]) -> List[List[dict]]:
     """
     Process a batch of images concurrently
     """
@@ -132,7 +132,7 @@ async def process_batch_async(encodings: List[str]) -> List[List[dict]]:
     return results
 
 
-def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
+def _get_or_create_event_loop() -> asyncio.AbstractEventLoop:
     try:
         return asyncio.get_event_loop()
     except RuntimeError:
@@ -141,7 +141,7 @@ def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
         return loop
 
 
-def collect_ocr_data(
+def _collect_ocr_data(
     filedir: str,
     filename: str,
     max_page_num: int = None,
@@ -165,7 +165,7 @@ def collect_ocr_data(
     logger.info(f"Parameters - max_page_num: {max_page_num}, batch_size: {batch_size}")
 
     # collecting images
-    encoded_images = collecting_pdf_encoded_images(os.path.join(filedir, filename))
+    encoded_images = _collecting_pdf_encoded_images(os.path.join(filedir, filename))
 
     # selecting pages
     if max_page_num:
@@ -180,7 +180,7 @@ def collect_ocr_data(
     total_pages = len(encoded_images)
 
     # getting event loop
-    loop = get_or_create_event_loop()
+    loop = _get_or_create_event_loop()
 
     # Process in batches
     logger.info(f"Processing {total_pages} pages in batches of {batch_size}")
@@ -199,12 +199,12 @@ def collect_ocr_data(
             )
 
         # Run async batch processing using the event loop
-        batch_results = loop.run_until_complete(process_batch_async(batch))
+        batch_results = loop.run_until_complete(_process_batch_async(batch))
 
         # Add metadata for each result in the batch
         for page_idx, result in enumerate(batch_results):
             current_page = i + page_idx
-            ocr_data = add_metadata(result, current_page, filename)
+            ocr_data = _add_metadata(result, current_page, filename)
             full_data.extend(ocr_data)
 
         logger.info(
@@ -238,7 +238,7 @@ def create_ocr_df(
     logger.info("Starting OCR DataFrame creation")
 
     # gathering ocr_data
-    ocr_data = collect_ocr_data(
+    ocr_data = _collect_ocr_data(
         filedir,
         filename,
         max_page_num=max_page_num,
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
@@ -3,7 +3,7 @@ name = "vote-catcher-server"
 version = "0.0.15"
 description = "A python package for verifying ballot petition signatures"
 readme = "README.md"
-requires-python = "~=3.12"
+requires-python = ">=3.12"
 authors = [{ name = "Mobolaji Williams" }]
 maintainers = [
     { name = "Mobolaji Williams" },
@@ -36,6 +36,7 @@ dependencies = [
     "streamlit>=1.44.1",
     "streamlit-shadcn-ui>=0.1.18",
     "structlog>=25.2.0",
+    "supabase>=2.21.1",
     "tomli>=2.2.1",
     "tomli-w>=1.2.0",
 ]
diff --git a/backend/uv.lock b/backend/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from .voter_matcher import create_ocr_matched_df`
	`2`	`+`
	`3`	`+__all__ = ["create_ocr_matched_df"]`