Skip to content

Commit 4a9aa38

Browse files
committed
Move matching and ocr to individual packages
- Hide non-public functions in the OCR helper - Update the Python version requirement - Move matching and OCR code into their respective package folders
1 parent 6fc8215 commit 4a9aa38

File tree

7 files changed

+390
-104
lines changed

7 files changed

+390
-104
lines changed

backend/app/api.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,13 @@
22

33
from fastapi import FastAPI, Response
44
from fastapi.middleware.cors import CORSMiddleware
5-
from fuzzy_match_helper import create_ocr_matched_df, create_select_voter_records
6-
from ocr_helper import create_ocr_df
75
from routers import file
86
from settings.settings_repo import config
97
from utils import logger
108

9+
from .matcher import create_ocr_matched_df
10+
from .ocr import create_ocr_df
11+
1112
app = FastAPI(root_path="/api")
1213
app.state.voter_records_df = None
1314

backend/app/matcher/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .voter_matcher import create_ocr_matched_df
2+
3+
__all__ = ["create_ocr_matched_df"]

backend/app/fuzzy_match_helper.py renamed to backend/app/matcher/voter_matcher.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ def score_fuzzy_match_slim(
120120
return results
121121

122122

123-
def get_matched_name_address(
123+
def _get_matched_name_address(
124124
ocr_name: str, ocr_address: str, select_voter_records: pd.DataFrame
125125
) -> List[Tuple[str, str, float, int]]:
126126
"""
@@ -208,7 +208,7 @@ def create_ocr_matched_df(
208208
with ThreadPoolExecutor() as executor:
209209
batch_results = list(
210210
executor.map(
211-
lambda row: get_matched_name_address(
211+
lambda row: _get_matched_name_address(
212212
row["OCR Name"], row["OCR Address"], select_voter_records
213213
),
214214
[row for _, row in batch.iterrows()],

backend/app/ocr/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
from .ocr_client_factory import extract_from_encoding_async
1+
from .ocr_helper import create_ocr_df
22

3-
__all__ = ["extract_from_encoding_async"]
3+
__all__ = ["create_ocr_df"]

backend/app/ocr_helper.py renamed to backend/app/ocr/ocr_helper.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
1-
from typing import List
1+
import asyncio
22
import base64
3-
import os
43
import json
5-
from tqdm.notebook import tqdm
6-
from dotenv import load_dotenv
7-
import pandas as pd
8-
import asyncio
9-
import fitz # Add this import at the top with other imports
10-
114
import logging
5+
import os
126
from datetime import datetime
7+
from typing import List
8+
9+
import fitz  # PyMuPDF (imported under the name "fitz")
10+
import pandas as pd
11+
from dotenv import load_dotenv
12+
from tqdm.notebook import tqdm
1313

14-
from ocr import extract_from_encoding_async
14+
from .ocr_client_factory import extract_from_encoding_async
1515

1616
# Set up logging
1717
log_directory = "logs"
@@ -50,7 +50,7 @@
5050
config = json.load(f)
5151

5252

53-
def collecting_pdf_encoded_images(file_path: str) -> List[str]:
53+
def _collecting_pdf_encoded_images(file_path: str) -> List[str]:
5454
"""Convert PDF pages to encoded images, cropping to target area.
5555
Returns list of base64 encoded image strings."""
5656

@@ -97,7 +97,7 @@ def collecting_pdf_encoded_images(file_path: str) -> List[str]:
9797

9898

9999
# function for adding data
100-
def add_metadata(initial_data: List[dict], page_no: int, filename: str) -> List[dict]:
100+
def _add_metadata(initial_data: List[dict], page_no: int, filename: str) -> List[dict]:
101101
"""
102102
Adds page number, row number, and filename metadata to the recognized signatures
103103
@@ -121,7 +121,7 @@ def add_metadata(initial_data: List[dict], page_no: int, filename: str) -> List[
121121
return final_data
122122

123123

124-
async def process_batch_async(encodings: List[str]) -> List[List[dict]]:
124+
async def _process_batch_async(encodings: List[str]) -> List[List[dict]]:
125125
"""
126126
Process a batch of images concurrently
127127
"""
@@ -132,7 +132,7 @@ async def process_batch_async(encodings: List[str]) -> List[List[dict]]:
132132
return results
133133

134134

135-
def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
135+
def _get_or_create_event_loop() -> asyncio.AbstractEventLoop:
136136
try:
137137
return asyncio.get_event_loop()
138138
except RuntimeError:
@@ -141,7 +141,7 @@ def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
141141
return loop
142142

143143

144-
def collect_ocr_data(
144+
def _collect_ocr_data(
145145
filedir: str,
146146
filename: str,
147147
max_page_num: int = None,
@@ -165,7 +165,7 @@ def collect_ocr_data(
165165
logger.info(f"Parameters - max_page_num: {max_page_num}, batch_size: {batch_size}")
166166

167167
# collecting images
168-
encoded_images = collecting_pdf_encoded_images(os.path.join(filedir, filename))
168+
encoded_images = _collecting_pdf_encoded_images(os.path.join(filedir, filename))
169169

170170
# selecting pages
171171
if max_page_num:
@@ -180,7 +180,7 @@ def collect_ocr_data(
180180
total_pages = len(encoded_images)
181181

182182
# getting event loop
183-
loop = get_or_create_event_loop()
183+
loop = _get_or_create_event_loop()
184184

185185
# Process in batches
186186
logger.info(f"Processing {total_pages} pages in batches of {batch_size}")
@@ -199,12 +199,12 @@ def collect_ocr_data(
199199
)
200200

201201
# Run async batch processing using the event loop
202-
batch_results = loop.run_until_complete(process_batch_async(batch))
202+
batch_results = loop.run_until_complete(_process_batch_async(batch))
203203

204204
# Add metadata for each result in the batch
205205
for page_idx, result in enumerate(batch_results):
206206
current_page = i + page_idx
207-
ocr_data = add_metadata(result, current_page, filename)
207+
ocr_data = _add_metadata(result, current_page, filename)
208208
full_data.extend(ocr_data)
209209

210210
logger.info(
@@ -238,7 +238,7 @@ def create_ocr_df(
238238
logger.info("Starting OCR DataFrame creation")
239239

240240
# gathering ocr_data
241-
ocr_data = collect_ocr_data(
241+
ocr_data = _collect_ocr_data(
242242
filedir,
243243
filename,
244244
max_page_num=max_page_num,

backend/pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ name = "vote-catcher-server"
33
version = "0.0.15"
44
description = "A python package for verifying ballot petition signatures"
55
readme = "README.md"
6-
requires-python = "~=3.12"
6+
requires-python = ">=3.12"
77
authors = [{ name = "Mobolaji Williams" }]
88
maintainers = [
99
{ name = "Mobolaji Williams" },
@@ -36,6 +36,7 @@ dependencies = [
3636
"streamlit>=1.44.1",
3737
"streamlit-shadcn-ui>=0.1.18",
3838
"structlog>=25.2.0",
39+
"supabase>=2.21.1",
3940
"tomli>=2.2.1",
4041
"tomli-w>=1.2.0",
4142
]

0 commit comments

Comments
 (0)