1- from typing import List
1+ import asyncio
22import base64
3- import os
43import json
5- from tqdm .notebook import tqdm
6- from dotenv import load_dotenv
7- import pandas as pd
8- import asyncio
9- import fitz # Add this import at the top with other imports
10-
114import logging
5+ import os
126from datetime import datetime
7+ from typing import List
8+
9+ import fitz # Add this import at the top with other imports
10+ import pandas as pd
11+ from dotenv import load_dotenv
12+ from tqdm .notebook import tqdm
1313
14- from ocr import extract_from_encoding_async
14+ from . ocr_client_factory import extract_from_encoding_async
1515
1616# Set up logging
1717log_directory = "logs"
5050 config = json .load (f )
5151
5252
53- def collecting_pdf_encoded_images (file_path : str ) -> List [str ]:
53+ def _collecting_pdf_encoded_images (file_path : str ) -> List [str ]:
5454 """Convert PDF pages to encoded images, cropping to target area.
5555 Returns list of base64 encoded image strings."""
5656
@@ -97,7 +97,7 @@ def collecting_pdf_encoded_images(file_path: str) -> List[str]:
9797
9898
9999# function for adding data
100- def add_metadata (initial_data : List [dict ], page_no : int , filename : str ) -> List [dict ]:
100+ def _add_metadata (initial_data : List [dict ], page_no : int , filename : str ) -> List [dict ]:
101101 """
102102 Adds page number, row number, and filename metadata to the recognized signatures
103103
@@ -121,7 +121,7 @@ def add_metadata(initial_data: List[dict], page_no: int, filename: str) -> List[
121121 return final_data
122122
123123
124- async def process_batch_async (encodings : List [str ]) -> List [List [dict ]]:
124+ async def _process_batch_async (encodings : List [str ]) -> List [List [dict ]]:
125125 """
126126 Process a batch of images concurrently
127127 """
@@ -132,7 +132,7 @@ async def process_batch_async(encodings: List[str]) -> List[List[dict]]:
132132 return results
133133
134134
135- def get_or_create_event_loop () -> asyncio .AbstractEventLoop :
135+ def _get_or_create_event_loop () -> asyncio .AbstractEventLoop :
136136 try :
137137 return asyncio .get_event_loop ()
138138 except RuntimeError :
@@ -141,7 +141,7 @@ def get_or_create_event_loop() -> asyncio.AbstractEventLoop:
141141 return loop
142142
143143
144- def collect_ocr_data (
144+ def _collect_ocr_data (
145145 filedir : str ,
146146 filename : str ,
147147 max_page_num : int = None ,
@@ -165,7 +165,7 @@ def collect_ocr_data(
165165 logger .info (f"Parameters - max_page_num: { max_page_num } , batch_size: { batch_size } " )
166166
167167 # collecting images
168- encoded_images = collecting_pdf_encoded_images (os .path .join (filedir , filename ))
168+ encoded_images = _collecting_pdf_encoded_images (os .path .join (filedir , filename ))
169169
170170 # selecting pages
171171 if max_page_num :
@@ -180,7 +180,7 @@ def collect_ocr_data(
180180 total_pages = len (encoded_images )
181181
182182 # getting event loop
183- loop = get_or_create_event_loop ()
183+ loop = _get_or_create_event_loop ()
184184
185185 # Process in batches
186186 logger .info (f"Processing { total_pages } pages in batches of { batch_size } " )
@@ -199,12 +199,12 @@ def collect_ocr_data(
199199 )
200200
201201 # Run async batch processing using the event loop
202- batch_results = loop .run_until_complete (process_batch_async (batch ))
202+ batch_results = loop .run_until_complete (_process_batch_async (batch ))
203203
204204 # Add metadata for each result in the batch
205205 for page_idx , result in enumerate (batch_results ):
206206 current_page = i + page_idx
207- ocr_data = add_metadata (result , current_page , filename )
207+ ocr_data = _add_metadata (result , current_page , filename )
208208 full_data .extend (ocr_data )
209209
210210 logger .info (
@@ -238,7 +238,7 @@ def create_ocr_df(
238238 logger .info ("Starting OCR DataFrame creation" )
239239
240240 # gathering ocr_data
241- ocr_data = collect_ocr_data (
241+ ocr_data = _collect_ocr_data (
242242 filedir ,
243243 filename ,
244244 max_page_num = max_page_num ,
0 commit comments