Skip to content

Commit ceefcba

Browse files
add pdf support through api
1 parent 83d4fc8 commit ceefcba

File tree

3 files changed

+44
-5
lines changed

3 files changed

+44
-5
lines changed

EXT_README.md

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ python -m docext.app.app --concurrency_limit 10
7171
import pandas as pd
7272
import concurrent.futures
7373
from gradio_client import Client, handle_file
74+
from docext.core.file_converters.pdf_converter import PDFConverter
7475

7576

7677
def dataframe_to_custom_dict(df: pd.DataFrame) -> dict:
@@ -110,6 +111,12 @@ fields_and_tables = dataframe_to_custom_dict(pd.DataFrame([
110111
{"name": "item_description", "type": "table", "description": "Item/Product description"}
111112
# add more fields and table columns as needed
112113
]))
114+
# The client URL can be localhost or a public Gradio URL such as `https://6986bdd23daef6f7eb.gradio.live/`
115+
CLIENT_URL = "http://localhost:7860"
116+
117+
118+
119+
## ======= Image Inputs =======
113120

114121
file_inputs = [
115122
{
@@ -119,21 +126,34 @@ file_inputs = [
119126
]
120127

121128
## send single request
122-
### client url can be the local host or the public url like `https://6986bdd23daef6f7eb.gradio.live/`
123129
fields_df, tables_df = get_extracted_fields_and_tables(
124-
"http://localhost:7860", "admin", "admin", "hosted_vllm/Qwen/Qwen2.5-VL-7B-Instruct-AWQ", fields_and_tables, file_inputs
130+
CLIENT_URL, "admin", "admin", "hosted_vllm/Qwen/Qwen2.5-VL-7B-Instruct-AWQ", fields_and_tables, file_inputs
125131
)
126132
print("========Fields:=========")
127133
print(fields_df)
128134
print("========Tables:=========")
129135
print(tables_df)
130136

131137

138+
## ======= PDF Inputs =======
139+
140+
pdf_converter = PDFConverter()
141+
document_pages = pdf_converter.convert_and_save_images("assets/invoice_test.pdf")
142+
file_inputs = [{"image": handle_file(page)} for page in document_pages]
143+
144+
fields_df, tables_df = get_extracted_fields_and_tables(
145+
CLIENT_URL, "admin", "admin", "hosted_vllm/Qwen/Qwen2.5-VL-7B-Instruct-AWQ", fields_and_tables, file_inputs
146+
)
147+
print("========Fields:=========")
148+
print(fields_df)
149+
print("========Tables:=========")
150+
print(tables_df)
151+
132152
## send multiple requests in parallel
133153
# Define a wrapper function for parallel execution
134154
def run_request():
135155
return get_extracted_fields_and_tables(
136-
"http://localhost:7860", "admin", "admin", "hosted_vllm/Qwen/Qwen2.5-VL-7B-Instruct-AWQ", fields_and_tables, file_inputs
156+
CLIENT_URL, "admin", "admin", "hosted_vllm/Qwen/Qwen2.5-VL-7B-Instruct-AWQ", fields_and_tables, file_inputs
137157
)
138158

139159
# Use ThreadPoolExecutor to send 10 requests in parallel

assets/invoice_test.pdf

57.9 KB
Binary file not shown.
Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,28 @@
11
from __future__ import annotations
22

3-
from docext.core.file_converters.file_converter import FileConverter
3+
import os
4+
import tempfile
5+
from typing import Optional
6+
47
from pdf2image import convert_from_path
58

9+
from docext.core.file_converters.file_converter import FileConverter
10+
611

712
class PDFConverter(FileConverter):
813
def convert_to_images(self, file_path: str):
9-
return convert_from_path(file_path)
14+
return convert_from_path(file_path)
15+
16+
def convert_and_save_images(self, file_path: str, output_folder: str | None = None):
    """Convert a PDF into page images and save each page as a PNG file.

    Args:
        file_path: Path of the PDF; passed unchanged to ``convert_to_images``.
        output_folder: Directory to write the page images into. It is
            created if it does not exist. When omitted (or empty), a fresh,
            uniquely named temporary directory is created for this call.

    Returns:
        The saved file paths in page order, named ``page_<index>.png``
        with a zero-based index.
    """
    images = self.convert_to_images(file_path)
    if not output_folder:
        # Use a unique directory per call: writing fixed names such as
        # ``page_0.png`` straight into the shared temp dir lets two
        # conversions (or parallel requests) overwrite each other's pages.
        output_folder = tempfile.mkdtemp(prefix="docext_pdf_")
    else:
        os.makedirs(output_folder, exist_ok=True)

    output_file_paths = []
    for i, image in enumerate(images):
        output_file_path = os.path.join(output_folder, f"page_{i}.png")
        image.save(output_file_path)
        output_file_paths.append(output_file_path)
    return output_file_paths

0 commit comments

Comments
 (0)