Skip to content

Commit c1d49ab

Browse files
Merge pull request NanoNets#23 from sirius116/enhancement/pdf_support
PDF Support, Multi File Support and Some Fixes
2 parents 2c7c3c0 + 85250dd commit c1d49ab

File tree

7 files changed

+111
-11
lines changed

7 files changed

+111
-11
lines changed

docext/app/app.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from docext.core.config import TEMPLATES_FIELDS
1515
from docext.core.config import TEMPLATES_TABLES
1616
from docext.core.extract import extract_information
17+
from docext.core.utils import convert_files_to_images
1718
from docext.core.vllm import VLLMServer
1819

1920
METADATA = []
@@ -139,8 +140,28 @@ def define_keys_and_extract(model_name: str, max_img_size: int, concurrency_limi
139140
visible=False,
140141
)
141142

142-
images_input = gr.Gallery(label="Upload images", preview=True)
143-
submit_btn = gr.Button("Submit")
143+
file_input = gr.File(
144+
label="Upload Documents",
145+
file_types=[".pdf", ".jpg", ".jpeg", ".png", ".tiff", ".bmp", ".gif", ".webp"],
146+
file_count="multiple"
147+
)
148+
images_input = gr.Gallery(label="Document Preview", preview=True, visible=False)
149+
submit_btn = gr.Button("Submit", visible=False)
150+
151+
def handle_file_upload(files):
    """Build the gallery preview for freshly uploaded documents.

    Args:
        files: The value delivered by the file-upload component; each
            item is expected to expose its path as ``.name``. May be
            empty or ``None`` when the upload is cleared.

    Returns:
        A 3-tuple matching the ``outputs`` list of the ``change`` event:
        the gallery value, a gallery update, and a submit-button update.
        With no files the gallery value is ``None`` and both components
        are hidden; otherwise both are made visible and the gallery is
        given the converted image paths.
    """
    if files:
        uploaded_paths = [uploaded.name for uploaded in files]
        # PDFs are expanded into images by the helper; the result is the
        # complete list of image paths to preview.
        image_paths = convert_files_to_images(uploaded_paths)
        gallery_update = gr.update(visible=True, value=image_paths)
        submit_update = gr.update(visible=True)
        return image_paths, gallery_update, submit_update

    # Nothing uploaded (or upload cleared): hide preview and submit button.
    return None, gr.update(visible=False), gr.update(visible=False)
159+
160+
file_input.change(
161+
handle_file_upload,
162+
inputs=[file_input],
163+
outputs=[images_input, images_input, submit_btn]
164+
)
144165

145166
with gr.Row():
146167
with gr.Column(scale=3):

docext/core/client.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@ def sync_request(
4242
elif model_name.startswith("openrouter"):
4343
completion_args["response_format"] = format
4444
elif "gpt" in model_name.lower():
45-
completion_args["response_format"] = {"type": "json_object"}
45+
# Only set response_format if the prompt mentions "json"
46+
if any("json" in m.get("text", "").lower() for m in messages if isinstance(m, dict)):
47+
completion_args["response_format"] = {"type": "json_object"}
4648

4749
response = completion(**completion_args)
4850
return response.json()

docext/core/extract.py

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from docext.core.confidence import get_fields_confidence_score_messages_binary
1414
from docext.core.prompts import get_fields_messages
1515
from docext.core.prompts import get_tables_messages
16+
from docext.core.utils import convert_files_to_images
1617
from docext.core.utils import resize_images
1718
from docext.core.utils import validate_fields_and_tables
1819
from docext.core.utils import validate_file_paths
@@ -65,14 +66,36 @@ def extract_fields_from_documents(
6566
extracted_fields = json_repair.loads(response)
6667
conf_scores = json_repair.loads(response_conf_score)
6768

68-
df = pd.DataFrame(
69-
{
70-
"fields": field_names,
71-
"answer": [extracted_fields.get(field, "") for field in field_names],
72-
"confidence": [conf_scores.get(field, "Low") for field in field_names],
73-
},
74-
)
75-
return df
69+
logger.info(f"Extracted fields: {extracted_fields}")
70+
logger.info(f"Conf scores: {conf_scores}")
71+
72+
# Handle both single dictionary and list of dictionaries
73+
if not isinstance(extracted_fields, list):
74+
extracted_fields = [extracted_fields]
75+
76+
# Handle confidence scores similarly
77+
if not isinstance(conf_scores, list):
78+
conf_scores = [conf_scores] * len(extracted_fields)
79+
elif len(conf_scores) < len(extracted_fields):
80+
# If we have fewer confidence scores than documents, pad with the first confidence score
81+
conf_scores.extend([conf_scores[0]] * (len(extracted_fields) - len(conf_scores)))
82+
83+
# Create a list of dataframes, one for each document
84+
dfs = []
85+
for idx, (doc_fields, doc_conf_scores) in enumerate(zip(extracted_fields, conf_scores)):
86+
df = pd.DataFrame(
87+
{
88+
"fields": field_names,
89+
"answer": [doc_fields.get(field, "") for field in field_names],
90+
"confidence": [doc_conf_scores.get(field, "Low") for field in field_names],
91+
"document_index": [idx] * len(field_names)
92+
},
93+
)
94+
dfs.append(df)
95+
96+
# Concatenate all dataframes with a document index
97+
final_df = pd.concat(dfs, ignore_index=True)
98+
return final_df
7699

77100

78101
def extract_tables_from_documents(
@@ -112,6 +135,7 @@ def extract_information(
112135
for file_input in file_inputs
113136
]
114137
validate_file_paths(file_paths)
138+
file_paths = convert_files_to_images(file_paths)
115139
resize_images(file_paths, max_img_size)
116140

117141
# call fields and tables extraction in parallel
@@ -131,4 +155,9 @@ def extract_information(
131155

132156
fields_df = future_fields.result()
133157
tables_df = future_tables.result()
158+
159+
# Group fields by document_index for better display
160+
if not fields_df.empty and 'document_index' in fields_df.columns:
161+
fields_df = fields_df.sort_values(['document_index', 'fields'])
162+
134163
return fields_df, tables_df
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from __future__ import annotations
2+
3+
from abc import ABC
4+
from abc import abstractmethod
5+
6+
7+
class FileConverter(ABC):
    """Abstract base for objects that convert a file into images.

    Concrete converters must override ``convert_to_images``; the base
    class cannot be instantiated directly.
    """

    @abstractmethod
    def convert_to_images(self, file_path: str):
        """Convert the file at *file_path* into images.

        The base implementation does nothing and returns ``None``; the
        concrete return type is defined by each subclass.
        """
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from __future__ import annotations
2+
3+
from docext.core.file_converters.file_converter import FileConverter
4+
from pdf2image import convert_from_path
5+
6+
7+
class PDFConverter(FileConverter):
    """``FileConverter`` implementation that renders PDF files."""

    def convert_to_images(self, file_path: str):
        """Render the PDF at *file_path* via ``pdf2image.convert_from_path``.

        Returns whatever ``convert_from_path`` returns with its default
        options (per pdf2image, a list of PIL images, one per page).
        """
        rendered_pages = convert_from_path(file_path)
        return rendered_pages

docext/core/utils.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import pandas as pd
99
from PIL import Image
10+
from docext.core.file_converters.pdf_converter import PDFConverter
1011

1112

1213
def encode_image(image_path):
@@ -65,4 +66,31 @@ def validate_file_paths(file_paths: list[str]):
6566
".bmp",
6667
".gif",
6768
".webp",
69+
".pdf",
6870
], f"File {file_path} is not an image"
71+
72+
def file_is_supported_image(file_path: str) -> bool:
    """Tell whether *file_path* ends in a supported image extension.

    Only the final extension is inspected, and the comparison ignores
    case. PDF is deliberately not in the list.
    """
    supported_extensions = [
        ".jpg",
        ".jpeg",
        ".png",
        ".tiff",
        ".bmp",
        ".gif",
        ".webp",
    ]
    _, extension = os.path.splitext(file_path)
    return extension.lower() in supported_extensions
82+
83+
# TODO: add support for other file types; only support pdf for now
84+
def convert_files_to_images(file_paths: list[str]) -> list[str]:
    """Return image paths for *file_paths*, rendering PDFs to JPEG pages.

    Each PDF (extension matched case-insensitively) is rendered page by
    page; page ``i`` is saved as ``<path without extension>_<i>.jpg`` and
    that path is added to the result. Files accepted by
    ``file_is_supported_image`` are passed through unchanged. Any other
    file is left out of the result.

    Args:
        file_paths: Paths of the uploaded documents.

    Returns:
        Image paths in input order, with each PDF replaced by its page
        images in page order.
    """
    converted_file_paths = []
    pdf_converter = None  # created on first PDF so image-only input needs no converter
    for file_path in file_paths:
        root, extension = os.path.splitext(file_path)
        if extension.lower() == ".pdf":
            if pdf_converter is None:
                pdf_converter = PDFConverter()
            pages = pdf_converter.convert_to_images(file_path)
            for page_index, page in enumerate(pages):
                # Use the splitext root rather than str.replace('.pdf', ''):
                # replace() also rewrites ".pdf" inside directory names and
                # misses upper-case extensions such as ".PDF".
                page_path = f"{root}_{page_index}.jpg"
                page.save(page_path)
                converted_file_paths.append(page_path)
        elif file_is_supported_image(file_path):
            converted_file_paths.append(file_path)
    return converted_file_paths

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,4 @@ tenacity
1616
types-requests
1717
vllm==v0.8.3
1818
xgrammar==0.1.17
19+
pdf2image

0 commit comments

Comments
 (0)