Skip to content

Commit d92a2ea

Browse files
WOONBEilayaperumalg
authored andcommitted
fix getTextBetweenParagraphs to check for invalid page numbers in outline items
Fix ParagraphPdfDocumentReader to reliably extract text from PDFs with imperfect outlines and coordinate edge cases Test : Add test to validate ParagraphPdfDocumentReader to skip Invalid Outline Auto-cherry-pick to 1.0.x Fixes #3421 Signed-off-by: WOONBE <[email protected]>
1 parent c1ee22d commit d92a2ea

File tree

3 files changed

+101
-49
lines changed

3 files changed

+101
-49
lines changed

document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java

Lines changed: 46 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2023-2024 the original author or authors.
2+
* Copyright 2023-2025 the original author or authors.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -46,6 +46,7 @@
4646
* The paragraphs are grouped into {@link Document} objects.
4747
*
4848
* @author Christian Tzolov
49+
* @author Heonwoo Kim
4950
*/
5051
public class ParagraphPdfDocumentReader implements DocumentReader {
5152

@@ -127,29 +128,18 @@ public ParagraphPdfDocumentReader(Resource pdfResource, PdfDocumentReaderConfig
127128
*/
128129
@Override
129130
public List<Document> get() {
130-
131131
var paragraphs = this.paragraphTextExtractor.flatten();
132-
133-
List<Document> documents = new ArrayList<>(paragraphs.size());
134-
135-
if (!CollectionUtils.isEmpty(paragraphs)) {
136-
logger.info("Start processing paragraphs from PDF");
137-
Iterator<Paragraph> itr = paragraphs.iterator();
138-
139-
var current = itr.next();
140-
141-
if (!itr.hasNext()) {
142-
documents.add(toDocument(current, current));
143-
}
144-
else {
145-
while (itr.hasNext()) {
146-
var next = itr.next();
147-
Document document = toDocument(current, next);
148-
if (document != null && StringUtils.hasText(document.getText())) {
149-
documents.add(toDocument(current, next));
150-
}
151-
current = next;
152-
}
132+
List<Document> documents = new ArrayList<>();
133+
if (CollectionUtils.isEmpty(paragraphs)) {
134+
return documents;
135+
}
136+
logger.info("Start processing paragraphs from PDF");
137+
for (int i = 0; i < paragraphs.size(); i++) {
138+
Paragraph from = paragraphs.get(i);
139+
Paragraph to = (i + 1 < paragraphs.size()) ? paragraphs.get(i + 1) : from;
140+
Document document = toDocument(from, to);
141+
if (document != null && StringUtils.hasText(document.getText())) {
142+
documents.add(document);
153143
}
154144
}
155145
logger.info("End processing paragraphs from PDF");
@@ -173,17 +163,27 @@ protected Document toDocument(Paragraph from, Paragraph to) {
173163
protected void addMetadata(Paragraph from, Paragraph to, Document document) {
174164
document.getMetadata().put(METADATA_TITLE, from.title());
175165
document.getMetadata().put(METADATA_START_PAGE, from.startPageNumber());
176-
document.getMetadata().put(METADATA_END_PAGE, to.startPageNumber());
166+
document.getMetadata().put(METADATA_END_PAGE, from.endPageNumber());
177167
document.getMetadata().put(METADATA_LEVEL, from.level());
178168
document.getMetadata().put(METADATA_FILE_NAME, this.resourceFileName);
179169
}
180170

181171
public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toParagraph) {
182172

173+
if (fromParagraph.startPageNumber() < 1) {
174+
logger.warn("Skipping paragraph titled '{}' because it has an invalid start page number: {}",
175+
fromParagraph.title(), fromParagraph.startPageNumber());
176+
return "";
177+
}
178+
183179
// Page started from index 0, while PDFBOx getPage return them from index 1.
184180
int startPage = fromParagraph.startPageNumber() - 1;
185181
int endPage = toParagraph.startPageNumber() - 1;
186182

183+
if (fromParagraph == toParagraph || endPage < startPage) {
184+
endPage = startPage;
185+
}
186+
187187
try {
188188

189189
StringBuilder sb = new StringBuilder();
@@ -194,39 +194,37 @@ public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toPara
194194
for (int pageNumber = startPage; pageNumber <= endPage; pageNumber++) {
195195

196196
var page = this.document.getPage(pageNumber);
197+
float pageHeight = page.getMediaBox().getHeight();
197198

198-
int fromPosition = fromParagraph.position();
199-
int toPosition = toParagraph.position();
200-
201-
if (this.config.reversedParagraphPosition) {
202-
fromPosition = (int) (page.getMediaBox().getHeight() - fromPosition);
203-
toPosition = (int) (page.getMediaBox().getHeight() - toPosition);
204-
}
205-
206-
int x0 = (int) page.getMediaBox().getLowerLeftX();
207-
int xW = (int) page.getMediaBox().getWidth();
199+
int fromPos = fromParagraph.position();
200+
int toPos = (fromParagraph != toParagraph) ? toParagraph.position() : 0;
208201

209-
int y0 = (int) page.getMediaBox().getLowerLeftY();
210-
int yW = (int) page.getMediaBox().getHeight();
202+
int x = (int) page.getMediaBox().getLowerLeftX();
203+
int w = (int) page.getMediaBox().getWidth();
204+
int y, h;
211205

212-
if (pageNumber == startPage) {
213-
y0 = fromPosition;
214-
yW = (int) page.getMediaBox().getHeight() - y0;
206+
if (pageNumber == startPage && pageNumber == endPage) {
207+
y = toPos;
208+
h = fromPos - toPos;
215209
}
216-
if (pageNumber == endPage) {
217-
yW = toPosition - y0;
210+
else if (pageNumber == startPage) {
211+
y = 0;
212+
h = fromPos;
218213
}
219-
220-
if ((y0 + yW) == (int) page.getMediaBox().getHeight()) {
221-
yW = yW - this.config.pageBottomMargin;
214+
else if (pageNumber == endPage) {
215+
y = toPos;
216+
h = (int) pageHeight - toPos;
217+
}
218+
else {
219+
y = 0;
220+
h = (int) pageHeight;
222221
}
223222

224-
if (y0 == 0) {
225-
y0 = y0 + this.config.pageTopMargin;
226-
yW = yW - this.config.pageTopMargin;
223+
if (h < 0) {
224+
h = 0;
227225
}
228226

229-
pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x0, y0, xW, yW));
227+
pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x, y, w, h));
230228
pdfTextStripper.extractRegions(page);
231229
var text = pdfTextStripper.getTextForRegion("pdfPageRegion");
232230
if (StringUtils.hasText(text)) {

document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2023-2024 the original author or authors.
2+
* Copyright 2023-2025 the original author or authors.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -16,15 +16,32 @@
1616

1717
package org.springframework.ai.reader.pdf;
1818

19+
import java.io.ByteArrayOutputStream;
20+
import java.io.IOException;
21+
import java.io.InputStream;
22+
import java.util.List;
23+
24+
import org.apache.pdfbox.Loader;
25+
import org.apache.pdfbox.pdmodel.PDDocument;
26+
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
27+
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
28+
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
1929
import org.junit.jupiter.api.Test;
2030

31+
import org.springframework.ai.document.Document;
2132
import org.springframework.ai.reader.ExtractedTextFormatter;
2233
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
34+
import org.springframework.core.io.ByteArrayResource;
35+
import org.springframework.core.io.ClassPathResource;
36+
import org.springframework.core.io.Resource;
2337

38+
import static org.assertj.core.api.Assertions.assertThat;
2439
import static org.assertj.core.api.Assertions.assertThatThrownBy;
40+
import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
2541

2642
/**
2743
* @author Christian Tzolov
44+
* @author Heonwoo Kim
2845
*/
2946
public class ParagraphPdfDocumentReaderTests {
3047

@@ -50,4 +67,41 @@ public void testPdfWithoutToc() {
5067

5168
}
5269

70+
@Test
71+
void shouldSkipInvalidOutline() throws IOException {
72+
73+
Resource basePdfResource = new ClassPathResource("sample3.pdf");
74+
75+
PDDocument documentToModify;
76+
try (InputStream inputStream = basePdfResource.getInputStream()) {
77+
78+
byte[] pdfBytes = inputStream.readAllBytes();
79+
80+
documentToModify = Loader.loadPDF(pdfBytes);
81+
}
82+
PDDocumentOutline outline = documentToModify.getDocumentCatalog().getDocumentOutline();
83+
if (outline != null && outline.getFirstChild() != null) {
84+
PDOutlineItem chapter2OutlineItem = outline.getFirstChild().getNextSibling();
85+
if (chapter2OutlineItem != null) {
86+
87+
chapter2OutlineItem.setDestination((PDDestination) null);
88+
}
89+
}
90+
ByteArrayOutputStream baos = new ByteArrayOutputStream();
91+
documentToModify.save(baos);
92+
documentToModify.close();
93+
94+
Resource corruptedPdfResource = new ByteArrayResource(baos.toByteArray());
95+
96+
ParagraphPdfDocumentReader reader = new ParagraphPdfDocumentReader(corruptedPdfResource,
97+
PdfDocumentReaderConfig.defaultConfig());
98+
99+
List<Document> documents = assertDoesNotThrow(() -> reader.get());
100+
101+
assertThat(documents).isNotNull();
102+
assertThat(documents).hasSize(2);
103+
assertThat(documents.get(0).getMetadata().get("title")).isEqualTo("Chapter 1");
104+
assertThat(documents.get(1).getMetadata().get("title")).isEqualTo("Chapter 3");
105+
}
106+
53107
}
Binary file not shown.

0 commit comments

Comments
 (0)