Fix getTextBetweenParagraphs to skip outline items with invalid page numbers

WOONBE · ilayaperumalg · commit d92a2ea35152 · 2025-06-25T10:47:29.000+01:00
Fix ParagraphPdfDocumentReader to reliably extract text from PDFs with imperfect outlines and coordinate edge cases. Test: add a test validating that ParagraphPdfDocumentReader skips an invalid outline item. Auto-cherry-pick to 1.0.x. Fixes #3421. Signed-off-by: WOONBE <kepull2918@naver.com>
diff --git a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReader.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023-2024 the original author or authors.
+ * Copyright 2023-2025 the original author or authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -46,6 +46,7 @@
  * The paragraphs are grouped into {@link Document} objects.
  *
  * @author Christian Tzolov
+ * @author Heonwoo Kim
  */
 public class ParagraphPdfDocumentReader implements DocumentReader {
 
@@ -127,29 +128,18 @@ public ParagraphPdfDocumentReader(Resource pdfResource, PdfDocumentReaderConfig
 	 */
 	@Override
 	public List<Document> get() {
-
 		var paragraphs = this.paragraphTextExtractor.flatten();
-
-		List<Document> documents = new ArrayList<>(paragraphs.size());
-
-		if (!CollectionUtils.isEmpty(paragraphs)) {
-			logger.info("Start processing paragraphs from PDF");
-			Iterator<Paragraph> itr = paragraphs.iterator();
-
-			var current = itr.next();
-
-			if (!itr.hasNext()) {
-				documents.add(toDocument(current, current));
-			}
-			else {
-				while (itr.hasNext()) {
-					var next = itr.next();
-					Document document = toDocument(current, next);
-					if (document != null && StringUtils.hasText(document.getText())) {
-						documents.add(toDocument(current, next));
-					}
-					current = next;
-				}
+		List<Document> documents = new ArrayList<>();
+		if (CollectionUtils.isEmpty(paragraphs)) {
+			return documents;
+		}
+		logger.info("Start processing paragraphs from PDF");
+		for (int i = 0; i < paragraphs.size(); i++) {
+			Paragraph from = paragraphs.get(i);
+			Paragraph to = (i + 1 < paragraphs.size()) ? paragraphs.get(i + 1) : from;
+			Document document = toDocument(from, to);
+			if (document != null && StringUtils.hasText(document.getText())) {
+				documents.add(document);
 			}
 		}
 		logger.info("End processing paragraphs from PDF");
@@ -173,17 +163,27 @@ protected Document toDocument(Paragraph from, Paragraph to) {
 	protected void addMetadata(Paragraph from, Paragraph to, Document document) {
 		document.getMetadata().put(METADATA_TITLE, from.title());
 		document.getMetadata().put(METADATA_START_PAGE, from.startPageNumber());
-		document.getMetadata().put(METADATA_END_PAGE, to.startPageNumber());
+		document.getMetadata().put(METADATA_END_PAGE, from.endPageNumber()); // paragraph's own end page
 		document.getMetadata().put(METADATA_LEVEL, from.level());
 		document.getMetadata().put(METADATA_FILE_NAME, this.resourceFileName);
 	}
 
 	public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toParagraph) {
 
+		if (fromParagraph.startPageNumber() < 1) {
+			logger.warn("Skipping paragraph titled '{}' because it has an invalid start page number: {}",
+					fromParagraph.title(), fromParagraph.startPageNumber());
+			return "";
+		}
+
 		// Page started from index 0, while PDFBOx getPage return them from index 1.
 		int startPage = fromParagraph.startPageNumber() - 1;
 		int endPage = toParagraph.startPageNumber() - 1;
 
+		if (fromParagraph == toParagraph || endPage < startPage) {
+			endPage = startPage;
+		}
+
 		try {
 
 			StringBuilder sb = new StringBuilder();
@@ -194,39 +194,37 @@ public String getTextBetweenParagraphs(Paragraph fromParagraph, Paragraph toPara
 			for (int pageNumber = startPage; pageNumber <= endPage; pageNumber++) {
 
 				var page = this.document.getPage(pageNumber);
+				float pageHeight = page.getMediaBox().getHeight();
 
-				int fromPosition = fromParagraph.position();
-				int toPosition = toParagraph.position();
-
-				if (this.config.reversedParagraphPosition) {
-					fromPosition = (int) (page.getMediaBox().getHeight() - fromPosition);
-					toPosition = (int) (page.getMediaBox().getHeight() - toPosition);
-				}
-
-				int x0 = (int) page.getMediaBox().getLowerLeftX();
-				int xW = (int) page.getMediaBox().getWidth();
+				int fromPos = fromParagraph.position();
+				int toPos = (fromParagraph != toParagraph) ? toParagraph.position() : 0;
 
-				int y0 = (int) page.getMediaBox().getLowerLeftY();
-				int yW = (int) page.getMediaBox().getHeight();
+				int x = (int) page.getMediaBox().getLowerLeftX();
+				int w = (int) page.getMediaBox().getWidth();
+				int y, h;
 
-				if (pageNumber == startPage) {
-					y0 = fromPosition;
-					yW = (int) page.getMediaBox().getHeight() - y0;
+				if (pageNumber == startPage && pageNumber == endPage) {
+					y = toPos;
+					h = fromPos - toPos;
 				}
-				if (pageNumber == endPage) {
-					yW = toPosition - y0;
+				else if (pageNumber == startPage) {
+					y = 0;
+					h = fromPos;
 				}
-
-				if ((y0 + yW) == (int) page.getMediaBox().getHeight()) {
-					yW = yW - this.config.pageBottomMargin;
+				else if (pageNumber == endPage) {
+					y = toPos;
+					h = (int) pageHeight - toPos;
+				}
+				else {
+					y = 0;
+					h = (int) pageHeight;
 				}
 
-				if (y0 == 0) {
-					y0 = y0 + this.config.pageTopMargin;
-					yW = yW - this.config.pageTopMargin;
+				if (h < 0) {
+					h = 0;
 				}
 
-				pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x0, y0, xW, yW));
+				pdfTextStripper.addRegion("pdfPageRegion", new Rectangle(x, y, w, h));
 				pdfTextStripper.extractRegions(page);
 				var text = pdfTextStripper.getTextForRegion("pdfPageRegion");
 				if (StringUtils.hasText(text)) {
diff --git a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/ParagraphPdfDocumentReaderTests.java
@@ -1,5 +1,5 @@
 /*
- * Copyright 2023-2024 the original author or authors.
+ * Copyright 2023-2025 the original author or authors.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,15 +16,32 @@
 
 package org.springframework.ai.reader.pdf;
 
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
 import org.junit.jupiter.api.Test;
 
+import org.springframework.ai.document.Document;
 import org.springframework.ai.reader.ExtractedTextFormatter;
 import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
+import org.springframework.core.io.ByteArrayResource;
+import org.springframework.core.io.ClassPathResource;
+import org.springframework.core.io.Resource;
 
+import static org.assertj.core.api.Assertions.assertThat;
 import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
 
 /**
  * @author Christian Tzolov
+ * @author Heonwoo Kim
  */
 public class ParagraphPdfDocumentReaderTests {
 
@@ -50,4 +67,41 @@ public void testPdfWithoutToc() {
 
 	}
 
+	/**
+	 * Verifies that reading does not throw, and that only "Chapter 1" and "Chapter 3" are
+	 * returned, when the second top-level outline item has had its destination removed.
+	 */
+	@Test
+	void shouldSkipInvalidOutline() throws IOException {
+		Resource basePdfResource = new ClassPathResource("sample3.pdf");
+		ByteArrayOutputStream baos = new ByteArrayOutputStream();
+
+		// try-with-resources closes the stream and the PDDocument even if a step below throws.
+		try (InputStream inputStream = basePdfResource.getInputStream();
+				PDDocument documentToModify = Loader.loadPDF(inputStream.readAllBytes())) {
+
+			PDDocumentOutline outline = documentToModify.getDocumentCatalog().getDocumentOutline();
+			// Fail fast if the fixture lacks the expected outline instead of silently skipping.
+			assertThat(outline).isNotNull();
+			assertThat(outline.getFirstChild()).isNotNull();
+			PDOutlineItem chapter2OutlineItem = outline.getFirstChild().getNextSibling();
+			assertThat(chapter2OutlineItem).isNotNull();
+
+			// Remove the destination so this outline item no longer points at a page.
+			chapter2OutlineItem.setDestination((PDDestination) null);
+			documentToModify.save(baos);
+		}
+
+		Resource corruptedPdfResource = new ByteArrayResource(baos.toByteArray());
+		ParagraphPdfDocumentReader reader = new ParagraphPdfDocumentReader(corruptedPdfResource,
+				PdfDocumentReaderConfig.defaultConfig());
+
+		List<Document> documents = assertDoesNotThrow(() -> reader.get());
+
+		assertThat(documents).isNotNull();
+		assertThat(documents).hasSize(2);
+		assertThat(documents.get(0).getMetadata().get("title")).isEqualTo("Chapter 1");
+		assertThat(documents.get(1).getMetadata().get("title")).isEqualTo("Chapter 3");
+	}
+
 }
diff --git a/document-readers/pdf-reader/src/test/resources/sample3.pdf b/document-readers/pdf-reader/src/test/resources/sample3.pdf