diff --git a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/layout/TextLine.java b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/layout/TextLine.java index cd6e0002c..4b76b1491 100644 --- a/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/layout/TextLine.java +++ b/document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/layout/TextLine.java @@ -16,29 +16,38 @@ package org.springframework.ai.reader.pdf.layout; +import java.util.Arrays; + +/* + * @author Soby Chacko + * @author Tibor Tarnai + */ + class TextLine { private static final char SPACE_CHARACTER = ' '; - private int lineLength; + private final int lineLength; - private String line; + private final char[] line; private int lastIndex; TextLine(int lineLength) { - this.line = ""; + if (lineLength < 0) { + throw new IllegalArgumentException("Line length cannot be negative"); + } this.lineLength = lineLength / ForkPDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT; - this.completeLineWithSpaces(); + this.line = new char[this.lineLength]; + Arrays.fill(this.line, SPACE_CHARACTER); } public void writeCharacterAtIndex(final Character character) { character.setIndex(this.computeIndexForCharacter(character)); int index = character.getIndex(); char characterValue = character.getCharacterValue(); - if (this.indexIsInBounds(index) && this.line.charAt(index) == SPACE_CHARACTER) { - this.line = this.line.substring(0, index) + characterValue - + this.line.substring(index + 1, this.getLineLength()); + if (this.indexIsInBounds(index) && this.line[index] == SPACE_CHARACTER) { + this.line[index] = characterValue; } } @@ -47,7 +56,7 @@ class TextLine { } public String getLine() { - return this.line; + return new String(this.line); } private int computeIndexForCharacter(final Character character) { @@ -64,7 +73,7 @@ class TextLine { index = this.findMinimumIndexWithSpaceCharacterFromIndex(index); } else if (isCharacterCloseToPreviousWord) { - if (this.line.charAt(index) != SPACE_CHARACTER) { + if (this.line[index] != SPACE_CHARACTER) { index = index + 1; } else { @@ -76,52 +85,36 @@ class TextLine { } } - private boolean isSpaceCharacterAtIndex(int index) { - return this.line.charAt(index) != SPACE_CHARACTER; + private boolean isNotSpaceCharacterAtIndex(int index) { + return this.line[index] != SPACE_CHARACTER; } private boolean isNewIndexGreaterThanLastIndex(int index) { - int lastIndex = this.getLastIndex(); - return (index > lastIndex); + return index > this.lastIndex; } private int getNextValidIndex(int index, boolean isCharacterPartOfPreviousWord) { int nextValidIndex = index; - int lastIndex = this.getLastIndex(); if (!this.isNewIndexGreaterThanLastIndex(index)) { - nextValidIndex = lastIndex + 1; + nextValidIndex = this.lastIndex + 1; } - if (!isCharacterPartOfPreviousWord && this.isSpaceCharacterAtIndex(index - 1)) { + if (!isCharacterPartOfPreviousWord && index > 0 && this.isNotSpaceCharacterAtIndex(index - 1)) { nextValidIndex = nextValidIndex + 1; } - this.setLastIndex(nextValidIndex); + this.lastIndex = nextValidIndex; return nextValidIndex; } private int findMinimumIndexWithSpaceCharacterFromIndex(int index) { int newIndex = index; - while (newIndex >= 0 && this.line.charAt(newIndex) == SPACE_CHARACTER) { + while (newIndex >= 0 && this.line[newIndex] == SPACE_CHARACTER) { newIndex = newIndex - 1; } return newIndex + 1; } private boolean indexIsInBounds(int index) { - return (index >= 0 && index < this.lineLength); - } - - private void completeLineWithSpaces() { - for (int i = 0; i < this.getLineLength(); ++i) { - this.line += SPACE_CHARACTER; - } - } - - private int getLastIndex() { - return this.lastIndex; - } - - private void setLastIndex(int lastIndex) { - this.lastIndex = lastIndex; + return index >= 0 && index < this.lineLength; } } diff --git a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/PagePdfDocumentReaderTests.java b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/PagePdfDocumentReaderTests.java index 71c230faf..a9009bbfc 100644 --- a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/PagePdfDocumentReaderTests.java +++ b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/PagePdfDocumentReaderTests.java @@ -29,11 +29,12 @@ import static org.assertj.core.api.Assertions.assertThat; /** * @author Christian Tzolov + * @author Tibor Tarnai */ -public class PagePdfDocumentReaderTests { +class PagePdfDocumentReaderTests { @Test - public void classpathRead() { + void classpathRead() { PagePdfDocumentReader pdfReader = new PagePdfDocumentReader("classpath:/sample1.pdf", PdfDocumentReaderConfig.builder() @@ -51,10 +52,22 @@ public class PagePdfDocumentReaderTests { assertThat(docs).hasSize(4); - String allText = docs.stream().map(d -> d.getContent()).collect(Collectors.joining(System.lineSeparator())); + String allText = docs.stream().map(Document::getContent).collect(Collectors.joining(System.lineSeparator())); assertThat(allText).doesNotContain( List.of("Page 1 of 4", "Page 2 of 4", "Page 3 of 4", "Page 4 of 4", "PDF Bookmark Sample")); } + @Test + void testIndexOutOfBound() { + var documents = new PagePdfDocumentReader("classpath:/sample2.pdf", + PdfDocumentReaderConfig.builder() + .withPageExtractedTextFormatter(ExtractedTextFormatter.builder().build()) + .withPagesPerDocument(1) + .build()) + .get(); + + assertThat(documents).hasSize(64); + } + } diff --git a/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/layout/TextLineTest.java b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/layout/TextLineTest.java new file mode 100644 index 000000000..5836f2b58 --- /dev/null +++ b/document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/layout/TextLineTest.java @@ -0,0 +1,134 @@ +/* + * Copyright 2023-2024 the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.springframework.ai.reader.pdf.layout; + +import java.util.stream.Stream; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +/* + * @author Tibor Tarnai + */ + +class TextLineTest { + + public static Stream testWriteCharacterAtIndexValidIndex() { + return Stream.of(Arguments.of(new Character('A', 0, false, false, false, false)), + Arguments.of(new Character('A', 10, true, false, false, false)), + Arguments.of(new Character('A', 0, false, true, false, false))); + } + + @ParameterizedTest + @MethodSource + void testWriteCharacterAtIndexValidIndex(Character character) { + TextLine textLine = new TextLine(100); + textLine.writeCharacterAtIndex(character); + assertEquals(" A" + " ".repeat(23), textLine.getLine()); + } + + @Test + void testWriteCharacterAtIndex_PartOfPreviousWord() { + TextLine textLine = new TextLine(100); + Character character = new Character('A', 10, true, false, false, false); + textLine.writeCharacterAtIndex(character); + assertEquals(" A" + " ".repeat(23), textLine.getLine()); + } + + @Test + void testWriteCharacterAtIndex_BeginningOfNewLine() { + TextLine textLine = new TextLine(100); + Character character = new Character('A', 0, false, true, false, false); + textLine.writeCharacterAtIndex(character); + assertEquals(" A" + " ".repeat(23), textLine.getLine()); + } + + @Test + void testWriteCharacterAtIndex_InvalidIndex() { + TextLine textLine = new TextLine(100); + Character character = new Character('A', 150, false, false, false, false); + textLine.writeCharacterAtIndex(character); + assertEquals(" ".repeat(25), textLine.getLine()); + } + + @Test + void testWriteCharacterAtIndex_NegativeIndex() { + TextLine textLine = new TextLine(100); + Character character = new Character('A', -1, false, false, false, false); + textLine.writeCharacterAtIndex(character); + assertEquals(" ".repeat(25), textLine.getLine()); + } + + @Test + void testWriteCharacterAtIndex_SpaceCharacter() { + TextLine textLine = new TextLine(100); + Character character = new Character('A', 10, false, false, false, false); + textLine.writeCharacterAtIndex(character); + assertEquals(" ".repeat(10) + "A" + " ".repeat(14), textLine.getLine()); + } + + @Test + void testWriteCharacterAtIndex_CloseToPreviousWord() { + TextLine textLine = new TextLine(100); + Character character = new Character('A', 10, false, false, true, false); + textLine.writeCharacterAtIndex(character); + assertEquals(" ".repeat(10) + "A" + " ".repeat(14), textLine.getLine()); + } + + @Test + void testGetLineLength() { + TextLine textLine = new TextLine(100); + assertEquals(100 / ForkPDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT, textLine.getLineLength()); + } + + @Test + void testGetLine() { + TextLine textLine = new TextLine(100); + assertEquals(" ".repeat(100 / ForkPDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT), + textLine.getLine()); + } + + @Test + void testNegativeLineLength() { + IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, () -> new TextLine(-100)); + assertEquals("Line length cannot be negative", exception.getMessage()); + } + + @Test + void testComputeIndexForCharacter_CloseToPreviousWord() { + TextLine textLine = new TextLine(100); + Character character = new Character('A', 10, true, false, true, true); + textLine.writeCharacterAtIndex(character); + assertEquals(" A" + " ".repeat(23), textLine.getLine()); + } + + @Test + void testComputeIndexForCharacter_CloseToPreviousWord_WriteTwoCharacters() { + TextLine textLine = new TextLine(100); + Character character = new Character('A', 10, true, false, true, true); + Character anotherCharacter = new Character('B', 1, true, false, true, true); + textLine.writeCharacterAtIndex(character); + textLine.writeCharacterAtIndex(anotherCharacter); + assertEquals(" AB" + " ".repeat(22), textLine.getLine()); + } + +} diff --git a/document-readers/pdf-reader/src/test/resources/sample2.pdf b/document-readers/pdf-reader/src/test/resources/sample2.pdf new file mode 100644 index 000000000..c6657bf05 Binary files /dev/null and b/document-readers/pdf-reader/src/test/resources/sample2.pdf differ