GH-1689 Handle StringIndexOutOfBoundsException in PagePdfDocumentReader

- Add test coverage to TextLine
    - Use char[] instead of String for TextLine
    - Optimise index handling when reading text lines

Resolves #1689
This commit is contained in:
d050150
2024-11-07 12:15:04 +01:00
committed by Ilayaperumal Gopinathan
parent 865d429451
commit 78a2a2788b
4 changed files with 176 additions and 36 deletions

View File

@@ -16,29 +16,38 @@
package org.springframework.ai.reader.pdf.layout;
import java.util.Arrays;
/*
* @author Soby Chacko
* @author Tibor Tarnai
*/
class TextLine {
private static final char SPACE_CHARACTER = ' ';
private int lineLength;
private final int lineLength;
private String line;
private final char[] line;
private int lastIndex;
/*
 * Creates a line sized from the given length and filled with spaces.
 *
 * NOTE: this is a diff view; removed (String-based) and added (char[]-based)
 * statements appear one after another without +/- markers.
 *
 * Throws IllegalArgumentException when lineLength is negative.
 */
TextLine(int lineLength) {
// String-based initialisation (pairs with the `private String line` field).
this.line = "";
if (lineLength < 0) {
throw new IllegalArgumentException("Line length cannot be negative");
}
// The stored length is the argument divided by the stripper's
// OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT constant (integer division).
this.lineLength = lineLength / ForkPDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT;
// String-based fill: appends one space per slot.
this.completeLineWithSpaces();
// char[]-based fill: allocates this.lineLength slots and sets each to a space.
this.line = new char[this.lineLength];
Arrays.fill(this.line, SPACE_CHARACTER);
}
/*
 * Computes the target index for the character, stores it on the character,
 * and writes the character's value into the line at that index.
 *
 * The write only happens when the index is in bounds and the slot currently
 * holds a space; otherwise the line is left unchanged.
 *
 * NOTE: diff view; the charAt/substring statements are the String-based form
 * and the line[index] statements are the char[]-based form of the same logic.
 */
public void writeCharacterAtIndex(final Character character) {
character.setIndex(this.computeIndexForCharacter(character));
int index = character.getIndex();
char characterValue = character.getCharacterValue();
// String-based form: rebuilds the whole string around the written position.
if (this.indexIsInBounds(index) && this.line.charAt(index) == SPACE_CHARACTER) {
this.line = this.line.substring(0, index) + characterValue
+ this.line.substring(index + 1, this.getLineLength());
// char[]-based form: overwrites the single slot in place.
if (this.indexIsInBounds(index) && this.line[index] == SPACE_CHARACTER) {
this.line[index] = characterValue;
}
}
@@ -47,7 +56,7 @@ class TextLine {
}
/*
 * Returns the current content of the line as a String.
 *
 * NOTE: diff view; the first return is the String-based form, the second
 * builds a new String from the char[] buffer on every call.
 */
public String getLine() {
return this.line;
return new String(this.line);
}
private int computeIndexForCharacter(final Character character) {
@@ -64,7 +73,7 @@ class TextLine {
index = this.findMinimumIndexWithSpaceCharacterFromIndex(index);
}
else if (isCharacterCloseToPreviousWord) {
if (this.line.charAt(index) != SPACE_CHARACTER) {
if (this.line[index] != SPACE_CHARACTER) {
index = index + 1;
}
else {
@@ -76,52 +85,36 @@ class TextLine {
}
}
/*
 * Returns true when the slot at the given index holds something other than a space.
 *
 * NOTE: diff view; two method headers are shown. Both bodies test `!= SPACE_CHARACTER`,
 * so the name isNotSpaceCharacterAtIndex matches the check, while the name
 * isSpaceCharacterAtIndex states the opposite of what its body returns.
 * No bounds check is performed here; the index must be valid for the line.
 */
private boolean isSpaceCharacterAtIndex(int index) {
return this.line.charAt(index) != SPACE_CHARACTER;
private boolean isNotSpaceCharacterAtIndex(int index) {
return this.line[index] != SPACE_CHARACTER;
}
/*
 * Returns true when the given index is strictly greater than the stored lastIndex.
 *
 * NOTE: diff view; the first two statements read lastIndex through getLastIndex(),
 * the final return reads the field directly. Both compare with `>`.
 */
private boolean isNewIndexGreaterThanLastIndex(int index) {
int lastIndex = this.getLastIndex();
return (index > lastIndex);
return index > this.lastIndex;
}
/*
 * Picks the index to use for the next character and records it as lastIndex.
 *
 * Starts from the requested index; when that index is not greater than lastIndex,
 * lastIndex + 1 is used instead. One more position is added when the character is
 * not part of the previous word and the slot before the requested index is not a space.
 *
 * NOTE: diff view; statements using getLastIndex()/setLastIndex()/isSpaceCharacterAtIndex
 * are shown next to their counterparts using the lastIndex field and
 * isNotSpaceCharacterAtIndex.
 */
private int getNextValidIndex(int index, boolean isCharacterPartOfPreviousWord) {
int nextValidIndex = index;
int lastIndex = this.getLastIndex();
if (!this.isNewIndexGreaterThanLastIndex(index)) {
nextValidIndex = lastIndex + 1;
nextValidIndex = this.lastIndex + 1;
}
// First form reads slot index - 1 unconditionally, which is -1 when index is 0.
if (!isCharacterPartOfPreviousWord && this.isSpaceCharacterAtIndex(index - 1)) {
// Second form adds `index > 0`, so slot -1 is never read.
// NOTE(review): index - 1 is not checked against the upper end of the line here;
// confirm callers never pass an index beyond the line length.
if (!isCharacterPartOfPreviousWord && index > 0 && this.isNotSpaceCharacterAtIndex(index - 1)) {
nextValidIndex = nextValidIndex + 1;
}
this.setLastIndex(nextValidIndex);
this.lastIndex = nextValidIndex;
return nextValidIndex;
}
/*
 * Walks left from the given index while the slots hold spaces and returns the
 * position just after where the walk stopped: the start of the run of spaces
 * ending at index, or 0 when the walk reaches the beginning of the line.
 * When the slot at index is not a space, index + 1 is returned.
 *
 * NOTE: diff view; the two while headers are the String-based (charAt) and
 * char[]-based forms of the same loop.
 * NOTE(review): only the lower bound is checked; confirm index is always
 * below the line length when this is called.
 */
private int findMinimumIndexWithSpaceCharacterFromIndex(int index) {
int newIndex = index;
while (newIndex >= 0 && this.line.charAt(newIndex) == SPACE_CHARACTER) {
while (newIndex >= 0 && this.line[newIndex] == SPACE_CHARACTER) {
newIndex = newIndex - 1;
}
return newIndex + 1;
}
/*
 * Returns true when 0 <= index < lineLength.
 *
 * NOTE: diff view; this span also shows the helper methods completeLineWithSpaces,
 * getLastIndex and setLastIndex between the two forms of the return statement.
 * These helpers appear to be the ones dropped by the change, since the char[]-based
 * statements elsewhere use Arrays.fill and the lastIndex field directly.
 * The final `return index >= 0 && ...` is the indexIsInBounds body without the
 * redundant parentheses.
 */
private boolean indexIsInBounds(int index) {
return (index >= 0 && index < this.lineLength);
}
// Appends one space per slot to the String-based line.
private void completeLineWithSpaces() {
for (int i = 0; i < this.getLineLength(); ++i) {
this.line += SPACE_CHARACTER;
}
}
// Plain accessor for lastIndex.
private int getLastIndex() {
return this.lastIndex;
}
// Plain mutator for lastIndex.
private void setLastIndex(int lastIndex) {
this.lastIndex = lastIndex;
return index >= 0 && index < this.lineLength;
}
}

View File

@@ -29,11 +29,12 @@ import static org.assertj.core.api.Assertions.assertThat;
/**
* @author Christian Tzolov
* @author Tibor Tarnai
*/
public class PagePdfDocumentReaderTests {
class PagePdfDocumentReaderTests {
@Test
public void classpathRead() {
void classpathRead() {
PagePdfDocumentReader pdfReader = new PagePdfDocumentReader("classpath:/sample1.pdf",
PdfDocumentReaderConfig.builder()
@@ -51,10 +52,22 @@ public class PagePdfDocumentReaderTests {
assertThat(docs).hasSize(4);
String allText = docs.stream().map(d -> d.getContent()).collect(Collectors.joining(System.lineSeparator()));
String allText = docs.stream().map(Document::getContent).collect(Collectors.joining(System.lineSeparator()));
assertThat(allText).doesNotContain(
List.of("Page 1 of 4", "Page 2 of 4", "Page 3 of 4", "Page 4 of 4", "PDF Bookmark Sample"));
}
/**
 * Reads sample2.pdf with one page per document and a default-built text formatter,
 * and expects the reader to return 64 documents.
 */
@Test
void testIndexOutOfBound() {
    var textFormatter = ExtractedTextFormatter.builder().build();
    var readerConfig = PdfDocumentReaderConfig.builder()
        .withPageExtractedTextFormatter(textFormatter)
        .withPagesPerDocument(1)
        .build();
    var pdfReader = new PagePdfDocumentReader("classpath:/sample2.pdf", readerConfig);

    assertThat(pdfReader.get()).hasSize(64);
}
}

View File

@@ -0,0 +1,134 @@
/*
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.springframework.ai.reader.pdf.layout;
import java.util.stream.Stream;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
/*
* @author Tibor Tarnai
*/
class TextLineTest {
public static Stream<Arguments> testWriteCharacterAtIndexValidIndex() {
return Stream.of(Arguments.of(new Character('A', 0, false, false, false, false)),
Arguments.of(new Character('A', 10, true, false, false, false)),
Arguments.of(new Character('A', 0, false, true, false, false)));
}
@ParameterizedTest
@MethodSource
void testWriteCharacterAtIndexValidIndex(Character character) {
TextLine textLine = new TextLine(100);
textLine.writeCharacterAtIndex(character);
assertEquals(" A" + " ".repeat(23), textLine.getLine());
}
@Test
void testWriteCharacterAtIndex_PartOfPreviousWord() {
TextLine textLine = new TextLine(100);
Character character = new Character('A', 10, true, false, false, false);
textLine.writeCharacterAtIndex(character);
assertEquals(" A" + " ".repeat(23), textLine.getLine());
}
@Test
void testWriteCharacterAtIndex_BeginningOfNewLine() {
TextLine textLine = new TextLine(100);
Character character = new Character('A', 0, false, true, false, false);
textLine.writeCharacterAtIndex(character);
assertEquals(" A" + " ".repeat(23), textLine.getLine());
}
@Test
void testWriteCharacterAtIndex_InvalidIndex() {
TextLine textLine = new TextLine(100);
Character character = new Character('A', 150, false, false, false, false);
textLine.writeCharacterAtIndex(character);
assertEquals(" ".repeat(25), textLine.getLine());
}
@Test
void testWriteCharacterAtIndex_NegativeIndex() {
TextLine textLine = new TextLine(100);
Character character = new Character('A', -1, false, false, false, false);
textLine.writeCharacterAtIndex(character);
assertEquals(" ".repeat(25), textLine.getLine());
}
@Test
void testWriteCharacterAtIndex_SpaceCharacter() {
TextLine textLine = new TextLine(100);
Character character = new Character('A', 10, false, false, false, false);
textLine.writeCharacterAtIndex(character);
assertEquals(" ".repeat(10) + "A" + " ".repeat(14), textLine.getLine());
}
@Test
void testWriteCharacterAtIndex_CloseToPreviousWord() {
TextLine textLine = new TextLine(100);
Character character = new Character('A', 10, false, false, true, false);
textLine.writeCharacterAtIndex(character);
assertEquals(" ".repeat(10) + "A" + " ".repeat(14), textLine.getLine());
}
@Test
void testGetLineLength() {
TextLine textLine = new TextLine(100);
assertEquals(100 / ForkPDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT, textLine.getLineLength());
}
@Test
void testGetLine() {
TextLine textLine = new TextLine(100);
assertEquals(" ".repeat(100 / ForkPDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT),
textLine.getLine());
}
@Test
void testNegativeLineLength() {
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, () -> new TextLine(-100));
assertEquals("Line length cannot be negative", exception.getMessage());
}
@Test
void testComputeIndexForCharacter_CloseToPreviousWord() {
TextLine textLine = new TextLine(100);
Character character = new Character('A', 10, true, false, true, true);
textLine.writeCharacterAtIndex(character);
assertEquals(" A" + " ".repeat(23), textLine.getLine());
}
@Test
void testComputeIndexForCharacter_CloseToPreviousWord_WriteTwoCharacters() {
TextLine textLine = new TextLine(100);
Character character = new Character('A', 10, true, false, true, true);
Character anotherCharacter = new Character('B', 1, true, false, true, true);
textLine.writeCharacterAtIndex(character);
textLine.writeCharacterAtIndex(anotherCharacter);
assertEquals(" AB" + " ".repeat(22), textLine.getLine());
}
}