GH-1689 Handle StringIndexOutOfBoundsException in PagePdfDocumentReader
- Add test coverage to TextLine
- Use char[] instead of String for TextLine
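- Optimise index handling when reading text lines (guard the `index - 1` lookup when index is 0, access `lastIndex` directly instead of via private accessors)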
Resolves #1689
This commit is contained in:
committed by
Ilayaperumal Gopinathan
parent
865d429451
commit
78a2a2788b
@@ -16,29 +16,38 @@
|
||||
|
||||
package org.springframework.ai.reader.pdf.layout;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* @author Soby Chacko
|
||||
* @author Tibor Tarnai
|
||||
*/
|
||||
|
||||
class TextLine {
|
||||
|
||||
private static final char SPACE_CHARACTER = ' ';
|
||||
|
||||
private int lineLength;
|
||||
private final int lineLength;
|
||||
|
||||
private String line;
|
||||
private final char[] line;
|
||||
|
||||
private int lastIndex;
|
||||
|
||||
TextLine(int lineLength) {
|
||||
this.line = "";
|
||||
if (lineLength < 0) {
|
||||
throw new IllegalArgumentException("Line length cannot be negative");
|
||||
}
|
||||
this.lineLength = lineLength / ForkPDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT;
|
||||
this.completeLineWithSpaces();
|
||||
this.line = new char[this.lineLength];
|
||||
Arrays.fill(this.line, SPACE_CHARACTER);
|
||||
}
|
||||
|
||||
public void writeCharacterAtIndex(final Character character) {
|
||||
character.setIndex(this.computeIndexForCharacter(character));
|
||||
int index = character.getIndex();
|
||||
char characterValue = character.getCharacterValue();
|
||||
if (this.indexIsInBounds(index) && this.line.charAt(index) == SPACE_CHARACTER) {
|
||||
this.line = this.line.substring(0, index) + characterValue
|
||||
+ this.line.substring(index + 1, this.getLineLength());
|
||||
if (this.indexIsInBounds(index) && this.line[index] == SPACE_CHARACTER) {
|
||||
this.line[index] = characterValue;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,7 +56,7 @@ class TextLine {
|
||||
}
|
||||
|
||||
public String getLine() {
|
||||
return this.line;
|
||||
return new String(this.line);
|
||||
}
|
||||
|
||||
private int computeIndexForCharacter(final Character character) {
|
||||
@@ -64,7 +73,7 @@ class TextLine {
|
||||
index = this.findMinimumIndexWithSpaceCharacterFromIndex(index);
|
||||
}
|
||||
else if (isCharacterCloseToPreviousWord) {
|
||||
if (this.line.charAt(index) != SPACE_CHARACTER) {
|
||||
if (this.line[index] != SPACE_CHARACTER) {
|
||||
index = index + 1;
|
||||
}
|
||||
else {
|
||||
@@ -76,52 +85,36 @@ class TextLine {
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isSpaceCharacterAtIndex(int index) {
|
||||
return this.line.charAt(index) != SPACE_CHARACTER;
|
||||
private boolean isNotSpaceCharacterAtIndex(int index) {
|
||||
return this.line[index] != SPACE_CHARACTER;
|
||||
}
|
||||
|
||||
private boolean isNewIndexGreaterThanLastIndex(int index) {
|
||||
int lastIndex = this.getLastIndex();
|
||||
return (index > lastIndex);
|
||||
return index > this.lastIndex;
|
||||
}
|
||||
|
||||
private int getNextValidIndex(int index, boolean isCharacterPartOfPreviousWord) {
|
||||
int nextValidIndex = index;
|
||||
int lastIndex = this.getLastIndex();
|
||||
if (!this.isNewIndexGreaterThanLastIndex(index)) {
|
||||
nextValidIndex = lastIndex + 1;
|
||||
nextValidIndex = this.lastIndex + 1;
|
||||
}
|
||||
if (!isCharacterPartOfPreviousWord && this.isSpaceCharacterAtIndex(index - 1)) {
|
||||
if (!isCharacterPartOfPreviousWord && index > 0 && this.isNotSpaceCharacterAtIndex(index - 1)) {
|
||||
nextValidIndex = nextValidIndex + 1;
|
||||
}
|
||||
this.setLastIndex(nextValidIndex);
|
||||
this.lastIndex = nextValidIndex;
|
||||
return nextValidIndex;
|
||||
}
|
||||
|
||||
private int findMinimumIndexWithSpaceCharacterFromIndex(int index) {
|
||||
int newIndex = index;
|
||||
while (newIndex >= 0 && this.line.charAt(newIndex) == SPACE_CHARACTER) {
|
||||
while (newIndex >= 0 && this.line[newIndex] == SPACE_CHARACTER) {
|
||||
newIndex = newIndex - 1;
|
||||
}
|
||||
return newIndex + 1;
|
||||
}
|
||||
|
||||
private boolean indexIsInBounds(int index) {
|
||||
return (index >= 0 && index < this.lineLength);
|
||||
}
|
||||
|
||||
private void completeLineWithSpaces() {
|
||||
for (int i = 0; i < this.getLineLength(); ++i) {
|
||||
this.line += SPACE_CHARACTER;
|
||||
}
|
||||
}
|
||||
|
||||
private int getLastIndex() {
|
||||
return this.lastIndex;
|
||||
}
|
||||
|
||||
private void setLastIndex(int lastIndex) {
|
||||
this.lastIndex = lastIndex;
|
||||
return index >= 0 && index < this.lineLength;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -29,11 +29,12 @@ import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
/**
|
||||
* @author Christian Tzolov
|
||||
* @author Tibor Tarnai
|
||||
*/
|
||||
public class PagePdfDocumentReaderTests {
|
||||
class PagePdfDocumentReaderTests {
|
||||
|
||||
@Test
|
||||
public void classpathRead() {
|
||||
void classpathRead() {
|
||||
|
||||
PagePdfDocumentReader pdfReader = new PagePdfDocumentReader("classpath:/sample1.pdf",
|
||||
PdfDocumentReaderConfig.builder()
|
||||
@@ -51,10 +52,22 @@ public class PagePdfDocumentReaderTests {
|
||||
|
||||
assertThat(docs).hasSize(4);
|
||||
|
||||
String allText = docs.stream().map(d -> d.getContent()).collect(Collectors.joining(System.lineSeparator()));
|
||||
String allText = docs.stream().map(Document::getContent).collect(Collectors.joining(System.lineSeparator()));
|
||||
|
||||
assertThat(allText).doesNotContain(
|
||||
List.of("Page 1 of 4", "Page 2 of 4", "Page 3 of 4", "Page 4 of 4", "PDF Bookmark Sample"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testIndexOutOfBound() {
|
||||
var documents = new PagePdfDocumentReader("classpath:/sample2.pdf",
|
||||
PdfDocumentReaderConfig.builder()
|
||||
.withPageExtractedTextFormatter(ExtractedTextFormatter.builder().build())
|
||||
.withPagesPerDocument(1)
|
||||
.build())
|
||||
.get();
|
||||
|
||||
assertThat(documents).hasSize(64);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -0,0 +1,134 @@
|
||||
/*
|
||||
* Copyright 2023-2024 the original author or authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* https://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.springframework.ai.reader.pdf.layout;
|
||||
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.Arguments;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
|
||||
/**
|
||||
* @author Tibor Tarnai
|
||||
*/
|
||||
|
||||
class TextLineTest {
|
||||
|
||||
public static Stream<Arguments> testWriteCharacterAtIndexValidIndex() {
|
||||
return Stream.of(Arguments.of(new Character('A', 0, false, false, false, false)),
|
||||
Arguments.of(new Character('A', 10, true, false, false, false)),
|
||||
Arguments.of(new Character('A', 0, false, true, false, false)));
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource
|
||||
void testWriteCharacterAtIndexValidIndex(Character character) {
|
||||
TextLine textLine = new TextLine(100);
|
||||
textLine.writeCharacterAtIndex(character);
|
||||
assertEquals(" A" + " ".repeat(23), textLine.getLine());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testWriteCharacterAtIndex_PartOfPreviousWord() {
|
||||
TextLine textLine = new TextLine(100);
|
||||
Character character = new Character('A', 10, true, false, false, false);
|
||||
textLine.writeCharacterAtIndex(character);
|
||||
assertEquals(" A" + " ".repeat(23), textLine.getLine());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testWriteCharacterAtIndex_BeginningOfNewLine() {
|
||||
TextLine textLine = new TextLine(100);
|
||||
Character character = new Character('A', 0, false, true, false, false);
|
||||
textLine.writeCharacterAtIndex(character);
|
||||
assertEquals(" A" + " ".repeat(23), textLine.getLine());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testWriteCharacterAtIndex_InvalidIndex() {
|
||||
TextLine textLine = new TextLine(100);
|
||||
Character character = new Character('A', 150, false, false, false, false);
|
||||
textLine.writeCharacterAtIndex(character);
|
||||
assertEquals(" ".repeat(25), textLine.getLine());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testWriteCharacterAtIndex_NegativeIndex() {
|
||||
TextLine textLine = new TextLine(100);
|
||||
Character character = new Character('A', -1, false, false, false, false);
|
||||
textLine.writeCharacterAtIndex(character);
|
||||
assertEquals(" ".repeat(25), textLine.getLine());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testWriteCharacterAtIndex_SpaceCharacter() {
|
||||
TextLine textLine = new TextLine(100);
|
||||
Character character = new Character('A', 10, false, false, false, false);
|
||||
textLine.writeCharacterAtIndex(character);
|
||||
assertEquals(" ".repeat(10) + "A" + " ".repeat(14), textLine.getLine());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testWriteCharacterAtIndex_CloseToPreviousWord() {
|
||||
TextLine textLine = new TextLine(100);
|
||||
Character character = new Character('A', 10, false, false, true, false);
|
||||
textLine.writeCharacterAtIndex(character);
|
||||
assertEquals(" ".repeat(10) + "A" + " ".repeat(14), textLine.getLine());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testGetLineLength() {
|
||||
TextLine textLine = new TextLine(100);
|
||||
assertEquals(100 / ForkPDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT, textLine.getLineLength());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testGetLine() {
|
||||
TextLine textLine = new TextLine(100);
|
||||
assertEquals(" ".repeat(100 / ForkPDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT),
|
||||
textLine.getLine());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testNegativeLineLength() {
|
||||
IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, () -> new TextLine(-100));
|
||||
assertEquals("Line length cannot be negative", exception.getMessage());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testComputeIndexForCharacter_CloseToPreviousWord() {
|
||||
TextLine textLine = new TextLine(100);
|
||||
Character character = new Character('A', 10, true, false, true, true);
|
||||
textLine.writeCharacterAtIndex(character);
|
||||
assertEquals(" A" + " ".repeat(23), textLine.getLine());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testComputeIndexForCharacter_CloseToPreviousWord_WriteTwoCharacters() {
|
||||
TextLine textLine = new TextLine(100);
|
||||
Character character = new Character('A', 10, true, false, true, true);
|
||||
Character anotherCharacter = new Character('B', 1, true, false, true, true);
|
||||
textLine.writeCharacterAtIndex(character);
|
||||
textLine.writeCharacterAtIndex(anotherCharacter);
|
||||
assertEquals(" AB" + " ".repeat(22), textLine.getLine());
|
||||
}
|
||||
|
||||
}
|
||||
BIN
document-readers/pdf-reader/src/test/resources/sample2.pdf
Normal file
BIN
document-readers/pdf-reader/src/test/resources/sample2.pdf
Normal file
Binary file not shown.
Reference in New Issue
Block a user