Introduce checkstyle plugin

- Based on https://github.com/spring-io/spring-javaformat
- In this iteration, Checkstyle checks are only enabled for spring-ai-core
This commit is contained in:
Soby Chacko
2024-10-24 10:39:48 -04:00
committed by Mark Pollack
parent 33a72417e1
commit 8e758dbd00
1412 changed files with 26997 additions and 21963 deletions

View File

@@ -1,4 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright 2023-2024 the original author or authors.
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ https://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">

View File

@@ -1,18 +1,45 @@
package org.springframework.ai.reader.markdown;
/*
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.commonmark.node.*;
import org.commonmark.parser.Parser;
import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentReader;
import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;
package org.springframework.ai.reader.markdown;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import org.commonmark.node.AbstractVisitor;
import org.commonmark.node.BlockQuote;
import org.commonmark.node.Code;
import org.commonmark.node.FencedCodeBlock;
import org.commonmark.node.HardLineBreak;
import org.commonmark.node.Heading;
import org.commonmark.node.ListItem;
import org.commonmark.node.Node;
import org.commonmark.node.SoftLineBreak;
import org.commonmark.node.Text;
import org.commonmark.node.ThematicBreak;
import org.commonmark.parser.Parser;
import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentReader;
import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;
/**
* Reads the given Markdown resource and groups headers, paragraphs, or text divided by
* horizontal lines (depending on the
@@ -58,10 +85,10 @@ public class MarkdownDocumentReader implements DocumentReader {
*/
@Override
public List<Document> get() {
try (var input = markdownResource.getInputStream()) {
Node node = parser.parseReader(new InputStreamReader(input));
try (var input = this.markdownResource.getInputStream()) {
Node node = this.parser.parseReader(new InputStreamReader(input));
DocumentVisitor documentVisitor = new DocumentVisitor(config);
DocumentVisitor documentVisitor = new DocumentVisitor(this.config);
node.accept(documentVisitor);
return documentVisitor.getDocuments();
@@ -90,7 +117,7 @@ public class MarkdownDocumentReader implements DocumentReader {
@Override
public void visit(org.commonmark.node.Document document) {
currentDocumentBuilder = Document.builder();
this.currentDocumentBuilder = Document.builder();
super.visit(document);
}
@@ -102,7 +129,7 @@ public class MarkdownDocumentReader implements DocumentReader {
@Override
public void visit(ThematicBreak thematicBreak) {
if (config.horizontalRuleCreateDocument) {
if (this.config.horizontalRuleCreateDocument) {
buildAndFlush();
}
super.visit(thematicBreak);
@@ -128,32 +155,32 @@ public class MarkdownDocumentReader implements DocumentReader {
@Override
public void visit(BlockQuote blockQuote) {
if (!config.includeBlockquote) {
if (!this.config.includeBlockquote) {
buildAndFlush();
}
translateLineBreakToSpace();
currentDocumentBuilder.withMetadata("category", "blockquote");
this.currentDocumentBuilder.withMetadata("category", "blockquote");
super.visit(blockQuote);
}
@Override
public void visit(Code code) {
currentParagraphs.add(code.getLiteral());
currentDocumentBuilder.withMetadata("category", "code_inline");
this.currentParagraphs.add(code.getLiteral());
this.currentDocumentBuilder.withMetadata("category", "code_inline");
super.visit(code);
}
@Override
public void visit(FencedCodeBlock fencedCodeBlock) {
if (!config.includeCodeBlock) {
if (!this.config.includeCodeBlock) {
buildAndFlush();
}
translateLineBreakToSpace();
currentParagraphs.add(fencedCodeBlock.getLiteral());
currentDocumentBuilder.withMetadata("category", "code_block");
currentDocumentBuilder.withMetadata("lang", fencedCodeBlock.getInfo());
this.currentParagraphs.add(fencedCodeBlock.getLiteral());
this.currentDocumentBuilder.withMetadata("category", "code_block");
this.currentDocumentBuilder.withMetadata("lang", fencedCodeBlock.getInfo());
buildAndFlush();
@@ -163,11 +190,11 @@ public class MarkdownDocumentReader implements DocumentReader {
@Override
public void visit(Text text) {
if (text.getParent() instanceof Heading heading) {
currentDocumentBuilder.withMetadata("category", "header_%d".formatted(heading.getLevel()))
this.currentDocumentBuilder.withMetadata("category", "header_%d".formatted(heading.getLevel()))
.withMetadata("title", text.getLiteral());
}
else {
currentParagraphs.add(text.getLiteral());
this.currentParagraphs.add(text.getLiteral());
}
super.visit(text);
@@ -176,29 +203,29 @@ public class MarkdownDocumentReader implements DocumentReader {
public List<Document> getDocuments() {
buildAndFlush();
return documents;
return this.documents;
}
private void buildAndFlush() {
if (!currentParagraphs.isEmpty()) {
String content = String.join("", currentParagraphs);
if (!this.currentParagraphs.isEmpty()) {
String content = String.join("", this.currentParagraphs);
Document.Builder builder = currentDocumentBuilder.withContent(content);
Document.Builder builder = this.currentDocumentBuilder.withContent(content);
config.additionalMetadata.forEach(builder::withMetadata);
this.config.additionalMetadata.forEach(builder::withMetadata);
Document document = builder.build();
documents.add(document);
this.documents.add(document);
currentParagraphs.clear();
this.currentParagraphs.clear();
}
currentDocumentBuilder = Document.builder();
this.currentDocumentBuilder = Document.builder();
}
private void translateLineBreakToSpace() {
if (!currentParagraphs.isEmpty()) {
currentParagraphs.add(" ");
if (!this.currentParagraphs.isEmpty()) {
this.currentParagraphs.add(" ");
}
}

View File

@@ -1,12 +1,28 @@
/*
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.springframework.ai.reader.markdown.config;
import java.util.HashMap;
import java.util.Map;
import org.springframework.ai.document.Document;
import org.springframework.ai.reader.markdown.MarkdownDocumentReader;
import org.springframework.util.Assert;
import java.util.HashMap;
import java.util.Map;
/**
* Common configuration for the {@link MarkdownDocumentReader}.
*
@@ -23,10 +39,10 @@ public class MarkdownDocumentReaderConfig {
public final Map<String, Object> additionalMetadata;
public MarkdownDocumentReaderConfig(Builder builder) {
horizontalRuleCreateDocument = builder.horizontalRuleCreateDocument;
includeCodeBlock = builder.includeCodeBlock;
includeBlockquote = builder.includeBlockquote;
additionalMetadata = builder.additionalMetadata;
this.horizontalRuleCreateDocument = builder.horizontalRuleCreateDocument;
this.includeCodeBlock = builder.includeCodeBlock;
this.includeBlockquote = builder.includeBlockquote;
this.additionalMetadata = builder.additionalMetadata;
}
/**

View File

@@ -1,12 +1,29 @@
package org.springframework.ai.reader.markdown;
/*
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.junit.jupiter.api.Test;
import org.springframework.ai.document.Document;
import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
package org.springframework.ai.reader.markdown;
import java.util.List;
import java.util.Map;
import org.junit.jupiter.api.Test;
import org.springframework.ai.document.Document;
import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.groups.Tuple.tuple;

View File

@@ -1,4 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright 2023-2024 the original author or authors.
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ https://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>

View File

@@ -1,11 +1,11 @@
/*
* Copyright 2023 - 2024 the original author or authors.
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -13,9 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.springframework.ai.reader.pdf;
import java.awt.Rectangle;
import java.awt.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
@@ -24,9 +25,9 @@ import java.util.stream.Collectors;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentReader;
import org.springframework.ai.reader.pdf.config.PdfDocumentReaderConfig;
@@ -46,22 +47,22 @@ import org.springframework.util.StringUtils;
*/
public class PagePdfDocumentReader implements DocumentReader {
private final Logger logger = LoggerFactory.getLogger(getClass());
private static final String PDF_PAGE_REGION = "pdfPageRegion";
public static final String METADATA_START_PAGE_NUMBER = "page_number";
public static final String METADATA_END_PAGE_NUMBER = "end_page_number";
public static final String METADATA_FILE_NAME = "file_name";
private static final String PDF_PAGE_REGION = "pdfPageRegion";
protected final PDDocument document;
private PdfDocumentReaderConfig config;
private final Logger logger = LoggerFactory.getLogger(getClass());
protected String resourceFileName;
private PdfDocumentReaderConfig config;
public PagePdfDocumentReader(String resourceUrl) {
this(new DefaultResourceLoader().getResource(resourceUrl));
}
@@ -103,15 +104,15 @@ public class PagePdfDocumentReader implements DocumentReader {
int totalPages = this.document.getDocumentCatalog().getPages().getCount();
int logFrequency = totalPages > 10 ? totalPages / 10 : 1; // if less than 10
// pages, print
// each iteration
// pages, print
// each iteration
int counter = 0;
PDPage lastPage = this.document.getDocumentCatalog().getPages().iterator().next();
for (PDPage page : this.document.getDocumentCatalog().getPages()) {
lastPage = page;
if (counter % logFrequency == 0 && counter / logFrequency < 10) {
logger.info("Processing PDF page: {}", (counter + 1));
this.logger.info("Processing PDF page: {}", (counter + 1));
}
counter++;
@@ -153,7 +154,7 @@ public class PagePdfDocumentReader implements DocumentReader {
readDocuments.add(toDocument(lastPage, pageTextGroupList.stream().collect(Collectors.joining()),
startPageNumber, pageNumber));
}
logger.info("Processing {} pages", totalPages);
this.logger.info("Processing {} pages", totalPages);
return readDocuments;
}

View File

@@ -1,11 +1,11 @@
/*
* Copyright 2023 - 2024 the original author or authors.
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -13,18 +13,19 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.springframework.ai.reader.pdf;
import java.awt.Rectangle;
import java.awt.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentReader;
import org.springframework.ai.reader.pdf.config.ParagraphManager;
@@ -48,8 +49,6 @@ import org.springframework.util.StringUtils;
*/
public class ParagraphPdfDocumentReader implements DocumentReader {
private final Logger logger = LoggerFactory.getLogger(getClass());
// Constants for metadata keys
private static final String METADATA_START_PAGE = "page_number";
@@ -61,14 +60,16 @@ public class ParagraphPdfDocumentReader implements DocumentReader {
private static final String METADATA_FILE_NAME = "file_name";
private final ParagraphManager paragraphTextExtractor;
protected final PDDocument document;
private PdfDocumentReaderConfig config;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final ParagraphManager paragraphTextExtractor;
protected String resourceFileName;
private PdfDocumentReaderConfig config;
/**
* Constructs a ParagraphPdfDocumentReader using a resource URL.
* @param resourceUrl The URL of the PDF resource.
@@ -132,7 +133,7 @@ public class ParagraphPdfDocumentReader implements DocumentReader {
List<Document> documents = new ArrayList<>(paragraphs.size());
if (!CollectionUtils.isEmpty(paragraphs)) {
logger.info("Start processing paragraphs from PDF");
this.logger.info("Start processing paragraphs from PDF");
Iterator<Paragraph> itr = paragraphs.iterator();
var current = itr.next();
@@ -151,7 +152,7 @@ public class ParagraphPdfDocumentReader implements DocumentReader {
}
}
}
logger.info("End processing paragraphs from PDF");
this.logger.info("End processing paragraphs from PDF");
return documents;
}

View File

@@ -1,11 +1,11 @@
/*
* Copyright 2023 - 2024 the original author or authors.
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -13,15 +13,16 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.springframework.ai.reader.pdf.aot;
import java.io.IOException;
import java.util.Set;
import org.springframework.aot.hint.RuntimeHints;
import org.springframework.aot.hint.RuntimeHintsRegistrar;
import org.springframework.core.io.support.PathMatchingResourcePatternResolver;
import java.io.IOException;
import java.util.Set;
/**
* The PdfReaderRuntimeHints class is responsible for registering runtime hints for PDFBox
* resources.

View File

@@ -1,11 +1,11 @@
/*
* Copyright 2023 - 2024 the original author or authors.
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.springframework.ai.reader.pdf.config;
import java.io.IOException;
@@ -39,34 +40,6 @@ import org.springframework.util.CollectionUtils;
*/
public class ParagraphManager {
/**
* Represents a document paragraph metadata and hierarchy.
*
* @param parent Parent paragraph that will contain a children paragraphs.
* @param title Paragraph title as it appears in the PDF document.
* @param level The TOC deepness level for this paragraph. The root is at level 0.
* @param startPageNumber The page number in the PDF where this paragraph begins.
* @param endPageNumber The page number in the PDF where this paragraph ends.
* @param children Sub-paragraphs for this paragraph.
*/
public record Paragraph(Paragraph parent, String title, int level, int startPageNumber, int endPageNumber,
int position, List<Paragraph> children) {
public Paragraph(Paragraph parent, String title, int level, int startPageNumber, int endPageNumber,
int position) {
this(parent, title, level, startPageNumber, endPageNumber, position, new ArrayList<>());
}
@Override
public String toString() {
String indent = (level < 0) ? "" : new String(new char[level * 2]).replace('\0', ' ');
return indent + " " + level + ") " + title + " [" + startPageNumber + "," + endPageNumber + "], children = "
+ children.size() + ", pos = " + position;
}
}
/**
* Root of the paragraphs tree.
*/
@@ -90,7 +63,7 @@ public class ParagraphManager {
new Paragraph(null, "root", -1, 1, this.document.getNumberOfPages(), 0),
this.document.getDocumentCatalog().getDocumentOutline(), 0);
printParagraph(rootParagraph, System.out);
printParagraph(this.rootParagraph, System.out);
}
catch (Exception e) {
throw new RuntimeException(e);
@@ -203,4 +176,32 @@ public class ParagraphManager {
return resultList;
}
/**
 * Represents a document paragraph: its metadata and its place in the paragraph
 * hierarchy.
 *
 * @param parent Parent paragraph that contains this paragraph as one of its children;
 * the root paragraph is created with a {@code null} parent.
 * @param title Paragraph title as it appears in the PDF document.
 * @param level The TOC depth level of this paragraph. The synthetic root paragraph is
 * created with level {@code -1}.
 * @param startPageNumber The page number in the PDF where this paragraph begins.
 * @param endPageNumber The page number in the PDF where this paragraph ends.
 * @param position Position value carried with the paragraph. NOTE(review): presumably
 * the paragraph's vertical position on its start page - confirm against the code that
 * builds the paragraph tree.
 * @param children Sub-paragraphs of this paragraph.
 */
public record Paragraph(Paragraph parent, String title, int level, int startPageNumber, int endPageNumber,
		int position, List<Paragraph> children) {

	/**
	 * Creates a paragraph that starts with an empty, mutable list of children.
	 */
	public Paragraph(Paragraph parent, String title, int level, int startPageNumber, int endPageNumber,
			int position) {
		this(parent, title, level, startPageNumber, endPageNumber, position, new ArrayList<>());
	}

	/**
	 * Renders the paragraph on one line, indented by two spaces per level.
	 */
	@Override
	public String toString() {
		// A negative level (used for the root) gets no indentation; String.repeat
		// would reject a negative count, so the guard must stay.
		String indent = (this.level < 0) ? "" : " ".repeat(this.level * 2);
		return indent + " " + this.level + ") " + this.title + " [" + this.startPageNumber + ","
				+ this.endPageNumber + "], children = " + this.children.size() + ", pos = " + this.position;
	}

}
}

View File

@@ -1,11 +1,11 @@
/*
* Copyright 2023 - 2024 the original author or authors.
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.springframework.ai.reader.pdf.config;
import org.springframework.ai.reader.ExtractedTextFormatter;
@@ -40,6 +41,14 @@ public class PdfDocumentReaderConfig {
public final ExtractedTextFormatter pageExtractedTextFormatter;
/**
 * Copies each setting held by the given builder into this configuration.
 * @param builder the builder supplying the configuration values
 */
private PdfDocumentReaderConfig(PdfDocumentReaderConfig.Builder builder) {
	// Page margins.
	this.pageTopMargin = builder.pageTopMargin;
	this.pageBottomMargin = builder.pageBottomMargin;

	// Remaining settings, copied as-is.
	this.pagesPerDocument = builder.pagesPerDocument;
	this.reversedParagraphPosition = builder.reversedParagraphPosition;
	this.pageExtractedTextFormatter = builder.pageExtractedTextFormatter;
}
/**
* Start building a new configuration.
* @return The entry point for creating a new configuration.
@@ -56,14 +65,6 @@ public class PdfDocumentReaderConfig {
return builder().build();
}
private PdfDocumentReaderConfig(PdfDocumentReaderConfig.Builder builder) {
this.pagesPerDocument = builder.pagesPerDocument;
this.pageBottomMargin = builder.pageBottomMargin;
this.pageTopMargin = builder.pageTopMargin;
this.pageExtractedTextFormatter = builder.pageExtractedTextFormatter;
this.reversedParagraphPosition = builder.reversedParagraphPosition;
}
public static class Builder {
private int pagesPerDocument = 1;

View File

@@ -1,11 +1,11 @@
/*
* Copyright 2023 - 2024 the original author or authors.
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -180,8 +180,9 @@ public class ForkPDFLayoutTextStripper extends PDFTextStripper {
double height = textPosition.getHeight();
int numberOfLines = (int) (Math.floor(textYPosition - previousTextYPosition) / height);
numberOfLines = Math.max(1, numberOfLines - 1); // exclude current new line
if (DEBUG)
if (DEBUG) {
System.out.println(height + " " + numberOfLines);
}
return numberOfLines;
}
else {
@@ -191,7 +192,7 @@ public class ForkPDFLayoutTextStripper extends PDFTextStripper {
private TextLine addNewLine() {
TextLine textLine = new TextLine(this.getCurrentPageWidth());
textLineList.add(textLine);
this.textLineList.add(textLine);
return textLine;
}
@@ -248,7 +249,7 @@ class TextLine {
}
public String getLine() {
return line;
return this.line;
}
private int computeIndexForCharacter(final Character character) {
@@ -313,7 +314,7 @@ class TextLine {
private void completeLineWithSpaces() {
for (int i = 0; i < this.getLineLength(); ++i) {
line += SPACE_CHARACTER;
this.line += SPACE_CHARACTER;
}
}
@@ -350,8 +351,9 @@ class Character {
this.isFirstCharacterOfAWord = isFirstCharacterOfAWord;
this.isCharacterAtTheBeginningOfNewLine = isCharacterAtTheBeginningOfNewLine;
this.isCharacterCloseToPreviousWord = isCharacterPartOfASentence;
if (ForkPDFLayoutTextStripper.DEBUG)
if (ForkPDFLayoutTextStripper.DEBUG) {
System.out.println(this.toString());
}
}
public char getCharacterValue() {
@@ -384,14 +386,14 @@ class Character {
public String toString() {
String toString = "";
toString += index;
toString += this.index;
toString += " ";
toString += characterValue;
toString += " isCharacterPartOfPreviousWord=" + isCharacterPartOfPreviousWord;
toString += " isFirstCharacterOfAWord=" + isFirstCharacterOfAWord;
toString += " isCharacterAtTheBeginningOfNewLine=" + isCharacterAtTheBeginningOfNewLine;
toString += " isCharacterPartOfASentence=" + isCharacterCloseToPreviousWord;
toString += " isCharacterCloseToPreviousWord=" + isCharacterCloseToPreviousWord;
toString += this.characterValue;
toString += " isCharacterPartOfPreviousWord=" + this.isCharacterPartOfPreviousWord;
toString += " isFirstCharacterOfAWord=" + this.isFirstCharacterOfAWord;
toString += " isCharacterAtTheBeginningOfNewLine=" + this.isCharacterAtTheBeginningOfNewLine;
toString += " isCharacterPartOfASentence=" + this.isCharacterCloseToPreviousWord;
toString += " isCharacterCloseToPreviousWord=" + this.isCharacterCloseToPreviousWord;
return toString;
}
@@ -424,12 +426,12 @@ class CharacterFactory {
this.isCharacterCloseToPreviousWord = this.isCharacterCloseToPreviousWord(textPosition);
char character = this.getCharacterFromTextPosition(textPosition);
int index = (int) textPosition.getX() / ForkPDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT;
return new Character(character, index, isCharacterPartOfPreviousWord, isFirstCharacterOfAWord,
isCharacterAtTheBeginningOfNewLine, isCharacterCloseToPreviousWord);
return new Character(character, index, this.isCharacterPartOfPreviousWord, this.isFirstCharacterOfAWord,
this.isCharacterAtTheBeginningOfNewLine, this.isCharacterCloseToPreviousWord);
}
private boolean isCharacterAtTheBeginningOfNewLine(final TextPosition textPosition) {
if (!firstCharacterOfLineFound) {
if (!this.firstCharacterOfLineFound) {
return true;
}
TextPosition previousTextPosition = this.getPreviousTextPosition();
@@ -438,18 +440,18 @@ class CharacterFactory {
}
private boolean isFirstCharacterOfAWord(final TextPosition textPosition) {
if (!firstCharacterOfLineFound) {
if (!this.firstCharacterOfLineFound) {
return true;
}
double numberOfSpaces = this.numberOfSpacesBetweenTwoCharacters(previousTextPosition, textPosition);
double numberOfSpaces = this.numberOfSpacesBetweenTwoCharacters(this.previousTextPosition, textPosition);
return (numberOfSpaces > 1) || this.isCharacterAtTheBeginningOfNewLine(textPosition);
}
private boolean isCharacterCloseToPreviousWord(final TextPosition textPosition) {
if (!firstCharacterOfLineFound) {
if (!this.firstCharacterOfLineFound) {
return false;
}
double numberOfSpaces = this.numberOfSpacesBetweenTwoCharacters(previousTextPosition, textPosition);
double numberOfSpaces = this.numberOfSpacesBetweenTwoCharacters(this.previousTextPosition, textPosition);
return (numberOfSpaces > 1 && numberOfSpaces <= ForkPDFLayoutTextStripper.OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT);
}
@@ -485,4 +487,4 @@ class CharacterFactory {
this.previousTextPosition = previousTextPosition;
}
}
}

View File

@@ -1,11 +1,11 @@
/*
* Copyright 2023 - 2024 the original author or authors.
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.springframework.ai.reader.pdf.layout;
import java.awt.geom.Rectangle2D;
@@ -70,8 +71,8 @@ public class PDFLayoutTextStripperByArea extends ForkPDFLayoutTextStripper {
* java coordinates (y == 0 is top), not PDF coordinates (y == 0 is bottom).
*/
public void addRegion(String regionName, Rectangle2D rect) {
regions.add(regionName);
regionArea.put(regionName, rect);
this.regions.add(regionName);
this.regionArea.put(regionName, rect);
}
/**
@@ -80,8 +81,8 @@ public class PDFLayoutTextStripperByArea extends ForkPDFLayoutTextStripper {
* @param regionName The name of the region to delete.
*/
public void removeRegion(String regionName) {
regions.remove(regionName);
regionArea.remove(regionName);
this.regions.remove(regionName);
this.regionArea.remove(regionName);
}
/**
@@ -89,7 +90,7 @@ public class PDFLayoutTextStripperByArea extends ForkPDFLayoutTextStripper {
* @return A list of java.lang.String objects to identify the region names.
*/
public List<String> getRegions() {
return regions;
return this.regions;
}
/**
@@ -98,7 +99,7 @@ public class PDFLayoutTextStripperByArea extends ForkPDFLayoutTextStripper {
* @return The text that was identified in that region.
*/
public String getTextForRegion(String regionName) {
StringWriter text = regionText.get(regionName);
StringWriter text = this.regionText.get(regionName);
return text.toString();
}
@@ -108,14 +109,14 @@ public class PDFLayoutTextStripperByArea extends ForkPDFLayoutTextStripper {
* @throws IOException If there is an error while extracting text.
*/
public void extractRegions(PDPage page) throws IOException {
for (String regionName : regions) {
for (String regionName : this.regions) {
setStartPage(getCurrentPageNo());
setEndPage(getCurrentPageNo());
// reset the stored text for the region so this class can be reused.
ArrayList<List<TextPosition>> regionCharactersByArticle = new ArrayList<List<TextPosition>>();
regionCharactersByArticle.add(new ArrayList<TextPosition>());
regionCharacterList.put(regionName, regionCharactersByArticle);
regionText.put(regionName, new StringWriter());
this.regionCharacterList.put(regionName, regionCharactersByArticle);
this.regionText.put(regionName, new StringWriter());
}
if (page.hasContents()) {
@@ -128,10 +129,10 @@ public class PDFLayoutTextStripperByArea extends ForkPDFLayoutTextStripper {
*/
@Override
protected void processTextPosition(TextPosition text) {
for (Map.Entry<String, Rectangle2D> regionAreaEntry : regionArea.entrySet()) {
for (Map.Entry<String, Rectangle2D> regionAreaEntry : this.regionArea.entrySet()) {
Rectangle2D rect = regionAreaEntry.getValue();
if (rect.contains(text.getX(), text.getY())) {
charactersByArticle = regionCharacterList.get(regionAreaEntry.getKey());
this.charactersByArticle = this.regionCharacterList.get(regionAreaEntry.getKey());
super.processTextPosition(text);
}
}
@@ -143,9 +144,9 @@ public class PDFLayoutTextStripperByArea extends ForkPDFLayoutTextStripper {
*/
@Override
protected void writePage() throws IOException {
// Runs super.writePage() once per key of regionArea. Before each call, the
// charactersByArticle and output fields are pointed at that region's entries in
// regionCharacterList and regionText.
// NOTE(review): presumably the superclass writes charactersByArticle into output - confirm.
// The first three lines below are the pre-change forms; the next three are the same
// statements with the "this." qualifier added.
for (String region : regionArea.keySet()) {
charactersByArticle = regionCharacterList.get(region);
output = regionText.get(region);
for (String region : this.regionArea.keySet()) {
this.charactersByArticle = this.regionCharacterList.get(region);
this.output = this.regionText.get(region);
super.writePage();
}
}

View File

@@ -1,11 +1,11 @@
/*
* Copyright 2023 - 2024 the original author or authors.
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.springframework.ai.reader.pdf;
import java.util.List;

View File

@@ -1,11 +1,11 @@
/*
* Copyright 2023 - 2024 the original author or authors.
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.springframework.ai.reader.pdf;
import org.junit.jupiter.api.Test;

View File

@@ -1,11 +1,11 @@
/*
* Copyright 2023 - 2024 the original author or authors.
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -13,10 +13,12 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.springframework.ai.reader.pdf.aot;
import org.assertj.core.api.Assertions;
import org.junit.jupiter.api.Test;
import org.springframework.aot.hint.RuntimeHints;
import static org.springframework.aot.hint.predicate.RuntimeHintsPredicates.resource;

View File

@@ -1,4 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright 2023-2024 the original author or authors.
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ https://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>

View File

@@ -1,11 +1,11 @@
/*
* Copyright 2023 - 2024 the original author or authors.
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.springframework.ai.reader.tika;
import java.io.IOException;

View File

@@ -1,11 +1,11 @@
/*
* Copyright 2023 - 2024 the original author or authors.
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.springframework.ai.reader.tika;
import org.junit.jupiter.params.ParameterizedTest;