diff --git a/document-readers/markdown-reader/pom.xml b/document-readers/markdown-reader/pom.xml
new file mode 100644
index 000000000..5922ea2b4
--- /dev/null
+++ b/document-readers/markdown-reader/pom.xml
@@ -0,0 +1,46 @@
+
+
+ 4.0.0
+
+ org.springframework.ai
+ spring-ai
+ 1.0.0-SNAPSHOT
+ ../../pom.xml
+
+ spring-ai-markdown-document-reader
+ jar
+ Spring AI Document Reader - Markdown
+ Spring AI Markdown document reader
+ https://github.com/spring-projects/spring-ai
+
+
+ https://github.com/spring-projects/spring-ai
+ git://github.com/spring-projects/spring-ai.git
+ git@github.com:spring-projects/spring-ai.git
+
+
+
+
+ org.springframework.ai
+ spring-ai-core
+ ${parent.version}
+
+
+
+ org.commonmark
+ commonmark
+ ${commonmark.version}
+
+
+
+
+ org.springframework.boot
+ spring-boot-starter-test
+ test
+
+
+
+
+
diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java
new file mode 100644
index 000000000..7ed8aa6b5
--- /dev/null
+++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java
@@ -0,0 +1,207 @@
+package org.springframework.ai.reader.markdown;
+
+import org.commonmark.node.*;
+import org.commonmark.parser.Parser;
+import org.springframework.ai.document.Document;
+import org.springframework.ai.document.DocumentReader;
+import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
+import org.springframework.core.io.DefaultResourceLoader;
+import org.springframework.core.io.Resource;
+
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Reads the given Markdown resource and groups headers, paragraphs, or text divided by
+ * horizontal lines (depending on the
+ * {@link MarkdownDocumentReaderConfig#horizontalRuleCreateDocument} configuration) into
+ * {@link Document}s.
+ *
+ * @author Piotr Olaszewski
+ */
+public class MarkdownDocumentReader implements DocumentReader {
+
+ /**
+ * The resource pointing to the Markdown document.
+ */
+ private final Resource markdownResource;
+
+ /**
+ * Configuration for the parsing process.
+ */
+ private final MarkdownDocumentReaderConfig config;
+
+ /**
+ * Markdown parser.
+ */
+ private final Parser parser;
+
+	public MarkdownDocumentReader(String markdownResource) { // resource location, default configuration
+		this(new DefaultResourceLoader().getResource(markdownResource), MarkdownDocumentReaderConfig.defaultConfig());
+	}
+
+	public MarkdownDocumentReader(String markdownResource, MarkdownDocumentReaderConfig config) {
+		this(new DefaultResourceLoader().getResource(markdownResource), config); // location string -> Resource
+	}
+
+	public MarkdownDocumentReader(Resource markdownResource, MarkdownDocumentReaderConfig config) {
+		this.markdownResource = java.util.Objects.requireNonNull(markdownResource, "markdownResource must not be null");
+		this.config = java.util.Objects.requireNonNull(config, "config must not be null"); // fail here, not in get()
+		this.parser = Parser.builder().build(); // parser built without any extensions
+	}
+
+	/**
+	 * Extracts and returns a list of documents from the resource, always decoding it as UTF-8.
+	 * @return List of extracted {@link Document}
+	 */
+	@Override
+	public List<Document> get() {
+		try (var input = markdownResource.getInputStream()) {
+			Node node = parser.parseReader(new InputStreamReader(input, java.nio.charset.StandardCharsets.UTF_8));
+
+			DocumentVisitor documentVisitor = new DocumentVisitor(config);
+			node.accept(documentVisitor);
+
+			return documentVisitor.getDocuments();
+		}
+		catch (IOException e) {
+			throw new RuntimeException(e); // rethrown unchecked: get() declares no checked exceptions
+		}
+	}
+
+	/**
+	 * Visitor that gathers text from the Markdown tree and cuts it into {@link Document}s.
+	 */
+	static class DocumentVisitor extends AbstractVisitor {
+
+		private final List<Document> documents = new ArrayList<>(); // finished documents, in visiting order
+
+		private final List<String> currentParagraphs = new ArrayList<>(); // text pieces of the document in progress
+
+		private final MarkdownDocumentReaderConfig config;
+
+		private Document.Builder currentDocumentBuilder; // collects metadata for the document in progress
+
+		public DocumentVisitor(MarkdownDocumentReaderConfig config) {
+			this.config = config;
+		}
+
+		@Override
+		public void visit(org.commonmark.node.Document document) {
+			currentDocumentBuilder = Document.builder();
+			super.visit(document);
+		}
+
+		@Override
+		public void visit(Heading heading) {
+			buildAndFlush(); // a heading always closes the document in progress
+			super.visit(heading);
+		}
+
+		@Override
+		public void visit(ThematicBreak thematicBreak) {
+			if (config.horizontalRuleCreateDocument) {
+				buildAndFlush();
+			}
+			super.visit(thematicBreak);
+		}
+
+		@Override
+		public void visit(SoftLineBreak softLineBreak) {
+			translateLineBreakToSpace();
+			super.visit(softLineBreak);
+		}
+
+		@Override
+		public void visit(HardLineBreak hardLineBreak) {
+			translateLineBreakToSpace();
+			super.visit(hardLineBreak);
+		}
+
+		@Override
+		public void visit(ListItem listItem) {
+			translateLineBreakToSpace(); // separates this item's text from whatever was collected before it
+			super.visit(listItem);
+		}
+
+		@Override
+		public void visit(BlockQuote blockQuote) {
+			if (!config.includeBlockquote) {
+				buildAndFlush(); // the quote starts a document of its own
+			}
+
+			translateLineBreakToSpace();
+			currentDocumentBuilder.withMetadata("category", "blockquote");
+			super.visit(blockQuote);
+		}
+
+		@Override
+		public void visit(Code code) {
+			currentParagraphs.add(code.getLiteral());
+			currentDocumentBuilder.withMetadata("category", "code_inline");
+			super.visit(code);
+		}
+
+		@Override
+		public void visit(FencedCodeBlock fencedCodeBlock) {
+			if (!config.includeCodeBlock) {
+				buildAndFlush(); // the code block starts a document of its own
+			}
+
+			translateLineBreakToSpace();
+			currentParagraphs.add(fencedCodeBlock.getLiteral());
+			currentDocumentBuilder.withMetadata("category", "code_block");
+			currentDocumentBuilder.withMetadata("lang", fencedCodeBlock.getInfo());
+
+			buildAndFlush(); // a code block always ends the document that contains it
+
+			super.visit(fencedCodeBlock);
+		}
+
+		@Override
+		public void visit(Text text) {
+			if (text.getParent() instanceof Heading heading) {
+				currentDocumentBuilder.withMetadata("category", "header_%d".formatted(heading.getLevel()))
+					.withMetadata("title", text.getLiteral());
+			}
+			else {
+				currentParagraphs.add(text.getLiteral());
+			}
+
+			super.visit(text);
+		}
+
+		public List<Document> getDocuments() {
+			buildAndFlush(); // emit whatever is still pending at the end of the input
+
+			return documents;
+		}
+
+		private void buildAndFlush() {
+			if (!currentParagraphs.isEmpty()) {
+				String content = String.join("", currentParagraphs);
+
+				Document.Builder builder = currentDocumentBuilder.withContent(content);
+
+				config.additionalMetadata.forEach(builder::withMetadata);
+
+				Document document = builder.build();
+
+				documents.add(document);
+
+				currentParagraphs.clear();
+			}
+			currentDocumentBuilder = Document.builder(); // reset even when nothing was emitted; pending metadata is dropped
+		}
+
+		private void translateLineBreakToSpace() {
+			if (!currentParagraphs.isEmpty()) { // never start a document with a separator
+				currentParagraphs.add(" ");
+			}
+		}
+
+	}
+
+}
diff --git a/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java
new file mode 100644
index 000000000..d5ad3ec58
--- /dev/null
+++ b/document-readers/markdown-reader/src/main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java
@@ -0,0 +1,123 @@
+package org.springframework.ai.reader.markdown.config;
+
+import org.springframework.ai.document.Document;
+import org.springframework.ai.reader.markdown.MarkdownDocumentReader;
+import org.springframework.util.Assert;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Common configuration for the {@link MarkdownDocumentReader}.
+ *
+ * @author Piotr Olaszewski
+ */
+public class MarkdownDocumentReaderConfig {
+
+	public final boolean horizontalRuleCreateDocument; // true: horizontal rules split the text into documents
+
+	public final boolean includeCodeBlock; // true: code blocks stay in the surrounding document
+
+	public final boolean includeBlockquote; // true: blockquotes stay in the surrounding document
+
+	public final Map<String, Object> additionalMetadata; // attached to every built Document
+
+	public MarkdownDocumentReaderConfig(Builder builder) {
+		horizontalRuleCreateDocument = builder.horizontalRuleCreateDocument;
+		includeCodeBlock = builder.includeCodeBlock;
+		includeBlockquote = builder.includeBlockquote;
+		additionalMetadata = new HashMap<>(builder.additionalMetadata); // snapshot: later builder changes must not leak in
+	}
+
+	/**
+	 * @return the default configuration
+	 */
+	public static MarkdownDocumentReaderConfig defaultConfig() {
+		return builder().build();
+	}
+
+	public static Builder builder() {
+		return new Builder();
+	}
+
+	public static class Builder {
+
+		private boolean horizontalRuleCreateDocument = false;
+
+		private boolean includeCodeBlock = false;
+
+		private boolean includeBlockquote = false;
+
+		private Map<String, Object> additionalMetadata = new HashMap<>();
+
+		private Builder() {
+		}
+
+		/**
+		 * Text divided by horizontal lines will create new {@link Document}s. The default
+		 * is {@code false}, meaning text separated by horizontal lines won't create a new
+		 * document.
+		 * @param horizontalRuleCreateDocument flag to determine whether new documents are
+		 * created from text divided by horizontal line
+		 * @return this builder
+		 */
+		public Builder withHorizontalRuleCreateDocument(boolean horizontalRuleCreateDocument) {
+			this.horizontalRuleCreateDocument = horizontalRuleCreateDocument;
+			return this;
+		}
+
+		/**
+		 * Whether to include code blocks in the surrounding {@link Document}s. The default is
+		 * {@code false}, which means each code block is put into a separate document.
+		 * @param includeCodeBlock flag to include code block into paragraph document or
+		 * create new with code only
+		 * @return this builder
+		 */
+		public Builder withIncludeCodeBlock(boolean includeCodeBlock) {
+			this.includeCodeBlock = includeCodeBlock;
+			return this;
+		}
+
+		/**
+		 * Whether to include blockquotes in the surrounding {@link Document}s. The default is
+		 * {@code false}, which means each blockquote is put into a separate document.
+		 * @param includeBlockquote flag to include blockquotes into paragraph document or
+		 * create new with blockquote only
+		 * @return this builder
+		 */
+		public Builder withIncludeBlockquote(boolean includeBlockquote) {
+			this.includeBlockquote = includeBlockquote;
+			return this;
+		}
+
+		/**
+		 * Adds a single metadata entry (non-null key and value) that is attached to all built {@link Document}s.
+		 * @return this builder
+		 */
+		public Builder withAdditionalMetadata(String key, Object value) {
+			Assert.notNull(key, "key must not be null");
+			Assert.notNull(value, "value must not be null");
+			this.additionalMetadata.put(key, value);
+			return this;
+		}
+
+		/**
+		 * Replaces the metadata attached to all built {@link Document}s with a copy of the given map.
+		 * @return this builder
+		 */
+		public Builder withAdditionalMetadata(Map<String, Object> additionalMetadata) {
+			Assert.notNull(additionalMetadata, "additionalMetadata must not be null");
+			this.additionalMetadata = new HashMap<>(additionalMetadata); // copy: caller's map may be immutable or reused
+			return this;
+		}
+
+		/**
+		 * @return a new configuration holding a snapshot of this builder's current values
+		 */
+		public MarkdownDocumentReaderConfig build() {
+			return new MarkdownDocumentReaderConfig(this);
+		}
+
+	}
+
+}
diff --git a/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java
new file mode 100644
index 000000000..739dbbd70
--- /dev/null
+++ b/document-readers/markdown-reader/src/test/java/org/springframework/ai/reader/markdown/MarkdownDocumentReaderTest.java
@@ -0,0 +1,230 @@
+package org.springframework.ai.reader.markdown;
+
+import org.junit.jupiter.api.Test;
+import org.springframework.ai.document.Document;
+import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
+
+import java.util.List;
+import java.util.Map;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.groups.Tuple.tuple;
+
+/**
+ * @author Piotr Olaszewski
+ */
+class MarkdownDocumentReaderTest {
+
+ @Test
+ void testOnlyHeadersWithParagraphs() {
+ MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/only-headers.md");
+
+ List documents = reader.get();
+
+ assertThat(documents).hasSize(4)
+ .extracting(Document::getMetadata, Document::getContent)
+ .containsOnly(tuple(Map.of("category", "header_1", "title", "Header 1a"),
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue."),
+ tuple(Map.of("category", "header_1", "title", "Header 1b"),
+ "Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh."),
+ tuple(Map.of("category", "header_2", "title", "Header 2b"),
+ "Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero."),
+ tuple(Map.of("category", "header_2", "title", "Header 2c"),
+ "Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit."));
+ }
+
+ @Test
+ void testWithFormatting() {
+ MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/with-formatting.md");
+
+ List documents = reader.get();
+
+ assertThat(documents).hasSize(2)
+ .extracting(Document::getMetadata, Document::getContent)
+ .containsOnly(tuple(Map.of("category", "header_1", "title", "This is a fancy header name"),
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim."),
+ tuple(Map.of("category", "header_3", "title", "Header 3"),
+ "Aenean eu leo eu nibh tristique posuere quis quis massa."));
+ }
+
+ @Test
+ void testDocumentDividedViaHorizontalRules() {
+ MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder()
+ .withHorizontalRuleCreateDocument(true)
+ .build();
+
+ MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/horizontal-rules.md", config);
+
+ List documents = reader.get();
+
+ assertThat(documents).hasSize(7)
+ .extracting(Document::getMetadata, Document::getContent)
+ .containsOnly(tuple(Map.of(),
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida."),
+ tuple(Map.of(),
+ "Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim."),
+ tuple(Map.of(),
+ "Nullam nisi dui, egestas nec sem nec, interdum lobortis enim. Pellentesque odio orci, faucibus eu luctus nec, venenatis et magna."),
+ tuple(Map.of(),
+ "Vestibulum nec eros non felis fermentum posuere eget ac risus. Curabitur et fringilla massa. Cras facilisis nec nisl sit amet sagittis."),
+ tuple(Map.of(),
+ "Aenean eu leo eu nibh tristique posuere quis quis massa. Nullam lacinia luctus sem ut vehicula."),
+ tuple(Map.of(),
+ "Aenean quis vulputate mi. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Nam tincidunt nunc a tortor tincidunt, nec lobortis diam rhoncus."),
+ tuple(Map.of(), "Nulla facilisi. Phasellus eget tellus sed nibh ornare interdum eu eu mi."));
+ }
+
+ @Test
+ void testDocumentNotDividedViaHorizontalRulesWhenIsDisabled() {
+ MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder()
+ .withHorizontalRuleCreateDocument(false)
+ .build();
+
+ MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/horizontal-rules.md", config);
+
+ List documents = reader.get();
+
+ assertThat(documents).hasSize(1);
+
+ Document documentsFirst = documents.get(0);
+ assertThat(documentsFirst.getMetadata()).isEmpty();
+ assertThat(documentsFirst.getContent()).startsWith("Lorem ipsum dolor sit amet, consectetur adipiscing elit")
+ .endsWith("Phasellus eget tellus sed nibh ornare interdum eu eu mi.");
+ }
+
+ @Test
+ void testSimpleMarkdownDocumentWithHardAndSoftLineBreaks() {
+ MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/simple.md");
+
+ List documents = reader.get();
+
+ assertThat(documents).hasSize(1);
+
+ Document documentsFirst = documents.get(0);
+ assertThat(documentsFirst.getMetadata()).isEmpty();
+ assertThat(documentsFirst.getContent()).isEqualTo(
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim.Nullam nisi dui, egestas nec sem nec, interdum lobortis enim. Pellentesque odio orci, faucibus eu luctus nec, venenatis et magna. Vestibulum nec eros non felis fermentum posuere eget ac risus.Aenean eu leo eu nibh tristique posuere quis quis massa. Nullam lacinia luctus sem ut vehicula.");
+ }
+
+ @Test
+ void testCode() {
+ MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder()
+ .withHorizontalRuleCreateDocument(true)
+ .build();
+
+ MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/code.md", config);
+
+ List documents = reader.get();
+
+ assertThat(documents).satisfiesExactly(document -> {
+ assertThat(document.getMetadata()).isEqualTo(Map.of());
+ assertThat(document.getContent()).isEqualTo("This is a Java sample application:");
+ }, document -> {
+ assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "java", "category", "code_block"));
+ assertThat(document.getContent()).startsWith("package com.example.demo;")
+ .contains("SpringApplication.run(DemoApplication.class, args);");
+ }, document -> {
+ assertThat(document.getMetadata()).isEqualTo(Map.of("category", "code_inline"));
+ assertThat(document.getContent()).isEqualTo(
+ "Markdown also provides the possibility to use inline code formatting throughout the entire sentence.");
+ }, document -> {
+ assertThat(document.getMetadata()).isEqualTo(Map.of());
+ assertThat(document.getContent())
+ .isEqualTo("Another possibility is to set block code without specific highlighting:");
+ }, document -> {
+ assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "", "category", "code_block"));
+ assertThat(document.getContent()).isEqualTo("./mvnw spring-javaformat:apply\n");
+ });
+ }
+
+ @Test
+ void testCodeWhenCodeBlockShouldNotBeSeparatedDocument() {
+ MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder()
+ .withHorizontalRuleCreateDocument(true)
+ .withIncludeCodeBlock(true)
+ .build();
+
+ MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/code.md", config);
+
+ List documents = reader.get();
+
+ assertThat(documents).satisfiesExactly(document -> {
+ assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "java", "category", "code_block"));
+ assertThat(document.getContent()).startsWith("This is a Java sample application: package com.example.demo")
+ .contains("SpringApplication.run(DemoApplication.class, args);");
+ }, document -> {
+ assertThat(document.getMetadata()).isEqualTo(Map.of("category", "code_inline"));
+ assertThat(document.getContent()).isEqualTo(
+ "Markdown also provides the possibility to use inline code formatting throughout the entire sentence.");
+ }, document -> {
+ assertThat(document.getMetadata()).isEqualTo(Map.of("lang", "", "category", "code_block"));
+ assertThat(document.getContent()).isEqualTo(
+ "Another possibility is to set block code without specific highlighting: ./mvnw spring-javaformat:apply\n");
+ });
+ }
+
+ @Test
+ void testBlockquote() {
+ MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/blockquote.md");
+
+ List documents = reader.get();
+
+ assertThat(documents).hasSize(2)
+ .extracting(Document::getMetadata, Document::getContent)
+ .containsOnly(tuple(Map.of(),
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue."),
+ tuple(Map.of("category", "blockquote"),
+ "Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit."));
+ }
+
+ @Test
+ void testBlockquoteWhenBlockquoteShouldNotBeSeparatedDocument() {
+ MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder()
+ .withIncludeBlockquote(true)
+ .build();
+
+ MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/blockquote.md", config);
+
+ List documents = reader.get();
+
+ assertThat(documents).hasSize(1);
+
+ Document documentsFirst = documents.get(0);
+ assertThat(documentsFirst.getMetadata()).isEqualTo(Map.of("category", "blockquote"));
+ assertThat(documentsFirst.getContent()).isEqualTo(
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.");
+ }
+
+ @Test
+ void testLists() {
+ MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/lists.md");
+
+ List documents = reader.get();
+
+ assertThat(documents).hasSize(2)
+ .extracting(Document::getMetadata, Document::getContent)
+ .containsOnly(tuple(Map.of("category", "header_2", "title", "Ordered list"),
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue. Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien odio. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor."),
+ tuple(Map.of("category", "header_2", "title", "Unordered list"),
+ "Aenean eu leo eu nibh tristique posuere quis quis massa. Aenean imperdiet libero dui, nec malesuada dui maximus vel. Vestibulum sed dui condimentum, cursus libero in, dapibus tortor. Etiam facilisis enim in egestas dictum."));
+ }
+
+ @Test
+ void testWithAdditionalMetadata() {
+ MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder()
+ .withAdditionalMetadata("service", "some-service-name")
+ .withAdditionalMetadata("env", "prod")
+ .build();
+
+ MarkdownDocumentReader reader = new MarkdownDocumentReader("classpath:/simple.md", config);
+
+ List documents = reader.get();
+
+ assertThat(documents).hasSize(1);
+
+ Document documentsFirst = documents.get(0);
+ assertThat(documentsFirst.getMetadata()).isEqualTo(Map.of("service", "some-service-name", "env", "prod"));
+ assertThat(documentsFirst.getContent()).startsWith("Lorem ipsum dolor sit amet, consectetur adipiscing elit.");
+ }
+
+}
diff --git a/document-readers/markdown-reader/src/test/resources/blockquote.md b/document-readers/markdown-reader/src/test/resources/blockquote.md
new file mode 100644
index 000000000..d92ac44f6
--- /dev/null
+++ b/document-readers/markdown-reader/src/test/resources/blockquote.md
@@ -0,0 +1,8 @@
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed
+nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.
+
+> Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget
+> sapien odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a
+> porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum
+> suscipit.
+
diff --git a/document-readers/markdown-reader/src/test/resources/code.md b/document-readers/markdown-reader/src/test/resources/code.md
new file mode 100644
index 000000000..31d7c7b03
--- /dev/null
+++ b/document-readers/markdown-reader/src/test/resources/code.md
@@ -0,0 +1,25 @@
+This is a Java sample application:
+
+```java
+package com.example.demo;
+
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+
+@SpringBootApplication
+public class DemoApplication {
+ public static void main(String[] args) {
+ SpringApplication.run(DemoApplication.class, args);
+ }
+}
+```
+
+Markdown also provides the possibility to `use inline code formatting throughout` the entire sentence.
+
+---
+
+Another possibility is to set block code without specific highlighting:
+
+```
+./mvnw spring-javaformat:apply
+```
diff --git a/document-readers/markdown-reader/src/test/resources/horizontal-rules.md b/document-readers/markdown-reader/src/test/resources/horizontal-rules.md
new file mode 100644
index 000000000..f7affefc1
--- /dev/null
+++ b/document-readers/markdown-reader/src/test/resources/horizontal-rules.md
@@ -0,0 +1,27 @@
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida.
+
+---
+
+Cras accumsan tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu
+elementum dignissim.
+
+***
+Nullam nisi dui, egestas nec sem nec, interdum lobortis enim. Pellentesque odio orci, faucibus eu luctus nec, venenatis
+et magna.
+
+* * *
+
+Vestibulum nec eros non felis fermentum posuere eget ac risus. Curabitur et fringilla massa. Cras facilisis nec nisl sit
+amet sagittis.
+
+*****
+
+Aenean eu leo eu nibh tristique posuere quis quis massa. Nullam lacinia luctus sem ut vehicula.
+
+---------------------------------------
+
+Aenean quis vulputate mi. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Nam tincidunt nunc a tortor tincidunt, nec lobortis diam rhoncus.
+
+- - -
+
+Nulla facilisi. Phasellus eget tellus sed nibh ornare interdum eu eu mi.
diff --git a/document-readers/markdown-reader/src/test/resources/lists.md b/document-readers/markdown-reader/src/test/resources/lists.md
new file mode 100644
index 000000000..f82e7e345
--- /dev/null
+++ b/document-readers/markdown-reader/src/test/resources/lists.md
@@ -0,0 +1,17 @@
+## Ordered list
+
+1. Lorem ipsum dolor sit *amet*, consectetur adipiscing elit. **Curabitur** diam eros, laoreet sit _amet_ cursus vitae,
+ varius sed nisi.
+2. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.
+3. Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget
+ sapien odio.
+ 1. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum
+ suscipit.
+ 2. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero. Ut rhoncus nec justo a porttitor.
+
+## Unordered list
+
+* Aenean eu leo eu nibh tristique posuere quis quis massa.
+* Aenean imperdiet libero dui, nec malesuada dui maximus vel. Vestibulum sed dui condimentum, cursus libero in, dapibus
+ tortor.
+ * Etiam facilisis enim in egestas dictum.
diff --git a/document-readers/markdown-reader/src/test/resources/only-headers.md b/document-readers/markdown-reader/src/test/resources/only-headers.md
new file mode 100644
index 000000000..81c770e87
--- /dev/null
+++ b/document-readers/markdown-reader/src/test/resources/only-headers.md
@@ -0,0 +1,20 @@
+# Header 1a
+
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur diam eros, laoreet sit amet cursus vitae, varius sed
+nisi. Cras sit amet quam quis velit commodo porta consectetur id nisi. Phasellus tincidunt pulvinar augue.
+
+# Header 1b
+
+Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae; Etiam lobortis risus libero, sed
+sollicitudin risus cursus in. Morbi enim metus, ornare vel lacinia eget, venenatis vel nibh.
+
+## Header 2b
+
+Proin vel laoreet leo, sed luctus augue. Sed et ligula commodo, commodo lacus at, consequat turpis. Maecenas eget sapien
+odio. Maecenas urna lectus, pellentesque in accumsan aliquam, congue eu libero.
+
+# Header 1c
+
+## Header 2c
+
+Ut rhoncus nec justo a porttitor. Pellentesque auctor pharetra eros, viverra sodales lorem aliquet id. Curabitur semper nisi vel sem interdum suscipit.
diff --git a/document-readers/markdown-reader/src/test/resources/simple.md b/document-readers/markdown-reader/src/test/resources/simple.md
new file mode 100644
index 000000000..3275c89b8
--- /dev/null
+++ b/document-readers/markdown-reader/src/test/resources/simple.md
@@ -0,0 +1,8 @@
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec tincidunt velit non bibendum gravida. Cras accumsan
+tincidunt ornare. Donec hendrerit consequat tellus blandit accumsan. Aenean aliquam metus at arcu elementum dignissim.
+
+Nullam nisi dui, egestas nec sem nec, interdum lobortis enim. Pellentesque odio orci, faucibus eu luctus nec, venenatis et magna. Vestibulum nec eros non felis fermentum posuere eget ac risus.
+
+Aenean eu leo eu nibh tristique posuere quis quis massa.\
+Nullam lacinia luctus sem ut vehicula.
+
diff --git a/document-readers/markdown-reader/src/test/resources/with-formatting.md b/document-readers/markdown-reader/src/test/resources/with-formatting.md
new file mode 100644
index 000000000..963743ece
--- /dev/null
+++ b/document-readers/markdown-reader/src/test/resources/with-formatting.md
@@ -0,0 +1,9 @@
+# This is a fancy header name
+
+Lorem ipsum dolor sit amet, **consectetur adipiscing elit**. Donec tincidunt velit non bibendum gravida. Cras accumsan
+tincidunt ornare. Donec hendrerit consequat tellus *blandit* accumsan. Aenean aliquam metus at ***arcu elementum***
+dignissim.
+
+### Header 3
+
+Aenean eu leo eu nibh tristique _posuere quis quis massa_.
diff --git a/pom.xml b/pom.xml
index fad014fe9..331e30a1e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -23,6 +23,7 @@
spring-ai-spring-boot-testcontainers
spring-ai-spring-cloud-bindings
+ document-readers/markdown-reader
document-readers/pdf-reader
document-readers/tika-reader
@@ -186,6 +187,7 @@
1.9.1
0.5.0
2.10.1
+ 0.22.0
5.3.1
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/etl-pipeline.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/etl-pipeline.adoc
index 0235c9fcd..5a66a12b9 100644
--- a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/etl-pipeline.adoc
+++ b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/etl-pipeline.adoc
@@ -46,22 +46,7 @@ Alternatively, you can use method names that are more naturally expressive for t
vectorStore.write(tokenTextSplitter.split(pdfReader.read()));
----
-
-
-== Getting Started
-
-To begin creating a Spring AI RAG application, follow these steps:
-
-. Download the latest https://github.com/spring-projects/spring-cli/releases[Spring CLI Release]
-and follow the https://docs.spring.io/spring-cli/reference/installation.html#_setting_up_your_path_or_alias[installation instructions].
-. To create a simple OpenAI-based application, use the command:
-+
-```shell
-spring boot new --from ai-rag --name myrag
-```
-. Consult the generated `README.md` file for guidance on obtaining an OpenAI API Key and running your first AI RAG application.
-
-== ETL Interfaces and Implementations
+== ETL Interfaces
The ETL pipeline is composed of the following interfaces and implementations.
Detailed ETL class diagram is shown in the <<etl-class-diagram>> section.
@@ -79,15 +64,57 @@ public interface DocumentReader extends Supplier> {
}
----
-==== JsonReader
-The `JsonReader` Parses documents in JSON format.
-Example:
+=== DocumentTransformer
+
+Transforms a batch of documents as part of the processing workflow.
+
+[source,java]
+----
+public interface DocumentTransformer extends Function<List<Document>, List<Document>> {
+
+ default List<Document> transform(List<Document> transform) {
+ return apply(transform);
+ }
+}
+----
+
+
+=== DocumentWriter
+
+Manages the final stage of the ETL process, preparing documents for storage.
+
+```java
+public interface DocumentWriter extends Consumer<List<Document>> {
+
+ default void write(List<Document> documents) {
+ accept(documents);
+ }
+}
+```
+
+
+[[etl-class-diagram]]
+=== ETL Class Diagram
+
+The following class diagram illustrates the ETL interfaces and implementations.
+
+// image::etl-class-diagram.jpg[align="center", width="800px"]
+image::etl-class-diagram.jpg[align="center"]
+
+== DocumentReaders
+
+=== JSON
+
+The `JsonReader` processes JSON documents, converting them into a list of `Document` objects.
+
+
+==== Example
[source,java]
----
@Component
-class MyAiAppComponent {
+class MyJsonReader {
private final Resource resource;
@@ -96,16 +123,69 @@ class MyAiAppComponent {
}
List<Document> loadJsonAsDocuments() {
- JsonReader jsonReader = new JsonReader(resource, "description");
- return jsonReader.read();
+ JsonReader jsonReader = new JsonReader(resource, "description", "content");
+ return jsonReader.get();
}
}
----
-==== TextReader
-The `TextReader` processes plain text documents.
+==== Constructor Options
-Example:
+The `JsonReader` provides several constructor options:
+
+1. `JsonReader(Resource resource)`
+2. `JsonReader(Resource resource, String... jsonKeysToUse)`
+3. `JsonReader(Resource resource, JsonMetadataGenerator jsonMetadataGenerator, String... jsonKeysToUse)`
+
+==== Parameters
+
+* `resource`: A Spring `Resource` object pointing to the JSON file.
+* `jsonKeysToUse`: An array of keys from the JSON that should be used as the text content in the resulting `Document` objects.
+* `jsonMetadataGenerator`: An optional `JsonMetadataGenerator` to create metadata for each `Document`.
+
+==== Behavior
+
+The `JsonReader` processes JSON content as follows:
+
+* It can handle both JSON arrays and single JSON objects.
+* For each JSON object (either in an array or a single object):
+** It extracts the content based on the specified `jsonKeysToUse`.
+** If no keys are specified, it uses the entire JSON object as content.
+** It generates metadata using the provided `JsonMetadataGenerator` (or an empty one if not provided).
+** It creates a `Document` object with the extracted content and metadata.
+
+==== Example JSON Structure
+
+[source,json]
+----
+[
+ {
+ "id": 1,
+ "brand": "Trek",
+ "description": "A high-performance mountain bike for trail riding."
+ },
+ {
+ "id": 2,
+ "brand": "Cannondale",
+ "description": "An aerodynamic road bike for racing enthusiasts."
+ }
+]
+----
+
+In this example, if the `JsonReader` is configured with `"description"` as the `jsonKeysToUse`, it will create `Document` objects where the content is the value of the "description" field for each bike in the array.
+
+==== Notes
+
+* The `JsonReader` uses Jackson for JSON parsing.
+* It can handle large JSON files efficiently by using streaming for arrays.
+* If multiple keys are specified in `jsonKeysToUse`, the content will be a concatenation of the values for those keys.
+* The reader is flexible and can be adapted to various JSON structures by customizing the `jsonKeysToUse` and `JsonMetadataGenerator`.
+
+
+=== Text
+The `TextReader` processes plain text documents, converting them into a list of `Document` objects.
+
+==== Example
[source,java]
----
@@ -126,10 +206,153 @@ class MyTextReader {
}
----
-==== PagePdfDocumentReader
+==== Constructor Options
+
+The `TextReader` provides two constructor options:
+
+1. `TextReader(String resourceUrl)`
+2. `TextReader(Resource resource)`
+
+==== Parameters
+
+* `resourceUrl`: A string representing the URL of the resource to be read.
+* `resource`: A Spring `Resource` object pointing to the text file.
+
+==== Configuration
+
+* `setCharset(Charset charset)`: Sets the character set used for reading the text file. Default is UTF-8.
+* `getCustomMetadata()`: Returns a mutable map where you can add custom metadata for the documents.
+
+==== Behavior
+
+The `TextReader` processes text content as follows:
+
+* It reads the entire content of the text file into a single `Document` object.
+* The content of the file becomes the content of the `Document`.
+* Metadata is automatically added to the `Document`:
+** `charset`: The character set used to read the file (default: "UTF-8").
+** `source`: The filename of the source text file.
+* Any custom metadata added via `getCustomMetadata()` is included in the `Document`.
+
+
+==== Notes
+
+* The `TextReader` reads the entire file content into memory, so it may not be suitable for very large files.
+* If you need to split the text into smaller chunks, you can use a text splitter like `TokenTextSplitter` after reading the document:
+
+[source,java]
+----
+List<Document> documents = textReader.get();
+List<Document> splitDocuments = new TokenTextSplitter().apply(documents);
+----
+
+* The reader uses Spring's `Resource` abstraction, allowing it to read from various sources (classpath, file system, URL, etc.).
+* Custom metadata can be added to all documents created by the reader using the `getCustomMetadata()` method.
+
+
+=== Markdown
+
+The `MarkdownDocumentReader` processes Markdown documents, converting them into a list of `Document` objects.
+
+==== Example
+
+[source,java]
+----
+@Component
+class MyMarkdownReader {
+
+ private final Resource resource;
+
+ MyMarkdownReader(@Value("classpath:code.md") Resource resource) {
+ this.resource = resource;
+ }
+
+ List<Document> loadMarkdown() {
+ MarkdownDocumentReaderConfig config = MarkdownDocumentReaderConfig.builder()
+ .withHorizontalRuleCreateDocument(true)
+ .withIncludeCodeBlock(false)
+ .withIncludeBlockquote(false)
+ .withAdditionalMetadata("filename", "code.md")
+ .build();
+
+ MarkdownDocumentReader reader = new MarkdownDocumentReader(resource, config);
+ return reader.get();
+ }
+}
+----
+
+The `MarkdownDocumentReaderConfig` allows you to customize the behavior of the MarkdownDocumentReader:
+
+* `horizontalRuleCreateDocument`: When set to `true`, horizontal rules in the Markdown will create new `Document` objects.
+* `includeCodeBlock`: When set to `true`, code blocks will be included in the same `Document` as the surrounding text. When `false`, code blocks create separate `Document` objects.
+* `includeBlockquote`: When set to `true`, blockquotes will be included in the same `Document` as the surrounding text. When `false`, blockquotes create separate `Document` objects.
+* `additionalMetadata`: Allows you to add custom metadata to all created `Document` objects.
+
+==== Sample Document: code.md
+
+[source,markdown]
+----
+This is a Java sample application:
+
+```java
+package com.example.demo;
+
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+
+@SpringBootApplication
+public class DemoApplication {
+ public static void main(String[] args) {
+ SpringApplication.run(DemoApplication.class, args);
+ }
+}
+```
+
+Markdown also provides the possibility to `use inline code formatting throughout` the entire sentence.
+
+---
+
+Another possibility is to set block code without specific highlighting:
+
+```
+./mvnw spring-javaformat:apply
+```
+----
+
+Behavior: The MarkdownDocumentReader processes the Markdown content and creates Document objects based on the configuration:
+
+* Headers become metadata in the Document objects.
+* Paragraphs become the content of Document objects.
+* Code blocks can be separated into their own Document objects or included with surrounding text.
+* Blockquotes can be separated into their own Document objects or included with surrounding text.
+* Horizontal rules can be used to split the content into separate Document objects.
+
+The reader preserves formatting like inline code, lists, and text styling within the content of the Document objects.
+
+
+=== PDF Page
The `PagePdfDocumentReader` uses the Apache PdfBox library to parse PDF documents.
-Example:
+Add the dependency to your project using Maven or Gradle.
+
+[source, xml]
+----
+<dependency>
+    <groupId>org.springframework.ai</groupId>
+    <artifactId>spring-ai-pdf-document-reader</artifactId>
+</dependency>
+----
+
+or to your Gradle `build.gradle` build file.
+
+[source,groovy]
+----
+dependencies {
+ implementation 'org.springframework.ai:spring-ai-pdf-document-reader'
+}
+----
+
+==== Example
[source,java]
----
@@ -154,12 +377,32 @@ public class MyPagePdfDocumentReader {
----
-
-==== ParagraphPdfDocumentReader
+=== PDF Paragraph
The `ParagraphPdfDocumentReader` uses the PDF catalog (e.g. TOC) information to split the input PDF into text paragraphs and output a single `Document` per paragraph.
NOTE: Not all PDF documents contain the PDF catalog.
-Example:
+==== Dependencies
+Add the dependency to your project using Maven or Gradle.
+
+[source, xml]
+----
+<dependency>
+    <groupId>org.springframework.ai</groupId>
+    <artifactId>spring-ai-pdf-document-reader</artifactId>
+</dependency>
+----
+
+or to your Gradle `build.gradle` build file.
+
+[source,groovy]
+----
+dependencies {
+ implementation 'org.springframework.ai:spring-ai-pdf-document-reader'
+}
+----
+
+
+==== Example
[source,java]
----
@@ -183,10 +426,29 @@ public class MyPagePdfDocumentReader {
----
-==== TikaDocumentReader
+=== Tika (DOCX, PPTX, HTML...)
The `TikaDocumentReader` uses Apache Tika to extract text from a variety of document formats, such as PDF, DOC/DOCX, PPT/PPTX, and HTML. For a comprehensive list of supported formats, refer to the https://tika.apache.org/2.9.0/formats.html[Tika documentation].
-Example:
+==== Dependencies
+
+Add the dependency to your project using Maven or Gradle.
+
+[source, xml]
+----
+<dependency>
+    <groupId>org.springframework.ai</groupId>
+    <artifactId>spring-ai-tika-document-reader</artifactId>
+</dependency>
+----
+
+or to your Gradle `build.gradle` build file.
+
+[source,groovy]
+----
+dependencies {
+ implementation 'org.springframework.ai:spring-ai-tika-document-reader'
+}
+----
+
+==== Example
[source,java]
----
@@ -207,60 +469,367 @@ class MyTikaDocumentReader {
}
----
-=== DocumentTransformer
+== Transformers
-Transforms a batch of documents as part of the processing workflow.
-
-[source,java]
-----
-public interface DocumentTransformer extends Function, List> {
-
- default List transform(List transform) {
- return apply(transform);
- }
-}
-----
-
-==== TextSplitter
+=== TextSplitter
The `TextSplitter` is an abstract base class that helps divide documents to fit the AI model's context window.
-==== TokenTextSplitter
-Splits documents while preserving token-level integrity.
+=== TokenTextSplitter
+The `TokenTextSplitter` is an implementation of `TextSplitter` that splits text into chunks based on token count, using the `CL100K_BASE` encoding.
-==== ContentFormatTransformer
+==== Usage
+
+[source,java]
+----
+@Component
+class MyTokenTextSplitter {
+
+ public List<Document> splitDocuments(List<Document> documents) {
+ TokenTextSplitter splitter = new TokenTextSplitter();
+ return splitter.apply(documents);
+ }
+
+ public List<Document> splitCustomized(List<Document> documents) {
+ TokenTextSplitter splitter = new TokenTextSplitter(1000, 400, 10, 5000, true);
+ return splitter.apply(documents);
+ }
+}
+----
+
+==== Constructor Options
+
+The `TokenTextSplitter` provides two constructor options:
+
+1. `TokenTextSplitter()`: Creates a splitter with default settings.
+2. `TokenTextSplitter(int defaultChunkSize, int minChunkSizeChars, int minChunkLengthToEmbed, int maxNumChunks, boolean keepSeparator)`
+
+
+==== Parameters
+
+* `defaultChunkSize`: The target size of each text chunk in tokens (default: 800).
+* `minChunkSizeChars`: The minimum size of each text chunk in characters (default: 350).
+* `minChunkLengthToEmbed`: The minimum length of a chunk to be included (default: 5).
+* `maxNumChunks`: The maximum number of chunks to generate from a text (default: 10000).
+* `keepSeparator`: Whether to keep separators (like newlines) in the chunks (default: true).
+
+==== Behavior
+
+The `TokenTextSplitter` processes text content as follows:
+
+1. It encodes the input text into tokens using the CL100K_BASE encoding.
+2. It splits the encoded text into chunks based on the `defaultChunkSize`.
+3. For each chunk:
+a. It decodes the chunk back into text.
+b. It attempts to find a suitable break point (period, question mark, exclamation mark, or newline) after the `minChunkSizeChars`.
+c. If a break point is found, it truncates the chunk at that point.
+d. It trims the chunk and optionally removes newline characters based on the `keepSeparator` setting.
+e. If the resulting chunk is longer than `minChunkLengthToEmbed`, it's added to the output.
+4. This process continues until all tokens are processed or `maxNumChunks` is reached.
+5. Any remaining text is added as a final chunk if it's longer than `minChunkLengthToEmbed`.
+
+==== Example
+
+[source,java]
+----
+Document doc1 = new Document("This is a long piece of text that needs to be split into smaller chunks for processing.",
+ Map.of("source", "example.txt"));
+Document doc2 = new Document("Another document with content that will be split based on token count.",
+ Map.of("source", "example2.txt"));
+
+TokenTextSplitter splitter = new TokenTextSplitter();
+List<Document> splitDocuments = splitter.apply(List.of(doc1, doc2));
+
+for (Document doc : splitDocuments) {
+ System.out.println("Chunk: " + doc.getContent());
+ System.out.println("Metadata: " + doc.getMetadata());
+}
+----
+
+
+==== Notes
+
+* The `TokenTextSplitter` uses the CL100K_BASE encoding from the `jtokkit` library, which is compatible with newer OpenAI models.
+* The splitter attempts to create semantically meaningful chunks by breaking at sentence boundaries where possible.
+* Metadata from the original documents is preserved and copied to all chunks derived from that document.
+* The content formatter (if set) from the original document is also copied to the derived chunks if `copyContentFormatter` is set to `true` (default behavior).
+* This splitter is particularly useful for preparing text for large language models that have token limits, ensuring that each chunk is within the model's processing capacity.
+=== ContentFormatTransformer
Ensures uniform content formats across all documents.
-==== KeywordMetadataEnricher
-Augments documents with essential keyword metadata.
+=== KeywordMetadataEnricher
+The `KeywordMetadataEnricher` is a `DocumentTransformer` that uses a generative AI model to extract keywords from document content and add them as metadata.
-==== SummaryMetadataEnricher
-Enriches documents with summarization metadata for enhanced retrieval.
+==== Usage
-=== DocumentWriter
+[source,java]
+----
+@Component
+class MyKeywordEnricher {
-Manages the final stage of the ETL process, preparing documents for storage.
+ private final ChatModel chatModel;
-```java
-public interface DocumentWriter extends Consumer> {
+ MyKeywordEnricher(ChatModel chatModel) {
+ this.chatModel = chatModel;
+ }
- default void write(List documents) {
- accept(documents);
- }
+ List<Document> enrichDocuments(List<Document> documents) {
+ KeywordMetadataEnricher enricher = new KeywordMetadataEnricher(chatModel, 5);
+ return enricher.apply(documents);
+ }
}
-```
-==== FileDocumentWriter
-Persist documents to a file .
+----
-==== VectorStore
+==== Constructor
+
+The `KeywordMetadataEnricher` constructor takes two parameters:
+
+1. `ChatModel chatModel`: The AI model used for generating keywords.
+2. `int keywordCount`: The number of keywords to extract for each document.
+
+==== Behavior
+
+The `KeywordMetadataEnricher` processes documents as follows:
+
+1. For each input document, it creates a prompt using the document's content.
+2. It sends this prompt to the provided `ChatModel` to generate keywords.
+3. The generated keywords are added to the document's metadata under the key "excerpt_keywords".
+4. The enriched documents are returned.
+
+
+==== Customization
+
+The keyword extraction prompt can be customized by modifying the `KEYWORDS_TEMPLATE` constant in the class. The default template is:
+
+[source,java]
+----
+\{context_str}. Give %s unique keywords for this document. Format as comma separated. Keywords:
+----
+
+Where `+{context_str}+` is replaced with the document content, and `%s` is replaced with the specified keyword count.
+
+==== Example
+
+[source,java]
+----
+ChatModel chatModel = // initialize your chat model
+KeywordMetadataEnricher enricher = new KeywordMetadataEnricher(chatModel, 5);
+
+Document doc = new Document("This is a document about artificial intelligence and its applications in modern technology.");
+
+List<Document> enrichedDocs = enricher.apply(List.of(doc));
+
+Document enrichedDoc = enrichedDocs.get(0);
+String keywords = (String) enrichedDoc.getMetadata().get("excerpt_keywords");
+System.out.println("Extracted keywords: " + keywords);
+----
+
+==== Notes
+
+* The `KeywordMetadataEnricher` requires a functioning `ChatModel` to generate keywords.
+* The keyword count must be 1 or greater.
+* The enricher adds the "excerpt_keywords" metadata field to each processed document.
+* The generated keywords are returned as a comma-separated string.
+* This enricher is particularly useful for improving document searchability and for generating tags or categories for documents.
+
+=== SummaryMetadataEnricher
+The `SummaryMetadataEnricher` is a `DocumentTransformer` that uses a generative AI model to create summaries for documents and add them as metadata. It can generate summaries for the current document, as well as adjacent documents (previous and next).
+
+==== Usage
+
+[source,java]
+----
+@Configuration
+class EnricherConfig {
+
+ @Bean
+ public SummaryMetadataEnricher summaryMetadata(OpenAiChatModel aiClient) {
+ return new SummaryMetadataEnricher(aiClient,
+ List.of(SummaryType.PREVIOUS, SummaryType.CURRENT, SummaryType.NEXT));
+ }
+}
+
+@Component
+class MySummaryEnricher {
+
+ private final SummaryMetadataEnricher enricher;
+
+ MySummaryEnricher(SummaryMetadataEnricher enricher) {
+ this.enricher = enricher;
+ }
+
+ List<Document> enrichDocuments(List<Document> documents) {
+ return enricher.apply(documents);
+ }
+}
+----
+
+
+==== Constructor
+
+The `SummaryMetadataEnricher` provides two constructors:
+
+1. `SummaryMetadataEnricher(ChatModel chatModel, List<SummaryType> summaryTypes)`
+2. `SummaryMetadataEnricher(ChatModel chatModel, List<SummaryType> summaryTypes, String summaryTemplate, MetadataMode metadataMode)`
+
+==== Parameters
+
+* `chatModel`: The AI model used for generating summaries.
+* `summaryTypes`: A list of `SummaryType` enum values indicating which summaries to generate (PREVIOUS, CURRENT, NEXT).
+* `summaryTemplate`: A custom template for summary generation (optional).
+* `metadataMode`: Specifies how to handle document metadata when generating summaries (optional).
+
+
+==== Behavior
+
+The `SummaryMetadataEnricher` processes documents as follows:
+
+1. For each input document, it creates a prompt using the document's content and the specified summary template.
+2. It sends this prompt to the provided `ChatModel` to generate a summary.
+3. Depending on the specified `summaryTypes`, it adds the following metadata to each document:
+* `section_summary`: Summary of the current document.
+* `prev_section_summary`: Summary of the previous document (if available and requested).
+* `next_section_summary`: Summary of the next document (if available and requested).
+4. The enriched documents are returned.
+
+==== Customization
+
+The summary generation prompt can be customized by providing a custom `summaryTemplate`. The default template is:
+
+[source,java]
+----
+"""
+Here is the content of the section:
+{context_str}
+
+Summarize the key topics and entities of the section.
+
+Summary:
+"""
+----
+
+==== Example
+
+[source,java]
+----
+ChatModel chatModel = // initialize your chat model
+SummaryMetadataEnricher enricher = new SummaryMetadataEnricher(chatModel,
+ List.of(SummaryType.PREVIOUS, SummaryType.CURRENT, SummaryType.NEXT));
+
+Document doc1 = new Document("Content of document 1");
+Document doc2 = new Document("Content of document 2");
+
+List<Document> enrichedDocs = enricher.apply(List.of(doc1, doc2));
+
+// Check the metadata of the enriched documents
+for (Document doc : enrichedDocs) {
+ System.out.println("Current summary: " + doc.getMetadata().get("section_summary"));
+ System.out.println("Previous summary: " + doc.getMetadata().get("prev_section_summary"));
+ System.out.println("Next summary: " + doc.getMetadata().get("next_section_summary"));
+}
+----
+
+The provided example demonstrates the expected behavior:
+
+* For a list of two documents, both documents receive a `section_summary`.
+* The first document receives a `next_section_summary` but no `prev_section_summary`.
+* The second document receives a `prev_section_summary` but no `next_section_summary`.
+* The `section_summary` of the first document matches the `prev_section_summary` of the second document.
+* The `next_section_summary` of the first document matches the `section_summary` of the second document.
+
+==== Notes
+
+* The `SummaryMetadataEnricher` requires a functioning `ChatModel` to generate summaries.
+* The enricher can handle document lists of any size, properly handling edge cases for the first and last documents.
+* This enricher is particularly useful for creating context-aware summaries, allowing for better understanding of document relationships in a sequence.
+* The `MetadataMode` parameter allows control over how existing metadata is incorporated into the summary generation process.
+
+
+== Writers
+
+=== File
+
+The `FileDocumentWriter` is a `DocumentWriter` implementation that writes the content of a list of `Document` objects into a file.
+
+==== Usage
+
+[source,java]
+----
+@Component
+class MyDocumentWriter {
+
+ public void writeDocuments(List<Document> documents) {
+ FileDocumentWriter writer = new FileDocumentWriter("output.txt", true, MetadataMode.ALL, false);
+ writer.accept(documents);
+ }
+}
+----
+
+==== Constructors
+
+The `FileDocumentWriter` provides three constructors:
+
+1. `FileDocumentWriter(String fileName)`
+2. `FileDocumentWriter(String fileName, boolean withDocumentMarkers)`
+3. `FileDocumentWriter(String fileName, boolean withDocumentMarkers, MetadataMode metadataMode, boolean append)`
+
+==== Parameters
+
+* `fileName`: The name of the file to write the documents to.
+* `withDocumentMarkers`: Whether to include document markers in the output (default: false).
+* `metadataMode`: Specifies which document content is written to the file (default: MetadataMode.NONE).
+* `append`: If true, data will be written to the end of the file rather than the beginning (default: false).
+
+==== Behavior
+
+The `FileDocumentWriter` processes documents as follows:
+
+1. It opens a FileWriter for the specified file name.
+2. For each document in the input list:
+a. If `withDocumentMarkers` is true, it writes a document marker including the document index and page numbers.
+b. It writes the formatted content of the document based on the specified `metadataMode`.
+3. The file is closed after all documents have been written.
+
+
+
+==== Document Markers
+
+When `withDocumentMarkers` is set to true, the writer includes markers for each document in the following format:
+
+[source]
+----
+### Doc: [index], pages:[start_page_number,end_page_number]
+----
+
+==== Metadata Handling
+
+The writer uses two specific metadata keys:
+
+* `page_number`: Represents the starting page number of the document.
+* `end_page_number`: Represents the ending page number of the document.
+
+These are used when writing document markers.
+
+==== Example
+
+[source,java]
+----
+List<Document> documents = // initialize your documents
+FileDocumentWriter writer = new FileDocumentWriter("output.txt", true, MetadataMode.ALL, true);
+writer.accept(documents);
+----
+
+This will write all documents to "output.txt", including document markers, using all available metadata, and appending to the file if it already exists.
+
+==== Notes
+
+* The writer uses `FileWriter`, so it writes text files with the default character encoding of the operating system.
+* If an error occurs during writing, a `RuntimeException` is thrown with the original exception as its cause.
+* The `metadataMode` parameter allows control over how existing metadata is incorporated into the written content.
+* This writer is particularly useful for debugging or creating human-readable outputs of document collections.
+
+
+=== VectorStore
Provides integration with various vector stores.
-See xref:api/vectordbs.adoc[Vector DB Documentation] for a full listing.
-
-[[etl-class-diagram]]
-=== ETL Class Diagram
-
-The following class diagram illustrates the ETL interfaces and implementations.
-
-// image::etl-class-diagram.jpg[align="center", width="800px"]
-image::etl-class-diagram.jpg[align="center"]
+See xref:api/vectordbs.adoc[Vector DB Documentation] for a full listing.
\ No newline at end of file
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/getting-started.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/getting-started.adoc
index b617b87b3..1e95aeceb 100644
--- a/spring-ai-docs/src/main/antora/modules/ROOT/pages/getting-started.adoc
+++ b/spring-ai-docs/src/main/antora/modules/ROOT/pages/getting-started.adoc
@@ -5,37 +5,6 @@ This section offers jumping off points for how to get started using Spring AI.
You should follow the steps in each of the following section according to your needs.
-[[spring-cli]]
-== Spring CLI
-
-The https://docs.spring.io/spring-cli/reference/index.html[Spring CLI], simplifies creating new applications directly from your terminal.
-Like the 'create-react-app' command for those familiar with the JavaScript ecosystem, Spring CLI provides a `spring boot new` command to create Spring-based projects.
-Spring CLI also offers features to integrate external code bases into your current project, and many other productivity features.
-
-NOTE: It is important to understand that the "Spring CLI" is a distinct project from the "Spring Boot CLI", each with its own set of functionalities.
-
-To begin creating a Spring AI application, follow these steps:
-
-
-. Download the latest https://github.com/spring-projects/spring-cli/releases[Spring CLI Release]
-and follow the https://docs.spring.io/spring-cli/reference/installation.html#_setting_up_your_path_or_alias[installation instructions].
-. To create a simple OpenAI-based application, use the command:
-+
-```shell
-spring boot new --from ai --name myai
-```
-. Consult the generated `README.md` file for guidance on obtaining an OpenAI API Key and running your first AI application.
-
-NOTE: Currently, the Spring CLI only supports Maven projects.
-
-To add the same simple AI application to an *existing* Maven project, execute:
-
-```shell
-spring boot add ai
-```
-
-NOTE: Spring CLI allows users to define their own https://docs.spring.io/spring-cli/reference/registering-new-projects.html[project catalogs] that define which projects you can create or add to your existing code base.
-
== Spring Initializr
Head on over to https://start.spring.io/[start.spring.io] and select the AI Models and Vector Stores that you want to use in your new applications.