feat: add JSoup HTML document reader

This commit introduces the `JsoupDocumentReader` and `JsoupDocumentReaderConfig` classes, which read and parse HTML documents using the JSoup library. The reader supports:

- Extracting text from specific HTML elements using CSS selectors.
- Extracting all text from the body of the document.
- Grouping text by element.
- Extracting metadata, including the document title, meta tags, and link URLs.
- Reading from various resource types (files, URLs, byte arrays).
- Configurable character encoding, selector, separator, and metadata extraction.

This new reader enhances Spring AI's ability to process web content and other HTML-based data sources.

Signed-off-by: Alexandros Pappas <apappascs@gmail.com>
2025-02-14 16:47:21 +01:00
parent a6b4555d45
commit 82b46d2182
11 changed files with 759 additions and 0 deletions
--- a/document-readers/jsoup-reader/
+++ b/document-readers/jsoup-reader/
@@ -0,0 +1,30 @@
+# Spring AI JSoup Document Reader
+
+This module provides an HTML document reader for the Spring AI project. It leverages the [JSoup](https://jsoup.org/) library to parse HTML content and extract text and metadata, making it suitable for use in AI applications.
+
+## Features
+
+*   **Flexible Text Extraction:**
+    *   Extract all text from the `<body>` of an HTML document.
+    *   Extract text from specific elements using CSS selectors.
+    *   Group text by element, creating a separate document for each selected element.
+    *   Combine text from multiple selected elements using a configurable separator.
+*   **Metadata Extraction:**
+    *   Extract the document title.
+    *   Extract content from `<meta>` tags (e.g., description, keywords). You can specify which meta tags to extract.
+    *   Extract a list of all absolute URLs of links (`<a href="...">`) within the document.
+*   **Configurable:**
+    *   Specify the character encoding (defaults to UTF-8).
+    *   Customize the CSS selector for element selection.
+    *   Configure the separator string for joining text from multiple elements.
+    *   Choose whether to extract all text or use element-based extraction.
+    *   Enable/disable link URL extraction.
+    *   Attach additional custom metadata to every produced document via the configuration.
+*   **Resource-Based:** Works with Spring's `Resource` abstraction, allowing you to read HTML from files, classpath resources, URLs, and even in-memory byte arrays.
+
+---
+
+#### How to Build:
+```bash
+./mvnw -pl document-readers/jsoup-reader clean install
+```
--- a/document-readers/jsoup-reader/pom.xml
+++ b/document-readers/jsoup-reader/pom.xml
@@ -0,0 +1,63 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  ~ Copyright 2025-2025 the original author or authors.
+  ~
+  ~ Licensed under the Apache License, Version 2.0 (the "License");
+  ~ you may not use this file except in compliance with the License.
+  ~ You may obtain a copy of the License at
+  ~
+  ~      https://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+		 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+		 xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+	<parent>
+		<groupId>org.springframework.ai</groupId>
+		<artifactId>spring-ai</artifactId>
+		<version>1.0.0-SNAPSHOT</version>
+		<relativePath>../../pom.xml</relativePath>
+	</parent>
+
+	<artifactId>spring-ai-jsoup-document-reader</artifactId>
+	<packaging>jar</packaging>
+	<name>Spring AI Document Reader - HTML</name>
+	<description>Spring AI HTML document reader</description>
+	<url>https://github.com/spring-projects/spring-ai</url>
+
+	<scm>
+		<url>https://github.com/spring-projects/spring-ai</url>
+		<connection>git://github.com/spring-projects/spring-ai.git</connection>
+		<developerConnection>git@github.com:spring-projects/spring-ai.git</developerConnection>
+	</scm>
+
+	<dependencies>
+		<dependency> <!-- Spring AI core, pinned to the same version as the parent project -->
+			<groupId>org.springframework.ai</groupId>
+			<artifactId>spring-ai-core</artifactId>
+			<version>${project.parent.version}</version>
+		</dependency>
+
+		<dependency> <!-- JSoup HTML parser; its version is declared explicitly in this module -->
+			<groupId>org.jsoup</groupId>
+			<artifactId>jsoup</artifactId>
+			<version>1.18.3</version>
+		</dependency>
+
+		<!-- Test-scoped dependencies (no version declared here; presumably managed by the parent POM) -->
+		<dependency>
+			<groupId>org.springframework.boot</groupId>
+			<artifactId>spring-boot-starter-test</artifactId>
+			<scope>test</scope>
+		</dependency>
+
+	</dependencies>
+
+</project>
--- a/document-readers/jsoup-reader/src/main/java/org/springframework/ai/reader/jsoup/JsoupDocumentReader.java
+++ b/document-readers/jsoup-reader/src/main/java/org/springframework/ai/reader/jsoup/JsoupDocumentReader.java
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2025-2025 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.springframework.ai.reader.jsoup;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import org.springframework.ai.document.Document;
+import org.springframework.ai.document.DocumentReader;
+import org.springframework.ai.reader.jsoup.config.JsoupDocumentReaderConfig;
+import org.springframework.core.io.DefaultResourceLoader;
+import org.springframework.core.io.Resource;
+
+/**
+ * Reads an HTML resource with JSoup and converts its text into Spring AI documents.
+ *
+ * <p>Depending on the {@link JsoupDocumentReaderConfig} it emits the whole body text, one
+ * document per selected element, or the joined text of all selected elements.
+ *
+ * @see <a href="https://jsoup.org/">JSoup Website</a>
+ * @author Alexandros Pappas
+ */
+public class JsoupDocumentReader implements DocumentReader {
+
+	private final Resource htmlResource;
+
+	private final JsoupDocumentReaderConfig config;
+
+	/** Creates a reader for a resource location, using the default configuration. */
+	public JsoupDocumentReader(String htmlResource) {
+		this(new DefaultResourceLoader().getResource(htmlResource));
+	}
+
+	/** Creates a reader for a resource, using the default configuration. */
+	public JsoupDocumentReader(Resource htmlResource) {
+		this(htmlResource, JsoupDocumentReaderConfig.defaultConfig());
+	}
+
+	/** Creates a reader for a resource location resolved by a {@link DefaultResourceLoader}. */
+	public JsoupDocumentReader(String htmlResource, JsoupDocumentReaderConfig config) {
+		this(new DefaultResourceLoader().getResource(htmlResource), config);
+	}
+
+	/** Creates a reader for the given resource and configuration. */
+	public JsoupDocumentReader(Resource htmlResource, JsoupDocumentReaderConfig config) {
+		this.htmlResource = htmlResource;
+		this.config = config;
+	}
+
+	/**
+	 * Parses the resource and builds the documents for the configured extraction mode.
+	 * @return the extracted documents; empty only if grouping by element matches nothing
+	 * @throws RuntimeException if the resource cannot be read
+	 */
+	@Override
+	public List<Document> get() {
+		try (InputStream inputStream = this.htmlResource.getInputStream()) {
+			org.jsoup.nodes.Document doc = Jsoup.parse(inputStream, this.config.charset, "");
+			// Metadata depends only on the page and the config, so build it once.
+			Map<String, Object> metadata = extractMetadata(doc);
+
+			List<String> texts;
+			if (this.config.allElements) {
+				texts = List.of(doc.body().text()); // body only, <head> is excluded
+			}
+			else if (this.config.groupByElement) {
+				// One document per element matched by the selector.
+				texts = doc.select(this.config.selector).stream().map(Element::text).toList();
+			}
+			else {
+				// One document with the text of all matched elements, joined by the separator.
+				Elements elements = doc.select(this.config.selector);
+				String text = elements.stream().map(Element::text).collect(Collectors.joining(this.config.separator));
+				texts = List.of(text);
+			}
+			List<Document> documents = new ArrayList<>();
+			for (String content : texts) {
+				Document document = new Document(content);
+				document.getMetadata().putAll(metadata);
+				documents.add(document);
+			}
+			return documents;
+		}
+		catch (IOException e) {
+			throw new RuntimeException("Failed to read HTML resource: " + this.htmlResource, e);
+		}
+	}
+
+	/** Builds the page-level metadata: title, meta tags, optional link URLs and config extras. */
+	private Map<String, Object> extractMetadata(org.jsoup.nodes.Document jsoupDoc) {
+		Map<String, Object> metadata = new HashMap<>();
+		metadata.put("title", jsoupDoc.title());
+
+		for (String metaTag : this.config.metadataTags) {
+			String value = jsoupDoc.select("meta[name=" + metaTag + "]").attr("content");
+			if (!value.isEmpty()) {
+				metadata.put(metaTag, value);
+			}
+		}
+
+		if (this.config.includeLinkUrls) {
+			// "abs:href" is "" when a link cannot be made absolute; keep real URLs only.
+			Elements links = jsoupDoc.select("a[href]");
+			List<String> urls = links.stream().map(a -> a.attr("abs:href")).filter(u -> !u.isEmpty()).toList();
+			metadata.put("linkUrls", urls);
+		}
+
+		metadata.putAll(this.config.additionalMetadata);
+		return metadata;
+	}
+
+}
--- a/document-readers/jsoup-reader/src/main/java/org/springframework/ai/reader/jsoup/config/JsoupDocumentReaderConfig.java
+++ b/document-readers/jsoup-reader/src/main/java/org/springframework/ai/reader/jsoup/config/JsoupDocumentReaderConfig.java
@@ -0,0 +1,207 @@
+/*
+ * Copyright 2025-2025 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.springframework.ai.reader.jsoup.config;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.springframework.ai.reader.jsoup.JsoupDocumentReader;
+import org.springframework.util.Assert;
+
+/**
+ * Common configuration for the {@link JsoupDocumentReader}.
+ *
+ * Provides options for the character encoding, CSS selector, text separator, extraction
+ * mode (whole body, per element, or joined), and the metadata attached to documents.
+ *
+ * @author Alexandros Pappas
+ */
+public class JsoupDocumentReaderConfig {
+
+	public final String charset;
+
+	public final String selector;
+
+	public final String separator;
+
+	public final boolean allElements;
+
+	public final boolean groupByElement;
+
+	public final boolean includeLinkUrls;
+
+	public final List<String> metadataTags;
+
+	public final Map<String, Object> additionalMetadata;
+
+	private JsoupDocumentReaderConfig(Builder builder) {
+		this.charset = builder.charset;
+		this.selector = builder.selector;
+		this.separator = builder.separator;
+		this.allElements = builder.allElements;
+		this.includeLinkUrls = builder.includeLinkUrls;
+		this.metadataTags = new ArrayList<>(builder.metadataTags); // detach from the builder
+		this.groupByElement = builder.groupByElement;
+		this.additionalMetadata = new HashMap<>(builder.additionalMetadata); // same reason
+	}
+
+	/** Returns a new builder pre-populated with the default settings. */
+	public static Builder builder() {
+		return new Builder();
+	}
+
+	/** Returns a configuration built from the builder defaults. */
+	public static JsoupDocumentReaderConfig defaultConfig() {
+		return builder().build();
+	}
+
+	public static class Builder {
+
+		private String charset = "UTF-8";
+
+		private String selector = "body";
+
+		private String separator = "\n";
+
+		private boolean allElements = false;
+
+		private boolean includeLinkUrls = false;
+
+		private List<String> metadataTags = new ArrayList<>(List.of("description", "keywords"));
+
+		private boolean groupByElement = false;
+
+		private Map<String, Object> additionalMetadata = new HashMap<>();
+
+		private Builder() {
+		}
+
+		/**
+		 * Sets the character encoding to use for reading the HTML. Defaults to UTF-8.
+		 * @param charset The charset to use.
+		 * @return This builder.
+		 */
+		public Builder charset(String charset) {
+			this.charset = charset;
+			return this;
+		}
+
+		/**
+		 * Sets the CSS selector to use for extracting elements. Defaults to "body".
+		 * @param selector The CSS selector.
+		 * @return This builder.
+		 */
+		public Builder selector(String selector) {
+			this.selector = selector;
+			return this;
+		}
+
+		/**
+		 * Sets the separator used to join text from multiple elements. Defaults to "\n".
+		 * @param separator The separator string.
+		 * @return This builder.
+		 */
+		public Builder separator(String separator) {
+			this.separator = separator;
+			return this;
+		}
+
+		/**
+		 * Enables extracting text from all elements in the body, creating a single
+		 * document. Overrides the selector and grouping settings. Defaults to false.
+		 * @param allElements True to extract all text, false otherwise.
+		 * @return This builder.
+		 */
+		public Builder allElements(boolean allElements) {
+			this.allElements = allElements;
+			return this;
+		}
+
+		/**
+		 * Creates one document per element matched by the selector. Defaults to false.
+		 * @param groupByElement True to create a document per element, false to join them.
+		 * @return This builder.
+		 */
+		public Builder groupByElement(boolean groupByElement) {
+			this.groupByElement = groupByElement;
+			return this;
+		}
+
+		/**
+		 * Enables the inclusion of link URLs in the document metadata. Defaults to false.
+		 * @param includeLinkUrls True to include link URLs, false otherwise.
+		 * @return This builder.
+		 */
+		public Builder includeLinkUrls(boolean includeLinkUrls) {
+			this.includeLinkUrls = includeLinkUrls;
+			return this;
+		}
+
+		/**
+		 * Adds a metadata tag name to extract from the HTML {@code <meta>} tags.
+		 * @param metadataTag The name of the metadata tag.
+		 * @return This builder.
+		 */
+		public Builder metadataTag(String metadataTag) {
+			this.metadataTags.add(metadataTag);
+			return this;
+		}
+
+		/**
+		 * Replaces the names of the HTML {@code <meta>} tags to extract with the given list.
+		 * @param metadataTags The list of metadata tag names.
+		 * @return This builder.
+		 */
+		public Builder metadataTags(List<String> metadataTags) {
+			this.metadataTags = new ArrayList<>(metadataTags);
+			return this;
+		}
+
+		/**
+		 * Adds an entry to the metadata of every built {@link org.springframework.ai.document.Document}.
+		 * @param key The metadata key; must not be null.
+		 * @param value The metadata value; must not be null.
+		 * @return This builder.
+		 */
+		public Builder additionalMetadata(String key, Object value) {
+			Assert.notNull(key, "key must not be null");
+			Assert.notNull(value, "value must not be null");
+			this.additionalMetadata.put(key, value);
+			return this;
+		}
+
+		/**
+		 * Replaces the additional metadata with a mutable copy of the given map.
+		 * @param additionalMetadata The metadata to attach; must not be null.
+		 * @return This builder.
+		 */
+		public Builder additionalMetadata(Map<String, Object> additionalMetadata) {
+			Assert.notNull(additionalMetadata, "additionalMetadata must not be null");
+			this.additionalMetadata = new HashMap<>(additionalMetadata);
+			return this;
+		}
+
+		/** Builds the configuration from the current builder state. */
+		public JsoupDocumentReaderConfig build() {
+			return new JsoupDocumentReaderConfig(this);
+		}
+
+	}
+
+}
--- a/document-readers/jsoup-reader/src/test/java/org/springframework/ai/reader/jsoup/JsoupDocumentReaderTests.java
+++ b/document-readers/jsoup-reader/src/test/java/org/springframework/ai/reader/jsoup/JsoupDocumentReaderTests.java
@@ -0,0 +1,186 @@
+/*
+ * Copyright 2025-2025 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.springframework.ai.reader.jsoup;
+
+import java.util.List;
+
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import org.springframework.ai.document.Document;
+import org.springframework.ai.reader.jsoup.config.JsoupDocumentReaderConfig;
+import org.springframework.core.io.ByteArrayResource;
+import org.springframework.core.io.DefaultResourceLoader;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/**
+ * Tests for {@link JsoupDocumentReader}.
+ *
+ * @author Alexandros Pappas
+ */
+class JsoupDocumentReaderTests {
+
+	@Test
+	void testSimpleRead() {
+		JsoupDocumentReader reader = new JsoupDocumentReader("classpath:/test.html");
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		Document document = documents.get(0);
+		assertThat(document.getText()).contains("This is a test HTML document.");
+		assertThat(document.getText()).contains("Some paragraph text.");
+		assertThat(document.getMetadata()).containsEntry("title", "Test HTML");
+		assertThat(document.getMetadata()).containsEntry("description", "A test document for Spring AI");
+		assertThat(document.getMetadata()).containsEntry("keywords", "test,html,spring ai");
+	}
+
+	@Test
+	void testSimpleReadWithAdditionalMetadata() {
+		JsoupDocumentReader reader = new JsoupDocumentReader("classpath:/test.html",
+				JsoupDocumentReaderConfig.builder().additionalMetadata("key", "value").build());
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		Document document = documents.get(0);
+		assertThat(document.getMetadata()).containsEntry("key", "value");
+	}
+
+	@Test
+	void testSelector() {
+		JsoupDocumentReader reader = new JsoupDocumentReader("classpath:/test.html",
+				JsoupDocumentReaderConfig.builder().selector("p").build());
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		assertThat(documents.get(0).getText()).isEqualTo("Some paragraph text.");
+	}
+
+	@Test
+	void testAllElements() {
+		JsoupDocumentReader reader = new JsoupDocumentReader(
+				new DefaultResourceLoader().getResource("classpath:/test.html"),
+				JsoupDocumentReaderConfig.builder().allElements(true).build());
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		Document document = documents.get(0);
+		assertThat(document.getText()).contains("This is a test HTML document.");
+		assertThat(document.getText()).contains("Some paragraph text.");
+	}
+
+	@Test
+	void testWithLinkUrls() {
+		JsoupDocumentReader reader = new JsoupDocumentReader(
+				new DefaultResourceLoader().getResource("classpath:/test.html"),
+				JsoupDocumentReaderConfig.builder().includeLinkUrls(true).build());
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		Document document = documents.get(0);
+
+		assertThat(document.getMetadata()).containsKey("linkUrls");
+		@SuppressWarnings("unchecked") // the reader stores link URLs as a List<String>
+		List<String> linkUrls = (List<String>) document.getMetadata().get("linkUrls");
+		assertThat(linkUrls).contains("https://spring.io/");
+	}
+
+	@Test
+	void testWithMetadataTags() {
+		JsoupDocumentReader reader = new JsoupDocumentReader(
+				new DefaultResourceLoader().getResource("classpath:/test.html"),
+				JsoupDocumentReaderConfig.builder().metadataTags(List.of("custom1", "custom2")).build());
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		Document document = documents.get(0);
+		assertThat(document.getMetadata()).containsKeys("custom1", "custom2");
+		assertThat(document.getMetadata().get("custom1")).isEqualTo("value1");
+		assertThat(document.getMetadata().get("custom2")).isEqualTo("value2");
+	}
+
+	@Test
+	void testWithGroupByElement() {
+		JsoupDocumentReader reader = new JsoupDocumentReader(
+				new DefaultResourceLoader().getResource("classpath:/test-group-by.html"),
+				JsoupDocumentReaderConfig.builder().groupByElement(true).selector("section").build());
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(2);
+		assertThat(documents.get(0).getText()).isEqualTo("Section 1 content");
+		assertThat(documents.get(1).getText()).isEqualTo("Section 2 content");
+	}
+
+	@Test
+	@Disabled("This test requires an active internet connection")
+	void testWikipediaHeadlines() {
+		// Use a URL resource instead of classpath:
+		JsoupDocumentReader reader = new JsoupDocumentReader("https://en.wikipedia.org/",
+				JsoupDocumentReaderConfig.builder().selector("#mp-itn b a").includeLinkUrls(true).build());
+
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		Document document = documents.get(0);
+
+		// Check for *some* content - we don't want to hard-code specific headlines
+		// as they will change. This verifies the selector is working.
+		assertThat(document.getText()).isNotEmpty();
+
+		// Check if the metadata contains any links
+		assertThat(document.getMetadata()).containsKey("linkUrls");
+		assertThat(document.getMetadata().get("linkUrls")).isInstanceOf(List.class);
+	}
+
+	@Test
+	void testParseFromString() {
+		String html = "<html><head><title>First parse</title></head>"
+				+ "<body><p>Parsed HTML into a doc.</p></body></html>";
+
+		// Wrap the UTF-8 bytes of the HTML (the reader's default charset) in a resource
+		byte[] htmlBytes = html.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+		ByteArrayResource byteArrayResource = new ByteArrayResource(htmlBytes);
+
+		JsoupDocumentReader reader = new JsoupDocumentReader(byteArrayResource,
+				JsoupDocumentReaderConfig.builder().build());
+
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		Document doc = documents.get(0);
+		assertThat(doc.getText()).isEqualTo("Parsed HTML into a doc.");
+		assertThat(doc.getMetadata()).containsEntry("title", "First parse");
+	}
+
+	@Test
+	void testParseBodyFragment() {
+		String html = "<div><p>Lorem ipsum.</p></div>";
+
+		byte[] htmlBytes = html.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+		ByteArrayResource byteArrayResource = new ByteArrayResource(htmlBytes);
+
+		JsoupDocumentReader reader = new JsoupDocumentReader(byteArrayResource,
+				JsoupDocumentReaderConfig.builder()
+					.selector("div") // Select the div
+					.build());
+
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		assertThat(documents.get(0).getText()).isEqualTo("Lorem ipsum.");
+	}
+
+	@Test
+	void testNonExistingUrl() {
+		// The reserved ".invalid" TLD (RFC 2606) never resolves, so reading always fails.
+		JsoupDocumentReader reader = new JsoupDocumentReader("https://nonexistingurl.invalid",
+				JsoupDocumentReaderConfig.builder().build());
+		assertThatThrownBy(reader::get).isInstanceOf(RuntimeException.class);
+	}
+
+}
--- a/document-readers/jsoup-reader/src/test/resources/test-group-by.html
+++ b/document-readers/jsoup-reader/src/test/resources/test-group-by.html
@@ -0,0 +1,14 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Group By Element Test</title>
+</head>
+<body>
+<section>
+    <p>Section 1 content</p>
+</section>
+<section>
+    <p>Section 2 content</p>
+</section>
+</body>
+</html>
--- a/document-readers/jsoup-reader/src/test/resources/test.html
+++ b/document-readers/jsoup-reader/src/test/resources/test.html
@@ -0,0 +1,15 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Test HTML</title>
+    <meta name="description" content="A test document for Spring AI">
+    <meta name="keywords" content="test,html,spring ai">
+    <meta name="custom1" content="value1">
+    <meta name="custom2" content="value2">
+</head>
+<body>
+<h1>This is a test HTML document.</h1>
+<p>Some paragraph text.</p>
+<a href="https://spring.io/">Spring</a>
+</body>
+</html>