feat: add JSoup HTML document reader
This commit introduces the `JsoupDocumentReader` and `JsoupDocumentReaderConfig` classes, which provide functionality to read and parse HTML documents using the JSoup library. The reader supports:
- Extracting text from specific HTML elements using CSS selectors.
- Extracting all text from the body of the document.
- Grouping text by element.
- Extracting metadata, including the document title, meta tags, and link URLs.
- Reading from various resource types (files, URLs, byte arrays).
- Configurable character encoding, selector, separator, and metadata extraction.

This new reader enhances Spring AI's ability to process web content and other HTML-based data sources.

Signed-off-by: Alexandros Pappas <apappascs@gmail.com>
This commit is contained in:
committed by
Ilayaperumal Gopinathan
parent
a6b4555d45
commit
82b46d2182
30
document-readers/jsoup-reader/ README.md
Normal file
30
document-readers/jsoup-reader/ README.md
Normal file
@@ -0,0 +1,30 @@
|
||||
# Spring AI JSoup Document Reader
|
||||
|
||||
This module provides an HTML document reader for the Spring AI project. It leverages the [JSoup](https://jsoup.org/) library to parse HTML content and extract text and metadata, making it suitable for use in AI applications.
|
||||
|
||||
## Features
|
||||
|
||||
* **Flexible Text Extraction:**
|
||||
* Extract all text from the `<body>` of an HTML document.
|
||||
* Extract text from specific elements using CSS selectors.
|
||||
* Group text by element, creating a separate document for each selected element.
|
||||
* Combine text from multiple selected elements using a configurable separator.
|
||||
* **Metadata Extraction:**
|
||||
* Extract the document title.
|
||||
* Extract content from `<meta>` tags (e.g., description, keywords). You can specify which meta tags to extract.
|
||||
* Extract a list of all absolute URLs of links (`<a href="...">`) within the document.
|
||||
* **Configurable:**
|
||||
* Specify the character encoding (defaults to UTF-8).
|
||||
* Customize the CSS selector for element selection.
|
||||
* Configure the separator string for joining text from multiple elements.
|
||||
* Choose whether to extract all text or use element-based extraction.
|
||||
* Enable/disable link URL extraction.
|
||||
* Add additional metadata using configuration.
|
||||
* **Resource-Based:** Works with Spring's `Resource` abstraction, allowing you to read HTML from files, classpath resources, URLs, and even in-memory byte arrays.
|
||||
|
||||
---
|
||||
|
||||
#### How to Build:
|
||||
```bash
|
||||
./mvnw -pl document-readers/jsoup-reader clean install
|
||||
```
|
||||
63
document-readers/jsoup-reader/pom.xml
Normal file
63
document-readers/jsoup-reader/pom.xml
Normal file
@@ -0,0 +1,63 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
~ Copyright 2025-2025 the original author or authors.
|
||||
~
|
||||
~ Licensed under the Apache License, Version 2.0 (the "License");
|
||||
~ you may not use this file except in compliance with the License.
|
||||
~ You may obtain a copy of the License at
|
||||
~
|
||||
~ https://www.apache.org/licenses/LICENSE-2.0
|
||||
~
|
||||
~ Unless required by applicable law or agreed to in writing, software
|
||||
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
~ See the License for the specific language governing permissions and
|
||||
~ limitations under the License.
|
||||
-->
|
||||
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>org.springframework.ai</groupId>
|
||||
<artifactId>spring-ai</artifactId>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
<relativePath>../../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
<artifactId>spring-ai-jsoup-document-reader</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
<name>Spring AI Document Reader - HTML</name>
|
||||
<description>Spring AI HTML document reader</description>
|
||||
<url>https://github.com/spring-projects/spring-ai</url>
|
||||
|
||||
<scm>
|
||||
<url>https://github.com/spring-projects/spring-ai</url>
|
||||
<connection>git://github.com/spring-projects/spring-ai.git</connection>
|
||||
<developerConnection>git@github.com:spring-projects/spring-ai.git</developerConnection>
|
||||
</scm>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.springframework.ai</groupId>
|
||||
<artifactId>spring-ai-core</artifactId>
|
||||
<version>${project.parent.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.jsoup</groupId>
|
||||
<artifactId>jsoup</artifactId>
|
||||
<version>1.18.3</version>
|
||||
</dependency>
|
||||
|
||||
<!-- TESTING -->
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-test</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
@@ -0,0 +1,135 @@
|
||||
/*
|
||||
* Copyright 2025-2025 the original author or authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* https://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.springframework.ai.reader.jsoup;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import org.springframework.ai.document.Document;
|
||||
import org.springframework.ai.document.DocumentReader;
|
||||
import org.springframework.ai.reader.jsoup.config.JsoupDocumentReaderConfig;
|
||||
import org.springframework.core.io.DefaultResourceLoader;
|
||||
import org.springframework.core.io.Resource;
|
||||
|
||||
/**
|
||||
* Reads HTML documents and extracts text content using JSoup.
|
||||
*
|
||||
* This reader provides options for selecting specific HTML elements to extract, handling
|
||||
* links, and extracting metadata. It leverages the JSoup library for parsing HTML.
|
||||
*
|
||||
* @see <a href="https://jsoup.org/">JSoup Website</a>
|
||||
* @author Alexandros Pappas
|
||||
*/
|
||||
public class JsoupDocumentReader implements DocumentReader {
|
||||
|
||||
private final Resource htmlResource;
|
||||
|
||||
private final JsoupDocumentReaderConfig config;
|
||||
|
||||
public JsoupDocumentReader(String htmlResource) {
|
||||
this(new DefaultResourceLoader().getResource(htmlResource));
|
||||
}
|
||||
|
||||
public JsoupDocumentReader(Resource htmlResource) {
|
||||
this(htmlResource, JsoupDocumentReaderConfig.defaultConfig());
|
||||
}
|
||||
|
||||
public JsoupDocumentReader(String htmlResource, JsoupDocumentReaderConfig config) {
|
||||
this(new DefaultResourceLoader().getResource(htmlResource), config);
|
||||
}
|
||||
|
||||
public JsoupDocumentReader(Resource htmlResource, JsoupDocumentReaderConfig config) {
|
||||
this.htmlResource = htmlResource;
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<Document> get() {
|
||||
try (InputStream inputStream = htmlResource.getInputStream()) {
|
||||
org.jsoup.nodes.Document doc = Jsoup.parse(inputStream, this.config.charset, "");
|
||||
|
||||
List<Document> documents = new ArrayList<>();
|
||||
|
||||
if (this.config.allElements) {
|
||||
// Extract text from all elements and create a single document
|
||||
String allText = doc.body().text(); // .body to exclude head
|
||||
Document document = new Document(allText);
|
||||
addMetadata(doc, document);
|
||||
documents.add(document);
|
||||
}
|
||||
else if (this.config.groupByElement) {
|
||||
// Extract text on a per-element base using the defined selector.
|
||||
Elements selectedElements = doc.select(this.config.selector);
|
||||
for (Element element : selectedElements) {
|
||||
String elementText = element.text();
|
||||
Document document = new Document(elementText);
|
||||
addMetadata(doc, document);
|
||||
// Do not add metadata from element to avoid duplication.
|
||||
documents.add(document);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Extract text from specific elements based on the selector
|
||||
Elements elements = doc.select(this.config.selector);
|
||||
String text = elements.stream().map(Element::text).collect(Collectors.joining(this.config.separator));
|
||||
Document document = new Document(text);
|
||||
addMetadata(doc, document);
|
||||
documents.add(document);
|
||||
}
|
||||
|
||||
return documents;
|
||||
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new RuntimeException("Failed to read HTML resource: " + htmlResource, e);
|
||||
}
|
||||
}
|
||||
|
||||
private void addMetadata(org.jsoup.nodes.Document jsoupDoc, Document springDoc) {
|
||||
Map<String, Object> metadata = new HashMap<>();
|
||||
metadata.put("title", jsoupDoc.title());
|
||||
|
||||
for (String metaTag : this.config.metadataTags) {
|
||||
String value = jsoupDoc.select("meta[name=" + metaTag + "]").attr("content");
|
||||
if (!value.isEmpty()) {
|
||||
metadata.put(metaTag, value);
|
||||
}
|
||||
}
|
||||
|
||||
if (this.config.includeLinkUrls) {
|
||||
Elements links = jsoupDoc.select("a[href]");
|
||||
List<String> linkUrls = links.stream().map(link -> link.attr("abs:href")).toList();
|
||||
metadata.put("linkUrls", linkUrls);
|
||||
}
|
||||
|
||||
// Use putAll to add all entries from additionalMetadata
|
||||
metadata.putAll(this.config.additionalMetadata);
|
||||
|
||||
// Add all collected metadata to the Spring Document
|
||||
springDoc.getMetadata().putAll(metadata);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,207 @@
|
||||
/*
|
||||
* Copyright 2025-2025 the original author or authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* https://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.springframework.ai.reader.jsoup.config;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.ai.reader.jsoup.JsoupDocumentReader;
|
||||
import org.springframework.util.Assert;
|
||||
|
||||
/**
|
||||
* Common configuration for the {@link JsoupDocumentReader}.
|
||||
*
|
||||
* Provides options for specifying the character encoding, CSS selector, text separator,
|
||||
* and whether to extract all text from the body or specific elements, and handling link
|
||||
* extraction.
|
||||
*
|
||||
* @author Alexandros Pappas
|
||||
*/
|
||||
public class JsoupDocumentReaderConfig {
|
||||
|
||||
public final String charset;
|
||||
|
||||
public final String selector;
|
||||
|
||||
public final String separator;
|
||||
|
||||
public final boolean allElements;
|
||||
|
||||
public final boolean groupByElement;
|
||||
|
||||
public final boolean includeLinkUrls;
|
||||
|
||||
public final List<String> metadataTags;
|
||||
|
||||
public final Map<String, Object> additionalMetadata;
|
||||
|
||||
private JsoupDocumentReaderConfig(Builder builder) {
|
||||
this.charset = builder.charset;
|
||||
this.selector = builder.selector;
|
||||
this.separator = builder.separator;
|
||||
this.allElements = builder.allElements;
|
||||
this.includeLinkUrls = builder.includeLinkUrls;
|
||||
this.metadataTags = builder.metadataTags;
|
||||
this.groupByElement = builder.groupByElement;
|
||||
this.additionalMetadata = builder.additionalMetadata;
|
||||
}
|
||||
|
||||
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
public static JsoupDocumentReaderConfig defaultConfig() {
|
||||
return builder().build();
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
|
||||
private String charset = "UTF-8";
|
||||
|
||||
private String selector = "body";
|
||||
|
||||
private String separator = "\n";
|
||||
|
||||
private boolean allElements = false;
|
||||
|
||||
private boolean includeLinkUrls = false;
|
||||
|
||||
private List<String> metadataTags = new ArrayList<>(List.of("description", "keywords"));
|
||||
|
||||
private boolean groupByElement = false;
|
||||
|
||||
private Map<String, Object> additionalMetadata = new HashMap<>();
|
||||
|
||||
private Builder() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the character encoding to use for reading the HTML. Defaults to UTF-8.
|
||||
* @param charset The charset to use.
|
||||
* @return This builder.
|
||||
*/
|
||||
public Builder charset(String charset) {
|
||||
this.charset = charset;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the CSS selector to use for extracting elements. Defaults to "body".
|
||||
* @param selector The CSS selector.
|
||||
* @return This builder.
|
||||
*/
|
||||
public Builder selector(String selector) {
|
||||
this.selector = selector;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the separator string to use when joining text from multiple elements.
|
||||
* Defaults to "\n".
|
||||
* @param separator The separator string.
|
||||
* @return This builder.
|
||||
*/
|
||||
public Builder separator(String separator) {
|
||||
this.separator = separator;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Enables extracting text from all elements in the body, creating a single
|
||||
* document. Overrides the selector setting. Defaults to false.
|
||||
* @param allElements True to extract all text, false otherwise.
|
||||
* @return This builder.
|
||||
*/
|
||||
public Builder allElements(boolean allElements) {
|
||||
this.allElements = allElements;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines if on the selected element, the content will be read on per-element
|
||||
* base.
|
||||
* @param groupByElement to read text using element as a separator.
|
||||
* @return this builder.
|
||||
*/
|
||||
public Builder groupByElement(boolean groupByElement) {
|
||||
this.groupByElement = groupByElement;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Enables the inclusion of link URLs in the document metadata. Defaults to false.
|
||||
* @param includeLinkUrls True to include link URLs, false otherwise.
|
||||
* @return This builder.
|
||||
*/
|
||||
public Builder includeLinkUrls(boolean includeLinkUrls) {
|
||||
this.includeLinkUrls = includeLinkUrls;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a metadata tag name to extract from the HTML <meta> tags.
|
||||
* @param metadataTag The name of the metadata tag.
|
||||
* @return This builder.
|
||||
*/
|
||||
public Builder metadataTag(String metadataTag) {
|
||||
this.metadataTags.add(metadataTag);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the metadata tags to extract from the HTML <meta> tags. Overwrites any
|
||||
* previously added tags.
|
||||
* @param metadataTags The list of metadata tag names.
|
||||
* @return This builder.
|
||||
*/
|
||||
public Builder metadataTags(List<String> metadataTags) {
|
||||
this.metadataTags = new ArrayList<>(metadataTags);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds this additional metadata to the all built
|
||||
* {@link org.springframework.ai.document.Document}s.
|
||||
* @return this builder
|
||||
*/
|
||||
public Builder additionalMetadata(String key, Object value) {
|
||||
Assert.notNull(key, "key must not be null");
|
||||
Assert.notNull(value, "value must not be null");
|
||||
this.additionalMetadata.put(key, value);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds this additional metadata to the all built
|
||||
* {@link org.springframework.ai.document.Document}s.
|
||||
* @return this builder
|
||||
*/
|
||||
public Builder additionalMetadata(Map<String, Object> additionalMetadata) {
|
||||
Assert.notNull(additionalMetadata, "additionalMetadata must not be null");
|
||||
this.additionalMetadata = additionalMetadata;
|
||||
return this;
|
||||
}
|
||||
|
||||
public JsoupDocumentReaderConfig build() {
|
||||
return new JsoupDocumentReaderConfig(this);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,186 @@
|
||||
/*
|
||||
* Copyright 2025-2025 the original author or authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* https://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.springframework.ai.reader.jsoup;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThatThrownBy;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import org.springframework.ai.document.Document;
|
||||
import org.springframework.ai.reader.jsoup.config.JsoupDocumentReaderConfig;
|
||||
import org.springframework.core.io.ByteArrayResource;
|
||||
import org.springframework.core.io.DefaultResourceLoader;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
/**
|
||||
* Tests for {@link JsoupDocumentReader}.
|
||||
*
|
||||
* @author Alexandros Pappas
|
||||
*/
|
||||
class JsoupDocumentReaderTests {
|
||||
|
||||
@Test
|
||||
void testSimpleRead() {
|
||||
JsoupDocumentReader reader = new JsoupDocumentReader("classpath:/test.html");
|
||||
List<Document> documents = reader.get();
|
||||
assertThat(documents).hasSize(1);
|
||||
Document document = documents.get(0);
|
||||
assertThat(document.getText()).contains("This is a test HTML document.");
|
||||
assertThat(document.getText()).contains("Some paragraph text.");
|
||||
assertThat(document.getMetadata()).containsEntry("title", "Test HTML");
|
||||
assertThat(document.getMetadata()).containsEntry("description", "A test document for Spring AI");
|
||||
assertThat(document.getMetadata()).containsEntry("keywords", "test,html,spring ai");
|
||||
}
|
||||
|
||||
@Test
|
||||
void testSimpleReadWithAdditionalMetadata() {
|
||||
JsoupDocumentReader reader = new JsoupDocumentReader("classpath:/test.html",
|
||||
JsoupDocumentReaderConfig.builder().additionalMetadata("key", "value").build());
|
||||
List<Document> documents = reader.get();
|
||||
assertThat(documents).hasSize(1);
|
||||
Document document = documents.get(0);
|
||||
assertThat(document.getMetadata()).containsEntry("key", "value");
|
||||
}
|
||||
|
||||
@Test
|
||||
void testSelector() {
|
||||
JsoupDocumentReader reader = new JsoupDocumentReader("classpath:/test.html",
|
||||
JsoupDocumentReaderConfig.builder().selector("p").build());
|
||||
List<Document> documents = reader.get();
|
||||
assertThat(documents).hasSize(1);
|
||||
assertThat(documents.get(0).getText()).isEqualTo("Some paragraph text.");
|
||||
}
|
||||
|
||||
@Test
|
||||
void testAllElements() {
|
||||
JsoupDocumentReader reader = new JsoupDocumentReader(
|
||||
new DefaultResourceLoader().getResource("classpath:/test.html"),
|
||||
JsoupDocumentReaderConfig.builder().allElements(true).build());
|
||||
List<Document> documents = reader.get();
|
||||
assertThat(documents).hasSize(1);
|
||||
Document document = documents.get(0);
|
||||
assertThat(document.getText()).contains("This is a test HTML document.");
|
||||
assertThat(document.getText()).contains("Some paragraph text.");
|
||||
}
|
||||
|
||||
@Test
|
||||
void testWithLinkUrls() {
|
||||
JsoupDocumentReader reader = new JsoupDocumentReader(
|
||||
new DefaultResourceLoader().getResource("classpath:/test.html"),
|
||||
JsoupDocumentReaderConfig.builder().includeLinkUrls(true).build());
|
||||
List<Document> documents = reader.get();
|
||||
assertThat(documents).hasSize(1);
|
||||
Document document = documents.get(0);
|
||||
|
||||
assertThat(document.getMetadata()).containsKey("linkUrls");
|
||||
|
||||
List<String> linkUrls = (List<String>) document.getMetadata().get("linkUrls");
|
||||
assertThat(linkUrls).contains("https://spring.io/");
|
||||
}
|
||||
|
||||
@Test
|
||||
void testWithMetadataTags() {
|
||||
JsoupDocumentReader reader = new JsoupDocumentReader(
|
||||
new DefaultResourceLoader().getResource("classpath:/test.html"),
|
||||
JsoupDocumentReaderConfig.builder().metadataTags(List.of("custom1", "custom2")).build());
|
||||
List<Document> documents = reader.get();
|
||||
assertThat(documents).hasSize(1);
|
||||
Document document = documents.get(0);
|
||||
assertThat(document.getMetadata()).containsKeys("custom1", "custom2");
|
||||
assertThat(document.getMetadata().get("custom1")).isEqualTo("value1");
|
||||
assertThat(document.getMetadata().get("custom2")).isEqualTo("value2");
|
||||
}
|
||||
|
||||
@Test
|
||||
void testWithGroupByElement() {
|
||||
JsoupDocumentReader reader = new JsoupDocumentReader(
|
||||
new DefaultResourceLoader().getResource("classpath:/test-group-by.html"),
|
||||
JsoupDocumentReaderConfig.builder().groupByElement(true).selector("section").build());
|
||||
List<Document> documents = reader.get();
|
||||
assertThat(documents).hasSize(2);
|
||||
assertThat(documents.get(0).getText()).isEqualTo("Section 1 content");
|
||||
assertThat(documents.get(1).getText()).isEqualTo("Section 2 content");
|
||||
}
|
||||
|
||||
@Test
|
||||
@Disabled("This test requires an active internet connection")
|
||||
void testWikipediaHeadlines() {
|
||||
// Use a URL resource instead of classpath:
|
||||
JsoupDocumentReader reader = new JsoupDocumentReader("https://en.wikipedia.org/",
|
||||
JsoupDocumentReaderConfig.builder().selector("#mp-itn b a").includeLinkUrls(true).build());
|
||||
|
||||
List<Document> documents = reader.get();
|
||||
assertThat(documents).hasSize(1);
|
||||
Document document = documents.get(0);
|
||||
|
||||
// Check for *some* content - we don't want to hard-code specific headlines
|
||||
// as they will change. This verifies the selector is working.
|
||||
assertThat(document.getText()).isNotEmpty();
|
||||
|
||||
// Check if the metadata contains any links
|
||||
assertThat(document.getMetadata()).containsKey("linkUrls");
|
||||
assertThat(document.getMetadata().get("linkUrls")).isInstanceOf(List.class);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testParseFromString() {
|
||||
String html = "<html><head><title>First parse</title></head>"
|
||||
+ "<body><p>Parsed HTML into a doc.</p></body></html>";
|
||||
|
||||
// Decode the base64 string and create a ByteArrayResource
|
||||
byte[] htmlBytes = html.getBytes();
|
||||
ByteArrayResource byteArrayResource = new ByteArrayResource(htmlBytes);
|
||||
|
||||
JsoupDocumentReader reader = new JsoupDocumentReader(byteArrayResource,
|
||||
JsoupDocumentReaderConfig.builder().build());
|
||||
|
||||
List<Document> documents = reader.get();
|
||||
assertThat(documents).hasSize(1);
|
||||
Document doc = documents.get(0);
|
||||
assertThat(doc.getText()).isEqualTo("Parsed HTML into a doc.");
|
||||
assertThat(doc.getMetadata()).containsEntry("title", "First parse");
|
||||
}
|
||||
|
||||
@Test
|
||||
void testParseBodyFragment() {
|
||||
String html = "<div><p>Lorem ipsum.</p></div>";
|
||||
|
||||
// Decode the base64 string and create a ByteArrayResource
|
||||
byte[] htmlBytes = html.getBytes();
|
||||
ByteArrayResource byteArrayResource = new ByteArrayResource(htmlBytes);
|
||||
|
||||
JsoupDocumentReader reader = new JsoupDocumentReader(byteArrayResource,
|
||||
JsoupDocumentReaderConfig.builder()
|
||||
.selector("div") // Select the div
|
||||
.build());
|
||||
|
||||
List<Document> documents = reader.get();
|
||||
assertThat(documents).hasSize(1);
|
||||
assertThat(documents.get(0).getText()).isEqualTo("Lorem ipsum.");
|
||||
}
|
||||
|
||||
@Test
|
||||
void testNonExistingUrl() {
|
||||
JsoupDocumentReader reader = new JsoupDocumentReader("https://nonexistingurl.com",
|
||||
JsoupDocumentReaderConfig.builder().build());
|
||||
assertThatThrownBy(reader::get).isInstanceOf(RuntimeException.class);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,14 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Group By Element Test</title>
|
||||
</head>
|
||||
<body>
|
||||
<section>
|
||||
<p>Section 1 content</p>
|
||||
</section>
|
||||
<section>
|
||||
<p>Section 2 content</p>
|
||||
</section>
|
||||
</body>
|
||||
</html>
|
||||
15
document-readers/jsoup-reader/src/test/resources/test.html
Normal file
15
document-readers/jsoup-reader/src/test/resources/test.html
Normal file
@@ -0,0 +1,15 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Test HTML</title>
|
||||
<meta name="description" content="A test document for Spring AI">
|
||||
<meta name="keywords" content="test,html,spring ai">
|
||||
<meta name="custom1" content="value1">
|
||||
<meta name="custom2" content="value2">
|
||||
</head>
|
||||
<body>
|
||||
<h1>This is a test HTML document.</h1>
|
||||
<p>Some paragraph text.</p>
|
||||
<a href="https://spring.io/">Spring</a>
|
||||
</body>
|
||||
</html>
|
||||
Reference in New Issue
Block a user