Support custom template in RelevancyEvaluator
The RelevancyEvaluator is key for validating RAG flows. This pull request improves it by making the PromptTemplate configurable, improving the format of the default one, introducing a Builder, and extending the documentation with more details on how to use it. I added some unit tests. The RelevancyEvaluator is used in lots of integration tests in the project to test the QuestionAnswerAdvisor and RetrievalAugmentationAdvisor, that also help assessing the evaluator itself. Signed-off-by: Thomas Vitale <ThomasVitale@users.noreply.github.com>
This commit is contained in:
committed by
Mark Pollack
parent
66d155c153
commit
144ae1ec92
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright 2023-2024 the original author or authors.
|
||||
* Copyright 2023-2025 the original author or authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
@@ -16,51 +16,70 @@
|
||||
|
||||
package org.springframework.ai.chat.evaluation;
|
||||
|
||||
import java.util.Collections;
|
||||
|
||||
import org.springframework.ai.chat.client.ChatClient;
|
||||
import org.springframework.ai.chat.prompt.PromptTemplate;
|
||||
import org.springframework.ai.evaluation.EvaluationRequest;
|
||||
import org.springframework.ai.evaluation.EvaluationResponse;
|
||||
import org.springframework.ai.evaluation.Evaluator;
|
||||
import org.springframework.lang.Nullable;
|
||||
import org.springframework.util.Assert;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Evaluates the relevancy of a response to a query based on the context provided.
|
||||
*/
|
||||
public class RelevancyEvaluator implements Evaluator {
|
||||
|
||||
private static final String DEFAULT_EVALUATION_PROMPT_TEXT = """
|
||||
private static final PromptTemplate DEFAULT_PROMPT_TEMPLATE = new PromptTemplate("""
|
||||
Your task is to evaluate if the response for the query
|
||||
is in line with the context information provided.\\n
|
||||
You have two options to answer. Either YES/ NO.\\n
|
||||
Answer - YES, if the response for the query
|
||||
is in line with context information otherwise NO.\\n
|
||||
Query: \\n {query}\\n
|
||||
Response: \\n {response}\\n
|
||||
Context: \\n {context}\\n
|
||||
Answer: "
|
||||
""";
|
||||
is in line with the context information provided.
|
||||
|
||||
You have two options to answer. Either YES or NO.
|
||||
|
||||
Answer YES, if the response for the query
|
||||
is in line with context information otherwise NO.
|
||||
|
||||
Query:
|
||||
{query}
|
||||
|
||||
Response:
|
||||
{response}
|
||||
|
||||
Context:
|
||||
{context}
|
||||
|
||||
Answer:
|
||||
""");
|
||||
|
||||
private final ChatClient.Builder chatClientBuilder;
|
||||
|
||||
private final PromptTemplate promptTemplate;
|
||||
|
||||
/**
 * Creates an evaluator that uses the default prompt template.
 * @param chatClientBuilder the builder used to create the {@link ChatClient} that
 * runs the evaluation prompt; must not be {@code null}
 */
public RelevancyEvaluator(ChatClient.Builder chatClientBuilder) {
|
||||
// Passing null makes the delegate constructor fall back to DEFAULT_PROMPT_TEMPLATE.
this(chatClientBuilder, null);
|
||||
}
|
||||
|
||||
/**
 * Creates an evaluator with an optional custom prompt template.
 * @param chatClientBuilder the builder used to create the {@link ChatClient} that
 * runs the evaluation prompt; rejected by {@code Assert.notNull} when {@code null}
 * @param promptTemplate the template to render the evaluation prompt with, or
 * {@code null} to use {@code DEFAULT_PROMPT_TEMPLATE}
 */
private RelevancyEvaluator(ChatClient.Builder chatClientBuilder, @Nullable PromptTemplate promptTemplate) {
|
||||
Assert.notNull(chatClientBuilder, "chatClientBuilder cannot be null");
|
||||
this.chatClientBuilder = chatClientBuilder;
|
||||
// A missing template is not an error: fall back to the built-in default.
this.promptTemplate = promptTemplate != null ? promptTemplate : DEFAULT_PROMPT_TEMPLATE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public EvaluationResponse evaluate(EvaluationRequest evaluationRequest) {
|
||||
|
||||
var response = evaluationRequest.getResponseContent();
|
||||
var context = doGetSupportingData(evaluationRequest);
|
||||
|
||||
String evaluationResponse = this.chatClientBuilder.build()
|
||||
.prompt()
|
||||
.user(userSpec -> userSpec.text(DEFAULT_EVALUATION_PROMPT_TEXT)
|
||||
.param("query", evaluationRequest.getUserText())
|
||||
.param("response", response)
|
||||
.param("context", context))
|
||||
.call()
|
||||
.content();
|
||||
var userMessage = this.promptTemplate
|
||||
.render(Map.of("query", evaluationRequest.getUserText(), "response", response, "context", context));
|
||||
|
||||
String evaluationResponse = this.chatClientBuilder.build().prompt().user(userMessage).call().content();
|
||||
|
||||
boolean passing = false;
|
||||
float score = 0;
|
||||
if (evaluationResponse.toLowerCase().contains("yes")) {
|
||||
if (evaluationResponse != null && evaluationResponse.toLowerCase().contains("yes")) {
|
||||
passing = true;
|
||||
score = 1;
|
||||
}
|
||||
@@ -68,4 +87,33 @@ public class RelevancyEvaluator implements Evaluator {
|
||||
return new EvaluationResponse(passing, score, "", Collections.emptyMap());
|
||||
}
|
||||
|
||||
/**
 * Creates a new {@link Builder} for configuring a {@link RelevancyEvaluator}.
 * @return a fresh builder instance
 */
public static Builder builder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
/**
 * Builder for {@link RelevancyEvaluator}. Instances are obtained through
 * {@link RelevancyEvaluator#builder()} because the constructor is private.
 */
public static class Builder {
|
||||
|
||||
// Validated for null only when build() invokes the RelevancyEvaluator constructor.
private ChatClient.Builder chatClientBuilder;
|
||||
|
||||
// Optional; when left null the evaluator uses its default prompt template.
private PromptTemplate promptTemplate;
|
||||
|
||||
private Builder() {
|
||||
}
|
||||
|
||||
/**
 * Sets the builder used to create the {@link ChatClient} that runs the evaluation.
 * @param chatClientBuilder the chat client builder to use
 * @return this builder
 */
public Builder chatClientBuilder(ChatClient.Builder chatClientBuilder) {
|
||||
this.chatClientBuilder = chatClientBuilder;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
 * Sets the prompt template used to render the evaluation prompt.
 * @param promptTemplate the custom template, or {@code null} to keep the default
 * @return this builder
 */
public Builder promptTemplate(PromptTemplate promptTemplate) {
|
||||
this.promptTemplate = promptTemplate;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
 * Creates the {@link RelevancyEvaluator} from the configured values.
 * @return a new evaluator
 * @throws IllegalArgumentException if no chat client builder was set
 */
public RelevancyEvaluator build() {
|
||||
return new RelevancyEvaluator(this.chatClientBuilder, this.promptTemplate);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
* Copyright 2023-2025 the original author or authors.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* https://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.springframework.ai.chat.evaluation;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.ai.chat.client.ChatClient;
|
||||
import org.springframework.ai.chat.model.ChatModel;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
import static org.assertj.core.api.Assertions.assertThatThrownBy;
|
||||
import static org.mockito.Mockito.mock;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link RelevancyEvaluator}.
|
||||
*
|
||||
* @author Thomas Vitale
|
||||
*/
|
||||
class RelevancyEvaluatorTests {
|
||||
|
||||
// Both construction paths (public constructor and builder) must reject a null
// ChatClient.Builder with the same IllegalArgumentException message.
@Test
|
||||
void whenChatClientBuilderIsNullThenThrow() {
|
||||
assertThatThrownBy(() -> new RelevancyEvaluator(null)).isInstanceOf(IllegalArgumentException.class)
|
||||
.hasMessageContaining("chatClientBuilder cannot be null");
|
||||
|
||||
assertThatThrownBy(() -> RelevancyEvaluator.builder().chatClientBuilder(null).build())
|
||||
.isInstanceOf(IllegalArgumentException.class)
|
||||
.hasMessageContaining("chatClientBuilder cannot be null");
|
||||
}
|
||||
|
||||
// A null prompt template is accepted on both construction paths. This only checks
// that construction succeeds; it does not inspect which template ends up in use.
@Test
|
||||
void whenPromptTemplateIsNullThenUseDefault() {
|
||||
RelevancyEvaluator evaluator = new RelevancyEvaluator(ChatClient.builder(mock(ChatModel.class)));
|
||||
assertThat(evaluator).isNotNull();
|
||||
|
||||
evaluator = RelevancyEvaluator.builder()
|
||||
.chatClientBuilder(ChatClient.builder(mock(ChatModel.class)))
|
||||
.promptTemplate(null)
|
||||
.build();
|
||||
assertThat(evaluator).isNotNull();
|
||||
}
|
||||
|
||||
}
|
||||
@@ -6,8 +6,6 @@ One method to evaluate the response is to use the AI model itself for evaluation
|
||||
|
||||
The Spring AI interface for evaluating responses is `Evaluator`, defined as:
|
||||
|
||||
|
||||
|
||||
[source,java]
|
||||
----
|
||||
@FunctionalInterface
|
||||
@@ -42,58 +40,88 @@ public class EvaluationRequest {
|
||||
* `dataList`: Contextual data, such as from Retrieval Augmented Generation, appended to the raw input.
|
||||
* `responseContent`: The AI model's response content as a `String`
|
||||
|
||||
== RelevancyEvaluator
|
||||
== Relevancy Evaluator
|
||||
|
||||
One implementation is the `RelevancyEvaluator`, which uses the AI model for evaluation. More implementations will be available in future releases.
|
||||
The `RelevancyEvaluator` is an implementation of the `Evaluator` interface, designed to assess the relevance of AI-generated responses against provided context. This evaluator helps assess the quality of a RAG flow by determining if the AI model's response is relevant to the user's input with respect to the retrieved context.
|
||||
|
||||
The `RelevancyEvaluator` uses the input (`userText`) and the AI model's output (`chatResponse`) to ask the question:
|
||||
The evaluation is based on the user input, the AI model's response, and the context information. It uses a prompt template to ask the AI model if the response is relevant to the user input and context.
|
||||
|
||||
[source, text]
|
||||
This is the default prompt template used by the `RelevancyEvaluator`:
|
||||
|
||||
[source,text]
|
||||
----
|
||||
Your task is to evaluate if the response for the query
|
||||
is in line with the context information provided.\n
|
||||
You have two options to answer. Either YES/ NO.\n
|
||||
Answer - YES, if the response for the query
|
||||
is in line with context information otherwise NO.\n
|
||||
Query: \n {query}\n
|
||||
Response: \n {response}\n
|
||||
Context: \n {context}\n
|
||||
Answer: "
|
||||
is in line with the context information provided.
|
||||
|
||||
You have two options to answer. Either YES or NO.
|
||||
|
||||
Answer YES, if the response for the query
|
||||
is in line with context information otherwise NO.
|
||||
|
||||
Query:
|
||||
{query}
|
||||
|
||||
Response:
|
||||
{response}
|
||||
|
||||
Context:
|
||||
{context}
|
||||
|
||||
Answer:
|
||||
----
|
||||
|
||||
Here is an example of a JUnit test that performs a RAG query over a PDF document loaded into a Vector Store and then evaluates if the response is relevant to the user text.
|
||||
NOTE: You can customize the prompt template by providing your own `PromptTemplate` object via the `.promptTemplate()` builder method. See xref:_custom_template[Custom Template] for details.
|
||||
|
||||
== Usage in Integration Tests
|
||||
|
||||
Here is an example of using the `RelevancyEvaluator` in an integration test to validate the result of a RAG flow built with the `RetrievalAugmentationAdvisor`:
|
||||
|
||||
[source,java]
|
||||
----
|
||||
@Test
|
||||
void testEvaluation() {
|
||||
void evaluateRelevancy() {
|
||||
String question = "Where does the adventure of Anacletus and Birba take place?";
|
||||
|
||||
dataController.delete();
|
||||
dataController.load();
|
||||
RetrievalAugmentationAdvisor ragAdvisor = RetrievalAugmentationAdvisor.builder()
|
||||
.documentRetriever(VectorStoreDocumentRetriever.builder()
|
||||
.vectorStore(pgVectorStore)
|
||||
.build())
|
||||
.build();
|
||||
|
||||
String userText = "What is the purpose of Carina?";
|
||||
ChatResponse chatResponse = ChatClient.builder(chatModel).build()
|
||||
.prompt(question)
|
||||
.advisors(ragAdvisor)
|
||||
.call()
|
||||
.chatResponse();
|
||||
|
||||
ChatResponse response = ChatClient.builder(chatModel)
|
||||
.build().prompt()
|
||||
.advisors(new QuestionAnswerAdvisor(vectorStore))
|
||||
.user(userText)
|
||||
.call()
|
||||
.chatResponse();
|
||||
String responseContent = response.getResult().getOutput().getContent();
|
||||
EvaluationRequest evaluationRequest = new EvaluationRequest(
|
||||
// The original user question
|
||||
question,
|
||||
// The retrieved context from the RAG flow
|
||||
chatResponse.getMetadata().get(RetrievalAugmentationAdvisor.DOCUMENT_CONTEXT),
|
||||
// The AI model's response
|
||||
chatResponse.getResult().getOutput().getText()
|
||||
);
|
||||
|
||||
var relevancyEvaluator = new RelevancyEvaluator(ChatClient.builder(chatModel));
|
||||
RelevancyEvaluator evaluator = new RelevancyEvaluator(ChatClient.builder(chatModel));
|
||||
|
||||
EvaluationRequest evaluationRequest = new EvaluationRequest(userText,
|
||||
(List<Content>) response.getMetadata().get(QuestionAnswerAdvisor.RETRIEVED_DOCUMENTS), responseContent);
|
||||
|
||||
EvaluationResponse evaluationResponse = relevancyEvaluator.evaluate(evaluationRequest);
|
||||
|
||||
assertTrue(evaluationResponse.isPass(), "Response is not relevant to the question");
|
||||
EvaluationResponse evaluationResponse = evaluator.evaluate(evaluationRequest);
|
||||
|
||||
assertThat(evaluationResponse.isPass()).isTrue();
|
||||
}
|
||||
----
|
||||
|
||||
The code above is from the example application located https://github.com/rd-1-2022/ai-azure-rag.git[here].
|
||||
You can find several integration tests in the Spring AI project that use the `RelevancyEvaluator` to test the functionality of the `QuestionAnswerAdvisor` (see https://github.com/spring-projects/spring-ai/blob/main/spring-ai-integration-tests/src/test/java/org/springframework/ai/integration/tests/client/advisor/QuestionAnswerAdvisorIT.java[tests]) and `RetrievalAugmentationAdvisor` (see https://github.com/spring-projects/spring-ai/blob/main/spring-ai-integration-tests/src/test/java/org/springframework/ai/integration/tests/client/advisor/RetrievalAugmentationAdvisorIT.java[tests]).
|
||||
|
||||
=== Custom Template
|
||||
|
||||
The `RelevancyEvaluator` uses a default template to prompt the AI model for evaluation. You can customize this behavior by providing your own `PromptTemplate` object via the `.promptTemplate()` builder method.
|
||||
|
||||
The custom `PromptTemplate` can use any `TemplateRenderer` implementation (by default, it uses `StTemplateRenderer`, which is based on the https://www.stringtemplate.org/[StringTemplate] engine). The important requirement is that the template must contain the following placeholders:
|
||||
|
||||
* a `query` placeholder to receive the user question.
|
||||
* a `response` placeholder to receive the AI model's response.
|
||||
* a `context` placeholder to receive the context information.
|
||||
|
||||
== FactCheckingEvaluator
|
||||
|
||||
|
||||
Reference in New Issue
Block a user