"""Interfaces to be implemented by general evaluators."""
from __future__ import annotations
import logging
from abc import ABC, abstractmethod
from enum import Enum
from typing import Any, Optional, Sequence, Tuple, Union
from warnings import warn
from langchain_core.agents import AgentAction
from langchain_core.language_models import BaseLanguageModel
from langchain_core.runnables.config import run_in_executor
from langchain.chains.base import Chain
logger = logging.getLogger(__name__)
[docs]
class EvaluatorType(str, Enum):
    """Identifiers for the available kinds of evaluators.

    Each member is also a ``str`` instance carrying the identifier assigned
    to it below.
    """

    QA = "qa"
    """Question answering evaluator that grades answers to questions
    directly with an LLM."""

    COT_QA = "cot_qa"
    """Question answering evaluator that grades answers to questions
    using chain of thought 'reasoning'."""

    CONTEXT_QA = "context_qa"
    """Question answering evaluator which incorporates 'context' in the
    response."""

    PAIRWISE_STRING = "pairwise_string"
    """Pairwise string evaluator: predicts the preferred prediction from
    between two models."""

    SCORE_STRING = "score_string"
    """Scored string evaluator: gives a prediction a score between
    1 and 10."""

    LABELED_PAIRWISE_STRING = "labeled_pairwise_string"
    """Labeled pairwise string evaluator: predicts the preferred prediction
    from between two models, based on a ground truth reference label."""

    LABELED_SCORE_STRING = "labeled_score_string"
    """Labeled scored string evaluator: gives a prediction a score between
    1 and 10, based on a ground truth reference label."""

    AGENT_TRAJECTORY = "trajectory"
    """Agent trajectory evaluator: grades the agent's intermediate steps."""

    CRITERIA = "criteria"
    """Criteria evaluator: evaluates a model against a custom set of
    criteria, without any reference labels."""

    LABELED_CRITERIA = "labeled_criteria"
    """Labeled criteria evaluator: evaluates a model against a custom set
    of criteria, with a reference label."""

    STRING_DISTANCE = "string_distance"
    """Compares predictions to a reference answer using string edit
    distances."""

    EXACT_MATCH = "exact_match"
    """Compares predictions to a reference answer using exact matching."""

    REGEX_MATCH = "regex_match"
    """Compares predictions to a reference answer using regular
    expressions."""

    PAIRWISE_STRING_DISTANCE = "pairwise_string_distance"
    """Compares predictions based on string edit distances."""

    EMBEDDING_DISTANCE = "embedding_distance"
    """Compares a prediction to a reference label using embedding
    distance."""

    PAIRWISE_EMBEDDING_DISTANCE = "pairwise_embedding_distance"
    """Compares two predictions using embedding distance."""

    JSON_VALIDITY = "json_validity"
    """Checks whether a prediction is valid JSON."""

    JSON_EQUALITY = "json_equality"
    """Checks whether a prediction is equal to a reference JSON."""

    JSON_EDIT_DISTANCE = "json_edit_distance"
    """Computes the edit distance between two JSON strings after
    canonicalization."""

    JSON_SCHEMA_VALIDATION = "json_schema_validation"
    """Checks whether a prediction is valid JSON according to a JSON
    schema."""
[docs]
class LLMEvalChain(Chain):
    """Base class for evaluator chains that rely on an LLM."""

    @classmethod
    @abstractmethod
    def from_llm(cls, llm: BaseLanguageModel, **kwargs: Any) -> LLMEvalChain:
        """Build a new evaluator from a language model.

        Abstract: concrete evaluators must provide this constructor.

        Args:
            llm (BaseLanguageModel): The language model to build the evaluator from.
            kwargs: Additional keyword arguments; their meaning is defined by
                the implementing subclass.

        Returns:
            LLMEvalChain: The newly created evaluator.
        """
class _EvalArgsMixin:
"""Mixin for checking evaluation arguments."""
@property
def requires_reference(self) -> bool:
"""Whether this evaluator requires a reference label."""
return False
@property
def requires_input(self) -> bool:
"""Whether this evaluator requires an input string."""
return False
@property
def _skip_input_warning(self) -> str:
"""Warning to show when input is ignored."""
return f"Ignoring input in {self.__class__.__name__}, as it is not expected."
@property
def _skip_reference_warning(self) -> str:
"""Warning to show when reference is ignored."""
return (
f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
)
def _check_evaluation_args(
self,
reference: Optional[str] = None,
input: Optional[str] = None,
) -> None:
"""Check if the evaluation arguments are valid.
Args:
reference (Optional[str], optional): The reference label.
input (Optional[str], optional): The input string.
Raises:
ValueError: If the evaluator requires an input string but none is provided,
or if the evaluator requires a reference label but none is provided.
"""
if self.requires_input and input is None:
raise ValueError(f"{self.__class__.__name__} requires an input string.")
elif input is not None and not self.requires_input:
warn(self._skip_input_warning)
if self.requires_reference and reference is None:
raise ValueError(f"{self.__class__.__name__} requires a reference string.")
elif reference is not None and not self.requires_reference:
warn(self._skip_reference_warning)
[docs]
class StringEvaluator(_EvalArgsMixin, ABC):
    """Interface for evaluators that grade, tag, or otherwise assess a
    prediction relative to its input and/or a reference label."""

    @property
    def evaluation_name(self) -> str:
        """The name of the evaluation; by default the evaluator's class name."""
        class_name = self.__class__.__name__
        return class_name

    @property
    def requires_reference(self) -> bool:
        """Whether a reference label must be supplied to this evaluator."""
        return False

    @abstractmethod
    def _evaluate_strings(
        self,
        *,
        prediction: Union[str, Any],
        reference: Optional[Union[str, Any]] = None,
        input: Optional[Union[str, Any]] = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate Chain or LLM output (implemented by subclasses).

        Args:
            prediction (str): The LLM or chain prediction to evaluate.
            reference (Optional[str], optional): The reference label to
                evaluate against.
            input (Optional[str], optional): The input to consider during
                evaluation.
            kwargs: Additional keyword arguments, including callbacks,
                tags, etc.

        Returns:
            dict: The evaluation results containing the score or value.
                It is recommended that the dictionary contain the following keys:
                    - score: the score of the evaluation, if applicable.
                    - value: the string value of the evaluation, if applicable.
                    - reasoning: the reasoning for the evaluation, if applicable.
        """

    async def _aevaluate_strings(
        self,
        *,
        prediction: Union[str, Any],
        reference: Optional[Union[str, Any]] = None,
        input: Optional[Union[str, Any]] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate Chain or LLM output.

        This implementation hands the synchronous ``_evaluate_strings`` to
        ``run_in_executor`` and awaits its result.

        Args:
            prediction (str): The LLM or chain prediction to evaluate.
            reference (Optional[str], optional): The reference label to
                evaluate against.
            input (Optional[str], optional): The input to consider during
                evaluation.
            kwargs: Additional keyword arguments, including callbacks,
                tags, etc.

        Returns:
            dict: The evaluation results containing the score or value.
                It is recommended that the dictionary contain the following keys:
                    - score: the score of the evaluation, if applicable.
                    - value: the string value of the evaluation, if applicable.
                    - reasoning: the reasoning for the evaluation, if applicable.
        """
        # Gather everything the synchronous implementation receives, in the
        # same order it would be passed explicitly.
        sync_call_args = {
            "prediction": prediction,
            "reference": reference,
            "input": input,
            **kwargs,
        }
        return await run_in_executor(None, self._evaluate_strings, **sync_call_args)

    def evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Validate the arguments, then evaluate Chain or LLM output.

        Args:
            prediction (str): The LLM or chain prediction to evaluate.
            reference (Optional[str], optional): The reference label to
                evaluate against.
            input (Optional[str], optional): The input to consider during
                evaluation.
            kwargs: Additional keyword arguments, including callbacks,
                tags, etc.

        Returns:
            dict: The evaluation results containing the score or value.
        """
        self._check_evaluation_args(reference=reference, input=input)
        result = self._evaluate_strings(
            prediction=prediction, reference=reference, input=input, **kwargs
        )
        return result

    async def aevaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Validate the arguments, then asynchronously evaluate the output.

        Args:
            prediction (str): The LLM or chain prediction to evaluate.
            reference (Optional[str], optional): The reference label to
                evaluate against.
            input (Optional[str], optional): The input to consider during
                evaluation.
            kwargs: Additional keyword arguments, including callbacks,
                tags, etc.

        Returns:
            dict: The evaluation results containing the score or value.
        """
        self._check_evaluation_args(reference=reference, input=input)
        result = await self._aevaluate_strings(
            prediction=prediction, reference=reference, input=input, **kwargs
        )
        return result
[docs]
class PairwiseStringEvaluator(_EvalArgsMixin, ABC):
    """Interface for comparing the output of two models (or two outputs of
    the same model)."""

    @abstractmethod
    def _evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate the output string pairs (implemented by subclasses).

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            reference (Optional[str], optional): The expected output /
                reference string.
            input (Optional[str], optional): The input string.
            kwargs: Additional keyword arguments, such as callbacks and
                optional reference strings.

        Returns:
            dict: A dictionary containing the preference, scores, and/or
                other information.
        """

    async def _aevaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate the output string pairs.

        This implementation hands the synchronous ``_evaluate_string_pairs``
        to ``run_in_executor`` and awaits its result.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            reference (Optional[str], optional): The expected output /
                reference string.
            input (Optional[str], optional): The input string.
            kwargs: Additional keyword arguments, such as callbacks and
                optional reference strings.

        Returns:
            dict: A dictionary containing the preference, scores, and/or
                other information.
        """
        # Gather everything the synchronous implementation receives, in the
        # same order it would be passed explicitly.
        sync_call_args = {
            "prediction": prediction,
            "prediction_b": prediction_b,
            "reference": reference,
            "input": input,
            **kwargs,
        }
        return await run_in_executor(
            None, self._evaluate_string_pairs, **sync_call_args
        )

    def evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Validate the arguments, then evaluate the output string pairs.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            reference (Optional[str], optional): The expected output /
                reference string.
            input (Optional[str], optional): The input string.
            kwargs: Additional keyword arguments, such as callbacks and
                optional reference strings.

        Returns:
            dict: A dictionary containing the preference, scores, and/or
                other information.
        """
        self._check_evaluation_args(reference=reference, input=input)
        comparison = self._evaluate_string_pairs(
            prediction=prediction,
            prediction_b=prediction_b,
            reference=reference,
            input=input,
            **kwargs,
        )
        return comparison

    async def aevaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Validate the arguments, then asynchronously evaluate the pairs.

        Args:
            prediction (str): The output string from the first model.
            prediction_b (str): The output string from the second model.
            reference (Optional[str], optional): The expected output /
                reference string.
            input (Optional[str], optional): The input string.
            kwargs: Additional keyword arguments, such as callbacks and
                optional reference strings.

        Returns:
            dict: A dictionary containing the preference, scores, and/or
                other information.
        """
        self._check_evaluation_args(reference=reference, input=input)
        comparison = await self._aevaluate_string_pairs(
            prediction=prediction,
            prediction_b=prediction_b,
            reference=reference,
            input=input,
            **kwargs,
        )
        return comparison
[docs]
class AgentTrajectoryEvaluator(_EvalArgsMixin, ABC):
    """Interface for evaluators that assess agent trajectories."""

    @property
    def requires_input(self) -> bool:
        """Whether an input string must be supplied; always ``True`` here."""
        return True

    @abstractmethod
    def _evaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        input: str,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Evaluate a trajectory (implemented by subclasses).

        Args:
            prediction (str): The final predicted response.
            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
                The intermediate steps forming the agent trajectory.
            input (str): The input to the agent.
            reference (Optional[str]): The reference answer.
            kwargs: Additional keyword arguments.

        Returns:
            dict: The evaluation result.
        """

    async def _aevaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        input: str,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate a trajectory.

        This implementation hands the synchronous
        ``_evaluate_agent_trajectory`` to ``run_in_executor`` and awaits
        its result.

        Args:
            prediction (str): The final predicted response.
            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
                The intermediate steps forming the agent trajectory.
            input (str): The input to the agent.
            reference (Optional[str]): The reference answer.
            kwargs: Additional keyword arguments, forwarded unchanged.

        Returns:
            dict: The evaluation result.
        """
        # Gather everything the synchronous implementation receives, in the
        # same order it would be passed explicitly.
        sync_call_args = {
            "prediction": prediction,
            "agent_trajectory": agent_trajectory,
            "reference": reference,
            "input": input,
            **kwargs,
        }
        return await run_in_executor(
            None, self._evaluate_agent_trajectory, **sync_call_args
        )

    def evaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        input: str,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Validate the arguments, then evaluate a trajectory.

        Args:
            prediction (str): The final predicted response.
            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
                The intermediate steps forming the agent trajectory.
            input (str): The input to the agent.
            reference (Optional[str]): The reference answer.
            kwargs: Additional keyword arguments, forwarded unchanged.

        Returns:
            dict: The evaluation result.
        """
        self._check_evaluation_args(reference=reference, input=input)
        verdict = self._evaluate_agent_trajectory(
            prediction=prediction,
            input=input,
            agent_trajectory=agent_trajectory,
            reference=reference,
            **kwargs,
        )
        return verdict

    async def aevaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        input: str,
        reference: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Validate the arguments, then asynchronously evaluate a trajectory.

        Args:
            prediction (str): The final predicted response.
            agent_trajectory (Sequence[Tuple[AgentAction, str]]):
                The intermediate steps forming the agent trajectory.
            input (str): The input to the agent.
            reference (Optional[str]): The reference answer.
            kwargs: Additional keyword arguments, forwarded unchanged.

        Returns:
            dict: The evaluation result.
        """
        self._check_evaluation_args(reference=reference, input=input)
        verdict = await self._aevaluate_agent_trajectory(
            prediction=prediction,
            input=input,
            agent_trajectory=agent_trajectory,
            reference=reference,
            **kwargs,
        )
        return verdict