"""Configuration for run evaluators."""fromtypingimportAny,Callable,Dict,List,Optional,Sequence,Unionfromlangchain_core.embeddingsimportEmbeddingsfromlangchain_core.language_modelsimportBaseLanguageModelfromlangchain_core.promptsimportBasePromptTemplatefromlangsmithimportRunEvaluatorfromlangsmith.evaluation.evaluatorimportEvaluationResult,EvaluationResultsfromlangsmith.schemasimportExample,RunfrompydanticimportBaseModel,ConfigDict,Fieldfromlangchain.evaluation.criteria.eval_chainimportCRITERIA_TYPEfromlangchain.evaluation.embedding_distance.baseimport(EmbeddingDistanceasEmbeddingDistanceEnum,)fromlangchain.evaluation.schemaimportEvaluatorType,StringEvaluatorfromlangchain.evaluation.string_distance.baseimport(StringDistanceasStringDistanceEnum,)RUN_EVALUATOR_LIKE=Callable[[Run,Optional[Example]],Union[EvaluationResult,EvaluationResults,dict]]BATCH_EVALUATOR_LIKE=Callable[[Sequence[Run],Optional[Sequence[Example]]],Union[EvaluationResult,EvaluationResults,dict],]


class EvalConfig(BaseModel):
    """Configuration for a given run evaluator.

    Parameters
    ----------
    evaluator_type : EvaluatorType
        The type of evaluator to use.

    Methods
    -------
    get_kwargs()
        Get the keyword arguments for the evaluator configuration.
    """

    evaluator_type: EvaluatorType

    def get_kwargs(self) -> Dict[str, Any]:
        """Get the keyword arguments for the load_evaluator call.

        Returns
        -------
        Dict[str, Any]
            The keyword arguments for the load_evaluator call.
        """
        kwargs = {}
        for field, val in self:
            if field == "evaluator_type":
                continue
            elif val is None:
                continue
            kwargs[field] = val
        return kwargs
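

# Illustrative sketch (not part of the original module): get_kwargs() drops the
# required ``evaluator_type`` field and any field left as None, so only
# explicitly set evaluator options are forwarded to ``load_evaluator``.
def _example_eval_config_kwargs() -> Dict[str, Any]:
    config = EvalConfig(evaluator_type=EvaluatorType.QA)
    # Only evaluator_type is set, and it is excluded, so this returns {}.
    return config.get_kwargs()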


class SingleKeyEvalConfig(EvalConfig):
    """Configuration for a run evaluator that only requires a single key."""

    reference_key: Optional[str] = None
    """The key in the dataset run to use as the reference string.
    If not provided, we will attempt to infer automatically."""
    prediction_key: Optional[str] = None
    """The key from the traced run's outputs dictionary to use to
    represent the prediction. If not provided, it will be inferred
    automatically."""
    input_key: Optional[str] = None
    """The key from the traced run's inputs dictionary to use to represent the
    input. If not provided, it will be inferred automatically."""

    def get_kwargs(self) -> Dict[str, Any]:
        kwargs = super().get_kwargs()
        # Filter out the keys that are not needed for the evaluator.
        for key in ["reference_key", "prediction_key", "input_key"]:
            kwargs.pop(key, None)
        return kwargs
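

# Illustrative sketch (not part of the original module): the *_key fields on a
# SingleKeyEvalConfig steer how run inputs/outputs are mapped, but they are
# stripped from the kwargs passed to the underlying evaluator. "output" is just
# an example key name.
def _example_single_key_kwargs() -> Dict[str, Any]:
    config = SingleKeyEvalConfig(
        evaluator_type=EvaluatorType.QA, prediction_key="output"
    )
    # reference_key / prediction_key / input_key are popped, so this returns {}.
    return config.get_kwargs()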


CUSTOM_EVALUATOR_TYPE = Union[RUN_EVALUATOR_LIKE, RunEvaluator, StringEvaluator]
SINGLE_EVAL_CONFIG_TYPE = Union[EvaluatorType, str, EvalConfig]


class RunEvalConfig(BaseModel):
    """Configuration for a run evaluation.

    Parameters
    ----------
    evaluators : List[Union[EvaluatorType, EvalConfig, RunEvaluator, Callable]]
        Configurations for which evaluators to apply to the dataset run.
        Each can be the string of an
        :class:`EvaluatorType <langchain.evaluation.schema.EvaluatorType>`, such
        as ``EvaluatorType.QA``, the evaluator type string ("qa"), or a
        configuration for a given evaluator (e.g.,
        :class:`RunEvalConfig.QA <langchain.smith.evaluation.config.RunEvalConfig.QA>`).
    custom_evaluators : Optional[List[Union[RunEvaluator, StringEvaluator]]]
        Custom evaluators to apply to the dataset run.
    reference_key : Optional[str]
        The key in the dataset run to use as the reference string.
        If not provided, it will be inferred automatically.
    prediction_key : Optional[str]
        The key from the traced run's outputs dictionary to use to
        represent the prediction. If not provided, it will be inferred
        automatically.
    input_key : Optional[str]
        The key from the traced run's inputs dictionary to use to represent the
        input. If not provided, it will be inferred automatically.
    eval_llm : Optional[BaseLanguageModel]
        The language model to pass to any evaluators that use a language model.
    """  # noqa: E501

    evaluators: List[
        Union[
            SINGLE_EVAL_CONFIG_TYPE,
            CUSTOM_EVALUATOR_TYPE,
        ]
    ] = Field(default_factory=list)
    """Configurations for which evaluators to apply to the dataset run.
    Each can be the string of an
    :class:`EvaluatorType <langchain.evaluation.schema.EvaluatorType>`, such
    as ``EvaluatorType.QA``, the evaluator type string ("qa"), or a
    configuration for a given evaluator (e.g.,
    :class:`RunEvalConfig.QA <langchain.smith.evaluation.config.RunEvalConfig.QA>`)."""

    custom_evaluators: Optional[List[CUSTOM_EVALUATOR_TYPE]] = None
    """Custom evaluators to apply to the dataset run."""

    batch_evaluators: Optional[List[BATCH_EVALUATOR_LIKE]] = None
    """Evaluators that run on an aggregate/batch level.

    These generate one or more metrics that are assigned to the full test run.
    As a result, they are not associated with individual traces.
    """

    reference_key: Optional[str] = None
    """The key in the dataset run to use as the reference string.
    If not provided, we will attempt to infer automatically."""
    prediction_key: Optional[str] = None
    """The key from the traced run's outputs dictionary to use to
    represent the prediction. If not provided, it will be inferred
    automatically."""
    input_key: Optional[str] = None
    """The key from the traced run's inputs dictionary to use to represent the
    input. If not provided, it will be inferred automatically."""
    eval_llm: Optional[BaseLanguageModel] = None
    """The language model to pass to any evaluators that require one."""

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
    )

    class Criteria(SingleKeyEvalConfig):
        """Configuration for a reference-free criteria evaluator.

        Parameters
        ----------
        criteria : Optional[CRITERIA_TYPE]
            The criteria to evaluate.
        llm : Optional[BaseLanguageModel]
            The language model to use for the evaluation chain.
        """

        criteria: Optional[CRITERIA_TYPE] = None
        llm: Optional[BaseLanguageModel] = None
        evaluator_type: EvaluatorType = EvaluatorType.CRITERIA

        def __init__(
            self, criteria: Optional[CRITERIA_TYPE] = None, **kwargs: Any
        ) -> None:
            super().__init__(criteria=criteria, **kwargs)  # type: ignore[call-arg]

    class LabeledCriteria(SingleKeyEvalConfig):
        """Configuration for a labeled (with references) criteria evaluator.

        Parameters
        ----------
        criteria : Optional[CRITERIA_TYPE]
            The criteria to evaluate.
        llm : Optional[BaseLanguageModel]
            The language model to use for the evaluation chain.
        """

        criteria: Optional[CRITERIA_TYPE] = None
        llm: Optional[BaseLanguageModel] = None
        evaluator_type: EvaluatorType = EvaluatorType.LABELED_CRITERIA

        def __init__(
            self, criteria: Optional[CRITERIA_TYPE] = None, **kwargs: Any
        ) -> None:
            super().__init__(criteria=criteria, **kwargs)  # type: ignore[call-arg]

    class EmbeddingDistance(SingleKeyEvalConfig):
        """Configuration for an embedding distance evaluator.

        Parameters
        ----------
        embeddings : Optional[Embeddings]
            The embeddings to use for computing the distance.
        distance_metric : Optional[EmbeddingDistanceEnum]
            The distance metric to use for computing the distance.
        """

        evaluator_type: EvaluatorType = EvaluatorType.EMBEDDING_DISTANCE
        embeddings: Optional[Embeddings] = None
        distance_metric: Optional[EmbeddingDistanceEnum] = None

        model_config = ConfigDict(
            arbitrary_types_allowed=True,
        )

    class StringDistance(SingleKeyEvalConfig):
        """Configuration for a string distance evaluator.

        Parameters
        ----------
        distance : Optional[StringDistanceEnum]
            The string distance metric to use.
        """

        evaluator_type: EvaluatorType = EvaluatorType.STRING_DISTANCE
        distance: Optional[StringDistanceEnum] = None
        """The string distance metric to use.
            damerau_levenshtein: The Damerau-Levenshtein distance.
            levenshtein: The Levenshtein distance.
            jaro: The Jaro distance.
            jaro_winkler: The Jaro-Winkler distance.
        """
        normalize_score: bool = True
        """Whether to normalize the distance to between 0 and 1.
        Applies only to the Levenshtein and Damerau-Levenshtein distances."""

    class QA(SingleKeyEvalConfig):
        """Configuration for a QA evaluator.

        Parameters
        ----------
        prompt : Optional[BasePromptTemplate]
            The prompt template to use for the evaluation chain.
        llm : Optional[BaseLanguageModel]
            The language model to use for the evaluation chain.
        """

        evaluator_type: EvaluatorType = EvaluatorType.QA
        llm: Optional[BaseLanguageModel] = None
        prompt: Optional[BasePromptTemplate] = None

    class ContextQA(SingleKeyEvalConfig):
        """Configuration for a context-based QA evaluator.

        Parameters
        ----------
        prompt : Optional[BasePromptTemplate]
            The prompt template to use for the evaluation chain.
        llm : Optional[BaseLanguageModel]
            The language model to use for the evaluation chain.
        """

        evaluator_type: EvaluatorType = EvaluatorType.CONTEXT_QA
        llm: Optional[BaseLanguageModel] = None
        prompt: Optional[BasePromptTemplate] = None

    class CoTQA(SingleKeyEvalConfig):
        """Configuration for a chain-of-thought QA evaluator.

        Parameters
        ----------
        prompt : Optional[BasePromptTemplate]
            The prompt template to use for the evaluation chain.
        llm : Optional[BaseLanguageModel]
            The language model to use for the evaluation chain.
        """

        evaluator_type: EvaluatorType = EvaluatorType.COT_QA
        llm: Optional[BaseLanguageModel] = None
        prompt: Optional[BasePromptTemplate] = None

    class JsonValidity(SingleKeyEvalConfig):
        """Configuration for a JSON validity evaluator."""

        evaluator_type: EvaluatorType = EvaluatorType.JSON_VALIDITY

    class JsonEqualityEvaluator(EvalConfig):
        """Configuration for a JSON equality evaluator."""

        evaluator_type: EvaluatorType = EvaluatorType.JSON_EQUALITY

    class ExactMatch(SingleKeyEvalConfig):
        """Configuration for an exact match string evaluator.

        Parameters
        ----------
        ignore_case : bool
            Whether to ignore case when comparing strings.
        ignore_punctuation : bool
            Whether to ignore punctuation when comparing strings.
        ignore_numbers : bool
            Whether to ignore numbers when comparing strings.
        """

        evaluator_type: EvaluatorType = EvaluatorType.EXACT_MATCH
        ignore_case: bool = False
        ignore_punctuation: bool = False
        ignore_numbers: bool = False

    class RegexMatch(SingleKeyEvalConfig):
        """Configuration for a regex match string evaluator.

        Parameters
        ----------
        flags : int
            The flags to pass to the regex. Example: re.IGNORECASE.
        """

        evaluator_type: EvaluatorType = EvaluatorType.REGEX_MATCH
        flags: int = 0

    class ScoreString(SingleKeyEvalConfig):
        """Configuration for a score string evaluator.

        This is like the criteria evaluator, but it is configured by default to
        return a score on a 1-10 scale. It is recommended to normalize these
        scores by setting `normalize_by` to 10.

        Parameters
        ----------
        criteria : Optional[CRITERIA_TYPE]
            The criteria to evaluate.
        llm : Optional[BaseLanguageModel]
            The language model to use for the evaluation chain.
        normalize_by : Optional[float] = None
            If you want to normalize the score, the denominator to use.
            If not provided, the score will be between 1 and 10 (by default).
        prompt : Optional[BasePromptTemplate]
            The prompt template to use for the evaluation chain.
        """

        evaluator_type: EvaluatorType = EvaluatorType.SCORE_STRING
        criteria: Optional[CRITERIA_TYPE] = None
        llm: Optional[BaseLanguageModel] = None
        normalize_by: Optional[float] = None
        prompt: Optional[BasePromptTemplate] = None

        def __init__(
            self,
            criteria: Optional[CRITERIA_TYPE] = None,
            normalize_by: Optional[float] = None,
            **kwargs: Any,
        ) -> None:
            super().__init__(criteria=criteria, normalize_by=normalize_by, **kwargs)  # type: ignore[call-arg]
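

# Illustrative sketch (not part of the original module): assembling a
# RunEvalConfig from the nested evaluator configs above. The criteria names
# ("conciseness", "helpfulness"), the regex flags, and the reuse of
# _example_run_evaluator as a custom evaluator are assumptions chosen for
# demonstration only, not defaults of this module.
def _example_run_eval_config() -> RunEvalConfig:
    import re

    return RunEvalConfig(
        evaluators=[
            # A built-in evaluator referenced by its type enum.
            EvaluatorType.QA,
            # Reference-free criteria evaluation with a built-in criterion.
            RunEvalConfig.Criteria(criteria="conciseness"),
            # Scored (1-10) criteria evaluation, normalized to 0-1.
            RunEvalConfig.ScoreString(criteria="helpfulness", normalize_by=10),
            # Distance-based comparisons against the dataset reference.
            RunEvalConfig.EmbeddingDistance(
                distance_metric=EmbeddingDistanceEnum.COSINE
            ),
            RunEvalConfig.StringDistance(distance=StringDistanceEnum.LEVENSHTEIN),
            # String matching evaluators.
            RunEvalConfig.ExactMatch(ignore_case=True),
            RunEvalConfig.RegexMatch(flags=re.IGNORECASE),
        ],
        custom_evaluators=[_example_run_evaluator],
    )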