Source code for langchain.smith.evaluation.string_run_evaluator
"""Run evaluator wrapper for string evaluators."""from__future__importannotationsfromabcimportabstractmethodfromtypingimportAny,Dict,List,Optionalfromlangchain_core.callbacks.managerimport(AsyncCallbackManagerForChainRun,CallbackManagerForChainRun,)fromlangchain_core.load.dumpimportdumpdfromlangchain_core.load.loadimportloadfromlangchain_core.load.serializableimportSerializablefromlangchain_core.messagesimportBaseMessage,get_buffer_string,messages_from_dictfromlangsmithimportEvaluationResult,RunEvaluatorfromlangsmith.schemasimportDataType,Example,Runfromlangchain.chains.baseimportChainfromlangchain.evaluation.schemaimportStringEvaluatorfromlangchain.schemaimportRUN_KEYdef_get_messages_from_run_dict(messages:List[dict])->List[BaseMessage]:ifnotmessages:return[]first_message=messages[0]if"lc"infirst_message:return[load(dumpd(message))formessageinmessages]else:returnmessages_from_dict(messages)
class StringRunMapper(Serializable):
    """Extract items to evaluate from the run object."""

    @property
    def output_keys(self) -> List[str]:
        """The keys to extract from the run."""
        return ["prediction", "input"]
    @abstractmethod
    def map(self, run: Run) -> Dict[str, str]:
        """Maps the Run to a dictionary."""
    def __call__(self, run: Run) -> Dict[str, str]:
        """Maps the Run to a dictionary."""
        if not run.outputs:
            raise ValueError(f"Run {run.id} has no outputs to evaluate.")
        return self.map(run)

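# --- Illustrative sketch (not part of the module source) ---
# A custom mapper only needs to implement ``map``; the base ``__call__`` already
# rejects runs that have no outputs. The "question"/"answer" key names below are
# hypothetical.


class _QuestionAnswerRunMapper(StringRunMapper):
    """Hypothetical mapper for runs keyed by 'question' and 'answer'."""

    def map(self, run: Run) -> Dict[str, str]:
        return {
            "input": run.inputs["question"],
            "prediction": run.outputs["answer"],
        }
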
class LLMStringRunMapper(StringRunMapper):
    """Extract items to evaluate from the run object."""
    def serialize_chat_messages(self, messages: List[Dict]) -> str:
        """Extract the input messages from the run."""
        if isinstance(messages, list) and messages:
            if isinstance(messages[0], dict):
                chat_messages = _get_messages_from_run_dict(messages)
            elif isinstance(messages[0], list):
                # Runs from Tracer have messages as a list of lists of dicts
                chat_messages = _get_messages_from_run_dict(messages[0])
            else:
                raise ValueError(f"Could not extract messages to evaluate {messages}")
            return get_buffer_string(chat_messages)
        raise ValueError(f"Could not extract messages to evaluate {messages}")
    def serialize_inputs(self, inputs: Dict) -> str:
        if "prompts" in inputs:  # Should we even accept this?
            input_ = "\n\n".join(inputs["prompts"])
        elif "prompt" in inputs:
            input_ = inputs["prompt"]
        elif "messages" in inputs:
            input_ = self.serialize_chat_messages(inputs["messages"])
        else:
            raise ValueError("LLM Run must have either messages or prompts as inputs.")
        return input_
    def serialize_outputs(self, outputs: Dict) -> str:
        if not outputs.get("generations"):
            raise ValueError("Cannot evaluate LLM Run without generations.")
        generations: List[Dict] = outputs["generations"]
        if not generations:
            raise ValueError("Cannot evaluate LLM run with empty generations.")
        first_generation: Dict = generations[0]
        if isinstance(first_generation, list):
            # Runs from Tracer have generations as a list of lists of dicts
            # Whereas Runs from the API have a list of dicts
            first_generation = first_generation[0]
        if "message" in first_generation:
            output_ = self.serialize_chat_messages([first_generation["message"]])
        else:
            output_ = first_generation["text"]
        return output_
    def map(self, run: Run) -> Dict[str, str]:
        """Maps the Run to a dictionary."""
        if run.run_type != "llm":
            raise ValueError("LLM RunMapper only supports LLM runs.")
        elif not run.outputs:
            if run.error:
                raise ValueError(
                    f"Cannot evaluate errored LLM run {run.id}: {run.error}"
                )
            else:
                raise ValueError(
                    f"Run {run.id} has no outputs. Cannot evaluate this run."
                )
        else:
            try:
                inputs = self.serialize_inputs(run.inputs)
            except Exception as e:
                raise ValueError(
                    f"Could not parse LM input from run inputs {run.inputs}"
                ) from e
            try:
                output_ = self.serialize_outputs(run.outputs)
            except Exception as e:
                raise ValueError(
                    f"Could not parse LM prediction from run outputs {run.outputs}"
                ) from e
            return {"input": inputs, "prediction": output_}

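# --- Illustrative sketch (not part of the module source) ---
# Shape of the data LLMStringRunMapper expects: "prompts"/"prompt" or "messages"
# on the input side and a "generations" list on the output side. The literal
# values below are made up for illustration.
#
#   run.inputs  == {"prompts": ["What is 2 + 2?"]}
#   run.outputs == {"generations": [{"text": "4"}]}
#   LLMStringRunMapper().map(run) == {"input": "What is 2 + 2?", "prediction": "4"}
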
class ChainStringRunMapper(StringRunMapper):
    """Extract items to evaluate from the run object from a chain."""

    input_key: Optional[str] = None
    """The key from the model Run's inputs to use as the eval input.
    If not provided, will use the only input key or raise an
    error if there are multiple."""
    prediction_key: Optional[str] = None
    """The key from the model Run's outputs to use as the eval prediction.
    If not provided, will use the only output key or raise an error
    if there are multiple."""

    def _get_key(self, source: Dict, key: Optional[str], which: str) -> str:
        if key is not None:
            return source[key]
        elif len(source) == 1:
            return next(iter(source.values()))
        else:
            raise ValueError(
                f"Could not map run {which} with multiple keys: "
                f"{source}\nPlease manually specify a {which}_key"
            )
    def map(self, run: Run) -> Dict[str, str]:
        """Maps the Run to a dictionary."""
        if not run.outputs:
            raise ValueError(
                f"Run with ID {run.id} lacks outputs required for evaluation."
                " Ensure the Run has valid outputs."
            )
        if self.input_key is not None and self.input_key not in run.inputs:
            raise ValueError(
                f"Run with ID {run.id} is missing the expected input key"
                f" '{self.input_key}'.\nAvailable input keys in this Run"
                f" are: {run.inputs.keys()}.\nAdjust the evaluator's"
                f" input_key or ensure your input data includes key"
                f" '{self.input_key}'."
            )
        elif (
            self.prediction_key is not None and self.prediction_key not in run.outputs
        ):
            available_keys = ", ".join(run.outputs.keys())
            raise ValueError(
                f"Run with ID {run.id} doesn't have the expected prediction key"
                f" '{self.prediction_key}'. Available prediction keys in this Run are:"
                f" {available_keys}. Adjust the evaluator's prediction_key or"
                " ensure the Run's outputs include the expected key."
            )
        else:
            input_ = self._get_key(run.inputs, self.input_key, "input")
            prediction = self._get_key(run.outputs, self.prediction_key, "prediction")
            return {
                "input": input_,
                "prediction": prediction,
            }

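# --- Illustrative sketch (not part of the module source) ---
# When a chain run has several input or output keys, the keys must be named
# explicitly or ``_get_key`` raises. The key names below are hypothetical.
#
#   mapper = ChainStringRunMapper(input_key="query", prediction_key="result")
#   run.inputs  == {"query": "Who wrote Hamlet?", "chat_history": []}
#   run.outputs == {"result": "Shakespeare", "source_documents": []}
#   mapper.map(run) == {"input": "Who wrote Hamlet?", "prediction": "Shakespeare"}
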
class ToolStringRunMapper(StringRunMapper):
    """Map an input to the tool."""
    def map(self, run: Run) -> Dict[str, str]:
        if not run.outputs:
            raise ValueError(f"Run {run.id} has no outputs to evaluate.")
        return {"input": run.inputs["input"], "prediction": run.outputs["output"]}

class StringExampleMapper(Serializable):
    """Map an example, or row in the dataset, to the inputs of an evaluation."""

    reference_key: Optional[str] = None

    @property
    def output_keys(self) -> List[str]:
        """The keys to extract from the run."""
        return ["reference"]
    def serialize_chat_messages(self, messages: List[Dict]) -> str:
        """Extract the input messages from the run."""
        chat_messages = _get_messages_from_run_dict(messages)
        return get_buffer_string(chat_messages)
    def map(self, example: Example) -> Dict[str, str]:
        """Maps the Example, or dataset row to a dictionary."""
        if not example.outputs:
            raise ValueError(
                f"Example {example.id} has no outputs to use as a reference."
            )
        if self.reference_key is None:
            if len(example.outputs) > 1:
                raise ValueError(
                    f"Example {example.id} has multiple outputs, so you must"
                    " specify a reference_key."
                )
            else:
                output = list(example.outputs.values())[0]
        elif self.reference_key not in example.outputs:
            raise ValueError(
                f"Example {example.id} does not have reference key"
                f" {self.reference_key}."
            )
        else:
            output = example.outputs[self.reference_key]
        return {
            "reference": self.serialize_chat_messages([output])
            if isinstance(output, dict) and output.get("type") and output.get("data")
            else output
        }
    def __call__(self, example: Example) -> Dict[str, str]:
        """Maps the Run and Example to a dictionary."""
        if not example.outputs:
            raise ValueError(
                f"Example {example.id} has no outputs to use as a reference label."
            )
        return self.map(example)

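# --- Illustrative sketch (not part of the module source) ---
# StringExampleMapper pulls the reference label out of a dataset Example. With a
# single output key, reference_key can be omitted; otherwise it must be given.
# The "expected" key below is hypothetical.
#
#   mapper = StringExampleMapper(reference_key="expected")
#   example.outputs == {"expected": "Paris", "notes": "capital cities"}
#   mapper.map(example) == {"reference": "Paris"}
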
class StringRunEvaluatorChain(Chain, RunEvaluator):  # type: ignore[override, override]
    """Evaluate Run and optional examples."""

    run_mapper: StringRunMapper
    """Maps the Run to a dictionary with 'input' and 'prediction' strings."""
    example_mapper: Optional[StringExampleMapper] = None
    """Maps the Example (dataset row) to a dictionary with a 'reference' string."""
    name: str
    """The name of the evaluation metric."""
    string_evaluator: StringEvaluator
    """The evaluation chain."""

    @property
    def input_keys(self) -> List[str]:
        return ["run", "example"]

    @property
    def output_keys(self) -> List[str]:
        return ["feedback"]

    def _prepare_input(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        run: Run = inputs["run"]
        example: Optional[Example] = inputs.get("example")
        evaluate_strings_inputs = self.run_mapper(run)
        if not self.string_evaluator.requires_input:
            # Hide warning about unused input
            evaluate_strings_inputs.pop("input", None)
        if (
            example
            and self.example_mapper
            and self.string_evaluator.requires_reference
        ):
            evaluate_strings_inputs.update(self.example_mapper(example))
        elif self.string_evaluator.requires_reference:
            raise ValueError(
                f"Evaluator {self.name} requires a reference"
                " example from the dataset,"
                f" but none was provided for run {run.id}."
            )
        return evaluate_strings_inputs

    def _prepare_output(self, output: Dict[str, Any]) -> Dict[str, Any]:
        evaluation_result = EvaluationResult(
            key=self.name, comment=output.get("reasoning"), **output
        )
        if RUN_KEY in output:
            # TODO: Not currently surfaced. Update
            evaluation_result.evaluator_info[RUN_KEY] = output[RUN_KEY]
        return {"feedback": evaluation_result}

    def _call(
        self,
        inputs: Dict[str, str],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Call the evaluation chain."""
        evaluate_strings_inputs = self._prepare_input(inputs)
        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
        callbacks = _run_manager.get_child()
        chain_output = self.string_evaluator.evaluate_strings(
            **evaluate_strings_inputs,
            callbacks=callbacks,
            include_run_info=True,
        )
        return self._prepare_output(chain_output)

    async def _acall(
        self,
        inputs: Dict[str, str],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Call the evaluation chain."""
        evaluate_strings_inputs = self._prepare_input(inputs)
        _run_manager = (
            run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
        )
        callbacks = _run_manager.get_child()
        chain_output = await self.string_evaluator.aevaluate_strings(
            **evaluate_strings_inputs,
            callbacks=callbacks,
            include_run_info=True,
        )
        return self._prepare_output(chain_output)

    def _prepare_evaluator_output(self, output: Dict[str, Any]) -> EvaluationResult:
        feedback: EvaluationResult = output["feedback"]
        if RUN_KEY not in feedback.evaluator_info:
            feedback.evaluator_info[RUN_KEY] = output[RUN_KEY]
        return feedback
    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        """Evaluate an example."""
        try:
            result = self({"run": run, "example": example}, include_run_info=True)
            return self._prepare_evaluator_output(result)
        except Exception as e:
            return EvaluationResult(
                key=self.string_evaluator.evaluation_name,
                comment=f"Error evaluating run {run.id}: {e}",
                # TODO: Add run ID once we can declare it via callbacks
            )
    async def aevaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        """Evaluate an example."""
        try:
            result = await self.acall(
                {"run": run, "example": example}, include_run_info=True
            )
            return self._prepare_evaluator_output(result)
        except Exception as e:
            return EvaluationResult(
                key=self.string_evaluator.evaluation_name,
                comment=f"Error evaluating run {run.id}: {e}",
            )
    @classmethod
    def from_run_and_data_type(
        cls,
        evaluator: StringEvaluator,
        run_type: str,
        data_type: DataType,
        input_key: Optional[str] = None,
        prediction_key: Optional[str] = None,
        reference_key: Optional[str] = None,
        tags: Optional[List[str]] = None,
    ) -> StringRunEvaluatorChain:
        """
        Create a StringRunEvaluatorChain from an evaluator and the run and dataset types.

        This method provides an easy way to instantiate a StringRunEvaluatorChain, by
        taking an evaluator and information about the type of run and the data.
        The method supports LLM and chain runs.

        Args:
            evaluator (StringEvaluator): The string evaluator to use.
            run_type (str): The type of run being evaluated.
                Supported types are LLM and Chain.
            data_type (DataType): The type of dataset used in the run.
            input_key (str, optional): The key used to map the input from the run.
            prediction_key (str, optional): The key used to map the prediction from the run.
            reference_key (str, optional): The key used to map the reference from the dataset.
            tags (List[str], optional): List of tags to attach to the evaluation chain.

        Returns:
            StringRunEvaluatorChain: The instantiated evaluation chain.

        Raises:
            ValueError: If the run type is not supported, or if the evaluator requires a
                reference from the dataset but the reference key is not provided.

        """  # noqa: E501

        # Configure how run inputs/predictions are passed to the evaluator
        if run_type == "llm":
            run_mapper: StringRunMapper = LLMStringRunMapper()
        elif run_type == "chain":
            run_mapper = ChainStringRunMapper(
                input_key=input_key, prediction_key=prediction_key
            )
        else:
            raise ValueError(
                f"Unsupported run type {run_type}. Expected one of 'llm' or 'chain'."
            )

        # Configure how example rows are fed as a reference string to the evaluator
        if (
            reference_key is not None
            or data_type in (DataType.llm, DataType.chat)
            or evaluator.requires_reference
        ):
            example_mapper = StringExampleMapper(reference_key=reference_key)
        elif evaluator.requires_reference:
            raise ValueError(
                f"Evaluator {evaluator.evaluation_name} requires a reference"
                " example from the dataset. Please specify the reference key from"
                " amongst the dataset outputs keys."
            )
        else:
            example_mapper = None
        return cls(
            name=evaluator.evaluation_name,
            run_mapper=run_mapper,
            example_mapper=example_mapper,
            string_evaluator=evaluator,
            tags=tags,
        )

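# --- Illustrative usage sketch (not part of the module source) ---
# A minimal sketch of wiring a string evaluator to LangSmith runs via
# ``from_run_and_data_type``. It assumes OPENAI_API_KEY is set (the default
# evaluation LLM is OpenAI-backed) and that a ``run``/``example`` pair has
# already been fetched from LangSmith; the "output"/"answer" keys are
# hypothetical.

from langchain.evaluation import EvaluatorType, load_evaluator

qa_evaluator = load_evaluator(EvaluatorType.QA)  # any StringEvaluator works here
run_evaluator = StringRunEvaluatorChain.from_run_and_data_type(
    evaluator=qa_evaluator,
    run_type="chain",
    data_type=DataType.kv,
    prediction_key="output",  # hypothetical chain output key
    reference_key="answer",  # hypothetical dataset output key
)
# feedback = run_evaluator.evaluate_run(run, example=example)
# feedback.score, feedback.comment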