Source code for langchain.evaluation.agents.trajectory_eval_chain
"""A chain for evaluating ReAct style agents.This chain is used to evaluate ReAct style agents by reasoning aboutthe sequence of actions taken and their outcomes. It uses a language modelchain (LLMChain) to generate the reasoning and scores."""importrefromtypingimport(Any,Dict,List,Optional,Sequence,Tuple,TypedDict,Union,cast,)fromlangchain_core.agentsimportAgentActionfromlangchain_core.callbacks.managerimport(AsyncCallbackManagerForChainRun,CallbackManagerForChainRun,Callbacks,)fromlangchain_core.exceptionsimportOutputParserExceptionfromlangchain_core.language_modelsimportBaseLanguageModelfromlangchain_core.language_models.chat_modelsimportBaseChatModelfromlangchain_core.output_parsersimportBaseOutputParserfromlangchain_core.toolsimportBaseToolfrompydanticimportConfigDict,Fieldfromlangchain.chains.llmimportLLMChainfromlangchain.evaluation.agents.trajectory_eval_promptimport(EVAL_CHAT_PROMPT,TOOL_FREE_EVAL_CHAT_PROMPT,)fromlangchain.evaluation.schemaimportAgentTrajectoryEvaluator,LLMEvalChain


class TrajectoryEval(TypedDict):
    """A TypedDict containing the score and reasoning for a trajectory."""

    score: float
    """The score for the trajectory, normalized from 0 to 1."""
    reasoning: str
    """The reasoning for the score."""


class TrajectoryOutputParser(BaseOutputParser):
    """Trajectory output parser."""

    def parse(self, text: str) -> TrajectoryEval:
        """Parse the output text and extract the score and reasoning.

        Args:
            text (str): The output text to parse.

        Returns:
            TrajectoryEval: A dictionary containing the normalized score and
                reasoning.

        Raises:
            OutputParserException: If the score is not found in the output text
                or if the LLM's score is not a digit in the range 1-5.
        """
        if "Score:" not in text:
            raise OutputParserException(
                f"Could not find score in model eval output: {text}"
            )

        # Split on the same marker checked by the guard above, so output such
        # as "Score:4" (no trailing space) raises OutputParserException later
        # instead of an unhandled ValueError here.
        reasoning, score_str = text.split("Score:", maxsplit=1)
        reasoning, score_str = reasoning.strip(), score_str.strip()

        # Use regex to extract the score. This matches the number in the string
        # even if it is a float or greater than 10, e.g. "Score: 1" yields 1,
        # "Score: 3.5" yields 3.5, and "Score: 10" yields 10.
        # The score itself must be an integer in the range 1-5.
        _score = re.search(r"(\d+(\.\d+)?)", score_str)
        # If the score is missing or is a float, raise an exception.
        if _score is None or "." in _score.group(1):
            raise OutputParserException(
                f"Score is not an integer digit in the range 1-5: {text}"
            )
        score = int(_score.group(1))
        # If the score is out of the 1-5 range, raise an exception.
        if not 1 <= score <= 5:
            raise OutputParserException(
                f"Score is not a digit in the range 1-5: {text}"
            )
        normalized_score = (score - 1) / 4
        return TrajectoryEval(score=normalized_score, reasoning=reasoning)
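

# A minimal sketch of the parser's behavior (the input text and values are
# illustrative, not taken from the library's test suite): a 1-5 score is
# normalized to [0, 1] via (score - 1) / 4, so "Score: 4" becomes 0.75.
#
# >>> parser = TrajectoryOutputParser()
# >>> result = parser.parse("The agent chose the right tool.\nScore: 4")
# >>> result["score"]
# 0.75
# >>> result["reasoning"]
# 'The agent chose the right tool.'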


class TrajectoryEvalChain(AgentTrajectoryEvaluator, LLMEvalChain):
    """A chain for evaluating ReAct style agents.

    This chain is used to evaluate ReAct style agents by reasoning about
    the sequence of actions taken and their outcomes.

    Based on the paper "ReAct: Synergizing Reasoning and Acting in Language
    Models" (https://arxiv.org/abs/2210.03629)

    Example:
        .. code-block:: python

            from langchain.agents import AgentType, initialize_agent
            from langchain_community.chat_models import ChatOpenAI
            from langchain.evaluation import TrajectoryEvalChain
            from langchain.tools import tool

            @tool
            def geography_answers(country: str, question: str) -> str:
                \"\"\"Very helpful answers to geography questions.\"\"\"
                return f"{country}? IDK - We may never know {question}."

            llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
            agent = initialize_agent(
                tools=[geography_answers],
                llm=llm,
                agent=AgentType.OPENAI_FUNCTIONS,
                return_intermediate_steps=True,
            )

            question = "How many dwell in the largest minor region in Argentina?"
            response = agent(question)

            eval_chain = TrajectoryEvalChain.from_llm(
                llm=llm, agent_tools=[geography_answers], return_reasoning=True
            )

            result = eval_chain.evaluate_agent_trajectory(
                input=question,
                agent_trajectory=response["intermediate_steps"],
                prediction=response["output"],
                reference="Paris",
            )
            print(result["score"])  # noqa: T201
            # 0
    """

    agent_tools: Optional[List[BaseTool]] = None
    """A list of tools available to the agent."""
    eval_chain: LLMChain
    """The language model chain used for evaluation."""
    output_parser: TrajectoryOutputParser = Field(
        default_factory=TrajectoryOutputParser
    )
    """The output parser used to parse the output."""
    return_reasoning: bool = False  # :meta private:
    """DEPRECATED. Reasoning always returned."""

    model_config = ConfigDict(
        extra="ignore",
    )

    @property
    def requires_reference(self) -> bool:
        """Whether this evaluator requires a reference label."""
        return False

    @property
    def _tools_description(self) -> str:
        """Get the description of the agent tools.

        Returns:
            str: The description of the agent tools.
        """
        if self.agent_tools is None:
            return ""
        return "\n\n".join(
            [
                f"""Tool {i}: {tool.name}
Description: {tool.description}"""
                for i, tool in enumerate(self.agent_tools, 1)
            ]
        )
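
    # For illustration, given two hypothetical tools named "search" and
    # "calculator", _tools_description renders a block like:
    #
    #     Tool 1: search
    #     Description: Search the web for a query.
    #
    #     Tool 2: calculator
    #     Description: Evaluate arithmetic expressions.
    #
    # This string fills the "tool_descriptions" input of EVAL_CHAT_PROMPT (see
    # _call below) so the grading model can judge the agent's tool choices.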

    @staticmethod
    def get_agent_trajectory(
        steps: Union[str, Sequence[Tuple[AgentAction, str]]],
    ) -> str:
        """Get the agent trajectory as a formatted string.

        Args:
            steps (Union[str, List[Tuple[AgentAction, str]]]): The agent
                trajectory.

        Returns:
            str: The formatted agent trajectory.
        """
        if isinstance(steps, str):
            return steps

        return "\n\n".join(
            [
                f"""Step {i}:
Tool used: {action.tool}
Tool input: {action.tool_input}
Tool output: {output}"""
                for i, (action, output) in enumerate(steps, 1)
            ]
        )
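
    # A minimal sketch of the rendered trajectory for a single step (the
    # AgentAction values are illustrative):
    #
    # >>> from langchain_core.agents import AgentAction
    # >>> step = (
    # ...     AgentAction(tool="search", tool_input="capital of France", log=""),
    # ...     "Paris",
    # ... )
    # >>> print(TrajectoryEvalChain.get_agent_trajectory([step]))
    # Step 1:
    # Tool used: search
    # Tool input: capital of France
    # Tool output: Paris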

    @staticmethod
    def _format_reference(reference: Optional[str]) -> str:
        """Format the reference text.

        Args:
            reference (str): The reference text.

        Returns:
            str: The formatted reference text.
        """
        if not reference:
            return ""
        return f"""

The following is the expected answer. Use this to measure correctness:
[GROUND_TRUTH]
{reference}
[END_GROUND_TRUTH]
"""
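
    # Illustration: a None or empty reference renders as "", so the reference
    # slot in the prompt simply disappears; a non-empty reference such as
    # "Paris" is wrapped in ground-truth markers:
    #
    #     The following is the expected answer. Use this to measure correctness:
    #     [GROUND_TRUTH]
    #     Paris
    #     [END_GROUND_TRUTH]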

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        agent_tools: Optional[Sequence[BaseTool]] = None,
        output_parser: Optional[TrajectoryOutputParser] = None,
        **kwargs: Any,
    ) -> "TrajectoryEvalChain":
        """Create a TrajectoryEvalChain object from a language model chain.

        Args:
            llm (BaseChatModel): The language model chain.
            agent_tools (Optional[Sequence[BaseTool]]): A list of tools
                available to the agent.
            output_parser (Optional[TrajectoryOutputParser]): The output parser
                used to parse the chain output into a score.

        Returns:
            TrajectoryEvalChain: The TrajectoryEvalChain object.
        """
        if not isinstance(llm, BaseChatModel):
            raise NotImplementedError(
                "Only chat models are supported by the current trajectory eval"
            )
        if agent_tools:
            prompt = EVAL_CHAT_PROMPT
        else:
            prompt = TOOL_FREE_EVAL_CHAT_PROMPT
        eval_chain = LLMChain(llm=llm, prompt=prompt)
        return cls(
            agent_tools=agent_tools,  # type: ignore[arg-type]
            eval_chain=eval_chain,
            output_parser=output_parser or TrajectoryOutputParser(),
            **kwargs,
        )
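
    # A minimal construction sketch. It assumes the langchain-openai package
    # and an OPENAI_API_KEY in the environment; the model name is illustrative.
    # Any BaseChatModel works, but a plain (non-chat) LLM raises
    # NotImplementedError.
    #
    # >>> from langchain_openai import ChatOpenAI
    # >>> llm = ChatOpenAI(model="gpt-4o", temperature=0)
    # >>> eval_chain = TrajectoryEvalChain.from_llm(llm=llm)
    #
    # Omitting agent_tools selects TOOL_FREE_EVAL_CHAT_PROMPT; passing tools
    # selects EVAL_CHAT_PROMPT, which also receives the tool descriptions.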

    @property
    def input_keys(self) -> List[str]:
        """Get the input keys for the chain.

        Returns:
            List[str]: The input keys.
        """
        return ["question", "agent_trajectory", "answer", "reference"]

    @property
    def output_keys(self) -> List[str]:
        """Get the output keys for the chain.

        Returns:
            List[str]: The output keys.
        """
        return ["score", "reasoning"]

    def prep_inputs(self, inputs: Union[Dict[str, Any], Any]) -> Dict[str, str]:
        """Validate and prep inputs."""
        inputs["reference"] = self._format_reference(inputs.get("reference"))
        return super().prep_inputs(inputs)

    def _call(
        self,
        inputs: Dict[str, str],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Run the chain and generate the output.

        Args:
            inputs (Dict[str, str]): The input values for the chain.
            run_manager (Optional[CallbackManagerForChainRun]): The callback
                manager for the chain run.

        Returns:
            Dict[str, Any]: The output values of the chain.
        """
        chain_input = {**inputs}
        if self.agent_tools:
            chain_input["tool_descriptions"] = self._tools_description
        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
        raw_output = self.eval_chain.run(
            chain_input, callbacks=_run_manager.get_child()
        )
        return cast(dict, self.output_parser.parse(raw_output))

    async def _acall(
        self,
        inputs: Dict[str, str],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Asynchronously run the chain and generate the output.

        Args:
            inputs (Dict[str, str]): The input values for the chain.
            run_manager (Optional[AsyncCallbackManagerForChainRun]): The
                callback manager for the chain run.

        Returns:
            Dict[str, Any]: The output values of the chain.
        """
        chain_input = {**inputs}
        if self.agent_tools:
            chain_input["tool_descriptions"] = self._tools_description
        _run_manager = (
            run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
        )
        raw_output = await self.eval_chain.arun(
            chain_input, callbacks=_run_manager.get_child()
        )
        return cast(dict, self.output_parser.parse(raw_output))

    def _evaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        input: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        """Evaluate a trajectory.

        Args:
            prediction (str): The final predicted response.
            input (str): The input to the agent.
            agent_trajectory (List[Tuple[AgentAction, str]]): The intermediate
                steps forming the agent trajectory.
            reference (Optional[str]): The reference answer.
            callbacks (Callbacks): Callbacks to use for this chain run.

        Returns:
            dict: The evaluation result, which includes the score and the
                reasoning for reaching it.
        """
        inputs = {
            "question": input,
            "agent_trajectory": self.get_agent_trajectory(agent_trajectory),
            "answer": prediction,
            "reference": reference,
        }
        return self.__call__(
            inputs=inputs,
            callbacks=callbacks,
            tags=tags,
            metadata=metadata,
            include_run_info=include_run_info,
            return_only_outputs=True,
        )

    async def _aevaluate_agent_trajectory(
        self,
        *,
        prediction: str,
        input: str,
        agent_trajectory: Sequence[Tuple[AgentAction, str]],
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously evaluate a trajectory.

        Args:
            prediction (str): The final predicted response.
            input (str): The input to the agent.
            agent_trajectory (List[Tuple[AgentAction, str]]): The intermediate
                steps forming the agent trajectory.
            reference (Optional[str]): The reference answer.
            callbacks (Callbacks): Callbacks to use for this chain run.

        Returns:
            dict: The evaluation result, which includes the score and the
                reasoning for reaching it.
        """
        inputs = {
            "question": input,
            "agent_trajectory": self.get_agent_trajectory(agent_trajectory),
            "answer": prediction,
            "reference": reference,
        }
        return await self.acall(
            inputs=inputs,
            callbacks=callbacks,
            tags=tags,
            metadata=metadata,
            include_run_info=include_run_info,
            return_only_outputs=True,
        )
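

# End-to-end usage sketch via the public evaluate_agent_trajectory API (which
# dispatches to _evaluate_agent_trajectory above); the trajectory, prediction,
# and reference values are illustrative:
#
# >>> from langchain_core.agents import AgentAction
# >>> result = eval_chain.evaluate_agent_trajectory(
# ...     input="What is the capital of France?",
# ...     agent_trajectory=[
# ...         (
# ...             AgentAction(tool="search", tool_input="capital of France", log=""),
# ...             "Paris",
# ...         ),
# ...     ],
# ...     prediction="Paris",
# ...     reference="Paris",
# ... )
# >>> sorted(result)
# ['reasoning', 'score']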