Source code for langchain.evaluation.qa.eval_chain
"""LLM Chains for evaluating question answering."""from__future__importannotationsimportreimportstringfromtypingimportAny,List,Optional,Sequence,Tuplefromlangchain_core.callbacks.managerimportCallbacksfromlangchain_core.language_modelsimportBaseLanguageModelfromlangchain_core.promptsimportPromptTemplatefrompydanticimportConfigDictfromlangchain.chains.llmimportLLMChainfromlangchain.evaluation.qa.eval_promptimportCONTEXT_PROMPT,COT_PROMPT,PROMPTfromlangchain.evaluation.schemaimportLLMEvalChain,StringEvaluatorfromlangchain.schemaimportRUN_KEYdef_get_score(text:str)->Optional[Tuple[str,int]]:match=re.search(r"grade:\s*(correct|incorrect)",text.strip(),re.IGNORECASE)ifmatch:ifmatch.group(1).upper()=="CORRECT":return"CORRECT",1elifmatch.group(1).upper()=="INCORRECT":return"INCORRECT",0try:first_word=(text.strip().split()[0].translate(str.maketrans("","",string.punctuation)))iffirst_word.upper()=="CORRECT":return"CORRECT",1eliffirst_word.upper()=="INCORRECT":return"INCORRECT",0last_word=(text.strip().split()[-1].translate(str.maketrans("","",string.punctuation)))iflast_word.upper()=="CORRECT":return"CORRECT",1eliflast_word.upper()=="INCORRECT":return"INCORRECT",0exceptIndexError:passreturnNonedef_parse_string_eval_output(text:str)->dict:"""Parse the output text. Args: text (str): The output text to parse. Returns: Any: The parsed output. """reasoning=text.strip()parsed_scores=_get_score(reasoning)ifparsed_scoresisNone:value,score=None,Noneelse:value,score=parsed_scoresreturn{"reasoning":reasoning,"value":value,"score":score,}
class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
    """LLM Chain for evaluating question answering."""

    output_key: str = "results"  #: :meta private:

    model_config = ConfigDict(
        extra="ignore",
    )

    @classmethod
    def is_lc_serializable(cls) -> bool:
        return False

    @property
    def evaluation_name(self) -> str:
        return "correctness"

    @property
    def requires_reference(self) -> bool:
        return True

    @property
    def requires_input(self) -> bool:
        return True
    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        prompt: Optional[PromptTemplate] = None,
        **kwargs: Any,
    ) -> QAEvalChain:
        """Load QA Eval Chain from LLM.

        Args:
            llm (BaseLanguageModel): the base language model to use.
            prompt (PromptTemplate): A prompt template containing the
                input_variables 'query', 'answer' and 'result' that will be
                used as the prompt for evaluation. Defaults to PROMPT.
            **kwargs: additional keyword arguments.

        Returns:
            QAEvalChain: the loaded QA eval chain.
        """
        prompt = prompt or PROMPT
        expected_input_vars = {"query", "answer", "result"}
        if expected_input_vars != set(prompt.input_variables):
            raise ValueError(
                f"Input variables should be {expected_input_vars}, "
                f"but got {prompt.input_variables}"
            )
        return cls(llm=llm, prompt=prompt, **kwargs)
    def evaluate(
        self,
        examples: Sequence[dict],
        predictions: Sequence[dict],
        question_key: str = "query",
        answer_key: str = "answer",
        prediction_key: str = "result",
        *,
        callbacks: Callbacks = None,
    ) -> List[dict]:
        """Evaluate question answering examples and predictions."""
        inputs = [
            {
                "query": example[question_key],
                "answer": example[answer_key],
                "result": predictions[i][prediction_key],
            }
            for i, example in enumerate(examples)
        ]
        return self.apply(inputs, callbacks=callbacks)
    def _prepare_output(self, result: dict) -> dict:
        parsed_result = _parse_string_eval_output(result[self.output_key])
        if RUN_KEY in result:
            parsed_result[RUN_KEY] = result[RUN_KEY]
        return parsed_result

    def _evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        callbacks: Callbacks = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        """Evaluate Chain or LLM output, based on optional input and label.

        Args:
            prediction (str): the LLM or chain prediction to evaluate.
            reference (Optional[str], optional): the reference label to
                evaluate against.
            input (Optional[str], optional): the input to consider during
                evaluation.
            callbacks (Callbacks, optional): the callbacks to use for tracing.
            include_run_info (bool, optional): whether to include run info in
                the returned results.
            **kwargs: additional keyword arguments, including callbacks,
                tags, etc.

        Returns:
            dict: The evaluation results containing the score or value.
        """
        result = self(
            {
                "query": input,
                "answer": reference,
                "result": prediction,
            },
            callbacks=callbacks,
            include_run_info=include_run_info,
        )
        return self._prepare_output(result)

    async def _aevaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        callbacks: Callbacks = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        result = await self.acall(
            inputs={"query": input, "answer": reference, "result": prediction},
            callbacks=callbacks,
            include_run_info=include_run_info,
        )
        return self._prepare_output(result)
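A usage sketch for QAEvalChain, not part of the module: it assumes the optional langchain-openai package is installed and OPENAI_API_KEY is set in the environment; any BaseLanguageModel can stand in for ChatOpenAI. evaluate() grades batches, while the public evaluate_strings() method (inherited from StringEvaluator) dispatches to _evaluate_strings above.

from langchain_openai import ChatOpenAI  # assumption: langchain-openai installed

from langchain.evaluation.qa.eval_chain import QAEvalChain

eval_chain = QAEvalChain.from_llm(llm=ChatOpenAI(temperature=0))

# Batch grading: examples and predictions are paired by index.
examples = [{"query": "What is 2 + 2?", "answer": "4"}]
predictions = [{"result": "2 + 2 equals 4."}]
graded = eval_chain.evaluate(examples, predictions)
# graded[0]["results"] holds the grader's raw verdict text, e.g. "CORRECT"

# Single-pair grading via the public StringEvaluator entry point, which
# routes through _evaluate_strings and _prepare_output above.
result = eval_chain.evaluate_strings(
    input="What is 2 + 2?",
    reference="4",
    prediction="2 + 2 equals 4.",
)
# e.g. {'reasoning': 'CORRECT', 'value': 'CORRECT', 'score': 1}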
class ContextQAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
    """LLM Chain for evaluating QA against context, without ground-truth answers."""

    @classmethod
    def is_lc_serializable(cls) -> bool:
        return False

    @property
    def requires_reference(self) -> bool:
        """Whether the chain requires a reference string."""
        return True

    @property
    def requires_input(self) -> bool:
        """Whether the chain requires an input string."""
        return True

    model_config = ConfigDict(
        extra="ignore",
    )

    @classmethod
    def _validate_input_vars(cls, prompt: PromptTemplate) -> None:
        expected_input_vars = {"query", "context", "result"}
        if expected_input_vars != set(prompt.input_variables):
            raise ValueError(
                f"Input variables should be {expected_input_vars}, "
                f"but got {prompt.input_variables}"
            )

    @property
    def evaluation_name(self) -> str:
        return "Contextual Accuracy"
    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        prompt: Optional[PromptTemplate] = None,
        **kwargs: Any,
    ) -> ContextQAEvalChain:
        """Load QA Eval Chain from LLM.

        Args:
            llm (BaseLanguageModel): the base language model to use.
            prompt (PromptTemplate): A prompt template containing the
                input_variables 'query', 'context' and 'result' that will be
                used as the prompt for evaluation. Defaults to CONTEXT_PROMPT.
            **kwargs: additional keyword arguments.

        Returns:
            ContextQAEvalChain: the loaded QA eval chain.
        """
        prompt = prompt or CONTEXT_PROMPT
        cls._validate_input_vars(prompt)
        return cls(llm=llm, prompt=prompt, **kwargs)
    def evaluate(
        self,
        examples: List[dict],
        predictions: List[dict],
        question_key: str = "query",
        context_key: str = "context",
        prediction_key: str = "result",
        *,
        callbacks: Callbacks = None,
    ) -> List[dict]:
        """Evaluate question answering examples and predictions."""
        inputs = [
            {
                "query": example[question_key],
                "context": example[context_key],
                "result": predictions[i][prediction_key],
            }
            for i, example in enumerate(examples)
        ]
        return self.apply(inputs, callbacks=callbacks)
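The same pattern applies to ContextQAEvalChain, except each example supplies retrieved context instead of a gold answer. A hedged sketch under the same assumptions as the previous example:

from langchain_openai import ChatOpenAI  # assumption: langchain-openai installed

from langchain.evaluation.qa.eval_chain import ContextQAEvalChain

context_chain = ContextQAEvalChain.from_llm(llm=ChatOpenAI(temperature=0))
graded = context_chain.evaluate(
    examples=[
        {
            "query": "Who wrote the 2021 report?",
            "context": "The 2021 report was written by J. Smith.",
        }
    ],
    predictions=[{"result": "J. Smith wrote it."}],
)
# Each item in `graded` carries the grader's verdict text under the
# chain's output key.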
class CotQAEvalChain(ContextQAEvalChain):
    """LLM Chain for evaluating QA using chain-of-thought reasoning."""

    @classmethod
    def is_lc_serializable(cls) -> bool:
        return False

    @property
    def evaluation_name(self) -> str:
        return "COT Contextual Accuracy"
    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        prompt: Optional[PromptTemplate] = None,
        **kwargs: Any,
    ) -> CotQAEvalChain:
        """Load QA Eval Chain from LLM."""
        prompt = prompt or COT_PROMPT
        cls._validate_input_vars(prompt)
        return cls(llm=llm, prompt=prompt, **kwargs)
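CotQAEvalChain only swaps in COT_PROMPT, which asks the grading model to reason step by step before stating its verdict; construction and batch evaluation are otherwise inherited from ContextQAEvalChain. A sketch, same assumptions as above:

from langchain_openai import ChatOpenAI  # assumption: langchain-openai installed

from langchain.evaluation.qa.eval_chain import CotQAEvalChain

cot_chain = CotQAEvalChain.from_llm(llm=ChatOpenAI(temperature=0))
graded = cot_chain.evaluate(
    examples=[
        {
            "query": "Who wrote the 2021 report?",
            "context": "The 2021 report was written by J. Smith.",
        }
    ],
    predictions=[{"result": "J. Smith wrote it."}],
)
# The output includes the grader's step-by-step reasoning followed by a
# final "GRADE:" line, which _get_score above extracts.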