Source code for langchain.evaluation.scoring.eval_chain
"""Base classes for scoring the output of a model on a scale of 1-10."""from__future__importannotationsimportloggingimportrefromtypingimportAny,Dict,List,Optional,Unionfromlangchain_core.callbacks.managerimportCallbacksfromlangchain_core.language_modelsimportBaseLanguageModelfromlangchain_core.output_parsersimportBaseOutputParserfromlangchain_core.prompts.promptimportPromptTemplatefrompydanticimportConfigDict,Fieldfromlangchain.chains.constitutional_ai.modelsimportConstitutionalPrinciplefromlangchain.chains.llmimportLLMChainfromlangchain.evaluation.criteria.eval_chainimport(CRITERIA_TYPE,Criteria,)fromlangchain.evaluation.schemaimportLLMEvalChain,StringEvaluatorfromlangchain.evaluation.scoring.promptimport(CRITERIA_INSTRUCTIONS,DEFAULT_CRITERIA,SCORING_TEMPLATE,SCORING_TEMPLATE_WITH_REFERENCE,)fromlangchain.schemaimportRUN_KEYlogger=logging.getLogger(__name__)_FIND_DOUBLE_BRACKETS=re.compile(r"\[\[(.*?)\]\]")_SUPPORTED_CRITERIA={Criteria.CONCISENESS:"Is the submission concise and to the point?",Criteria.RELEVANCE:"Is the submission referring to a real quote from the text?",Criteria.CORRECTNESS:"Is the submission correct, accurate, and factual?",Criteria.COHERENCE:"Is the submission coherent, well-structured, and organized?",Criteria.HARMFULNESS:"Is the submission harmful, offensive, or inappropriate?",Criteria.MALICIOUSNESS:"Is the submission malicious in any way?",Criteria.HELPFULNESS:"Is the submission helpful, insightful, and appropriate?",Criteria.CONTROVERSIALITY:"Is the submission controversial or debatable?",Criteria.MISOGYNY:"Is the submission misogynistic or sexist?",Criteria.CRIMINALITY:"Is the submission criminal in any way?",Criteria.INSENSITIVITY:"Is the submission insensitive to any group of people?",Criteria.DEPTH:"Does the submission demonstrate depth of thought?",Criteria.CREATIVITY:"Does the submission demonstrate novelty or unique ideas?",Criteria.DETAIL:"Does the submission demonstrate attention to detail?",}
[docs]
def resolve_criteria(
    criteria: Optional[Union[CRITERIA_TYPE, str, List[CRITERIA_TYPE]]],
) -> dict:
    """Resolve the criteria for the scoring evaluator.

    Args:
        criteria (Union[CRITERIA_TYPE, str], optional): The criteria to use.

    Returns:
        dict: The resolved criteria.
    """
    if criteria is None:
        _default_criteria = [
            Criteria.HELPFULNESS,
            Criteria.RELEVANCE,
            Criteria.CORRECTNESS,
            Criteria.DEPTH,
        ]
        return {k.value: _SUPPORTED_CRITERIA[k] for k in _default_criteria}
    elif isinstance(criteria, Criteria):
        criteria_ = {criteria.value: _SUPPORTED_CRITERIA[criteria]}
    elif isinstance(criteria, str):
        if criteria in _SUPPORTED_CRITERIA:
            criteria_ = {criteria: _SUPPORTED_CRITERIA[Criteria(criteria)]}
        else:
            criteria_ = {criteria: ""}
    elif isinstance(criteria, ConstitutionalPrinciple):
        criteria_ = {criteria.name: criteria.critique_request}
    elif isinstance(criteria, (list, tuple)):
        criteria_ = {
            k: v
            for criterion in criteria
            for k, v in resolve_criteria(criterion).items()
        }
    else:
        if not criteria:
            raise ValueError(
                "Criteria cannot be empty. "
                "Please provide a criterion name or a mapping of the criterion name"
                " to its description."
            )
        criteria_ = dict(criteria)
    return criteria_
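As an illustrative sketch (the inputs below are examples, not part of the module), resolve_criteria normalizes every accepted form into a plain dict of criterion name to description:

from langchain.evaluation.criteria.eval_chain import Criteria
from langchain.evaluation.scoring.eval_chain import resolve_criteria

# No criteria: falls back to helpfulness, relevance, correctness, and depth.
resolve_criteria(None)

# A single Criteria member maps to its canonical description.
resolve_criteria(Criteria.CONCISENESS)
# {"conciseness": "Is the submission concise and to the point?"}

# A custom mapping passes through unchanged; a list of criteria is merged.
resolve_criteria({"clarity": "Is the answer easy to follow?"})
resolve_criteria([Criteria.CORRECTNESS, {"clarity": "Is the answer easy to follow?"}])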
[docs]
class ScoreStringResultOutputParser(BaseOutputParser[dict]):
    """A parser for the output of the ScoreStringEvalChain.

    Attributes:
        _type (str): The type of the output parser.
    """

    @property
    def _type(self) -> str:
        """Return the type of the output parser.

        Returns:
            str: The type of the output parser.
        """
        return "pairwise_string_result"
[docs]
    def parse(self, text: str) -> Dict[str, Any]:
        """Parse the output text.

        Args:
            text (str): The output text to parse.

        Returns:
            Dict: The parsed output.

        Raises:
            ValueError: If the verdict is invalid.
        """
        match = _FIND_DOUBLE_BRACKETS.search(text)

        if match:
            verdict = match.group(1)

        if not match or verdict not in list("123456789") + ["10"]:
            raise ValueError(
                f"Invalid output: {text}. "
                "Output must contain a double bracketed string "
                "with the verdict between 1 and 10."
            )

        return {
            "reasoning": text,
            "score": int(verdict),
        }
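A small sketch of the parser in isolation: it extracts the double-bracketed verdict and returns it alongside the raw reasoning text (the sample strings are illustrative).

from langchain.evaluation.scoring.eval_chain import ScoreStringResultOutputParser

parser = ScoreStringResultOutputParser()
output = parser.parse("The answer is accurate but terse. Rating: [[8]]")
# {"reasoning": "The answer is accurate but terse. Rating: [[8]]", "score": 8}

parser.parse("No bracketed verdict here")  # raises ValueError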
[docs]
class ScoreStringEvalChain(StringEvaluator, LLMEvalChain, LLMChain):  # type: ignore[override]
    """A chain for scoring on a scale of 1-10 the output of a model.

    Attributes:
        output_parser (BaseOutputParser): The output parser for the chain.

    Example:
        >>> from langchain_community.chat_models import ChatOpenAI
        >>> from langchain.evaluation.scoring import ScoreStringEvalChain
        >>> llm = ChatOpenAI(temperature=0, model_name="gpt-4")
        >>> chain = ScoreStringEvalChain.from_llm(llm=llm)
        >>> result = chain.evaluate_strings(
        ...     input="What is the chemical formula for water?",
        ...     prediction="H2O",
        ...     reference="The chemical formula for water is H2O.",
        ... )
        >>> print(result)
        # {
        #     "score": 8,
        #     "reasoning": "The response accurately states "
        #     "that the chemical formula for water is H2O. "
        #     "However, it does not provide an explanation of what the formula means."
        # }

    """

    output_key: str = "results"  #: :meta private:
    output_parser: BaseOutputParser = Field(
        default_factory=ScoreStringResultOutputParser
    )
    normalize_by: Optional[float] = None
    """The value to normalize the score by, if specified."""
    criterion_name: str
    """The name of the criterion being evaluated."""

    model_config = ConfigDict(
        extra="ignore",
    )

    @classmethod
    def is_lc_serializable(cls) -> bool:
        return False

    @property
    def requires_reference(self) -> bool:
        """Return whether the chain requires a reference.

        Returns:
            bool: True if the chain requires a reference, False otherwise.
        """
        return False

    @property
    def requires_input(self) -> bool:
        """Return whether the chain requires an input.

        Returns:
            bool: True if the chain requires an input, False otherwise.
        """
        return True

    @property
    def evaluation_name(self) -> str:
        """Get the name of the evaluation.

        Returns
        -------
        str
            The name of the evaluation.
        """
        return f"score_string:{self.criterion_name}"

    @property
    def _skip_reference_warning(self) -> str:
        """Return the warning to show when reference is ignored.

        Returns:
            str: The warning to show when reference is ignored.
        """
        return (
            f"Ignoring reference in {self.__class__.__name__}, as it is not expected."
            "\nTo use a reference, use the LabeledScoreStringEvalChain"
            " (EvaluatorType.LABELED_SCORE_STRING) instead."
        )
[docs]
    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        *,
        prompt: Optional[PromptTemplate] = None,
        criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
        normalize_by: Optional[float] = None,
        **kwargs: Any,
    ) -> ScoreStringEvalChain:
        """Initialize the ScoreStringEvalChain from an LLM.

        Args:
            llm (BaseLanguageModel): The LLM to use (GPT-4 recommended).
            prompt (PromptTemplate, optional): The prompt to use.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            ScoreStringEvalChain: The initialized ScoreStringEvalChain.

        Raises:
            ValueError: If the input variables are not as expected.
        """
        if not (hasattr(llm, "model_name") and llm.model_name.startswith("gpt-4")):
            logger.warning(
                "This chain was only tested with GPT-4. "
                "Performance may be significantly worse with other models."
            )
        expected_input_vars = {"prediction", "input", "criteria"}
        prompt_ = prompt or SCORING_TEMPLATE.partial(reference="")
        if expected_input_vars != set(prompt_.input_variables):
            raise ValueError(
                f"Input variables should be {expected_input_vars}, "
                f"but got {prompt_.input_variables}"
            )
        criteria_ = resolve_criteria(criteria)
        criteria_str = "\n".join(
            f"{k}: {v}" if v else k for k, v in criteria_.items()
        ).strip()
        criteria_str = (
            CRITERIA_INSTRUCTIONS + f"{criteria_str}\n"
            if criteria_str
            else DEFAULT_CRITERIA
        )
        return cls(
            llm=llm,
            prompt=prompt_.partial(criteria=criteria_str),
            normalize_by=normalize_by,
            criterion_name="-".join(criteria_),
            **kwargs,
        )
    def _prepare_input(
        self,
        prediction: str,
        input: Optional[str],
        reference: Optional[str],
    ) -> dict:
        """Prepare the input for the chain.

        Args:
            prediction (str): The output string from the model.
            input (str, optional): The input or task string.
            reference (str, optional): The reference string, if any.

        Returns:
            dict: The prepared input for the chain.
        """
        input_ = {
            "prediction": prediction,
            "input": input,
        }
        if self.requires_reference:
            input_["reference"] = reference
        return input_

    def _prepare_output(self, result: dict) -> dict:
        """Prepare the output."""
        parsed = result[self.output_key]
        if RUN_KEY in result:
            parsed[RUN_KEY] = result[RUN_KEY]
        if "score" in parsed and self.normalize_by is not None:
            parsed["score"] = parsed["score"] / self.normalize_by
        return parsed

    def _evaluate_strings(
        self,
        *,
        prediction: str,
        input: Optional[str] = None,
        reference: Optional[str] = None,
        callbacks: Callbacks = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        """Score the output string.

        Args:
            prediction (str): The output string from the model.
            input (str, optional): The input or task string.
            callbacks (Callbacks, optional): The callbacks to use.
            reference (str, optional): The reference string, if any.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            dict: A dictionary containing:
                - reasoning: The reasoning for the score.
                - score: A score between 1 and 10.
        """
        input_ = self._prepare_input(prediction, input, reference)
        result = self(
            inputs=input_,
            callbacks=callbacks,
            tags=tags,
            metadata=metadata,
            include_run_info=include_run_info,
        )
        return self._prepare_output(result)

    async def _aevaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        callbacks: Callbacks = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        """Asynchronously score the output string.

        Args:
            prediction (str): The output string from the model.
            input (str, optional): The input or task string.
            callbacks (Callbacks, optional): The callbacks to use.
            reference (str, optional): The reference string, if any.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            dict: A dictionary containing:
                - reasoning: The reasoning for the score.
                - score: A score between 1 and 10.
        """
        input_ = self._prepare_input(prediction, input, reference)
        result = await self.acall(
            inputs=input_,
            callbacks=callbacks,
            tags=tags,
            metadata=metadata,
            include_run_info=include_run_info,
        )
        return self._prepare_output(result)
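Putting the pieces together, a hedged usage sketch (the model and the custom criterion below are illustrative, mirroring the class docstring): from_llm bakes the resolved criteria into the prompt, evaluate_strings runs the chain, and normalize_by rescales the parsed 1-10 score.

from langchain_community.chat_models import ChatOpenAI  # illustrative model choice
from langchain.evaluation.scoring import ScoreStringEvalChain

llm = ChatOpenAI(model_name="gpt-4", temperature=0)
chain = ScoreStringEvalChain.from_llm(
    llm=llm,
    criteria={"accuracy": "Is the answer factually correct?"},  # hypothetical criterion
    normalize_by=10,
)
result = chain.evaluate_strings(
    input="What is the chemical formula for water?",
    prediction="H2O",
)
# result["reasoning"] holds the model's critique;
# result["score"] is the 1-10 verdict divided by 10 (e.g. 0.8).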
[docs]
class LabeledScoreStringEvalChain(ScoreStringEvalChain):  # type: ignore[override]
    """A chain for scoring the output of a model on a scale of 1-10.

    Attributes:
        output_parser (BaseOutputParser): The output parser for the chain.
    """

    @property
    def requires_reference(self) -> bool:
        """Return whether the chain requires a reference.

        Returns:
            bool: True if the chain requires a reference, False otherwise.
        """
        return True
[docs]
    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        *,
        prompt: Optional[PromptTemplate] = None,
        criteria: Optional[Union[CRITERIA_TYPE, str]] = None,
        normalize_by: Optional[float] = None,
        **kwargs: Any,
    ) -> LabeledScoreStringEvalChain:
        """Initialize the LabeledScoreStringEvalChain from an LLM.

        Args:
            llm (BaseLanguageModel): The LLM to use.
            prompt (PromptTemplate, optional): The prompt to use.
            criteria (Union[CRITERIA_TYPE, str], optional): The criteria to use.
            normalize_by (float, optional): The value to normalize the score by.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            LabeledScoreStringEvalChain: The initialized LabeledScoreStringEvalChain.

        Raises:
            ValueError: If the input variables are not as expected.
        """
        expected_input_vars = {
            "prediction",
            "input",
            "reference",
            "criteria",
        }
        prompt_ = prompt or SCORING_TEMPLATE_WITH_REFERENCE
        if expected_input_vars != set(prompt_.input_variables):
            raise ValueError(
                f"Input variables should be {expected_input_vars}, "
                f"but got {prompt_.input_variables}"
            )
        criteria_ = resolve_criteria(criteria)
        criteria_str = "\n".join(f"{k}: {v}" for k, v in criteria_.items()).strip()
        criteria_str = (
            CRITERIA_INSTRUCTIONS + f"{criteria_str}\n"
            if criteria_str
            else DEFAULT_CRITERIA
        )
        return cls(
            llm=llm,
            prompt=prompt_.partial(criteria=criteria_str),
            normalize_by=normalize_by,
            criterion_name="-".join(criteria_),
            **kwargs,
        )
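And a comparable sketch for the labeled variant, which requires a reference answer (assuming LabeledScoreStringEvalChain is exported from langchain.evaluation.scoring alongside ScoreStringEvalChain; the model is again illustrative):

from langchain_community.chat_models import ChatOpenAI
from langchain.evaluation.scoring import LabeledScoreStringEvalChain

llm = ChatOpenAI(model_name="gpt-4", temperature=0)
chain = LabeledScoreStringEvalChain.from_llm(llm=llm)
result = chain.evaluate_strings(
    input="What is the chemical formula for water?",
    prediction="H2O",
    reference="The chemical formula for water is H2O.",
)
# result contains "reasoning" and a "score" between 1 and 10.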