Source code for langchain_google_vertexai.evaluators.evaluation
from abc import ABC
from typing import Any, Dict, List, Optional, Sequence

from google.api_core.client_options import ClientOptions
from google.cloud.aiplatform.constants import base as constants
from google.cloud.aiplatform_v1beta1 import (
    EvaluationServiceAsyncClient,
    EvaluationServiceClient,
)
from google.cloud.aiplatform_v1beta1.types import (
    EvaluateInstancesRequest,
    EvaluateInstancesResponse,
)
from google.protobuf.json_format import MessageToDict

from langchain_google_vertexai._utils import (
    get_client_info,
    get_user_agent,
)
from langchain_google_vertexai.evaluators._core import (
    PairwiseStringEvaluator,
    StringEvaluator,
)

# Pointwise metrics accepted by the Vertex AI evaluation service.
_METRICS = [
    "bleu",
    "exact_match",
    "rouge",
    "coherence",
    "fluency",
    "safety",
    "groundedness",
    "fulfillment",
    "summarization_quality",
    "summarization_helpfulness",
    "summarization_verbosity",
    "question_answering_quality",
    "question_answering_relevance",
    "question_answering_correctness",
]
# Pairwise (prediction vs. baseline_prediction) metrics.
_PAIRWISE_METRICS = [
    "pairwise_question_answering_quality",
    "pairwise_summarization_quality",
]
# Per-metric metric_spec payloads; rouge variants share one API metric
# ("rouge") and are distinguished only by the rouge_type spec field.
_METRICS_INPUTS = {
    "rouge1": {"rouge_type": "rouge1"},
    "rouge2": {"rouge_type": "rouge2"},
    "rougeL": {"rouge_type": "rougeL"},
    "rougeLsum": {"rouge_type": "rougeLsum"},
}
# Required instance fields per metric; metrics absent from this table
# default to ["prediction", "reference"] (see _format_instance).
_METRICS_ATTRS = {
    "safety": ["prediction"],
    "coherence": ["prediction"],
    "fluency": ["prediction"],
    "groundedness": ["context", "prediction"],
    "fulfillment": ["prediction", "instruction"],
    "summarization_quality": ["prediction", "instruction", "context"],
    "summarization_helpfulness": ["prediction", "context"],
    "summarization_verbosity": ["prediction", "context"],
    "question_answering_quality": ["prediction", "context", "instruction"],
    "question_answering_relevance": ["prediction", "instruction"],
    "question_answering_correctness": ["prediction", "instruction"],
    "pairwise_question_answering_quality": [
        "prediction",
        "baseline_prediction",
        "context",
        "instruction",
    ],
    "pairwise_summarization_quality": [
        "prediction",
        "baseline_prediction",
        "context",
        "instruction",
    ],
}
# Optional instance fields per metric, copied through only when present.
_METRICS_OPTIONAL_ATTRS = {
    "summarization_quality": ["reference"],
    "summarization_helpfulness": ["reference", "instruction"],
    "summarization_verbosity": ["reference", "instruction"],
    "question_answering_quality": ["reference"],
    "question_answering_relevance": ["reference", "context"],
    "question_answering_correctness": ["reference", "context"],
    "pairwise_question_answering_quality": ["reference"],
    "pairwise_summarization_quality": ["reference"],
}
# a client supports multiple instances per request for these metrics
_METRICS_MULTIPLE_INSTANCES = ["bleu", "exact_match", "rouge"]


def _format_metric(metric: str) -> str:
    """Map a user-facing metric name to the API metric name.

    All rouge variants ("rouge1", "rougeL", ...) collapse to "rouge"; every
    other metric maps to itself.
    """
    if metric.startswith("rouge"):
        return "rouge"
    return metric


def _format_instance(instance: Dict[str, str], metric: str) -> Dict[str, str]:
    """Project an instance dict onto the fields the metric expects.

    Required fields (per _METRICS_ATTRS, defaulting to prediction/reference)
    must be present — a missing one raises KeyError. Optional fields
    (per _METRICS_OPTIONAL_ATTRS) are copied only when supplied.
    """
    attrs = _METRICS_ATTRS.get(metric, ["prediction", "reference"])
    result = {a: instance[a] for a in attrs}
    for attr in _METRICS_OPTIONAL_ATTRS.get(metric, []):
        if attr in instance:
            result[attr] = instance[attr]
    return result


def _prepare_request(
    instances: Sequence[Dict[str, str]], metric: str, location: str
) -> EvaluateInstancesRequest:
    """Build an EvaluateInstancesRequest for the given metric.

    Args:
        instances: Instance dicts; metrics outside _METRICS_MULTIPLE_INSTANCES
            accept exactly one instance per request.
        metric: User-facing metric name (rouge variants allowed).
        location: Full location resource path (projects/*/locations/*).

    Returns:
        The populated request, with the metric-specific ``<metric>_input``
        oneof field set.

    Raises:
        ValueError: If multiple instances are passed for a metric that only
            supports a single instance per request.
    """
    request = EvaluateInstancesRequest()
    metric_input: Dict[str, Any] = {"metric_spec": _METRICS_INPUTS.get(metric, {})}
    if _format_metric(metric) not in _METRICS_MULTIPLE_INSTANCES:
        if len(instances) > 1:
            raise ValueError(
                f"Metric {metric} supports only a single instance per request, "
                f"got {len(instances)}!"
            )
        metric_input["instance"] = _format_instance(instances[0], metric=metric)
    else:
        metric_input["instances"] = [
            _format_instance(i, metric=metric) for i in instances
        ]
    # The request proto exposes one oneof input field per metric,
    # e.g. "bleu_input" or "safety_input".
    setattr(request, f"{_format_metric(metric)}_input", metric_input)
    request.location = location
    return request


def _parse_response(
    response: EvaluateInstancesResponse, metric: str
) -> List[Dict[str, Any]]:
    """Extract per-instance result dicts from an evaluation response.

    Batch metrics return ``<metric>_results.<metric>_metric_values`` (a list);
    single-instance metrics return a one-element list wrapping
    ``<metric>_result``.
    """
    metric = _format_metric(metric)
    result = MessageToDict(response._pb, preserving_proto_field_name=True)
    if metric in _METRICS_MULTIPLE_INSTANCES:
        return result[f"{metric}_results"][f"{metric}_metric_values"]
    return [result[f"{metric}_result"]]


class _EvaluatorBase(ABC):
    """Shared plumbing for Vertex AI evaluation-service evaluators.

    Owns the sync and async EvaluationService clients and knows how to turn
    a (prediction, reference, input, **kwargs) call into an
    EvaluateInstancesRequest for ``self._metric``.
    """

    @property
    def _user_agent(self) -> str:
        """Gets the User Agent."""
        _, user_agent = get_user_agent(f"{type(self).__name__}_{self._metric}")
        return user_agent

    def __init__(self, metric: str, project_id: str, location: str = "us-central1"):
        """Initialize clients for the evaluation service.

        Args:
            metric: User-facing metric name this evaluator computes.
            project_id: GCP project id.
            location: GCP region hosting the evaluation endpoint.
        """
        self._metric = metric
        client_options = ClientOptions(
            api_endpoint=f"{location}-{constants.PREDICTION_API_BASE_PATH}"
        )
        self._client = EvaluationServiceClient(
            client_options=client_options,
            client_info=get_client_info(module=self._user_agent),
        )
        self._async_client = EvaluationServiceAsyncClient(
            client_options=client_options,
            client_info=get_client_info(module=self._user_agent),
        )
        # Full resource path, e.g. "projects/{project}/locations/{location}".
        self._location = self._client.common_location_path(project_id, location)

    def _prepare_request(
        self,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> EvaluateInstancesRequest:
        """Build a single-instance request for this evaluator's metric.

        ``input`` is forwarded to the API as the instance's "context" field;
        extra kwargs (e.g. "instruction") are merged into the instance as-is.
        """
        instance = {"prediction": prediction}
        # Use explicit None checks so an intentionally empty reference or
        # context ("") is still forwarded rather than silently dropped.
        if reference is not None:
            instance["reference"] = reference
        if input is not None:
            instance["context"] = input
        instance = {**instance, **kwargs}
        return _prepare_request(
            [instance], metric=self._metric, location=self._location
        )
class VertexStringEvaluator(_EvaluatorBase, StringEvaluator):
    """Evaluate a predicted string with a Vertex AI pointwise metric."""

    def __init__(self, metric: str, **kwargs):
        """Initialize the evaluator for a supported pointwise metric.

        Args:
            metric: Metric name; must map (via rouge collapsing) to one
                of _METRICS.
            **kwargs: Forwarded to _EvaluatorBase (project_id, location).

        Raises:
            ValueError: If the metric is not supported.
        """
        # Validate before super().__init__ so an unsupported metric fails
        # fast, before any gRPC clients are constructed.
        if _format_metric(metric) not in _METRICS:
            raise ValueError(f"Metric {metric} is not supported yet!")
        super().__init__(metric, **kwargs)
class VertexPairWiseStringEvaluator(_EvaluatorBase, PairwiseStringEvaluator):
    """Compare a prediction against a baseline with a Vertex AI pairwise metric."""

    def __init__(self, metric: str, **kwargs):
        """Initialize the evaluator for a supported pairwise metric.

        Args:
            metric: Metric name; must be one of _PAIRWISE_METRICS.
            **kwargs: Forwarded to _EvaluatorBase (project_id, location).

        Raises:
            ValueError: If the metric is not supported.
        """
        # Validate before super().__init__ so an unsupported metric fails
        # fast, before any gRPC clients are constructed.
        if _format_metric(metric) not in _PAIRWISE_METRICS:
            raise ValueError(f"Metric {metric} is not supported yet!")
        super().__init__(metric, **kwargs)