def weighted_sum_ranking(
    primary_search_results: Sequence[RowMapping],
    secondary_search_results: Sequence[RowMapping],
    primary_results_weight: float = 0.5,
    secondary_results_weight: float = 0.5,
    fetch_top_k: int = 4,
) -> Sequence[dict[str, Any]]:
    """
    Ranks documents using a weighted sum of scores from two sources.

    Rows are read positionally: the first value of each row is used as the
    document id and the last value as its score. The combined score is
    written back under the "distance" key of a copy of the row.

    Args:
        primary_search_results: Row mappings returned by the primary search.
        secondary_search_results: Row mappings returned by the secondary
            search.
        primary_results_weight: The weight for the primary source's scores.
            Defaults to 0.5.
        secondary_results_weight: The weight for the secondary source's
            scores. Defaults to 0.5.
        fetch_top_k: The number of documents to return after merging the
            results. Values below zero are treated as zero. Defaults to 4.

    Returns:
        A list of dicts (copies of the input rows) whose "distance" entry
        holds the weighted score, sorted by that score in descending order
        and truncated to at most ``fetch_top_k`` entries.
    """
    # Maps doc_id -> copy of the row carrying the accumulated weighted score.
    weighted_scores: dict[str, dict[str, Any]] = {}

    # Process results from the primary source.
    for row in primary_search_results:
        values = list(row.values())
        doc_id = str(values[0])  # first value is doc_id
        distance = float(values[-1])  # type: ignore # last value is distance
        row_values = dict(row)
        row_values["distance"] = primary_results_weight * distance
        weighted_scores[doc_id] = row_values

    # Process results from the secondary source, adding to an existing score
    # for the same doc_id or creating a new entry.
    for row in secondary_search_results:
        values = list(row.values())
        doc_id = str(values[0])  # first value is doc_id
        distance = float(values[-1])  # type: ignore # last value is distance
        existing_score = (
            weighted_scores[doc_id]["distance"] if doc_id in weighted_scores else 0.0
        )
        row_values = dict(row)
        row_values["distance"] = distance * secondary_results_weight + existing_score
        weighted_scores[doc_id] = row_values

    # NOTE(review): a larger combined value ranks first; confirm that matches
    # the semantics of the scores produced by the calling searches.
    ranked_results = sorted(
        weighted_scores.values(), key=lambda item: item["distance"], reverse=True
    )

    # A negative stop index would slice "all but the last N" instead of
    # returning nothing, so clamp it.
    return ranked_results[: max(fetch_top_k, 0)]
def reciprocal_rank_fusion(
    primary_search_results: Sequence[RowMapping],
    secondary_search_results: Sequence[RowMapping],
    rrf_k: float = 60,
    fetch_top_k: int = 4,
) -> Sequence[dict[str, Any]]:
    """
    Ranks documents using Reciprocal Rank Fusion (RRF) of scores from two sources.

    Each source is sorted by its "distance" value in descending order and
    every row contributes ``1 / (rank + rrf_k)`` to its document's score,
    where ``rank`` starts at 0. The first value of a row is used as the
    document id.

    Args:
        primary_search_results: Row mappings returned by the primary search;
            each must have a "distance" key.
        secondary_search_results: Row mappings returned by the secondary
            search; each must have a "distance" key.
        rrf_k: The RRF parameter k. Must be greater than 0. Defaults to 60.
        fetch_top_k: The number of documents to return after merging the
            results. Values below zero are treated as zero. Defaults to 4.

    Returns:
        A list of dicts (copies of the input rows) whose "distance" entry
        holds the RRF score, sorted by that score in descending order and
        truncated to at most ``fetch_top_k`` entries.

    Raises:
        ValueError: If ``rrf_k`` is not greater than 0.
    """
    # Ranks are zero-based, so a non-positive k can make the denominator
    # zero (or negative); reject it up front instead of failing mid-loop.
    if rrf_k <= 0:
        raise ValueError(f"rrf_k must be greater than 0, got {rrf_k}")

    # Maps doc_id -> copy of the row carrying the accumulated RRF score.
    rrf_scores: dict[str, dict[str, Any]] = {}

    _add_rrf_contributions(rrf_scores, primary_search_results, rrf_k)
    _add_rrf_contributions(rrf_scores, secondary_search_results, rrf_k)

    # Sort the results by RRF score in descending order.
    ranked_results = sorted(
        rrf_scores.values(), key=lambda item: item["distance"], reverse=True
    )

    # A negative stop index would slice "all but the last N" instead of
    # returning nothing, so clamp it.
    return ranked_results[: max(fetch_top_k, 0)]


def _add_rrf_contributions(
    rrf_scores: dict[str, dict[str, Any]],
    search_results: Sequence[RowMapping],
    rrf_k: float,
) -> None:
    """Adds one source's reciprocal-rank contributions to ``rrf_scores`` in place."""
    # NOTE(review): rank 0 goes to the row with the largest "distance";
    # confirm that larger is better for the scores the caller supplies.
    ordered_rows = sorted(
        search_results, key=lambda item: item["distance"], reverse=True
    )
    for rank, row in enumerate(ordered_rows):
        values = list(row.values())
        doc_id = str(values[0])  # first value is doc_id
        row_values = dict(row)
        accumulated = rrf_scores[doc_id]["distance"] if doc_id in rrf_scores else 0.0
        accumulated += 1.0 / (rank + rrf_k)
        row_values["distance"] = accumulated
        # The most recently processed row for a doc_id replaces the stored one.
        rrf_scores[doc_id] = row_values
@dataclass
class HybridSearchConfig(ABC):
    """
    Hybrid search settings for the AlloyDB vector store.

    Queries may be slow when the hybrid search column is missing. For the
    best hybrid search performance, consider creating a TSV column and
    adding a GIN index on it.
    """

    # Name of the TSV column; empty by default.
    tsv_column: Optional[str] = ""

    # Presumably the text-search language/configuration used for the TSV
    # data -- confirm against the query builder.
    tsv_lang: Optional[str] = "pg_catalog.english"

    # Full-text-search query string; empty by default.
    fts_query: Optional[str] = ""

    # Callable that merges the primary and secondary result sets into a
    # single ranked sequence.
    fusion_function: Callable[
        [Sequence[RowMapping], Sequence[RowMapping], Any],
        Sequence[Any],
    ] = weighted_sum_ranking

    # Presumably extra arguments forwarded to ``fusion_function`` -- confirm
    # against the caller. Each instance gets its own dict.
    fusion_function_parameters: dict[str, Any] = field(default_factory=dict)

    # Presumably the number of rows requested from each search before
    # fusion -- confirm against the caller.
    primary_top_k: int = 4
    secondary_top_k: int = 4

    # Name and type of the index associated with the TSV column.
    index_name: str = "langchain_tsv_index"
    index_type: str = "GIN"