"""Internal utilities for the in memory implementation of VectorStore.These are part of a private API, and users should not use them directlyas they can change without notice."""from__future__importannotationsimportloggingfromtypingimportTYPE_CHECKING,List,UnionifTYPE_CHECKING:importnumpyasnpMatrix=Union[List[List[float]],List[np.ndarray],np.ndarray]logger=logging.getLogger(__name__)def_cosine_similarity(X:Matrix,Y:Matrix)->np.ndarray:"""Row-wise cosine similarity between two equal-width matrices. Args: X: A matrix of shape (n, m). Y: A matrix of shape (k, m). Returns: A matrix of shape (n, k) where each element (i, j) is the cosine similarity between the ith row of X and the jth row of Y. Raises: ValueError: If the number of columns in X and Y are not the same. ImportError: If numpy is not installed. """try:importnumpyasnpexceptImportErrorase:raiseImportError("cosine_similarity requires numpy to be installed. ""Please install numpy with `pip install numpy`.")fromeiflen(X)==0orlen(Y)==0:returnnp.array([])X=np.array(X)Y=np.array(Y)ifX.shape[1]!=Y.shape[1]:raiseValueError(f"Number of columns in X and Y must be the same. X has shape {X.shape} "f"and Y has shape {Y.shape}.")try:importsimsimdassimdX=np.array(X,dtype=np.float32)Y=np.array(Y,dtype=np.float32)Z=1-np.array(simd.cdist(X,Y,metric="cosine"))returnZexceptImportError:logger.debug("Unable to import simsimd, defaulting to NumPy implementation. If you want ""to use simsimd please install with `pip install simsimd`.")X_norm=np.linalg.norm(X,axis=1)Y_norm=np.linalg.norm(Y,axis=1)# Ignore divide by zero errors run time warnings as those are handled below.withnp.errstate(divide="ignore",invalid="ignore"):similarity=np.dot(X,Y.T)/np.outer(X_norm,Y_norm)similarity[np.isnan(similarity)|np.isinf(similarity)]=0.0returnsimilarity
def maximal_marginal_relevance(
    query_embedding: np.ndarray,
    embedding_list: list,
    lambda_mult: float = 0.5,
    k: int = 4,
) -> List[int]:
    """Calculate maximal marginal relevance.

    Greedily picks embeddings: the first pick is the one most similar to the
    query, and each later pick maximizes
    ``lambda_mult * sim_to_query - (1 - lambda_mult) * max_sim_to_selected``.

    Args:
        query_embedding: The query embedding. A 1-D input is treated as a
            single row; for a 2-D input only the first row is used as the
            query. Array-likes (e.g. plain lists) are accepted.
        embedding_list: A list of embeddings.
        lambda_mult: The lambda parameter for MMR. Default is 0.5.
        k: The number of embeddings to return. Default is 4.

    Returns:
        A list of indices of the embeddings to return, in selection order.
        It holds at most ``min(k, len(embedding_list))`` entries and is empty
        when that bound is not positive. It may be shorter if no remaining
        candidate has a comparable (non-NaN) score.

    Raises:
        ImportError: If numpy is not installed.
    """
    try:
        import numpy as np
    except ImportError as e:
        raise ImportError(
            "maximal_marginal_relevance requires numpy to be installed. "
            "Please install numpy with `pip install numpy`."
        ) from e

    target_count = min(k, len(embedding_list))
    if target_count <= 0:
        return []

    # Accept list-like queries as well; this is a no-op for ndarrays.
    query_embedding = np.asarray(query_embedding)
    if query_embedding.ndim == 1:
        query_embedding = np.expand_dims(query_embedding, axis=0)

    similarity_to_query = _cosine_similarity(query_embedding, embedding_list)[0]
    most_similar = int(np.argmax(similarity_to_query))
    idxs = [most_similar]
    # Set mirror of ``idxs`` so the membership test in the inner loop is O(1).
    chosen = {most_similar}
    selected = np.array([embedding_list[most_similar]])

    while len(idxs) < target_count:
        best_score = -np.inf
        idx_to_add = -1
        similarity_to_selected = _cosine_similarity(embedding_list, selected)
        for i, query_score in enumerate(similarity_to_query):
            if i in chosen:
                continue
            # Penalize by the closest already-selected embedding.
            redundant_score = max(similarity_to_selected[i])
            equation_score = (
                lambda_mult * query_score - (1 - lambda_mult) * redundant_score
            )
            if equation_score > best_score:
                best_score = equation_score
                idx_to_add = i
        if idx_to_add == -1:
            # No candidate beat -inf (e.g. every score is NaN). Stop rather
            # than emit the -1 sentinel, which would alias the last embedding.
            break
        idxs.append(idx_to_add)
        chosen.add(idx_to_add)
        selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
    return idxs