Source code for langchain_community.document_transformers.embeddings_redundant_filter
"""Transform documents"""fromtypingimportAny,Callable,List,Sequenceimportnumpyasnpfromlangchain_core.documentsimportBaseDocumentTransformer,Documentfromlangchain_core.embeddingsimportEmbeddingsfromlangchain_core.pydantic_v1importBaseModel,Fieldfromlangchain_community.utils.mathimportcosine_similarityclass_DocumentWithState(Document):"""Wrapper for a document that includes arbitrary state."""state:dict=Field(default_factory=dict)"""State associated with the document."""@classmethoddefis_lc_serializable(cls)->bool:returnFalsedefto_document(self)->Document:"""Convert the DocumentWithState to a Document."""returnDocument(page_content=self.page_content,metadata=self.metadata)@classmethoddeffrom_document(cls,doc:Document)->"_DocumentWithState":"""Create a DocumentWithState from a Document."""ifisinstance(doc,cls):returndocreturncls(page_content=doc.page_content,metadata=doc.metadata)
def get_stateful_documents(
    documents: Sequence[Document],
) -> Sequence[_DocumentWithState]:
    """Wrap every document in a state-carrying document.

    Args:
        documents: The documents to convert.

    Returns:
        A list, in input order, holding the result of
        ``_DocumentWithState.from_document`` for each input document.
    """
    wrap = _DocumentWithState.from_document
    return list(map(wrap, documents))
def _filter_similar_embeddings(
    embedded_documents: List[List[float]], similarity_fn: Callable, threshold: float
) -> List[int]:
    """Filter redundant documents based on the similarity of their embeddings.

    Args:
        embedded_documents: One embedding vector per document.
        similarity_fn: Callable taking two matrices and returning a matrix of
            pairwise scores, where higher values mean greater similarity.
        threshold: Pairs scoring strictly above this value count as redundant.

    Returns:
        The indices of the documents to keep, in ascending order.
    """
    # Nothing to compare; avoids relying on similarity_fn handling empty input.
    if len(embedded_documents) == 0:
        return []

    # Keep only the strict lower triangle so every unordered pair is scored
    # once and the diagonal (self-similarity) is ignored.
    similarity = np.tril(similarity_fn(embedded_documents, embedded_documents), k=-1)
    redundant = np.where(similarity > threshold)
    redundant_stacked = np.column_stack(redundant)
    # Visit redundant pairs from the highest score to the lowest.
    redundant_sorted = np.argsort(similarity[redundant])[::-1]
    included_idxs = set(range(len(embedded_documents)))
    for first_idx, second_idx in redundant_stacked[redundant_sorted]:
        if first_idx in included_idxs and second_idx in included_idxs:
            # Default to dropping the second document of any highly similar
            # pair. Pairs are (row, column) positions in the lower triangle,
            # so the dropped index is the smaller one of the pair.
            included_idxs.remove(second_idx)
    return sorted(included_idxs)


def _get_embeddings_from_stateful_docs(
    embeddings: Embeddings, documents: Sequence[_DocumentWithState]
) -> List[List[float]]:
    """Return one embedding per document, reusing vectors cached in ``state``.

    The cached vectors are used only when *every* document has an
    ``"embedded_doc"`` entry in its ``state``. Otherwise all documents are
    embedded with ``embeddings.embed_documents`` and the results are stored
    under ``state["embedded_doc"]`` on each document.

    Args:
        embeddings: Object providing ``embed_documents``.
        documents: Documents exposing ``page_content`` and a ``state`` dict.

    Returns:
        The embedding vectors, in the same order as ``documents``.
    """
    # Check all documents, not just the first: a partially cached batch would
    # otherwise raise KeyError while reading the cache.
    if len(documents) and all("embedded_doc" in doc.state for doc in documents):
        embedded_documents = [doc.state["embedded_doc"] for doc in documents]
    else:
        embedded_documents = embeddings.embed_documents(
            [d.page_content for d in documents]
        )
        for doc, embedding in zip(documents, embedded_documents):
            doc.state["embedded_doc"] = embedding
    return embedded_documents


async def _aget_embeddings_from_stateful_docs(
    embeddings: Embeddings, documents: Sequence[_DocumentWithState]
) -> List[List[float]]:
    """Async variant: return one embedding per document, reusing cached vectors.

    The cached vectors are used only when *every* document has an
    ``"embedded_doc"`` entry in its ``state``. Otherwise all documents are
    embedded with ``await embeddings.aembed_documents`` and the results are
    stored under ``state["embedded_doc"]`` on each document.

    Args:
        embeddings: Object providing the awaitable ``aembed_documents``.
        documents: Documents exposing ``page_content`` and a ``state`` dict.

    Returns:
        The embedding vectors, in the same order as ``documents``.
    """
    # Check all documents, not just the first: a partially cached batch would
    # otherwise raise KeyError while reading the cache.
    if len(documents) and all("embedded_doc" in doc.state for doc in documents):
        embedded_documents = [doc.state["embedded_doc"] for doc in documents]
    else:
        embedded_documents = await embeddings.aembed_documents(
            [d.page_content for d in documents]
        )
        for doc, embedding in zip(documents, embedded_documents):
            doc.state["embedded_doc"] = embedding
    return embedded_documents


def _filter_cluster_embeddings(
    embedded_documents: List[List[float]],
    num_clusters: int,
    num_closest: int,
    random_state: int,
    remove_duplicates: bool,
) -> List[int]:
    """Filter documents based on proximity of their embeddings to clusters.

    Args:
        embedded_documents: One embedding vector per document.
        num_clusters: Number of clusters to fit.
        num_closest: Number of documents to pick per cluster center.
        random_state: Passed to ``KMeans`` as ``random_state``.
        remove_duplicates: If true, take the ``num_closest`` nearest documents
            of each cluster and drop those already selected for an earlier
            cluster, without replacement. If false, skip already selected
            documents and take the next nearest ones instead.

    Returns:
        Indices of the selected documents, grouped by cluster in cluster
        order and, within a cluster, from nearest to farthest.

    Raises:
        ImportError: If scikit-learn is not installed.
    """
    try:
        from sklearn.cluster import KMeans
    except ImportError as exc:
        raise ImportError(
            "sklearn package not found, please install it with "
            "`pip install scikit-learn`"
        ) from exc

    kmeans = KMeans(n_clusters=num_clusters, random_state=random_state).fit(
        embedded_documents
    )
    # Convert once instead of on every loop iteration.
    vectors = np.asarray(embedded_documents)
    closest_indices = []
    # Mirror of closest_indices for constant-time membership tests.
    already_selected = set()

    # Loop through the number of clusters you have
    for i in range(num_clusters):
        # Get the list of distances from that particular cluster center
        distances = np.linalg.norm(vectors - kmeans.cluster_centers_[i], axis=1)
        # Document indices ordered from nearest to farthest from this center.
        ranked = np.argsort(distances)

        if remove_duplicates:
            # Only add not duplicated vectors.
            closest_indices_sorted = [
                x for x in ranked[:num_closest] if x not in already_selected
            ]
        else:
            # Skip duplicates and add the next closest vector.
            closest_indices_sorted = [
                x for x in ranked if x not in already_selected
            ][:num_closest]

        # Append that position closest indices list
        closest_indices.extend(closest_indices_sorted)
        already_selected.update(closest_indices_sorted)

    return closest_indices
class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
    """Document transformer that removes near-duplicate documents.

    Documents are embedded, compared pairwise with ``similarity_fn``, and one
    document of every pair scoring above ``similarity_threshold`` is dropped.
    """

    embeddings: Embeddings
    """Embeddings used to embed the document contents."""
    similarity_fn: Callable = cosine_similarity
    """Similarity function used to compare documents. It is expected to take
    two matrices (List[List[float]]) and return a matrix of scores in which
    higher values indicate greater similarity."""
    similarity_threshold: float = 0.95
    """Score above which two documents are considered redundant."""

    class Config:
        arbitrary_types_allowed = True

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Filter down documents.

        Args:
            documents: The documents to de-duplicate.
            **kwargs: Accepted but not used by this method.

        Returns:
            The kept documents, as stateful documents, in their original
            relative order.
        """
        wrapped = get_stateful_documents(documents)
        vectors = _get_embeddings_from_stateful_docs(self.embeddings, wrapped)
        kept = _filter_similar_embeddings(
            vectors, self.similarity_fn, self.similarity_threshold
        )
        ordered = sorted(kept)
        return [wrapped[idx] for idx in ordered]
class EmbeddingsClusteringFilter(BaseDocumentTransformer, BaseModel):
    """Document transformer that keeps the documents nearest to cluster centers.

    Document embeddings are clustered with K-means, and for every cluster the
    ``num_closest`` documents nearest to its center are returned.
    """

    embeddings: Embeddings
    """Embeddings used to embed the document contents."""

    num_clusters: int = 5
    """Number of clusters, i.e. groups of documents with similar meaning."""

    num_closest: int = 1
    """How many of the closest vectors to return for each cluster center."""

    random_state: int = 42
    """Value handed to KMeans as ``random_state``, which controls the random
    number generator used to initialize the cluster centroids."""

    sorted: bool = False
    """When false (the default) results are grouped by cluster; when true they
    are returned in the original order of the input documents."""

    remove_duplicates: bool = False
    """When false (the default) a document already picked for another cluster
    is skipped and replaced by the next closest vector in the cluster. When
    true no replacement is done, which can dramatically reduce the number of
    results when clusters overlap a lot."""

    class Config:
        arbitrary_types_allowed = True

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Filter down documents.

        Args:
            documents: The documents to cluster and select from.
            **kwargs: Accepted but not used by this method.

        Returns:
            The selected documents, as stateful documents, ordered by cluster
            or, when ``self.sorted`` is true, by original position.
        """
        wrapped = get_stateful_documents(documents)
        vectors = _get_embeddings_from_stateful_docs(self.embeddings, wrapped)
        picked = _filter_cluster_embeddings(
            vectors,
            self.num_clusters,
            self.num_closest,
            self.random_state,
            self.remove_duplicates,
        )
        # Inside a method, ``sorted`` is the builtin; the field of the same
        # name is only reachable as ``self.sorted``.
        if self.sorted:
            order = sorted(picked)
        else:
            order = picked
        return [wrapped[idx] for idx in order]