class SearchType(str, Enum):
    """Enumerator of the types of search to perform.

    Members are also plain strings (the class mixes in ``str``), so
    ``SearchType.mmr == "mmr"`` holds and a member can be looked up from its
    string value with ``SearchType("mmr")``.
    """

    # Rank candidates by similarity to the query embedding.
    similarity = "similarity"
    # Maximal marginal relevance search.
    mmr = "mmr"
class DocArrayRetriever(BaseRetriever):
    """`DocArray Document Indices` retriever.

    Currently, it supports 5 backends:
    InMemoryExactNNIndex, HnswDocumentIndex, QdrantDocumentIndex,
    ElasticDocIndex, and WeaviateDocumentIndex.

    Args:
        index: One of the above-mentioned index instances
        embeddings: Embedding model to represent text as vectors
        search_field: Field to consider for searching in the documents.
            Should be an embedding/vector/tensor.
        content_field: Field that represents the main content in your document
            schema. Will be used as a `page_content`. The remaining scalar
            fields (str, int, float, bool) will go into `metadata`.
        search_type: Type of search to perform (similarity / mmr)
        filters: Filters applied for document retrieval.
        top_k: Number of documents to return
    """

    index: Any = None
    embeddings: Embeddings
    search_field: str
    content_field: str
    search_type: SearchType = SearchType.similarity
    top_k: int = 1
    filters: Optional[Any] = None

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
    )

    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun,
    ) -> List[Document]:
        """Get documents relevant for a query.

        Args:
            query: string to find relevant documents for
            run_manager: callback manager for this retriever run (not used
                by this implementation)

        Returns:
            List of relevant documents

        Raises:
            ValueError: If ``search_type`` is neither similarity nor mmr.
        """
        query_emb = np.array(self.embeddings.embed_query(query))

        if self.search_type == SearchType.similarity:
            results = self._similarity_search(query_emb)
        elif self.search_type == SearchType.mmr:
            results = self._mmr_search(query_emb)
        else:
            raise ValueError(
                f"Search type {self.search_type} does not exist. "
                f"Choose either 'similarity' or 'mmr'."
            )

        return results

    def _index_is(self, backend_name: str) -> bool:
        """Tell whether ``self.index`` is an instance of a docarray backend.

        The backend class is imported lazily from ``docarray.index``. If it
        cannot be imported, the index cannot be an instance of it, so the
        answer is ``False`` rather than an error.

        Args:
            backend_name: Class name to look up in ``docarray.index``.

        Returns:
            True if the index is an instance of the named backend class.
        """
        try:
            import docarray.index as docarray_backends

            backend_cls = getattr(docarray_backends, backend_name)
        except (ImportError, AttributeError):
            return False
        return isinstance(self.index, backend_cls)

    @staticmethod
    def _field_value(doc: Union[Dict[str, Any], Any], name: str) -> Any:
        """Read field ``name`` from a document that is a dict or an object."""
        return doc[name] if isinstance(doc, dict) else getattr(doc, name)

    def _search(
        self, query_emb: np.ndarray, top_k: int
    ) -> List[Union[Dict[str, Any], Any]]:
        """Perform a search using the query embedding and return top_k documents.

        Args:
            query_emb: Query represented as an embedding
            top_k: Number of documents to return

        Returns:
            A list of top_k documents matching the query
        """
        is_weaviate = self._index_is("WeaviateDocumentIndex")

        # The Weaviate backend is queried with an empty search field.
        # NOTE(review): presumably Weaviate resolves the vector field on its
        # own; confirm against the docarray documentation.
        search_field = "" if is_weaviate else self.search_field

        if not self.filters:
            # Plain vector search; no query builder needed.
            return self.index.find(
                query=query_emb, search_field=search_field, limit=top_k
            ).documents

        # Each backend expects the filter under a different keyword.
        if is_weaviate:
            filter_args = {"where_filter": self.filters}
        elif self._index_is("ElasticDocIndex"):
            filter_args = {"query": self.filters}
        else:
            filter_args = {"filter_query": self.filters}

        query = (
            self.index.build_query()  # get empty query object
            .find(
                query=query_emb, search_field=search_field
            )  # add vector similarity search
            .filter(**filter_args)  # add filter search
            .build(limit=top_k)  # build the query
        )

        # execute the combined query and return the results
        docs = self.index.execute_query(query)
        if hasattr(docs, "documents"):
            # Some results wrap the hits in a `.documents` attribute.
            docs = docs.documents
        return docs[:top_k]

    def _similarity_search(self, query_emb: np.ndarray) -> List[Document]:
        """Perform a similarity search.

        Args:
            query_emb: Query represented as an embedding

        Returns:
            A list of documents most similar to the query
        """
        docs = self._search(query_emb=query_emb, top_k=self.top_k)
        return [self._docarray_to_langchain_doc(doc) for doc in docs]

    def _mmr_search(self, query_emb: np.ndarray) -> List[Document]:
        """Perform a maximal marginal relevance (mmr) search.

        Fetches a pool of candidates, then lets
        ``maximal_marginal_relevance`` pick ``top_k`` of them.

        Args:
            query_emb: Query represented as an embedding

        Returns:
            A list of diverse documents related to the query
        """
        # Fetch at least 20 candidates, but never fewer than `top_k`;
        # otherwise a `top_k` above 20 could never be satisfied.
        fetch_k = max(20, self.top_k)
        docs = self._search(query_emb=query_emb, top_k=fetch_k)

        candidate_embeddings = [
            self._field_value(doc, self.search_field) for doc in docs
        ]
        mmr_selected = maximal_marginal_relevance(
            query_emb,
            candidate_embeddings,
            k=self.top_k,
        )
        return [self._docarray_to_langchain_doc(docs[idx]) for idx in mmr_selected]

    def _docarray_to_langchain_doc(self, doc: Union[Dict[str, Any], Any]) -> Document:
        """Convert a DocArray document (which also might be a dict)
        to a langchain document format.

        DocArray document can contain arbitrary fields, so the mapping is done
        in the following way:

        page_content <-> content_field
        metadata <-> all other fields excluding
            tensors and embeddings (so float, int, string)

        Args:
            doc: DocArray document

        Returns:
            Document in langchain format

        Raises:
            ValueError: If the document doesn't contain the content field
        """
        fields = doc.keys() if isinstance(doc, dict) else get_fields(doc)

        if self.content_field not in fields:
            raise ValueError(
                f"Document does not contain the content field - {self.content_field}."
            )

        lc_doc = Document(page_content=self._field_value(doc, self.content_field))

        for name in fields:
            value = self._field_value(doc, name)
            # Only scalar values are kept as metadata; the content field is
            # already stored as page_content.
            if (
                isinstance(value, (str, int, float, bool))
                and name != self.content_field
            ):
                lc_doc.metadata[name] = value

        return lc_doc