class BM25Retriever(BaseRetriever):
    """`BM25` retriever without Elasticsearch."""

    vectorizer: Any = None
    """BM25 vectorizer."""
    docs: List[Document] = Field(repr=False)
    """List of documents."""
    k: int = 4
    """Number of documents to return."""
    preprocess_func: Callable[[str], List[str]] = default_preprocessing_func
    """Preprocessing function to use on the text before BM25 vectorization."""

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
    )

    @classmethod
    def from_texts(
        cls,
        texts: Iterable[str],
        metadatas: Optional[Iterable[dict]] = None,
        ids: Optional[Iterable[str]] = None,
        bm25_params: Optional[Dict[str, Any]] = None,
        preprocess_func: Callable[[str], List[str]] = default_preprocessing_func,
        **kwargs: Any,
    ) -> BM25Retriever:
        """Create a BM25Retriever from a list of texts.

        Args:
            texts: The texts to vectorize. Any iterable is accepted, including
                one-shot iterators such as generators; it is copied into a
                list before use.
            metadatas: Metadata dicts to associate with each text. When
                omitted (or empty), every document gets its own empty dict.
            ids: Ids to associate with each text. When omitted (or empty),
                documents are created without an explicit id.
            bm25_params: Parameters to pass to the BM25 vectorizer.
            preprocess_func: A function to preprocess each text before
                vectorization.
            **kwargs: Any other arguments to pass to the retriever.

        Returns:
            A BM25Retriever instance.

        Raises:
            ImportError: If the ``rank_bm25`` package is not installed.
        """
        try:
            from rank_bm25 import BM25Okapi
        except ImportError as exc:
            raise ImportError(
                "Could not import rank_bm25, please install with `pip install "
                "rank_bm25`."
            ) from exc

        # `texts` is walked three times below (preprocessing, default
        # metadata, document construction). A generator would be exhausted
        # after the first pass and produce an empty `docs` list, so snapshot
        # it into a list up front.
        texts = list(texts)

        texts_processed = [preprocess_func(t) for t in texts]
        bm25_params = bm25_params or {}
        vectorizer = BM25Okapi(texts_processed, **bm25_params)

        # One fresh dict per text so documents never share a metadata object.
        metadatas = metadatas or [{} for _ in texts]

        # NOTE: zip() stops at the shortest input, so surplus metadatas/ids
        # are ignored rather than reported.
        if ids:
            docs = [
                Document(page_content=t, metadata=m, id=i)
                for t, m, i in zip(texts, metadatas, ids)
            ]
        else:
            docs = [
                Document(page_content=t, metadata=m)
                for t, m in zip(texts, metadatas)
            ]

        return cls(
            vectorizer=vectorizer,
            docs=docs,
            preprocess_func=preprocess_func,
            **kwargs,
        )

    @classmethod
    def from_documents(
        cls,
        documents: Iterable[Document],
        *,
        bm25_params: Optional[Dict[str, Any]] = None,
        preprocess_func: Callable[[str], List[str]] = default_preprocessing_func,
        **kwargs: Any,
    ) -> BM25Retriever:
        """Create a BM25Retriever from a list of Documents.

        Args:
            documents: The Documents to vectorize. Must contain at least one
                document.
            bm25_params: Parameters to pass to the BM25 vectorizer.
            preprocess_func: A function to preprocess each text before
                vectorization.
            **kwargs: Any other arguments to pass to the retriever.

        Returns:
            A BM25Retriever instance.

        Raises:
            ValueError: If ``documents`` is empty.
        """
        # Materialize so emptiness can be checked even for one-shot iterators.
        documents = list(documents)
        if not documents:
            # Unpacking zip(*()) into three names would fail with an opaque
            # "not enough values to unpack" ValueError; raise the same
            # exception type with an actionable message instead.
            raise ValueError(
                "Cannot create a BM25Retriever from an empty collection of "
                "documents."
            )

        # Transpose [(content, metadata, id), ...] into three parallel tuples.
        texts, metadatas, ids = zip(
            *((d.page_content, d.metadata, d.id) for d in documents)
        )
        return cls.from_texts(
            texts=texts,
            bm25_params=bm25_params,
            metadatas=metadatas,
            ids=ids,
            preprocess_func=preprocess_func,
            **kwargs,
        )