def hash_text(text: str) -> str:
    """Hash a text using SHA256.

    Args:
        text: Text to hash.

    Returns:
        Hexadecimal SHA256 digest of the UTF-8 encoded text.
    """
    # hexdigest() already returns a str, so no extra str() conversion is needed.
    return hashlib.sha256(text.encode("utf-8")).hexdigest()
def create_index(
    contexts: List[str],
    index: Any,
    embeddings: Embeddings,
    sparse_encoder: Any,
    ids: Optional[List[str]] = None,
    metadatas: Optional[List[dict]] = None,
    namespace: Optional[str] = None,
) -> None:
    """Create an index from a list of contexts.

    It modifies the index argument in-place!

    Args:
        contexts: List of contexts to embed.
        index: Index to use.
        embeddings: Embeddings model to use.
        sparse_encoder: Sparse encoder to use.
        ids: List of ids to use for the documents.
        metadatas: List of metadata to use for the documents.
        namespace: Namespace value for index partition.
    """
    batch_size = 32
    batch_starts = range(0, len(contexts), batch_size)
    try:
        # Wrap the iterator in a progress bar when tqdm is installed;
        # silently fall back to the plain range otherwise.
        from tqdm.auto import tqdm

        batch_starts = tqdm(batch_starts)
    except ImportError:
        pass

    if ids is None:
        # Derive deterministic ids from the text content itself.
        ids = [hash_text(context) for context in contexts]

    for start in batch_starts:
        # Slice out the current batch; the last batch may be short.
        end = min(start + batch_size, len(contexts))
        context_batch = contexts[start:end]
        batch_ids = ids[start:end]
        if metadatas:
            metadata_batch = metadatas[start:end]
        else:
            metadata_batch = [{} for _ in context_batch]
        # Keep the passage text in metadata under the "context" key so it can
        # be recovered at query time.
        meta = [
            {"context": context, **metadata}
            for context, metadata in zip(context_batch, metadata_batch)
        ]
        # Dense and sparse representations for the batch.
        dense_embeds = embeddings.embed_documents(context_batch)
        sparse_embeds = sparse_encoder.encode_documents(context_batch)
        for sparse in sparse_embeds:
            sparse["values"] = [float(value) for value in sparse["values"]]
        # Assemble the upsert payload for this batch.
        vectors = [
            {
                "id": doc_id,
                "sparse_values": sparse,
                "values": dense,
                "metadata": metadata,
            }
            for doc_id, sparse, dense, metadata in zip(
                batch_ids, sparse_embeds, dense_embeds, meta
            )
        ]
        # Upload the documents to the hybrid index.
        index.upsert(vectors, namespace=namespace)
class PineconeHybridSearchRetriever(BaseRetriever):
    """`Pinecone Hybrid Search` retriever."""

    embeddings: Embeddings
    """Embeddings model to use."""
    sparse_encoder: Any
    """Sparse encoder to use."""
    index: Any
    """Pinecone index to use."""
    top_k: int = 4
    """Number of documents to return."""
    alpha: float = 0.5
    """Alpha value for hybrid search."""
    namespace: Optional[str] = None
    """Namespace value for index partition."""

    class Config:
        arbitrary_types_allowed = True
        extra = "forbid"

    @pre_init
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that api key and python package exists in environment."""
        try:
            from pinecone_text.hybrid import hybrid_convex_scale  # noqa:F401
            from pinecone_text.sparse.base_sparse_encoder import (
                BaseSparseEncoder,  # noqa:F401
            )
        except ImportError as e:
            # Chain the original ImportError so the root cause stays visible.
            raise ImportError(
                "Could not import pinecone_text python package. "
                "Please install it with `pip install pinecone_text`."
            ) from e
        return values

    def _get_relevant_documents(
        self,
        query: str,
        *,
        run_manager: CallbackManagerForRetrieverRun,
        **kwargs: Any,
    ) -> List[Document]:
        """Return documents relevant to the query using hybrid search.

        Args:
            query: Query text to search for.
            run_manager: Callback manager for the retriever run.
            **kwargs: Additional keyword arguments passed to ``index.query``.

        Returns:
            List of relevant documents, with the passage text as
            ``page_content`` and the remaining Pinecone metadata (plus the
            match score, when available) as ``metadata``.
        """
        from pinecone_text.hybrid import hybrid_convex_scale

        sparse_vec = self.sparse_encoder.encode_queries(query)
        # convert the question into a dense vector
        dense_vec = self.embeddings.embed_query(query)
        # scale dense/sparse contributions with alpha via hybrid_convex_scale
        dense_vec, sparse_vec = hybrid_convex_scale(dense_vec, sparse_vec, self.alpha)
        sparse_vec["values"] = [float(s1) for s1 in sparse_vec["values"]]
        # query pinecone with the query parameters
        result = self.index.query(
            vector=dense_vec,
            sparse_vector=sparse_vec,
            top_k=self.top_k,
            include_metadata=True,
            namespace=self.namespace,
            **kwargs,
        )
        final_result = []
        for res in result["matches"]:
            # "context" holds the original passage text (see create_index);
            # everything else stays in the document metadata.
            context = res["metadata"].pop("context")
            metadata = res["metadata"]
            if "score" not in metadata and "score" in res:
                metadata["score"] = res["score"]
            final_result.append(Document(page_content=context, metadata=metadata))
        return final_result