class SupabaseVectorStore(VectorStore):
    """`Supabase Postgres` vector store.

    It assumes you have the `pgvector` extension installed and a
    `match_documents` (or similar) function. For more details:
    https://integrations.langchain.com/vectorstores?integration_name=SupabaseVectorStore

    You can implement your own `match_documents` function in order to limit
    the search space to a subset of documents based on your own authorization
    or business logic.

    Note that the Supabase Python client does not yet support async
    operations.

    If you'd like to use `max_marginal_relevance_search`, the function named
    by `query_name` must also return the matched embeddings; the docstring of
    `max_marginal_relevance_search` shows an example SQL function.

    Examples:

    .. code-block:: python

        from langchain_community.embeddings.openai import OpenAIEmbeddings
        from langchain_core.documents import Document
        from langchain_community.vectorstores import SupabaseVectorStore
        from supabase.client import create_client

        docs = [
            Document(page_content="foo", metadata={"id": 1}),
        ]
        embeddings = OpenAIEmbeddings()
        supabase_client = create_client("my_supabase_url", "my_supabase_key")
        vector_store = SupabaseVectorStore.from_documents(
            docs,
            embeddings,
            client=supabase_client,
            table_name="documents",
            query_name="match_documents",
            chunk_size=500,
        )

    To load from an existing table:

    .. code-block:: python

        from langchain_community.embeddings.openai import OpenAIEmbeddings
        from langchain_community.vectorstores import SupabaseVectorStore
        from supabase.client import create_client

        embeddings = OpenAIEmbeddings()
        supabase_client = create_client("my_supabase_url", "my_supabase_key")
        vector_store = SupabaseVectorStore(
            client=supabase_client,
            embedding=embeddings,
            table_name="documents",
            query_name="match_documents",
        )
    """

    def __init__(
        self,
        client: supabase.client.Client,
        embedding: Embeddings,
        table_name: str,
        chunk_size: int = 500,
        query_name: Union[str, None] = None,
    ) -> None:
        """Initialize with supabase client.

        Args:
            client: Supabase client used for every table and RPC call.
            embedding: Embedding model used to embed queries.
            table_name: Table holding the documents. A falsy value falls back
                to ``"documents"``.
            chunk_size: Number of rows sent per upsert request. A falsy value
                falls back to 500.
            query_name: Name of the RPC function used for similarity search.
                A falsy value falls back to ``"match_documents"``.

        Raises:
            ImportError: If the ``supabase`` package is not installed.
        """
        try:
            import supabase  # noqa: F401
        except ImportError as exc:
            raise ImportError(
                "Could not import supabase python package. "
                "Please install it with `pip install supabase`."
            ) from exc

        self._client = client
        self._embedding: Embeddings = embedding
        self.table_name = table_name or "documents"
        self.query_name = query_name or "match_documents"
        # According to the SupabaseVectorStore JS implementation, the best
        # chunk size is 500. Though for large datasets it can be too large so
        # it is configurable.
        self.chunk_size = chunk_size or 500

    @property
    def embeddings(self) -> Embeddings:
        """Return the embedding model this store was created with."""
        return self._embedding

    @classmethod
    def from_texts(
        cls: Type["SupabaseVectorStore"],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        client: Optional[supabase.client.Client] = None,
        table_name: Optional[str] = "documents",
        query_name: Union[str, None] = "match_documents",
        chunk_size: int = 500,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> "SupabaseVectorStore":
        """Return VectorStore initialized from texts and embeddings.

        The texts are embedded and upserted into ``table_name`` before the
        store instance is returned.

        Args:
            texts: Texts to embed and store.
            embedding: Embedding model used to embed ``texts``.
            metadatas: Optional metadata, one dict per text.
            client: Supabase client. Required.
            table_name: Target table. Required.
            query_name: Name of the RPC function used for similarity search.
            chunk_size: Number of rows sent per upsert request.
            ids: Optional row ids, one per text. When omitted, a random UUID
                is generated for every text.
            **kwargs: Extra column values merged into every inserted row.

        Returns:
            A store bound to ``client`` and ``table_name``.

        Raises:
            ValueError: If ``client`` or ``table_name`` is missing, or if
                ``ids`` is given with a length different from ``texts``.
        """
        if not client:
            raise ValueError("Supabase client is required.")
        if not table_name:
            raise ValueError("Supabase document table_name is required.")

        # Honour caller-supplied ids; only generate UUIDs when none are given.
        if ids is None:
            ids = [str(uuid.uuid4()) for _ in texts]
        elif len(ids) != len(texts):
            raise ValueError(
                "Number of ids must match number of texts. "
                f"Got {len(ids)} ids and {len(texts)} texts."
            )

        embeddings = embedding.embed_documents(texts)
        docs = cls._texts_to_documents(texts, metadatas)
        cls._add_vectors(
            client, table_name, embeddings, docs, ids, chunk_size, **kwargs
        )

        return cls(
            client=client,
            embedding=embedding,
            table_name=table_name,
            query_name=query_name,
            chunk_size=chunk_size,
        )

    def similarity_search_by_vector_with_relevance_scores(
        self,
        query: List[float],
        k: int,
        filter: Optional[Dict[str, Any]] = None,
        postgrest_filter: Optional[str] = None,
        score_threshold: Optional[float] = None,
    ) -> List[Tuple[Document, float]]:
        """Return the documents most similar to a query vector, with scores.

        Args:
            query: Query embedding.
            k: Maximum number of rows requested from the RPC function.
            filter: Metadata filter handed to the RPC function. An entry of
                the form ``{"key": {"$in": [...]}}`` is translated into a
                PostgREST ``in`` condition on ``metadata->>key`` instead.
            postgrest_filter: Extra PostgREST condition(s), sent as the
                ``and`` query parameter.
            score_threshold: When given, results whose similarity is below
                this value are dropped.

        Returns:
            ``(document, similarity)`` tuples. Rows without content are
            skipped.
        """
        rpc_filter = filter
        if filter:
            rpc_filter = {}
            in_conditions: List[str] = []
            for key, value in filter.items():
                if isinstance(value, dict) and "$in" in value:
                    # PostgREST syntax is `column.in.(v1,v2)`. Values are
                    # double-quoted so reserved characters (commas, dots,
                    # parentheses) are taken literally; backslashes and
                    # double quotes inside a value are backslash-escaped.
                    quoted_values = ",".join(
                        '"{}"'.format(
                            str(item).replace("\\", "\\\\").replace('"', '\\"')
                        )
                        for item in value["$in"]
                    )
                    in_conditions.append(
                        f"metadata->>{key}.in.({quoted_values})"
                    )
                else:
                    # NOTE(review): `$in` entries are kept out of the RPC
                    # filter on the assumption that the RPC matches metadata
                    # by containment, where an operator dict would match
                    # nothing - confirm against `match_args` and the SQL
                    # function.
                    rpc_filter[key] = value

            if in_conditions:
                # Items inside PostgREST's `and=(...)` are comma-separated.
                conditions = [postgrest_filter] if postgrest_filter else []
                postgrest_filter = ",".join(conditions + in_conditions)

        match_documents_params = self.match_args(query, rpc_filter)
        query_builder = self._client.rpc(self.query_name, match_documents_params)

        if postgrest_filter:
            query_builder.params = query_builder.params.set(
                "and", f"({postgrest_filter})"
            )

        query_builder.params = query_builder.params.set("limit", k)

        res = query_builder.execute()

        match_result = [
            (
                Document(
                    metadata=search.get("metadata", {}),  # type: ignore
                    page_content=search.get("content", ""),
                ),
                search.get("similarity", 0.0),
            )
            for search in res.data
            if search.get("content")
        ]

        if score_threshold is not None:
            match_result = [
                (doc, similarity)
                for doc, similarity in match_result
                if similarity >= score_threshold
            ]
            if not match_result:
                warnings.warn(
                    "No relevant docs were retrieved using the relevance score"
                    f" threshold {score_threshold}"
                )

        return match_result

    def similarity_search_by_vector_returning_embeddings(
        self,
        query: List[float],
        k: int,
        filter: Optional[Dict[str, Any]] = None,
        postgrest_filter: Optional[str] = None,
    ) -> List[Tuple[Document, float, np.ndarray]]:
        """Return similar documents together with their stored embeddings.

        Args:
            query: Query embedding.
            k: Maximum number of rows requested from the RPC function.
            filter: Metadata filter handed to the RPC function.
            postgrest_filter: Extra PostgREST condition(s), sent as the
                ``and`` query parameter.

        Returns:
            ``(document, similarity, embedding)`` tuples. Rows without
            content are skipped.
        """
        match_documents_params = self.match_args(query, filter)
        query_builder = self._client.rpc(self.query_name, match_documents_params)

        if postgrest_filter:
            query_builder.params = query_builder.params.set(
                "and", f"({postgrest_filter})"
            )

        query_builder.params = query_builder.params.set("limit", k)

        res = query_builder.execute()

        match_result = [
            (
                Document(
                    metadata=search.get("metadata", {}),  # type: ignore
                    page_content=search.get("content", ""),
                ),
                search.get("similarity", 0.0),
                # The vector column arrives as its string representation
                # ("[0.1,0.2,...]"), so strip the brackets and parse the
                # comma-separated numbers into a numpy array.
                np.fromstring(
                    search.get("embedding", "").strip("[]"), np.float32, sep=","
                ),
            )
            for search in res.data
            if search.get("content")
        ]

        return match_result

    @staticmethod
    def _texts_to_documents(
        texts: Iterable[str],
        metadatas: Optional[Iterable[Dict[Any, Any]]] = None,
    ) -> List[Document]:
        """Return list of Documents from list of texts and metadatas.

        When ``metadatas`` is omitted, every document gets its own empty
        metadata dict, so mutating one document's metadata cannot affect
        another. When it is given, texts and metadatas are paired in order
        and pairing stops at the shorter of the two.
        """
        if metadatas is None:
            return [Document(page_content=text, metadata={}) for text in texts]

        return [
            Document(page_content=text, metadata=metadata)
            for text, metadata in zip(texts, metadatas)
        ]

    @staticmethod
    def _add_vectors(
        client: supabase.client.Client,
        table_name: str,
        vectors: List[List[float]],
        documents: List[Document],
        ids: List[str],
        chunk_size: int,
        **kwargs: Any,
    ) -> List[str]:
        """Add vectors to Supabase table.

        Args:
            client: Supabase client used for the upserts.
            table_name: Target table.
            vectors: Embeddings, one per document.
            documents: Documents providing content and metadata.
            ids: Row ids, one per vector.
            chunk_size: Number of rows sent per upsert request.
            **kwargs: Extra column values merged into every row.

        Returns:
            The ids reported back by the upserts, as strings.

        Raises:
            Exception: If an upsert reports no rows.
        """
        rows: List[Dict[str, Any]] = [
            {
                "id": ids[idx],
                "content": documents[idx].page_content,
                "embedding": embedding,
                "metadata": documents[idx].metadata,  # type: ignore
                **kwargs,
            }
            for idx, embedding in enumerate(vectors)
        ]

        id_list: List[str] = []
        for start in range(0, len(rows), chunk_size):
            chunk = rows[start : start + chunk_size]

            result = client.from_(table_name).upsert(chunk).execute()  # type: ignore

            if not result.data:
                raise Exception("Error inserting: No rows added")

            # VectorStore.add_vectors returns ids as strings
            id_list.extend(
                str(row.get("id")) for row in result.data if row.get("id")
            )

        return id_list

    def max_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND
        diversity among selected documents.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                of diversity among the results with 0 corresponding
                to maximum diversity and 1 to minimum diversity.
                Defaults to 0.5.
            **kwargs: Accepted but not used by this method.

        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        result = self.similarity_search_by_vector_returning_embeddings(
            embedding, fetch_k
        )

        matched_documents = [doc_tuple[0] for doc_tuple in result]
        matched_embeddings = [doc_tuple[2] for doc_tuple in result]

        mmr_selected = maximal_marginal_relevance(
            np.array([embedding], dtype=np.float32),
            matched_embeddings,
            k=k,
            lambda_mult=lambda_mult,
        )

        return [matched_documents[i] for i in mmr_selected]

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND
        diversity among selected documents.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                of diversity among the results with 0 corresponding
                to maximum diversity and 1 to minimum diversity.
                Defaults to 0.5.
            **kwargs: Accepted but not used by this method.

        Returns:
            List of Documents selected by maximal marginal relevance.

        `max_marginal_relevance_search` requires that `query_name` returns
        matched embeddings alongside the match documents. The following
        function demonstrates how to do this:

        ```sql
        CREATE FUNCTION match_documents_embeddings(query_embedding vector(1536),
                                                   match_count int)
            RETURNS TABLE(
                id uuid,
                content text,
                metadata jsonb,
                embedding vector(1536),
                similarity float)
            LANGUAGE plpgsql
            AS $$
            # variable_conflict use_column
        BEGIN
            RETURN query
            SELECT
                id,
                content,
                metadata,
                embedding,
                1 -(docstore.embedding <=> query_embedding) AS similarity
            FROM
                docstore
            ORDER BY
                docstore.embedding <=> query_embedding
            LIMIT match_count;
        END;
        $$;
        ```
        """
        embedding = self._embedding.embed_query(query)
        return self.max_marginal_relevance_search_by_vector(
            embedding, k, fetch_k, lambda_mult=lambda_mult
        )

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> None:
        """Delete by vector IDs.

        Args:
            ids: List of ids to delete.
            **kwargs: Accepted but not used by this method.

        Raises:
            ValueError: If ``ids`` is None.
        """
        if ids is None:
            raise ValueError("No ids provided to delete.")

        # TODO: Check if this can be done in bulk
        for row_id in ids:
            self._client.from_(self.table_name).delete().eq("id", row_id).execute()