Source code for langchain_community.vectorstores.lancedb
from __future__ import annotations

import base64
import os
import uuid
import warnings
from typing import Any, Callable, Dict, Iterable, List, Optional, Type

import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.utils import guard_import
from langchain_core.vectorstores import VectorStore

from langchain_community.vectorstores.utils import maximal_marginal_relevance

DEFAULT_K = 4  # Number of Documents to return.
def to_lance_filter(filter: Dict[str, str]) -> str:
    """Converts a dict filter to a LanceDB filter string."""
    return " AND ".join([f"{k} = '{v}'" for k, v in filter.items()])
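# Illustrative only: given a metadata-style dict, to_lance_filter produces a
# SQL predicate that LanceDB's `where` clause accepts. For example:
#
#     to_lance_filter({"source": "news", "author": "jane"})
#     # -> "source = 'news' AND author = 'jane'"
#
# Note that every value is quoted as a string, so numeric comparisons need a
# hand-written SQL filter string instead of a dict.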
class LanceDB(VectorStore):
    """`LanceDB` vector store.

    To use, you should have the ``lancedb`` python package installed.
    You can install it with ``pip install lancedb``.

    Args:
        connection: LanceDB connection to use. If not provided, a new connection
                    will be created.
        embedding: Embedding to use for the vectorstore.
        vector_key: Key to use for the vector in the database. Defaults to ``vector``.
        id_key: Key to use for the id in the database. Defaults to ``id``.
        text_key: Key to use for the text in the database. Defaults to ``text``.
        table_name: Name of the table to use. Defaults to ``vectorstore``.
        api_key: API key to use for LanceDB cloud database.
        region: Region to use for LanceDB cloud database.
        mode: Mode to use for adding data to the table. Valid values are
              ``append`` and ``overwrite``. Defaults to ``overwrite``.

    Example:
        .. code-block:: python

            vectorstore = LanceDB(uri='/lancedb', embedding=embedding_function)
            vectorstore.add_texts(['text1', 'text2'])
            result = vectorstore.similarity_search('text1')
    """
    def __init__(
        self,
        connection: Optional[Any] = None,
        embedding: Optional[Embeddings] = None,
        uri: Optional[str] = "/tmp/lancedb",
        vector_key: Optional[str] = "vector",
        id_key: Optional[str] = "id",
        text_key: Optional[str] = "text",
        table_name: Optional[str] = "vectorstore",
        api_key: Optional[str] = None,
        region: Optional[str] = None,
        mode: Optional[str] = "overwrite",
        table: Optional[Any] = None,
        distance: Optional[str] = "l2",
        reranker: Optional[Any] = None,
        relevance_score_fn: Optional[Callable[[float], float]] = None,
        limit: int = DEFAULT_K,
    ):
        """Initialize with Lance DB vectorstore"""
        lancedb = guard_import("lancedb")
        lancedb.remote.table = guard_import("lancedb.remote.table")
        self._embedding = embedding
        self._vector_key = vector_key
        self._id_key = id_key
        self._text_key = text_key
        self.api_key = api_key or os.getenv("LANCE_API_KEY") if api_key != "" else None
        self.region = region
        self.mode = mode
        self.distance = distance
        self.override_relevance_score_fn = relevance_score_fn
        self.limit = limit
        self._fts_index = None

        if isinstance(reranker, lancedb.rerankers.Reranker):
            self._reranker = reranker
        elif reranker is None:
            self._reranker = None
        else:
            raise ValueError(
                "`reranker` has to be a lancedb.rerankers.Reranker object."
            )

        if isinstance(uri, str) and self.api_key is None:
            if uri.startswith("db://"):
                raise ValueError("API key is required for LanceDB cloud.")

        if self._embedding is None:
            raise ValueError("embedding object should be provided")

        if isinstance(connection, lancedb.db.LanceDBConnection):
            self._connection = connection
        elif isinstance(connection, (str, lancedb.db.LanceTable)):
            raise ValueError(
                "`connection` has to be a lancedb.db.LanceDBConnection object. "
                "`lancedb.db.LanceTable` is deprecated."
            )
        else:
            if self.api_key is None:
                self._connection = lancedb.connect(uri)
            else:
                if isinstance(uri, str):
                    if uri.startswith("db://"):
                        self._connection = lancedb.connect(
                            uri, api_key=self.api_key, region=self.region
                        )
                    else:
                        self._connection = lancedb.connect(uri)
                        warnings.warn(
                            "api key provided with local uri. "
                            "The data will be stored locally"
                        )

        if table is not None:
            try:
                assert isinstance(
                    table, (lancedb.db.LanceTable, lancedb.remote.table.RemoteTable)
                )
                self._table = table
                self._table_name = (
                    table.name if hasattr(table, "name") else "remote_table"
                )
            except AssertionError:
                raise ValueError(
                    "`table` has to be a lancedb.db.LanceTable or "
                    "lancedb.remote.table.RemoteTable object."
                )
        else:
            self._table = self.get_table(table_name, set_default=True)
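    # Usage sketch (not part of the class). It assumes the optional
    # `langchain_openai` package for the embedding object; any Embeddings
    # implementation works the same way, and the uri/api_key/region values
    # below are illustrative:
    #
    #     from langchain_openai import OpenAIEmbeddings
    #
    #     # Local table under /tmp/lancedb (the default uri):
    #     db = LanceDB(embedding=OpenAIEmbeddings(), table_name="docs")
    #
    #     # LanceDB Cloud: a "db://" uri plus api_key (or LANCE_API_KEY env var):
    #     cloud_db = LanceDB(
    #         uri="db://my-project",
    #         api_key="sk-...",
    #         region="us-east-1",
    #         embedding=OpenAIEmbeddings(),
    #     )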
    def results_to_docs(self, results: Any, score: bool = False) -> Any:
        columns = results.schema.names

        if "_distance" in columns:
            score_col = "_distance"
        elif "_relevance_score" in columns:
            score_col = "_relevance_score"
        else:
            score_col = None

        # Check if 'metadata' is in the columns
        has_metadata = "metadata" in columns

        if score_col is None or not score:
            return [
                Document(
                    page_content=results[self._text_key][idx].as_py(),
                    metadata=results["metadata"][idx].as_py() if has_metadata else {},
                )
                for idx in range(len(results))
            ]
        elif score_col and score:
            return [
                (
                    Document(
                        page_content=results[self._text_key][idx].as_py(),
                        metadata=results["metadata"][idx].as_py()
                        if has_metadata
                        else {},
                    ),
                    results[score_col][idx].as_py(),
                )
                for idx in range(len(results))
            ]
    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Turn texts into embeddings and add them to the database.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of ids to associate with the texts.

        Returns:
            List of ids of the added texts.
        """
        docs = []
        ids = ids or [str(uuid.uuid4()) for _ in texts]
        embeddings = self._embedding.embed_documents(list(texts))  # type: ignore
        for idx, text in enumerate(texts):
            embedding = embeddings[idx]
            metadata = metadatas[idx] if metadatas else {"id": ids[idx]}
            docs.append(
                {
                    self._vector_key: embedding,
                    self._id_key: ids[idx],
                    self._text_key: text,
                    "metadata": metadata,
                }
            )

        tbl = self.get_table()

        if tbl is None:
            tbl = self._connection.create_table(self._table_name, data=docs)
            self._table = tbl
        else:
            if self.api_key is None:
                tbl.add(docs, mode=self.mode)
            else:
                tbl.add(docs)

        self._fts_index = None

        return ids
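    # Usage sketch: metadatas and ids are positionally aligned with texts; if
    # ids are omitted, UUID4 strings are generated (values below illustrative):
    #
    #     ids = db.add_texts(
    #         ["alpha", "beta"],
    #         metadatas=[{"source": "a.txt"}, {"source": "b.txt"}],
    #         ids=["doc-1", "doc-2"],
    #     )
    #
    # The first call creates the table if it does not exist yet; later calls
    # add rows per `mode` and invalidate any cached full-text index.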
    def get_table(
        self, name: Optional[str] = None, set_default: Optional[bool] = False
    ) -> Any:
        """
        Fetches a table object from the database.

        Args:
            name (str, optional): The name of the table to fetch. Defaults to None
                                  and fetches the current table object.
            set_default (bool, optional): Sets the fetched table as the default
                                          table. Defaults to False.

        Returns:
            Any: The fetched table object, or None if the table is not found
                 in the database.
        """
        if name is not None:
            if set_default:
                self._table_name = name
                _name = self._table_name
            else:
                _name = name
        else:
            _name = self._table_name

        try:
            return self._connection.open_table(_name)
        except Exception:
            return None
    def create_index(
        self,
        col_name: Optional[str] = None,
        vector_col: Optional[str] = None,
        num_partitions: Optional[int] = 256,
        num_sub_vectors: Optional[int] = 96,
        index_cache_size: Optional[int] = None,
        metric: Optional[str] = "L2",
        name: Optional[str] = None,
    ) -> None:
        """
        Create a scalar (for non-vector cols) or a vector index on a table.

        Make sure your vector column has enough data before creating an index on it.

        Args:
            vector_col: Provide if you want to create an index on a vector column.
            col_name: Provide if you want to create an index on a non-vector column.
            metric: Metric to use for the vector index. Defaults to 'L2'.
                    Choice of metrics: 'L2', 'dot', 'cosine'.
            num_partitions: Number of partitions to use for the index.
                            Defaults to 256.
            num_sub_vectors: Number of sub-vectors to use for the index.
                             Defaults to 96.
            index_cache_size: Size of the index cache. Defaults to None.
            name: Name of the table to create the index on. Defaults to None.

        Returns:
            None
        """
        tbl = self.get_table(name)

        if vector_col:
            tbl.create_index(
                metric=metric,
                vector_column_name=vector_col,
                num_partitions=num_partitions,
                num_sub_vectors=num_sub_vectors,
                index_cache_size=index_cache_size,
            )
        elif col_name:
            tbl.create_scalar_index(col_name)
        else:
            raise ValueError("Provide either vector_col or col_name")
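    # Usage sketch: a vector index on the default vector column, plus a scalar
    # index for faster filter pushdown (parameter values are illustrative):
    #
    #     db.create_index(vector_col="vector", metric="cosine",
    #                     num_partitions=256, num_sub_vectors=96)
    #     db.create_index(col_name="id")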
    def encode_image(self, uri: str) -> str:
        """Get base64 string from image URI."""
        with open(uri, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")
    def add_images(
        self,
        uris: List[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more images through the embeddings and add to the vectorstore.

        Args:
            uris (List[str]): File paths to the images.
            metadatas (Optional[List[dict]], optional): Optional list of metadatas.
            ids (Optional[List[str]], optional): Optional list of IDs.

        Returns:
            List[str]: List of IDs of the added images.
        """
        tbl = self.get_table()

        # Map from uris to b64 encoded strings
        b64_texts = [self.encode_image(uri=uri) for uri in uris]
        # Populate IDs
        if ids is None:
            ids = [str(uuid.uuid4()) for _ in uris]
        embeddings = None
        # Set embeddings
        if self._embedding is not None and hasattr(self._embedding, "embed_image"):
            embeddings = self._embedding.embed_image(uris=uris)
        else:
            raise ValueError(
                "embedding object should be provided and must have embed_image method."
            )

        data = []
        for idx, emb in enumerate(embeddings):
            metadata = metadatas[idx] if metadatas else {"id": ids[idx]}
            data.append(
                {
                    self._vector_key: emb,
                    self._id_key: ids[idx],
                    self._text_key: b64_texts[idx],
                    "metadata": metadata,
                }
            )

        if tbl is None:
            tbl = self._connection.create_table(self._table_name, data=data)
            self._table = tbl
        else:
            tbl.add(data)

        return ids
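    # Usage sketch, assuming an embedding object that exposes an `embed_image`
    # method (e.g. OpenCLIPEmbeddings from langchain_experimental):
    #
    #     ids = db.add_images(
    #         uris=["./img/cat.png", "./img/dog.png"],
    #         metadatas=[{"label": "cat"}, {"label": "dog"}],
    #     )
    #
    # The base64-encoded image bytes are stored in the text column, so search
    # results return them as `page_content`.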
    def _query(
        self,
        query: Any,
        k: Optional[int] = None,
        filter: Optional[Any] = None,
        name: Optional[str] = None,
        **kwargs: Any,
    ) -> Any:
        if k is None:
            k = self.limit
        tbl = self.get_table(name)
        if isinstance(filter, dict):
            filter = to_lance_filter(filter)

        prefilter = kwargs.get("prefilter", False)
        query_type = kwargs.get("query_type", "vector")

        if metrics := kwargs.get("metrics"):
            lance_query = (
                tbl.search(query=query, vector_column_name=self._vector_key)
                .limit(k)
                .metric(metrics)
                .where(filter, prefilter=prefilter)
            )
        else:
            lance_query = (
                tbl.search(query=query, vector_column_name=self._vector_key)
                .limit(k)
                .where(filter, prefilter=prefilter)
            )
        if query_type == "hybrid" and self._reranker is not None:
            lance_query.rerank(reranker=self._reranker)

        docs = lance_query.to_arrow()
        if len(docs) == 0:
            warnings.warn("No results found for the query.")
        return docs

    def _select_relevance_score_fn(self) -> Callable[[float], float]:
        """
        The 'correct' relevance function may differ depending on a few things,
        including:
        - the distance / similarity metric used by the VectorStore
        - the scale of your embeddings (OpenAI's are unit normed. Many others
          are not!)
        - embedding dimensionality
        - etc.
        """
        if self.override_relevance_score_fn:
            return self.override_relevance_score_fn

        if self.distance == "cosine":
            return self._cosine_relevance_score_fn
        elif self.distance == "l2":
            return self._euclidean_relevance_score_fn
        elif self.distance == "ip":
            return self._max_inner_product_relevance_score_fn
        else:
            raise ValueError(
                "No supported normalization function"
                f" for distance metric of type: {self.distance}."
                " Consider providing relevance_score_fn to the LanceDB constructor."
            )
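    # Note: the built-in score functions come from the VectorStore base class
    # (in current langchain_core the "l2" branch maps a distance d to
    # 1 - d / sqrt(2), which assumes unit-normed embeddings). To bypass them,
    # pass your own callable at construction time (illustrative):
    #
    #     db = LanceDB(embedding=emb, relevance_score_fn=lambda d: 1.0 / (1.0 + d))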
    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: Optional[int] = None,
        filter: Optional[Dict[str, str]] = None,
        name: Optional[str] = None,
        **kwargs: Any,
    ) -> Any:
        """
        Return documents most similar to the query vector.
        """
        if k is None:
            k = self.limit
        res = self._query(embedding, k, filter=filter, name=name, **kwargs)
        return self.results_to_docs(res, score=kwargs.pop("score", False))
    def similarity_search_by_vector_with_relevance_scores(
        self,
        embedding: List[float],
        k: Optional[int] = None,
        filter: Optional[Dict[str, str]] = None,
        name: Optional[str] = None,
        **kwargs: Any,
    ) -> Any:
        """
        Return documents most similar to the query vector with relevance scores.
        """
        if k is None:
            k = self.limit
        relevance_score_fn = self._select_relevance_score_fn()
        # Pass filter and name through so they are not silently dropped.
        docs_and_scores = self.similarity_search_by_vector(
            embedding, k, filter=filter, name=name, score=True, **kwargs
        )
        return [
            (doc, relevance_score_fn(float(score))) for doc, score in docs_and_scores
        ]
    def similarity_search_with_score(
        self,
        query: str,
        k: Optional[int] = None,
        filter: Optional[Dict[str, str]] = None,
        **kwargs: Any,
    ) -> Any:
        """Return documents most similar to the query with relevance scores."""
        if k is None:
            k = self.limit
        score = kwargs.get("score", True)
        name = kwargs.get("name", None)
        query_type = kwargs.get("query_type", "vector")

        if self._embedding is None:
            raise ValueError("search needs an embedding function to be specified.")

        if query_type == "fts" or query_type == "hybrid":
            if self.api_key is not None:
                raise NotImplementedError(
                    "Full-text / hybrid search is not supported in LanceDB Cloud yet."
                )
            # Build the full-text index lazily; it is invalidated whenever new
            # rows are added.
            if self._fts_index is None:
                tbl = self.get_table(name)
                self._fts_index = tbl.create_fts_index(self._text_key, replace=True)

            if query_type == "hybrid":
                embedding = self._embedding.embed_query(query)
                _query = (embedding, query)
            else:
                _query = query  # type: ignore

            res = self._query(_query, k, filter=filter, name=name, **kwargs)
            return self.results_to_docs(res, score=score)
        else:
            embedding = self._embedding.embed_query(query)
            res = self._query(embedding, k, filter=filter, **kwargs)
            return self.results_to_docs(res, score=score)
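    # Usage sketch for full-text and hybrid search against a local table (both
    # paths build an FTS index on the text column on first use):
    #
    #     hits = db.similarity_search_with_score("lance", query_type="fts")
    #     hits = db.similarity_search_with_score("lance", query_type="hybrid")
    #
    # For hybrid search, pass a lancedb reranker at construction time, e.g.
    # `reranker=lancedb.rerankers.LinearCombinationReranker()` (assuming a
    # lancedb version that ships this class).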
    def similarity_search(
        self,
        query: str,
        k: Optional[int] = None,
        name: Optional[str] = None,
        filter: Optional[Any] = None,
        fts: Optional[bool] = False,
        **kwargs: Any,
    ) -> List[Document]:
        """Return documents most similar to the query.

        Args:
            query: String to query the vectorstore with.
            k: Number of documents to return.
            filter (Optional[Dict]): Optional filter arguments.
            sql_filter (Optional[str]): SQL filter to apply to the query.
            prefilter (Optional[bool]): Whether to apply the filter prior to
                                        the vector search.

        Raises:
            ValueError: If the specified table is not found in the database.

        Returns:
            List of documents most similar to the query.
        """
        res = self.similarity_search_with_score(
            query=query, k=k, name=name, filter=filter, fts=fts, score=False, **kwargs
        )
        return res
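    # Usage sketch: a dict filter is converted to a SQL predicate via
    # to_lance_filter; prefilter=True applies it before the vector search:
    #
    #     docs = db.similarity_search(
    #         "query text", k=5, filter={"source": "a.txt"}, prefilter=True
    #     )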
    def max_marginal_relevance_search(
        self,
        query: str,
        k: Optional[int] = None,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        filter: Optional[Dict[str, str]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND
        diversity among selected documents.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                         of diversity among the results with 0 corresponding
                         to maximum diversity and 1 to minimum diversity.
                         Defaults to 0.5.
            filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        if k is None:
            k = self.limit
        if self._embedding is None:
            raise ValueError(
                "For MMR search, you must specify an embedding function on creation."
            )

        embedding = self._embedding.embed_query(query)
        docs = self.max_marginal_relevance_search_by_vector(
            embedding,
            k,
            fetch_k,
            lambda_mult=lambda_mult,
            filter=filter,
        )
        return docs
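    # Usage sketch: fetch 20 candidates, then keep the 4 most diverse ones
    # (lambda_mult closer to 0 favors diversity, closer to 1 favors relevance):
    #
    #     docs = db.max_marginal_relevance_search(
    #         "query text", k=4, fetch_k=20, lambda_mult=0.3
    #     )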
    def max_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: Optional[int] = None,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        filter: Optional[Dict[str, str]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND
        diversity among selected documents.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                         of diversity among the results with 0 corresponding
                         to maximum diversity and 1 to minimum diversity.
                         Defaults to 0.5.
            filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.

        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        results = self._query(
            query=embedding,
            k=fetch_k,
            filter=filter,
            **kwargs,
        )
        mmr_selected = maximal_marginal_relevance(
            np.array(embedding, dtype=np.float32),
            # Use the configured vector column rather than a hardcoded "vector".
            results[self._vector_key].to_pylist(),
            k=k or self.limit,
            lambda_mult=lambda_mult,
        )
        candidates = self.results_to_docs(results)
        selected_results = [r for i, r in enumerate(candidates) if i in mmr_selected]
        return selected_results
    def delete(
        self,
        ids: Optional[List[str]] = None,
        delete_all: Optional[bool] = None,
        filter: Optional[str] = None,
        drop_columns: Optional[List[str]] = None,
        name: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """
        Delete rows by filter or by ids, or drop columns from the table.

        Args:
            filter: Provide a string SQL expression - "{col} {operation} {value}".
            ids: Provide a list of ids to delete from the table.
            drop_columns: Provide a list of columns to drop from the table.
            delete_all: If True, delete all rows from the table.
        """
        tbl = self.get_table(name)
        if filter:
            tbl.delete(filter)
        elif ids:
            # Quote each id individually so multiple ids match correctly.
            tbl.delete(f"{self._id_key} in ('{{}}')".format("','".join(ids)))
        elif drop_columns:
            if self.api_key is not None:
                raise NotImplementedError(
                    "Column operations currently not supported in LanceDB Cloud."
                )
            else:
                tbl.drop_columns(drop_columns)
        elif delete_all:
            tbl.delete("true")
        else:
            raise ValueError("Provide either filter, ids, drop_columns or delete_all")
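# Usage sketch for deletion (exactly one of the mutually exclusive arguments
# must be provided):
#
#     db.delete(ids=["doc-1", "doc-2"])          # by id
#     db.delete(filter="source = 'a.txt'")       # by SQL predicate
#     db.delete(drop_columns=["metadata"])       # local tables only
#     db.delete(delete_all=True)                 # wipe the table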