Source code for langchain_community.vectorstores.hanavector
"""SAP HANA Cloud Vector Engine"""from__future__importannotationsimportimportlib.utilimportjsonimportrefromtypingimport(TYPE_CHECKING,Any,Callable,Dict,Iterable,List,Optional,Pattern,Tuple,Type,)importnumpyasnpfromlangchain_core.documentsimportDocumentfromlangchain_core.embeddingsimportEmbeddingsfromlangchain_core.runnables.configimportrun_in_executorfromlangchain_core.vectorstoresimportVectorStorefromlangchain_community.vectorstores.utilsimport(DistanceStrategy,maximal_marginal_relevance,)ifTYPE_CHECKING:fromhdbcliimportdbapiHANA_DISTANCE_FUNCTION:dict={DistanceStrategy.COSINE:("COSINE_SIMILARITY","DESC"),DistanceStrategy.EUCLIDEAN_DISTANCE:("L2DISTANCE","ASC"),}COMPARISONS_TO_SQL={"$eq":"=","$ne":"<>","$lt":"<","$lte":"<=","$gt":">","$gte":">=",}IN_OPERATORS_TO_SQL={"$in":"IN","$nin":"NOT IN",}BETWEEN_OPERATOR="$between"LIKE_OPERATOR="$like"LOGICAL_OPERATORS_TO_SQL={"$and":"AND","$or":"OR"}default_distance_strategy=DistanceStrategy.COSINEdefault_table_name:str="EMBEDDINGS"default_content_column:str="VEC_TEXT"default_metadata_column:str="VEC_META"default_vector_column:str="VEC_VECTOR"default_vector_column_length:int=-1# -1 means dynamic length
class HanaDB(VectorStore):
    """SAP HANA Cloud Vector Engine

    The prerequisite for using this class is the installation of the ``hdbcli``
    Python package.

    The HanaDB vectorstore can be created by providing an embedding function
    and an existing database connection, and optionally the names of the table
    and the columns to use.
    """
    def __init__(
        self,
        connection: dbapi.Connection,
        embedding: Embeddings,
        distance_strategy: DistanceStrategy = default_distance_strategy,
        table_name: str = default_table_name,
        content_column: str = default_content_column,
        metadata_column: str = default_metadata_column,
        vector_column: str = default_vector_column,
        vector_column_length: int = default_vector_column_length,
        *,
        specific_metadata_columns: Optional[List[str]] = None,
    ):
        # Check if the hdbcli package is installed
        if importlib.util.find_spec("hdbcli") is None:
            raise ImportError(
                "Could not import hdbcli python package. "
                "Please install it with `pip install hdbcli`."
            )

        valid_distance = False
        for key in HANA_DISTANCE_FUNCTION.keys():
            if key is distance_strategy:
                valid_distance = True
        if not valid_distance:
            raise ValueError(
                "Unsupported distance_strategy: {}".format(distance_strategy)
            )

        self.connection = connection
        self.embedding = embedding
        self.distance_strategy = distance_strategy
        self.table_name = HanaDB._sanitize_name(table_name)
        self.content_column = HanaDB._sanitize_name(content_column)
        self.metadata_column = HanaDB._sanitize_name(metadata_column)
        self.vector_column = HanaDB._sanitize_name(vector_column)
        self.vector_column_length = HanaDB._sanitize_int(vector_column_length)
        self.specific_metadata_columns = HanaDB._sanitize_specific_metadata_columns(
            specific_metadata_columns or []
        )

        # Check if the table exists, and create it if it does not
        if not self._table_exists(self.table_name):
            sql_str = (
                f'CREATE TABLE "{self.table_name}"('
                f'"{self.content_column}" NCLOB, '
                f'"{self.metadata_column}" NCLOB, '
                f'"{self.vector_column}" REAL_VECTOR '
            )
            if self.vector_column_length in [-1, 0]:
                sql_str += ");"
            else:
                sql_str += f"({self.vector_column_length}));"

            try:
                cur = self.connection.cursor()
                cur.execute(sql_str)
            finally:
                cur.close()

        # Check if the needed columns exist and have the correct type
        self._check_column(self.table_name, self.content_column, ["NCLOB", "NVARCHAR"])
        self._check_column(self.table_name, self.metadata_column, ["NCLOB", "NVARCHAR"])
        self._check_column(
            self.table_name,
            self.vector_column,
            ["REAL_VECTOR"],
            self.vector_column_length,
        )
        for column_name in self.specific_metadata_columns:
            self._check_column(self.table_name, column_name)

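    # A minimal construction sketch (editor's illustration, not part of the
    # original module). Host, credentials, and the embeddings implementation
    # are placeholders; any ``Embeddings`` instance works:
    #
    #     from hdbcli import dbapi
    #     from langchain_openai import OpenAIEmbeddings
    #
    #     connection = dbapi.connect(
    #         address="<hana-cloud-host>",
    #         port=443,
    #         user="<user>",
    #         password="<password>",
    #     )
    #     db = HanaDB(
    #         connection=connection,
    #         embedding=OpenAIEmbeddings(),
    #         table_name="MY_EMBEDDINGS",
    #     )
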
    def _table_exists(self, table_name) -> bool:  # type: ignore[no-untyped-def]
        sql_str = (
            "SELECT COUNT(*) FROM SYS.TABLES WHERE SCHEMA_NAME = CURRENT_SCHEMA"
            " AND TABLE_NAME = ?"
        )
        try:
            cur = self.connection.cursor()
            cur.execute(sql_str, (table_name,))
            if cur.has_result_set():
                rows = cur.fetchall()
                if rows[0][0] == 1:
                    return True
        finally:
            cur.close()
        return False

    def _check_column(  # type: ignore[no-untyped-def]
        self, table_name, column_name, column_type=None, column_length=None
    ):
        sql_str = (
            "SELECT DATA_TYPE_NAME, LENGTH FROM SYS.TABLE_COLUMNS WHERE "
            "SCHEMA_NAME = CURRENT_SCHEMA "
            "AND TABLE_NAME = ? AND COLUMN_NAME = ?"
        )
        try:
            cur = self.connection.cursor()
            cur.execute(sql_str, (table_name, column_name))
            if cur.has_result_set():
                rows = cur.fetchall()
                if len(rows) == 0:
                    raise AttributeError(f"Column {column_name} does not exist")
                # Check data type
                if column_type:
                    if rows[0][0] not in column_type:
                        raise AttributeError(
                            f"Column {column_name} has the wrong type: {rows[0][0]}"
                        )
                # Check length, if parameter was provided
                # Length can either be -1 (QRC01+02-24) or 0 (QRC03-24 onwards)
                # to indicate no length constraint being present.
                if column_length is not None and column_length > 0:
                    if rows[0][1] != column_length:
                        raise AttributeError(
                            f"Column {column_name} has the wrong length: {rows[0][1]} "
                            f"expected: {column_length}"
                        )
            else:
                raise AttributeError(f"Column {column_name} does not exist")
        finally:
            cur.close()

    @property
    def embeddings(self) -> Embeddings:
        return self.embedding

    @staticmethod
    def _sanitize_name(input_str: str) -> str:  # type: ignore[misc]
        # Remove characters that are not alphanumeric or underscores
        return re.sub(r"[^a-zA-Z0-9_]", "", input_str)

    @staticmethod
    def _sanitize_int(input_int: Any) -> int:
        value = int(str(input_int))
        if value < -1:
            raise ValueError(f"Value ({value}) must not be smaller than -1")
        return value

    @staticmethod
    def _sanitize_list_float(embedding: List[float]) -> List[float]:
        for value in embedding:
            if not isinstance(value, float):
                raise ValueError(f"Value ({value}) does not have type float")
        return embedding

    # Compile pattern only once, for better performance
    _compiled_pattern: Pattern = re.compile("^[_a-zA-Z][_a-zA-Z0-9]*$")

    @staticmethod
    def _sanitize_metadata_keys(metadata: dict) -> dict:
        for key in metadata.keys():
            if not HanaDB._compiled_pattern.match(key):
                raise ValueError(f"Invalid metadata key {key}")
        return metadata

    @staticmethod
    def _sanitize_specific_metadata_columns(
        specific_metadata_columns: List[str],
    ) -> List[str]:
        metadata_columns = []
        for c in specific_metadata_columns:
            sanitized_name = HanaDB._sanitize_name(c)
            metadata_columns.append(sanitized_name)
        return metadata_columns

    def _split_off_special_metadata(self, metadata: dict) -> Tuple[dict, list]:
        # Extract the values for the configured specific metadata columns;
        # the full metadata dict is still stored as JSON alongside them.
        special_metadata = []

        if not metadata:
            return {}, []

        for column_name in self.specific_metadata_columns:
            special_metadata.append(metadata.get(column_name, None))

        return metadata, special_metadata

    def create_hnsw_index(
        self,
        m: Optional[int] = None,  # Optional M parameter
        ef_construction: Optional[int] = None,  # Optional efConstruction parameter
        ef_search: Optional[int] = None,  # Optional efSearch parameter
        index_name: Optional[str] = None,  # Optional custom index name
    ) -> None:
        """
        Creates an HNSW vector index on a specified table and vector column with
        optional build and search configurations. If no configurations are
        provided, default parameters from the database are used. If provided
        values exceed the valid ranges, an error will be raised.
        The index is always created in ONLINE mode.

        Args:
            m: (Optional) Maximum number of neighbors per graph node
                (Valid Range: [4, 1000])
            ef_construction: (Optional) Maximal candidates to consider when building
                the graph (Valid Range: [1, 100000])
            ef_search: (Optional) Minimum candidates for top-k-nearest
                neighbor queries (Valid Range: [1, 100000])
            index_name: (Optional) Custom index name. Defaults to
                <table_name>_<distance_strategy>_idx
        """
        # Set default index name if not provided
        distance_func_name = HANA_DISTANCE_FUNCTION[self.distance_strategy][0]
        default_index_name = f"{self.table_name}_{distance_func_name}_idx"
        # Use provided index_name or default
        index_name = (
            HanaDB._sanitize_name(index_name) if index_name else default_index_name
        )
        # Initialize build_config and search_config as empty dictionaries
        build_config = {}
        search_config = {}

        # Validate and add m parameter to build_config if provided
        if m is not None:
            m = HanaDB._sanitize_int(m)
            if not (4 <= m <= 1000):
                raise ValueError("M must be in the range [4, 1000]")
            build_config["M"] = m

        # Validate and add ef_construction to build_config if provided
        if ef_construction is not None:
            ef_construction = HanaDB._sanitize_int(ef_construction)
            if not (1 <= ef_construction <= 100000):
                raise ValueError("efConstruction must be in the range [1, 100000]")
            build_config["efConstruction"] = ef_construction

        # Validate and add ef_search to search_config if provided
        if ef_search is not None:
            ef_search = HanaDB._sanitize_int(ef_search)
            if not (1 <= ef_search <= 100000):
                raise ValueError("efSearch must be in the range [1, 100000]")
            search_config["efSearch"] = ef_search

        # Convert build_config and search_config to JSON strings if they contain values
        build_config_str = json.dumps(build_config) if build_config else ""
        search_config_str = json.dumps(search_config) if search_config else ""

        # Create the index SQL string with the ONLINE keyword
        sql_str = (
            f'CREATE HNSW VECTOR INDEX {index_name} ON "{self.table_name}" '
            f'("{self.vector_column}") '
            f"SIMILARITY FUNCTION {distance_func_name} "
        )

        # Append build_config to the SQL string if provided
        if build_config_str:
            sql_str += f"BUILD CONFIGURATION '{build_config_str}' "

        # Append search_config to the SQL string if provided
        if search_config_str:
            sql_str += f"SEARCH CONFIGURATION '{search_config_str}' "

        # Always add the ONLINE option
        sql_str += "ONLINE "

        cur = self.connection.cursor()
        try:
            cur.execute(sql_str)
        finally:
            cur.close()

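    # Index-creation sketch (editor's illustration): the parameter values are
    # arbitrary examples within the valid ranges documented above. With these
    # values the generated statement is, schematically:
    #
    #     CREATE HNSW VECTOR INDEX ... SIMILARITY FUNCTION COSINE_SIMILARITY
    #     BUILD CONFIGURATION '{"M": 64, "efConstruction": 128}'
    #     SEARCH CONFIGURATION '{"efSearch": 200}' ONLINE
    #
    #     db.create_hnsw_index(m=64, ef_construction=128, ef_search=200)
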
    def add_texts(  # type: ignore[override]
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        embeddings: Optional[List[List[float]]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Add more texts to the vectorstore.

        Args:
            texts (Iterable[str]): Iterable of strings/text to add to the vectorstore.
            metadatas (Optional[List[dict]], optional): Optional list of metadatas.
                Defaults to None.
            embeddings (Optional[List[List[float]]], optional): Optional pre-generated
                embeddings. Defaults to None.

        Returns:
            List[str]: empty list
        """
        # Create all embeddings of the texts beforehand to improve performance
        if embeddings is None:
            embeddings = self.embedding.embed_documents(list(texts))

        # Create sql parameters array
        sql_params = []
        for i, text in enumerate(texts):
            metadata = metadatas[i] if metadatas else {}
            metadata, extracted_special_metadata = self._split_off_special_metadata(
                metadata
            )
            embedding = (
                embeddings[i]
                if embeddings
                else self.embedding.embed_documents([text])[0]
            )
            sql_params.append(
                (
                    text,
                    json.dumps(HanaDB._sanitize_metadata_keys(metadata)),
                    f"[{','.join(map(str, embedding))}]",
                    *extracted_special_metadata,
                )
            )

        # Insert data into the table
        cur = self.connection.cursor()
        try:
            specific_metadata_columns_string = '", "'.join(
                self.specific_metadata_columns
            )
            if specific_metadata_columns_string:
                specific_metadata_columns_string = (
                    ', "' + specific_metadata_columns_string + '"'
                )
            sql_str = (
                f'INSERT INTO "{self.table_name}" ("{self.content_column}", '
                f'"{self.metadata_column}", '
                f'"{self.vector_column}"{specific_metadata_columns_string}) '
                f"VALUES (?, ?, TO_REAL_VECTOR (?)"
                f"{', ?' * len(self.specific_metadata_columns)});"
            )
            cur.executemany(sql_str, sql_params)
        finally:
            cur.close()
        return []

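    # Insertion sketch (editor's illustration). Metadata keys must match
    # ^[_a-zA-Z][_a-zA-Z0-9]*$, otherwise _sanitize_metadata_keys raises a
    # ValueError:
    #
    #     db.add_texts(
    #         texts=["HANA stores vectors in REAL_VECTOR columns."],
    #         metadatas=[{"topic": "storage", "year": 2024}],
    #     )
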
    @classmethod
    def from_texts(  # type: ignore[no-untyped-def, override]
        cls: Type[HanaDB],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        connection: dbapi.Connection = None,
        distance_strategy: DistanceStrategy = default_distance_strategy,
        table_name: str = default_table_name,
        content_column: str = default_content_column,
        metadata_column: str = default_metadata_column,
        vector_column: str = default_vector_column,
        vector_column_length: int = default_vector_column_length,
        *,
        specific_metadata_columns: Optional[List[str]] = None,
    ):
        """Create a HanaDB instance from raw documents.
        This is a user-friendly interface that:
            1. Embeds documents.
            2. Creates a table if it does not yet exist.
            3. Adds the documents to the table.

        This is intended to be a quick way to get started.
        """
        instance = cls(
            connection=connection,
            embedding=embedding,
            distance_strategy=distance_strategy,
            table_name=table_name,
            content_column=content_column,
            metadata_column=metadata_column,
            vector_column=vector_column,
            vector_column_length=vector_column_length,  # -1 means dynamic length
            specific_metadata_columns=specific_metadata_columns,
        )
        instance.add_texts(texts, metadatas)
        return instance

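    # One-call setup sketch (editor's illustration); reuses the placeholder
    # ``connection`` and embeddings object from the constructor example above:
    #
    #     db = HanaDB.from_texts(
    #         texts=["first document", "second document"],
    #         embedding=OpenAIEmbeddings(),
    #         connection=connection,
    #         table_name="QUICKSTART_DEMO",
    #     )
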
    def similarity_search(  # type: ignore[override]
        self, query: str, k: int = 4, filter: Optional[dict] = None
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: A dictionary of metadata fields and values to filter by.
                Defaults to None.

        Returns:
            List of Documents most similar to the query
        """
        docs_and_scores = self.similarity_search_with_score(
            query=query, k=k, filter=filter
        )
        return [doc for doc, _ in docs_and_scores]

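    # Query sketch (editor's illustration); the filter restricts results to
    # rows whose metadata JSON contains "topic": "storage":
    #
    #     docs = db.similarity_search(
    #         "How are vectors stored?", k=2, filter={"topic": "storage"}
    #     )
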
    def similarity_search_with_score(
        self, query: str, k: int = 4, filter: Optional[dict] = None
    ) -> List[Tuple[Document, float]]:
        """Return documents and score values most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: A dictionary of metadata fields and values to filter by.
                Defaults to None.

        Returns:
            List of tuples (containing a Document and a score) that are
            most similar to the query
        """
        embedding = self.embedding.embed_query(query)
        return self.similarity_search_with_score_by_vector(
            embedding=embedding, k=k, filter=filter
        )

    def similarity_search_with_score_and_vector_by_vector(
        self, embedding: List[float], k: int = 4, filter: Optional[dict] = None
    ) -> List[Tuple[Document, float, List[float]]]:
        """Return docs most similar to the given embedding.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: A dictionary of metadata fields and values to filter by.
                Defaults to None.

        Returns:
            List of Documents most similar to the query and score and the
            document's embedding vector for each
        """
        result = []
        k = HanaDB._sanitize_int(k)
        embedding = HanaDB._sanitize_list_float(embedding)
        distance_func_name = HANA_DISTANCE_FUNCTION[self.distance_strategy][0]
        embedding_as_str = "[" + ",".join(map(str, embedding)) + "]"
        sql_str = (
            f"SELECT TOP {k}"
            f' "{self.content_column}", '  # row[0]
            f' "{self.metadata_column}", '  # row[1]
            f' TO_NVARCHAR("{self.vector_column}"), '  # row[2]
            f' {distance_func_name}("{self.vector_column}", TO_REAL_VECTOR (?)) AS CS '  # row[3]
            f'FROM "{self.table_name}"'
        )
        order_str = f" order by CS {HANA_DISTANCE_FUNCTION[self.distance_strategy][1]}"
        where_str, query_tuple = self._create_where_by_filter(filter)
        query_tuple = (embedding_as_str,) + tuple(query_tuple)
        sql_str = sql_str + where_str
        sql_str = sql_str + order_str
        try:
            cur = self.connection.cursor()
            cur.execute(sql_str, query_tuple)
            if cur.has_result_set():
                rows = cur.fetchall()
                for row in rows:
                    js = json.loads(row[1])
                    doc = Document(page_content=row[0], metadata=js)
                    result_vector = HanaDB._parse_float_array_from_string(row[2])
                    result.append((doc, row[3], result_vector))
        finally:
            cur.close()
        return result

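    # Sketch (editor's illustration): retrieving the stored vectors alongside
    # documents and scores, e.g. for custom re-ranking:
    #
    #     query_vector = db.embedding.embed_query("How are vectors stored?")
    #     results = db.similarity_search_with_score_and_vector_by_vector(
    #         embedding=query_vector, k=2
    #     )
    #     for doc, score, vector in results:
    #         print(score, len(vector), doc.page_content[:40])
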
    def similarity_search_with_score_by_vector(
        self, embedding: List[float], k: int = 4, filter: Optional[dict] = None
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to the given embedding.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: A dictionary of metadata fields and values to filter by.
                Defaults to None.

        Returns:
            List of Documents most similar to the query and score for each
        """
        whole_result = self.similarity_search_with_score_and_vector_by_vector(
            embedding=embedding, k=k, filter=filter
        )
        return [(result_item[0], result_item[1]) for result_item in whole_result]

    def similarity_search_by_vector(  # type: ignore[override]
        self, embedding: List[float], k: int = 4, filter: Optional[dict] = None
    ) -> List[Document]:
        """Return docs most similar to embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: A dictionary of metadata fields and values to filter by.
                Defaults to None.

        Returns:
            List of Documents most similar to the query vector.
        """
        docs_and_scores = self.similarity_search_with_score_by_vector(
            embedding=embedding, k=k, filter=filter
        )
        return [doc for doc, _ in docs_and_scores]

    def _create_where_by_filter(self, filter):  # type: ignore[no-untyped-def]
        query_tuple = []
        where_str = ""
        if filter:
            where_str, query_tuple = self._process_filter_object(filter)
            where_str = " WHERE " + where_str
        return where_str, query_tuple

    def _process_filter_object(self, filter):  # type: ignore[no-untyped-def]
        query_tuple = []
        where_str = ""
        if filter:
            for i, key in enumerate(filter.keys()):
                filter_value = filter[key]
                if i != 0:
                    where_str += " AND "

                # Handling of 'special' boolean operators "$and", "$or"
                if key in LOGICAL_OPERATORS_TO_SQL:
                    logical_operator = LOGICAL_OPERATORS_TO_SQL[key]
                    logical_operands = filter_value
                    for j, logical_operand in enumerate(logical_operands):
                        if j != 0:
                            where_str += f" {logical_operator} "
                        (
                            where_str_logical,
                            query_tuple_logical,
                        ) = self._process_filter_object(logical_operand)
                        where_str += "(" + where_str_logical + ")"
                        query_tuple += query_tuple_logical
                    continue

                operator = "="
                sql_param = "?"

                if isinstance(filter_value, bool):
                    query_tuple.append("true" if filter_value else "false")
                elif isinstance(filter_value, int) or isinstance(filter_value, str):
                    query_tuple.append(filter_value)
                elif isinstance(filter_value, Dict):
                    # Handling of 'special' operators starting with "$"
                    special_op = next(iter(filter_value))
                    special_val = filter_value[special_op]
                    # "$eq", "$ne", "$lt", "$lte", "$gt", "$gte"
                    if special_op in COMPARISONS_TO_SQL:
                        operator = COMPARISONS_TO_SQL[special_op]
                        if isinstance(special_val, bool):
                            query_tuple.append("true" if special_val else "false")
                        elif isinstance(special_val, float):
                            sql_param = "CAST(? as float)"
                            query_tuple.append(special_val)
                        elif (
                            isinstance(special_val, dict)
                            and "type" in special_val
                            and special_val["type"] == "date"
                        ):
                            # Date type
                            sql_param = "CAST(? as DATE)"
                            query_tuple.append(special_val["date"])
                        else:
                            query_tuple.append(special_val)
                    # "$between"
                    elif special_op == BETWEEN_OPERATOR:
                        between_from = special_val[0]
                        between_to = special_val[1]
                        operator = "BETWEEN"
                        sql_param = "? AND ?"
                        query_tuple.append(between_from)
                        query_tuple.append(between_to)
                    # "$like"
                    elif special_op == LIKE_OPERATOR:
                        operator = "LIKE"
                        query_tuple.append(special_val)
                    # "$in", "$nin"
                    elif special_op in IN_OPERATORS_TO_SQL:
                        operator = IN_OPERATORS_TO_SQL[special_op]
                        if isinstance(special_val, list):
                            for j, list_entry in enumerate(special_val):
                                if j == 0:
                                    sql_param = "("
                                sql_param = sql_param + "?"
                                if j == (len(special_val) - 1):
                                    sql_param = sql_param + ")"
                                else:
                                    sql_param = sql_param + ","
                                query_tuple.append(list_entry)
                        else:
                            raise ValueError(
                                f"Unsupported value for {operator}: {special_val}"
                            )
                    else:
                        raise ValueError(f"Unsupported operator: {special_op}")
                else:
                    raise ValueError(
                        f"Unsupported filter data-type: {type(filter_value)}"
                    )

                selector = (
                    f' "{key}"'
                    if key in self.specific_metadata_columns
                    else f"JSON_VALUE({self.metadata_column}, '$.{key}')"
                )
                where_str += f"{selector} {operator} {sql_param}"

        return where_str, query_tuple

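    # Filter-syntax sketch (editor's illustration): top-level keys are AND-ed,
    # and "$and"/"$or" nest recursively. A filter such as
    #
    #     filter = {
    #         "$or": [
    #             {"quality": {"$eq": "good"}},
    #             {"price": {"$lte": 100}},
    #         ],
    #         "name": {"$like": "%widget%"},
    #     }
    #
    # is translated (schematically, with VEC_META as the metadata column) into:
    #
    #     WHERE (JSON_VALUE(VEC_META, '$.quality') = ?
    #            OR JSON_VALUE(VEC_META, '$.price') <= ?)
    #       AND JSON_VALUE(VEC_META, '$.name') LIKE ?
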
    def delete(  # type: ignore[override]
        self, ids: Optional[List[str]] = None, filter: Optional[dict] = None
    ) -> Optional[bool]:
        """Delete entries by filter with metadata values

        Args:
            ids: Deletion with ids is not supported! A ValueError will be raised.
            filter: A dictionary of metadata fields and values to filter by.
                An empty filter ({}) will delete all entries in the table.

        Returns:
            Optional[bool]: True, if deletion is technically successful.
                Deletion of zero entries, due to non-matching filters, is a success.
        """
        if ids is not None:
            raise ValueError("Deletion via ids is not supported")

        if filter is None:
            raise ValueError("Parameter 'filter' is required when calling 'delete'")

        where_str, query_tuple = self._create_where_by_filter(filter)
        sql_str = f'DELETE FROM "{self.table_name}" {where_str}'

        try:
            cur = self.connection.cursor()
            cur.execute(sql_str, query_tuple)
        finally:
            cur.close()

        return True

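    # Deletion sketch (editor's illustration): the same filter syntax as for
    # search; an empty filter removes every row in the table, so use it
    # deliberately:
    #
    #     db.delete(filter={"topic": "storage"})
    #     db.delete(filter={})  # deletes ALL entries
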
    async def adelete(  # type: ignore[override]
        self, ids: Optional[List[str]] = None, filter: Optional[dict] = None
    ) -> Optional[bool]:
        """Asynchronously delete entries by filter with metadata values.

        Args:
            ids: Deletion with ids is not supported! A ValueError will be raised.
            filter: A dictionary of metadata fields and values to filter by.
                An empty filter ({}) will delete all entries in the table.

        Returns:
            Optional[bool]: True, if deletion is technically successful.
        """
        return await run_in_executor(None, self.delete, ids=ids, filter=filter)

    def max_marginal_relevance_search(  # type: ignore[override]
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        filter: Optional[dict] = None,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND
        diversity among selected documents.

        Args:
            query: search query text.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                of diversity among the results with 0 corresponding
                to maximum diversity and 1 to minimum diversity.
                Defaults to 0.5.
            filter: Filter on metadata properties, e.g.
                {
                    "str_property": "foo",
                    "int_property": 123
                }

        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        embedding = self.embedding.embed_query(query)
        return self.max_marginal_relevance_search_by_vector(
            embedding=embedding,
            k=k,
            fetch_k=fetch_k,
            lambda_mult=lambda_mult,
            filter=filter,
        )

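    # MMR sketch (editor's illustration): fetch 15 candidates by similarity,
    # then pick 3 of them, weighting diversity over pure relevance
    # (lambda_mult=0.3):
    #
    #     docs = db.max_marginal_relevance_search(
    #         "vector indexes", k=3, fetch_k=15, lambda_mult=0.3
    #     )
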
    async def amax_marginal_relevance_search_by_vector(  # type: ignore[override]
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance."""
        return await run_in_executor(
            None,
            self.max_marginal_relevance_search_by_vector,
            embedding=embedding,
            k=k,
            fetch_k=fetch_k,
            lambda_mult=lambda_mult,
        )

    @staticmethod
    def _cosine_relevance_score_fn(distance: float) -> float:
        return distance

    def _select_relevance_score_fn(self) -> Callable[[float], float]:
        """
        The 'correct' relevance function may differ depending on a few things,
        including:
        - the distance / similarity metric used by the VectorStore
        - the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
        - embedding dimensionality
        - etc.

        Vectorstores should define their own selection-based method of relevance.
        """
        if self.distance_strategy == DistanceStrategy.COSINE:
            return HanaDB._cosine_relevance_score_fn
        elif self.distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
            return HanaDB._euclidean_relevance_score_fn
        else:
            raise ValueError(
                "Unsupported distance_strategy: {}".format(self.distance_strategy)
            )

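    # Sketch (editor's illustration): the selected score function is what the
    # inherited VectorStore.similarity_search_with_relevance_scores uses to
    # map raw distances to relevance scores:
    #
    #     docs_and_scores = db.similarity_search_with_relevance_scores(
    #         "vector indexes", k=2
    #     )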