# Source code for langchain_community.vectorstores.hanavector
"""SAP HANA Cloud Vector Engine"""from__future__importannotationsimportimportlib.utilimportjsonimportrefromtypingimport(TYPE_CHECKING,Any,Callable,Dict,Iterable,List,Optional,Pattern,Tuple,Type,)importnumpyasnpfromlangchain_core.documentsimportDocumentfromlangchain_core.embeddingsimportEmbeddingsfromlangchain_core.runnables.configimportrun_in_executorfromlangchain_core.vectorstoresimportVectorStorefromlangchain_community.vectorstores.utilsimport(DistanceStrategy,maximal_marginal_relevance,)ifTYPE_CHECKING:fromhdbcliimportdbapiHANA_DISTANCE_FUNCTION:dict={DistanceStrategy.COSINE:("COSINE_SIMILARITY","DESC"),DistanceStrategy.EUCLIDEAN_DISTANCE:("L2DISTANCE","ASC"),}COMPARISONS_TO_SQL={"$eq":"=","$ne":"<>","$lt":"<","$lte":"<=","$gt":">","$gte":">=",}IN_OPERATORS_TO_SQL={"$in":"IN","$nin":"NOT IN",}BETWEEN_OPERATOR="$between"LIKE_OPERATOR="$like"LOGICAL_OPERATORS_TO_SQL={"$and":"AND","$or":"OR"}default_distance_strategy=DistanceStrategy.COSINEdefault_table_name:str="EMBEDDINGS"default_content_column:str="VEC_TEXT"default_metadata_column:str="VEC_META"default_vector_column:str="VEC_VECTOR"default_vector_column_length:int=-1# -1 means dynamic length
class HanaDB(VectorStore):
    """SAP HANA Cloud Vector Engine.

    The prerequisite for using this class is the installation of the
    ``hdbcli`` Python package.

    The HanaDB vectorstore can be created by providing an embedding function
    and an existing database connection. Optionally, the names of the table
    and the columns to use.
    """
def __init__(
    self,
    connection: dbapi.Connection,
    embedding: Embeddings,
    distance_strategy: DistanceStrategy = default_distance_strategy,
    table_name: str = default_table_name,
    content_column: str = default_content_column,
    metadata_column: str = default_metadata_column,
    vector_column: str = default_vector_column,
    vector_column_length: int = default_vector_column_length,
    *,
    specific_metadata_columns: Optional[List[str]] = None,
):
    """Initialize the vector store and ensure its backing table exists.

    Args:
        connection: An open ``hdbcli`` database connection.
        embedding: Embedding function used for documents and queries.
        distance_strategy: One of the keys of ``HANA_DISTANCE_FUNCTION``.
        table_name: Name of the table holding the embeddings.
        content_column: Column for the document text.
        metadata_column: Column for the JSON-serialized metadata.
        vector_column: Column of type REAL_VECTOR holding the embedding.
        vector_column_length: Vector dimension; -1 or 0 means dynamic length.
        specific_metadata_columns: Metadata keys projected into dedicated
            columns (must already exist in the table).

    Raises:
        ImportError: If the ``hdbcli`` package is not installed.
        ValueError: If ``distance_strategy`` is not supported.
        AttributeError: If an existing table has incompatible columns.
    """
    # Check if the hdbcli package is installed
    if importlib.util.find_spec("hdbcli") is None:
        raise ImportError(
            "Could not import hdbcli python package. "
            "Please install it with `pip install hdbcli`."
        )

    # Identity comparison on purpose: only genuine DistanceStrategy members
    # are accepted (the enum is str-based, so `in` would also match plain
    # strings and silently widen the contract).
    valid_distance = any(
        key is distance_strategy for key in HANA_DISTANCE_FUNCTION
    )
    if not valid_distance:
        raise ValueError(
            "Unsupported distance_strategy: {}".format(distance_strategy)
        )

    self.connection = connection
    self.embedding = embedding
    self.distance_strategy = distance_strategy
    self.table_name = HanaDB._sanitize_name(table_name)
    self.content_column = HanaDB._sanitize_name(content_column)
    self.metadata_column = HanaDB._sanitize_name(metadata_column)
    self.vector_column = HanaDB._sanitize_name(vector_column)
    self.vector_column_length = HanaDB._sanitize_int(vector_column_length)
    self.specific_metadata_columns = HanaDB._sanitize_specific_metadata_columns(
        specific_metadata_columns or []
    )

    # Check if the table exists, and eventually create it
    if not self._table_exists(self.table_name):
        sql_str = (
            f'CREATE TABLE "{self.table_name}"('
            f'"{self.content_column}" NCLOB, '
            f'"{self.metadata_column}" NCLOB, '
            f'"{self.vector_column}" REAL_VECTOR '
        )
        # -1 (QRC01+02-24) and 0 (QRC03-24 onwards) both mean "no fixed
        # vector length".
        if self.vector_column_length in [-1, 0]:
            sql_str += ");"
        else:
            sql_str += f"({self.vector_column_length}));"

        # Create the cursor BEFORE the try block: if cursor() itself raised
        # inside the try, `cur` would be unbound in the finally clause and
        # a NameError would mask the real error.
        cur = self.connection.cursor()
        try:
            cur.execute(sql_str)
        finally:
            cur.close()

    # Check if the needed columns exist and have the correct type
    self._check_column(self.table_name, self.content_column, ["NCLOB", "NVARCHAR"])
    self._check_column(self.table_name, self.metadata_column, ["NCLOB", "NVARCHAR"])
    self._check_column(
        self.table_name,
        self.vector_column,
        ["REAL_VECTOR"],
        self.vector_column_length,
    )
    for column_name in self.specific_metadata_columns:
        self._check_column(self.table_name, column_name)
def _table_exists(self, table_name) -> bool:  # type: ignore[no-untyped-def]
    """Return True if *table_name* exists in the current schema."""
    sql_str = (
        "SELECT COUNT(*) FROM SYS.TABLES WHERE SCHEMA_NAME = CURRENT_SCHEMA"
        " AND TABLE_NAME = ?"
    )
    # Cursor created outside the try block so `cur` is always bound when
    # the finally clause runs.
    cur = self.connection.cursor()
    try:
        # Fix: parameters must be a sequence; `(table_name)` is just a
        # parenthesized string, so pass a one-element tuple.
        cur.execute(sql_str, (table_name,))
        if cur.has_result_set():
            rows = cur.fetchall()
            if rows[0][0] == 1:
                return True
    finally:
        cur.close()
    return False


def _check_column(  # type: ignore[no-untyped-def]
    self, table_name, column_name, column_type=None, column_length=None
):
    """Validate that a column exists and optionally check its type/length.

    Raises:
        AttributeError: If the column is missing, has a type not contained
            in *column_type*, or does not match *column_length*.
    """
    sql_str = (
        "SELECT DATA_TYPE_NAME, LENGTH FROM SYS.TABLE_COLUMNS WHERE "
        "SCHEMA_NAME = CURRENT_SCHEMA "
        "AND TABLE_NAME = ? AND COLUMN_NAME = ?"
    )
    cur = self.connection.cursor()
    try:
        cur.execute(sql_str, (table_name, column_name))
        if cur.has_result_set():
            rows = cur.fetchall()
            if len(rows) == 0:
                raise AttributeError(f"Column {column_name} does not exist")
            # Check data type
            if column_type:
                if rows[0][0] not in column_type:
                    raise AttributeError(
                        f"Column {column_name} has the wrong type: {rows[0][0]}"
                    )
            # Check length, if parameter was provided
            # Length can either be -1 (QRC01+02-24) or 0 (QRC03-24 onwards)
            # to indicate no length constraint being present.
            if column_length is not None and column_length > 0:
                if rows[0][1] != column_length:
                    raise AttributeError(
                        f"Column {column_name} has the wrong length: {rows[0][1]} "
                        f"expected: {column_length}"
                    )
        else:
            raise AttributeError(f"Column {column_name} does not exist")
    finally:
        cur.close()


@property
def embeddings(self) -> Embeddings:
    """The embedding function used by this vector store."""
    return self.embedding


@staticmethod
def _sanitize_name(input_str: str) -> str:  # type: ignore[misc]
    # Remove characters that are not alphanumeric or underscores
    return re.sub(r"[^a-zA-Z0-9_]", "", input_str)


@staticmethod
def _sanitize_int(input_int: any) -> int:  # type: ignore[valid-type]
    """Coerce *input_int* to int, rejecting values smaller than -1."""
    value = int(str(input_int))
    if value < -1:
        raise ValueError(f"Value ({value}) must not be smaller than -1")
    # Fix: return the already-converted value instead of converting twice.
    return value


@staticmethod
def _sanitize_list_float(embedding: List[float]) -> List[float]:
    """Ensure every element of *embedding* is a float."""
    for value in embedding:
        if not isinstance(value, float):
            raise ValueError(f"Value ({value}) does not have type float")
    return embedding
# Compile pattern only once, for better performance
_compiled_pattern: Pattern = re.compile("^[_a-zA-Z][_a-zA-Z0-9]*$")


@staticmethod
def _sanitize_metadata_keys(metadata: dict) -> dict:
    """Reject metadata whose keys are not valid identifier-like names."""
    for key in metadata:
        if not HanaDB._compiled_pattern.match(key):
            raise ValueError(f"Invalid metadata key {key}")
    return metadata


@staticmethod
def _sanitize_specific_metadata_columns(
    specific_metadata_columns: List[str],
) -> List[str]:
    """Sanitize every projected metadata column name."""
    return [HanaDB._sanitize_name(column) for column in specific_metadata_columns]


def _split_off_special_metadata(self, metadata: dict) -> Tuple[dict, list]:
    """Collect values destined for the dedicated metadata columns.

    Returns the (unchanged) metadata dict and, in configuration order, the
    value of each specific metadata column (None when the key is absent).
    """
    if not metadata:
        return {}, []
    special_metadata = [
        metadata.get(column_name, None)
        for column_name in self.specific_metadata_columns
    ]
    return metadata, special_metadata
def add_texts(  # type: ignore[override]
    self,
    texts: Iterable[str],
    metadatas: Optional[List[dict]] = None,
    embeddings: Optional[List[List[float]]] = None,
    **kwargs: Any,
) -> List[str]:
    """Add more texts to the vectorstore.

    Args:
        texts (Iterable[str]): Iterable of strings/text to add to the
            vectorstore.
        metadatas (Optional[List[dict]], optional): Optional list of
            metadatas. Defaults to None.
        embeddings (Optional[List[List[float]]], optional): Optional
            pre-generated embeddings. Defaults to None.

    Returns:
        List[str]: empty list
    """
    # Fix: materialize once. `texts` may be a one-shot generator, and the
    # original code iterated it twice (for embedding and again below),
    # which silently inserted zero rows for generator inputs.
    texts = list(texts)

    # Create all embeddings of the texts beforehand to improve performance
    if embeddings is None:
        embeddings = self.embedding.embed_documents(texts)

    # Create sql parameters array
    sql_params = []
    for i, text in enumerate(texts):
        metadata = metadatas[i] if metadatas else {}
        metadata, extracted_special_metadata = self._split_off_special_metadata(
            metadata
        )
        embedding = (
            embeddings[i]
            if embeddings
            else self.embedding.embed_documents([text])[0]
        )
        sql_params.append(
            (
                text,
                json.dumps(HanaDB._sanitize_metadata_keys(metadata)),
                f"[{','.join(map(str, embedding))}]",
                *extracted_special_metadata,
            )
        )

    # Insert data into the table
    cur = self.connection.cursor()
    try:
        specific_metadata_columns_string = '", "'.join(
            self.specific_metadata_columns
        )
        if specific_metadata_columns_string:
            specific_metadata_columns_string = (
                ', "' + specific_metadata_columns_string + '"'
            )
        sql_str = (
            f'INSERT INTO "{self.table_name}" ("{self.content_column}", '
            f'"{self.metadata_column}", '
            f'"{self.vector_column}"{specific_metadata_columns_string}) '
            f"VALUES (?, ?, TO_REAL_VECTOR (?)"
            f"{', ?' * len(self.specific_metadata_columns)});"
        )
        cur.executemany(sql_str, sql_params)
    finally:
        cur.close()
    return []
@classmethod
def from_texts(  # type: ignore[no-untyped-def, override]
    cls: Type[HanaDB],
    texts: List[str],
    embedding: Embeddings,
    metadatas: Optional[List[dict]] = None,
    connection: dbapi.Connection = None,
    distance_strategy: DistanceStrategy = default_distance_strategy,
    table_name: str = default_table_name,
    content_column: str = default_content_column,
    metadata_column: str = default_metadata_column,
    vector_column: str = default_vector_column,
    vector_column_length: int = default_vector_column_length,
    *,
    specific_metadata_columns: Optional[List[str]] = None,
):
    """Create a HanaDB instance from raw documents.

    This is a user-friendly interface that:
        1. Embeds documents.
        2. Creates a table if it does not yet exist.
        3. Adds the documents to the table.

    This is intended to be a quick way to get started.
    """
    store = cls(
        connection=connection,
        embedding=embedding,
        distance_strategy=distance_strategy,
        table_name=table_name,
        content_column=content_column,
        metadata_column=metadata_column,
        vector_column=vector_column,
        vector_column_length=vector_column_length,  # -1 means dynamic length
        specific_metadata_columns=specific_metadata_columns,
    )
    store.add_texts(texts, metadatas)
    return store
def similarity_search(  # type: ignore[override]
    self, query: str, k: int = 4, filter: Optional[dict] = None
) -> List[Document]:
    """Return docs most similar to query.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter: A dictionary of metadata fields and values to filter by.
            Defaults to None.

    Returns:
        List of Documents most similar to the query
    """
    # Delegate to the scored variant and strip off the scores.
    scored = self.similarity_search_with_score(query=query, k=k, filter=filter)
    return [document for document, _score in scored]
def similarity_search_with_score(
    self, query: str, k: int = 4, filter: Optional[dict] = None
) -> List[Tuple[Document, float]]:
    """Return documents and score values most similar to query.

    Args:
        query: Text to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter: A dictionary of metadata fields and values to filter by.
            Defaults to None.

    Returns:
        List of tuples (containing a Document and a score) that are
        most similar to the query
    """
    # Embed the query text, then search by the resulting vector.
    query_vector = self.embedding.embed_query(query)
    return self.similarity_search_with_score_by_vector(
        embedding=query_vector, k=k, filter=filter
    )
def similarity_search_with_score_and_vector_by_vector(
    self, embedding: List[float], k: int = 4, filter: Optional[dict] = None
) -> List[Tuple[Document, float, List[float]]]:
    """Return docs most similar to the given embedding.

    Args:
        embedding: Query vector to compare against the stored vectors.
        k: Number of Documents to return. Defaults to 4.
        filter: A dictionary of metadata fields and values to filter by.
            Defaults to None.

    Returns:
        List of Documents most similar to the query and
        score and the document's embedding vector for each
    """
    result = []
    k = HanaDB._sanitize_int(k)
    embedding = HanaDB._sanitize_list_float(embedding)
    distance_func_name = HANA_DISTANCE_FUNCTION[self.distance_strategy][0]
    embedding_as_str = ",".join(map(str, embedding))
    sql_str = (
        f"SELECT TOP {k}"
        f' "{self.content_column}", '  # row[0]
        f' "{self.metadata_column}", '  # row[1]
        f' TO_NVARCHAR("{self.vector_column}"), '  # row[2]
        f' {distance_func_name}("{self.vector_column}", TO_REAL_VECTOR '
        f" (ARRAY({embedding_as_str}))) AS CS "  # row[3]
        f'FROM "{self.table_name}"'
    )
    order_str = f" order by CS {HANA_DISTANCE_FUNCTION[self.distance_strategy][1]}"
    where_str, query_tuple = self._create_where_by_filter(filter)
    sql_str = sql_str + where_str
    sql_str = sql_str + order_str
    # Fix: create the cursor BEFORE the try block; otherwise a failure in
    # cursor() would leave `cur` unbound and the finally clause would raise
    # NameError, masking the real error.
    cur = self.connection.cursor()
    try:
        cur.execute(sql_str, query_tuple)
        if cur.has_result_set():
            rows = cur.fetchall()
            for row in rows:
                js = json.loads(row[1])
                doc = Document(page_content=row[0], metadata=js)
                result_vector = HanaDB._parse_float_array_from_string(row[2])
                result.append((doc, row[3], result_vector))
    finally:
        cur.close()
    return result
def similarity_search_with_score_by_vector(
    self, embedding: List[float], k: int = 4, filter: Optional[dict] = None
) -> List[Tuple[Document, float]]:
    """Return docs most similar to the given embedding.

    Args:
        embedding: Query vector to compare against the stored vectors.
        k: Number of Documents to return. Defaults to 4.
        filter: A dictionary of metadata fields and values to filter by.
            Defaults to None.

    Returns:
        List of Documents most similar to the query and score for each
    """
    # Run the full search (which also returns the stored vectors) and
    # keep only the (document, score) pairs.
    full_rows = self.similarity_search_with_score_and_vector_by_vector(
        embedding=embedding, k=k, filter=filter
    )
    return [(doc, score) for doc, score, _vector in full_rows]
def similarity_search_by_vector(  # type: ignore[override]
    self, embedding: List[float], k: int = 4, filter: Optional[dict] = None
) -> List[Document]:
    """Return docs most similar to embedding vector.

    Args:
        embedding: Embedding to look up documents similar to.
        k: Number of Documents to return. Defaults to 4.
        filter: A dictionary of metadata fields and values to filter by.
            Defaults to None.

    Returns:
        List of Documents most similar to the query vector.
    """
    # Scored search, then drop the scores.
    scored = self.similarity_search_with_score_by_vector(
        embedding=embedding, k=k, filter=filter
    )
    return [pair[0] for pair in scored]
def _create_where_by_filter(self, filter):  # type: ignore[no-untyped-def]
    """Build a ' WHERE ...' clause (or '') plus its parameter list."""
    query_tuple = []
    where_str = ""
    if filter:
        where_str, query_tuple = self._process_filter_object(filter)
        where_str = " WHERE " + where_str
    return where_str, query_tuple


def _process_filter_object(self, filter):  # type: ignore[no-untyped-def]
    """Translate a metadata filter dict into a SQL condition and parameters.

    Supports comparison operators ($eq, $ne, $lt, $lte, $gt, $gte),
    $between, $like, $in/$nin and the logical combinators $and/$or
    (which recurse into this method).

    Returns:
        Tuple of (condition string, list of bind parameters).

    Raises:
        ValueError: For unsupported operators or value types.
    """
    query_tuple = []
    where_str = ""
    if filter:
        for i, key in enumerate(filter.keys()):
            filter_value = filter[key]
            if i != 0:
                where_str += " AND "

            # Handling of 'special' boolean operators "$and", "$or"
            if key in LOGICAL_OPERATORS_TO_SQL:
                logical_operator = LOGICAL_OPERATORS_TO_SQL[key]
                logical_operands = filter_value
                for j, logical_operand in enumerate(logical_operands):
                    if j != 0:
                        where_str += f" {logical_operator} "
                    (
                        where_str_logical,
                        query_tuple_logical,
                    ) = self._process_filter_object(logical_operand)
                    where_str += where_str_logical
                    query_tuple += query_tuple_logical
                continue

            operator = "="
            sql_param = "?"

            if isinstance(filter_value, bool):
                # Booleans are stored as the JSON literals "true"/"false".
                query_tuple.append("true" if filter_value else "false")
            elif isinstance(filter_value, int) or isinstance(filter_value, str):
                query_tuple.append(filter_value)
            elif isinstance(filter_value, Dict):
                # Handling of 'special' operators starting with "$"
                special_op = next(iter(filter_value))
                special_val = filter_value[special_op]
                # "$eq", "$ne", "$lt", "$lte", "$gt", "$gte"
                if special_op in COMPARISONS_TO_SQL:
                    operator = COMPARISONS_TO_SQL[special_op]
                    if isinstance(special_val, bool):
                        query_tuple.append("true" if special_val else "false")
                    elif isinstance(special_val, float):
                        sql_param = "CAST(? as float)"
                        query_tuple.append(special_val)
                    elif (
                        isinstance(special_val, dict)
                        and "type" in special_val
                        and special_val["type"] == "date"
                    ):
                        # Date type
                        sql_param = "CAST(? as DATE)"
                        query_tuple.append(special_val["date"])
                    else:
                        query_tuple.append(special_val)
                # "$between"
                elif special_op == BETWEEN_OPERATOR:
                    between_from = special_val[0]
                    between_to = special_val[1]
                    operator = "BETWEEN"
                    sql_param = "? AND ?"
                    query_tuple.append(between_from)
                    query_tuple.append(between_to)
                # "$like"
                elif special_op == LIKE_OPERATOR:
                    operator = "LIKE"
                    query_tuple.append(special_val)
                # "$in", "$nin"
                elif special_op in IN_OPERATORS_TO_SQL:
                    operator = IN_OPERATORS_TO_SQL[special_op]
                    if isinstance(special_val, list):
                        # `idx` avoids shadowing the outer loop's `i`.
                        for idx, list_entry in enumerate(special_val):
                            if idx == 0:
                                sql_param = "("
                            sql_param = sql_param + "?"
                            if idx == (len(special_val) - 1):
                                sql_param = sql_param + ")"
                            else:
                                sql_param = sql_param + ","
                            query_tuple.append(list_entry)
                    else:
                        raise ValueError(
                            f"Unsupported value for {operator}: {special_val}"
                        )
                else:
                    raise ValueError(f"Unsupported operator: {special_op}")
            else:
                raise ValueError(
                    f"Unsupported filter data-type: {type(filter_value)}"
                )

            selector = (
                f' "{key}"'
                if key in self.specific_metadata_columns
                else f"JSON_VALUE({self.metadata_column}, '$.{key}')"
            )
            # Fix: a space is required between the operator and the
            # placeholder — without it "$between" emits the invalid
            # fragment "BETWEEN? AND ?".
            where_str += f"{selector} {operator} {sql_param}"
    return where_str, query_tuple
def delete(  # type: ignore[override]
    self, ids: Optional[List[str]] = None, filter: Optional[dict] = None
) -> Optional[bool]:
    """Delete entries by filter with metadata values

    Args:
        ids: Deletion with ids is not supported! A ValueError will be raised.
        filter: A dictionary of metadata fields and values to filter by.
            An empty filter ({}) will delete all entries in the table.

    Returns:
        Optional[bool]: True, if deletion is technically successful.
        Deletion of zero entries, due to non-matching filters is a success.
    """
    if ids is not None:
        raise ValueError("Deletion via ids is not supported")

    if filter is None:
        raise ValueError("Parameter 'filter' is required when calling 'delete'")

    where_str, query_tuple = self._create_where_by_filter(filter)
    sql_str = f'DELETE FROM "{self.table_name}" {where_str}'

    # Fix: create the cursor BEFORE the try block so `cur` is always bound
    # when the finally clause executes.
    cur = self.connection.cursor()
    try:
        cur.execute(sql_str, query_tuple)
    finally:
        cur.close()

    return True
async def adelete(  # type: ignore[override]
    self, ids: Optional[List[str]] = None, filter: Optional[dict] = None
) -> Optional[bool]:
    """Delete by vector ID or other criteria.

    Args:
        ids: List of ids to delete.

    Returns:
        Optional[bool]: True if deletion is successful,
        False otherwise, None if not implemented.
    """
    # Run the synchronous delete in the default executor.
    outcome = await run_in_executor(None, self.delete, ids=ids, filter=filter)
    return outcome
def max_marginal_relevance_search(  # type: ignore[override]
    self,
    query: str,
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
    filter: Optional[dict] = None,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance.

    Maximal marginal relevance optimizes for similarity to query AND
    diversity among selected documents.

    Args:
        query: search query text.
        k: Number of Documents to return. Defaults to 4.
        fetch_k: Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult: Number between 0 and 1 that determines the degree
            of diversity among the results with 0 corresponding
            to maximum diversity and 1 to minimum diversity.
            Defaults to 0.5.
        filter: Filter on metadata properties, e.g.
            {
                "str_property": "foo",
                "int_property": 123
            }

    Returns:
        List of Documents selected by maximal marginal relevance.
    """
    # Embed the query and defer to the vector-based MMR search.
    query_vector = self.embedding.embed_query(query)
    return self.max_marginal_relevance_search_by_vector(
        embedding=query_vector,
        k=k,
        fetch_k=fetch_k,
        lambda_mult=lambda_mult,
        filter=filter,
    )
async def amax_marginal_relevance_search_by_vector(  # type: ignore[override]
    self,
    embedding: List[float],
    k: int = 4,
    fetch_k: int = 20,
    lambda_mult: float = 0.5,
) -> List[Document]:
    """Return docs selected using the maximal marginal relevance."""
    # Offload the blocking, DB-backed search to the default executor.
    selected = await run_in_executor(
        None,
        self.max_marginal_relevance_search_by_vector,
        embedding=embedding,
        k=k,
        fetch_k=fetch_k,
        lambda_mult=lambda_mult,
    )
    return selected
@staticmethod
def _cosine_relevance_score_fn(distance: float) -> float:
    # Cosine similarity from the engine is already a relevance-like score,
    # so it is passed through unchanged.
    return distance


def _select_relevance_score_fn(self) -> Callable[[float], float]:
    """
    The 'correct' relevance function
    may differ depending on a few things, including:
    - the distance / similarity metric used by the VectorStore
    - the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
    - embedding dimensionality
    - etc.

    Vectorstores should define their own selection based method of relevance.
    """
    strategy = self.distance_strategy
    if strategy == DistanceStrategy.COSINE:
        return HanaDB._cosine_relevance_score_fn
    if strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
        return HanaDB._euclidean_relevance_score_fn
    raise ValueError("Unsupported distance_strategy: {}".format(strategy))