@deprecated(
    "0.0.27",
    alternative="Use ElasticsearchStore class in langchain-elasticsearch package",
    pending=True,
)
class ElasticVectorSearch(VectorStore):
    """
    ElasticVectorSearch uses the brute force method of searching on vectors.

    Recommended to use ElasticsearchStore instead, which gives you the option
    to use the approx HNSW algorithm which performs better on large datasets.

    ElasticsearchStore also supports metadata filtering, customising the
    query retriever and much more!

    You can read more on ElasticsearchStore:
    https://python.langchain.com/docs/integrations/vectorstores/elasticsearch

    To connect to an `Elasticsearch` instance that does not require
    login credentials, pass the Elasticsearch URL and index name along with the
    embedding object to the constructor.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import ElasticVectorSearch
            from langchain_community.embeddings import OpenAIEmbeddings

            embedding = OpenAIEmbeddings()
            elastic_vector_search = ElasticVectorSearch(
                elasticsearch_url="http://localhost:9200",
                index_name="test_index",
                embedding=embedding
            )

    To connect to an Elasticsearch instance that requires login credentials,
    including Elastic Cloud, use the Elasticsearch URL format
    https://username:password@es_host:9243. For example, to connect to Elastic
    Cloud, create the Elasticsearch URL with the required authentication details
    and pass it to the ElasticVectorSearch constructor as the named parameter
    elasticsearch_url.

    You can obtain your Elastic Cloud URL and login credentials by logging in to
    the Elastic Cloud console at https://cloud.elastic.co, selecting your
    deployment, and navigating to the "Deployments" page.

    To obtain your Elastic Cloud password for the default "elastic" user:

    1. Log in to the Elastic Cloud console at https://cloud.elastic.co
    2. Go to "Security" > "Users"
    3. Locate the "elastic" user and click "Edit"
    4. Click "Reset password"
    5. Follow the prompts to reset the password

    The format for Elastic Cloud URLs is
    https://username:password@cluster_id.region_id.gcp.cloud.es.io:9243.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import ElasticVectorSearch
            from langchain_community.embeddings import OpenAIEmbeddings

            embedding = OpenAIEmbeddings()

            elastic_host = "cluster_id.region_id.gcp.cloud.es.io"
            elasticsearch_url = f"https://username:password@{elastic_host}:9243"
            elastic_vector_search = ElasticVectorSearch(
                elasticsearch_url=elasticsearch_url,
                index_name="test_index",
                embedding=embedding
            )

    Args:
        elasticsearch_url (str): The URL for the Elasticsearch instance.
        index_name (str): The name of the Elasticsearch index for the embeddings.
        embedding (Embeddings): An object that provides the ability to embed text.
                It should be an instance of a class that subclasses the Embeddings
                abstract base class, such as OpenAIEmbeddings()

    Raises:
        ImportError: If the elasticsearch python package is not installed.
        ValueError: If the Elasticsearch client rejects the connection
            arguments with a ``ValueError``.
    """

    def __init__(
        self,
        elasticsearch_url: str,
        index_name: str,
        embedding: Embeddings,
        *,
        ssl_verify: Optional[Dict[str, Any]] = None,
    ):
        """Initialize with necessary components.

        Args:
            elasticsearch_url: URL passed positionally to the Elasticsearch
                client constructor.
            index_name: Name of the index this store reads from and writes to.
            embedding: Embedding model used for documents and queries.
            ssl_verify: Optional extra keyword arguments expanded into the
                Elasticsearch client constructor.

        Raises:
            ImportError: If the ``elasticsearch`` package cannot be imported.
            ValueError: If the client constructor raises ``ValueError``.
        """
        warnings.warn(
            "ElasticVectorSearch will be removed in a future release. See "
            "Elasticsearch integration docs on how to upgrade."
        )

        try:
            import elasticsearch
        except ImportError as e:
            raise ImportError(
                "Could not import elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            ) from e

        self.embedding = embedding
        self.index_name = index_name
        _ssl_verify = ssl_verify or {}
        try:
            self.client = elasticsearch.Elasticsearch(
                elasticsearch_url,
                **_ssl_verify,
                headers={"user-agent": self.get_user_agent()},
            )
        except ValueError as e:
            raise ValueError(
                f"Your elasticsearch client string is mis-formatted. Got error: {e} "
            ) from e

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        refresh_indices: bool = True,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of unique IDs. Random UUID4 strings are
                generated when omitted (or empty).
            refresh_indices: bool to refresh ElasticSearch indices

        Returns:
            List of ids from adding the texts into the vectorstore. An empty
            list is returned, and nothing is sent to Elasticsearch, when
            ``texts`` is empty.

        Raises:
            ImportError: If the ``elasticsearch`` package cannot be imported.
        """
        try:
            from elasticsearch.exceptions import NotFoundError
            from elasticsearch.helpers import bulk
        except ImportError as e:
            raise ImportError(
                "Could not import elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            ) from e

        # ``texts`` is only promised to be an Iterable; it is needed for id
        # generation, embedding and request building, so materialize it once
        # instead of silently exhausting a one-shot iterator on the first pass.
        text_list = list(texts)
        if not text_list:
            # Nothing to embed; also avoids indexing into an empty embedding list.
            return []

        ids = ids or [str(uuid.uuid4()) for _ in text_list]
        embeddings = self.embedding.embed_documents(text_list)
        dim = len(embeddings[0])
        mapping = _default_text_mapping(dim)

        # check to see if the index already exists
        try:
            self.client.indices.get(index=self.index_name)
        except NotFoundError:
            # TODO would be nice to create index before embedding,
            # just to save expensive steps for last
            self.create_index(self.client, self.index_name, mapping)

        requests = [
            {
                "_op_type": "index",
                "_index": self.index_name,
                "vector": embeddings[i],
                "text": text,
                "metadata": metadatas[i] if metadatas else {},
                "_id": ids[i],
            }
            for i, text in enumerate(text_list)
        ]
        bulk(self.client, requests)

        if refresh_indices:
            self.client.indices.refresh(index=self.index_name)
        return ids

    def similarity_search(
        self, query: str, k: int = 4, filter: Optional[dict] = None, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: Optional filter forwarded to
                ``similarity_search_with_score``.

        Returns:
            List of Documents most similar to the query.
        """
        docs_and_scores = self.similarity_search_with_score(query, k, filter=filter)
        return [doc for doc, _ in docs_and_scores]

    def similarity_search_with_score(
        self, query: str, k: int = 4, filter: Optional[dict] = None, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query, together with their scores.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: Optional filter passed to the script query builder.

        Returns:
            List of ``(Document, score)`` tuples, where the score is the
            ``_score`` reported by Elasticsearch for the hit.
        """
        embedding = self.embedding.embed_query(query)
        script_query = _default_script_query(embedding, filter)
        response = self.client_search(
            self.client, self.index_name, script_query, size=k
        )
        return [
            (
                Document(
                    page_content=hit["_source"]["text"],
                    metadata=hit["_source"]["metadata"],
                ),
                hit["_score"],
            )
            for hit in response["hits"]["hits"]
        ]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        index_name: Optional[str] = None,
        refresh_indices: bool = True,
        **kwargs: Any,
    ) -> ElasticVectorSearch:
        """Construct ElasticVectorSearch wrapper from raw documents.

        This is a user-friendly interface that:
            1. Embeds documents.
            2. Creates a new index for the embeddings in the Elasticsearch instance.
            3. Adds the documents to the newly created Elasticsearch index.

        This is intended to be a quick way to get started.

        The Elasticsearch URL is taken from the ``elasticsearch_url`` keyword
        argument or, failing that, the ``ELASTICSEARCH_URL`` environment
        variable. Remaining keyword arguments are forwarded to the constructor.
        When ``index_name`` is omitted a random UUID4 hex string is used.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import ElasticVectorSearch
                from langchain_community.embeddings import OpenAIEmbeddings
                embeddings = OpenAIEmbeddings()
                elastic_vector_search = ElasticVectorSearch.from_texts(
                    texts,
                    embeddings,
                    elasticsearch_url="http://localhost:9200"
                )
        """
        elasticsearch_url = get_from_dict_or_env(
            kwargs, "elasticsearch_url", "ELASTICSEARCH_URL"
        )
        # The URL is passed positionally below, so it must not also be
        # forwarded through **kwargs.
        kwargs.pop("elasticsearch_url", None)
        index_name = index_name or uuid.uuid4().hex
        vectorsearch = cls(elasticsearch_url, index_name, embedding, **kwargs)
        vectorsearch.add_texts(
            texts, metadatas=metadatas, ids=ids, refresh_indices=refresh_indices
        )
        return vectorsearch

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> None:
        """Delete by vector IDs.

        Args:
            ids: List of ids to delete.

        Raises:
            ValueError: If ``ids`` is None.
        """
        if ids is None:
            raise ValueError("No ids provided to delete.")

        # TODO: Check if this can be done in bulk
        for doc_id in ids:
            self.client.delete(index=self.index_name, id=doc_id)
@deprecated(
    "0.0.1",
    alternative="Use ElasticsearchStore class in langchain-elasticsearch package",
    pending=True,
)
class ElasticKnnSearch(VectorStore):
    """[DEPRECATED] `Elasticsearch` with k-nearest neighbor search
    (`k-NN`) vector store.

    Recommended to use ElasticsearchStore instead, which supports
    metadata filtering, customising the query retriever and much more!

    You can read more on ElasticsearchStore:
    https://python.langchain.com/docs/integrations/vectorstores/elasticsearch

    It creates an Elasticsearch index of text data that
    can be searched using k-NN search. The text data is transformed into
    vector embeddings using a provided embedding model, and these embeddings
    are stored in the Elasticsearch index.

    Attributes:
        index_name (str): The name of the Elasticsearch index.
        embedding (Embeddings): The embedding model to use for transforming text data
            into vector embeddings.
        es_connection (Elasticsearch, optional): An existing Elasticsearch connection.
        es_cloud_id (str, optional): The Cloud ID of your Elasticsearch Service
            deployment.
        es_user (str, optional): The username for your Elasticsearch Service deployment.
        es_password (str, optional): The password for your Elasticsearch Service
            deployment.
        vector_query_field (str, optional): The name of the field in the Elasticsearch
            index that contains the vector embeddings.
        query_field (str, optional): The name of the field in the Elasticsearch index
            that contains the original text data.

    Usage:
        .. code-block:: python

            # ``es`` is an existing Elasticsearch client and ``embedding`` an
            # Embeddings instance; ``dims`` must be the embedding vector size.
            es_search = ElasticKnnSearch("my_index", embedding, es_connection=es)
            es_search.add_texts(["Hello world!", "Another text"], dims=dims)
            results = es_search.knn_search(
                query_vector=embedding.embed_query("Hello")
            )
            # -> [(Document(page_content="Hello world!", metadata={}), score), ...]
    """

    def __init__(
        self,
        index_name: str,
        embedding: Embeddings,
        es_connection: Optional["Elasticsearch"] = None,
        es_cloud_id: Optional[str] = None,
        es_user: Optional[str] = None,
        es_password: Optional[str] = None,
        vector_query_field: Optional[str] = "vector",
        query_field: Optional[str] = "text",
    ):
        """Set up the store on an existing or newly created connection.

        Args:
            index_name: Name of the Elasticsearch index to use.
            embedding: Embedding model used by ``add_texts``.
            es_connection: Existing Elasticsearch client; used as-is when given.
            es_cloud_id: Cloud ID used to create a new client.
            es_user: Username used to create a new client.
            es_password: Password used to create a new client.
            vector_query_field: Index field queried by the k-NN clause.
            query_field: Index field queried by the hybrid text match clause.

        Raises:
            ImportError: If the ``elasticsearch`` package cannot be imported.
            ValueError: If neither ``es_connection`` nor a full set of
                cloud id, user and password is provided.
        """
        try:
            import elasticsearch
        except ImportError as e:
            raise ImportError(
                "Could not import elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            ) from e

        warnings.warn(
            "ElasticKnnSearch will be removed in a future release. "
            "Use ElasticsearchStore instead. See Elasticsearch "
            "integration docs on how to upgrade."
        )
        self.embedding = embedding
        self.index_name = index_name
        self.query_field = query_field
        self.vector_query_field = vector_query_field

        # If a pre-existing Elasticsearch connection is provided, use it.
        if es_connection is not None:
            self.client = es_connection
        # If credentials for a new Elasticsearch connection are provided,
        # create a new connection.
        elif es_cloud_id and es_user and es_password:
            self.client = elasticsearch.Elasticsearch(
                cloud_id=es_cloud_id, basic_auth=(es_user, es_password)
            )
        else:
            raise ValueError(
                "Either provide a pre-existing Elasticsearch connection, "
                "or valid credentials for creating a new connection."
            )

    @staticmethod
    def _default_knn_mapping(
        dims: int, similarity: Optional[str] = "dot_product"
    ) -> Dict:
        """Return the index mapping with a text field and a dense vector field.

        Args:
            dims: Value for the ``dims`` setting of the ``vector`` field.
            similarity: Value for the ``similarity`` setting of the ``vector``
                field.
        """
        # NOTE(review): field names are fixed to "text"/"vector" here (and in
        # add_texts) regardless of query_field/vector_query_field - confirm
        # whether non-default field names are meant to be supported.
        return {
            "properties": {
                "text": {"type": "text"},
                "vector": {
                    "type": "dense_vector",
                    "dims": dims,
                    "index": True,
                    "similarity": similarity,
                },
            }
        }

    def _default_knn_query(
        self,
        query_vector: Optional[List[float]] = None,
        query: Optional[str] = None,
        model_id: Optional[str] = None,
        k: Optional[int] = 10,
        num_candidates: Optional[int] = 10,
    ) -> Dict:
        """Build the ``knn`` clause for a search request.

        Args:
            query_vector: Pre-computed query vector; used when ``model_id`` is
                not given.
            query: Query text; used together with ``model_id``.
            model_id: Model id placed in a ``query_vector_builder`` clause
                along with ``query``.
            k: Value for the clause's ``k``.
            num_candidates: Value for the clause's ``num_candidates``.

        Returns:
            The ``knn`` clause as a dict.

        Raises:
            ValueError: If neither (``query_vector`` without ``model_id``) nor
                (``query`` with ``model_id``) is supplied.
        """
        # NOTE(review): callers in this class never pass num_candidates, so it
        # stays at 10 even when k is larger - confirm Elasticsearch accepts
        # num_candidates < k for the targeted server version.
        knn: Dict = {
            "field": self.vector_query_field,
            "k": k,
            "num_candidates": num_candidates,
        }

        # Case 1: `query_vector` is provided, but not `model_id` -> use query_vector
        if query_vector and not model_id:
            knn["query_vector"] = query_vector

        # Case 2: `query` and `model_id` are provided, -> use query_vector_builder
        elif query and model_id:
            knn["query_vector_builder"] = {
                "text_embedding": {
                    "model_id": model_id,  # use 'model_id' argument
                    "model_text": query,  # use 'query' argument
                }
            }

        else:
            raise ValueError(
                "Either `query_vector` or `model_id` must be provided, but not both."
            )

        return knn

    @staticmethod
    def _ensure_page_content_retrievable(
        source: Optional[bool],
        fields: Optional[
            Union[List[Mapping[str, Any]], Tuple[Mapping[str, Any], ...], None]
        ],
        page_content: Optional[str],
    ) -> None:
        """Raise if ``_source`` is disabled and ``fields`` lacks ``page_content``."""
        if source:
            return
        if fields is None or not any(page_content in field for field in fields):
            raise ValueError("If source=False `page_content` field must be in `fields`")

    @staticmethod
    def _hits_to_docs_and_scores(
        response: Any,
        source: Optional[bool],
        fields: Optional[
            Union[List[Mapping[str, Any]], Tuple[Mapping[str, Any], ...], None]
        ],
        page_content: Optional[str],
    ) -> List[Tuple[Document, float]]:
        """Convert the hits of a search response into ``(Document, score)`` tuples."""
        docs_and_scores = []
        for hit in response["hits"]["hits"]:
            # Page content comes from ``_source`` when it was requested,
            # otherwise from the first value of the requested field.
            content = (
                hit["_source"][page_content]
                if source
                else hit["fields"][page_content][0]
            )
            metadata = hit["fields"] if fields else {}
            docs_and_scores.append(
                (Document(page_content=content, metadata=metadata), hit["_score"])
            )
        return docs_and_scores

    def similarity_search(
        self, query: str, k: int = 4, filter: Optional[dict] = None, **kwargs: Any
    ) -> List[Document]:
        """Pass through to `knn_search` and drop the scores.

        ``filter`` is accepted for interface compatibility but is not used.
        ``knn_search`` needs either ``query_vector`` or ``model_id``; supply one
        of them through ``kwargs``, otherwise a ``ValueError`` is raised.
        """
        results = self.knn_search(query=query, k=k, **kwargs)
        return [doc for doc, _ in results]

    def similarity_search_with_score(
        self, query: str, k: int = 10, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Pass through to `knn_search`, including scores.

        ``knn_search`` needs either ``query_vector`` or ``model_id``; supply one
        of them through ``kwargs``, otherwise a ``ValueError`` is raised.
        """
        return self.knn_search(query=query, k=k, **kwargs)

    def knn_search(
        self,
        query: Optional[str] = None,
        k: Optional[int] = 10,
        query_vector: Optional[List[float]] = None,
        model_id: Optional[str] = None,
        size: Optional[int] = 10,
        source: Optional[bool] = True,
        fields: Optional[
            Union[List[Mapping[str, Any]], Tuple[Mapping[str, Any], ...], None]
        ] = None,
        page_content: Optional[str] = "text",
    ) -> List[Tuple[Document, float]]:
        """
        Perform a k-NN search on the Elasticsearch index.

        Args:
            query (str, optional): The query text to search for.
            k (int, optional): The number of nearest neighbors to return.
            query_vector (List[float], optional): The query vector to search for.
            model_id (str, optional): The ID of the model to use for transforming the
                query text into a vector.
            size (int, optional): The number of search results to return.
            source (bool, optional): Whether to return the source of the search results.
            fields (List[Mapping[str, Any]], optional): The fields to return in the
                search results.
            page_content (str, optional): The name of the field that contains the page
                content.

        Returns:
            A list of tuples, where each tuple contains a Document object and a score.

        Raises:
            ValueError: If ``source`` is falsy and ``page_content`` is not among
                ``fields``, or if neither ``query_vector`` nor
                ``query`` + ``model_id`` is provided.
        """
        self._ensure_page_content_retrievable(source, fields, page_content)

        knn_query_body = self._default_knn_query(
            query_vector=query_vector, query=query, model_id=model_id, k=k
        )

        # Perform the kNN search on the Elasticsearch index and return the results.
        response = self.client.search(
            index=self.index_name,
            knn=knn_query_body,
            size=size,
            source=source,
            fields=fields,
        )

        return self._hits_to_docs_and_scores(response, source, fields, page_content)

    def knn_hybrid_search(
        self,
        query: Optional[str] = None,
        k: Optional[int] = 10,
        query_vector: Optional[List[float]] = None,
        model_id: Optional[str] = None,
        size: Optional[int] = 10,
        source: Optional[bool] = True,
        knn_boost: Optional[float] = 0.9,
        query_boost: Optional[float] = 0.1,
        fields: Optional[
            Union[List[Mapping[str, Any]], Tuple[Mapping[str, Any], ...], None]
        ] = None,
        page_content: Optional[str] = "text",
    ) -> List[Tuple[Document, float]]:
        """
        Perform a hybrid k-NN and text search on the Elasticsearch index.

        Args:
            query (str, optional): The query text to search for. It is also the
                text of the ``match`` clause on ``query_field``.
            k (int, optional): The number of nearest neighbors to return.
            query_vector (List[float], optional): The query vector to search for.
            model_id (str, optional): The ID of the model to use for transforming the
                query text into a vector.
            size (int, optional): The number of search results to return.
            source (bool, optional): Whether to return the source of the search results.
            knn_boost (float, optional): The boost value to apply to the k-NN search
                results.
            query_boost (float, optional): The boost value to apply to the text search
                results.
            fields (List[Mapping[str, Any]], optional): The fields to return in the
                search results.
            page_content (str, optional): The name of the field that contains the page
                content.

        Returns:
            A list of tuples, where each tuple contains a Document object and a score.

        Raises:
            ValueError: If ``source`` is falsy and ``page_content`` is not among
                ``fields``, or if neither ``query_vector`` nor
                ``query`` + ``model_id`` is provided.
        """
        self._ensure_page_content_retrievable(source, fields, page_content)

        knn_query_body = self._default_knn_query(
            query_vector=query_vector, query=query, model_id=model_id, k=k
        )

        # Modify the knn_query_body to add a "boost" parameter
        knn_query_body["boost"] = knn_boost

        # Generate the body of the standard Elasticsearch query
        match_query_body = {
            "match": {self.query_field: {"query": query, "boost": query_boost}}
        }

        # Perform the hybrid search on the Elasticsearch index and return the results.
        response = self.client.search(
            index=self.index_name,
            query=match_query_body,
            knn=knn_query_body,
            fields=fields,
            size=size,
            source=source,
        )

        return self._hits_to_docs_and_scores(response, source, fields, page_content)

    def create_knn_index(self, mapping: Dict) -> None:
        """
        Create a new k-NN index in Elasticsearch.

        Args:
            mapping (Dict): The mapping to use for the new index.

        Returns:
            None
        """
        self.client.indices.create(index=self.index_name, mappings=mapping)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[Dict[Any, Any]]] = None,
        model_id: Optional[str] = None,
        refresh_indices: bool = False,
        **kwargs: Any,
    ) -> List[str]:
        """
        Add a list of texts to the Elasticsearch index.

        The index is created first when it does not exist yet; in that case the
        ``dims`` keyword argument is required and ``similarity`` is optional.

        Args:
            texts (Iterable[str]): The texts to add to the index.
            metadatas (List[Dict[Any, Any]], optional): A list of metadata dictionaries
                to associate with the texts. Accepted for interface
                compatibility; not written to the index by this method.
            model_id (str, optional): The ID of the model to use for transforming the
                texts into vectors. Accepted for interface compatibility; the
                texts are embedded with ``self.embedding``.
            refresh_indices (bool, optional): Whether to refresh the Elasticsearch
                indices after adding the texts.
            **kwargs: Arbitrary keyword arguments. ``dims`` and ``similarity``
                are read when the index has to be created.

        Returns:
            A list of IDs for the added texts (bulk items whose result is
            ``"created"``). Empty when ``texts`` is empty.

        Raises:
            ValueError: If the index does not exist and ``dims`` is missing.
        """
        # Check if the index exists.
        if not self.client.indices.exists(index=self.index_name):
            dims = kwargs.get("dims")

            if dims is None:
                raise ValueError("ElasticKnnSearch requires 'dims' parameter")

            similarity = kwargs.get("similarity")
            optional_args: Dict[str, Any] = {}

            if similarity is not None:
                optional_args["similarity"] = similarity

            mapping = self._default_knn_mapping(dims=dims, **optional_args)
            self.create_knn_index(mapping)

        # ``texts`` is only promised to be an Iterable; materialize it once so
        # a one-shot iterator is not already exhausted when it is paired with
        # the embeddings below.
        text_list = list(texts)
        if not text_list:
            # Avoid sending an empty bulk request.
            return []

        embeddings = self.embedding.embed_documents(text_list)

        body: List[Mapping[str, Any]] = []
        for text, vector in zip(text_list, embeddings):
            body.extend(
                [
                    {"index": {"_index": self.index_name}},
                    {"text": text, "vector": vector},
                ]
            )

        responses = self.client.bulk(operations=body)

        ids = [
            item["index"]["_id"]
            for item in responses["items"]
            if item["index"]["result"] == "created"
        ]

        if refresh_indices:
            self.client.indices.refresh(index=self.index_name)

        return ids

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[Dict[Any, Any]]] = None,
        **kwargs: Any,
    ) -> ElasticKnnSearch:
        """
        Create a new ElasticKnnSearch instance and add a list of texts to the
            Elasticsearch index.

        Args:
            texts (List[str]): The texts to add to the index.
            embedding (Embeddings): The embedding model to use for transforming the
                texts into vectors.
            metadatas (List[Dict[Any, Any]], optional): A list of metadata dictionaries
                to associate with the texts.
            **kwargs: Arbitrary keyword arguments. Recognised keys:
                ``index_name`` (defaults to a random UUID4 string),
                ``es_connection``, ``es_cloud_id``, ``es_user``,
                ``es_password``, ``vector_query_field``, ``query_field``,
                ``model_id``, ``dims`` (required) and ``similarity``.

        Returns:
            A new ElasticKnnSearch instance.

        Raises:
            ValueError: If ``dims`` is not provided.
        """
        index_name = kwargs.get("index_name", str(uuid.uuid4()))
        es_connection = kwargs.get("es_connection")
        es_cloud_id = kwargs.get("es_cloud_id")
        es_user = kwargs.get("es_user")
        es_password = kwargs.get("es_password")
        vector_query_field = kwargs.get("vector_query_field", "vector")
        query_field = kwargs.get("query_field", "text")
        model_id = kwargs.get("model_id")
        dims = kwargs.get("dims")
        similarity = kwargs.get("similarity")

        if dims is None:
            raise ValueError("ElasticKnnSearch requires 'dims' parameter")

        optional_args: Dict[str, Any] = {}

        if vector_query_field is not None:
            optional_args["vector_query_field"] = vector_query_field

        if query_field is not None:
            optional_args["query_field"] = query_field

        knnvectorsearch = cls(
            index_name=index_name,
            embedding=embedding,
            es_connection=es_connection,
            es_cloud_id=es_cloud_id,
            es_user=es_user,
            es_password=es_password,
            **optional_args,
        )

        # add_texts reads ``similarity`` when it has to create the index, so
        # forward it instead of silently dropping the caller's choice.
        add_texts_args = dict(optional_args)
        if similarity is not None:
            add_texts_args["similarity"] = similarity

        # Encode the provided texts and add them to the newly created index.
        knnvectorsearch.add_texts(
            texts,
            metadatas=metadatas,
            model_id=model_id,
            dims=dims,
            **add_texts_args,
        )

        return knnvectorsearch