Source code for langchain_community.vectorstores.pathway

"""
Pathway Vector Store client.


The Pathway Vector Server is a pipeline written in the Pathway framweork which indexes
all files in a given folder, embeds them, and builds a vector index. The pipeline reacts
to changes in source files, automatically updating appropriate index entries.

The PathwayVectorClient implements the LangChain VectorStore interface and queries the
PathwayVectorServer to retrieve up-to-date documents.

You can use the client with managed instances of Pathway Vector Store, or run your own
instance as described at https://pathway.com/developers/user-guide/llm-xpack/vectorstore_pipeline/

"""

import json
import logging
from typing import Any, Callable, Iterable, List, Optional, Tuple

import requests
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore


# Copied from https://github.com/pathwaycom/pathway/blob/main/python/pathway/xpacks/llm/vector_store.py
# to remove dependency on Pathway library.
class _VectorStoreClient:
    def __init__(
        self,
        host: Optional[str] = None,
        port: Optional[int] = None,
        url: Optional[str] = None,
    ):
        """
        A client you can use to query :py:class:`VectorStoreServer`.

        Please provide aither the `url`, or `host` and `port`.

        Args:
            - host: host on which `:py:class:`VectorStoreServer` listens
            - port: port on which `:py:class:`VectorStoreServer` listens
            - url: url at which `:py:class:`VectorStoreServer` listens
        """
        err = "Either (`host` and `port`) or `url` must be provided, but not both."
        if url is not None:
            if host or port:
                raise ValueError(err)
            self.url = url
        else:
            if host is None:
                raise ValueError(err)
            port = port or 80
            self.url = f"http://{host}:{port}"

    def query(
        self, query: str, k: int = 3, metadata_filter: Optional[str] = None
    ) -> List[dict]:
        """
        Perform a query to the vector store and fetch results.

        Args:
            - query:
            - k: number of documents to be returned
            - metadata_filter: optional string representing the metadata filtering query
                in the JMESPath format. The search will happen only for documents
                satisfying this filtering.
        """

        data = {"query": query, "k": k}
        if metadata_filter is not None:
            data["metadata_filter"] = metadata_filter
        url = self.url + "/v1/retrieve"
        response = requests.post(
            url,
            data=json.dumps(data),
            headers={"Content-Type": "application/json"},
            timeout=3,
        )
        responses = response.json()
        return sorted(responses, key=lambda x: x["dist"])

    # Make an alias
    __call__ = query

    def get_vectorstore_statistics(self) -> dict:
        """Fetch basic statistics about the vector store."""

        url = self.url + "/v1/statistics"
        response = requests.post(
            url,
            json={},
            headers={"Content-Type": "application/json"},
        )
        responses = response.json()
        return responses

    def get_input_files(
        self,
        metadata_filter: Optional[str] = None,
        filepath_globpattern: Optional[str] = None,
    ) -> list:
        """
        Fetch information on documents in the vector store.

        Args:
            metadata_filter: optional string representing the metadata filtering query
                in the JMESPath format. The search will happen only for documents
                satisfying this filtering.
            filepath_globpattern: optional glob pattern specifying which documents
                will be searched for this query.
        """
        url = self.url + "/v1/inputs"
        response = requests.post(
            url,
            json={
                "metadata_filter": metadata_filter,
                "filepath_globpattern": filepath_globpattern,
            },
            headers={"Content-Type": "application/json"},
        )
        responses = response.json()
        return responses


[docs] class PathwayVectorClient(VectorStore): """ VectorStore connecting to Pathway Vector Store. """
[docs] def __init__( self, host: Optional[str] = None, port: Optional[int] = None, url: Optional[str] = None, ) -> None: """ A client you can use to query Pathway Vector Store. Please provide aither the `url`, or `host` and `port`. Args: - host: host on which Pathway Vector Store listens - port: port on which Pathway Vector Store listens - url: url at which Pathway Vector Store listens """ self.client = _VectorStoreClient(host, port, url)
[docs] def add_texts( self, texts: Iterable[str], metadatas: Optional[List[dict]] = None, **kwargs: Any, ) -> List[str]: """Pathway is not suitable for this method.""" raise NotImplementedError( "Pathway vector store does not support adding or removing texts" " from client." )
[docs] @classmethod def from_texts( cls, texts: List[str], embedding: Embeddings, metadatas: Optional[List[dict]] = None, **kwargs: Any, ) -> "PathwayVectorClient": raise NotImplementedError( "Pathway vector store does not support initializing from_texts." )
[docs] def similarity_search_with_score( self, query: str, k: int = 4, metadata_filter: Optional[str] = None, ) -> List[Tuple[Document, float]]: """Run similarity search with Pathway with distance. Args: - query (str): Query text to search for. - k (int): Number of results to return. Defaults to 4. - metadata_filter (Optional[str]): Filter by metadata. Filtering query should be in JMESPath format. Defaults to None. Returns: List[Tuple[Document, float]]: List of documents most similar to the query text and cosine distance in float for each. Lower score represents more similarity. """ rets = self.client(query=query, k=k, metadata_filter=metadata_filter) return [ (Document(page_content=ret["text"], metadata=ret["metadata"]), ret["dist"]) for ret in rets ]
def _select_relevance_score_fn(self) -> Callable[[float], float]: return self._cosine_relevance_score_fn
[docs] def get_vectorstore_statistics(self) -> dict: """Fetch basic statistics about the Vector Store.""" return self.client.get_vectorstore_statistics()
[docs] def get_input_files( self, metadata_filter: Optional[str] = None, filepath_globpattern: Optional[str] = None, ) -> list: """List files indexed by the Vector Store.""" return self.client.get_input_files(metadata_filter, filepath_globpattern)