Source code for langchain_community.vectorstores.yellowbrick

from __future__ import annotations

import atexit
import csv
import enum
import json
import logging
import uuid
from contextlib import contextmanager
from io import StringIO
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Generator,
    Iterable,
    List,
    Optional,
    Tuple,
    Type,
)

from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore

from langchain_community.docstore.document import Document

if TYPE_CHECKING:
    from psycopg2.extensions import connection as PgConnection
    from psycopg2.extensions import cursor as PgCursor



[docs]
class Yellowbrick(VectorStore):
    """Yellowbrick as a vector database.
    Example:
        .. code-block:: python
            from langchain_community.vectorstores import Yellowbrick
            from langchain_community.embeddings.openai import OpenAIEmbeddings
            ...
    """

    class IndexType(str, enum.Enum):
        """Enumerator for the supported Index types within Yellowbrick."""

        NONE = "none"
        LSH = "lsh"

    class IndexParams:
        """Parameters for configuring a Yellowbrick index."""

        def __init__(
            self,
            index_type: Optional["Yellowbrick.IndexType"] = None,
            params: Optional[Dict[str, Any]] = None,
        ):
            if index_type is None:
                index_type = Yellowbrick.IndexType.NONE
            self.index_type = index_type
            self.params = params or {}

        def get_param(self, key: str, default: Any = None) -> Any:
            return self.params.get(key, default)


[docs]
    def __init__(
        self,
        embedding: Embeddings,
        connection_string: str,
        table: str,
        *,
        schema: Optional[str] = None,
        logger: Optional[logging.Logger] = None,
        drop: bool = False,
    ) -> None:
        """Initialize with yellowbrick client.
        Args:
            embedding: Embedding operator
            connection_string: Format 'postgres://username:password@host:port/database'
            table: Table used to store / retrieve embeddings from
        """
        from psycopg2 import extras

        extras.register_uuid()

        if logger:
            self.logger = logger
        else:
            self.logger = logging.getLogger(__name__)
            self.logger.setLevel(logging.ERROR)
            handler = logging.StreamHandler()
            handler.setLevel(logging.DEBUG)
            formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

        if not isinstance(embedding, Embeddings):
            self.logger.error("embeddings input must be Embeddings object.")
            return

        self.LSH_INDEX_TABLE: str = "_lsh_index"
        self.LSH_HYPERPLANE_TABLE: str = "_lsh_hyperplane"
        self.CONTENT_TABLE: str = "_content"

        self.connection_string = connection_string
        self.connection = Yellowbrick.DatabaseConnection(connection_string, self.logger)
        atexit.register(self.connection.close_connection)

        self._schema = schema
        self._table = table
        self._embedding = embedding
        self._max_embedding_len = None
        self._check_database_utf8()

        with self.connection.get_cursor() as cursor:
            if drop:
                self.drop(table=self._table, schema=self._schema, cursor=cursor)
                self.drop(
                    table=self._table + self.CONTENT_TABLE,
                    schema=self._schema,
                    cursor=cursor,
                )
                self._drop_lsh_index_tables(cursor)

            self._create_schema(cursor)
            self._create_table(cursor)


    class DatabaseConnection:
        _instance = None
        _connection_string: str
        _connection: Optional["PgConnection"] = None
        _logger: logging.Logger

        def __new__(
            cls, connection_string: str, logger: logging.Logger
        ) -> "Yellowbrick.DatabaseConnection":
            if cls._instance is None:
                cls._instance = super().__new__(cls)
                cls._instance._connection_string = connection_string
                cls._instance._logger = logger
            return cls._instance

        def close_connection(self) -> None:
            if self._connection and not self._connection.closed:
                self._connection.close()
                self._connection = None

        def get_connection(self) -> "PgConnection":
            import psycopg2

            if not self._connection or self._connection.closed:
                self._connection = psycopg2.connect(self._connection_string)
                self._connection.autocommit = False

            return self._connection

        @contextmanager
        def get_managed_connection(self) -> Generator["PgConnection", None, None]:
            from psycopg2 import DatabaseError

            conn = self.get_connection()
            try:
                yield conn
            except DatabaseError as e:
                conn.rollback()
                self._logger.error(
                    "Database error occurred, rolling back transaction.", exc_info=True
                )
                raise RuntimeError("Database transaction failed.") from e
            else:
                conn.commit()

        @contextmanager
        def get_cursor(self) -> Generator["PgCursor", None, None]:
            with self.get_managed_connection() as conn:
                cursor = conn.cursor()
                try:
                    yield cursor
                finally:
                    cursor.close()

    def _create_schema(self, cursor: "PgCursor") -> None:
        """
        Helper function: create schema if not exists
        """
        from psycopg2 import sql

        if self._schema:
            cursor.execute(
                sql.SQL(
                    """
                    CREATE SCHEMA IF NOT EXISTS {s}
                """
                ).format(
                    s=sql.Identifier(self._schema),
                )
            )

    def _create_table(self, cursor: "PgCursor") -> None:
        """
        Helper function: create table if not exists
        """
        from psycopg2 import sql

        schema_prefix = (self._schema,) if self._schema else ()
        t = sql.Identifier(*schema_prefix, self._table + self.CONTENT_TABLE)
        c = sql.Identifier(self._table + self.CONTENT_TABLE + "_pk_doc_id")
        cursor.execute(
            sql.SQL(
                """
                CREATE TABLE IF NOT EXISTS {t} (
                doc_id UUID NOT NULL,
                text VARCHAR(60000) NOT NULL,
                metadata VARCHAR(1024) NOT NULL,
                CONSTRAINT {c} PRIMARY KEY (doc_id))
                DISTRIBUTE ON (doc_id) SORT ON (doc_id)
            """
            ).format(
                t=t,
                c=c,
            )
        )

        schema_prefix = (self._schema,) if self._schema else ()
        t1 = sql.Identifier(*schema_prefix, self._table)
        t2 = sql.Identifier(*schema_prefix, self._table + self.CONTENT_TABLE)
        c1 = sql.Identifier(
            self._table + self.CONTENT_TABLE + "_pk_doc_id_embedding_id"
        )
        c2 = sql.Identifier(self._table + self.CONTENT_TABLE + "_fk_doc_id")
        cursor.execute(
            sql.SQL(
                """
                CREATE TABLE IF NOT EXISTS {t1} (
                doc_id UUID NOT NULL,
                embedding_id SMALLINT NOT NULL,
                embedding FLOAT NOT NULL,
                CONSTRAINT {c1} PRIMARY KEY (doc_id, embedding_id),
                CONSTRAINT {c2} FOREIGN KEY (doc_id) REFERENCES {t2}(doc_id))
                DISTRIBUTE ON (doc_id) SORT ON (doc_id)
            """
            ).format(
                t1=t1,
                t2=t2,
                c1=c1,
                c2=c2,
            )
        )


[docs]
    def drop(
        self,
        table: str,
        schema: Optional[str] = None,
        cursor: Optional["PgCursor"] = None,
    ) -> None:
        """
        Helper function: Drop data. If a cursor is provided, use it;
        otherwise, obtain a new cursor for the operation.
        """
        if cursor is None:
            with self.connection.get_cursor() as cursor:
                self._drop_table(cursor, table, schema=schema)
        else:
            self._drop_table(cursor, table, schema=schema)


    def _drop_table(
        self,
        cursor: "PgCursor",
        table: str,
        schema: Optional[str] = None,
    ) -> None:
        """
        Executes the drop table command using the given cursor.
        """
        from psycopg2 import sql

        if schema:
            table_name = sql.Identifier(schema, table)
        else:
            table_name = sql.Identifier(table)

        drop_table_query = sql.SQL(
            """
        DROP TABLE IF EXISTS {} CASCADE
        """
        ).format(table_name)
        cursor.execute(drop_table_query)

    def _check_database_utf8(self) -> bool:
        """
        Helper function: Test the database is UTF-8 encoded
        """
        with self.connection.get_cursor() as cursor:
            query = """
                SELECT pg_encoding_to_char(encoding)
                FROM pg_database
                WHERE datname = current_database();
            """
            cursor.execute(query)
            encoding = cursor.fetchone()[0]

        if encoding.lower() == "utf8" or encoding.lower() == "utf-8":
            return True
        else:
            raise Exception("Database encoding is not UTF-8")

        return False


[docs]
    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        batch_size = 10000

        texts = list(texts)
        embeddings = self._embedding.embed_documents(list(texts))
        results = []
        if not metadatas:
            metadatas = [{} for _ in texts]

        index_params = kwargs.get("index_params") or Yellowbrick.IndexParams()

        with self.connection.get_cursor() as cursor:
            content_io = StringIO()
            embeddings_io = StringIO()
            content_writer = csv.writer(
                content_io, delimiter="\t", quotechar='"', quoting=csv.QUOTE_MINIMAL
            )
            embeddings_writer = csv.writer(
                embeddings_io, delimiter="\t", quotechar='"', quoting=csv.QUOTE_MINIMAL
            )
            current_batch_size = 0

            for i, text in enumerate(texts):
                doc_uuid = str(uuid.uuid4())
                results.append(doc_uuid)

                content_writer.writerow([doc_uuid, text, json.dumps(metadatas[i])])

                for embedding_id, embedding in enumerate(embeddings[i]):
                    embeddings_writer.writerow([doc_uuid, embedding_id, embedding])

                current_batch_size += 1

                if current_batch_size >= batch_size:
                    self._copy_to_db(cursor, content_io, embeddings_io)

                    content_io.seek(0)
                    content_io.truncate(0)
                    embeddings_io.seek(0)
                    embeddings_io.truncate(0)
                    current_batch_size = 0

            if current_batch_size > 0:
                self._copy_to_db(cursor, content_io, embeddings_io)

        if index_params.index_type == Yellowbrick.IndexType.LSH:
            self._update_index(index_params, uuid.UUID(doc_uuid))

        return results


    def _copy_to_db(
        self, cursor: "PgCursor", content_io: StringIO, embeddings_io: StringIO
    ) -> None:
        content_io.seek(0)
        embeddings_io.seek(0)

        from psycopg2 import sql

        schema_prefix = (self._schema,) if self._schema else ()
        table = sql.Identifier(*schema_prefix, self._table + self.CONTENT_TABLE)
        content_copy_query = sql.SQL(
            """
            COPY {table} (doc_id, text, metadata) FROM 
            STDIN WITH (FORMAT CSV, DELIMITER E'\\t', QUOTE '\"')
        """
        ).format(table=table)
        cursor.copy_expert(content_copy_query, content_io)

        schema_prefix = (self._schema,) if self._schema else ()
        table = sql.Identifier(*schema_prefix, self._table)
        embeddings_copy_query = sql.SQL(
            """
            COPY {table} (doc_id, embedding_id, embedding) FROM 
            STDIN WITH (FORMAT CSV, DELIMITER E'\\t', QUOTE '\"')
        """
        ).format(table=table)
        cursor.copy_expert(embeddings_copy_query, embeddings_io)


[docs]
    @classmethod
    def from_texts(
        cls: Type[Yellowbrick],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        connection_string: str = "",
        table: str = "langchain",
        schema: str = "public",
        drop: bool = False,
        **kwargs: Any,
    ) -> Yellowbrick:
        """Add texts to the vectorstore index.
        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            connection_string: URI to Yellowbrick instance
            embedding: Embedding function
            table: table to store embeddings
            kwargs: vectorstore specific parameters
        """
        vss = cls(
            embedding=embedding,
            connection_string=connection_string,
            table=table,
            schema=schema,
            drop=drop,
        )
        vss.add_texts(texts=texts, metadatas=metadatas, **kwargs)
        return vss



[docs]
    def delete(
        self,
        ids: Optional[List[str]] = None,
        delete_all: Optional[bool] = None,
        **kwargs: Any,
    ) -> None:
        """Delete vectors by uuids.

        Args:
            ids: List of ids to delete, where each id is a uuid string.
        """
        from psycopg2 import sql

        if delete_all:
            where_sql = sql.SQL(
                """
                WHERE 1=1
            """
            )
        elif ids is not None:
            uuids = tuple(sql.Literal(id) for id in ids)
            ids_formatted = sql.SQL(", ").join(uuids)
            where_sql = sql.SQL(
                """
                WHERE doc_id IN ({ids})
            """
            ).format(
                ids=ids_formatted,
            )
        else:
            raise ValueError("Either ids or delete_all must be provided.")

        schema_prefix = (self._schema,) if self._schema else ()
        with self.connection.get_cursor() as cursor:
            table_identifier = sql.Identifier(
                *schema_prefix, self._table + self.CONTENT_TABLE
            )
            query = sql.SQL("DELETE FROM {table} {where_sql}").format(
                table=table_identifier, where_sql=where_sql
            )
            cursor.execute(query)

            table_identifier = sql.Identifier(*schema_prefix, self._table)
            query = sql.SQL("DELETE FROM {table} {where_sql}").format(
                table=table_identifier, where_sql=where_sql
            )
            cursor.execute(query)

            if self._table_exists(
                cursor, self._table + self.LSH_INDEX_TABLE, *schema_prefix
            ):
                table_identifier = sql.Identifier(
                    *schema_prefix, self._table + self.LSH_INDEX_TABLE
                )
                query = sql.SQL("DELETE FROM {table} {where_sql}").format(
                    table=table_identifier, where_sql=where_sql
                )
                cursor.execute(query)

        return None


    def _table_exists(
        self, cursor: "PgCursor", table_name: str, schema: str = "public"
    ) -> bool:
        """
        Checks if a table exists in the given schema
        """
        from psycopg2 import sql

        schema = sql.Literal(schema)
        table_name = sql.Literal(table_name)
        cursor.execute(
            sql.SQL(
                """
                SELECT COUNT(*)
                FROM sys.table t INNER JOIN sys.schema s ON t.schema_id = s.schema_id
                WHERE s.name = {schema} AND t.name = {table_name}
            """
            ).format(
                schema=schema,
                table_name=table_name,
            )
        )
        return cursor.fetchone()[0] > 0

    def _generate_vector_uuid(self, vector: List[float]) -> uuid.UUID:
        import hashlib

        vector_str = ",".join(map(str, vector))
        hash_object = hashlib.sha1(vector_str.encode())
        hash_digest = hash_object.digest()
        vector_uuid = uuid.UUID(bytes=hash_digest[:16])
        return vector_uuid


[docs]
    def similarity_search_with_score_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Perform a similarity search with Yellowbrick with vector

        Args:
            embedding (List[float]): query embedding
            k (int, optional): Top K neighbors to retrieve. Defaults to 4.

            NOTE: Please do not let end-user fill this and always be aware
                  of SQL injection.

        Returns:
            List[Document, float]: List of Documents and scores
        """
        from psycopg2 import sql
        from psycopg2.extras import execute_values

        index_params = kwargs.get("index_params") or Yellowbrick.IndexParams()

        with self.connection.get_cursor() as cursor:
            tmp_embeddings_table = "tmp_" + self._table
            tmp_doc_id = self._generate_vector_uuid(embedding)
            create_table_query = sql.SQL(
                """ 
                CREATE TEMPORARY TABLE {} (
                doc_id UUID,
                embedding_id SMALLINT,
                embedding FLOAT)
                ON COMMIT DROP
                DISTRIBUTE REPLICATE
            """
            ).format(sql.Identifier(tmp_embeddings_table))
            cursor.execute(create_table_query)
            data_input = [
                (str(tmp_doc_id), embedding_id, embedding_value)
                for embedding_id, embedding_value in enumerate(embedding)
            ]
            insert_query = sql.SQL(
                "INSERT INTO {} (doc_id, embedding_id, embedding) VALUES %s"
            ).format(sql.Identifier(tmp_embeddings_table))
            execute_values(cursor, insert_query, data_input)

            v1 = sql.Identifier(tmp_embeddings_table)
            schema_prefix = (self._schema,) if self._schema else ()
            v2 = sql.Identifier(*schema_prefix, self._table)
            v3 = sql.Identifier(*schema_prefix, self._table + self.CONTENT_TABLE)
            if index_params.index_type == Yellowbrick.IndexType.LSH:
                tmp_hash_table = self._table + "_tmp_hash"
                self._generate_tmp_lsh_hashes(
                    cursor,
                    tmp_embeddings_table,
                    tmp_hash_table,
                )

                schema_prefix = (self._schema,) if self._schema else ()
                lsh_index = sql.Identifier(
                    *schema_prefix, self._table + self.LSH_INDEX_TABLE
                )
                input_hash_table = sql.Identifier(tmp_hash_table)
                sql_query = sql.SQL(
                    """
                    WITH index_docs AS (
                    SELECT
                        t1.doc_id,
                        SUM(ABS(t1.hash-t2.hash)) as hamming_distance
                    FROM
                        {lsh_index} t1
                    INNER JOIN
                        {input_hash_table} t2
                    ON t1.hash_index = t2.hash_index
                    GROUP BY t1.doc_id
                    HAVING hamming_distance <= {hamming_distance}
                    )
                    SELECT
                        text,
                        metadata,
                       SUM(v1.embedding * v2.embedding) /
                        (SQRT(SUM(v1.embedding * v1.embedding)) *
                       SQRT(SUM(v2.embedding * v2.embedding))) AS score
                    FROM
                        {v1} v1
                    INNER JOIN
                        {v2} v2
                    ON v1.embedding_id = v2.embedding_id
                    INNER JOIN
                        {v3} v3
                    ON v2.doc_id = v3.doc_id
                    INNER JOIN
                        index_docs v4
                    ON v2.doc_id = v4.doc_id
                    GROUP BY v3.doc_id, v3.text, v3.metadata
                    ORDER BY score DESC
                    LIMIT %s
                """
                ).format(
                    v1=v1,
                    v2=v2,
                    v3=v3,
                    lsh_index=lsh_index,
                    input_hash_table=input_hash_table,
                    hamming_distance=sql.Literal(
                        index_params.get_param("hamming_distance", 0)
                    ),
                )
                cursor.execute(
                    sql_query,
                    (k,),
                )
                results = cursor.fetchall()
            else:
                sql_query = sql.SQL(
                    """
                    SELECT 
                        text,
                        metadata,
                        score
                    FROM
                        (SELECT
                            v2.doc_id doc_id,
                            SUM(v1.embedding * v2.embedding) /
                            (SQRT(SUM(v1.embedding * v1.embedding)) *
                            SQRT(SUM(v2.embedding * v2.embedding))) AS score
                        FROM
                            {v1} v1
                        INNER JOIN
                            {v2} v2
                        ON v1.embedding_id = v2.embedding_id
                        GROUP BY v2.doc_id
                        ORDER BY score DESC LIMIT %s
                        ) v4
                    INNER JOIN
                        {v3} v3
                    ON v4.doc_id = v3.doc_id
                    ORDER BY score DESC
                """
                ).format(
                    v1=v1,
                    v2=v2,
                    v3=v3,
                )
                cursor.execute(sql_query, (k,))
                results = cursor.fetchall()

        documents: List[Tuple[Document, float]] = []
        for result in results:
            metadata = json.loads(result[1]) or {}
            doc = Document(page_content=result[0], metadata=metadata)
            documents.append((doc, result[2]))

        return documents



[docs]
    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Perform a similarity search with Yellowbrick

        Args:
            query (str): query string
            k (int, optional): Top K neighbors to retrieve. Defaults to 4.

            NOTE: Please do not let end-user fill this and always be aware
                  of SQL injection.

        Returns:
            List[Document]: List of Documents
        """
        embedding = self._embedding.embed_query(query)
        documents = self.similarity_search_with_score_by_vector(
            embedding=embedding, k=k, **kwargs
        )
        return [doc for doc, _ in documents]



[docs]
    def similarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Perform a similarity search with Yellowbrick

        Args:
            query (str): query string
            k (int, optional): Top K neighbors to retrieve. Defaults to 4.

            NOTE: Please do not let end-user fill this and always be aware
                  of SQL injection.

        Returns:
            List[Document]: List of (Document, similarity)
        """
        embedding = self._embedding.embed_query(query)
        documents = self.similarity_search_with_score_by_vector(
            embedding=embedding, k=k, **kwargs
        )
        return documents



[docs]
    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Perform a similarity search with Yellowbrick by vectors

        Args:
            embedding (List[float]): query embedding
            k (int, optional): Top K neighbors to retrieve. Defaults to 4.

            NOTE: Please do not let end-user fill this and always be aware
                  of SQL injection.

        Returns:
            List[Document]: List of documents
        """
        documents = self.similarity_search_with_score_by_vector(
            embedding=embedding, k=k, **kwargs
        )
        return [doc for doc, _ in documents]


    def _update_lsh_hashes(
        self,
        cursor: "PgCursor",
        doc_id: Optional[uuid.UUID] = None,
    ) -> None:
        """Add hashes to LSH index"""
        from psycopg2 import sql

        schema_prefix = (self._schema,) if self._schema else ()
        lsh_hyperplane_table = sql.Identifier(
            *schema_prefix, self._table + self.LSH_HYPERPLANE_TABLE
        )
        lsh_index_table_id = sql.Identifier(
            *schema_prefix, self._table + self.LSH_INDEX_TABLE
        )
        embedding_table_id = sql.Identifier(*schema_prefix, self._table)
        query_prefix_id = sql.SQL("INSERT INTO {}").format(lsh_index_table_id)
        condition = (
            sql.SQL("WHERE e.doc_id = {doc_id}").format(doc_id=sql.Literal(str(doc_id)))
            if doc_id
            else sql.SQL("")
        )
        group_by = sql.SQL("GROUP BY 1, 2")

        input_query = sql.SQL(
            """
            {query_prefix}
            SELECT
                e.doc_id as doc_id,
                h.id as hash_index,
                CASE WHEN SUM(e.embedding * h.hyperplane) > 0 THEN 1 ELSE 0 END as hash
            FROM {embedding_table} e
            INNER JOIN {hyperplanes} h ON e.embedding_id = h.hyperplane_id
            {condition}
            {group_by}
        """
        ).format(
            query_prefix=query_prefix_id,
            embedding_table=embedding_table_id,
            hyperplanes=lsh_hyperplane_table,
            condition=condition,
            group_by=group_by,
        )
        cursor.execute(input_query)

    def _generate_tmp_lsh_hashes(
        self, cursor: "PgCursor", tmp_embedding_table: str, tmp_hash_table: str
    ) -> None:
        """Generate temp LSH"""
        from psycopg2 import sql

        schema_prefix = (self._schema,) if self._schema else ()
        lsh_hyperplane_table = sql.Identifier(
            *schema_prefix, self._table + self.LSH_HYPERPLANE_TABLE
        )
        tmp_embedding_table_id = sql.Identifier(tmp_embedding_table)
        tmp_hash_table_id = sql.Identifier(tmp_hash_table)
        query_prefix = sql.SQL("CREATE TEMPORARY TABLE {} ON COMMIT DROP AS").format(
            tmp_hash_table_id
        )
        group_by = sql.SQL("GROUP BY 1")

        input_query = sql.SQL(
            """
            {query_prefix}
            SELECT
                h.id as hash_index,
                CASE WHEN SUM(e.embedding * h.hyperplane) > 0 THEN 1 ELSE 0 END as hash
            FROM {embedding_table} e
            INNER JOIN {hyperplanes} h ON e.embedding_id = h.hyperplane_id
            {group_by}
            DISTRIBUTE REPLICATE
        """
        ).format(
            query_prefix=query_prefix,
            embedding_table=tmp_embedding_table_id,
            hyperplanes=lsh_hyperplane_table,
            group_by=group_by,
        )
        cursor.execute(input_query)

    def _populate_hyperplanes(self, cursor: "PgCursor", num_hyperplanes: int) -> None:
        """Generate random hyperplanes and store in Yellowbrick"""
        from psycopg2 import sql

        schema_prefix = (self._schema,) if self._schema else ()
        hyperplanes_table = sql.Identifier(
            *schema_prefix, self._table + self.LSH_HYPERPLANE_TABLE
        )
        cursor.execute(sql.SQL("SELECT COUNT(*) FROM {t}").format(t=hyperplanes_table))
        if cursor.fetchone()[0] > 0:
            return

        t = sql.Identifier(*schema_prefix, self._table)
        cursor.execute(sql.SQL("SELECT MAX(embedding_id) FROM {t}").format(t=t))
        num_dimensions = cursor.fetchone()[0]
        num_dimensions += 1

        insert_query = sql.SQL(
            """
            WITH parameters AS (
                SELECT {num_hyperplanes} AS num_hyperplanes,
                    {dims_per_hyperplane} AS dims_per_hyperplane
            )
            INSERT INTO {hyperplanes_table} (id, hyperplane_id, hyperplane)
                SELECT id, hyperplane_id, (random() * 2 - 1) AS hyperplane
                FROM
                (SELECT range-1 id FROM sys.rowgenerator
                    WHERE range BETWEEN 1 AND
                    (SELECT num_hyperplanes FROM parameters) AND
                    worker_lid = 0 AND thread_id = 0) a,
                (SELECT range-1 hyperplane_id FROM sys.rowgenerator
                    WHERE range BETWEEN 1 AND
                    (SELECT dims_per_hyperplane FROM parameters) AND
                    worker_lid = 0 AND thread_id = 0) b
        """
        ).format(
            num_hyperplanes=sql.Literal(num_hyperplanes),
            dims_per_hyperplane=sql.Literal(num_dimensions),
            hyperplanes_table=hyperplanes_table,
        )
        cursor.execute(insert_query)

    def _create_lsh_index_tables(self, cursor: "PgCursor") -> None:
        """Create LSH index and hyperplane tables"""
        from psycopg2 import sql

        schema_prefix = (self._schema,) if self._schema else ()
        t1 = sql.Identifier(*schema_prefix, self._table + self.LSH_INDEX_TABLE)
        t2 = sql.Identifier(*schema_prefix, self._table + self.CONTENT_TABLE)
        c1 = sql.Identifier(self._table + self.LSH_INDEX_TABLE + "_pk_doc_id")
        c2 = sql.Identifier(self._table + self.LSH_INDEX_TABLE + "_fk_doc_id")
        cursor.execute(
            sql.SQL(
                """
                CREATE TABLE IF NOT EXISTS {t1} (
                doc_id UUID NOT NULL,
                hash_index SMALLINT NOT NULL,
                hash SMALLINT NOT NULL,
                CONSTRAINT {c1} PRIMARY KEY (doc_id, hash_index),
                CONSTRAINT {c2} FOREIGN KEY (doc_id) REFERENCES {t2}(doc_id))
                DISTRIBUTE ON (doc_id) SORT ON (doc_id)
            """
            ).format(
                t1=t1,
                t2=t2,
                c1=c1,
                c2=c2,
            )
        )

        schema_prefix = (self._schema,) if self._schema else ()
        t = sql.Identifier(*schema_prefix, self._table + self.LSH_HYPERPLANE_TABLE)
        c = sql.Identifier(self._table + self.LSH_HYPERPLANE_TABLE + "_pk_id_hp_id")
        cursor.execute(
            sql.SQL(
                """
                CREATE TABLE IF NOT EXISTS {t} (
                id SMALLINT NOT NULL,
                hyperplane_id SMALLINT NOT NULL,
                hyperplane FLOAT NOT NULL,
                CONSTRAINT {c} PRIMARY KEY (id, hyperplane_id))
                DISTRIBUTE REPLICATE SORT ON (id)
            """
            ).format(
                t=t,
                c=c,
            )
        )

    def _drop_lsh_index_tables(self, cursor: "PgCursor") -> None:
        """Drop LSH index tables"""
        self.drop(
            schema=self._schema, table=self._table + self.LSH_INDEX_TABLE, cursor=cursor
        )
        self.drop(
            schema=self._schema,
            table=self._table + self.LSH_HYPERPLANE_TABLE,
            cursor=cursor,
        )


[docs]
    def create_index(self, index_params: Yellowbrick.IndexParams) -> None:
        """Create index from existing vectors"""
        if index_params.index_type == Yellowbrick.IndexType.LSH:
            with self.connection.get_cursor() as cursor:
                self._drop_lsh_index_tables(cursor)
                self._create_lsh_index_tables(cursor)
                self._populate_hyperplanes(
                    cursor, index_params.get_param("num_hyperplanes", 128)
                )
                self._update_lsh_hashes(cursor)



[docs]
    def drop_index(self, index_params: Yellowbrick.IndexParams) -> None:
        """Drop an index"""
        if index_params.index_type == Yellowbrick.IndexType.LSH:
            with self.connection.get_cursor() as cursor:
                self._drop_lsh_index_tables(cursor)


    def _update_index(
        self, index_params: Yellowbrick.IndexParams, doc_id: uuid.UUID
    ) -> None:
        """Update an index with a new or modified embedding in the embeddings table"""
        if index_params.index_type == Yellowbrick.IndexType.LSH:
            with self.connection.get_cursor() as cursor:
                self._update_lsh_hashes(cursor, doc_id)


[docs]
    def migrate_schema_v1_to_v2(self) -> None:
        from psycopg2 import sql

        try:
            with self.connection.get_cursor() as cursor:
                schema_prefix = (self._schema,) if self._schema else ()
                embeddings = sql.Identifier(*schema_prefix, self._table)
                old_embeddings = sql.Identifier(*schema_prefix, self._table + "_v1")
                content = sql.Identifier(
                    *schema_prefix, self._table + self.CONTENT_TABLE
                )
                alter_table_query = sql.SQL("ALTER TABLE {t1} RENAME TO {t2}").format(
                    t1=embeddings,
                    t2=old_embeddings,
                )
                cursor.execute(alter_table_query)

                self._create_table(cursor)

                insert_query = sql.SQL(
                    """
                    INSERT INTO {t1} (doc_id, embedding_id, embedding) 
                    SELECT id, embedding_id, embedding FROM {t2}
                """
                ).format(
                    t1=embeddings,
                    t2=old_embeddings,
                )
                cursor.execute(insert_query)

                insert_content_query = sql.SQL(
                    """
                    INSERT INTO {t1} (doc_id, text, metadata) 
                    SELECT DISTINCT id, text, metadata FROM {t2}
                """
                ).format(t1=content, t2=old_embeddings)
                cursor.execute(insert_content_query)
        except Exception as e:
            raise RuntimeError(f"Failed to migrate schema: {e}") from e