# Source code for langchain_community.chat_message_histories.kafka

"""Kafka-based chat message history by using confluent-kafka-python.
confluent-kafka-python is under Apache 2.0 license.
https://github.com/confluentinc/confluent-kafka-python
"""

from __future__ import annotations

import json
import logging
import time
from enum import Enum
from typing import TYPE_CHECKING, List, Optional, Sequence

from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.messages import BaseMessage, message_to_dict, messages_from_dict

if TYPE_CHECKING:
    from confluent_kafka import TopicPartition
    from confluent_kafka.admin import AdminClient

# Module-level logger shared by every helper and method in this file.
logger = logging.getLogger(__name__)

# Kafka client configuration key that carries the broker address list.
BOOTSTRAP_SERVERS_CONFIG = "bootstrap.servers"

# Defaults applied when a topic has to be created for a new session.
DEFAULT_TTL_MS = 7 * 24 * 60 * 60 * 1000  # 7 days, in milliseconds
DEFAULT_REPLICATION_FACTOR = 1
DEFAULT_PARTITION = 3



class ConsumeStartPosition(Enum):
    """Where the Kafka consumer starts when reading chat history messages.

    Members:
        LAST_CONSUMED: Continue from the last consumed offset.
        EARLIEST: Start consuming from the beginning.
        LATEST: Start consuming from the latest offset.
    """

    LAST_CONSUMED = 1  # resume after the last consumed offset
    EARLIEST = 2  # rewind to the beginning
    LATEST = 3  # jump to the latest offset




def ensure_topic_exists(
    admin_client: AdminClient,
    topic_name: str,
    replication_factor: int,
    partition: int,
    ttl_ms: int,
) -> int:
    """Make sure the Kafka topic exists and report its number of partitions.

    A topic that already exists is left untouched: its configuration is not
    changed and its current partition count is returned. Otherwise the topic
    is created with the requested settings.

    Args:
        admin_client: Admin client used to list and create topics.
        topic_name: Name of the topic to look up or create.
        replication_factor: Replication factor used when creating the topic.
        partition: Number of partitions used when creating the topic.
        ttl_ms: Value written to the ``retention.ms`` topic config on creation.

    Returns:
        The partition count of the existing topic, or ``partition`` when the
        topic was created by this call.

    Raises:
        Exception: Any error raised while listing or creating topics is
            logged and then re-raised.
    """
    from confluent_kafka.admin import NewTopic

    # Step 1: reuse the topic when the cluster metadata already lists it.
    try:
        known_topics = admin_client.list_topics().topics
        if topic_name in known_topics:
            num_partitions = len(known_topics[topic_name].partitions)
            logger.info(
                f"Topic {topic_name} already exists with {num_partitions} partitions"
            )
            return num_partitions
    except Exception as e:
        logger.error(f"Failed to list topics: {e}")
        raise e

    # Step 2: the topic is missing, so create it with the requested settings.
    topic_spec = NewTopic(
        topic_name,
        num_partitions=partition,
        replication_factor=replication_factor,
        config={"retention.ms": str(ttl_ms)},
    )
    try:
        pending_creations = admin_client.create_topics([topic_spec])
        # Wait on every returned future so creation failures surface here.
        for creation in pending_creations.values():
            creation.result()  # result is None
        logger.info(f"Topic {topic_name} created")
    except Exception as e:
        logger.error(f"Failed to create topic {topic_name}: {e}")
        raise e

    return partition




class KafkaChatMessageHistory(BaseChatMessageHistory):
    """Chat message history stored in Kafka.

    Each chat session is stored in its own Kafka topic: the ``session_id`` is
    used as the topic name and as the consumer group ID when reading.

    Setup:
        Install ``confluent-kafka-python``.

        .. code-block:: bash

            pip install confluent_kafka

    Instantiate:
        .. code-block:: python

            from langchain_community.chat_message_histories import KafkaChatMessageHistory

            history = KafkaChatMessageHistory(
                session_id="your_session_id",
                bootstrap_servers="host:port",
            )

    Add and retrieve messages:
        .. code-block:: python

            # Add messages
            history.add_messages([message1, message2, message3, ...])

            # Retrieve messages
            message_batch_0 = history.messages

            # retrieve messages after message_batch_0
            message_batch_1 = history.messages

            # Reset to beginning and retrieve messages
            messages_from_beginning = history.messages_from_beginning()

    Retrieving messages is stateful. Internally, a Kafka consumer is used to
    read, and the consumed offset is maintained persistently, so a read
    continues after the messages returned by earlier reads unless the position
    is explicitly reset.

    To retrieve messages, you can use the following methods:

    - `messages`:
        continue consuming chat messages after the last consumed one. It calls
        `messages_from_last_consumed` with its default limits.
    - `messages_from_beginning`:
        reset the consumer to the beginning of the chat history and return messages.
        Optional parameters:
        1. `max_message_count`: maximum number of messages to return (default 5).
        2. `max_time_sec`: maximum time in seconds to wait for messages (default 5.0).
    - `messages_from_latest`:
        reset to the end of the chat history and try consuming messages.
        Optional parameters same as above.
    - `messages_from_last_consumed`:
        continue from the last consumed message, similar to `messages`.
        Optional parameters same as above.

    `max_message_count` and `max_time_sec` are used to avoid blocking
    indefinitely when retrieving messages. As a result, a single call may not
    return all messages. Increase `max_message_count` and `max_time_sec` to
    retrieve all history messages.
    """  # noqa: E501


    def __init__(
        self,
        session_id: str,
        bootstrap_servers: str,
        ttl_ms: int = DEFAULT_TTL_MS,
        replication_factor: int = DEFAULT_REPLICATION_FACTOR,
        partition: int = DEFAULT_PARTITION,
    ) -> None:
        """Connect to Kafka and make sure the topic for this session exists.

        Args:
            session_id: The ID for single chat session. It is used as Kafka topic name.
            bootstrap_servers:
                Comma-separated host/port pairs to establish connection to Kafka cluster
                https://kafka.apache.org/documentation.html#adminclientconfigs_bootstrap.servers
            ttl_ms:
                Time-to-live (milliseconds) for automatic expiration of entries.
                Default 7 days. -1 for no expiration.
                It translates to https://kafka.apache.org/documentation.html#topicconfigs_retention.ms
            replication_factor: The replication factor for the topic. Default 1.
            partition: The number of partitions for the topic. Default 3.

        ``ttl_ms``, ``replication_factor`` and ``partition`` are only applied when
        the topic is created; an existing topic keeps its configuration.

        Raises:
            ImportError: If the ``confluent_kafka`` package cannot be imported.
            Exception: Errors raised while listing or creating the topic
                propagate from ``ensure_topic_exists``.
        """
        try:
            from confluent_kafka import Producer
            from confluent_kafka.admin import AdminClient
        except ImportError as e:
            # ModuleNotFoundError subclasses ImportError, so one clause covers
            # both. Chain the original error to keep the root cause visible.
            raise ImportError(
                "Could not import confluent_kafka package. "
                "Please install it with `pip install confluent_kafka`."
            ) from e

        self.session_id = session_id
        self.bootstrap_servers = bootstrap_servers
        self.admin_client = AdminClient({BOOTSTRAP_SERVERS_CONFIG: bootstrap_servers})
        # Partition count actually in effect: the existing topic's count, or
        # ``partition`` when the topic had to be created.
        self.num_partitions = ensure_topic_exists(
            self.admin_client, session_id, replication_factor, partition, ttl_ms
        )
        self.producer = Producer({BOOTSTRAP_SERVERS_CONFIG: bootstrap_servers})



    def add_messages(
        self,
        messages: Sequence[BaseMessage],
        flush_timeout_seconds: float = 5.0,
    ) -> None:
        """Add messages to the chat history by producing to the Kafka topic.

        Every message is produced with the session ID as its key. Kafka only
        guarantees ordering within a single partition, and keyed messages with
        the same key are routed to the same partition by the default
        partitioner, so keying keeps the chat history in the order it was added
        even when the topic has several partitions.

        Args:
            messages: Messages to append, in the order they should be stored.
            flush_timeout_seconds: Maximum time in seconds to wait for the
                producer to flush. Messages still pending after the timeout
                are only reported with a warning.

        Raises:
            Exception: Any error raised while serializing or producing is
                logged and then re-raised.
        """
        try:
            for message in messages:
                self.producer.produce(
                    topic=self.session_id,
                    # Same key for the whole session => same partition => the
                    # order of the conversation is preserved on read.
                    key=self.session_id,
                    value=json.dumps(message_to_dict(message)),
                )
            # flush() returns the number of messages still waiting for delivery.
            message_remaining = self.producer.flush(flush_timeout_seconds)
            if message_remaining > 0:
                logger.warning(f"{message_remaining} messages are still in-flight.")
        except Exception as e:
            logger.error(f"Failed to add messages to Kafka: {e}")
            raise


    def __read_messages(
        self,
        consume_start_pos: ConsumeStartPosition,
        max_message_count: Optional[int],
        max_time_sec: Optional[float],
    ) -> List[BaseMessage]:
        """Retrieve messages from the Kafka topic for the session.

        A new consumer is created for every call. It joins the consumer group
        named after the session ID, subscribes to the session topic, polls
        until one of the limits is reached and is always closed afterwards.
        Reading is therefore stateful: where a read starts depends on
        ``consume_start_pos`` and on what the group consumed before.

        Args:
            consume_start_pos: Start position for the Kafka consumer.
                ``EARLIEST`` and ``LATEST`` reset every assigned partition to
                its beginning or end; ``LAST_CONSUMED`` leaves the position to
                the consumer group.
            max_message_count: Maximum number of messages to consume, or
                ``None`` for no count limit.
            max_time_sec: Time limit in seconds to consume messages, or
                ``None`` for no time limit.

        If both limits are ``None`` the loop only ends when an exception is
        raised.

        Returns:
            List of messages.

        Raises:
            Exception: Any error raised while subscribing, polling or decoding
                is logged and then re-raised.
        """
        from confluent_kafka import OFFSET_BEGINNING, OFFSET_END, Consumer

        consumer_config = {
            BOOTSTRAP_SERVERS_CONFIG: self.bootstrap_servers,
            "group.id": self.session_id,
            "auto.offset.reset": "latest"
            if consume_start_pos == ConsumeStartPosition.LATEST
            else "earliest",
        }

        # Logical offset every assigned partition is reset to, or None when the
        # group's own position should be used (LAST_CONSUMED).
        reset_offset: Optional[int]
        if consume_start_pos == ConsumeStartPosition.EARLIEST:
            reset_offset = OFFSET_BEGINNING
        elif consume_start_pos == ConsumeStartPosition.LATEST:
            reset_offset = OFFSET_END
        else:
            reset_offset = None

        def assign_at_reset_offset(
            assigned_consumer: Consumer, assigned_partitions: list[TopicPartition]
        ) -> None:
            """Override the offset of each assigned partition, then assign."""
            for p in assigned_partitions:
                p.offset = reset_offset
            assigned_consumer.assign(assigned_partitions)

        # Longest single poll; keeps the loop responsive to the limits.
        max_poll_sec = 1.0

        messages: List[dict] = []
        consumer = Consumer(consumer_config)
        try:
            if reset_offset is None:
                consumer.subscribe([self.session_id])
            else:
                consumer.subscribe([self.session_id], on_assign=assign_at_reset_offset)

            # Monotonic clock: elapsed time must not be affected by wall-clock
            # adjustments (NTP, manual changes).
            start_time_sec = time.monotonic()
            while max_message_count is None or len(messages) < max_message_count:
                poll_timeout_sec = max_poll_sec
                if max_time_sec is not None:
                    remaining_sec = max_time_sec - (time.monotonic() - start_time_sec)
                    if remaining_sec < 0:
                        break
                    # Never block past the caller's time budget.
                    poll_timeout_sec = min(max_poll_sec, remaining_sec)

                message = consumer.poll(timeout=poll_timeout_sec)
                if message is None:  # poll timeout
                    continue
                if message.error() is not None:  # error
                    logger.error(f"Consumer error: {message.error()}")
                    continue
                if message.value() is None:  # empty value
                    logger.warning("Empty message value")
                    continue
                messages.append(json.loads(message.value()))
        except Exception as e:
            logger.error(f"Failed to consume messages from Kafka: {e}")
            raise
        finally:
            # Always release the consumer, even when reading failed.
            consumer.close()

        return messages_from_dict(messages)


    def messages_from_beginning(
        self, max_message_count: Optional[int] = 5, max_time_sec: Optional[float] = 5.0
    ) -> List[BaseMessage]:
        """Read the chat history starting from the earliest message.

        The consumer is reset to the beginning of the Kafka topic before
        messages are consumed.

        Args:
            max_message_count: Maximum number of messages to consume.
            max_time_sec: Time limit in seconds to consume messages.

        Returns:
            List of messages.
        """
        start_position = ConsumeStartPosition.EARLIEST
        return self.__read_messages(
            consume_start_pos=start_position,
            max_message_count=max_message_count,
            max_time_sec=max_time_sec,
        )



    def messages_from_latest(
        self, max_message_count: Optional[int] = 5, max_time_sec: Optional[float] = 5.0
    ) -> List[BaseMessage]:
        """Reset to the end offset, then try to consume messages if available.

        Args:
            max_message_count: Maximum number of messages to consume.
            max_time_sec: Time limit in seconds to consume messages.

        Returns:
            List of messages.
        """
        start_position = ConsumeStartPosition.LATEST
        return self.__read_messages(
            consume_start_pos=start_position,
            max_message_count=max_message_count,
            max_time_sec=max_time_sec,
        )



    def messages_from_last_consumed(
        self, max_message_count: Optional[int] = 5, max_time_sec: Optional[float] = 5.0
    ) -> List[BaseMessage]:
        """Read the chat history continuing from the last consumed message.

        Please note this method is stateful. Internally, it uses a Kafka
        consumer to consume messages, and maintains the commit offset.

        Args:
            max_message_count: Maximum number of messages to consume.
            max_time_sec: Time limit in seconds to consume messages.

        Returns:
            List of messages.
        """
        start_position = ConsumeStartPosition.LAST_CONSUMED
        return self.__read_messages(
            consume_start_pos=start_position,
            max_message_count=max_message_count,
            max_time_sec=max_time_sec,
        )


    @property
    def messages(self) -> List[BaseMessage]:  # type: ignore[override]
        """Messages of the session that follow the last consumed message.

        Reading is stateful: the consumed (committed) offset is maintained per
        consumer group, so successive reads continue where the previous one
        stopped. This is the same as calling messages_from_last_consumed()
        with its default parameters.

        Alternatives:
            - messages_from_last_consumed() to pass explicit limits.
            - messages_from_beginning() to read from the earliest message.
            - messages_from_latest() to read from the latest message.
        """
        unread_messages = self.messages_from_last_consumed()
        return unread_messages


    def clear(self) -> None:
        """Clear the chat history by deleting the session's Kafka topic.

        Raises:
            Exception: Any error raised while deleting the topic is logged
                and then re-raised.
        """
        try:
            pending_deletions = self.admin_client.delete_topics([self.session_id])
            # Wait on every returned future so deletion failures surface here.
            for deletion in pending_deletions.values():
                deletion.result()  # result is None
            logger.info(f"Topic {self.session_id} deleted")
        except Exception as e:
            logger.error(f"Failed to delete topic {self.session_id}: {e}")
            raise e



    def close(self) -> None:
        """Release the resources.

        This is currently a no-op: nothing needs to be released at this moment.
        """
        return None