# Source code for langchain_community.graph_vectorstores.extractors.gliner_link_extractor

from typing import Any, Dict, Iterable, List, Optional, Set, Union

from langchain_core._api import beta
from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link

from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
)

# Input accepted by the GLiNER extractor: either a raw string or a ``Document``.
# ``typing.TypeAlias`` is not available in Python 3.9, so neither it nor the newer
# ``type`` statement can be used here; a plain assignment serves as the alias.
GLiNERInput = Union[str, Document]


@beta()
class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
    """Link documents with common named entities using `GLiNER`_.

    `GLiNER`_ is a Named Entity Recognition (NER) model capable of identifying any
    entity type using a bidirectional transformer encoder (BERT-like).

    The ``GLiNERLinkExtractor`` uses GLiNER to create links between documents that
    have named entities in common.

    Example::

        extractor = GLiNERLinkExtractor(
            labels=["Person", "Award", "Date", "Competitions", "Teams"]
        )
        results = extractor.extract_one("some long text...")

    .. _GLiNER: https://github.com/urchade/GLiNER

    .. seealso::

            - :mod:`How to use a graph vector store <langchain_community.graph_vectorstores>`
            - :class:`How to create links between documents <langchain_core.graph_vectorstores.links.Link>`

    How to link Documents on common named entities
    ==============================================

    Preliminaries
    -------------

    Install the ``gliner`` package:

    .. code-block:: bash

        pip install -q langchain_community gliner

    Usage
    -----

    We load the ``state_of_the_union.txt`` file, chunk it, then for each chunk we
    extract named entity links and add them to the chunk.

    Using extract_one()
    ^^^^^^^^^^^^^^^^^^^

    We can use :meth:`extract_one` on a document to get the links and add the links
    to the document metadata with
    :meth:`~langchain_core.graph_vectorstores.links.add_links`::

        from langchain_community.document_loaders import TextLoader
        from langchain_community.graph_vectorstores import CassandraGraphVectorStore
        from langchain_community.graph_vectorstores.extractors import GLiNERLinkExtractor
        from langchain_core.graph_vectorstores.links import add_links
        from langchain_text_splitters import CharacterTextSplitter

        loader = TextLoader("state_of_the_union.txt")
        raw_documents = loader.load()

        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        documents = text_splitter.split_documents(raw_documents)

        ner_extractor = GLiNERLinkExtractor(["Person", "Topic"])
        for document in documents:
            links = ner_extractor.extract_one(document)
            add_links(document, links)

        print(documents[0].metadata)

    .. code-block:: output

        {'source': 'state_of_the_union.txt', 'links': [Link(kind='entity:Person', direction='bidir', tag='President Zelenskyy'), Link(kind='entity:Person', direction='bidir', tag='Vladimir Putin')]}

    Using LinkExtractorTransformer
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    Using the :class:`~langchain_community.graph_vectorstores.extractors.link_extractor_transformer.LinkExtractorTransformer`,
    we can simplify the link extraction::

        from langchain_community.document_loaders import TextLoader
        from langchain_community.graph_vectorstores.extractors import (
            GLiNERLinkExtractor,
            LinkExtractorTransformer,
        )
        from langchain_text_splitters import CharacterTextSplitter

        loader = TextLoader("state_of_the_union.txt")
        raw_documents = loader.load()

        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        documents = text_splitter.split_documents(raw_documents)

        ner_extractor = GLiNERLinkExtractor(["Person", "Topic"])
        transformer = LinkExtractorTransformer([ner_extractor])
        documents = transformer.transform_documents(documents)

        print(documents[0].metadata)

    .. code-block:: output

        {'source': 'state_of_the_union.txt', 'links': [Link(kind='entity:Person', direction='bidir', tag='President Zelenskyy'), Link(kind='entity:Person', direction='bidir', tag='Vladimir Putin')]}

    The documents with named entity links can then be added to a :class:`~langchain_core.graph_vectorstores.base.GraphVectorStore`::

        from langchain_community.graph_vectorstores import CassandraGraphVectorStore

        store = CassandraGraphVectorStore.from_documents(documents=documents, embedding=...)

    Args:
        labels: List of kinds of entities to extract.
        kind: Kind of links to produce with this extractor. Defaults to
            ``"entity"``; each link's kind is ``"<kind>:<entity label>"``.
        model: GLiNER model to use. Defaults to ``"urchade/gliner_mediumv2.1"``.
        extract_kwargs: Keyword arguments to pass to GLiNER's
            ``batch_predict_entities`` on every extraction call.

    Raises:
        ImportError: If the ``gliner`` package cannot be imported.
    """  # noqa: E501

    def __init__(
        self,
        labels: List[str],
        *,
        kind: str = "entity",
        model: str = "urchade/gliner_mediumv2.1",
        extract_kwargs: Optional[Dict[str, Any]] = None,
    ):
        # Keep the ``try`` limited to the import itself: an ImportError raised
        # while loading the model must not be misreported as "gliner is not
        # installed" (and must keep its original traceback).
        try:
            from gliner import GLiNER
        except ImportError:
            raise ImportError(
                "gliner is required for GLiNERLinkExtractor. "
                "Please install it with `pip install gliner`."
            ) from None

        self._model = GLiNER.from_pretrained(model)
        self._labels = labels
        self._kind = kind
        self._extract_kwargs = extract_kwargs or {}

    def extract_one(self, input: GLiNERInput) -> Set[Link]:  # noqa: A002
        """Extract the named-entity links for a single input.

        Args:
            input: The text, or the ``Document`` whose ``page_content`` is used,
                to extract entities from.

        Returns:
            The set of links produced for ``input``.
        """
        # Delegate to the batch path with a one-element batch and take its
        # single result.
        return next(iter(self.extract_many([input])))

    def extract_many(
        self,
        inputs: Iterable[GLiNERInput],
    ) -> Iterable[Set[Link]]:
        """Extract named-entity links for a batch of inputs.

        Args:
            inputs: Strings and/or ``Document`` objects; for a ``Document`` the
                ``page_content`` is what gets passed to the model.

        Yields:
            One set of bidirectional links for each entity list returned by the
            model's ``batch_predict_entities`` (expected to correspond to the
            inputs, in order). Each link has kind ``"<kind>:<entity label>"``
            and the entity text as its tag.
        """
        texts = [i if isinstance(i, str) else i.page_content for i in inputs]
        if not texts:
            # Nothing to predict; avoid invoking the model on an empty batch.
            return

        for entities in self._model.batch_predict_entities(
            texts, self._labels, **self._extract_kwargs
        ):
            # A set de-duplicates entities that occur several times in one text.
            yield {
                Link.bidir(kind=f"{self._kind}:{e['label']}", tag=e["text"])
                for e in entities
            }