Source code for langchain_community.graph_vectorstores.extractors.gliner_link_extractor

from typing import Any, Dict, Iterable, List, Optional, Set, Union

from langchain_core._api import beta
from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link

from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
)

# Input accepted by the extractor below: either raw text (``str``) or a
# ``Document``, in which case its ``page_content`` is what gets analysed.
# TypeAlias is not available in Python 3.9, we can't use that or the newer `type`.
GLiNERInput = Union[str, Document]


@beta()
class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
    """Link documents with common named entities using `GLiNER`_.

    `GLiNER`_ is a Named Entity Recognition (NER) model capable of identifying any
    entity type using a bidirectional transformer encoder (BERT-like).

    The ``GLiNERLinkExtractor`` uses GLiNER to create links between documents that
    have named entities in common.

    Example::

        extractor = GLiNERLinkExtractor(
            labels=["Person", "Award", "Date", "Competitions", "Teams"]
        )
        results = extractor.extract_one("some long text...")

    .. _GLiNER: https://github.com/urchade/GLiNER

    .. seealso::

        - :mod:`How to use a graph vector store <langchain_community.graph_vectorstores>`
        - :class:`How to create links between documents <langchain_core.graph_vectorstores.links.Link>`

    How to link Documents on common named entities
    ==============================================

    Preliminaries
    -------------

    Install the ``gliner`` package:

    .. code-block:: bash

        pip install -q langchain_community gliner

    Usage
    -----

    We load the ``state_of_the_union.txt`` file, chunk it, then for each chunk we
    extract named entity links and add them to the chunk.

    Using extract_one()
    ^^^^^^^^^^^^^^^^^^^

    We can use :meth:`extract_one` on a document to get the links and add the links
    to the document metadata with
    :func:`~langchain_core.graph_vectorstores.links.add_links`::

        from langchain_community.document_loaders import TextLoader
        from langchain_community.graph_vectorstores.extractors import GLiNERLinkExtractor
        from langchain_core.graph_vectorstores.links import add_links
        from langchain_text_splitters import CharacterTextSplitter

        loader = TextLoader("state_of_the_union.txt")
        raw_documents = loader.load()

        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        documents = text_splitter.split_documents(raw_documents)

        ner_extractor = GLiNERLinkExtractor(["Person", "Topic"])
        for document in documents:
            links = ner_extractor.extract_one(document)
            add_links(document, links)

        print(documents[0].metadata)

    .. code-block:: output

        {'source': 'state_of_the_union.txt', 'links': [Link(kind='entity:Person', direction='bidir', tag='President Zelenskyy'), Link(kind='entity:Person', direction='bidir', tag='Vladimir Putin')]}

    Using LinkExtractorTransformer
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

    Using the
    :class:`~langchain_community.graph_vectorstores.extractors.LinkExtractorTransformer`,
    we can simplify the link extraction::

        from langchain_community.document_loaders import TextLoader
        from langchain_community.graph_vectorstores.extractors import (
            GLiNERLinkExtractor,
            LinkExtractorTransformer,
        )
        from langchain_text_splitters import CharacterTextSplitter

        loader = TextLoader("state_of_the_union.txt")
        raw_documents = loader.load()

        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        documents = text_splitter.split_documents(raw_documents)

        ner_extractor = GLiNERLinkExtractor(["Person", "Topic"])
        transformer = LinkExtractorTransformer([ner_extractor])
        documents = transformer.transform_documents(documents)

        print(documents[0].metadata)

    .. code-block:: output

        {'source': 'state_of_the_union.txt', 'links': [Link(kind='entity:Person', direction='bidir', tag='President Zelenskyy'), Link(kind='entity:Person', direction='bidir', tag='Vladimir Putin')]}

    The documents with named entity links can then be added to a
    :class:`~langchain_core.graph_vectorstores.base.GraphVectorStore`::

        from langchain_community.graph_vectorstores import CassandraGraphVectorStore

        store = CassandraGraphVectorStore.from_documents(documents=documents, embedding=...)

    Args:
        labels: List of kinds of entities to extract.
        kind: Kind of links to produce with this extractor.
        model: GLiNER model to use.
        extract_kwargs: Keyword arguments to pass to GLiNER.
    """  # noqa: E501

    def __init__(
        self,
        labels: List[str],
        *,
        kind: str = "entity",
        model: str = "urchade/gliner_mediumv2.1",
        extract_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Load the GLiNER model and store the extraction settings.

        Args:
            labels: Entity labels handed to the model on every prediction.
            kind: Prefix of the ``kind`` of each produced link; the entity
                label is appended after a colon (e.g. ``"entity:Person"``).
            model: Model identifier passed to ``GLiNER.from_pretrained``.
            extract_kwargs: Extra keyword arguments forwarded to the model's
                ``batch_predict_entities``. ``None`` means no extra arguments.

        Raises:
            ImportError: If the ``gliner`` package cannot be imported.
        """
        # Guard only the import itself. Loading the model stays outside the
        # ``try`` so that an ImportError raised while loading is not
        # misreported as "gliner is not installed" with its cause suppressed.
        try:
            from gliner import GLiNER
        except ImportError:
            raise ImportError(
                "gliner is required for GLiNERLinkExtractor. "
                "Please install it with `pip install gliner`."
            ) from None

        self._model = GLiNER.from_pretrained(model)
        self._labels = labels
        self._kind = kind
        self._extract_kwargs = extract_kwargs or {}

    def extract_one(self, input: GLiNERInput) -> Set[Link]:  # noqa: A002
        """Extract the named-entity links of a single input.

        Args:
            input: Text, or a ``Document`` whose ``page_content`` is analysed.

        Returns:
            The first set of links produced by :meth:`extract_many` for a
            one-element batch containing ``input``.
        """
        return next(iter(self.extract_many([input])))

    def extract_many(
        self,
        inputs: Iterable[GLiNERInput],
    ) -> Iterable[Set[Link]]:
        """Extract named-entity links for a batch of inputs.

        This is a generator: the inputs are consumed and the model is invoked
        only once iteration starts.

        Args:
            inputs: Texts and/or ``Document`` objects; for documents the
                ``page_content`` is analysed.

        Yields:
            One set of bidirectional links per result returned by the model's
            ``batch_predict_entities``. Each link has kind
            ``"<kind>:<entity label>"`` and the entity text as its tag.
        """
        texts = [
            item if isinstance(item, str) else item.page_content for item in inputs
        ]
        if not texts:
            # Nothing to analyse: skip the model call instead of handing it
            # an empty batch.
            return

        predictions = self._model.batch_predict_entities(
            texts, self._labels, **self._extract_kwargs
        )
        for entities in predictions:
            # A set collapses repeated mentions of the same entity in a text.
            yield {
                Link.bidir(
                    kind=f"{self._kind}:{entity['label']}", tag=entity["text"]
                )
                for entity in entities
            }