Source code for langchain_community.graph_vectorstores.extractors.keybert_link_extractor

from typing import Any, Dict, Iterable, Optional, Set, Union

from langchain_core._api import beta
from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link

from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
)

# Input accepted by KeybertLinkExtractor: either raw text, or a Document whose
# ``page_content`` is used as the text to extract keywords from.
KeybertInput = Union[str, Document]


@beta()
class KeybertLinkExtractor(LinkExtractor[KeybertInput]):
    def __init__(
        self,
        *,
        kind: str = "kw",
        embedding_model: str = "all-MiniLM-L6-v2",
        extract_keywords_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Extract keywords using `KeyBERT <https://maartengr.github.io/KeyBERT/>`_.

        KeyBERT is a minimal and easy-to-use keyword extraction technique that
        leverages BERT embeddings to create keywords and keyphrases that are most
        similar to a document.

        The KeybertLinkExtractor uses KeyBERT to create links between documents that
        have keywords in common.

        Example::

            extractor = KeybertLinkExtractor()
            results = extractor.extract_one("lorem ipsum...")

        .. seealso::

            - :mod:`How to use a graph vector store <langchain_community.graph_vectorstores>`
            - :class:`How to create links between documents <langchain_core.graph_vectorstores.links.Link>`

        How to link Documents on common keywords using Keybert
        ======================================================

        Preliminaries
        -------------

        Install the keybert package:

        .. code-block:: bash

            pip install -q langchain_community keybert

        Usage
        -----

        We load the ``state_of_the_union.txt`` file, chunk it, then for each chunk we
        extract keyword links and add them to the chunk.

        Using extract_one()
        ^^^^^^^^^^^^^^^^^^^

        We can use :meth:`extract_one` on a document to get the links and add the links
        to the document metadata with
        :meth:`~langchain_core.graph_vectorstores.links.add_links`::

            from langchain_community.document_loaders import TextLoader
            from langchain_community.graph_vectorstores import CassandraGraphVectorStore
            from langchain_community.graph_vectorstores.extractors import KeybertLinkExtractor
            from langchain_core.graph_vectorstores.links import add_links
            from langchain_text_splitters import CharacterTextSplitter

            loader = TextLoader("state_of_the_union.txt")
            raw_documents = loader.load()

            text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
            documents = text_splitter.split_documents(raw_documents)

            keyword_extractor = KeybertLinkExtractor()

            for document in documents:
                links = keyword_extractor.extract_one(document)
                add_links(document, links)

            print(documents[0].metadata)

        .. code-block:: output

            {'source': 'state_of_the_union.txt', 'links': [Link(kind='kw', direction='bidir', tag='ukraine'), Link(kind='kw', direction='bidir', tag='ukrainian'), Link(kind='kw', direction='bidir', tag='putin'), Link(kind='kw', direction='bidir', tag='vladimir'), Link(kind='kw', direction='bidir', tag='russia')]}

        Using LinkExtractorTransformer
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

        Using the
        :class:`~langchain_community.graph_vectorstores.extractors.LinkExtractorTransformer`,
        we can simplify the link extraction::

            from langchain_community.document_loaders import TextLoader
            from langchain_community.graph_vectorstores.extractors import (
                KeybertLinkExtractor,
                LinkExtractorTransformer,
            )
            from langchain_text_splitters import CharacterTextSplitter

            loader = TextLoader("state_of_the_union.txt")
            raw_documents = loader.load()

            text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
            documents = text_splitter.split_documents(raw_documents)

            transformer = LinkExtractorTransformer([KeybertLinkExtractor()])
            documents = transformer.transform_documents(documents)

            print(documents[0].metadata)

        .. code-block:: output

            {'source': 'state_of_the_union.txt', 'links': [Link(kind='kw', direction='bidir', tag='ukraine'), Link(kind='kw', direction='bidir', tag='ukrainian'), Link(kind='kw', direction='bidir', tag='putin'), Link(kind='kw', direction='bidir', tag='vladimir'), Link(kind='kw', direction='bidir', tag='russia')]}

        The documents with keyword links can then be added to a
        :class:`~langchain_core.graph_vectorstores.base.GraphVectorStore`::

            from langchain_community.graph_vectorstores import CassandraGraphVectorStore

            store = CassandraGraphVectorStore.from_documents(documents=documents, embedding=...)

        Args:
            kind: Kind of links to produce with this extractor.
            embedding_model: Name of the embedding model to use with KeyBERT.
            extract_keywords_kwargs: Keyword arguments to pass to KeyBERT's
                ``extract_keywords`` method.

        Raises:
            ImportError: If the ``keybert`` package is not installed.
        """  # noqa: E501
        # Keep the ``try`` body to the import alone: an ImportError raised while
        # constructing the model (e.g. a missing backend for the chosen embedding
        # model) must surface with its own message instead of being rewritten as
        # "keybert is required".
        try:
            import keybert
        except ImportError:
            raise ImportError(
                "keybert is required for KeybertLinkExtractor. "
                "Please install it with `pip install keybert`."
            ) from None

        self._kw_model = keybert.KeyBERT(model=embedding_model)
        self._kind = kind
        self._extract_keywords_kwargs = extract_keywords_kwargs or {}

    @staticmethod
    def _text(item: KeybertInput) -> str:
        """Return the raw text of ``item`` (the string itself or ``page_content``)."""
        return item if isinstance(item, str) else item.page_content

    def _links(self, keywords: Iterable[Any]) -> Set[Link]:
        """Build one bidirectional link of this extractor's kind per keyword entry.

        The first element of each entry is used as the link tag.
        """
        return {Link.bidir(kind=self._kind, tag=kw[0]) for kw in keywords}

    def extract_one(self, input: KeybertInput) -> Set[Link]:  # noqa: A002
        """Extract keyword links from a single text or document.

        Args:
            input: The text, or the document whose ``page_content`` is used.

        Returns:
            A set of bidirectional links, one per keyword returned by KeyBERT.
        """
        keywords = self._kw_model.extract_keywords(
            self._text(input),
            **self._extract_keywords_kwargs,
        )
        return self._links(keywords)

    def extract_many(
        self,
        inputs: Iterable[KeybertInput],
    ) -> Iterable[Set[Link]]:
        """Extract keyword links from several texts or documents.

        Args:
            inputs: The texts and/or documents to extract keywords from.

        Yields:
            One set of bidirectional links per input, in input order. Nothing is
            yielded when ``inputs`` is empty.
        """
        items = list(inputs)
        if len(items) == 1:
            # Even though we pass a list, if it contains one item, keybert will
            # flatten it. This means it's easier to just call the special case
            # for one item.
            yield self.extract_one(items[0])
        elif len(items) > 1:
            texts = [self._text(item) for item in items]
            extracted = self._kw_model.extract_keywords(
                texts, **self._extract_keywords_kwargs
            )
            for keywords in extracted:
                yield self._links(keywords)