Source code for langchain_community.graph_vectorstores.extractors.keybert_link_extractor
from typing import Any, Dict, Iterable, Optional, Set, Union
from langchain_core._api import beta
from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link
from langchain_community.graph_vectorstores.extractors.link_extractor import (
LinkExtractor,
)
# Input accepted by the KeyBERT link extractor: either the raw text itself or
# a ``Document``, in which case its ``page_content`` is used as the text.
KeybertInput = Union[str, Document]
@beta()
class KeybertLinkExtractor(LinkExtractor[KeybertInput]):
    def __init__(
        self,
        *,
        kind: str = "kw",
        embedding_model: str = "all-MiniLM-L6-v2",
        extract_keywords_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Extract keywords using `KeyBERT <https://maartengr.github.io/KeyBERT/>`_.

        KeyBERT is a minimal and easy-to-use keyword extraction technique that
        leverages BERT embeddings to create keywords and keyphrases that are most
        similar to a document.

        The KeybertLinkExtractor uses KeyBERT to create links between documents that
        have keywords in common.

        Example::

            extractor = KeybertLinkExtractor()
            results = extractor.extract_one("lorem ipsum...")

        .. seealso::

            - :mod:`How to use a graph vector store <langchain_community.graph_vectorstores>`
            - :class:`How to create links between documents <langchain_core.graph_vectorstores.links.Link>`

        How to link Documents on common keywords using Keybert
        ======================================================

        Preliminaries
        -------------

        Install the keybert package:

        .. code-block:: bash

            pip install -q langchain_community keybert

        Usage
        -----

        We load the ``state_of_the_union.txt`` file, chunk it, then for each chunk we
        extract keyword links and add them to the chunk.

        Using extract_one()
        ^^^^^^^^^^^^^^^^^^^

        We can use :meth:`extract_one` on a document to get the links and add the links
        to the document metadata with
        :meth:`~langchain_core.graph_vectorstores.links.add_links`::

            from langchain_community.document_loaders import TextLoader
            from langchain_community.graph_vectorstores import CassandraGraphVectorStore
            from langchain_community.graph_vectorstores.extractors import KeybertLinkExtractor
            from langchain_core.graph_vectorstores.links import add_links
            from langchain_text_splitters import CharacterTextSplitter

            loader = TextLoader("state_of_the_union.txt")
            raw_documents = loader.load()

            text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
            documents = text_splitter.split_documents(raw_documents)

            keyword_extractor = KeybertLinkExtractor()

            for document in documents:
                links = keyword_extractor.extract_one(document)
                add_links(document, links)

            print(documents[0].metadata)

        .. code-block:: output

            {'source': 'state_of_the_union.txt', 'links': [Link(kind='kw', direction='bidir', tag='ukraine'), Link(kind='kw', direction='bidir', tag='ukrainian'), Link(kind='kw', direction='bidir', tag='putin'), Link(kind='kw', direction='bidir', tag='vladimir'), Link(kind='kw', direction='bidir', tag='russia')]}

        Using LinkExtractorTransformer
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

        Using the :class:`~langchain_community.graph_vectorstores.extractors.keybert_link_extractor.LinkExtractorTransformer`,
        we can simplify the link extraction::

            from langchain_community.document_loaders import TextLoader
            from langchain_community.graph_vectorstores.extractors import (
                KeybertLinkExtractor,
                LinkExtractorTransformer,
            )
            from langchain_text_splitters import CharacterTextSplitter

            loader = TextLoader("state_of_the_union.txt")
            raw_documents = loader.load()

            text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
            documents = text_splitter.split_documents(raw_documents)

            transformer = LinkExtractorTransformer([KeybertLinkExtractor()])
            documents = transformer.transform_documents(documents)

            print(documents[0].metadata)

        .. code-block:: output

            {'source': 'state_of_the_union.txt', 'links': [Link(kind='kw', direction='bidir', tag='ukraine'), Link(kind='kw', direction='bidir', tag='ukrainian'), Link(kind='kw', direction='bidir', tag='putin'), Link(kind='kw', direction='bidir', tag='vladimir'), Link(kind='kw', direction='bidir', tag='russia')]}

        The documents with keyword links can then be added to a :class:`~langchain_core.graph_vectorstores.base.GraphVectorStore`::

            from langchain_community.graph_vectorstores import CassandraGraphVectorStore

            store = CassandraGraphVectorStore.from_documents(documents=documents, embedding=...)

        Args:
            kind: Kind of links to produce with this extractor.
            embedding_model: Name of the embedding model to use with KeyBERT.
            extract_keywords_kwargs: Keyword arguments to pass to KeyBERT's
                ``extract_keywords`` method.

        Raises:
            ImportError: If the ``keybert`` package cannot be imported.
        """  # noqa: E501
        # Guard only the import itself: an ImportError raised while the model
        # is being constructed must surface unchanged instead of being
        # reported as a missing ``keybert`` installation.
        try:
            import keybert
        except ImportError:
            raise ImportError(
                "keybert is required for KeybertLinkExtractor. "
                "Please install it with `pip install keybert`."
            ) from None

        self._kw_model = keybert.KeyBERT(model=embedding_model)
        self._kind = kind
        self._extract_keywords_kwargs = extract_keywords_kwargs or {}

    @staticmethod
    def _text_of(item: KeybertInput) -> str:
        """Return the text to analyse: the string itself or ``page_content``."""
        return item if isinstance(item, str) else item.page_content

    def _links_from(self, keywords: Iterable[Any]) -> Set[Link]:
        """Build one bidirectional link of this extractor's kind per keyword.

        Each entry of ``keywords`` is indexed at position 0 to obtain the tag.
        """
        return {Link.bidir(kind=self._kind, tag=entry[0]) for entry in keywords}

    def extract_one(self, input: KeybertInput) -> Set[Link]:  # noqa: A002
        """Extract the keyword links of a single input.

        Args:
            input: Text, or a document whose ``page_content`` is analysed.

        Returns:
            The set of bidirectional links, one per extracted keyword.
        """
        keywords = self._kw_model.extract_keywords(
            self._text_of(input),
            **self._extract_keywords_kwargs,
        )
        return self._links_from(keywords)

    def extract_many(
        self,
        inputs: Iterable[KeybertInput],
    ) -> Iterable[Set[Link]]:
        """Extract the keyword links of several inputs with one KeyBERT call.

        Args:
            inputs: Texts and/or documents to analyse.

        Yields:
            One set of links per input, in input order.
        """
        inputs = list(inputs)
        if not inputs:
            return

        if len(inputs) == 1:
            # Even though we would pass a list, if it contains one item keybert
            # will flatten the result. It is easier to just call the special
            # case for one item.
            yield self.extract_one(inputs[0])
            return

        texts = [self._text_of(item) for item in inputs]
        extracted = self._kw_model.extract_keywords(
            texts, **self._extract_keywords_kwargs
        )

        if not extracted:
            # NOTE(review): KeyBERT appears to return a bare empty list for the
            # whole batch when it cannot build a vocabulary from the texts
            # (verify against the installed version). Emit one empty set per
            # input so results stay aligned with ``inputs``.
            for _ in texts:
                yield set()
            return

        for keywords in extracted:
            yield self._links_from(keywords)