Source code for langchain_community.graph_vectorstores.extractors.link_extractor_transformer
from typing import Any, Sequence
from langchain_core._api import beta
from langchain_core.documents import Document
from langchain_core.documents.transformers import BaseDocumentTransformer
from langchain_core.graph_vectorstores.links import copy_with_links
from langchain_community.graph_vectorstores.extractors.link_extractor import (
LinkExtractor,
)
@beta()
class LinkExtractorTransformer(BaseDocumentTransformer):
    """DocumentTransformer for applying one or more LinkExtractors.

    Example:
        .. code-block:: python

            extract_links = LinkExtractorTransformer([
                HtmlLinkExtractor().as_document_extractor(),
            ])
            extract_links.transform_documents(docs)
    """

    def __init__(self, link_extractors: Sequence[LinkExtractor[Document]]):
        """Create a DocumentTransformer which adds extracted links to each document.

        Args:
            link_extractors: The extractors to run over every document. May be
                empty, in which case documents are passed through
                ``copy_with_links`` without any additional links.
        """
        self.link_extractors = link_extractors

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Run every configured extractor over ``documents`` and attach the links.

        Args:
            documents: The documents to extract links from.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A list with one entry per input document, each produced by
            ``copy_with_links`` from the document and the links every
            extractor reported for it.
        """
        # Implement `transform_documents` directly, so that LinkExtractors which
        # operate better in batch (`extract_many`) get a chance to do so.

        # Run each extractor over all documents.
        links_per_extractor = [e.extract_many(documents) for e in self.link_extractors]

        # With no extractors, `zip(*[])` is empty and the pairing below would
        # silently drop every document. Pass them through with no link groups
        # instead, so the output still lines up one-to-one with the input.
        if not links_per_extractor:
            return [copy_with_links(document) for document in documents]

        # Transpose the list of per-extractor results to pair each document with
        # the tuple of link groups found for it (one group per extractor).
        # NOTE(review): this assumes every `extract_many` yields exactly one
        # entry per input document; `zip` truncates to the shortest otherwise.
        links_per_document = zip(*links_per_extractor)

        return [
            copy_with_links(document, *links)
            for document, links in zip(documents, links_per_document)
        ]