Source code for langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor

from typing import Callable, List, Set

from langchain_core._api import beta
from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link

from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
    LinkExtractorAdapter,
)

# A hierarchy input is the path to a node, expressed as a list of segments.
# TypeAlias is not available in Python 3.9, we can't use that or the newer `type`.
HierarchyInput = List[str]

# Prefixes prepended to a "/"-joined path to build link tags. Using a distinct
# prefix per relationship keeps parent, child and sibling tags from colliding.
_PARENT: str = "p:"
_CHILD: str = "c:"
_SIBLING: str = "s:"


@beta()
class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
    """Link extractor that derives links from a node's path in a hierarchy.

    The input for each node is its path: a list of segments. Segments are
    joined with ``"/"`` and combined with a per-relationship prefix to form
    the tags of the produced links.
    """

    def __init__(
        self,
        *,
        kind: str = "hierarchy",
        parent_links: bool = True,
        child_links: bool = False,
        sibling_links: bool = False,
    ) -> None:
        """Extract links from a document hierarchy.

        Example:

            .. code-block:: python

                # Given three paths (in this case, within the "Root" document):
                h1 = ["Root", "H1"]
                h1a = ["Root", "H1", "a"]
                h1b = ["Root", "H1", "b"]

                # Parent links `h1a` and `h1b` to `h1`.
                # Child links `h1` to `h1a` and `h1b`.
                # Sibling links `h1a` and `h1b` together (both directions).

        Example use with documents:

            .. code-block:: python

                transformer = LinkExtractorTransformer([
                    HierarchyLinkExtractor().as_document_extractor(
                        # Assumes the "path" to each document is in the metadata.
                        # Could split strings, etc.
                        lambda doc: doc.metadata.get("path", [])
                    )
                ])
                linked = transformer.transform_documents(docs)

        Args:
            kind: Kind of links to produce with this extractor.
            parent_links: Link from a section to its parent.
            child_links: Link from a section to its children.
            sibling_links: Link from a section to other sections with the same parent.
        """
        self._kind = kind
        self._parent_links = parent_links
        self._child_links = child_links
        self._sibling_links = sibling_links

    def as_document_extractor(
        self, hierarchy: Callable[[Document], HierarchyInput]
    ) -> LinkExtractor[Document]:
        """Create a LinkExtractor from `Document`.

        Args:
            hierarchy: Function that returns the path for the given document.

        Returns:
            A `LinkExtractor[Document]` suitable for application to `Documents` directly
            or with `LinkExtractorTransformer`.
        """
        return LinkExtractorAdapter(underlying=self, transform=hierarchy)

    def extract_one(
        self,
        input: HierarchyInput,
    ) -> Set[Link]:
        """Build the links for a single node from its path.

        Note that segments are joined with ``"/"``, so a segment that itself
        contains ``"/"`` yields the same tag as several separate segments.

        Args:
            input: Path of the node, as a list of segments. The node's parent
                path is every segment except the last.

        Returns:
            The set of links enabled by the constructor flags. The node always
            gets the links tagged with its own path; links tagged with the
            parent path are only added for paths with more than one segment,
            and the sibling link for any non-empty path.
        """
        kind = self._kind
        this_path = "/".join(input)
        links: Set[Link] = set()

        if self._parent_links:
            # This is linked from everything with this parent path.
            links.add(Link.incoming(kind=kind, tag=_PARENT + this_path))
        if self._child_links:
            # This is linked to every child with this as its "parent" path.
            links.add(Link.outgoing(kind=kind, tag=_CHILD + this_path))

        if not input:
            # An empty path has no parent path, so nothing else can be linked.
            return links

        parent_path = "/".join(input[:-1])
        # A single-segment path is a root: it has siblings (other roots, which
        # share the empty parent path) but no parent node to link to or from.
        has_parent = len(input) > 1

        if has_parent and self._parent_links:
            # This is linked to the nodes with the given parent path.
            links.add(Link.outgoing(kind=kind, tag=_PARENT + parent_path))
        if has_parent and self._child_links:
            # This is linked from every node with the given parent path.
            links.add(Link.incoming(kind=kind, tag=_CHILD + parent_path))
        if self._sibling_links:
            # This is a sibling of everything with the same parent.
            links.add(Link.bidir(kind=kind, tag=_SIBLING + parent_path))

        return links