Source code for langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor

from typing import Callable, List, Set

from langchain_core._api import beta
from langchain_core.documents import Document
from langchain_core.graph_vectorstores.links import Link

from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
    LinkExtractorAdapter,
)

# Input accepted by `HierarchyLinkExtractor.extract_one`: the path to a node,
# one string per level (e.g. `["Root", "H1", "a"]`).
# `typing.TypeAlias` is not available in Python 3.9, and the `type` statement
# is newer still, so the alias is declared as a plain assignment.
HierarchyInput = List[str]

# Prefixes prepended to a "/"-joined path to form link tags. Using a distinct
# prefix per relationship keeps parent, child, and sibling tags from colliding
# even though they share the same link `kind`.
_PARENT: str = "p:"
_CHILD: str = "c:"
_SIBLING: str = "s:"


@beta()
class HierarchyLinkExtractor(LinkExtractor[HierarchyInput]):
    def __init__(
        self,
        *,
        kind: str = "hierarchy",
        parent_links: bool = True,
        child_links: bool = False,
        sibling_links: bool = False,
    ) -> None:
        """Extract links from a document hierarchy.

        Example:

            .. code-block:: python

                # Given three paths (in this case, within the "Root" document):
                h1 = ["Root", "H1"]
                h1a = ["Root", "H1", "a"]
                h1b = ["Root", "H1", "b"]

                # Parent links `h1a` and `h1b` to `h1`.
                # Child links `h1` to `h1a` and `h1b`.
                # Sibling links `h1a` and `h1b` together (both directions).

        Example use with documents:

            .. code-block:: python

                transformer = LinkExtractorTransformer([
                    HierarchyLinkExtractor().as_document_extractor(
                        # Assumes the "path" to each document is in the metadata.
                        # Could split strings, etc.
                        lambda doc: doc.metadata.get("path", [])
                    )
                ])
                linked = transformer.transform_documents(docs)

        Args:
            kind: Kind of links to produce with this extractor.
            parent_links: Link from a section to its parent.
            child_links: Link from a section to its children.
            sibling_links: Link from a section to other sections with the same
                parent.
        """
        self._kind = kind
        self._parent_links = parent_links
        self._child_links = child_links
        self._sibling_links = sibling_links

    def as_document_extractor(
        self, hierarchy: Callable[[Document], HierarchyInput]
    ) -> LinkExtractor[Document]:
        """Create a LinkExtractor from `Document`.

        Args:
            hierarchy: Function that returns the path for the given document.

        Returns:
            A `LinkExtractor[Document]` suitable for application to `Documents`
            directly or with `LinkExtractorTransformer`.
        """
        return LinkExtractorAdapter(underlying=self, transform=hierarchy)

    def extract_one(
        self,
        input: HierarchyInput,
    ) -> Set[Link]:
        """Build the links for a single hierarchy path.

        Each tag is a relationship prefix followed by the path elements joined
        with "/". Which links are produced depends on the `parent_links`,
        `child_links`, and `sibling_links` flags given to the constructor.

        Args:
            input: Path elements of the node, ordered from the root down to the
                node itself.

        Returns:
            The set of links for the node. It is empty when no link type is
            enabled.
        """
        # NOTE: elements are joined with "/", so paths whose elements
        # themselves contain "/" can produce the same tag as a deeper path.
        this_path = "/".join(input)

        links: Set[Link] = set()
        if self._parent_links:
            # This is linked from everything with this parent path.
            links.add(Link.incoming(kind=self._kind, tag=_PARENT + this_path))
        if self._child_links:
            # This is linked to every child with this as its "parent" path.
            links.add(Link.outgoing(kind=self._kind, tag=_CHILD + this_path))

        if len(input) >= 1:
            # For a single-element path this is the empty string.
            parent_path = "/".join(input[0:-1])
            # Parent/child links towards the parent only make sense when the
            # path actually has a parent element.
            has_parent = len(input) > 1

            if self._parent_links and has_parent:
                # This is linked to the nodes with the given parent path.
                links.add(Link.outgoing(kind=self._kind, tag=_PARENT + parent_path))
            if self._child_links and has_parent:
                # This is linked from every node with the given parent path.
                links.add(Link.incoming(kind=self._kind, tag=_CHILD + parent_path))
            if self._sibling_links:
                # This is a sibling of everything with the same parent. This
                # also applies to single-element paths, which all share the
                # empty parent path.
                links.add(Link.bidir(kind=self._kind, tag=_SIBLING + parent_path))

        return links