Source code for langchain_community.graph_vectorstores.extractors.html_link_extractor

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING, List, Optional, Set, Union
from urllib.parse import urldefrag, urljoin, urlparse

from langchain_core._api import beta
from langchain_core.documents import Document
from langchain_core.graph_vectorstores import Link

from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
    LinkExtractorAdapter,
)

if TYPE_CHECKING:
    from bs4 import BeautifulSoup
    from bs4.element import Tag


def _parse_url(link: Tag, page_url: str, drop_fragments: bool = True) -> Optional[str]:
    """Return the absolute URL for an ``<a>`` tag, or None if it isn't usable."""
    href = link.get("href")
    if href is None:
        return None
    url = urlparse(href)
    if url.scheme not in ["http", "https", ""]:
        return None

    # Join the HREF with the page_url to convert relative paths to absolute.
    url = str(urljoin(page_url, href))

    # Fragments would be useful if we chunked a page based on section.
    # Then, each chunk would have a different URL based on the fragment.
    # Since we aren't doing that yet, they just "break" links. So, drop
    # the fragment.
    if drop_fragments:
        return urldefrag(url).url
    return url
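
# Illustrative sketch (not part of the original module): how ``_parse_url``
# normalizes hrefs. The markup and URLs below are hypothetical.
#
#     from bs4 import BeautifulSoup
#     tag = BeautifulSoup('<a href="page#sec">x</a>', "html.parser").a
#     _parse_url(tag, page_url="https://example.com/docs/")
#     # -> 'https://example.com/docs/page'   (relative href resolved, fragment dropped)
#     _parse_url(tag, page_url="https://example.com/docs/", drop_fragments=False)
#     # -> 'https://example.com/docs/page#sec'
#     mail = BeautifulSoup('<a href="mailto:a@b.c">x</a>', "html.parser").a
#     _parse_url(mail, page_url="https://example.com/docs/")
#     # -> None   (non-http(s) schemes are skipped)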


def _parse_hrefs(
    soup: BeautifulSoup, url: str, drop_fragments: bool = True
) -> Set[str]:
    """Collect the set of absolute, non-self URLs linked from a parsed page."""
    soup_links: List[Tag] = soup.find_all("a")
    links: Set[str] = set()

    for link in soup_links:
        parse_url = _parse_url(link, page_url=url, drop_fragments=drop_fragments)
        # Remove self links and entries for any 'a' tag that failed to parse
        # (didn't have href, or invalid domain, etc.)
        if parse_url and parse_url != url:
            links.add(parse_url)

    return links
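
# Illustrative sketch (not part of the original module): ``_parse_hrefs``
# deduplicates URLs and drops self-links. The markup and URL are hypothetical.
#
#     from bs4 import BeautifulSoup
#     html = '<a href="/a">1</a><a href="/a#x">2</a><a href="/self">3</a>'
#     _parse_hrefs(BeautifulSoup(html, "html.parser"), url="https://example.com/self")
#     # -> {'https://example.com/a'}   (duplicates collapse; the self-link is gone)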


@dataclass
class HtmlInput:
    """Input to the ``HtmlLinkExtractor``: page content plus its base URL."""

    content: Union[str, BeautifulSoup]
    base_url: str

@beta()
class HtmlLinkExtractor(LinkExtractor[HtmlInput]):
    def __init__(self, *, kind: str = "hyperlink", drop_fragments: bool = True):
        """Extract hyperlinks from HTML content.

        Expects the input to be an HTML string or a ``BeautifulSoup`` object.

        Example::

            extractor = HtmlLinkExtractor()
            results = extractor.extract_one(HtmlInput(html, url))

        .. seealso::

            - :mod:`How to use a graph vector store <langchain_community.graph_vectorstores>`
            - :class:`How to create links between documents <langchain_core.graph_vectorstores.links.Link>`

        How to link Documents on hyperlinks in HTML
        ===========================================

        Preliminaries
        -------------

        Install the ``beautifulsoup4`` package:

        .. code-block:: bash

            pip install -q langchain_community beautifulsoup4

        Usage
        -----

        For this example, we'll scrape 2 HTML pages that have a hyperlink from
        one page to the other using an ``AsyncHtmlLoader``.
        Then we use the ``HtmlLinkExtractor`` to create the links in the documents.

        Using extract_one()
        ^^^^^^^^^^^^^^^^^^^

        We can use :meth:`extract_one` on a document to get the links and add the
        links to the document metadata with
        :meth:`~langchain_core.graph_vectorstores.links.add_links`::

            from langchain_community.document_loaders import AsyncHtmlLoader
            from langchain_community.graph_vectorstores.extractors import (
                HtmlInput,
                HtmlLinkExtractor,
            )
            from langchain_community.graph_vectorstores.links import add_links
            from langchain_core.documents import Document

            loader = AsyncHtmlLoader(
                [
                    "https://python.langchain.com/v0.2/docs/integrations/providers/astradb/",
                    "https://docs.datastax.com/en/astra/home/astra.html",
                ]
            )
            documents = loader.load()

            html_extractor = HtmlLinkExtractor()

            for doc in documents:
                links = html_extractor.extract_one(
                    HtmlInput(doc.page_content, doc.metadata["source"])
                )
                add_links(doc, links)

            documents[0].metadata["links"][:5]

        .. code-block:: output

            [Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/spreedly/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/nvidia/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/ray_serve/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/bageldb/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/introduction/')]

        Using as_document_extractor()
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

        If you use a document loader that returns the raw HTML and sets the
        source key in the document metadata, such as ``AsyncHtmlLoader``, you
        can simplify this by using :meth:`as_document_extractor`, which takes
        a ``Document`` directly as input::

            from langchain_community.document_loaders import AsyncHtmlLoader
            from langchain_community.graph_vectorstores.extractors import HtmlLinkExtractor
            from langchain_core.graph_vectorstores.links import add_links

            loader = AsyncHtmlLoader(
                [
                    "https://python.langchain.com/v0.2/docs/integrations/providers/astradb/",
                    "https://docs.datastax.com/en/astra/home/astra.html",
                ]
            )
            documents = loader.load()
            html_extractor = HtmlLinkExtractor().as_document_extractor()

            for document in documents:
                links = html_extractor.extract_one(document)
                add_links(document, links)

            documents[0].metadata["links"][:5]

        .. code-block:: output

            [Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/spreedly/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/nvidia/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/ray_serve/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/bageldb/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/introduction/')]

        Using LinkExtractorTransformer
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

        Using the
        :class:`~langchain_community.graph_vectorstores.extractors.link_extractor_transformer.LinkExtractorTransformer`,
        we can simplify the link extraction::

            from langchain_community.document_loaders import AsyncHtmlLoader
            from langchain_community.graph_vectorstores.extractors import (
                HtmlLinkExtractor,
                LinkExtractorTransformer,
            )
            from langchain_community.graph_vectorstores.links import add_links

            loader = AsyncHtmlLoader(
                [
                    "https://python.langchain.com/v0.2/docs/integrations/providers/astradb/",
                    "https://docs.datastax.com/en/astra/home/astra.html",
                ]
            )
            documents = loader.load()

            transformer = LinkExtractorTransformer(
                [HtmlLinkExtractor().as_document_extractor()]
            )
            documents = transformer.transform_documents(documents)

            documents[0].metadata["links"][:5]

        .. code-block:: output

            [Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/spreedly/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/nvidia/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/ray_serve/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/bageldb/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/introduction/')]

        We can check that there is a link from the first document to the second::

            for doc_to in documents:
                for link_to in doc_to.metadata["links"]:
                    if link_to.direction == "in":
                        for doc_from in documents:
                            for link_from in doc_from.metadata["links"]:
                                if (
                                    link_to.direction == "in"
                                    and link_from.direction == "out"
                                    and link_to.tag == link_from.tag
                                ):
                                    print(
                                        f"Found link from {doc_from.metadata['source']} "
                                        f"to {doc_to.metadata['source']}."
                                    )

        .. code-block:: output

            Found link from https://python.langchain.com/v0.2/docs/integrations/providers/astradb/ to https://docs.datastax.com/en/astra/home/astra.html.

        The documents with URL links can then be added to a
        :class:`~langchain_core.graph_vectorstores.base.GraphVectorStore`::

            from langchain_community.graph_vectorstores import CassandraGraphVectorStore

            store = CassandraGraphVectorStore.from_documents(
                documents=documents, embedding=...
            )

        Args:
            kind: The kind of edge to extract. Defaults to ``hyperlink``.
            drop_fragments: Whether fragments in URLs and links should be dropped.
                Defaults to ``True``.
        """  # noqa: E501
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            raise ImportError(
                "BeautifulSoup4 is required for HtmlLinkExtractor. "
                "Please install it with `pip install beautifulsoup4`."
            ) from e

        self._kind = kind
        self.drop_fragments = drop_fragments

    def as_document_extractor(
        self, url_metadata_key: str = "source"
    ) -> LinkExtractor[Document]:
        """Return a LinkExtractor that applies to documents.

        Note: Since the HtmlLinkExtractor parses HTML, if you use it together
        with other similar link extractors it may be more efficient to call
        the link extractors directly on the parsed ``BeautifulSoup`` object.

        Args:
            url_metadata_key: The name of the field in the document metadata
                that holds the URL of the document.
        """
        return LinkExtractorAdapter(
            underlying=self,
            transform=lambda doc: HtmlInput(
                doc.page_content, doc.metadata[url_metadata_key]
            ),
        )

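    # Illustrative sketch (not part of the original module): adapting the
    # extractor to documents whose URL sits under a custom metadata key
    # ("url" here is a hypothetical key name).
    #
    #     doc_extractor = HtmlLinkExtractor().as_document_extractor(
    #         url_metadata_key="url"
    #     )
    #     links = doc_extractor.extract_one(
    #         Document(
    #             page_content='<a href="/x">x</a>',
    #             metadata={"url": "https://example.com/"},
    #         )
    #     )
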
    def extract_one(
        self,
        input: HtmlInput,  # noqa: A002
    ) -> Set[Link]:
        content = input.content
        if isinstance(content, str):
            from bs4 import BeautifulSoup

            content = BeautifulSoup(content, "html.parser")

        base_url = input.base_url
        if self.drop_fragments:
            base_url = urldefrag(base_url).url

        hrefs = _parse_hrefs(content, base_url, self.drop_fragments)

        links = {Link.outgoing(kind=self._kind, tag=url) for url in hrefs}
        links.add(Link.incoming(kind=self._kind, tag=base_url))
        return links
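
# Illustrative sketch (not part of the original module): end-to-end extraction
# from a raw HTML string. The page URL and markup are hypothetical.
#
#     extractor = HtmlLinkExtractor(drop_fragments=True)
#     links = extractor.extract_one(
#         HtmlInput('<a href="/next">next</a>', "https://example.com/start")
#     )
#     # links == {
#     #     Link.outgoing(kind="hyperlink", tag="https://example.com/next"),
#     #     Link.incoming(kind="hyperlink", tag="https://example.com/start"),
#     # }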