Source code for langchain_community.graph_vectorstores.extractors.html_link_extractor

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING, List, Optional, Set, Union
from urllib.parse import urldefrag, urljoin, urlparse

from langchain_core._api import beta
from langchain_core.documents import Document
from langchain_core.graph_vectorstores import Link

from langchain_community.graph_vectorstores.extractors.link_extractor import (
    LinkExtractor,
)
from langchain_community.graph_vectorstores.extractors.link_extractor_adapter import (
    LinkExtractorAdapter,
)

if TYPE_CHECKING:
    from bs4 import BeautifulSoup
    from bs4.element import Tag


def _parse_url(link: Tag, page_url: str, drop_fragments: bool = True) -> Optional[str]:
    """Return the absolute URL for an ``<a>`` tag, or None if it isn't usable."""
    href = link.get("href")
    if href is None:
        return None
    url = urlparse(href)
    if url.scheme not in ["http", "https", ""]:
        return None

    # Join the HREF with the page_url to convert relative paths to absolute.
    url = str(urljoin(page_url, href))

    # Fragments would be useful if we chunked a page based on section.
    # Then, each chunk would have a different URL based on the fragment.
    # Since we aren't doing that yet, they just "break" links. So, drop
    # the fragment.
    if drop_fragments:
        return urldefrag(url).url
    return url
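
# Illustrative sketch (not part of the original module): how ``_parse_url``
# normalizes hrefs. The markup and URLs below are hypothetical.
#
#     from bs4 import BeautifulSoup
#     tag = BeautifulSoup('<a href="page#sec">x</a>', "html.parser").a
#     _parse_url(tag, page_url="https://example.com/docs/")
#     # -> 'https://example.com/docs/page'   (relative href resolved, fragment dropped)
#     _parse_url(tag, page_url="https://example.com/docs/", drop_fragments=False)
#     # -> 'https://example.com/docs/page#sec'
#     mail = BeautifulSoup('<a href="mailto:a@b.c">x</a>', "html.parser").a
#     _parse_url(mail, page_url="https://example.com/docs/")
#     # -> None   (non-http(s) schemes are skipped)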


def _parse_hrefs(
    soup: BeautifulSoup, url: str, drop_fragments: bool = True
) -> Set[str]:
    """Collect the set of absolute, non-self URLs linked from a parsed page."""
    soup_links: List[Tag] = soup.find_all("a")
    links: Set[str] = set()

    for link in soup_links:
        parse_url = _parse_url(link, page_url=url, drop_fragments=drop_fragments)
        # Remove self links and entries for any 'a' tag that failed to parse
        # (didn't have href, or invalid domain, etc.)
        if parse_url and parse_url != url:
            links.add(parse_url)

    return links
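
# Illustrative sketch (not part of the original module): ``_parse_hrefs``
# deduplicates URLs and drops self-links. The markup and URL are hypothetical.
#
#     from bs4 import BeautifulSoup
#     html = '<a href="/a">1</a><a href="/a#x">2</a><a href="/self">3</a>'
#     _parse_hrefs(BeautifulSoup(html, "html.parser"), url="https://example.com/self")
#     # -> {'https://example.com/a'}   (duplicates collapse; the self-link is gone)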


@dataclass
class HtmlInput:
    """Input to the ``HtmlLinkExtractor``: page content plus its base URL."""

    content: Union[str, BeautifulSoup]
    base_url: str

@beta()
class HtmlLinkExtractor(LinkExtractor[HtmlInput]):
    def __init__(self, *, kind: str = "hyperlink", drop_fragments: bool = True):
        """Extract hyperlinks from HTML content.

        Expects the input to be an HTML string or a ``BeautifulSoup`` object.

        Example::

            extractor = HtmlLinkExtractor()
            results = extractor.extract_one(HtmlInput(html, url))

        .. seealso::

            - :mod:`How to use a graph vector store <langchain_community.graph_vectorstores>`
            - :class:`How to create links between documents <langchain_core.graph_vectorstores.links.Link>`

        How to link Documents on hyperlinks in HTML
        ===========================================

        Preliminaries
        -------------

        Install the ``beautifulsoup4`` package:

        .. code-block:: bash

            pip install -q langchain_community beautifulsoup4

        Usage
        -----

        For this example, we'll scrape 2 HTML pages that have a hyperlink from
        one page to the other using an ``AsyncHtmlLoader``.
        Then we use the ``HtmlLinkExtractor`` to create the links in the documents.

        Using extract_one()
        ^^^^^^^^^^^^^^^^^^^

        We can use :meth:`extract_one` on a document to get the links and add the
        links to the document metadata with
        :meth:`~langchain_core.graph_vectorstores.links.add_links`::

            from langchain_community.document_loaders import AsyncHtmlLoader
            from langchain_community.graph_vectorstores.extractors import (
                HtmlInput,
                HtmlLinkExtractor,
            )
            from langchain_community.graph_vectorstores.links import add_links
            from langchain_core.documents import Document

            loader = AsyncHtmlLoader(
                [
                    "https://python.langchain.com/v0.2/docs/integrations/providers/astradb/",
                    "https://docs.datastax.com/en/astra/home/astra.html",
                ]
            )
            documents = loader.load()

            html_extractor = HtmlLinkExtractor()

            for doc in documents:
                links = html_extractor.extract_one(
                    HtmlInput(doc.page_content, doc.metadata["source"])
                )
                add_links(doc, links)

            documents[0].metadata["links"][:5]

        .. code-block:: output

            [Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/spreedly/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/nvidia/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/ray_serve/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/bageldb/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/introduction/')]

        Using as_document_extractor()
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

        If you use a document loader that returns the raw HTML and sets the
        source key in the document metadata, such as ``AsyncHtmlLoader``, you
        can simplify this by using :meth:`as_document_extractor`, which takes
        a ``Document`` directly as input::

            from langchain_community.document_loaders import AsyncHtmlLoader
            from langchain_community.graph_vectorstores.extractors import HtmlLinkExtractor
            from langchain_core.graph_vectorstores.links import add_links

            loader = AsyncHtmlLoader(
                [
                    "https://python.langchain.com/v0.2/docs/integrations/providers/astradb/",
                    "https://docs.datastax.com/en/astra/home/astra.html",
                ]
            )
            documents = loader.load()
            html_extractor = HtmlLinkExtractor().as_document_extractor()

            for document in documents:
                links = html_extractor.extract_one(document)
                add_links(document, links)

            documents[0].metadata["links"][:5]

        .. code-block:: output

            [Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/spreedly/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/nvidia/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/ray_serve/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/bageldb/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/introduction/')]

        Using LinkExtractorTransformer
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

        Using the
        :class:`~langchain_community.graph_vectorstores.extractors.link_extractor_transformer.LinkExtractorTransformer`,
        we can simplify the link extraction::

            from langchain_community.document_loaders import AsyncHtmlLoader
            from langchain_community.graph_vectorstores.extractors import (
                HtmlLinkExtractor,
                LinkExtractorTransformer,
            )
            from langchain_community.graph_vectorstores.links import add_links

            loader = AsyncHtmlLoader(
                [
                    "https://python.langchain.com/v0.2/docs/integrations/providers/astradb/",
                    "https://docs.datastax.com/en/astra/home/astra.html",
                ]
            )
            documents = loader.load()

            transformer = LinkExtractorTransformer(
                [HtmlLinkExtractor().as_document_extractor()]
            )
            documents = transformer.transform_documents(documents)

            documents[0].metadata["links"][:5]

        .. code-block:: output

            [Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/spreedly/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/nvidia/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/ray_serve/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/integrations/providers/bageldb/'),
            Link(kind='hyperlink', direction='out', tag='https://python.langchain.com/v0.2/docs/introduction/')]

        We can check that there is a link from the first document to the second::

            for doc_to in documents:
                for link_to in doc_to.metadata["links"]:
                    if link_to.direction == "in":
                        for doc_from in documents:
                            for link_from in doc_from.metadata["links"]:
                                if (
                                    link_to.direction == "in"
                                    and link_from.direction == "out"
                                    and link_to.tag == link_from.tag
                                ):
                                    print(
                                        f"Found link from {doc_from.metadata['source']} "
                                        f"to {doc_to.metadata['source']}."
                                    )

        .. code-block:: output

            Found link from https://python.langchain.com/v0.2/docs/integrations/providers/astradb/ to https://docs.datastax.com/en/astra/home/astra.html.

        The documents with URL links can then be added to a
        :class:`~langchain_core.graph_vectorstores.base.GraphVectorStore`::

            from langchain_community.graph_vectorstores import CassandraGraphVectorStore

            store = CassandraGraphVectorStore.from_documents(
                documents=documents, embedding=...
            )

        Args:
            kind: The kind of edge to extract. Defaults to ``hyperlink``.
            drop_fragments: Whether fragments in URLs and links should be dropped.
                Defaults to ``True``.
        """  # noqa: E501
        try:
            import bs4  # noqa:F401
        except ImportError as e:
            raise ImportError(
                "BeautifulSoup4 is required for HtmlLinkExtractor. "
                "Please install it with `pip install beautifulsoup4`."
            ) from e

        self._kind = kind
        self.drop_fragments = drop_fragments

    def as_document_extractor(
        self, url_metadata_key: str = "source"
    ) -> LinkExtractor[Document]:
        """Return a LinkExtractor that applies to documents.

        Note: Since the HtmlLinkExtractor parses HTML, if you use it together
        with other similar link extractors it may be more efficient to call
        the link extractors directly on the parsed ``BeautifulSoup`` object.

        Args:
            url_metadata_key: The name of the field in the document metadata
                that holds the URL of the document.
        """
        return LinkExtractorAdapter(
            underlying=self,
            transform=lambda doc: HtmlInput(
                doc.page_content, doc.metadata[url_metadata_key]
            ),
        )

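    # Illustrative sketch (not part of the original module): adapting the
    # extractor to documents whose URL sits under a custom metadata key
    # ("url" here is a hypothetical key name).
    #
    #     doc_extractor = HtmlLinkExtractor().as_document_extractor(
    #         url_metadata_key="url"
    #     )
    #     links = doc_extractor.extract_one(
    #         Document(
    #             page_content='<a href="/x">x</a>',
    #             metadata={"url": "https://example.com/"},
    #         )
    #     )
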
    def extract_one(
        self,
        input: HtmlInput,  # noqa: A002
    ) -> Set[Link]:
        content = input.content
        if isinstance(content, str):
            from bs4 import BeautifulSoup

            content = BeautifulSoup(content, "html.parser")

        base_url = input.base_url
        if self.drop_fragments:
            base_url = urldefrag(base_url).url

        hrefs = _parse_hrefs(content, base_url, self.drop_fragments)

        links = {Link.outgoing(kind=self._kind, tag=url) for url in hrefs}
        links.add(Link.incoming(kind=self._kind, tag=base_url))
        return links
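
# Illustrative sketch (not part of the original module): end-to-end extraction
# from a raw HTML string. The page URL and markup are hypothetical.
#
#     extractor = HtmlLinkExtractor(drop_fragments=True)
#     links = extractor.extract_one(
#         HtmlInput('<a href="/next">next</a>', "https://example.com/start")
#     )
#     # links == {
#     #     Link.outgoing(kind="hyperlink", tag="https://example.com/next"),
#     #     Link.incoming(kind="hyperlink", tag="https://example.com/start"),
#     # }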