# Source code for langchain_community.document_loaders.firecrawl

import warnings
from typing import Iterator, Literal, Optional

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_core.utils import get_from_env


class FireCrawlLoader(BaseLoader):
    """
    FireCrawlLoader document loader integration

    Setup:
        Install ``firecrawl-py``, ``langchain_community`` and set environment variable ``FIRECRAWL_API_KEY``.

        .. code-block:: bash

            pip install -U firecrawl-py langchain_community
            export FIRECRAWL_API_KEY="your-api-key"

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import FireCrawlLoader

            loader = FireCrawlLoader(
                url = "https://firecrawl.dev",
                mode = "crawl"
                # other params = ...
            )

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            # async variant:
            # docs_lazy = await loader.alazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
             Join the waitlist to turn any web
            {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}

    Async load:
        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
             Join the waitlist to turn any web
            {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}

    """  # noqa: E501

    # Deprecated crawl option -> the key it is rewritten to by
    # ``legacy_crawler_options_adapter``.
    _LEGACY_CRAWL_RENAMES = {
        "includes": "includePaths",
        "excludes": "excludePaths",
        "allowBackwardCrawling": "allowBackwardLinks",
        "allowExternalContentLinks": "allowExternalLinks",
    }

    # Deprecated boolean scrape flag -> entry appended to ``formats``.
    # Insertion order is the order in which formats are appended.
    _LEGACY_FORMAT_FLAGS = {
        "includeHtml": "html",
        "includeRawHtml": "rawHtml",
        "includeExtract": "extract",
        "includeLinks": "links",
        "screenshot": "screenshot",
        "fullPageScreenshot": "screenshot@fullPage",
    }

    # Deprecated tag-filter option -> the key it is rewritten to.
    _LEGACY_TAG_RENAMES = {
        "onlyIncludeTags": "includeTags",
        "removeTags": "excludeTags",
    }

    # ``extractorOptions["mode"]`` values that trigger prompt/schema translation.
    _LLM_EXTRACTION_MODES = (
        "llm-extraction",
        "llm-extraction-from-raw-html",
        "llm-extraction-from-markdown",
    )

    # Prompt used when LLM extraction is requested without a usable prompt.
    _DEFAULT_EXTRACTION_PROMPT = "Extract page information based on the schema."

    def legacy_crawler_options_adapter(self, params: dict) -> dict:
        """Translate deprecated crawl options into their current names.

        The input is never modified; a shallow copy is adapted and returned, so
        the dict handed to the constructor survives repeated loads unchanged.

        Args:
            params: Crawl parameters, possibly containing the deprecated keys
                ``includes``, ``excludes``, ``allowBackwardCrawling``,
                ``allowExternalContentLinks`` or ``pageOptions``.

        Returns:
            A new dict in which every deprecated key has been removed and, when
            its value was truthy, re-added under the replacement key. A dict
            valued ``pageOptions`` becomes ``scrapeOptions`` after passing
            through ``legacy_scrape_options_adapter``.

        Warns:
            DeprecationWarning: If any deprecated key is present.
        """
        adapted = dict(params)
        legacy_keys = (*self._LEGACY_CRAWL_RENAMES, "pageOptions")

        # Detect by presence, not truthiness: a deprecated key holding False or
        # an empty list must still be stripped instead of being forwarded.
        if not any(key in adapted for key in legacy_keys):
            return adapted

        warnings.warn(
            "Deprecated parameters detected. See Firecrawl v1 docs for updates.",
            DeprecationWarning,
        )

        for old_key, new_key in self._LEGACY_CRAWL_RENAMES.items():
            if old_key not in adapted:
                continue
            value = adapted.pop(old_key)
            # Truthiness rather than ``is True``: ``includes`` / ``excludes``
            # carry lists of paths, which an identity check would discard.
            if value:
                adapted[new_key] = value

        if "pageOptions" in adapted:
            page_options = adapted.pop("pageOptions")
            if isinstance(page_options, dict):
                adapted["scrapeOptions"] = self.legacy_scrape_options_adapter(
                    page_options
                )

        return adapted

    def legacy_scrape_options_adapter(self, params: dict) -> dict:
        """Translate deprecated scrape options into their current names.

        The input is never modified; a shallow copy is adapted and returned.

        Args:
            params: Scrape parameters, possibly containing ``extractorOptions``,
                the ``include*`` / screenshot flags, ``onlyIncludeTags`` or
                ``removeTags``.

        Returns:
            A new dict with the deprecated keys removed. Unless the caller
            already supplied ``formats``, the result carries a ``formats`` list
            that starts as ``["markdown"]`` and is adjusted by the deprecated
            flags.

        Warns:
            DeprecationWarning: If any deprecated option is present.
        """
        adapted = dict(params)
        formats = ["markdown"]
        use_legacy_options = False

        extractor_options = adapted.get("extractorOptions")
        if (
            isinstance(extractor_options, dict)
            and extractor_options.get("mode") in self._LLM_EXTRACTION_MODES
        ):
            use_legacy_options = True
            # ``userPrompt`` wins over ``extractionPrompt``; an absent or empty
            # prompt falls back to the default instead of propagating "".
            adapted["prompt"] = (
                extractor_options.get("userPrompt")
                or extractor_options.get("extractionPrompt")
                or self._DEFAULT_EXTRACTION_PROMPT
            )
            schema = extractor_options.get("extractionSchema")
            if schema:
                adapted["schema"] = schema
            del adapted["extractorOptions"]

        legacy_scrape_keys = (
            "includeMarkdown",
            *self._LEGACY_FORMAT_FLAGS,
            *self._LEGACY_TAG_RENAMES,
        )
        # Presence-based so that e.g. ``includeMarkdown=False`` on its own is
        # honoured rather than skipped.
        if any(key in adapted for key in legacy_scrape_keys):
            use_legacy_options = True

        if use_legacy_options:
            warnings.warn(
                "Deprecated parameters detected. See Firecrawl v1 docs for updates.",
                DeprecationWarning,
            )

            # Only an explicit False disables markdown; None leaves the default.
            if adapted.pop("includeMarkdown", None) is False:
                formats.remove("markdown")

            for flag, format_name in self._LEGACY_FORMAT_FLAGS.items():
                if adapted.pop(flag, None):
                    formats.append(format_name)

            for old_key, new_key in self._LEGACY_TAG_RENAMES.items():
                if old_key not in adapted:
                    continue
                value = adapted.pop(old_key)
                # Tag filters are lists, so test truthiness, not ``is True``.
                if value:
                    adapted[new_key] = value

        # An explicit ``formats`` from the caller always takes precedence.
        if "formats" not in adapted:
            adapted["formats"] = formats

        return adapted

    def __init__(
        self,
        url: str,
        *,
        api_key: Optional[str] = None,
        api_url: Optional[str] = None,
        mode: Literal["crawl", "scrape", "map"] = "crawl",
        params: Optional[dict] = None,
    ):
        """Initialize with API key and url.

        Args:
            url: The url to be crawled.
            api_key: The Firecrawl API key. If not specified will be read from env var
                FIRECRAWL_API_KEY. Get an API key from Firecrawl.
            api_url: The Firecrawl API URL, forwarded unchanged to ``FirecrawlApp``.
                If not specified will be read from env var FIRECRAWL_API_URL
                or defaults to https://api.firecrawl.dev.
            mode: The mode to run the loader in. Default is "crawl".
                 Options include "scrape" (single url),
                 "crawl" (all accessible sub pages),
                 "map" (returns list of links that are semantically related).
            params: The parameters to pass to the Firecrawl API.
                Examples include crawlerOptions.
                For more details, visit: https://github.com/mendableai/firecrawl-py

        Raises:
            ImportError: If the ``firecrawl`` package is not installed.
            ValueError: If ``mode`` is not recognised or ``url`` is empty.
        """
        try:
            from firecrawl import FirecrawlApp
        except ImportError as err:
            raise ImportError(
                "`firecrawl` package not found, please run `pip install firecrawl-py`"
            ) from err

        # "search" is accepted here so that ``lazy_load`` can reject it with its
        # own, more specific message.
        if mode not in ("crawl", "scrape", "search", "map"):
            raise ValueError(
                f"Invalid mode '{mode}'. Allowed: 'crawl', 'scrape', 'search', 'map'."
            )

        if not url:
            raise ValueError("Url must be provided")

        api_key = api_key or get_from_env("api_key", "FIRECRAWL_API_KEY")
        self.firecrawl = FirecrawlApp(api_key=api_key, api_url=api_url)
        self.url = url
        self.mode = mode
        self.params = params or {}

    def lazy_load(self) -> Iterator[Document]:
        """Fetch from Firecrawl according to ``self.mode`` and yield documents.

        In "scrape" and "crawl" mode each result's content is the first
        non-empty of ``markdown``, ``html`` and ``rawHtml``, and its metadata is
        the result's ``metadata`` entry. In "map" mode each returned item is
        used directly as the page content, with empty metadata. Results with no
        content are skipped.

        Yields:
            One ``Document`` per result that has content.

        Raises:
            ValueError: If the mode is "search" or unknown, or if the URL is
                empty in "crawl" or "map" mode.
        """
        if self.mode == "scrape":
            firecrawl_docs = [
                self.firecrawl.scrape_url(
                    self.url, params=self.legacy_scrape_options_adapter(self.params)
                )
            ]
        elif self.mode == "crawl":
            if not self.url:
                raise ValueError("URL is required for crawl mode")
            crawl_response = self.firecrawl.crawl_url(
                self.url, params=self.legacy_crawler_options_adapter(self.params)
            )
            # ``or []`` also covers a response whose "data" is present but None.
            firecrawl_docs = crawl_response.get("data") or []
        elif self.mode == "map":
            if not self.url:
                raise ValueError("URL is required for map mode")
            firecrawl_docs = self.firecrawl.map_url(self.url, params=self.params)
            # NOTE(review): some firecrawl-py releases appear to return the links
            # wrapped in a dict rather than as a bare list - confirm against the
            # SDK. Iterating a dict would yield its keys as documents.
            if isinstance(firecrawl_docs, dict):
                firecrawl_docs = firecrawl_docs.get("links") or []
        elif self.mode == "search":
            raise ValueError(
                "Search mode is not supported in this version, please downgrade."
            )
        else:
            raise ValueError(
                f"Invalid mode '{self.mode}'. Allowed: 'crawl', 'scrape', 'map'."
            )

        for doc in firecrawl_docs:
            if self.mode == "map":
                page_content = doc
                metadata = {}
            else:
                page_content = (
                    doc.get("markdown") or doc.get("html") or doc.get("rawHtml", "")
                )
                metadata = doc.get("metadata", {})
            if not page_content:
                continue
            yield Document(
                page_content=page_content,
                metadata=metadata,
            )