class FireCrawlLoader(BaseLoader):
    """FireCrawlLoader document loader integration.

    Setup:
        Install ``firecrawl-py``, ``langchain_community`` and set environment
        variable ``FIRECRAWL_API_KEY``.

        .. code-block:: bash

            pip install -U firecrawl-py langchain_community
            export FIRECRAWL_API_KEY="your-api-key"

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import FireCrawlLoader

            loader = FireCrawlLoader(
                url="https://firecrawl.dev",
                mode="crawl",
                # other params = ...
            )

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            # async variant:
            # docs_lazy = await loader.alazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl) Join the waitlist to turn any web
            {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}

    Async load:
        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl) Join the waitlist to turn any web
            {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}
    """  # noqa: E501
def legacy_crawler_options_adapter(self, params: dict) -> dict:
    """Translate deprecated Firecrawl v0 crawler options to their v1 names.

    Emits a ``DeprecationWarning`` when any legacy option is present, renames
    ``includes`` -> ``includePaths``, ``excludes`` -> ``excludePaths``,
    ``allowBackwardCrawling`` -> ``allowBackwardLinks``,
    ``allowExternalContentLinks`` -> ``allowExternalLinks``, and converts
    ``pageOptions`` into ``scrapeOptions`` via
    :meth:`legacy_scrape_options_adapter`.

    Args:
        params: Crawler options; mutated in place.

    Returns:
        The same dict with legacy keys migrated and removed.
    """
    legacy_keys = [
        "includes",
        "excludes",
        "allowBackwardCrawling",
        "allowExternalContentLinks",
        "pageOptions",
    ]
    # Warn once if any deprecated v0 option carries a truthy value.
    if any(params.get(key) for key in legacy_keys):
        warnings.warn(
            "Deprecated parameters detected. See Firecrawl v1 docs for updates.",
            DeprecationWarning,
        )
        # BUG FIX: "includes"/"excludes" are lists of path patterns. The
        # previous `is True` comparison can never hold for a list, so the
        # values were deleted without being migrated; migrate any truthy
        # value instead. The legacy key is always removed.
        if "includes" in params:
            if params["includes"]:
                params["includePaths"] = params["includes"]
            del params["includes"]
        if "excludes" in params:
            if params["excludes"]:
                params["excludePaths"] = params["excludes"]
            del params["excludes"]
        if "allowBackwardCrawling" in params:
            if params["allowBackwardCrawling"]:
                params["allowBackwardLinks"] = params["allowBackwardCrawling"]
            del params["allowBackwardCrawling"]
        if "allowExternalContentLinks" in params:
            if params["allowExternalContentLinks"]:
                params["allowExternalLinks"] = params["allowExternalContentLinks"]
            del params["allowExternalContentLinks"]
        if "pageOptions" in params:
            # Nested page options become v1 scrapeOptions.
            if isinstance(params["pageOptions"], dict):
                params["scrapeOptions"] = self.legacy_scrape_options_adapter(
                    params["pageOptions"]
                )
            del params["pageOptions"]
    return params
def legacy_scrape_options_adapter(self, params: dict) -> dict:
    """Translate deprecated Firecrawl v0 scrape options to their v1 names.

    Builds the v1 ``formats`` list from the old ``include*``/``screenshot``
    booleans, migrates ``extractorOptions`` (llm-extraction modes) to
    top-level ``prompt``/``schema``, and renames ``onlyIncludeTags`` ->
    ``includeTags`` and ``removeTags`` -> ``excludeTags``. Emits a
    ``DeprecationWarning`` when legacy options are detected.

    Args:
        params: Scrape options; mutated in place.

    Returns:
        The same dict with legacy keys migrated and removed.
    """
    use_legacy_options = False
    formats = ["markdown"]

    # v0 LLM-extraction options move to top-level "prompt"/"schema".
    if "extractorOptions" in params:
        if "mode" in params["extractorOptions"]:
            if (
                params["extractorOptions"]["mode"] == "llm-extraction"
                or params["extractorOptions"]["mode"]
                == "llm-extraction-from-raw-html"
                or params["extractorOptions"]["mode"]
                == "llm-extraction-from-markdown"
            ):
                use_legacy_options = True
                if "extractionPrompt" in params["extractorOptions"]:
                    if params["extractorOptions"]["extractionPrompt"]:
                        params["prompt"] = params["extractorOptions"][
                            "extractionPrompt"
                        ]
                else:
                    # No prompt provided: fall back to a generic default.
                    params["prompt"] = params["extractorOptions"].get(
                        "extractionPrompt",
                        "Extract page information based on the schema.",
                    )
                if "extractionSchema" in params["extractorOptions"]:
                    if params["extractorOptions"]["extractionSchema"]:
                        params["schema"] = params["extractorOptions"][
                            "extractionSchema"
                        ]
                if "userPrompt" in params["extractorOptions"]:
                    if params["extractorOptions"]["userPrompt"]:
                        params["prompt"] = params["extractorOptions"]["userPrompt"]
                del params["extractorOptions"]

    scrape_keys = [
        "includeMarkdown",
        "includeHtml",
        "includeRawHtml",
        "includeExtract",
        "includeLinks",
        "screenshot",
        "fullPageScreenshot",
        "onlyIncludeTags",
        "removeTags",
    ]
    for key in scrape_keys:
        if params.get(key):
            use_legacy_options = True
            break

    if use_legacy_options:
        warnings.warn(
            "Deprecated parameters detected. See Firecrawl v1 docs for updates.",
            DeprecationWarning,
        )
        # Boolean flags toggle entries in the v1 "formats" list.
        if "includeMarkdown" in params:
            if params["includeMarkdown"] is False:
                formats.remove("markdown")
            del params["includeMarkdown"]
        if "includeHtml" in params:
            if params["includeHtml"] is True:
                formats.append("html")
            del params["includeHtml"]
        if "includeRawHtml" in params:
            if params["includeRawHtml"] is True:
                formats.append("rawHtml")
            del params["includeRawHtml"]
        if "includeExtract" in params:
            if params["includeExtract"] is True:
                formats.append("extract")
            del params["includeExtract"]
        if "includeLinks" in params:
            if params["includeLinks"] is True:
                formats.append("links")
            del params["includeLinks"]
        if "screenshot" in params:
            if params["screenshot"] is True:
                formats.append("screenshot")
            del params["screenshot"]
        if "fullPageScreenshot" in params:
            if params["fullPageScreenshot"] is True:
                formats.append("screenshot@fullPage")
            del params["fullPageScreenshot"]
        # BUG FIX: "onlyIncludeTags"/"removeTags" hold lists of tag
        # selectors, so the previous `is True` comparison never matched and
        # the values were silently dropped instead of migrated; migrate any
        # truthy value instead.
        if "onlyIncludeTags" in params:
            if params["onlyIncludeTags"]:
                params["includeTags"] = params["onlyIncludeTags"]
            del params["onlyIncludeTags"]
        if "removeTags" in params:
            if params["removeTags"]:
                params["excludeTags"] = params["removeTags"]
            del params["removeTags"]

    # Do not clobber an explicit v1 "formats" the caller already set.
    if "formats" not in params:
        params["formats"] = formats

    return params
def __init__(
    self,
    url: str,
    *,
    api_key: Optional[str] = None,
    api_url: Optional[str] = None,
    mode: Literal["crawl", "scrape", "map", "extract"] = "crawl",
    params: Optional[dict] = None,
):
    """Initialize with API key and url.

    Args:
        url: The url to be crawled.
        api_key: The Firecrawl API key. If not specified will be read from
            env var FIRECRAWL_API_KEY. Get an API key
        api_url: The Firecrawl API URL. If not specified will be read from
            env var FIRECRAWL_API_URL or defaults to
            https://api.firecrawl.dev.
        mode: The mode to run the loader in. Default is "crawl".
            Options include "scrape" (single url),
            "crawl" (all accessible sub pages),
            "map" (returns list of links that are semantically related).
            "extract" (extracts structured data from a page).
        params: The parameters to pass to the Firecrawl API.
            Examples include crawlerOptions.
            For more details, visit: https://github.com/mendableai/firecrawl-py
    """
    try:
        from firecrawl import FirecrawlApp
    except ImportError:
        raise ImportError(
            "`firecrawl` package not found, please run `pip install firecrawl-py`"
        )

    # "search" is accepted here so that lazy_load can report its removal
    # with a dedicated message instead of a generic validation error.
    allowed_modes = ("crawl", "scrape", "search", "map", "extract")
    if mode not in allowed_modes:
        raise ValueError(
            f"""Invalid mode '{mode}'. Allowed: 'crawl', 'scrape', 'search', 'map', 'extract'."""
        )
    if not url:
        raise ValueError("Url must be provided")

    resolved_key = api_key or get_from_env("api_key", "FIRECRAWL_API_KEY")
    self.firecrawl = FirecrawlApp(api_key=resolved_key, api_url=api_url)
    self.url = url
    self.mode = mode
    self.params = params or {}
def lazy_load(self) -> Iterator[Document]:
    """Lazily fetch documents from Firecrawl according to ``self.mode``.

    Yields:
        One :class:`Document` per scraped/crawled page, per mapped link, or
        a single document for extract mode. Pages with no content are
        skipped.

    Raises:
        ValueError: If the URL is missing for the chosen mode, if mode is
            "search" (removed in this version), or if mode is unrecognized.
    """
    mode = self.mode
    if mode == "scrape":
        raw_results = [
            self.firecrawl.scrape_url(
                self.url, params=self.legacy_scrape_options_adapter(self.params)
            )
        ]
    elif mode == "crawl":
        if not self.url:
            raise ValueError("URL is required for crawl mode")
        crawl_response = self.firecrawl.crawl_url(
            self.url, params=self.legacy_crawler_options_adapter(self.params)
        )
        raw_results = crawl_response.get("data", [])
    elif mode == "map":
        if not self.url:
            raise ValueError("URL is required for map mode")
        raw_results = self.firecrawl.map_url(self.url, params=self.params)
    elif mode == "extract":
        if not self.url:
            raise ValueError("URL is required for extract mode")
        # Extract returns structured data; stringify it into one document.
        raw_results = [str(self.firecrawl.extract([self.url], params=self.params))]
    elif mode == "search":
        raise ValueError(
            "Search mode is not supported in this version, please downgrade."
        )
    else:
        raise ValueError(
            f"""Invalid mode '{mode}'. Allowed: 'crawl', 'scrape', 'map', 'extract'."""
        )

    for item in raw_results:
        if mode in ("map", "extract"):
            # map/extract results are plain strings, not page dicts.
            content, meta = item, {}
        else:
            content = (
                item.get("markdown") or item.get("html") or item.get("rawHtml", "")
            )
            meta = item.get("metadata", {})
        if not content:
            continue
        yield Document(page_content=content, metadata=meta)