class BrowserbaseLoader(BaseLoader):
    """Load pre-rendered web pages using a headless browser hosted on Browserbase.

    Depends on `browserbase` and `playwright` packages.
    Get your API key from https://browserbase.com
    """

    def __init__(
        self,
        urls: Sequence[str],
        text_content: bool = False,
        api_key: Optional[str] = None,
        project_id: Optional[str] = None,
        session_id: Optional[str] = None,
        proxy: Optional[bool] = None,
    ):
        """Store the loader configuration and build the Browserbase client.

        Args:
            urls: URLs to load; one ``Document`` is produced per URL.
            text_content: When true, ``lazy_load`` returns the inner text of
                ``<body>``; otherwise it returns the full page HTML.
            api_key: Passed straight to the ``Browserbase`` client constructor.
            project_id: Project used when a new session has to be created.
                Required by ``lazy_load`` whenever ``session_id`` is not set.
            session_id: Identifier of an existing session. When set, that
                session is retrieved for every URL instead of creating one.
            proxy: When not ``None``, forwarded (coerced to ``bool``) as the
                ``proxy`` option on session creation.

        Raises:
            ImportError: If the ``browserbase`` package is not installed.
        """
        self.urls = urls
        self.text_content = text_content
        self.session_id = session_id
        self.project_id = project_id
        self.proxy = proxy

        try:
            from browserbase import Browserbase
        except ImportError as exc:
            raise ImportError(
                "You must run "
                "`pip install --upgrade "
                "browserbase playwright` "
                "to use the Browserbase loader."
            ) from exc

        self.browserbase = Browserbase(api_key=api_key)

    def lazy_load(self) -> Iterator[Document]:
        """Load pages from URLs.

        Each URL is visited in its own Playwright context connected to a
        Browserbase session over CDP.

        Yields:
            One ``Document`` per URL, whose ``page_content`` is the page text
            or HTML (depending on ``text_content``) and whose metadata holds
            the source ``url``.

        Raises:
            ImportError: If the ``playwright`` package is not installed.
            ValueError: If neither ``session_id`` nor ``project_id`` is set
                when a URL is about to be loaded.
        """
        try:
            from playwright.sync_api import sync_playwright
        except ImportError as exc:
            raise ImportError(
                "playwright is required for BrowserbaseLoader. "
                "Please run `pip install --upgrade playwright`."
            ) from exc

        for url in self.urls:
            with sync_playwright() as playwright:
                # Create or use existing session
                if self.session_id:
                    session = self.browserbase.sessions.retrieve(id=self.session_id)
                else:
                    if not self.project_id:
                        raise ValueError("project_id is required to create a session")
                    session_params: Dict[str, Any] = {"project_id": self.project_id}
                    if self.proxy is not None:
                        session_params["proxy"] = bool(self.proxy)
                    session = self.browserbase.sessions.create(**session_params)

                # Connect to the remote session
                browser = playwright.chromium.connect_over_cdp(session.connect_url)
                try:
                    context = browser.contexts[0]
                    page = context.pages[0]

                    # Navigate to URL and get content
                    page.goto(url)

                    # Get content based on the text_content flag
                    if self.text_content:
                        content = str(page.inner_text("body"))
                    else:
                        content = str(page.content())

                    page.close()
                finally:
                    # Always release the remote browser connection, even when
                    # navigation or content extraction raised.
                    browser.close()

            # Yield only after Playwright has shut down so that a paused
            # generator does not keep browser resources alive.
            yield Document(
                page_content=content,
                metadata={
                    "url": url,
                },
            )