Source code for langchain_community.document_loaders.url_playwright
"""Loader that uses Playwright to load a page, then uses unstructured to parse html."""importloggingfromabcimportABC,abstractmethodfromtypingimportTYPE_CHECKING,AsyncIterator,Dict,Iterator,List,Optionalfromlangchain_core.documentsimportDocumentfromlangchain_community.document_loaders.baseimportBaseLoaderifTYPE_CHECKING:fromplaywright.async_apiimportBrowserasAsyncBrowserfromplaywright.async_apiimportPageasAsyncPagefromplaywright.async_apiimportResponseasAsyncResponsefromplaywright.sync_apiimportBrowser,Page,Responselogger=logging.getLogger(__name__)
class PlaywrightEvaluator(ABC):
    """Common interface implemented by every page evaluator.

    An evaluator is handed a page, a browser instance, and a response object.
    It processes the page however it needs to and returns the resulting text.
    """

    @abstractmethod
    def evaluate(
        self, page: "Page", browser: "Browser", response: "Response"
    ) -> str:
        """Process the page synchronously and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            The text content of the page.
        """

    @abstractmethod
    async def evaluate_async(
        self,
        page: "AsyncPage",
        browser: "AsyncBrowser",
        response: "AsyncResponse",
    ) -> str:
        """Process the page asynchronously and return the resulting text.

        Args:
            page: The page to process.
            browser: The browser instance.
            response: The response from page.goto().

        Returns:
            The text content of the page.
        """
class UnstructuredHtmlEvaluator(PlaywrightEvaluator):
    """Turn a page's HTML into text with the `unstructured` library."""

    def __init__(self, remove_selectors: Optional[List[str]] = None):
        """Store the selectors to strip and verify `unstructured` is installed.

        Args:
            remove_selectors: Selectors whose visible matches are removed from
                the page before its HTML is partitioned. ``None`` (or an empty
                list) removes nothing.

        Raises:
            ImportError: If the `unstructured` package cannot be imported.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError:
            install_hint = (
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            )
            raise ImportError(install_hint)

        self.remove_selectors = remove_selectors

    def evaluate(
        self, page: "Page", browser: "Browser", response: "Response"
    ) -> str:
        """Strip the configured selectors, then partition the page HTML (sync).

        ``browser`` and ``response`` are accepted to satisfy the evaluator
        interface but are not used here.
        """
        from unstructured.partition.html import partition_html

        selectors = self.remove_selectors or []
        for selector in selectors:
            for node in page.locator(selector).all():
                # Only elements that are currently visible are removed.
                if not node.is_visible():
                    continue
                node.evaluate("element => element.remove()")

        html = page.content()
        return "\n\n".join(str(part) for part in partition_html(text=html))

    async def evaluate_async(
        self,
        page: "AsyncPage",
        browser: "AsyncBrowser",
        response: "AsyncResponse",
    ) -> str:
        """Strip the configured selectors, then partition the page HTML (async).

        ``browser`` and ``response`` are accepted to satisfy the evaluator
        interface but are not used here.
        """
        from unstructured.partition.html import partition_html

        selectors = self.remove_selectors or []
        for selector in selectors:
            for node in await page.locator(selector).all():
                # Only elements that are currently visible are removed.
                if not await node.is_visible():
                    continue
                await node.evaluate("element => element.remove()")

        html = await page.content()
        return "\n\n".join(str(part) for part in partition_html(text=html))
class PlaywrightURLLoader(BaseLoader):
    """Load `HTML` pages with `Playwright` and parse with `Unstructured`.

    This is useful for loading pages that require javascript to render.

    Attributes:
        urls (List[str]): List of URLs to load.
        continue_on_failure (bool): If True, continue loading other URLs on
            failure.
        headless (bool): If True, the browser will run in headless mode.
        proxy (Optional[Dict[str, str]]): If set, the browser will access URLs
            through the specified proxy.

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import PlaywrightURLLoader

            urls = ["https://api.ipify.org/?format=json",]
            proxy={
                "server": "https://xx.xx.xx:15818", # https://<host>:<port>
                "username": "username",
                "password": "password"
            }
            loader = PlaywrightURLLoader(urls, proxy=proxy)
            data = loader.load()
    """

    def __init__(
        self,
        urls: List[str],
        continue_on_failure: bool = True,
        headless: bool = True,
        remove_selectors: Optional[List[str]] = None,
        evaluator: Optional[PlaywrightEvaluator] = None,
        proxy: Optional[Dict[str, str]] = None,
    ):
        """Load a list of URLs using Playwright.

        Args:
            urls: URLs to load, visited in order.
            continue_on_failure: If True, a URL that fails is logged and
                skipped; if False, the exception propagates to the caller.
            headless: Passed to the browser launch call.
            remove_selectors: Selectors handed to the default
                ``UnstructuredHtmlEvaluator``. Cannot be combined with
                ``evaluator``.
            evaluator: Custom evaluator used to turn a page into text. When
                omitted, an ``UnstructuredHtmlEvaluator`` is created.
            proxy: Passed to the browser launch call.

        Raises:
            ImportError: If the `playwright` package cannot be imported.
            ValueError: If both ``remove_selectors`` and ``evaluator`` are
                given.
        """
        try:
            import playwright  # noqa:F401
        except ImportError:
            raise ImportError(
                "playwright package not found, please install it with "
                "`pip install playwright`"
            )

        self.urls = urls
        self.continue_on_failure = continue_on_failure
        self.headless = headless
        self.proxy = proxy

        if remove_selectors and evaluator:
            raise ValueError(
                "`remove_selectors` and `evaluator` cannot be both not None"
            )

        # Use the provided evaluator, if any, otherwise, use the default.
        self.evaluator = evaluator or UnstructuredHtmlEvaluator(remove_selectors)

    @staticmethod
    def _close_page(page: Optional["Page"]) -> None:
        """Close ``page`` if one was opened, logging instead of raising."""
        if page is None:
            return
        try:
            page.close()
        except Exception as e:
            # Cleanup is best-effort: it must not mask the real load error or
            # abort the remaining URLs.
            logger.warning("Error closing page: %s", e)

    @staticmethod
    async def _aclose_page(page: Optional["AsyncPage"]) -> None:
        """Async counterpart of ``_close_page``."""
        if page is None:
            return
        try:
            await page.close()
        except Exception as e:
            # Cleanup is best-effort: it must not mask the real load error or
            # abort the remaining URLs.
            logger.warning("Error closing page: %s", e)

    def lazy_load(self) -> Iterator[Document]:
        """Load the specified URLs using Playwright and create Document instances.

        Each URL is opened in its own page, which is closed as soon as the
        evaluator has produced its text. The browser is closed when iteration
        finishes, fails, or the generator is closed early.

        Yields:
            One Document per successfully loaded URL, with the URL stored
            under the ``"source"`` metadata key.

        Raises:
            Exception: Any error raised while fetching or evaluating a URL,
                when ``continue_on_failure`` is False.
        """
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=self.headless, proxy=self.proxy)
            try:
                for url in self.urls:
                    page = None
                    try:
                        page = browser.new_page()
                        response = page.goto(url)
                        if response is None:
                            raise ValueError(
                                f"page.goto() returned None for url {url}"
                            )
                        text = self.evaluator.evaluate(page, browser, response)
                    except Exception as e:
                        if self.continue_on_failure:
                            logger.error(
                                f"Error fetching or processing {url}, exception: {e}"
                            )
                            continue
                        raise
                    finally:
                        # Runs on success, on `continue`, and on re-raise, so
                        # pages never accumulate across a long URL list.
                        self._close_page(page)

                    # Yield outside the try so only fetch/evaluate errors are
                    # subject to the continue_on_failure handling above.
                    yield Document(page_content=text, metadata={"source": url})
            finally:
                browser.close()

    async def aload(self) -> List[Document]:
        """Load the specified URLs with Playwright and create Documents asynchronously.

        Use this function when in a jupyter notebook environment.

        Returns:
            A list of Document instances with loaded content.
        """
        return [doc async for doc in self.alazy_load()]

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Load the specified URLs with Playwright and create Documents asynchronously.

        Use this function when in a jupyter notebook environment.

        Each URL is opened in its own page, which is closed as soon as the
        evaluator has produced its text. The browser is closed when iteration
        finishes, fails, or the generator is closed early.

        Yields:
            One Document per successfully loaded URL, with the URL stored
            under the ``"source"`` metadata key.

        Raises:
            Exception: Any error raised while fetching or evaluating a URL,
                when ``continue_on_failure`` is False.
        """
        from playwright.async_api import async_playwright

        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=self.headless, proxy=self.proxy
            )
            try:
                for url in self.urls:
                    page = None
                    try:
                        page = await browser.new_page()
                        response = await page.goto(url)
                        if response is None:
                            raise ValueError(
                                f"page.goto() returned None for url {url}"
                            )
                        text = await self.evaluator.evaluate_async(
                            page, browser, response
                        )
                    except Exception as e:
                        if self.continue_on_failure:
                            logger.error(
                                f"Error fetching or processing {url}, exception: {e}"
                            )
                            continue
                        raise
                    finally:
                        # Runs on success, on `continue`, and on re-raise, so
                        # pages never accumulate across a long URL list.
                        await self._aclose_page(page)

                    # Yield outside the try so only fetch/evaluate errors are
                    # subject to the continue_on_failure handling above.
                    yield Document(page_content=text, metadata={"source": url})
            finally:
                await browser.close()