Source code for langchain_community.document_loaders.url_selenium
"""Loader that uses Selenium to load a page, then uses unstructured to load the html."""

import logging
from typing import TYPE_CHECKING, List, Literal, Optional, Union

# Selenium is only needed for type annotations at module level; the runtime
# imports happen lazily inside the loader so the module can be imported
# without selenium installed.
if TYPE_CHECKING:
    from selenium.webdriver import Chrome, Firefox

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

# Module-level logger used to report per-URL failures.
logger = logging.getLogger(__name__)
class SeleniumURLLoader(BaseLoader):
    """Load `HTML` pages with `Selenium` and parse with `Unstructured`.

    This is useful for loading pages that require javascript to render.

    Attributes:
        urls (List[str]): List of URLs to load.
        continue_on_failure (bool): If True, continue loading other URLs on failure.
        browser (str): The browser to use, either 'chrome' or 'firefox'.
        binary_location (Optional[str]): The location of the browser binary.
        executable_path (Optional[str]): The path to the browser executable.
        headless (bool): If True, the browser will run in headless mode.
        arguments (List[str]): List of arguments to pass to the browser.
    """

    def __init__(
        self,
        urls: List[str],
        continue_on_failure: bool = True,
        browser: Literal["chrome", "firefox"] = "chrome",
        binary_location: Optional[str] = None,
        executable_path: Optional[str] = None,
        headless: bool = True,
        arguments: Optional[List[str]] = None,
    ):
        """Load a list of URLs using Selenium and unstructured.

        Args:
            urls: URLs to load.
            continue_on_failure: If True, log and skip URLs that fail instead
                of raising.
            browser: Which browser to drive, 'chrome' or 'firefox'.
            binary_location: Optional path to the browser binary.
            executable_path: Optional path to the driver executable.
            headless: If True, run the browser in headless mode.
            arguments: Extra command-line arguments for the browser. ``None``
                (the default) means no extra arguments.

        Raises:
            ImportError: If `selenium` or `unstructured` is not installed.
        """
        try:
            import selenium  # noqa:F401
        except ImportError as exc:
            raise ImportError(
                "selenium package not found, please install it with "
                "`pip install selenium`"
            ) from exc

        try:
            import unstructured  # noqa:F401
        except ImportError as exc:
            raise ImportError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from exc

        self.urls = urls
        self.continue_on_failure = continue_on_failure
        self.browser = browser
        self.binary_location = binary_location
        self.executable_path = executable_path
        self.headless = headless
        # Copy so instances never share (or mutate) a default or caller-owned list.
        self.arguments = list(arguments) if arguments is not None else []

    def _get_driver(self) -> Union["Chrome", "Firefox"]:
        """Create and return a WebDriver instance based on the specified browser.

        Raises:
            ValueError: If an invalid browser is specified.

        Returns:
            Union[Chrome, Firefox]: A WebDriver instance for the specified browser.
        """
        browser = self.browser.lower()

        if browser == "chrome":
            from selenium.webdriver import Chrome
            from selenium.webdriver.chrome.options import Options as ChromeOptions
            from selenium.webdriver.chrome.service import Service

            chrome_options = ChromeOptions()
            for arg in self.arguments:
                chrome_options.add_argument(arg)

            if self.headless:
                chrome_options.add_argument("--headless")
                chrome_options.add_argument("--no-sandbox")
            if self.binary_location is not None:
                chrome_options.binary_location = self.binary_location
            if self.executable_path is None:
                return Chrome(options=chrome_options)
            return Chrome(
                options=chrome_options,
                service=Service(executable_path=self.executable_path),
            )

        if browser == "firefox":
            from selenium.webdriver import Firefox
            from selenium.webdriver.firefox.options import Options as FirefoxOptions
            from selenium.webdriver.firefox.service import Service

            firefox_options = FirefoxOptions()
            for arg in self.arguments:
                firefox_options.add_argument(arg)

            if self.headless:
                firefox_options.add_argument("--headless")
            if self.binary_location is not None:
                firefox_options.binary_location = self.binary_location
            if self.executable_path is None:
                return Firefox(options=firefox_options)
            return Firefox(
                options=firefox_options,
                service=Service(executable_path=self.executable_path),
            )

        raise ValueError("Invalid browser specified. Use 'chrome' or 'firefox'.")

    def _build_metadata(self, url: str, driver: Union["Chrome", "Firefox"]) -> dict:
        """Build metadata based on the contents of the webpage.

        Args:
            url: The URL that was loaded; stored under ``"source"``.
            driver: The WebDriver currently showing the page.

        Returns:
            dict: ``source``, ``title``, ``description`` and ``language`` keys,
            each falling back to a "No ... found." placeholder when the page
            does not provide a value.
        """
        from selenium.common.exceptions import NoSuchElementException
        from selenium.webdriver.common.by import By

        metadata = {
            "source": url,
            "title": "No title found.",
            "description": "No description found.",
            "language": "No language found.",
        }

        if title := driver.title:
            metadata["title"] = title

        # find_element raises NoSuchElementException when nothing matches, so a
        # missing tag simply leaves the placeholder in place.
        try:
            description = driver.find_element(By.XPATH, '//meta[@name="description"]')
            metadata["description"] = (
                description.get_attribute("content") or "No description found."
            )
        except NoSuchElementException:
            pass

        try:
            html_tag = driver.find_element(By.TAG_NAME, "html")
            metadata["language"] = (
                html_tag.get_attribute("lang") or "No language found."
            )
        except NoSuchElementException:
            pass

        return metadata

    def load(self) -> List[Document]:
        """Load the specified URLs using Selenium and create Document instances.

        Returns:
            List[Document]: A list of Document instances with loaded content.

        Raises:
            Exception: Any error raised while fetching or parsing a URL, when
                ``continue_on_failure`` is False.
        """
        from unstructured.partition.html import partition_html

        docs: List[Document] = []
        driver = self._get_driver()

        # Always shut the browser down, even when an error is re-raised below;
        # otherwise the browser/driver process is leaked.
        try:
            for url in self.urls:
                try:
                    driver.get(url)
                    page_content = driver.page_source
                    elements = partition_html(text=page_content)
                    text = "\n\n".join(str(el) for el in elements)
                    metadata = self._build_metadata(url, driver)
                    docs.append(Document(page_content=text, metadata=metadata))
                except Exception as e:
                    if not self.continue_on_failure:
                        raise
                    logger.error(
                        "Error fetching or processing %s, exception: %s", url, e
                    )
        finally:
            driver.quit()

        return docs