Source code for langchain_community.document_loaders.news
"""Loader that uses unstructured to load HTML files."""importloggingfromtypingimportAny,Iterator,Listfromlangchain_core.documentsimportDocumentfromlangchain_community.document_loaders.baseimportBaseLoaderlogger=logging.getLogger(__name__)
class NewsURLLoader(BaseLoader):
    """Load news articles from URLs using the `newspaper` package.

    Args:
        urls: URLs to load. Each is loaded into its own document.
        text_mode: If True, extract text from URL and use that for page content.
            Otherwise, extract raw HTML.
        nlp: If True, perform NLP on the extracted contents, like providing a
            summary and extracting keywords.
        continue_on_failure: If True, continue loading documents even if
            loading fails for a particular URL.
        show_progress_bar: If True, use tqdm to show a loading progress bar.
            Requires tqdm to be installed, ``pip install tqdm``.
        **newspaper_kwargs: Any additional named arguments to pass to
            newspaper.Article().

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import NewsURLLoader

            loader = NewsURLLoader(
                urls=["<url-1>", "<url-2>"],
            )
            docs = loader.load()

    Newspaper reference:
        https://newspaper.readthedocs.io/en/latest/
    """

    def __init__(
        self,
        urls: List[str],
        text_mode: bool = True,
        nlp: bool = False,
        continue_on_failure: bool = True,
        show_progress_bar: bool = False,
        **newspaper_kwargs: Any,
    ) -> None:
        """Initialize with the URLs to load and the extraction options.

        Raises:
            ImportError: If the ``newspaper`` package is not installed.
        """
        # Fail at construction time rather than on the first load if the
        # optional dependency is missing.
        try:
            import newspaper
        except ImportError as e:
            raise ImportError(
                "newspaper package not found, please install it with "
                "`pip install newspaper3k`"
            ) from e

        self.__version = newspaper.__version__
        self.urls = urls
        self.text_mode = text_mode
        self.nlp = nlp
        self.continue_on_failure = continue_on_failure
        self.newspaper_kwargs = newspaper_kwargs
        self.show_progress_bar = show_progress_bar

    def load(self) -> List[Document]:
        """Load every URL eagerly and return the resulting documents.

        Returns:
            One document per URL that was loaded successfully.

        Raises:
            ImportError: If ``show_progress_bar`` is True and tqdm is not
                installed.
        """
        documents = self.lazy_load()
        if self.show_progress_bar:
            try:
                from tqdm import tqdm
            except ImportError as e:
                raise ImportError(
                    "Package tqdm must be installed if show_progress_bar=True. "
                    "Please install with 'pip install tqdm' or set "
                    "show_progress_bar=False."
                ) from e
            # tqdm wraps the lazy iterator, so progress advances per document.
            documents = tqdm(documents)
        return list(documents)

    def lazy_load(self) -> Iterator[Document]:
        """Download and parse each URL, yielding one document at a time.

        Yields:
            A document whose ``page_content`` is the article text (or raw HTML
            when ``text_mode`` is False) and whose metadata holds the title,
            link, authors, language, description and publish date, plus
            keywords and summary when ``nlp`` is True.

        Raises:
            ImportError: If the ``newspaper`` package is not installed.
            Exception: Whatever downloading/parsing raised, when
                ``continue_on_failure`` is False.
        """
        try:
            from newspaper import Article
        except ImportError as e:
            raise ImportError(
                "Cannot import newspaper, please install with `pip install newspaper3k`"
            ) from e

        for url in self.urls:
            try:
                article = Article(url, **self.newspaper_kwargs)
                article.download()
                article.parse()
                if self.nlp:
                    article.nlp()
            except Exception as e:
                if not self.continue_on_failure:
                    # Bare raise keeps the original traceback intact.
                    raise
                # Best-effort mode: report the failure and move to the next URL.
                logger.error(
                    "Error fetching or processing %s, exception: %s", url, e
                )
                continue

            # getattr with defaults: not every parsed article exposes every field.
            metadata = {
                "title": getattr(article, "title", ""),
                "link": getattr(
                    article, "url", getattr(article, "canonical_link", "")
                ),
                "authors": getattr(article, "authors", []),
                "language": getattr(article, "meta_lang", ""),
                "description": getattr(article, "meta_description", ""),
                "publish_date": getattr(article, "publish_date", ""),
            }

            content = article.text if self.text_mode else article.html

            if self.nlp:
                metadata["keywords"] = getattr(article, "keywords", [])
                metadata["summary"] = getattr(article, "summary", "")

            yield Document(page_content=content, metadata=metadata)