class MWDumpLoader(BaseLoader):
    """Load `MediaWiki` dump from an `XML` file.

    Each page that passes the namespace/redirect filters is converted into a
    single ``Document`` whose content is the wikitext stripped of markup and
    whose metadata holds the page title under ``"source"``.

    Example:
        .. code-block:: python

            from langchain_text_splitters import RecursiveCharacterTextSplitter
            from langchain_community.document_loaders import MWDumpLoader

            loader = MWDumpLoader(
                file_path="myWiki.xml",
                encoding="utf8"
            )
            docs = loader.load()
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=0
            )
            texts = text_splitter.split_documents(docs)

    :param file_path: XML local file path
    :type file_path: str
    :param encoding: Charset encoding, defaults to "utf8"
    :type encoding: str, optional
    :param namespaces: The namespace of pages you want to parse.
        See https://www.mediawiki.org/wiki/Help:Namespaces#Localisation
        for a list of all common namespaces
    :type namespaces: List[int],optional
    :param skip_redirects: True to skip pages that redirect to other pages,
        False to keep them. False by default
    :type skip_redirects: bool, optional
    :param stop_on_error: False to skip over pages that cause parsing errors,
        True to stop. True by default
    :type stop_on_error: bool, optional
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        encoding: Optional[str] = "utf8",
        namespaces: Optional[Sequence[int]] = None,
        skip_redirects: Optional[bool] = False,
        stop_on_error: Optional[bool] = True,
    ):
        """Store the dump location and the page-filtering options.

        See the class docstring for the meaning of each parameter.
        """
        # Always keep the path as a plain string, whatever path type was given.
        self.file_path = file_path if isinstance(file_path, str) else str(file_path)
        self.encoding = encoding
        # Namespaces range from -2 to 15, inclusive.
        self.namespaces = namespaces
        self.skip_redirects = skip_redirects
        self.stop_on_error = stop_on_error

    def _load_dump_file(self, file=None):  # type: ignore[no-untyped-def]
        """Build an ``mwxml`` dump reader for this loader's file.

        :param file: Optional already-open text file object to read from.
            When given, the caller keeps ownership and must close it.
            When omitted, ``self.file_path`` is opened here with
            ``self.encoding``; the reader consumes it lazily, so that handle
            stays open for as long as the returned dump is in use.
        :return: The object returned by ``mwxml.Dump.from_file``.
        :raises ImportError: If the ``mwxml`` package is not installed.
        """
        try:
            import mwxml
        except ImportError as e:
            raise ImportError(
                "Unable to import 'mwxml'. Please install with `pip install mwxml`."
            ) from e

        if file is None:
            # Legacy call path: nothing here can close this handle because
            # the dump is read on demand by the caller.
            file = open(self.file_path, encoding=self.encoding)
        return mwxml.Dump.from_file(file)

    def _load_single_page_from_dump(self, page) -> Optional[Document]:  # type: ignore[no-untyped-def]
        """Parse a single page.

        Only the first revision produced by iterating ``page`` is used.

        :param page: A page object taken from the dump's ``pages`` iterator.
        :return: A ``Document`` with the markup-stripped text and
            ``{"source": page.title}`` metadata, or ``None`` when the page
            yields no revisions.
        :raises ImportError: If ``mwparserfromhell`` is not installed.
        """
        try:
            import mwparserfromhell
        except ImportError as e:
            raise ImportError(
                "Unable to import 'mwparserfromhell'. Please install with"
                " `pip install mwparserfromhell`."
            ) from e

        for revision in page:
            code = mwparserfromhell.parse(revision.text)
            text = code.strip_code(
                normalize=True, collapse=True, keep_template_params=False
            )
            metadata = {"source": page.title}
            # Return on the first revision; later revisions are not read.
            return Document(page_content=text, metadata=metadata)
        return None

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazy load from a file path.

        Pages are skipped when ``skip_redirects`` is set and the page is a
        redirect, when ``namespaces`` is non-empty and the page's namespace is
        not listed, or when the page has no revisions.

        :return: An iterator of ``Document`` objects, one per accepted page.
        :raises Exception: Any page parsing error is logged and then re-raised
            when ``stop_on_error`` is true; otherwise the page is skipped.
        """
        # The dump is parsed on demand, so the file has to stay open while
        # pages are being produced; the ``with`` block closes it once the
        # generator is exhausted, closed, or aborted by an error.
        with open(self.file_path, encoding=self.encoding) as dump_file:
            dump = self._load_dump_file(dump_file)

            for page in dump.pages:
                if self.skip_redirects and page.redirect:
                    continue
                if self.namespaces and page.namespace not in self.namespaces:
                    continue
                try:
                    document = self._load_single_page_from_dump(page)
                except Exception as e:
                    logger.error("Parsing error: {}".format(e))
                    if self.stop_on_error:
                        # Bare raise keeps the original traceback intact.
                        raise
                    continue
                # Yield outside the ``try`` so exceptions thrown into the
                # generator by the consumer are not logged as parsing errors.
                if document is not None:
                    yield document