class VsdxParser(BaseBlobParser, ABC):
    """Parser for vsdx files.

    The blob is opened as a zip archive and each page listed in
    ``visio/pages/pages.xml`` is turned into one ``Document`` containing
    the text of the page and of the pages it is related to.
    """

    def parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[override]
        """Parse a vsdx file.

        Args:
            blob: The blob holding the bytes of the vsdx file.

        Returns:
            The iterator produced by :meth:`lazy_parse`.
        """
        return self.lazy_parse(blob)

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Retrieve the contents of pages from a .vsdx file
        and insert them into documents, one document per page.

        Args:
            blob: The blob holding the bytes of the vsdx file.

        Yields:
            One ``Document`` per page, with ``source``, ``page`` (0-based
            position) and ``page_name`` in its metadata. Nothing is yielded
            when the archive lacks one of the required parts.
        """
        # Everything is read while the archive is open; documents are built
        # afterwards so no file handle is held across ``yield``.
        with blob.as_bytes_io() as vsdx_file_obj:
            with zipfile.ZipFile(vsdx_file_obj, "r") as zfile:
                pages = self.get_pages_content(zfile, blob.source)  # type: ignore[arg-type]

        for page_number, page_name, page_content in pages:
            yield Document(
                page_content=page_content,
                metadata={
                    "source": blob.source,
                    "page": page_number,
                    "page_name": page_name,
                },
            )

    def get_pages_content(
        self, zfile: zipfile.ZipFile, source: str
    ) -> List[Tuple[int, str, str]]:
        """Get the content of the pages of a vsdx file.

        Args:
            zfile: The vsdx file under zip format.
            source: The path of the vsdx file (only used in warnings).

        Returns:
            A list of tuples containing the page number, the name of the
            page and the content of the page for each page of the vsdx
            file. The list is empty when ``visio/pages/pages.xml``,
            ``visio/pages/_rels/pages.xml.rels`` or ``docProps/app.xml``
            is missing from the archive (a warning is printed).

        Raises:
            ImportError: If ``xmltodict`` is not installed.
            ValueError: If a page title read from ``docProps/app.xml`` has
                no page of the same name in ``visio/pages/pages.xml``.
        """
        try:
            import xmltodict
        except ImportError as err:
            raise ImportError(
                "The xmltodict library is required to parse vsdx files. "
                "Please install it with `pip install xmltodict`."
            ) from err

        # Member names are looked up repeatedly below; compute them once.
        archive_names = set(zfile.namelist())

        required_parts = (
            ("visio/pages/pages.xml", "pages.xml"),
            ("visio/pages/_rels/pages.xml.rels", "pages.xml.rels"),
            ("docProps/app.xml", "app.xml"),
        )
        for part, label in required_parts:
            if part not in archive_names:
                print(f"WARNING - No {label} file found in {source}")  # noqa: T201
                # An empty list (not None) so callers can iterate the result.
                return []

        pagesxml_content: dict = xmltodict.parse(zfile.read("visio/pages/pages.xml"))
        appxml_content: dict = xmltodict.parse(zfile.read("docProps/app.xml"))
        pagesxmlrels_content: dict = xmltodict.parse(
            zfile.read("visio/pages/_rels/pages.xml.rels")
        )

        # xmltodict gives a dict for a single child element and a list for
        # several; ``_as_list`` hides that difference (and an absent child).
        page_entries = self._as_list((pagesxml_content["Pages"] or {}).get("Page"))
        disordered_names: List[str] = [
            entry["@Name"].strip() for entry in page_entries
        ]

        rel_entries = self._as_list(
            (pagesxmlrels_content["Relationships"] or {}).get("Relationship")
        )
        disordered_paths: List[str] = [
            "visio/pages/" + rel["@Target"] for rel in rel_entries
        ]

        # The first titles of app.xml are taken as the page names in display
        # order. A lone title is parsed as a plain string, hence ``_as_list``.
        titles = self._as_list(
            appxml_content["Properties"]["TitlesOfParts"]["vt:vector"]["vt:lpstr"]
        )
        ordered_names: List[str] = [
            title.strip() for title in titles[: len(disordered_names)]
        ]
        # Names from pages.xml and targets from pages.xml.rels are paired by
        # position.
        ordered_paths = [
            disordered_paths[disordered_names.index(name)] for name in ordered_names
        ]

        # Own text of every page (relationships not yet merged in). Pages
        # without any text are simply absent from this list.
        symbol_map = str.maketrans({"\u2013": "-", "\u2019": "'"})
        page_texts: List[Tuple[str, str]] = []
        for path in ordered_paths:
            samples = list(self._iter_text(xmltodict.parse(zfile.read(path))))
            if samples:
                page_texts.append((path, "\n".join(samples).translate(symbol_map)))

        # Direct relationships of each page in a dict format
        pagexml_rels = []
        for page_path in ordered_paths:
            rels_name = f"visio/pages/_rels/{Path(page_path).stem}.xml.rels"
            if rels_name in archive_names:
                pagexml_rels.append(
                    {
                        "path": page_path,
                        "content": xmltodict.parse(zfile.read(rels_name)),
                    }
                )

        # Pages in order and with content of their relationships
        # (direct and indirect): related pages first, then the page itself.
        ordered_pages: List[Tuple[int, str, str]] = []
        for page_number, (path, page_name) in enumerate(
            zip(ordered_paths, ordered_names)
        ):
            relationships = self.get_relationships(
                path, zfile, ordered_paths, pagexml_rels
            )
            related_texts = [
                text for page, text in page_texts if page in relationships
            ]
            own_texts = [text for page, text in page_texts if page == path]
            ordered_pages.append(
                (page_number, page_name, "\n".join(related_texts + own_texts))
            )

        return ordered_pages

    def get_relationships(
        self,
        page: str,
        zfile: zipfile.ZipFile,
        filelist: List[str],
        pagexml_rels: List[dict],
    ) -> Set[str]:
        """Get the relationships of a page and the relationships of its
        relationships, etc... transitively.

        Pages are based on other pages (ex: background page), so we need to
        get all the relationships to get all the content of a single page.

        Args:
            page: Archive path of the page whose relationships are wanted.
            zfile: The vsdx file under zip format.
            filelist: Archive paths of the known pages; targets outside this
                list are ignored.
            pagexml_rels: Parsed ``.rels`` files, as dicts with a ``path``
                key (the page path) and a ``content`` key (the parsed XML).

        Returns:
            The archive paths of every page reachable from ``page`` through
            relationships, ``page`` itself excluded. Empty when the page has
            no ``.rels`` file or no parsed entry in ``pagexml_rels``.
        """
        archive_names = set(zfile.namelist())
        known_pages = set(filelist)

        # First entry wins when a path appears more than once.
        rels_by_page: dict = {}
        for entry in pagexml_rels:
            rels_by_page.setdefault(entry["path"], entry["content"])

        # Iterative traversal with a visited set: terminates even if the
        # relationship files reference each other in a cycle.
        seen = {page}
        pending = [page]
        while pending:
            current = pending.pop()
            for target in self._direct_relationships(
                current, archive_names, known_pages, rels_by_page
            ):
                if target not in seen:
                    seen.add(target)
                    pending.append(target)

        seen.discard(page)
        return seen

    @staticmethod
    def _direct_relationships(
        page: str,
        archive_names: Set[str],
        known_pages: Set[str],
        rels_by_page: dict,
    ) -> Set[str]:
        """Return the known pages directly targeted by the .rels file of ``page``."""
        page_path = Path(page)
        # Zip member names always use "/", so build names with ``as_posix``
        # instead of ``str`` (which would use "\\" on Windows).
        rels_name = (page_path.parent / "_rels" / f"{page_path.name}.rels").as_posix()
        if rels_name not in archive_names:
            return set()

        content = rels_by_page.get(page)
        if not content:
            return set()

        rel_entries = VsdxParser._as_list(
            (content.get("Relationships") or {}).get("Relationship")
        )
        targets = {
            (page_path.parent / rel["@Target"]).as_posix() for rel in rel_entries
        }
        return targets & known_pages

    @staticmethod
    def _as_list(node) -> list:  # type: ignore[no-untyped-def]
        """Normalize an xmltodict child value (None, single item or list) to a list."""
        if node is None:
            return []
        if isinstance(node, list):
            return node
        return [node]

    @staticmethod
    def _iter_text(node) -> Iterator[str]:  # type: ignore[no-untyped-def]
        """Yield, in document order, every string stored under a ``#text`` key."""
        if isinstance(node, dict):
            for key, value in node.items():
                if key == "#text" and isinstance(value, str):
                    yield value
                else:
                    yield from VsdxParser._iter_text(value)
        elif isinstance(node, list):
            for item in node:
                yield from VsdxParser._iter_text(item)