@deprecated(
    since="0.0.24",
    removal="1.0",
    alternative_import="docugami_langchain.DocugamiLoader",
)
class DocugamiLoader(BaseLoader, BaseModel):
    """Load from `Docugami`.

    To use, you should have the ``dgml-utils`` python package installed.
    """

    api: str = DEFAULT_API_ENDPOINT
    """The Docugami API endpoint to use."""

    access_token: Optional[str] = os.environ.get("DOCUGAMI_API_KEY")
    """The Docugami API access token to use."""

    max_text_length: int = 4096
    """Max length of chunk text returned."""

    min_text_length: int = 32
    """Threshold under which chunks are appended to next to avoid over-chunking."""

    max_metadata_length: int = 512
    """Max length of metadata text returned."""

    include_xml_tags: bool = False
    """Set to true for XML tags in chunk output text."""

    parent_hierarchy_levels: int = 0
    """Set appropriately to get parent chunks using the chunk hierarchy."""

    parent_id_key: str = "doc_id"
    """Metadata key for parent doc ID."""

    sub_chunk_tables: bool = False
    """Set to True to return sub-chunks within tables."""

    whitespace_normalize_text: bool = True
    """Set to False if you want full whitespace formatting in the original
    XML doc, including indentation."""

    docset_id: Optional[str] = None
    """The Docugami API docset ID to use."""

    document_ids: Optional[Sequence[str]] = None
    """The Docugami API document IDs to use."""

    # Default added: without it this pydantic-v2 field is *required*, which
    # contradicts validate_local_or_remote (docset_id mode omits file_paths).
    file_paths: Optional[Sequence[Union[Path, str]]] = None
    """The local file paths to use."""

    include_project_metadata_in_doc_metadata: bool = True
    """Set to True if you want to include the project metadata in the doc metadata."""

    @model_validator(mode="before")
    @classmethod
    def validate_local_or_remote(cls, values: Dict[str, Any]) -> Any:
        """Validate that either local file paths are given, or remote API docset ID.

        Args:
            values: The values to validate.

        Returns:
            The validated values.

        Raises:
            ValueError: If both or neither of ``file_paths``/``docset_id`` are
                given, or if ``docset_id`` is given without an access token.
        """
        if values.get("file_paths") and values.get("docset_id"):
            raise ValueError("Cannot specify both file_paths and remote API docset_id")

        if not values.get("file_paths") and not values.get("docset_id"):
            raise ValueError("Must specify either file_paths or remote API docset_id")

        if values.get("docset_id") and not values.get("access_token"):
            raise ValueError("Must specify access token if using remote API docset_id")

        return values

    def _parse_dgml(
        self,
        content: bytes,
        document_name: Optional[str] = None,
        additional_doc_metadata: Optional[Mapping] = None,
    ) -> List[Document]:
        """Parse a single DGML document into a list of Documents."""
        try:
            from lxml import etree
        except ImportError:
            raise ImportError(
                "Could not import lxml python package. "
                "Please install it with `pip install lxml`."
            )

        try:
            from dgml_utils.models import Chunk
            from dgml_utils.segmentation import get_chunks
        except ImportError:
            raise ImportError(
                "Could not import from dgml-utils python package. "
                "Please install it with `pip install dgml-utils`."
            )

        def _build_framework_chunk(dg_chunk: Chunk) -> Document:
            # Stable IDs for chunks with the same text.
            _hashed_id = hashlib.md5(dg_chunk.text.encode()).hexdigest()
            metadata = {
                XPATH_KEY: dg_chunk.xpath,
                ID_KEY: _hashed_id,
                DOCUMENT_NAME_KEY: document_name,
                DOCUMENT_SOURCE_KEY: document_name,
                STRUCTURE_KEY: dg_chunk.structure,
                TAG_KEY: dg_chunk.tag,
            }

            text = dg_chunk.text
            if additional_doc_metadata:
                if self.include_project_metadata_in_doc_metadata:
                    metadata.update(additional_doc_metadata)

            return Document(
                page_content=text[: self.max_text_length],
                metadata=metadata,
            )

        # Parse the tree and return chunks
        tree = etree.parse(io.BytesIO(content))
        root = tree.getroot()

        dg_chunks = get_chunks(
            root,
            min_text_length=self.min_text_length,
            max_text_length=self.max_text_length,
            whitespace_normalize_text=self.whitespace_normalize_text,
            sub_chunk_tables=self.sub_chunk_tables,
            include_xml_tags=self.include_xml_tags,
            parent_hierarchy_levels=self.parent_hierarchy_levels,
        )

        # Dedupe by hashed-text ID; optionally link each chunk to its parent
        # chunk via self.parent_id_key so retrievers can fetch larger context.
        framework_chunks: Dict[str, Document] = {}
        for dg_chunk in dg_chunks:
            framework_chunk = _build_framework_chunk(dg_chunk)
            chunk_id = framework_chunk.metadata.get(ID_KEY)
            if chunk_id:
                framework_chunks[chunk_id] = framework_chunk
                if dg_chunk.parent:
                    framework_parent_chunk = _build_framework_chunk(dg_chunk.parent)
                    parent_id = framework_parent_chunk.metadata.get(ID_KEY)
                    if parent_id and framework_parent_chunk.page_content:
                        framework_chunk.metadata[self.parent_id_key] = parent_id
                        framework_chunks[parent_id] = framework_parent_chunk

        return list(framework_chunks.values())

    def _document_details_for_docset_id(self, docset_id: str) -> List[Dict]:
        """Gets all document details for the given docset ID, following pagination."""
        url = f"{self.api}/docsets/{docset_id}/documents"
        all_documents = []

        while url:
            response = requests.get(
                url,
                headers={"Authorization": f"Bearer {self.access_token}"},
            )
            if response.ok:
                data = response.json()
                all_documents.extend(data["documents"])
                url = data.get("next", None)  # paginate until no "next" link
            else:
                raise Exception(
                    f"Failed to download {url} (status: {response.status_code})"
                )

        return all_documents

    def _project_details_for_docset_id(self, docset_id: str) -> List[Dict]:
        """Gets all project details for the given docset ID, following pagination."""
        url = f"{self.api}/projects?docset.id={docset_id}"
        all_projects = []

        while url:
            response = requests.get(
                url,
                headers={"Authorization": f"Bearer {self.access_token}"},
            )
            if response.ok:
                data = response.json()
                all_projects.extend(data["projects"])
                url = data.get("next", None)  # paginate until no "next" link
            else:
                raise Exception(
                    f"Failed to download {url} (status: {response.status_code})"
                )

        return all_projects

    def _metadata_for_project(self, project: Dict) -> Dict:
        """Gets project metadata for all files, keyed by document ID."""
        project_id = project.get(ID_KEY)

        url = f"{self.api}/projects/{project_id}/artifacts/latest"
        all_artifacts = []

        per_file_metadata: Dict = {}
        while url:
            response = requests.get(
                url,
                headers={"Authorization": f"Bearer {self.access_token}"},
            )
            if response.ok:
                data = response.json()
                all_artifacts.extend(data["artifacts"])
                url = data.get("next", None)
            elif response.status_code == 404:
                # Not found is ok, just means no published projects
                return per_file_metadata
            else:
                raise Exception(
                    f"Failed to download {url} (status: {response.status_code})"
                )

        for artifact in all_artifacts:
            artifact_name = artifact.get("name")
            artifact_url = artifact.get("url")
            artifact_doc = artifact.get("document")

            if artifact_name == "report-values.xml" and artifact_url and artifact_doc:
                doc_id = artifact_doc[ID_KEY]
                metadata: Dict = {}

                # The evaluated XML for each document is named after the project
                response = requests.get(
                    f"{artifact_url}/content",
                    headers={"Authorization": f"Bearer {self.access_token}"},
                )

                if response.ok:
                    try:
                        from lxml import etree
                    except ImportError:
                        raise ImportError(
                            "Could not import lxml python package. "
                            "Please install it with `pip install lxml`."
                        )
                    artifact_tree = etree.parse(io.BytesIO(response.content))
                    artifact_root = artifact_tree.getroot()
                    ns = artifact_root.nsmap
                    entries = artifact_root.xpath("//pr:Entry", namespaces=ns)
                    for entry in entries:
                        heading = entry.xpath("./pr:Heading", namespaces=ns)[0].text
                        value = " ".join(
                            entry.xpath("./pr:Value", namespaces=ns)[0].itertext()
                        ).strip()
                        metadata[heading] = value[: self.max_metadata_length]
                    per_file_metadata[doc_id] = metadata
                else:
                    # BUG FIX: second literal was missing its f-prefix, so the
                    # message emitted the literal text "{response.status_code}".
                    raise Exception(
                        f"Failed to download {artifact_url}/content "
                        f"(status: {response.status_code})"
                    )

        return per_file_metadata

    def _load_chunks_for_document(
        self,
        document_id: str,
        docset_id: str,
        document_name: Optional[str] = None,
        additional_metadata: Optional[Mapping] = None,
    ) -> List[Document]:
        """Load chunks for a document by downloading and parsing its DGML."""
        url = f"{self.api}/docsets/{docset_id}/documents/{document_id}/dgml"

        response = requests.get(
            url,
            headers={"Authorization": f"Bearer {self.access_token}"},
        )

        if response.ok:
            return self._parse_dgml(
                content=response.content,
                document_name=document_name,
                additional_doc_metadata=additional_metadata,
            )
        else:
            raise Exception(
                f"Failed to download {url} (status: {response.status_code})"
            )

    def load(self) -> List[Document]:
        """Load documents.

        Remote mode (access_token + docset_id): downloads and chunks every
        document in the docset (optionally filtered by self.document_ids),
        merging in published project metadata when requested.
        Local mode (file_paths): parses pre-downloaded DGML files.
        """
        chunks: List[Document] = []

        if self.access_token and self.docset_id:
            # Remote mode
            _document_details = self._document_details_for_docset_id(self.docset_id)
            if self.document_ids:
                _document_details = [
                    d for d in _document_details if d[ID_KEY] in self.document_ids
                ]

            _project_details = self._project_details_for_docset_id(self.docset_id)
            combined_project_metadata: Dict[str, Dict] = {}
            if _project_details and self.include_project_metadata_in_doc_metadata:
                # If there are any projects for this docset and the caller
                # requested project metadata, load it.
                for project in _project_details:
                    metadata = self._metadata_for_project(project)
                    for file_id in metadata:
                        if file_id not in combined_project_metadata:
                            combined_project_metadata[file_id] = metadata[file_id]
                        else:
                            combined_project_metadata[file_id].update(metadata[file_id])

            for doc in _document_details:
                doc_id = doc[ID_KEY]
                doc_name = doc.get(DOCUMENT_NAME_KEY)
                doc_metadata = combined_project_metadata.get(doc_id)
                chunks += self._load_chunks_for_document(
                    document_id=doc_id,
                    docset_id=self.docset_id,
                    document_name=doc_name,
                    additional_metadata=doc_metadata,
                )
        elif self.file_paths:
            # Local mode (for integration testing, or pre-downloaded XML)
            for path in self.file_paths:
                path = Path(path)
                with open(path, "rb") as file:
                    chunks += self._parse_dgml(
                        content=file.read(),
                        document_name=path.name,
                    )

        return chunks