import logging
from typing import Dict, Iterator, List, Union

import requests
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob

logger = logging.getLogger(__name__)


class ServerUnavailableException(Exception):
    """Exception raised when the Grobid server is unavailable."""

    pass
class GrobidParser(BaseBlobParser):
    """Load article `PDF` files using `Grobid`."""
    def __init__(
        self,
        segment_sentences: bool,
        grobid_server: str = "http://localhost:8070/api/processFulltextDocument",
    ) -> None:
        self.segment_sentences = segment_sentences
        self.grobid_server = grobid_server
        # Probe the server once at construction time so an unreachable Grobid
        # instance fails fast instead of at parse time.
        try:
            requests.get(grobid_server)
        except requests.exceptions.RequestException:
            logger.error(
                "GROBID server does not appear up and running, "
                "please ensure Grobid is installed and the server is running"
            )
            raise ServerUnavailableException
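    # Grobid itself must already be running for the probe above to succeed.
    # One common way to start it locally (an assumption about your setup, not
    # something this module does for you) is the public Docker image, e.g.:
    #
    #   docker run --rm --init -p 8070:8070 lfoppiano/grobid:0.8.0
    #
    # after which the default grobid_server URL above resolves.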
    def process_xml(
        self, file_path: str, xml_data: str, segment_sentences: bool
    ) -> Iterator[Document]:
        """Process the TEI XML returned by Grobid."""
        try:
            from bs4 import BeautifulSoup
        except ImportError:
            raise ImportError(
                "`bs4` package not found, please install it with `pip install bs4`"
            )
        # BeautifulSoup's "xml" feature also requires lxml to be installed.
        soup = BeautifulSoup(xml_data, "xml")
        sections = soup.find_all("div")
        titles = soup.find_all("title")
        if titles:
            title = titles[0].text
        else:
            title = "No title found"
        chunks = []
        for section in sections:
            sect = section.find("head")
            if sect is None:
                continue
            for i, paragraph in enumerate(section.find_all("p")):
                chunk_bboxes = []
                paragraph_text = []
                # The sentence loop must not reuse `i`, or it would shadow the
                # paragraph index used for the "para" field below.
                for sentence in paragraph.find_all("s"):
                    paragraph_text.append(sentence.text)
                    sbboxes = []
                    if sentence.get("coords") is not None:
                        for bbox in sentence.get("coords").split(";"):
                            box = bbox.split(",")
                            sbboxes.append(
                                {
                                    "page": box[0],
                                    "x": box[1],
                                    "y": box[2],
                                    "h": box[3],
                                    "w": box[4],
                                }
                            )
                        chunk_bboxes.append(sbboxes)
                    if segment_sentences and len(sbboxes) > 0:
                        fpage, lpage = sbboxes[0]["page"], sbboxes[-1]["page"]
                        sentence_dict = {
                            "text": sentence.text,
                            "para": str(i),
                            "bboxes": [sbboxes],
                            "section_title": sect.text,
                            "section_number": sect.get("n"),
                            "pages": (fpage, lpage),
                        }
                        chunks.append(sentence_dict)
                # Guard against paragraphs without coordinate info, which
                # would otherwise raise an IndexError below.
                if not segment_sentences and chunk_bboxes:
                    fpage, lpage = (
                        chunk_bboxes[0][0]["page"],
                        chunk_bboxes[-1][-1]["page"],
                    )
                    paragraph_dict = {
                        "text": "".join(paragraph_text),
                        "para": str(i),
                        "bboxes": chunk_bboxes,
                        "section_title": sect.text,
                        "section_number": sect.get("n"),
                        "pages": (fpage, lpage),
                    }
                    chunks.append(paragraph_dict)
        for chunk in chunks:
            yield Document(
                page_content=chunk["text"],
                metadata={
                    "text": str(chunk["text"]),
                    "para": str(chunk["para"]),
                    "bboxes": str(chunk["bboxes"]),
                    "pages": str(chunk["pages"]),
                    "section_title": str(chunk["section_title"]),
                    "section_number": str(chunk["section_number"]),
                    "paper_title": str(title),
                    "file_path": str(file_path),
                },
            )
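    # For illustration (the attribute value is a made-up example): Grobid
    # emits coordinates as a `coords` attribute of semicolon-separated boxes,
    # e.g.
    #
    #   <s coords="1,53.8,89.0,215.9,9.4;1,53.8,100.6,180.2,9.4">...</s>
    #
    # Each box's five comma-separated fields map positionally onto the keys
    # "page", "x", "y", "h", "w" above, so this sentence would yield two bbox
    # dicts, both with "page" == "1".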
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        file_path = blob.source
        if file_path is None:
            raise ValueError("blob.source cannot be None.")
        # Ask Grobid to segment sentences and to attach PDF coordinates to
        # section heads and sentences, so bounding boxes survive into metadata.
        data: Dict[str, Union[str, List[str]]] = {}
        for param in ["generateIDs", "consolidateHeader", "segmentSentences"]:
            data[param] = "1"
        data["teiCoordinates"] = ["head", "s"]
        with open(file_path, "rb") as pdf:
            files = {"input": (file_path, pdf, "application/pdf", {"Expires": "0"})}
            try:
                r = requests.post(
                    self.grobid_server,
                    files=files,
                    data=data,
                    timeout=60,
                )
                xml_data = r.text
            except requests.exceptions.ReadTimeout:
                logger.error("GROBID server timed out; returning no documents.")
                xml_data = None
        if xml_data is None:
            return iter([])
        return self.process_xml(file_path, xml_data, self.segment_sentences)
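
# A minimal usage sketch (an illustration, not part of this module): pair the
# parser with LangChain's GenericLoader to parse a folder of PDFs. It assumes
# a Grobid server is already running at the default URL and that the
# placeholder path "/path/to/pdfs" is replaced with a real directory.
if __name__ == "__main__":
    from langchain_community.document_loaders.generic import GenericLoader

    loader = GenericLoader.from_filesystem(
        "/path/to/pdfs",
        glob="*",
        suffixes=[".pdf"],
        parser=GrobidParser(segment_sentences=False),
    )
    docs = loader.load()
    for doc in docs[:3]:
        # Each Document carries the section/bbox metadata built in process_xml.
        print(doc.metadata["section_title"], doc.metadata["pages"])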