"""Shared support for scanning document type declarations in HTML and XHTML.

This module is used as a foundation for the html.parser module.  It has no
documented public API and should not be used directly.

"""

import re

# Each "match" helper is anchored at the position passed as its second
# argument and also consumes any whitespace that follows the token.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
_commentclose = re.compile(r'--\s*>')
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

_msmarkedsectionclose = re.compile(r']\s*>')

del re


class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods here read the input buffer from ``self.rawdata`` and report
    what they find through ``self.handle_decl()``, ``self.handle_comment()``
    and ``self.unknown_decl()``.  Only ``unknown_decl()`` is defined in this
    class; ``rawdata``, ``handle_decl`` and ``handle_comment`` must be
    supplied by the subclass.

    All ``parse_*`` / ``_parse_*`` methods take an index into ``rawdata`` and
    return the index just past what they consumed, or a negative number when
    the buffer ends before the construct is complete.
    """

    def __init__(self):
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "_markupbase.ParserBase must be subclassed")

    def reset(self):
        """Reset the position bookkeeping to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance lineno/offset over rawdata[i:j] and return j.

        Nothing is updated when the span is empty (i >= j).
        """
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            # Offset restarts right after the last newline in the span.
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters parse_declaration() will skip over inside a
    # declaration; empty by default.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the '<!...>' construct starting at rawdata[i].

        Returns the index just past the construct, or a negative number if
        the buffer ends before it is complete.  Raises AssertionError when
        rawdata[i:i+2] is not '<!' or when the declaration is malformed.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        # Explicit raise (not ``assert``) so the check survives ``python -O``
        # and matches parse_comment().
        if rawdata[i:j] != "<!":
            raise AssertionError("unexpected call to parse_declaration")
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+2] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.  The two-character slice equals
            # "-" only when the buffer really ends after "<!-"; testing a
            # single character here would also swallow every "<!--" and
            # make the comment branch below unreachable.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j:j+2] == '--':  # comment
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        if rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        # all other declaration elements
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                _, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in {"attlist", "linktype", "link", "element"}:
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    raise AssertionError("unsupported '[' char in %s declaration" % decltype)
                else:
                    raise AssertionError("unexpected '[' char in declaration")
            else:
                raise AssertionError("unexpected %r char in declaration" % rawdata[j])
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the marked section starting at rawdata[i] ('<![').

        When *report* is true the section text (from the status keyword up
        to the closing delimiter) is passed to unknown_decl().  Returns the
        index just past the closing delimiter, or a negative number if the
        section is not terminated yet.  Raises AssertionError on a wrong
        start position or an unknown status keyword.
        """
        rawdata = self.rawdata
        # Explicit raise (not ``assert``) so the check survives ``python -O``.
        if rawdata[i:i+3] != '<![':
            raise AssertionError("unexpected call to parse_marked_section()")
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in {"temp", "cdata", "ignore", "include", "rcdata"}:
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in {"if", "else", "endif"}:
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            raise AssertionError(
                'unknown status keyword %r in marked section' % rawdata[i+3:j]
            )
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3:j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the comment starting at rawdata[i] ('<!--').

        When *report* is true the comment body is passed to
        handle_comment().  Returns the index just past the terminator, or
        -1 if the comment is not terminated yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            raise AssertionError('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4:j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the DOCTYPE internal subset whose content starts at i.

        *declstartpos* is the start of the enclosing declaration; it is
        only used for position updates and error messages.  Returns the
        index of the '>' that follows the closing ']', or -1 if the buffer
        ends first.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    raise AssertionError(
                        "unexpected char in internal subset (in %r)" % s
                    )
                if (j + 4) > n:
                    # end of buffer; incomplete (need four characters to
                    # tell a comment from a declaration)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in {"attlist", "element", "entity", "notation"}:
                    self.updatepos(declstartpos, j + 2)
                    raise AssertionError(
                        "unknown declaration %r in internal subset" % name
                    )
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                _, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    raise AssertionError("unexpected char after internal subset")
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                raise AssertionError("unexpected char %r in internal subset" % c)
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at i.

        Returns the index just past the closing '>', or -1 if incomplete.
        """
        _, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose element name starts at i.

        Returns the index just past the closing '>', or -1 if incomplete.
        """
        rawdata = self.rawdata
        _, j = self._scan_name(i, declstartpos)
        if j < 0:
            return -1
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            _, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= len(rawdata):
                    # end of buffer, incomplete
                    return -1
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                _, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at i.

        Returns the index just past the closing '>', or a negative number
        if incomplete.
        """
        _, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose content starts at i.

        Returns the index just past the closing '>', or a negative number
        if incomplete.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            # Parameter entity: skip the whitespace between '%' and the name.
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        _, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                _, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token; return the token and the new position,
    # or (None, -1) if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan a name token at rawdata[i].

        Returns ``(name, j)`` where *name* is the lower-cased token and *j*
        is the index just past it and any trailing whitespace.  Returns
        ``(None, -1)`` when i is at the end of the buffer or when the match
        runs up to the end of the buffer (the token may continue in data
        not yet received).  Raises AssertionError when no name starts at i.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            raise AssertionError(
                "expected name token at %r" % rawdata[declstartpos:declstartpos+20]
            )

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Handle a declaration this class does not interpret; no-op here."""
        pass