class ElementInViewPort(TypedDict):
    """Record describing a single DOM node that overlaps the visible window.

    Instances are plain dictionaries; the crawler builds one per retained
    node and later uses the centre coordinates to click on it.
    """

    # Position of the node in the DOM snapshot's node arrays, stored as a string.
    node_index: str
    # Backend node id reported by the DOM snapshot for this node.
    backend_node_id: int
    # Lower-cased node name (for example "a", "button", "input", "#text").
    node_name: Optional[str]
    # Text value of the node or, for inputs, their current value; None if absent.
    node_value: Optional[str]
    # Extra descriptive strings collected from the node's attributes.
    node_meta: list[str]
    # True when the snapshot lists the node among the clickable ones.
    is_clickable: bool
    # Left/top corner of the bounding box, scaled by the device pixel ratio.
    origin_x: int
    origin_y: int
    # Centre of the bounding box, scaled by the device pixel ratio.
    center_x: int
    center_y: int
class Crawler:
    """A crawler for web pages.

    **Security Note**: This is an implementation of a crawler that uses a
        browser via Playwright.

        This crawler can be used to load arbitrary webpages INCLUDING content
        from the local file system.

        Control access to who can submit crawling requests and what network
        access the crawler has.

        Make sure to scope permissions to the minimal permissions necessary
        for the application.

        See https://python.langchain.com/docs/security for more information.
    """

    def __init__(self) -> None:
        """Initialize the crawler.

        Starts Playwright, launches a non-headless Chromium browser, opens a
        page and sets its viewport to 1280x1080.

        Raises:
            ImportError: If the ``playwright`` package is not installed.
        """
        try:
            from playwright.sync_api import sync_playwright
        except ImportError as e:
            msg = (
                "Could not import playwright python package. "
                "Please install it with `pip install playwright`."
            )
            raise ImportError(msg) from e
        self.browser: Browser = (
            sync_playwright().start().chromium.launch(headless=False)
        )
        self.page: Page = self.browser.new_page()
        self.page.set_viewport_size({"width": 1280, "height": 1080})

        # Start with an empty buffer so ``click`` reports "not found" instead of
        # raising AttributeError when it is called before the first navigation.
        self.page_element_buffer: dict[int, ElementInViewPort] = {}
        # Created by ``go_to_page``; a CDP session needs a loaded page.
        self.client: CDPSession

    def go_to_page(self, url: str) -> None:
        """Navigate to the given URL.

        Also opens a fresh CDP session for the page and resets the buffer of
        known elements.

        Args:
            url: The URL to navigate to. If it does not contain a scheme, it will
                be prefixed with "http://".
        """
        self.page.goto(url=url if "://" in url else "http://" + url)
        self.client = self.page.context.new_cdp_session(self.page)
        self.page_element_buffer = {}

    def scroll(self, direction: str) -> None:
        """Scroll the page in the given direction.

        Scrolls by one ``window.innerHeight``. Any other value of ``direction``
        is ignored.

        Args:
            direction: The direction to scroll in, either "up" or "down".
        """
        if direction == "up":
            operator = "-"
        elif direction == "down":
            operator = "+"
        else:
            return
        self.page.evaluate(
            "(document.scrollingElement || document.body).scrollTop = "
            "(document.scrollingElement || document.body).scrollTop "
            f"{operator} window.innerHeight;"
        )

    def click(self, id_: Union[str, int]) -> None:
        """Click on an element with the given id.

        The id is looked up in the buffer filled by the last ``crawl`` and the
        mouse is clicked at the element's centre. If the id is unknown a message
        is printed and nothing is clicked.

        Args:
            id_: The id of the element to click on.
        """
        # Inject javascript into the page which removes the target= attribute
        # from links.
        js = """
        links = document.getElementsByTagName("a");
        for (var i = 0; i < links.length; i++) {
            links[i].removeAttribute("target");
        }
        """
        self.page.evaluate(js)

        element = self.page_element_buffer.get(int(id_))
        if not element:
            print("Could not find element")  # noqa: T201
            return
        x: float = element["center_x"]
        y: float = element["center_y"]
        self.page.mouse.click(x, y)

    def type(self, id_: Union[str, int], text: str) -> None:
        """Type text into an element with the given id.

        The element is clicked first, then the text is sent through the
        keyboard.

        Args:
            id_: The id of the element to type into.
            text: The text to type into the element.
        """
        self.click(id_)
        self.page.keyboard.type(text)

    def enter(self) -> None:
        """Press the Enter key."""
        self.page.keyboard.press("Enter")

    def crawl(self) -> list[str]:
        """Crawl the current page.

        Takes a DOM snapshot through the CDP session, keeps the nodes whose
        bounding box overlaps the current window, merges text and attributes
        nested in links/buttons into their ancestor, and renders each retained
        element as a small tag such as ``<link id=0>text</link>``.

        As a side effect ``self.page_element_buffer`` is refilled so that the
        ids in the returned strings can be passed to ``click``/``type``, and the
        parsing time is printed.

        Returns:
            A list of the elements in the viewport.
        """
        page = self.page
        page_element_buffer = self.page_element_buffer
        # Ids are reassigned from 0 on every crawl; forget the previous ones so
        # that an id missing from this crawl cannot click a stale position.
        page_element_buffer.clear()
        start = time.time()

        device_pixel_ratio: float = page.evaluate("window.devicePixelRatio")
        if platform == "darwin" and device_pixel_ratio == 1:  # lies
            device_pixel_ratio = 2

        win_upper_bound: float = page.evaluate("window.pageYOffset")
        win_left_bound: float = page.evaluate("window.pageXOffset")
        # NOTE(review): these are the screen dimensions, not
        # window.innerWidth/innerHeight, so the "viewport" used for filtering may
        # be larger than the visible area - confirm this is intended.
        win_width: float = page.evaluate("window.screen.width")
        win_height: float = page.evaluate("window.screen.height")
        win_right_bound: float = win_left_bound + win_width
        win_lower_bound: float = win_upper_bound + win_height

        tree = self.client.send(
            "DOMSnapshot.captureSnapshot",
            {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},
        )
        strings: list[str] = tree["strings"]
        document: dict[str, Any] = tree["documents"][0]
        nodes: dict[str, Any] = document["nodes"]
        backend_node_id: list[int] = nodes["backendNodeId"]
        attributes: list[list[int]] = nodes["attributes"]
        node_value: list[int] = nodes["nodeValue"]
        parent: list[int] = nodes["parentIndex"]
        node_names: list[int] = nodes["nodeName"]
        is_clickable: set[int] = set(nodes["isClickable"]["index"])

        input_value: dict[str, Any] = nodes["inputValue"]
        # node index -> string index of the input's value (first occurrence wins,
        # matching what ``list.index`` would have returned).
        input_value_by_node: dict[int, int] = {}
        for input_node, value_string_index in zip(
            input_value["index"], input_value["value"]
        ):
            input_value_by_node.setdefault(input_node, value_string_index)

        layout: dict[str, Any] = document["layout"]
        bounds: list[list[float]] = layout["bounds"]
        # node index -> position in the layout arrays; built once so the node
        # loop below does not rescan the layout list for every node.
        layout_cursor_by_node: dict[int, int] = {}
        for layout_position, layout_node in enumerate(layout["nodeIndex"]):
            layout_cursor_by_node.setdefault(layout_node, layout_position)

        # Entries contributed by descendants, keyed by the ancestor's node index.
        child_nodes: dict[str, list[dict[str, Any]]] = {}
        elements_in_view_port: list[ElementInViewPort] = []

        # node index (as str) -> (is inside a <tag>, index of that <tag> node)
        anchor_ancestry: dict[str, tuple[bool, Optional[int]]] = {"-1": (False, None)}
        button_ancestry: dict[str, tuple[bool, Optional[int]]] = {"-1": (False, None)}

        def convert_name(
            node_name: Optional[str],
            has_click_handler: Optional[bool],  # noqa: FBT001
        ) -> str:
            """Map a DOM node name to the tag name used in the output."""
            if node_name == "a":
                return "link"
            if node_name == "input":
                return "input"
            if node_name == "img":
                return "img"
            if (
                node_name == "button" or has_click_handler
            ):  # found pages that needed this quirk
                return "button"
            return "text"

        def find_attributes(attributes: list[int], keys: list[str]) -> dict[str, str]:
            """Return the requested attributes from a flat [key, value, ...] list."""
            wanted = set(keys)  # private copy: never mutate the caller's list
            values: dict[str, str] = {}
            flat = iter(attributes)
            # Consuming the same iterator twice yields (key, value) pairs.
            for key_index, value_index in zip(flat, flat):
                if value_index < 0:
                    continue
                key = strings[key_index]
                if key in wanted:
                    values[key] = strings[value_index]
                    wanted.discard(key)
                    if not wanted:
                        break
            return values

        def add_to_hash_tree(
            hash_tree: dict[str, tuple[bool, Optional[int]]],
            tag: str,
            node_id: int,
            node_name: Optional[str],
            parent_id: int,
        ) -> tuple[bool, Optional[int]]:
            """Record whether ``node_id`` is (inside) a ``tag`` element."""
            parent_id_str = str(parent_id)
            if parent_id_str not in hash_tree:
                parent_name = strings[node_names[parent_id]].lower()
                grand_parent_id = parent[parent_id]
                add_to_hash_tree(
                    hash_tree, tag, parent_id, parent_name, grand_parent_id
                )

            is_parent_desc_anchor, anchor_id = hash_tree[parent_id_str]

            # even if the anchor is nested in another anchor, we set the "root"
            # for all descendants to be ::Self
            if node_name == tag:
                value: tuple[bool, Optional[int]] = (True, node_id)
            elif is_parent_desc_anchor:
                # reuse the parent's anchor_id (which could be much higher in
                # the tree)
                value = (True, anchor_id)
            else:
                # not a descendant of an anchor, most likely it will become
                # text, an interactive element or discarded
                value = (False, None)

            hash_tree[str(node_id)] = value
            return value

        for index, node_name_index in enumerate(node_names):
            node_parent = parent[index]
            node_name: Optional[str] = strings[node_name_index].lower()

            in_anchor, anchor_id = add_to_hash_tree(
                anchor_ancestry, "a", index, node_name, node_parent
            )
            in_button, button_id = add_to_hash_tree(
                button_ancestry, "button", index, node_name, node_parent
            )

            cursor = layout_cursor_by_node.get(index)
            if cursor is None:  # node has no layout information
                continue

            if node_name in black_listed_elements:
                continue

            [x, y, width, height] = bounds[cursor]
            x /= device_pixel_ratio
            y /= device_pixel_ratio
            width /= device_pixel_ratio
            height /= device_pixel_ratio

            elem_left_bound = x
            elem_top_bound = y
            elem_right_bound = x + width
            elem_lower_bound = y + height

            partially_is_in_viewport = (
                elem_left_bound < win_right_bound
                and elem_right_bound >= win_left_bound
                and elem_top_bound < win_lower_bound
                and elem_lower_bound >= win_upper_bound
            )
            if not partially_is_in_viewport:
                continue

            meta_data: list[str] = []

            # inefficient to grab the same set of keys for kinds of objects, but
            # it's fine for now
            element_attributes = find_attributes(
                attributes[index], ["type", "placeholder", "aria-label", "title", "alt"]
            )

            # List collecting what this node contributes to its enclosing link
            # (preferred) or button; None when it is inside neither.
            ancestor_node: Optional[list[dict[str, Any]]]
            if in_anchor:
                ancestor_node = child_nodes.setdefault(str(anchor_id), [])
            elif in_button:
                ancestor_node = child_nodes.setdefault(str(button_id), [])
            else:
                ancestor_node = None
            ancestor_exception = ancestor_node is not None

            # The list starts out empty, so it must be compared against None: a
            # truthiness test would never allow the first entry to be added.
            if node_name == "#text" and ancestor_node is not None:
                text = strings[node_value[index]]
                if text in {"|", "•"}:
                    continue
                ancestor_node.append({"type": "type", "value": text})
            else:
                if (
                    node_name == "input" and element_attributes.get("type") == "submit"
                ) or node_name == "button":
                    node_name = "button"
                    # prevent [button ... (button)..]
                    element_attributes.pop("type", None)

                for key, attribute_value in element_attributes.items():
                    if ancestor_node is not None:
                        ancestor_node.append(
                            {"type": "attribute", "key": key, "value": attribute_value}
                        )
                    else:
                        meta_data.append(attribute_value)

            element_node_value = None

            if node_value[index] >= 0:
                element_node_value = strings[node_value[index]]
                # commonly used as a separator, does not add much context - lets
                # save ourselves some token space
                if element_node_value == "|":
                    continue
            elif node_name == "input" and index in input_value_by_node:
                text_index = input_value_by_node[index]
                if text_index >= 0:
                    element_node_value = strings[text_index]

            # remove redundant elements
            if ancestor_exception and (node_name not in {"a", "button"}):
                continue

            elements_in_view_port.append(
                {
                    "node_index": str(index),
                    "backend_node_id": backend_node_id[index],
                    "node_name": node_name,
                    "node_value": element_node_value,
                    "node_meta": meta_data,
                    "is_clickable": index in is_clickable,
                    "origin_x": int(x),
                    "origin_y": int(y),
                    "center_x": int(x + (width / 2)),
                    "center_y": int(y + (height / 2)),
                }
            )

        # lets filter further to remove anything that does not hold any text nor
        # has click handlers + merge text from leaf #text nodes with the parent
        elements_of_interest: list[str] = []
        id_counter = 0

        for element in elements_in_view_port:
            node_index = element["node_index"]
            element_node_value = element["node_value"]
            node_meta_data = element["node_meta"]

            inner_text = f"{element_node_value} " if element_node_value else ""
            meta = ""

            for child in child_nodes.get(node_index, []):
                entry_value = child.get("value")
                # Attribute entries always go to the meta list, even when that
                # list is still empty.
                if child.get("type") == "attribute":
                    entry_key = child.get("key")
                    node_meta_data.append(f'{entry_key}="{entry_value}"')
                else:
                    inner_text += f"{entry_value} "

            if node_meta_data:
                meta_string = " ".join(node_meta_data)
                meta = f" {meta_string}"

            inner_text = inner_text.strip()

            converted_node_name = convert_name(
                element["node_name"], element["is_clickable"]
            )

            # not very elegant, more like a placeholder
            if (
                (converted_node_name != "button" or meta == "")
                and converted_node_name not in {"link", "input", "img", "textarea"}
            ) and inner_text == "":
                continue

            page_element_buffer[id_counter] = element

            if inner_text != "":
                elements_of_interest.append(
                    f"<{converted_node_name} id={id_counter}{meta}>"
                    f"{inner_text}</{converted_node_name}>"
                )
            else:
                elements_of_interest.append(
                    f"<{converted_node_name} id={id_counter}{meta}/>"
                )
            id_counter += 1

        print(f"Parsing time: {time.time() - start:0.2f} seconds")  # noqa: T201
        return elements_of_interest