class ElementInViewPort(TypedDict):
    """Record describing one DOM node that overlaps the visible window.

    ``Crawler.crawl`` builds one of these per retained node and stores it in
    ``Crawler.page_element_buffer`` so that ``Crawler.click`` can later look
    up where to click.
    """

    # Position of the node in the DOM snapshot's node arrays, as a string.
    node_index: str
    # Value taken from the snapshot's ``backendNodeId`` array for this node.
    backend_node_id: int
    # Lower-cased node name (e.g. "a", "#text"); submit inputs are stored
    # as "button".
    node_name: Optional[str]
    # Text value of the node (or the current value of an input), if any.
    node_value: Optional[str]
    # Attribute strings collected for the node (type, placeholder, ...).
    node_meta: List[str]
    # Whether the snapshot listed this node among its clickable nodes.
    is_clickable: bool
    # Left / top edge of the node's box, divided by the device pixel ratio.
    origin_x: int
    origin_y: int
    # Centre of the node's box, divided by the device pixel ratio.
    center_x: int
    center_y: int
class Crawler:
    """A crawler for web pages.

    **Security Note**: This is an implementation of a crawler that uses a
        browser via Playwright.

        This crawler can be used to load arbitrary webpages INCLUDING content
        from the local file system.

        Control access to who can submit crawling requests and what network
        access the crawler has.

        Make sure to scope permissions to the minimal permissions necessary
        for the application.

        See https://python.langchain.com/docs/security for more information.
    """

    def __init__(self) -> None:
        """Launch a non-headless Chromium via Playwright and open a page.

        Raises:
            ImportError: If the ``playwright`` package cannot be imported.
        """
        try:
            from playwright.sync_api import sync_playwright
        except ImportError as err:
            raise ImportError(
                "Could not import playwright python package. "
                "Please install it with `pip install playwright`."
            ) from err
        self.browser: Browser = (
            sync_playwright().start().chromium.launch(headless=False)
        )
        self.page: Page = self.browser.new_page()
        self.page.set_viewport_size({"width": 1280, "height": 1080})
        # Start with an empty buffer so click()/crawl() do not fail with
        # AttributeError when called on a fresh instance.
        self.page_element_buffer: Dict[int, ElementInViewPort] = {}
        # Only declared here: a CDP session must be assigned to ``client``
        # before crawl() is called.
        self.client: CDPSession

    def click(self, id: Union[str, int]) -> None:
        """Click the centre of the element that the last crawl listed as ``id``.

        Args:
            id: Element id as emitted by ``crawl`` (``id=N`` in the markup);
                converted with ``int`` before the lookup.

        Prints a message instead of raising when the id is unknown.
        """
        # Inject javascript into the page which removes the target= attribute
        # from all links
        js = """
        links = document.getElementsByTagName("a");
        for (var i = 0; i < links.length; i++) {
            links[i].removeAttribute("target");
        }
        """
        self.page.evaluate(js)

        element = self.page_element_buffer.get(int(id))
        if element:
            x: float = element["center_x"]
            y: float = element["center_y"]
            self.page.mouse.click(x, y)
        else:
            print("Could not find element")  # noqa: T201

    def crawl(self) -> List[str]:
        """Describe the elements of the current page that are in the window.

        Captures a DOM snapshot through ``self.client``, keeps the nodes whose
        layout box overlaps the window, merges text and attributes found under
        an ``a``/``button`` into that ancestor, and renders every remaining
        element of interest as a small pseudo-HTML string.

        Side effects: ``self.page_element_buffer`` is emptied and then filled
        with the retained elements keyed by the ``id`` used in the output, and
        the parsing time is printed.

        Returns:
            One string per element, e.g. ``<link id=0 title="x">text</link>``.
        """
        page = self.page
        page_element_buffer = self.page_element_buffer
        # Ids are reassigned from 0 on every crawl; entries left over from a
        # previous crawl would otherwise stay clickable with old coordinates.
        page_element_buffer.clear()
        start = time.time()

        device_pixel_ratio: float = page.evaluate("window.devicePixelRatio")
        if platform == "darwin" and device_pixel_ratio == 1:  # lies
            device_pixel_ratio = 2

        win_upper_bound: float = page.evaluate("window.pageYOffset")
        win_left_bound: float = page.evaluate("window.pageXOffset")
        win_width: float = page.evaluate("window.screen.width")
        win_height: float = page.evaluate("window.screen.height")
        win_right_bound: float = win_left_bound + win_width
        win_lower_bound: float = win_upper_bound + win_height

        tree = self.client.send(
            "DOMSnapshot.captureSnapshot",
            {"computedStyles": [], "includeDOMRects": True, "includePaintOrder": True},
        )
        strings: Dict[int, str] = tree["strings"]
        document: Dict[str, Any] = tree["documents"][0]
        nodes: Dict[str, Any] = document["nodes"]
        backend_node_id: Dict[int, int] = nodes["backendNodeId"]
        attributes: Dict[int, Dict[int, Any]] = nodes["attributes"]
        node_value: Dict[int, int] = nodes["nodeValue"]
        parent: Dict[int, int] = nodes["parentIndex"]
        node_names: Dict[int, int] = nodes["nodeName"]
        is_clickable: Set[int] = set(nodes["isClickable"]["index"])

        input_value: Dict[str, Any] = nodes["inputValue"]
        layout: Dict[str, Any] = document["layout"]
        bounds: Dict[int, List[float]] = layout["bounds"]

        # node index -> position in the layout arrays (first occurrence wins,
        # matching list.index); built once instead of scanning per node.
        layout_cursor_by_node: Dict[int, int] = {}
        for layout_pos, laid_out_node in enumerate(layout["nodeIndex"]):
            layout_cursor_by_node.setdefault(laid_out_node, layout_pos)

        # node index -> string index of the input's current value.
        input_text_by_node: Dict[int, int] = {}
        for input_node, input_text in zip(input_value["index"], input_value["value"]):
            input_text_by_node.setdefault(input_node, input_text)

        child_nodes: Dict[str, List[Dict[str, Any]]] = {}
        elements_in_view_port: List[ElementInViewPort] = []

        # str(node index) -> (is inside/is a <tag>, index of that <tag> node)
        anchor_ancestry: Dict[str, Tuple[bool, Optional[int]]] = {"-1": (False, None)}
        button_ancestry: Dict[str, Tuple[bool, Optional[int]]] = {"-1": (False, None)}

        def convert_name(
            node_name: Optional[str], has_click_handler: Optional[bool]
        ) -> str:
            """Map a DOM node name to the tag name used in the output."""
            if node_name == "a":
                return "link"
            if node_name == "input":
                return "input"
            if node_name == "img":
                return "img"
            if (
                node_name == "button" or has_click_handler
            ):  # found pages that needed this quirk
                return "button"
            return "text"

        def find_attributes(
            node_attributes: Dict[int, Any], keys: List[str]
        ) -> Dict[str, str]:
            """Return the requested attributes of a node as ``{name: value}``.

            ``node_attributes`` is a flat sequence of alternating name/value
            string indexes; pairs with a negative value index are skipped.
            """
            wanted = set(keys)
            values: Dict[str, str] = {}
            flat = iter(node_attributes)
            # zip over the same iterator twice yields (name, value) pairs.
            for key_index, value_index in zip(flat, flat):
                if value_index < 0:
                    continue
                key = strings[key_index]
                if key in wanted:
                    values[key] = strings[value_index]
                    wanted.discard(key)
                    if not wanted:
                        return values
            return values

        def add_to_hash_tree(
            hash_tree: Dict[str, Tuple[bool, Optional[int]]],
            tag: str,
            node_id: int,
            node_name: Optional[str],
            parent_id: int,
        ) -> Tuple[bool, Optional[int]]:
            """Record whether ``node_id`` is, or sits under, a ``tag`` node."""
            parent_id_str = str(parent_id)
            if parent_id_str not in hash_tree:
                parent_name = strings[node_names[parent_id]].lower()
                grand_parent_id = parent[parent_id]
                add_to_hash_tree(
                    hash_tree, tag, parent_id, parent_name, grand_parent_id
                )

            is_parent_desc_anchor, anchor_id = hash_tree[parent_id_str]

            # even if the anchor is nested in another anchor, we set the "root"
            # for all descendants to be ::Self
            if node_name == tag:
                value: Tuple[bool, Optional[int]] = (True, node_id)
            elif is_parent_desc_anchor:
                # reuse the parent's anchor_id (which could be much higher in
                # the tree)
                value = (True, anchor_id)
            else:
                # not a descendant of an anchor, most likely it will become
                # text, an interactive element or discarded
                value = (False, None)

            hash_tree[str(node_id)] = value
            return value

        for index, node_name_index in enumerate(node_names):
            node_parent = parent[index]
            node_name: Optional[str] = strings[node_name_index].lower()

            is_ancestor_of_anchor, anchor_id = add_to_hash_tree(
                anchor_ancestry, "a", index, node_name, node_parent
            )
            is_ancestor_of_button, button_id = add_to_hash_tree(
                button_ancestry, "button", index, node_name, node_parent
            )

            cursor = layout_cursor_by_node.get(index)
            if cursor is None:
                # The node has no entry in the layout arrays.
                continue

            if node_name in black_listed_elements:
                continue

            x, y, width, height = bounds[cursor]
            x /= device_pixel_ratio
            y /= device_pixel_ratio
            width /= device_pixel_ratio
            height /= device_pixel_ratio

            elem_left_bound = x
            elem_top_bound = y
            elem_right_bound = x + width
            elem_lower_bound = y + height

            partially_is_in_viewport = (
                elem_left_bound < win_right_bound
                and elem_right_bound >= win_left_bound
                and elem_top_bound < win_lower_bound
                and elem_lower_bound >= win_upper_bound
            )
            if not partially_is_in_viewport:
                continue

            meta_data: List[str] = []

            # inefficient to grab the same set of keys for kinds of objects,
            # but it's fine for now
            element_attributes = find_attributes(
                attributes[index], ["type", "placeholder", "aria-label", "title", "alt"]
            )

            ancestor_exception = is_ancestor_of_anchor or is_ancestor_of_button
            # List collecting everything found under the enclosing <a>/<button>;
            # None when the node is not inside one.
            ancestor_node: Optional[List[Dict[str, Any]]] = None
            if ancestor_exception:
                ancestor_node_key = (
                    str(anchor_id) if is_ancestor_of_anchor else str(button_id)
                )
                ancestor_node = child_nodes.setdefault(ancestor_node_key, [])

            # NOTE: compare against None, not truthiness - the list starts out
            # empty and would otherwise never receive its first entry.
            if node_name == "#text" and ancestor_node is not None:
                text = strings[node_value[index]]
                if text == "|" or text == "•":
                    continue
                ancestor_node.append({"type": "type", "value": text})
            else:
                if (
                    node_name == "input" and element_attributes.get("type") == "submit"
                ) or node_name == "button":
                    node_name = "button"
                    # prevent [button ... (button)..]
                    element_attributes.pop("type", None)

                for key in element_attributes:
                    if ancestor_node is not None:
                        ancestor_node.append(
                            {
                                "type": "attribute",
                                "key": key,
                                "value": element_attributes[key],
                            }
                        )
                    else:
                        meta_data.append(element_attributes[key])

            element_node_value = None

            if node_value[index] >= 0:
                element_node_value = strings[node_value[index]]
                if element_node_value == "|":
                    # commonly used as a separator, does not add much context -
                    # lets save ourselves some token space
                    continue
            elif node_name == "input":
                text_index = input_text_by_node.get(index, -1)
                if text_index >= 0:
                    element_node_value = strings[text_index]

            # remove redundant elements
            if ancestor_exception and (node_name != "a" and node_name != "button"):
                continue

            elements_in_view_port.append(
                {
                    "node_index": str(index),
                    "backend_node_id": backend_node_id[index],
                    "node_name": node_name,
                    "node_value": element_node_value,
                    "node_meta": meta_data,
                    "is_clickable": index in is_clickable,
                    "origin_x": int(x),
                    "origin_y": int(y),
                    "center_x": int(x + (width / 2)),
                    "center_y": int(y + (height / 2)),
                }
            )

        # lets filter further to remove anything that does not hold any text
        # nor has click handlers + merge text from leaf#text nodes with the
        # parent
        elements_of_interest = []
        id_counter = 0

        for element in elements_in_view_port:
            node_index = element.get("node_index")
            node_name = element.get("node_name")
            element_node_value = element.get("node_value")
            node_is_clickable = element.get("is_clickable")
            node_meta_data: Optional[List[str]] = element.get("node_meta")

            inner_text = f"{element_node_value} " if element_node_value else ""
            meta = ""

            if node_index in child_nodes:
                for child in child_nodes[node_index]:
                    entry_type = child.get("type")
                    entry_value = child.get("value")

                    # An empty metadata list is still a valid target, so test
                    # for None rather than truthiness.
                    if entry_type == "attribute" and node_meta_data is not None:
                        entry_key = child.get("key")
                        node_meta_data.append(f'{entry_key}="{entry_value}"')
                    else:
                        inner_text += f"{entry_value} "

            if node_meta_data:
                meta_string = " ".join(node_meta_data)
                meta = f" {meta_string}"

            if inner_text != "":
                inner_text = f"{inner_text.strip()}"

            converted_node_name = convert_name(node_name, node_is_clickable)

            # not very elegant, more like a placeholder
            if (
                (converted_node_name != "button" or meta == "")
                and converted_node_name != "link"
                and converted_node_name != "input"
                and converted_node_name != "img"
                and converted_node_name != "textarea"
            ) and inner_text.strip() == "":
                continue

            page_element_buffer[id_counter] = element

            if inner_text != "":
                elements_of_interest.append(
                    f"""<{converted_node_name} id={id_counter}{meta}>{inner_text}</{converted_node_name}>"""
                )
            else:
                elements_of_interest.append(
                    f"""<{converted_node_name} id={id_counter}{meta}/>"""
                )
            id_counter += 1

        print("Parsing time: {:0.2f} seconds".format(time.time() - start))  # noqa: T201
        return elements_of_interest