"""Tool for the Tavily Crawl API."""fromtypingimportAny,Dict,List,Literal,Optional,Typefromlangchain_core.callbacksimport(AsyncCallbackManagerForToolRun,CallbackManagerForToolRun,)fromlangchain_core.toolsimportBaseTool,ToolExceptionfrompydanticimportBaseModel,Fieldfromlangchain_tavily._utilitiesimportTavilyCrawlAPIWrapper
class TavilyCrawlInput(BaseModel):
    """Input for [TavilyCrawl]"""

    url: str = Field(description="The root URL to begin the crawl.")
    max_depth: Optional[int] = Field(
        default=1,
        description="""Max depth of the crawl. Defines how far from the base URL the crawler can explore.

        Increase this parameter when:
        1. You want to crawl large websites and get a comprehensive overview of their structure.
        2. You want to crawl a website that has many links to other pages.

        Set this parameter to 1 when:
        1. You want to stay local to the base_url.
        2. You want to crawl a single page.

        max_depth must be greater than 0
        """,  # noqa: E501
    )
    max_breadth: Optional[int] = Field(
        default=20,
        description="""Max number of links to follow per level of the tree (i.e., per page).

        tavily-crawl uses BFS. Depth refers to the number of link hops from the root URL:
        a page directly linked from the root is at BFS depth 1, regardless of its URL structure.

        Increase this parameter when:
        1. You want many links from each page to be crawled.

        max_breadth must be greater than 0
        """,  # noqa: E501
    )
    limit: Optional[int] = Field(
        default=50,
        description="""Total number of links the crawler will process before stopping.

        limit must be greater than 0
        """,  # noqa: E501
    )
    instructions: Optional[str] = Field(
        default=None,
        description="""Natural language instructions for the crawler.

        The instructions parameter allows the crawler to intelligently navigate
        through a website using natural language. Use the user's request to set
        the instructions parameter so the crawler is guided in the direction of
        that request.

        ex. "I want to find all the Javascript SDK documentation from Tavily"
        ---> instructions = "Javascript SDK documentation"
        """,  # noqa: E501
    )
    select_paths: Optional[List[str]] = Field(
        default=None,
        description="""Regex patterns to select only URLs with specific path patterns.

        Use when the user explicitly asks for a specific path from a website.

        ex. "Only crawl the /api/v1 path" ---> ["/api/v1.*"]
        ex. "Only crawl the /documentation path" ---> ["/documentation/.*"]
        """,  # noqa: E501
    )
    select_domains: Optional[List[str]] = Field(
        default=None,
        description="""Regex patterns to select only URLs from specific domains or subdomains.

        Use when the user explicitly asks for a specific domain or subdomain from a website.

        ex. "Crawl only the docs.tavily.com subdomain" ---> ["^docs\\.tavily\\.com$"]
        """,  # noqa: E501
    )
    exclude_paths: Optional[List[str]] = Field(
        default=None,
        description="""Regex patterns to exclude URLs with specific path patterns from the crawl.

        Use when the user explicitly asks to exclude a specific path from a website.

        ex. "Crawl example.com but exclude the /api/v1 path from the crawl" ---> ["/api/v1.*"]
        ex. "Crawl example.com but exclude the /documentation path from the crawl" ---> ["/documentation/.*"]
        """,  # noqa: E501
    )
    exclude_domains: Optional[List[str]] = Field(
        default=None,
        description="""Regex patterns to exclude URLs from specific domains or subdomains.

        Use when the user explicitly asks to exclude a specific domain or subdomain from a website.

        ex. "Crawl tavily.com but exclude the docs.tavily.com subdomain from the crawl" ---> ["^docs\\.tavily\\.com$"]
        """,  # noqa: E501
    )
    allow_external: Optional[bool] = Field(
        default=False,
        description="""Allow the crawler to follow external links.

        Use when the user explicitly asks to allow or deny external links.
        """,  # noqa: E501
    )
    include_images: Optional[bool] = Field(
        default=False,
        description="""Whether to include images in the crawl results.
        """,  # noqa: E501
    )
    categories: Optional[
        List[
            Literal[
                "Careers",
                "Blogs",
                "Documentation",
                "About",
                "Pricing",
                "Community",
                "Developers",
                "Contact",
                "Media",
            ]
        ]
    ] = Field(
        default=None,
        description="""Direct the crawler to crawl specific categories of a website.

        Set this field to the category that best matches the user's request.
        Use the following guide to choose the appropriate category:

        Careers: Crawl pages related to job listings, open positions, and company career information.
        Blogs: Crawl blog posts, news articles, and editorial content.
        Documentation: Crawl technical documentation, user guides, API references, and manuals.
        About: Crawl 'About Us' pages, company background, mission statements, and team information.
        Pricing: Crawl pages that detail product or service pricing, plans, and cost comparisons.
        Community: Crawl forums, discussion boards, user groups, and community-driven content.
        Developers: Crawl developer portals, SDKs, API documentation, and resources for software developers.
        Contact: Crawl contact information pages, support forms, and customer service details.
        Media: Crawl press releases, media kits, newsrooms, and multimedia content.

        ex. "Crawl apple.com for career opportunities" ---> categories = ["Careers"]
        ex. "Crawl tavily.com for API documentation" ---> categories = ["Documentation"]
        """,  # noqa: E501
    )
    extract_depth: Optional[Literal["basic", "advanced"]] = Field(
        default="basic",
        description="""Advanced extraction retrieves more data, including tables and embedded content, with higher success but may increase latency.
        """,  # noqa: E501
    )
    include_favicon: Optional[bool] = Field(
        default=False,
        description="Whether to include the favicon URL for each result.",
    )
def _generate_suggestions(params: dict) -> list:
    """Generate helpful suggestions based on the failed crawl parameters."""
    suggestions = []

    instructions = params.get("instructions")
    select_paths = params.get("select_paths")
    select_domains = params.get("select_domains")
    exclude_paths = params.get("exclude_paths")
    exclude_domains = params.get("exclude_domains")
    categories = params.get("categories")

    if instructions:
        suggestions.append("Try more concise instructions")
    if select_paths:
        suggestions.append("Remove select_paths argument")
    if select_domains:
        suggestions.append("Remove select_domains argument")
    if exclude_paths:
        suggestions.append("Remove exclude_paths argument")
    if exclude_domains:
        suggestions.append("Remove exclude_domains argument")
    if categories:
        suggestions.append("Remove categories argument")

    return suggestions
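
# Illustrative only: given the parameter dict of a failed crawl, the helper
# above emits one suggestion per restrictive argument that was set, e.g.:
#
#     _generate_suggestions({"instructions": "Python SDK", "select_paths": ["/api/v1.*"]})
#     # -> ['Try more concise instructions', 'Remove select_paths argument']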
class TavilyCrawl(BaseTool):  # type: ignore[override]
    """Tool that sends requests to the Tavily Crawl API with dynamically settable parameters."""  # noqa: E501

    name: str = "tavily_crawl"
    description: str = """A powerful web crawler that initiates a structured web crawl starting from a specified base URL.

    The crawler uses BFS. Depth refers to the number of link hops from the root URL:
    a page directly linked from the root is at BFS depth 1, regardless of its URL structure.

    You can control how deep and wide it goes, and guide it to focus on specific sections of the site.
    """  # noqa: E501
    args_schema: Type[BaseModel] = TavilyCrawlInput
    handle_tool_error: bool = True

    max_depth: Optional[int] = None
    """Max depth of the crawl. Defines how far from the base URL the crawler can explore.

    max_depth must be greater than 0
    default is 1
    """  # noqa: E501
    max_breadth: Optional[int] = None
    """The maximum number of links to follow per level of the tree (i.e., per page).

    max_breadth must be greater than 0
    default is 20
    """
    limit: Optional[int] = None
    """Total number of links the crawler will process before stopping.

    limit must be greater than 0
    default is 50
    """
    instructions: Optional[str] = None
    """Natural language instructions for the crawler.

    ex. "Python SDK"
    """
    select_paths: Optional[List[str]] = None
    """Regex patterns to select only URLs with specific path patterns.

    ex. ["/api/v1.*"]
    """
    select_domains: Optional[List[str]] = None
    """Regex patterns to select only URLs from specific domains or subdomains.

    ex. ["^docs\\.example\\.com$"]
    """
    exclude_paths: Optional[List[str]] = None
    """Regex patterns to exclude URLs with specific path patterns.

    ex. ["/private/.*", "/admin/.*"]
    """
    exclude_domains: Optional[List[str]] = None
    """Regex patterns to exclude specific domains or subdomains from crawling.

    ex. ["^private\\.example\\.com$"]
    """
    allow_external: Optional[bool] = None
    """Whether to allow following links that go to external domains.

    default is False
    """
    include_images: Optional[bool] = None
    """Whether to include images in the crawl results.

    default is False
    """
    categories: Optional[
        List[
            Literal[
                "Careers",
                "Blogs",
                "Documentation",
                "About",
                "Pricing",
                "Community",
                "Developers",
                "Contact",
                "Media",
            ]
        ]
    ] = None
    """Filter URLs using predefined categories like 'Documentation', 'Blogs', etc."""
    extract_depth: Optional[Literal["basic", "advanced"]] = None
    """Advanced extraction retrieves more data, including tables and embedded content, with higher success but may increase latency.

    default is basic
    """  # noqa: E501
    format: Optional[str] = None
    """The format of the extracted web page content.

    markdown returns content in markdown format.
    text returns plain text and may increase latency.
    default is markdown
    """
    include_favicon: Optional[bool] = None
    """Whether to include the favicon URL for each result.

    default is False
    """

    api_wrapper: TavilyCrawlAPIWrapper = Field(default_factory=TavilyCrawlAPIWrapper)  # type: ignore[arg-type]

    def __init__(self, **kwargs: Any) -> None:
        # Create api_wrapper with tavily_api_key and api_base_url if provided
        if "tavily_api_key" in kwargs or "api_base_url" in kwargs:
            wrapper_kwargs = {}
            if "tavily_api_key" in kwargs:
                wrapper_kwargs["tavily_api_key"] = kwargs["tavily_api_key"]
            if "api_base_url" in kwargs:
                wrapper_kwargs["api_base_url"] = kwargs["api_base_url"]
            kwargs["api_wrapper"] = TavilyCrawlAPIWrapper(**wrapper_kwargs)
        super().__init__(**kwargs)

    def _run(
        self,
        url: str,
        max_depth: Optional[int] = None,
        max_breadth: Optional[int] = None,
        limit: Optional[int] = None,
        instructions: Optional[str] = None,
        select_paths: Optional[List[str]] = None,
        select_domains: Optional[List[str]] = None,
        exclude_paths: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        allow_external: Optional[bool] = None,
        include_images: Optional[bool] = None,
        categories: Optional[
            List[
                Literal[
                    "Careers",
                    "Blogs",
                    "Documentation",
                    "About",
                    "Pricing",
                    "Community",
                    "Developers",
                    "Contact",
                    "Media",
                ]
            ]
        ] = None,
        extract_depth: Optional[Literal["basic", "advanced"]] = None,
        include_favicon: Optional[bool] = None,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> Dict[str, Any]:
        """Execute a crawl using the Tavily Crawl API.

        Returns:
            - results (List[Dict]): A list of extracted content from the crawled URLs
                - url (str): The URL that was crawled. Example: "https://tavily.com/#features"
                - raw_content (str): The full content extracted from the page
                - images (List[str]): A list of image URLs extracted from the page
            - response_time (float): Time in seconds it took to complete the request
        """
        try:
            # Execute the crawl with parameters directly;
            # instance-level values take precedence over call-time arguments.
            raw_results = self.api_wrapper.raw_results(
                url=url,
                max_depth=self.max_depth if self.max_depth else max_depth,
                max_breadth=self.max_breadth if self.max_breadth else max_breadth,
                limit=self.limit if self.limit else limit,
                instructions=self.instructions if self.instructions else instructions,
                select_paths=self.select_paths if self.select_paths else select_paths,
                select_domains=(
                    self.select_domains if self.select_domains else select_domains
                ),
                exclude_paths=(
                    self.exclude_paths if self.exclude_paths else exclude_paths
                ),
                exclude_domains=(
                    self.exclude_domains if self.exclude_domains else exclude_domains
                ),
                allow_external=(
                    self.allow_external if self.allow_external else allow_external
                ),
                include_images=(
                    self.include_images if self.include_images else include_images
                ),
                categories=self.categories if self.categories else categories,
                extract_depth=(
                    self.extract_depth if self.extract_depth else extract_depth
                ),
                include_favicon=(
                    self.include_favicon if self.include_favicon else include_favicon
                ),
                format=self.format,
            )

            # Check if results are empty and raise a specific exception
            if not raw_results.get("results", []):
                search_params = {
                    "instructions": instructions,
                    "select_paths": select_paths,
                    "select_domains": select_domains,
                    "exclude_paths": exclude_paths,
                    "exclude_domains": exclude_domains,
                    "categories": categories,
                    "format": self.format,
                }
                suggestions = _generate_suggestions(search_params)

                # Construct a detailed message for the agent
                error_message = (
                    f"No crawl results found for '{url}'. "
                    f"Suggestions: {', '.join(suggestions)}. "
                    f"Try modifying your crawl parameters with one of these approaches."  # noqa: E501
                )
                raise ToolException(error_message)
            return raw_results
        except ToolException:
            # Re-raise tool exceptions
            raise
        except Exception as e:
            return {"error": e}

    async def _arun(
        self,
        url: str,
        max_depth: Optional[int] = None,
        max_breadth: Optional[int] = None,
        limit: Optional[int] = None,
        instructions: Optional[str] = None,
        select_paths: Optional[List[str]] = None,
        select_domains: Optional[List[str]] = None,
        exclude_paths: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        allow_external: Optional[bool] = None,
        include_images: Optional[bool] = None,
        categories: Optional[
            List[
                Literal[
                    "Careers",
                    "Blogs",
                    "Documentation",
                    "About",
                    "Pricing",
                    "Community",
                    "Developers",
                    "Contact",
                    "Media",
                ]
            ]
        ] = None,
        extract_depth: Optional[Literal["basic", "advanced"]] = None,
        include_favicon: Optional[bool] = None,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> Dict[str, Any]:
        """Use the tool asynchronously."""
        try:
            raw_results = await self.api_wrapper.raw_results_async(
                url=url,
                max_depth=self.max_depth if self.max_depth else max_depth,
                max_breadth=self.max_breadth if self.max_breadth else max_breadth,
                limit=self.limit if self.limit else limit,
                instructions=self.instructions if self.instructions else instructions,
                select_paths=self.select_paths if self.select_paths else select_paths,
                select_domains=(
                    self.select_domains if self.select_domains else select_domains
                ),
                exclude_paths=(
                    self.exclude_paths if self.exclude_paths else exclude_paths
                ),
                exclude_domains=(
                    self.exclude_domains if self.exclude_domains else exclude_domains
                ),
                allow_external=(
                    self.allow_external if self.allow_external else allow_external
                ),
                include_images=(
                    self.include_images if self.include_images else include_images
                ),
                categories=self.categories if self.categories else categories,
                extract_depth=(
                    self.extract_depth if self.extract_depth else extract_depth
                ),
                include_favicon=(
                    self.include_favicon if self.include_favicon else include_favicon
                ),
                format=self.format,
            )

            # Check if results are empty and raise a specific exception
            if not raw_results.get("results", []):
                search_params = {
                    "instructions": instructions,
                    "select_paths": select_paths,
                    "select_domains": select_domains,
                    "exclude_paths": exclude_paths,
                    "exclude_domains": exclude_domains,
                    "categories": categories,
                    "format": self.format,
                }
                suggestions = _generate_suggestions(search_params)

                # Construct a detailed message for the agent
                error_message = (
                    f"No crawl results found for '{url}'. "
                    f"Suggestions: {', '.join(suggestions)}. "
                    f"Try modifying your crawl parameters with one of these approaches."  # noqa: E501
                )
                raise ToolException(error_message)
            return raw_results
        except ToolException:
            # Re-raise tool exceptions
            raise
        except Exception as e:
            return {"error": e}