Source code for langchain_tavily.tavily_extract

"""Tool for the Tavily Extract API."""

from typing import Any, Dict, List, Literal, Optional, Type

from langchain_core.callbacks import (
    AsyncCallbackManagerForToolRun,
    CallbackManagerForToolRun,
)
from langchain_core.tools import BaseTool, ToolException
from pydantic import BaseModel, Field

from langchain_tavily._utilities import TavilyExtractAPIWrapper


class TavilyExtractInput(BaseModel):
    """Input for [TavilyExtract].

    Extract web page content from one or more specified URLs using Tavily Extract.

    The field descriptions below are surfaced to the model that calls the tool,
    so they must agree with the declared defaults.
    """

    urls: List[str] = Field(description="list of urls to extract")
    extract_depth: Optional[Literal["basic", "advanced"]] = Field(
        default="basic",
        # The "(default)" marker belongs to "basic", matching `default` above.
        description="""Controls the thoroughness of web content extraction.

        Use "basic" (default) for faster extraction of main text content.

        Use "advanced" to retrieve comprehensive content including
        tables and embedded elements. Always use "advanced" for LinkedIn
        and YouTube URLs for optimal results.

        Better for complex websites but may increase response time.
        """,  # noqa: E501
    )
    include_images: Optional[bool] = Field(
        default=False,
        description="""Determines whether to extract and include images from the source URLs.

        Set to True when visualizations are needed for better context or understanding.

        Default is False (extracts text content only).
        """,  # noqa: E501
    )
def _generate_suggestions(params: dict) -> list: """Generate helpful suggestions based on the failed search parameters.""" suggestions = [] if params.get("extract_depth") and params["extract_depth"] == "basic": suggestions.append( "Try a more detailed extraction using 'advanced' extract_depth" ) return suggestions
class TavilyExtract(BaseTool):  # type: ignore[override]
    """Tool that queries the Tavily Extract API with dynamically settable parameters.

    ``extract_depth`` and ``include_images`` set on the instance act as
    defaults; values passed to an individual call take precedence over them.
    """

    name: str = "tavily_extract"
    description: str = (
        "Extracts comprehensive content from web pages based on provided URLs. "
        "This tool retrieves raw text of a web page, with an option to include images. "
        "It supports two extraction depths: 'basic' for standard text extraction and "
        "'advanced' for a more comprehensive extraction with higher success rate. "
        "Ideal for use cases such as content curation, data ingestion for NLP models, "
        "and automated information retrieval, this endpoint seamlessly integrates into "
        "your content processing pipeline. Input should be a list of one or more URLs."
    )

    args_schema: Type[BaseModel] = TavilyExtractInput
    handle_tool_error: bool = True

    # Default parameters
    extract_depth: Optional[Literal["basic", "advanced"]] = "basic"
    """The depth of the extraction process.

    'advanced' extraction retrieves more data than 'basic',
    with higher success but may increase latency.

    Default is 'basic'
    """
    include_images: Optional[bool] = False
    """Include a list of images extracted from the URLs in the response.

    Default is False
    """

    apiwrapper: TavilyExtractAPIWrapper = Field(default_factory=TavilyExtractAPIWrapper)  # type: ignore[arg-type]

    def __init__(self, **kwargs: Any) -> None:
        """Initialize the tool.

        Args:
            **kwargs: Field values for the tool. When ``tavily_api_key`` is
                present, an API wrapper is built from it and stored under
                ``apiwrapper`` before the remaining initialization runs.
        """
        # Create apiwrapper with tavily_api_key if provided
        if "tavily_api_key" in kwargs:
            kwargs["apiwrapper"] = TavilyExtractAPIWrapper(
                tavily_api_key=kwargs["tavily_api_key"]
            )

        super().__init__(**kwargs)

    def _resolve_params(
        self,
        extract_depth: Optional[Literal["basic", "advanced"]],
        include_images: Optional[bool],
    ) -> Dict[str, Any]:
        """Merge per-call arguments with the instance defaults."""
        return {
            "extract_depth": extract_depth or self.extract_depth,
            # Compare against None rather than truthiness: an explicit
            # ``False`` from the caller must override an instance default
            # of ``True`` instead of being silently discarded.
            "include_images": (
                self.include_images if include_images is None else include_images
            ),
        }

    def _ensure_extracted(
        self,
        urls: List[str],
        raw_results: Dict[str, Any],
        params: Dict[str, Any],
    ) -> None:
        """Raise ``ToolException`` when the response holds no usable content.

        A response counts as empty when it has no ``results`` entries or when
        the number of ``failed_results`` equals the number of requested URLs.
        """
        results = raw_results.get("results", [])
        failed_results = raw_results.get("failed_results", [])
        if results and len(failed_results) != len(urls):
            return

        suggestions = _generate_suggestions(params)

        # Construct a detailed message for the agent
        error_message = f"No extracted results found for '{urls}'. "
        if suggestions:
            error_message += (
                f"Suggestions: {', '.join(suggestions)}. "
                "Try modifying your extract parameters with one of these approaches."
            )
        else:
            # Avoid emitting an empty "Suggestions: ." fragment.
            error_message += "Try modifying your extract parameters."
        raise ToolException(error_message)

    def _run(
        self,
        urls: List[str],
        extract_depth: Optional[Literal["basic", "advanced"]] = None,
        include_images: Optional[bool] = None,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> Dict[str, Any]:
        """Use the tool.

        Args:
            urls: URLs to extract content from.
            extract_depth: Per-call depth; falls back to ``self.extract_depth``.
            include_images: Per-call image flag; falls back to
                ``self.include_images`` when ``None``.
            run_manager: Callback manager for the run (not used here).

        Returns:
            The raw response dictionary from the API wrapper, or
            ``{"error": exc}`` when an unexpected exception is caught.

        Raises:
            ToolException: If nothing could be extracted from the given URLs.
        """
        try:
            params = self._resolve_params(extract_depth, include_images)
            # Execute search with parameters directly
            raw_results = self.apiwrapper.raw_results(urls=urls, **params)
            self._ensure_extracted(urls, raw_results, params)
            return raw_results
        except ToolException:
            # Re-raise tool exceptions
            raise
        except Exception as e:
            # Best-effort: report unexpected failures in the result payload
            # instead of propagating them.
            return {"error": e}

    async def _arun(
        self,
        urls: List[str],
        extract_depth: Optional[Literal["basic", "advanced"]] = None,
        include_images: Optional[bool] = None,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> Dict[str, Any]:
        """Use the tool asynchronously.

        Same arguments, return value and error behavior as ``_run``, but the
        request is awaited through the wrapper's ``raw_results_async``.
        """
        try:
            params = self._resolve_params(extract_depth, include_images)
            raw_results = await self.apiwrapper.raw_results_async(urls=urls, **params)
            self._ensure_extracted(urls, raw_results, params)
            return raw_results
        except ToolException:
            # Re-raise tool exceptions
            raise
        except Exception as e:
            # Best-effort: report unexpected failures in the result payload
            # instead of propagating them.
            return {"error": e}