Source code for langchain_box.utilities.box

"""Util that calls Box APIs."""

from enum import Enum
from typing import Any, Dict, List, Optional

import box_sdk_gen  # type: ignore
import requests
from langchain_core.documents import Document
from langchain_core.utils import from_env
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self


[docs] class DocumentFiles(Enum): """DocumentFiles(Enum). An enum containing all of the supported extensions for files Box considers Documents. These files should have text representations. """ DOC = "doc" DOCX = "docx" GDOC = "gdoc" GSHEET = "gsheet" NUMBERS = "numbers" ODS = "ods" ODT = "odt" PAGES = "pages" PDF = "pdf" RTF = "rtf" WPD = "wpd" XLS = "xls" XLSM = "xlsm" XLSX = "xlsx" AS = "as" AS3 = "as3" ASM = "asm" BAT = "bat" C = "c" CC = "cc" CMAKE = "cmake" CPP = "cpp" CS = "cs" CSS = "css" CSV = "csv" CXX = "cxx" DIFF = "diff" ERB = "erb" GROOVY = "groovy" H = "h" HAML = "haml" HH = "hh" HTM = "htm" HTML = "html" JAVA = "java" JS = "js" JSON = "json" LESS = "less" LOG = "log" M = "m" MAKE = "make" MD = "md" ML = "ml" MM = "mm" MSG = "msg" PHP = "php" PL = "pl" PROPERTIES = "properties" PY = "py" RB = "rb" RST = "rst" SASS = "sass" SCALA = "scala" SCM = "scm" SCRIPT = "script" SH = "sh" SML = "sml" SQL = "sql" TXT = "txt" VI = "vi" VIM = "vim" WEBDOC = "webdoc" XHTML = "xhtml" XLSB = "xlsb" XML = "xml" XSD = "xsd" XSL = "xsl" YAML = "yaml" GSLLIDE = "gslide" GSLIDES = "gslides" KEY = "key" ODP = "odp" PPT = "ppt" PPTX = "pptx" BOXNOTE = "boxnote"
[docs] class ImageFiles(Enum): """ImageFiles(Enum). An enum containing all of the supported extensions for files Box considers images. """ ARW = "arw" BMP = "bmp" CR2 = "cr2" DCM = "dcm" DICM = "dicm" DICOM = "dicom" DNG = "dng" EPS = "eps" EXR = "exr" GIF = "gif" HEIC = "heic" INDD = "indd" INDML = "indml" INDT = "indt" INX = "inx" JPEG = "jpeg" JPG = "jpg" NEF = "nef" PNG = "png" SVG = "svg" TIF = "tif" TIFF = "tiff" TGA = "tga" SVS = "svs"
[docs] class BoxAuthType(Enum): """BoxAuthType(Enum). an enum to tell BoxLoader how you wish to autheticate your Box connection. Options are: TOKEN - Use a developer token generated from the Box Deevloper Token. Only recommended for development. Provide ``box_developer_token``. CCG - Client Credentials Grant. provide ``box_client_id`, ``box_client_secret`, and ``box_enterprise_id`` or optionally `box_user_id`. JWT - Use JWT for authentication. Config should be stored on the file system accessible to your app. provide ``box_jwt_path``. Optionally, provide ``box_user_id`` to act as a specific user """ TOKEN = "token" """Use a developer token or a token retrieved from ``box-sdk-gen``""" CCG = "ccg" """Use ``client_credentials`` type grant""" JWT = "jwt" """Use JWT bearer token auth"""
[docs] class BoxAuth(BaseModel): """**BoxAuth.** The ``box-langchain`` package offers some flexibility to authentication. The most basic authentication method is by using a developer token. This can be found in the `Box developer console <https://account.box.com/developers/console>`_ on the configuration screen. This token is purposely short-lived (1 hour) and is intended for development. With this token, you can add it to your environment as ``BOX_DEVELOPER_TOKEN``, you can pass it directly to the loader, or you can use the ``BoxAuth`` authentication helper class. `BoxAuth` supports the following authentication methods: * **Token** — either a developer token or any token generated through the Box SDK * **JWT** with a service account * **JWT** with a specified user * **CCG** with a service account * **CCG** with a specified user .. note:: If using JWT authentication, you will need to download the configuration from the Box developer console after generating your public/private key pair. Place this file in your application directory structure somewhere. You will use the path to this file when using the ``BoxAuth`` helper class. If you wish to use OAuth2 with the authorization_code flow, please use ``BoxAuthType.TOKEN`` with the token you have acquired. For more information, learn about how to `set up a Box application <https://developer.box.com/guides/getting-started/first-application/>`_, and check out the `Box authentication guide <https://developer.box.com/guides/authentication/select/>`_ for more about our different authentication options. Simple implementation: To instantiate, you must provide a ``langchain_box.utilities.BoxAuthType``. BoxAuthType is an enum to tell BoxLoader how you wish to autheticate your Box connection. Options are: TOKEN - Use a developer token generated from the Box Deevloper Token. Only recommended for development. Provide ``box_developer_token``. CCG - Client Credentials Grant. provide ``box_client_id``, ``box_client_secret``, and ``box_enterprise_id`` or optionally ``box_user_id``. JWT - Use JWT for authentication. Config should be stored on the file system accessible to your app. provide ``box_jwt_path``. Optionally, provide ``box_user_id`` to act as a specific user **Examples**: **Token** .. code-block:: python from langchain_box.document_loaders import BoxLoader from langchain_box.utilities import BoxAuth, BoxAuthType auth = BoxAuth( auth_type=BoxAuthType.TOKEN, box_developer_token=box_developer_token ) loader = BoxLoader( box_auth=auth, ... ) **JWT with a service account** .. code-block:: python from langchain_box.document_loaders import BoxLoader from langchain_box.utilities import BoxAuth, BoxAuthType auth = BoxAuth( auth_type=BoxAuthType.JWT, box_jwt_path=box_jwt_path ) loader = BoxLoader( box_auth=auth, ... ) **JWT with a specified user** .. code-block:: python from langchain_box.document_loaders import BoxLoader from langchain_box.utilities import BoxAuth, BoxAuthType auth = BoxAuth( auth_type=BoxAuthType.JWT, box_jwt_path=box_jwt_path, box_user_id=box_user_id ) loader = BoxLoader( box_auth=auth, ... ) **CCG with a service account** .. code-block:: python from langchain_box.document_loaders import BoxLoader from langchain_box.utilities import BoxAuth, BoxAuthType auth = BoxAuth( auth_type=BoxAuthType.CCG, box_client_id=box_client_id, box_client_secret=box_client_secret, box_enterprise_id=box_enterprise_id ) loader = BoxLoader( box_auth=auth, ... ) **CCG with a specified user** .. code-block:: python from langchain_box.document_loaders import BoxLoader from langchain_box.utilities import BoxAuth, BoxAuthType auth = BoxAuth( auth_type=BoxAuthType.CCG, box_client_id=box_client_id, box_client_secret=box_client_secret, box_user_id=box_user_id ) loader = BoxLoader( box_auth=auth, ... ) """ auth_type: BoxAuthType """``langchain_box.utilities.BoxAuthType``. Enum describing how to authenticate against Box""" box_developer_token: Optional[str] = Field( default_factory=from_env("BOX_DEVELOPER_TOKEN", default=None) ) """ If using ``BoxAuthType.TOKEN``, provide your token here""" box_jwt_path: Optional[str] = Field( default_factory=from_env("BOX_JWT_PATH", default=None) ) """If using ``BoxAuthType.JWT``, provide local path to your JWT configuration file""" box_client_id: Optional[str] = Field( default_factory=from_env("BOX_CLIENT_ID", default=None) ) """If using ``BoxAuthType.CCG``, provide your app's client ID""" box_client_secret: Optional[str] = Field( default_factory=from_env("BOX_CLIENT_SECRET", default=None) ) """If using ``BoxAuthType.CCG``, provide your app's client secret""" box_enterprise_id: Optional[str] = None """If using ``BoxAuthType.CCG``, provide your enterprise ID. Only required if you are not sending ``box_user_id``""" box_user_id: Optional[str] = None """If using ``BoxAuthType.CCG`` or ``BoxAuthType.JWT``, providing ``box_user_id`` will act on behalf of a specific user""" _box_client: Optional[box_sdk_gen.BoxClient] = None _custom_header: Dict = dict({"x-box-ai-library": "langchain"}) model_config = ConfigDict( arbitrary_types_allowed=True, use_enum_values=True, extra="allow", ) @model_validator(mode="after") def validate_box_auth_inputs(self) -> Self: """Validate auth_type is set""" if not self.auth_type: raise ValueError("Auth type must be set.") """Validate that TOKEN auth type provides box_developer_token.""" if self.auth_type == "token" and not self.box_developer_token: raise ValueError(f"{self.auth_type} requires box_developer_token to be set") """Validate that JWT auth type provides box_jwt_path.""" if self.auth_type == "jwt" and not self.box_jwt_path: raise ValueError(f"{self.auth_type} requires box_jwt_path to be set") """Validate that CCG auth type provides box_client_id and box_client_secret and either box_enterprise_id or box_user_id.""" if self.auth_type == "ccg": if ( not self.box_client_id or not self.box_client_secret or (not self.box_enterprise_id and not self.box_user_id) ): raise ValueError( f"{self.auth_type} requires box_client_id, \ box_client_secret, and box_enterprise_id/box_user_id." ) return self def _authorize(self) -> None: if self.auth_type == "token": try: auth = box_sdk_gen.BoxDeveloperTokenAuth(token=self.box_developer_token) self._box_client = box_sdk_gen.BoxClient(auth=auth).with_extra_headers( extra_headers=self._custom_header ) except box_sdk_gen.BoxSDKError as bse: raise RuntimeError( f"Error getting client from developer token: {bse.message}" ) except Exception as ex: raise ValueError( f"Invalid Box developer token. Please verify your \ token and try again.\n{ex}" ) from ex elif self.auth_type == "jwt": try: jwt_config = box_sdk_gen.JWTConfig.from_config_file( config_file_path=self.box_jwt_path ) auth = box_sdk_gen.BoxJWTAuth(config=jwt_config) self._box_client = box_sdk_gen.BoxClient(auth=auth).with_extra_headers( extra_headers=self._custom_header ) if self.box_user_id is not None: user_auth = auth.with_user_subject(self.box_user_id) self._box_client = box_sdk_gen.BoxClient( auth=user_auth ).with_extra_headers(extra_headers=self._custom_header) except box_sdk_gen.BoxSDKError as bse: raise RuntimeError( f"Error getting client from jwt token: {bse.message}" ) except Exception as ex: raise ValueError( "Error authenticating. Please verify your JWT config \ and try again." ) from ex elif self.auth_type == "ccg": try: if self.box_user_id is not None: ccg_config = box_sdk_gen.CCGConfig( client_id=self.box_client_id, client_secret=self.box_client_secret, user_id=self.box_user_id, ) else: ccg_config = box_sdk_gen.CCGConfig( client_id=self.box_client_id, client_secret=self.box_client_secret, enterprise_id=self.box_enterprise_id, ) auth = box_sdk_gen.BoxCCGAuth(config=ccg_config) self._box_client = box_sdk_gen.BoxClient(auth=auth).with_extra_headers( extra_headers=self._custom_header ) except box_sdk_gen.BoxSDKError as bse: raise RuntimeError( f"Error getting client from ccg token: {bse.message}" ) except Exception as ex: raise ValueError( "Error authenticating. Please verify you are providing a \ valid client id, secret and either a valid user ID or \ enterprise ID." ) from ex else: raise ValueError( f"{self.auth_type} is not a valid auth_type. Value must be \ TOKEN, CCG, or JWT." )
[docs] def get_client(self) -> box_sdk_gen.BoxClient: """Instantiate the Box SDK.""" if self._box_client is None: self._authorize() return self._box_client
[docs] class SearchTypeFilter(Enum): """SearchTypeFilter. Enum to limit the what we search. """ NAME = "name" """The name of the item, as defined by its ``name`` field.""" DESCRIPTION = "description" """The description of the item, as defined by its ``description`` field.""" FILE_CONTENT = "file_content" """The actual content of the file.""" COMMENTS = "comments" """The content of any of the comments on a file or folder.""" TAGS = "tags" """Any tags that are applied to an item, as defined by its ``tags`` field."""
[docs] class BoxSearchOptions(BaseModel): ancestor_folder_ids: Optional[List[str]] = None """Limits the search results to items within the given list of folders, defined as a comma separated lists of folder IDs.""" search_type_filter: Optional[List[SearchTypeFilter]] = None """Limits the search results to any items that match the search query for a specific part of the file, for example the file description. Content types are defined as a comma separated lists of Box recognized content types. The allowed content types are as follows. Default is all.""" created_date_range: Optional[List[str]] = None """Limits the search results to any items created within a given date range. Date ranges are defined as comma separated RFC3339 timestamps. If the the start date is omitted (,2014-05-17T13:35:01-07:00) anything created before the end date will be returned. If the end date is omitted (2014-05-15T13:35:01-07:00,) the current date will be used as the end date instead.""" file_extensions: Optional[List[DocumentFiles]] = None """Limits the search results to any files that match any of the provided file extensions. This list is a comma-separated list of ``langchain_box.utilities.DocumentFiles`` entries""" k: Optional[int] = 100 """Defines the maximum number of items to return. Defaults to 100, maximum is 200.""" size_range: Optional[List[int]] = None """Limits the search results to any items with a size within a given file size range. This applied to files and folders. Size ranges are defined as comma separated list of a lower and upper byte size limit (inclusive). The upper and lower bound can be omitted to create open ranges.""" updated_date_range: Optional[List[str]] = None """Limits the search results to any items updated within a given date range. Date ranges are defined as comma separated RFC3339 timestamps. If the start date is omitted (,2014-05-17T13:35:01-07:00) anything updated before the end date will be returned. If the end date is omitted (2014-05-15T13:35:01-07:00,) the current date will be used as the end date instead."""
[docs] class Config: arbitrary_types_allowed = True use_enum_values = True extra = "allow"
@model_validator(mode="after") def validate_search_options(self) -> Self: """Validate k is between 1 and 200""" if self.k > 200 or self.k < 1: # type: ignore[operator] raise ValueError( f"Invalid setting of k {self.k}. " "Value must be between 1 and 200." ) """Validate created_date_range start date is before end date""" if self.created_date_range: if ( self.created_date_range[0] is None # type: ignore[index] or self.created_date_range[0] == "" # type: ignore[index] or self.created_date_range[1] is None # type: ignore[index] or self.created_date_range[1] == "" # type: ignore[index] ): pass else: if ( self.created_date_range[0] # type: ignore[index] > self.created_date_range[1] # type: ignore[index] ): raise ValueError("Start date must be before end date.") """Validate updated_date_range start date is before end date""" if self.updated_date_range: if ( self.updated_date_range[0] is None # type: ignore[index] or self.updated_date_range[0] == "" # type: ignore[index] or self.updated_date_range[1] is None # type: ignore[index] or self.updated_date_range[1] == "" # type: ignore[index] ): pass else: if ( self.updated_date_range[0] # type: ignore[index] > self.updated_date_range[1] # type: ignore[index] ): raise ValueError("Start date must be before end date.") return self
class _BoxAPIWrapper(BaseModel): """Wrapper for Box API.""" box_developer_token: Optional[str] = Field( default_factory=from_env("BOX_DEVELOPER_TOKEN", default=None) ) """String containing the Box Developer Token generated in the developer console""" box_auth: Optional[BoxAuth] = None """Configured langchain_box.utilities.BoxAuth object""" character_limit: Optional[int] = -1 """character_limit is an int that caps the number of characters to return per document.""" box_search_options: Optional[BoxSearchOptions] = None """Search options to configure BoxRetriever to narrow search results.""" _box: Optional[box_sdk_gen.BoxClient] model_config = ConfigDict( arbitrary_types_allowed=True, use_enum_values=True, extra="allow", ) @model_validator(mode="after") def validate_box_api_inputs(self) -> Self: self._box = None """Validate that TOKEN auth type provides box_developer_token.""" if not self.box_auth: if not self.box_developer_token: raise ValueError( "You must configure either box_developer_token of box_auth" ) else: box_auth = self.box_auth self._box = box_auth.get_client() # type: ignore[union-attr] return self def get_box_client(self) -> box_sdk_gen.BoxClient: box_auth = BoxAuth( auth_type=BoxAuthType.TOKEN, box_developer_token=self.box_developer_token ) self._box = box_auth.get_client() def _do_request(self, url: str) -> Any: try: access_token = self._box.auth.retrieve_token().access_token # type: ignore[union-attr] except box_sdk_gen.BoxSDKError as bse: raise RuntimeError(f"Error getting client from jwt token: {bse.message}") resp = requests.get(url, headers={"Authorization": f"Bearer {access_token}"}) resp.raise_for_status() return resp.content def _get_text_representation(self, file_id: str = "") -> tuple[str, str, str]: try: from box_sdk_gen import BoxAPIError, BoxSDKError except ImportError: raise ImportError("You must run `pip install box-sdk-gen`") if self._box is None: self.get_box_client() try: file = self._box.files.get_file_by_id( # type: ignore[union-attr] file_id, x_rep_hints="[extracted_text]", fields=["name", "representations", "type"], ) except BoxAPIError as bae: raise RuntimeError(f"BoxAPIError: Error getting text rep: {bae.message}") except BoxSDKError as bse: raise RuntimeError(f"BoxSDKError: Error getting text rep: {bse.message}") except Exception: return None, None, None # type: ignore[return-value] file_repr = file.representations.entries if len(file_repr) <= 0: return None, None, None # type: ignore[return-value] for entry in file_repr: if entry.representation == "extracted_text": # If the file representation doesn't exist, calling # info.url will generate text if possible if entry.status.state == "none": self._do_request(entry.info.url) url = entry.content.url_template.replace("{+asset_path}", "") file_name = file.name.replace(".", "_").replace(" ", "_") try: raw_content = self._do_request(url) except requests.exceptions.HTTPError: return None, None, None # type: ignore[return-value] if ( self.character_limit is not None and self.character_limit > 0 # type: ignore[operator] ): content = raw_content[0 : (self.character_limit - 1)] else: content = raw_content return file_name, content, url return None, None, None # type: ignore[return-value] def get_document_by_file_id(self, file_id: str) -> Optional[Document]: """Load a file from a Box id. Accepts file_id as str. Returns `Document`""" if self._box is None: self.get_box_client() file = self._box.files.get_file_by_id( # type: ignore[union-attr] file_id, fields=["name", "type", "extension"] ) if file.type == "file": if hasattr(DocumentFiles, file.extension.upper()): file_name, content, url = self._get_text_representation(file_id=file_id) if file_name is None or content is None or url is None: return None metadata = { "source": f"{url}", "title": f"{file_name}", } return Document(page_content=content, metadata=metadata) return None return None def get_folder_items(self, folder_id: str) -> box_sdk_gen.Items: """Get all the items in a folder. Accepts folder_id as str. returns box_sdk_gen.Items""" if self._box is None: self.get_box_client() try: folder_contents = self._box.folders.get_folder_items( # type: ignore[union-attr] folder_id, fields=["id", "type", "name"] ) except box_sdk_gen.BoxAPIError as bae: raise RuntimeError( f"BoxAPIError: Error getting folder content: {bae.message}" ) except box_sdk_gen.BoxSDKError as bse: raise RuntimeError( f"BoxSDKError: Error getting folder content: {bse.message}" ) return folder_contents.entries def search_box(self, query: str) -> List[Document]: if self._box is None: self.get_box_client() files = [] try: results = None if self.box_search_options is None: results = self._box.search.search_for_content( # type: ignore[union-attr] query=query, fields=["id", "type", "extension"], type="file" ) else: results = self._box.search.search_for_content( # type: ignore[union-attr] query=query, fields=["id", "type", "extension"], type="file", ancestor_folder_ids=self.box_search_options.ancestor_folder_ids, # type: ignore[union-attr] content_types=self.box_search_options.search_type_filter, # type: ignore[union-attr] created_at_range=self.box_search_options.created_date_range, # type: ignore[union-attr] file_extensions=self.box_search_options.file_extensions, # type: ignore[union-attr] limit=self.box_search_options.k, # type: ignore[union-attr] size_range=self.box_search_options.size_range, # type: ignore[union-attr] updated_at_range=self.box_search_options.updated_date_range, # type: ignore[union-attr] ) if results.entries is None or len(results.entries) <= 0: return None # type: ignore[return-value] for file in results.entries: if ( file is not None and file.type == "file" and hasattr(DocumentFiles, file.extension.upper()) ): doc = self.get_document_by_file_id(file.id) if doc is not None: files.append(doc) return files except box_sdk_gen.BoxAPIError as bae: raise RuntimeError( f"BoxAPIError: Error getting search results: {bae.message}" ) except box_sdk_gen.BoxSDKError as bse: raise RuntimeError( f"BoxSDKError: Error getting search results: {bse.message}" ) def ask_box_ai( self, query: str, box_file_ids: List[str], answer: bool = True, citations: bool = False, ) -> List[Document]: if self._box is None: self.get_box_client() ai_mode = box_sdk_gen.CreateAiAskMode.SINGLE_ITEM_QA.value if len(box_file_ids) > 1: ai_mode = box_sdk_gen.CreateAiAskMode.MULTIPLE_ITEM_QA.value elif len(box_file_ids) <= 0: raise ValueError("BOX_AI_ASK requires at least one file ID") items = [] for file_id in box_file_ids: item = box_sdk_gen.AiItemBase( id=file_id, type=box_sdk_gen.AiItemBaseTypeField.FILE.value ) items.append(item) try: response = self._box.ai.create_ai_ask( # type: ignore[union-attr] mode=ai_mode, prompt=query, items=items, include_citations=citations ) except box_sdk_gen.BoxAPIError as bae: raise RuntimeError( f"BoxAPIError: Error getting Box AI result: {bae.message}" ) except box_sdk_gen.BoxSDKError as bse: raise RuntimeError( f"BoxSDKError: Error getting Box AI result: {bse.message}" ) docs = [] if answer: content = response.answer metadata = {"source": "Box AI", "title": f"Box AI {query}"} document = Document(page_content=content, metadata=metadata) docs.append(document) if citations: box_citations = response.citations for citation in box_citations: content = citation.content file_name = citation.name file_id = citation.id file_type = citation.type.value metadata = { "source": f"Box AI {query}", "file_name": file_name, "file_id": file_id, "file_type": file_type, } document = Document(page_content=content, metadata=metadata) docs.append(document) return docs