Document Loaders#

All different types of document loaders.

class langchain.document_loaders.AZLyricsLoader(web_path: Union[str, List[str]], header_template: Optional[dict] = None)[source]#

Loader that loads AZLyrics webpages.

load() List[langchain.schema.Document][source]#

Load webpage.

class langchain.document_loaders.AirbyteJSONLoader(file_path: str)[source]#

Loader that loads local airbyte json files.

load() List[langchain.schema.Document][source]#

Load file.

pydantic model langchain.document_loaders.ApifyDatasetLoader[source]#

Logic for loading documents from Apify datasets.

field apify_client: Any = None#
field dataset_id: str [Required]#

The ID of the dataset on the Apify platform.

field dataset_mapping_function: Callable[[Dict], langchain.schema.Document] [Required]#

A custom function that takes a single dictionary (an Apify dataset item) and converts it to an instance of the Document class.

load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.ArxivLoader(query: str, load_max_docs: Optional[int] = 100, load_all_available_meta: Optional[bool] = False)[source]#

Loads a query result from arxiv.org into a list of Documents.

Each arXiv paper in the query result becomes one Document. The loader converts the original PDF into plain text.
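Example (an illustrative sketch; the query string is a hypothetical placeholder and the arxiv python package is assumed to be installed):

from langchain.document_loaders import ArxivLoader

loader = ArxivLoader(query="large language models", load_max_docs=2)
docs = loader.load()
print(docs[0].metadata)  # inspect the metadata of the first result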

load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.AzureBlobStorageContainerLoader(conn_str: str, container: str, prefix: str = '')[source]#

Loading logic for loading documents from Azure Blob Storage.

load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.AzureBlobStorageFileLoader(conn_str: str, container: str, blob_name: str)[source]#

Loading logic for loading documents from Azure Blob Storage.

load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.BSHTMLLoader(file_path: str, open_encoding: Optional[str] = None, bs_kwargs: Optional[dict] = None, get_text_separator: str = '')[source]#

Loader that uses beautiful soup to parse HTML files.

load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.BibtexLoader(file_path: str, *, parser: Optional[langchain.utilities.bibtex.BibtexparserWrapper] = None, max_docs: Optional[int] = None, max_content_chars: Optional[int] = 4000, load_extra_metadata: bool = False, file_pattern: str = '[^:]+\\.pdf')[source]#

Loads a bibtex file into a list of Documents.

Each document represents one entry from the bibtex file.

If a PDF file is present in the file bibtex field, the original PDF is loaded into the document text. If no such file entry is present, the abstract field is used instead.

lazy_load() Iterator[langchain.schema.Document][source]#

Load bibtex file using bibtexparser and get the article texts plus the article metadata.

See https://bibtexparser.readthedocs.io/en/master/

Returns

a list of documents with the document.page_content in text format

load() List[langchain.schema.Document][source]#

Load bibtex file documents from the given bibtex file path.

See https://bibtexparser.readthedocs.io/en/master/

Parameters

file_path – the path to the bibtex file

Returns

a list of documents with the document.page_content in text format

class langchain.document_loaders.BigQueryLoader(query: str, project: Optional[str] = None, page_content_columns: Optional[List[str]] = None, metadata_columns: Optional[List[str]] = None, credentials: Optional[Credentials] = None)[source]#

Loads a query result from BigQuery into a list of documents.

Each document represents one row of the result. The page_content_columns are written into the page_content of the document. The metadata_columns are written into the metadata of the document. By default, all columns are written into the page_content and none into the metadata.
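Example (an illustrative sketch; the project, dataset, and column names are hypothetical and the google-cloud-bigquery package is assumed to be installed):

from langchain.document_loaders import BigQueryLoader

query = "SELECT title, body, author FROM `my_project.my_dataset.articles` LIMIT 10"
loader = BigQueryLoader(
    query,
    page_content_columns=["title", "body"],
    metadata_columns=["author"],
)
docs = loader.load()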

load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.BiliBiliLoader(video_urls: List[str])[source]#

Loader that loads bilibili transcripts.

load() List[langchain.schema.Document][source]#

Load from bilibili url.

class langchain.document_loaders.BlackboardLoader(blackboard_course_url: str, bbrouter: str, load_all_recursively: bool = True, basic_auth: Optional[Tuple[str, str]] = None, cookies: Optional[dict] = None)[source]#

Loader that loads all documents from a Blackboard course.

This loader is not compatible with all Blackboard courses. It is only compatible with courses that use the new Blackboard interface. To use this loader, you must have the BbRouter cookie. You can get this cookie by logging into the course and then copying the value of the BbRouter cookie from the browser’s developer tools.

Example

from langchain.document_loaders import BlackboardLoader

loader = BlackboardLoader(
    blackboard_course_url="https://blackboard.example.com/webapps/blackboard/execute/announcement?method=search&context=course_entry&course_id=_123456_1",
    bbrouter="expires:12345...",
)
documents = loader.load()
base_url: str#
check_bs4() None[source]#

Check if BeautifulSoup4 is installed.

Raises

ImportError – If BeautifulSoup4 is not installed.

download(path: str) None[source]#

Download a file from a url.

Parameters

path – Path to the file.

folder_path: str#
load() List[langchain.schema.Document][source]#

Load data into document objects.

Returns

List of documents.

load_all_recursively: bool#
parse_filename(url: str) str[source]#

Parse the filename from a url.

Parameters

url – Url to parse the filename from.

Returns

The filename.

class langchain.document_loaders.BlockchainDocumentLoader(contract_address: str, blockchainType: langchain.document_loaders.blockchain.BlockchainType = BlockchainType.ETH_MAINNET, api_key: str = 'docs-demo', startToken: str = '', get_all_tokens: bool = False, max_execution_time: Optional[int] = None)[source]#

Loads elements from a blockchain smart contract into Langchain documents.

The supported blockchains are: Ethereum mainnet, Ethereum Goerli testnet, Polygon mainnet, and Polygon Mumbai testnet.

If no BlockchainType is specified, the default is Ethereum mainnet.

The Loader uses the Alchemy API to interact with the blockchain. ALCHEMY_API_KEY environment variable must be set to use this loader.

The API returns 100 NFTs per request and can be paginated using the startToken parameter.

If get_all_tokens is set to True, the loader will get all tokens on the contract. Note that for contracts with a large number of tokens, this may take a long time (e.g. 10k tokens is 100 requests). The default value is False for this reason.

The max_execution_time (sec) can be set to limit the execution time of the loader.

Future versions of this loader can:
  • Support additional Alchemy APIs (e.g. getTransactions, etc.)

  • Support additional blockchain APIs (e.g. Infura, Opensea, etc.)

load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.CSVLoader(file_path: str, source_column: Optional[str] = None, csv_args: Optional[Dict] = None, encoding: Optional[str] = None)[source]#

Loads a CSV file into a list of documents.

Each document represents one row of the CSV file. Every row is converted into a key/value pair and outputted to a new line in the document’s page_content.

The source for each document loaded from csv is set to the value of the file_path argument for all documents by default. You can override this by setting the source_column argument to the name of a column in the CSV file. The source of each document will then be set to the value of the column with the name specified in source_column.

Output Example:
column1: value1
column2: value2
column3: value3
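Example (an illustrative sketch; the file name and column names are hypothetical):

from langchain.document_loaders import CSVLoader

loader = CSVLoader(
    file_path="reports.csv",
    source_column="report_url",
    csv_args={"delimiter": ";"},
)
docs = loader.load()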
load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.ChatGPTLoader(log_file: str, num_logs: int = -1)[source]#

Loader that loads conversations from exported ChatGPT data.

load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.CoNLLULoader(file_path: str)[source]#

Load CoNLL-U files.

load() List[langchain.schema.Document][source]#

Load from file path.

class langchain.document_loaders.CollegeConfidentialLoader(web_path: Union[str, List[str]], header_template: Optional[dict] = None)[source]#

Loader that loads College Confidential webpages.

load() List[langchain.schema.Document][source]#

Load webpage.

class langchain.document_loaders.ConfluenceLoader(url: str, api_key: Optional[str] = None, username: Optional[str] = None, oauth2: Optional[dict] = None, token: Optional[str] = None, cloud: Optional[bool] = True, number_of_retries: Optional[int] = 3, min_retry_seconds: Optional[int] = 2, max_retry_seconds: Optional[int] = 10, confluence_kwargs: Optional[dict] = None)[source]#

Load Confluence pages. Port of https://llamahub.ai/l/confluence. This loader currently supports username/api_key, OAuth2 login, and personal access token authentication.

Specify a list of page_ids and/or a space_key to load the corresponding pages into Document objects; if both are specified, the union of both sets will be returned.

You can also specify a boolean include_attachments to include attachments. This is set to False by default; if set to True, all attachments will be downloaded and ConfluenceReader will extract the text from the attachments and add it to the Document object. Currently supported attachment types are: PDF, PNG, JPEG/JPG, SVG, Word and Excel.

Hint: space_key and page_id can both be found in the URL of a page in Confluence - https://yoursite.atlassian.com/wiki/spaces/<space_key>/pages/<page_id>

Example

from langchain.document_loaders import ConfluenceLoader

loader = ConfluenceLoader(
    url="https://yoursite.atlassian.com/wiki",
    username="me",
    api_key="12345"
)
documents = loader.load(space_key="SPACE",limit=50)
Parameters
  • url (str) – Base URL of the Confluence site, e.g. https://yoursite.atlassian.com/wiki

  • api_key (str, optional) – Confluence API key, used together with username, defaults to None

  • username (str, optional) – Confluence username, used together with api_key, defaults to None

  • oauth2 (dict, optional) – OAuth2 credentials dictionary, defaults to None

  • token (str, optional) – Personal access token, defaults to None

  • cloud (bool, optional) – Whether the Confluence instance is a cloud instance, defaults to True

  • number_of_retries (Optional[int], optional) – How many times to retry, defaults to 3

  • min_retry_seconds (Optional[int], optional) – Minimum number of seconds to wait between retries, defaults to 2

  • max_retry_seconds (Optional[int], optional) – Maximum number of seconds to wait between retries, defaults to 10

  • confluence_kwargs (dict, optional) – additional kwargs to initialize confluence with

Raises
  • ValueError – Errors while validating input

  • ImportError – Required dependencies not installed.

is_public_page(page: dict) bool[source]#

Check if a page is publicly accessible.

load(space_key: Optional[str] = None, page_ids: Optional[List[str]] = None, label: Optional[str] = None, cql: Optional[str] = None, include_restricted_content: bool = False, include_archived_content: bool = False, include_attachments: bool = False, include_comments: bool = False, limit: Optional[int] = 50, max_pages: Optional[int] = 1000) List[langchain.schema.Document][source]#
Parameters
  • space_key (Optional[str], optional) – Space key retrieved from a confluence URL, defaults to None

  • page_ids (Optional[List[str]], optional) – List of specific page IDs to load, defaults to None

  • label (Optional[str], optional) – Get all pages with this label, defaults to None

  • cql (Optional[str], optional) – CQL Expression, defaults to None

  • include_restricted_content (bool, optional) – Whether to include restricted content, defaults to False

  • include_archived_content (bool, optional) – Whether to include archived content, defaults to False

  • include_attachments (bool, optional) – Whether to include attachments, defaults to False

  • include_comments (bool, optional) – Whether to include comments, defaults to False

  • limit (int, optional) – Maximum number of pages to retrieve per request, defaults to 50

  • max_pages (int, optional) – Maximum number of pages to retrieve in total, defaults 1000

Raises
  • ValueError – If none of space_key, page_ids, label, or cql is provided

  • ImportError – If required dependencies are not installed.

Returns

A list of Document objects for the loaded pages.

Return type

List[Document]

paginate_request(retrieval_method: Callable, **kwargs: Any) List[source]#

Paginate the various methods to retrieve groups of pages.

Unfortunately, due to page size, sometimes the Confluence API doesn’t match the limit value. If limit is >100 confluence seems to cap the response to 100. Also, due to the Atlassian Python package, we don’t get the “next” values from the “_links” key because they only return the value from the results key. So here, the pagination starts from 0 and goes until the max_pages, getting the limit number of pages with each request. We have to manually check if there are more docs based on the length of the returned list of pages, rather than just checking for the presence of a next key in the response like this page would have you do: https://developer.atlassian.com/server/confluence/pagination-in-the-rest-api/

Parameters

retrieval_method (callable) – Function used to retrieve docs

Returns

List of documents

Return type

List

process_attachment(page_id: str) List[str][source]#
process_doc(link: str) str[source]#
process_image(link: str) str[source]#
process_page(page: dict, include_attachments: bool, include_comments: bool) langchain.schema.Document[source]#
process_pages(pages: List[dict], include_restricted_content: bool, include_attachments: bool, include_comments: bool) List[langchain.schema.Document][source]#

Process a list of pages into a list of documents.

process_pdf(link: str) str[source]#
process_svg(link: str) str[source]#
process_xls(link: str) str[source]#
static validate_init_args(url: Optional[str] = None, api_key: Optional[str] = None, username: Optional[str] = None, oauth2: Optional[dict] = None, token: Optional[str] = None) Optional[List][source]#

Validates proper combinations of init arguments

class langchain.document_loaders.DataFrameLoader(data_frame: Any, page_content_column: str = 'text')[source]#

Load Pandas DataFrames.
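Example (an illustrative sketch using a small in-memory DataFrame; pandas is assumed to be installed):

import pandas as pd

from langchain.document_loaders import DataFrameLoader

df = pd.DataFrame({"text": ["first row", "second row"], "topic": ["a", "b"]})
loader = DataFrameLoader(df, page_content_column="text")
docs = loader.load()  # remaining columns (here "topic") become metadata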

load() List[langchain.schema.Document][source]#

Load from the dataframe.

class langchain.document_loaders.DiffbotLoader(api_token: str, urls: List[str], continue_on_failure: bool = True)[source]#

Loader that loads Diffbot file json.

load() List[langchain.schema.Document][source]#

Extract text using the Diffbot API for each URL and return Document instances.

class langchain.document_loaders.DirectoryLoader(path: str, glob: str = '**/[!.]*', silent_errors: bool = False, load_hidden: bool = False, loader_cls: typing.Union[typing.Type[langchain.document_loaders.unstructured.UnstructuredFileLoader], typing.Type[langchain.document_loaders.text.TextLoader], typing.Type[langchain.document_loaders.html_bs.BSHTMLLoader]] = <class 'langchain.document_loaders.unstructured.UnstructuredFileLoader'>, loader_kwargs: typing.Optional[dict] = None, recursive: bool = False, show_progress: bool = False, use_multithreading: bool = False, max_concurrency: int = 4)[source]#

Loading logic for loading documents from a directory.
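Example (an illustrative sketch; the directory path is hypothetical and tqdm is assumed to be installed for the progress bar):

from langchain.document_loaders import DirectoryLoader, TextLoader

loader = DirectoryLoader(
    "docs/",
    glob="**/*.md",
    loader_cls=TextLoader,
    loader_kwargs={"autodetect_encoding": True},
    show_progress=True,
)
docs = loader.load()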

load() List[langchain.schema.Document][source]#

Load documents.

load_file(item: pathlib.Path, path: pathlib.Path, docs: List[langchain.schema.Document], pbar: Optional[Any]) None[source]#
class langchain.document_loaders.DiscordChatLoader(chat_log: pd.DataFrame, user_id_col: str = 'ID')[source]#

Load Discord chat logs.

load() List[langchain.schema.Document][source]#

Load all chat messages.

pydantic model langchain.document_loaders.DocugamiLoader[source]#

Loader that loads processed docs from Docugami.

To use, you should have the lxml python package installed.

field access_token: Optional[str] = None#
field api: str = 'https://api.docugami.com/v1preview1'#
field docset_id: Optional[str] = None#
field document_ids: Optional[Sequence[str]] = None#
field file_paths: Optional[Sequence[Union[pathlib.Path, str]]] = None#
field min_chunk_size: int = 32#
load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.Docx2txtLoader(file_path: str)[source]#

Loads a DOCX with docx2txt and chunks at character level.

Defaults to checking for a local file, but if the file is a web path, it will download it to a temporary file, use that, and then clean up the temporary file after completion.

load() List[langchain.schema.Document][source]#

Load given path as single page.

class langchain.document_loaders.DuckDBLoader(query: str, database: str = ':memory:', read_only: bool = False, config: Optional[Dict[str, str]] = None, page_content_columns: Optional[List[str]] = None, metadata_columns: Optional[List[str]] = None)[source]#

Loads a query result from DuckDB into a list of documents.

Each document represents one row of the result. The page_content_columns are written into the page_content of the document. The metadata_columns are written into the metadata of the document. By default, all columns are written into the page_content and none into the metadata.
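Example (an illustrative sketch; the CSV file is hypothetical and the duckdb package is assumed to be installed):

from langchain.document_loaders import DuckDBLoader

loader = DuckDBLoader(
    query="SELECT text, source FROM read_csv_auto('example.csv')",
    page_content_columns=["text"],
    metadata_columns=["source"],
)
docs = loader.load()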

load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.EverNoteLoader(file_path: str, load_single_document: bool = True)[source]#

EverNote Loader. Loads an EverNote notebook export file e.g. my_notebook.enex into Documents. Instructions on producing this file can be found at https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML

Currently only the plain text in the note is extracted and stored as the contents of the Document. Any non-content metadata tags on the note (e.g. 'author', 'created', 'updated', etc., but not 'content-raw' or 'resource') will be extracted and stored as metadata on the Document.

Parameters
  • file_path (str) – The path to the notebook export with a .enex extension

  • load_single_document (bool) – Whether or not to concatenate the content of all notes into a single long Document. If this is set to True, the only metadata on the document will be the 'source', which contains the file name of the export.

load() List[langchain.schema.Document][source]#

Load documents from EverNote export file.

class langchain.document_loaders.FacebookChatLoader(path: str)[source]#

Loader that loads Facebook messages json directory dump.

load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.FaunaLoader(query: str, page_content_field: str, secret: str, metadata_fields: Optional[Sequence[str]] = None)[source]#
query#

The FQL query string to execute.

Type

str

page_content_field#

The field that contains the content of each page.

Type

str

secret#

The secret key for authenticating to FaunaDB.

Type

str

metadata_fields#

Optional list of field names to include in metadata.

Type

Optional[Sequence[str]]

lazy_load() Iterator[langchain.schema.Document][source]#

A lazy loader for document content.

load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.FigmaFileLoader(access_token: str, ids: str, key: str)[source]#

Loader that loads Figma file json.

load() List[langchain.schema.Document][source]#

Load file.

class langchain.document_loaders.GCSDirectoryLoader(project_name: str, bucket: str, prefix: str = '')[source]#

Loading logic for loading documents from GCS.

load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.GCSFileLoader(project_name: str, bucket: str, blob: str)[source]#

Loading logic for loading documents from GCS.

load() List[langchain.schema.Document][source]#

Load documents.

pydantic model langchain.document_loaders.GitHubIssuesLoader[source]#
Validators
  • validate_environment » all fields

  • validate_since » since

field assignee: Optional[str] = None#

Filter on assigned user. Pass ‘none’ for no user and ‘*’ for any user.

field creator: Optional[str] = None#

Filter on the user that created the issue.

field direction: Optional[Literal['asc', 'desc']] = None#

The direction to sort the results by. Can be one of: ‘asc’, ‘desc’.

field include_prs: bool = True#

If True include Pull Requests in results, otherwise ignore them.

field labels: Optional[List[str]] = None#

Label names to filter on. Example: bug,ui,@high.

field mentioned: Optional[str] = None#

Filter on a user that’s mentioned in the issue.

field milestone: Optional[Union[int, Literal['*', 'none']]] = None#

If integer is passed, it should be a milestone’s number field. If the string ‘*’ is passed, issues with any milestone are accepted. If the string ‘none’ is passed, issues without milestones are returned.

field since: Optional[str] = None#

Only show notifications updated after the given time. This is a timestamp in ISO 8601 format: YYYY-MM-DDTHH:MM:SSZ.

field sort: Optional[Literal['created', 'updated', 'comments']] = None#

What to sort results by. Can be one of: ‘created’, ‘updated’, ‘comments’. Default is ‘created’.

field state: Optional[Literal['open', 'closed', 'all']] = None#

Filter on issue state. Can be one of: ‘open’, ‘closed’, ‘all’.
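Example (an illustrative sketch; it assumes the loader also accepts a repo field in "owner/repo" form and an access_token, which are not listed above and may alternatively be read from the GITHUB_PERSONAL_ACCESS_TOKEN environment variable):

from langchain.document_loaders import GitHubIssuesLoader

loader = GitHubIssuesLoader(
    repo="example-org/example-repo",  # hypothetical repository
    access_token="ghp_...",           # hypothetical token; may also come from the environment
    state="open",
    include_prs=False,
)
docs = loader.load()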

lazy_load() Iterator[langchain.schema.Document][source]#

Get issues of a GitHub repository.

Returns

  • page_content

  • metadata
    • url

    • title

    • creator

    • created_at

    • last_update_time

    • closed_time

    • number of comments

    • state

    • labels

    • assignee

    • assignees

    • milestone

    • locked

    • number

    • is_pull_request

Return type

A list of Documents with attributes

load() List[langchain.schema.Document][source]#

Get issues of a GitHub repository.

Returns

  • page_content

  • metadata
    • url

    • title

    • creator

    • created_at

    • last_update_time

    • closed_time

    • number of comments

    • state

    • labels

    • assignee

    • assignees

    • milestone

    • locked

    • number

    • is_pull_request

Return type

A list of Documents with attributes

parse_issue(issue: dict) langchain.schema.Document[source]#

Create a Document object from a single GitHub issue.

property query_params: str#
property url: str#
class langchain.document_loaders.GitLoader(repo_path: str, clone_url: Optional[str] = None, branch: Optional[str] = 'main', file_filter: Optional[Callable[[str], bool]] = None)[source]#

Loads files from a Git repository into a list of documents. Repository can be local on disk available at repo_path, or remote at clone_url that will be cloned to repo_path. Currently supports only text files.

Each document represents one file in the repository. The path points to the local Git repository, and the branch specifies the branch to load files from. By default, it loads from the main branch.
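Example (an illustrative sketch; the repository URL and local path are hypothetical and GitPython is assumed to be installed):

from langchain.document_loaders import GitLoader

loader = GitLoader(
    repo_path="./example_repo",
    clone_url="https://github.com/example/example-repo",
    branch="main",
    file_filter=lambda file_path: file_path.endswith(".py"),
)
docs = loader.load()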

load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.GitbookLoader(web_page: str, load_all_paths: bool = False, base_url: Optional[str] = None, content_selector: str = 'main')[source]#

Load GitBook data.

  1. load from either a single page, or

  2. load all (relative) paths in the navbar.

load() List[langchain.schema.Document][source]#

Fetch text from one single GitBook page.

class langchain.document_loaders.GoogleApiClient(credentials_path: pathlib.Path = PosixPath('/home/docs/.credentials/credentials.json'), service_account_path: pathlib.Path = PosixPath('/home/docs/.credentials/credentials.json'), token_path: pathlib.Path = PosixPath('/home/docs/.credentials/token.json'))[source]#

A Generic Google Api Client.

To use, you should have the google_auth_oauthlib, youtube_transcript_api, and google python packages installed. As the Google API expects credentials, you need to set up a Google account and register your service: https://developers.google.com/docs/api/quickstart/python

Example

from langchain.document_loaders import GoogleApiClient
google_api_client = GoogleApiClient(
    service_account_path=Path("path_to_your_sec_file.json")
)
credentials_path: pathlib.Path = PosixPath('/home/docs/.credentials/credentials.json')#
service_account_path: pathlib.Path = PosixPath('/home/docs/.credentials/credentials.json')#
token_path: pathlib.Path = PosixPath('/home/docs/.credentials/token.json')#
classmethod validate_channel_or_videoIds_is_set(values: Dict[str, Any]) Dict[str, Any][source]#

Validate that either folder_id or document_ids is set, but not both.

class langchain.document_loaders.GoogleApiYoutubeLoader(google_api_client: langchain.document_loaders.youtube.GoogleApiClient, channel_name: Optional[str] = None, video_ids: Optional[List[str]] = None, add_video_info: bool = True, captions_language: str = 'en', continue_on_failure: bool = False)[source]#

Loader that loads all videos from a YouTube channel.

To use, you should have the googleapiclient and youtube_transcript_api python packages installed. As the service needs a google_api_client, you first have to initialize the GoogleApiClient.

Additionally, you have to either provide a channel name or a list of video IDs: https://developers.google.com/docs/api/quickstart/python

Example

from langchain.document_loaders import GoogleApiClient
from langchain.document_loaders import GoogleApiYoutubeLoader
google_api_client = GoogleApiClient(
    service_account_path=Path("path_to_your_sec_file.json")
)
loader = GoogleApiYoutubeLoader(
    google_api_client=google_api_client,
    channel_name = "CodeAesthetic"
)
loader.load()
add_video_info: bool = True#
captions_language: str = 'en'#
channel_name: Optional[str] = None#
continue_on_failure: bool = False#
google_api_client: langchain.document_loaders.youtube.GoogleApiClient#
load() List[langchain.schema.Document][source]#

Load documents.

classmethod validate_channel_or_videoIds_is_set(values: Dict[str, Any]) Dict[str, Any][source]#

Validate that either channel_name or video_ids is set, but not both.

video_ids: Optional[List[str]] = None#
pydantic model langchain.document_loaders.GoogleDriveLoader[source]#

Loader that loads Google Docs from Google Drive.

Validators
  • validate_credentials_path » credentials_path

  • validate_inputs » all fields

field credentials_path: pathlib.Path = PosixPath('/home/docs/.credentials/credentials.json')#
field document_ids: Optional[List[str]] = None#
field file_ids: Optional[List[str]] = None#
field file_types: Optional[Sequence[str]] = None#
field folder_id: Optional[str] = None#
field load_trashed_files: bool = False#
field recursive: bool = False#
field service_account_key: pathlib.Path = PosixPath('/home/docs/.credentials/keys.json')#
field token_path: pathlib.Path = PosixPath('/home/docs/.credentials/token.json')#
load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.GutenbergLoader(file_path: str)[source]#

Loader that uses urllib to load .txt web files.

load() List[langchain.schema.Document][source]#

Load file.

class langchain.document_loaders.HNLoader(web_path: Union[str, List[str]], header_template: Optional[dict] = None)[source]#

Load Hacker News data from either main page results or the comments page.

load() List[langchain.schema.Document][source]#

Get important HN webpage information.

Components are:
  • title

  • content

  • source url,

  • time of post

  • author of the post

  • number of comments

  • rank of the post

load_comments(soup_info: Any) List[langchain.schema.Document][source]#

Load comments from a HN post.

load_results(soup: Any) List[langchain.schema.Document][source]#

Load items from an HN page.

class langchain.document_loaders.HuggingFaceDatasetLoader(path: str, page_content_column: str = 'text', name: Optional[str] = None, data_dir: Optional[str] = None, data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None, cache_dir: Optional[str] = None, keep_in_memory: Optional[bool] = None, save_infos: bool = False, use_auth_token: Optional[Union[bool, str]] = None, num_proc: Optional[int] = None)[source]#

Loading logic for loading documents from the Hugging Face Hub.
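Example (an illustrative sketch; it assumes the datasets package is installed and uses the public imdb dataset, whose rows have a text column):

from langchain.document_loaders import HuggingFaceDatasetLoader

loader = HuggingFaceDatasetLoader(path="imdb", page_content_column="text")
docs = loader.load()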

lazy_load() Iterator[langchain.schema.Document][source]#

Load documents lazily.

load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.IFixitLoader(web_path: str)[source]#

Load iFixit repair guides, device wikis and answers.

iFixit is the largest, open repair community on the web. The site contains nearly 100k repair manuals, 200k Questions & Answers on 42k devices, and all the data is licensed under CC-BY.

This loader will allow you to download the text of a repair guide, text of Q&A’s and wikis from devices on iFixit using their open APIs and web scraping.

load() List[langchain.schema.Document][source]#

Load data into document objects.

load_device(url_override: Optional[str] = None, include_guides: bool = True) List[langchain.schema.Document][source]#
load_guide(url_override: Optional[str] = None) List[langchain.schema.Document][source]#
load_questions_and_answers(url_override: Optional[str] = None) List[langchain.schema.Document][source]#
static load_suggestions(query: str = '', doc_type: str = 'all') List[langchain.schema.Document][source]#
class langchain.document_loaders.IMSDbLoader(web_path: Union[str, List[str]], header_template: Optional[dict] = None)[source]#

Loader that loads IMSDb webpages.

load() List[langchain.schema.Document][source]#

Load webpage.

class langchain.document_loaders.ImageCaptionLoader(path_images: Union[str, List[str]], blip_processor: str = 'Salesforce/blip-image-captioning-base', blip_model: str = 'Salesforce/blip-image-captioning-base')[source]#

Loader that loads the captions of an image

load() List[langchain.schema.Document][source]#

Load from a list of image files

class langchain.document_loaders.IuguLoader(resource: str, api_token: Optional[str] = None)[source]#
load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.JSONLoader(file_path: Union[str, pathlib.Path], jq_schema: str, content_key: Optional[str] = None, metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None, text_content: bool = True)[source]#

Loads a JSON file and references a jq schema provided to load the text into documents.

Example

[{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text
{"key": [{"text": ...}, {"text": ...}, {"text": ...}]} -> schema = .key[].text
["", "", ""] -> schema = .[]
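Example (an illustrative sketch; the file name and jq schema are hypothetical and the jq python package is assumed to be installed):

from langchain.document_loaders import JSONLoader

loader = JSONLoader(
    file_path="chat_log.json",
    jq_schema=".messages[].content",
)
docs = loader.load()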

load() List[langchain.schema.Document][source]#

Load and return documents from the JSON file.

class langchain.document_loaders.JoplinLoader(access_token: Optional[str] = None, port: int = 41184, host: str = 'localhost')[source]#

Loader that fetches notes from Joplin.

In order to use this loader, you need to have Joplin running with the Web Clipper enabled (look for “Web Clipper” in the app settings).

To get the access token, you need to go to the Web Clipper options and under “Advanced Options” you will find the access token.

You can find more information about the Web Clipper service here: https://joplinapp.org/clipper/

lazy_load() Iterator[langchain.schema.Document][source]#

A lazy loader for document content.

load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.MWDumpLoader(file_path: str, encoding: Optional[str] = 'utf8')[source]#

Load MediaWiki dump from an XML file.

Example

from langchain.document_loaders import MWDumpLoader

loader = MWDumpLoader(
    file_path="myWiki.xml",
    encoding="utf8"
)
docs = loader.load()
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=0
)
texts = text_splitter.split_documents(docs)
Parameters
  • file_path (str) – XML local file path

  • encoding (str, optional) – Charset encoding, defaults to “utf8”

load() List[langchain.schema.Document][source]#

Load from file path.

class langchain.document_loaders.MastodonTootsLoader(mastodon_accounts: Sequence[str], number_toots: Optional[int] = 100, exclude_replies: bool = False, access_token: Optional[str] = None, api_base_url: str = 'https://mastodon.social')[source]#

Mastodon toots loader.

load() List[langchain.schema.Document][source]#

Load toots into documents.

class langchain.document_loaders.MathpixPDFLoader(file_path: str, processed_file_format: str = 'mmd', max_wait_time_seconds: int = 500, should_clean_pdf: bool = False, **kwargs: Any)[source]#
clean_pdf(contents: str) str[source]#
property data: dict#
get_processed_pdf(pdf_id: str) str[source]#
property headers: dict#
load() List[langchain.schema.Document][source]#

Load data into document objects.

send_pdf() str[source]#
property url: str#
wait_for_processing(pdf_id: str) None[source]#
class langchain.document_loaders.MaxComputeLoader(query: str, api_wrapper: langchain.utilities.max_compute.MaxComputeAPIWrapper, *, page_content_columns: Optional[Sequence[str]] = None, metadata_columns: Optional[Sequence[str]] = None)[source]#

Loads a query result from Alibaba Cloud MaxCompute table into documents.

classmethod from_params(query: str, endpoint: str, project: str, *, access_id: Optional[str] = None, secret_access_key: Optional[str] = None, **kwargs: Any) langchain.document_loaders.max_compute.MaxComputeLoader[source]#
Convenience constructor that builds the MaxCompute API wrapper from given parameters.

Parameters
  • query – SQL query to execute.

  • endpoint – MaxCompute endpoint.

  • project – A project is a basic organizational unit of MaxCompute, which is similar to a database.

  • access_id – MaxCompute access ID. Should be passed in directly or set as the environment variable MAX_COMPUTE_ACCESS_ID.

  • secret_access_key – MaxCompute secret access key. Should be passed in directly or set as the environment variable MAX_COMPUTE_SECRET_ACCESS_KEY.

lazy_load() Iterator[langchain.schema.Document][source]#

A lazy loader for document content.

load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.ModernTreasuryLoader(resource: str, organization_id: Optional[str] = None, api_key: Optional[str] = None)[source]#
load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.NotebookLoader(path: str, include_outputs: bool = False, max_output_length: int = 10, remove_newline: bool = False, traceback: bool = False)[source]#

Loader that loads .ipynb notebook files.

load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.NotionDBLoader(integration_token: str, database_id: str, request_timeout_sec: Optional[int] = 10)[source]#

Notion DB Loader. Reads content from pages within a Notion database.

Parameters
  • integration_token (str) – Notion integration token.

  • database_id (str) – Notion database id.

  • request_timeout_sec (int) – Timeout for Notion requests in seconds.
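Example (an illustrative sketch; the integration token and database id are hypothetical placeholders):

from langchain.document_loaders import NotionDBLoader

loader = NotionDBLoader(
    integration_token="secret-token",  # hypothetical token
    database_id="database-id",         # hypothetical database id
    request_timeout_sec=30,
)
docs = loader.load()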

load() List[langchain.schema.Document][source]#

Load documents from the Notion database.

Returns

List of documents.

Return type

List[Document]

load_page(page_id: str) langchain.schema.Document[source]#

Read a page.

class langchain.document_loaders.NotionDirectoryLoader(path: str)[source]#

Loader that loads Notion directory dump.

load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.ObsidianLoader(path: str, encoding: str = 'UTF-8', collect_metadata: bool = True)[source]#

Loader that loads Obsidian files from disk.

FRONT_MATTER_REGEX = re.compile('^---\\n(.*?)\\n---\\n', re.MULTILINE|re.DOTALL)#
load() List[langchain.schema.Document][source]#

Load documents.

pydantic model langchain.document_loaders.OneDriveFileLoader[source]#
field file: File [Required]#
load() List[langchain.schema.Document][source]#

Load Documents

pydantic model langchain.document_loaders.OneDriveLoader[source]#
field auth_with_token: bool = False#
field drive_id: str [Required]#
field folder_path: Optional[str] = None#
field object_ids: Optional[List[str]] = None#
field settings: langchain.document_loaders.onedrive._OneDriveSettings [Optional]#
load() List[langchain.schema.Document][source]#

Loads all supported document files from the specified OneDrive drive and returns a list of Document objects.

Returns

A list of Document objects representing the loaded documents.

Return type

List[Document]

Raises
  • ValueError – If the specified drive ID does not correspond to a drive in the OneDrive storage.

class langchain.document_loaders.OnlinePDFLoader(file_path: str)[source]#

Loader that loads online PDFs.

load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.OutlookMessageLoader(file_path: str)[source]#

Loader that loads Outlook Message files using the extract_msg package (TeamMsgExtractor/msg-extractor).

load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.PDFMinerLoader(file_path: str)[source]#

Loader that uses PDFMiner to load PDF files.

lazy_load() Iterator[langchain.schema.Document][source]#

Lazily load documents.

load() List[langchain.schema.Document][source]#

Eagerly load the content.

class langchain.document_loaders.PDFMinerPDFasHTMLLoader(file_path: str)[source]#

Loader that uses PDFMiner to load PDF files as HTML content.

load() List[langchain.schema.Document][source]#

Load file.

class langchain.document_loaders.PDFPlumberLoader(file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None)[source]#

Loader that uses pdfplumber to load PDF files.

load() List[langchain.schema.Document][source]#

Load file.

langchain.document_loaders.PagedPDFSplitter#

alias of langchain.document_loaders.pdf.PyPDFLoader

class langchain.document_loaders.PlaywrightURLLoader(urls: List[str], continue_on_failure: bool = True, headless: bool = True, remove_selectors: Optional[List[str]] = None)[source]#

Loader that uses Playwright to load a page and unstructured to parse the resulting HTML. This is useful for loading pages that require JavaScript to render.
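Example (an illustrative sketch; the URL is hypothetical and the playwright and unstructured packages are assumed to be installed, with browser binaries set up via playwright install):

from langchain.document_loaders import PlaywrightURLLoader

loader = PlaywrightURLLoader(
    urls=["https://example.com/docs"],
    remove_selectors=["header", "footer"],
)
docs = loader.load()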

urls#

List of URLs to load.

Type

List[str]

continue_on_failure#

If True, continue loading other URLs on failure.

Type

bool

headless#

If True, the browser will run in headless mode.

Type

bool

load() List[langchain.schema.Document][source]#

Load the specified URLs using Playwright and create Document instances.

Returns

A list of Document instances with loaded content.

Return type

List[Document]

class langchain.document_loaders.PsychicLoader(api_key: str, connector_id: str, connection_id: str)[source]#

Loader that loads documents from Psychic.dev.

load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.PyMuPDFLoader(file_path: str)[source]#

Loader that uses PyMuPDF to load PDF files.

load(**kwargs: Optional[Any]) List[langchain.schema.Document][source]#

Load file.

class langchain.document_loaders.PyPDFDirectoryLoader(path: str, glob: str = '**/[!.]*.pdf', silent_errors: bool = False, load_hidden: bool = False, recursive: bool = False)[source]#

Loads a directory with PDF files with pypdf and chunks at character level.

Loader also stores page numbers in metadatas.

load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.PyPDFLoader(file_path: str)[source]#

Loads a PDF with pypdf and chunks at character level.

Loader also stores page numbers in metadatas.

lazy_load() Iterator[langchain.schema.Document][source]#

Lazy load given path as pages.

load() List[langchain.schema.Document][source]#

Load given path as pages.

class langchain.document_loaders.PyPDFium2Loader(file_path: str)[source]#

Loads a PDF with pypdfium2 and chunks at character level.

lazy_load() Iterator[langchain.schema.Document][source]#

Lazy load given path as pages.

load() List[langchain.schema.Document][source]#

Load given path as pages.

class langchain.document_loaders.PySparkDataFrameLoader(spark_session: Optional[SparkSession] = None, df: Optional[Any] = None, page_content_column: str = 'text', fraction_of_memory: float = 0.1)[source]#

Load PySpark DataFrames

get_num_rows() Tuple[int, int][source]#

Gets the number of “feasible” rows for the DataFrame.

lazy_load() Iterator[langchain.schema.Document][source]#

A lazy loader for document content.

load() List[langchain.schema.Document][source]#

Load from the dataframe.

class langchain.document_loaders.PythonLoader(file_path: str)[source]#

Load Python files, respecting any non-default encoding if specified.

class langchain.document_loaders.ReadTheDocsLoader(path: Union[str, pathlib.Path], encoding: Optional[str] = None, errors: Optional[str] = None, custom_html_tag: Optional[Tuple[str, dict]] = None, **kwargs: Optional[Any])[source]#

Loader that loads ReadTheDocs documentation directory dump.

load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.RedditPostsLoader(client_id: str, client_secret: str, user_agent: str, search_queries: Sequence[str], mode: str, categories: Sequence[str] = ['new'], number_posts: Optional[int] = 10)[source]#

Reddit posts loader. Reads posts from a subreddit. First you need to go to https://www.reddit.com/prefs/apps/ and create your application.

load() List[langchain.schema.Document][source]#

Load reddits.

class langchain.document_loaders.RoamLoader(path: str)[source]#

Loader that loads Roam files from disk.

load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.S3DirectoryLoader(bucket: str, prefix: str = '')[source]#

Loading logic for loading documents from s3.

load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.S3FileLoader(bucket: str, key: str)[source]#

Loading logic for loading documents from s3.

load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.SRTLoader(file_path: str)[source]#

Loader for .srt (subtitle) files.

load() List[langchain.schema.Document][source]#

Load file using pysrt.

class langchain.document_loaders.SeleniumURLLoader(urls: List[str], continue_on_failure: bool = True, browser: Literal['chrome', 'firefox'] = 'chrome', binary_location: Optional[str] = None, executable_path: Optional[str] = None, headless: bool = True, arguments: List[str] = [])[source]#

Loader that uses Selenium to load a page and unstructured to parse the resulting HTML. This is useful for loading pages that require JavaScript to render.

urls#

List of URLs to load.

Type

List[str]

continue_on_failure#

If True, continue loading other URLs on failure.

Type

bool

browser#

The browser to use, either ‘chrome’ or ‘firefox’.

Type

str

binary_location#

The location of the browser binary.

Type

Optional[str]

executable_path#

The path to the browser executable.

Type

Optional[str]

headless#

If True, the browser will run in headless mode.

Type

bool

arguments#

List of arguments to pass to the browser.

Type

List[str]

load() List[langchain.schema.Document][source]#

Load the specified URLs using Selenium and create Document instances.

Returns

A list of Document instances with loaded content.

Return type

List[Document]

class langchain.document_loaders.SitemapLoader(web_path: str, filter_urls: Optional[List[str]] = None, parsing_function: Optional[Callable] = None, blocksize: Optional[int] = None, blocknum: int = 0, meta_function: Optional[Callable] = None, is_local: bool = False)[source]#

Loader that fetches a sitemap and loads those URLs.
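Example (an illustrative sketch; the sitemap URL and filter are hypothetical):

from langchain.document_loaders import SitemapLoader

loader = SitemapLoader(
    web_path="https://example.com/sitemap.xml",
    filter_urls=["https://example.com/docs/"],
)
docs = loader.load()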

load() List[langchain.schema.Document][source]#

Load sitemap.

parse_sitemap(soup: Any) List[dict][source]#

Parse sitemap xml and load into a list of dicts.

class langchain.document_loaders.SlackDirectoryLoader(zip_path: str, workspace_url: Optional[str] = None)[source]#

Loader for loading documents from a Slack directory dump.

load() List[langchain.schema.Document][source]#

Load and return documents from the Slack directory dump.

class langchain.document_loaders.SpreedlyLoader(access_token: str, resource: str)[source]#
load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.StripeLoader(resource: str, access_token: Optional[str] = None)[source]#
load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.TelegramChatApiLoader(chat_entity: Optional[EntityLike] = None, api_id: Optional[int] = None, api_hash: Optional[str] = None, username: Optional[str] = None, file_path: str = 'telegram_data.json')[source]#

Loader that loads Telegram chat json directory dump.

async fetch_data_from_telegram() None[source]#

Fetch data from Telegram API and save it as a JSON file.

load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.TelegramChatFileLoader(path: str)[source]#

Loader that loads Telegram chat json directory dump.

load() List[langchain.schema.Document][source]#

Load documents.

langchain.document_loaders.TelegramChatLoader#

alias of langchain.document_loaders.telegram.TelegramChatFileLoader

class langchain.document_loaders.TextLoader(file_path: str, encoding: Optional[str] = None, autodetect_encoding: bool = False)[source]#

Load text files.
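Example (an illustrative sketch; the file name is hypothetical, and autodetect_encoding assumes a character-detection dependency such as chardet is available):

from langchain.document_loaders import TextLoader

loader = TextLoader("notes.txt", autodetect_encoding=True)
docs = loader.load()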

Parameters
  • file_path – Path to the file to load.

  • encoding – File encoding to use. If None, the file will be loaded with the default system encoding.

  • autodetect_encoding – Whether to try to autodetect the file encoding if the specified encoding fails.

load() List[langchain.schema.Document][source]#

Load from file path.

class langchain.document_loaders.ToMarkdownLoader(url: str, api_key: str)[source]#

Loader that loads HTML to markdown using 2markdown.

lazy_load() Iterator[langchain.schema.Document][source]#

Lazily load the file.

load() List[langchain.schema.Document][source]#

Load file.

class langchain.document_loaders.TomlLoader(source: Union[str, pathlib.Path])[source]#

A TOML document loader that inherits from the BaseLoader class.

This class can be initialized with either a single source file or a source directory containing TOML files.

lazy_load() Iterator[langchain.schema.Document][source]#

Lazily load the TOML documents from the source file or directory.

load() List[langchain.schema.Document][source]#

Load and return all documents.

class langchain.document_loaders.TrelloLoader(client: TrelloClient, board_name: str, *, include_card_name: bool = True, include_comments: bool = True, include_checklist: bool = True, card_filter: Literal['closed', 'open', 'all'] = 'all', extra_metadata: Tuple[str, ...] = ('due_date', 'labels', 'list', 'closed'))[source]#

Trello loader. Reads all cards from a Trello board.

classmethod from_credentials(board_name: str, *, api_key: Optional[str] = None, token: Optional[str] = None, **kwargs: Any) langchain.document_loaders.trello.TrelloLoader[source]#

Convenience constructor that builds TrelloClient init param for you.

Parameters
  • board_name – The name of the Trello board.

  • api_key – Trello API key. Can also be specified as environment variable TRELLO_API_KEY.

  • token – Trello token. Can also be specified as environment variable TRELLO_TOKEN.

  • include_card_name – Whether to include the name of the card in the document.

  • include_comments – Whether to include the comments on the card in the document.

  • include_checklist – Whether to include the checklist on the card in the document.

  • card_filter – Filter on card status. Valid values are “closed”, “open”, “all”.

  • extra_metadata – List of additional metadata fields to include as document metadata.Valid values are “due_date”, “labels”, “list”, “closed”.

load() List[langchain.schema.Document][source]#

Loads all cards from the specified Trello board.

You can filter the cards, metadata, and text included by using the optional parameters.

Returns

A list of documents, one for each card in the board.

class langchain.document_loaders.TwitterTweetLoader(auth_handler: Union[OAuthHandler, OAuth2BearerHandler], twitter_users: Sequence[str], number_tweets: Optional[int] = 100)[source]#

Twitter tweets loader. Read tweets of user twitter handle.

First you need to go to https://developer.twitter.com/en/docs/twitter-api/getting-started/getting-access-to-the-twitter-api to get your token, and create a v2 version of the app.

classmethod from_bearer_token(oauth2_bearer_token: str, twitter_users: Sequence[str], number_tweets: Optional[int] = 100) langchain.document_loaders.twitter.TwitterTweetLoader[source]#

Create a TwitterTweetLoader from OAuth2 bearer token.

classmethod from_secrets(access_token: str, access_token_secret: str, consumer_key: str, consumer_secret: str, twitter_users: Sequence[str], number_tweets: Optional[int] = 100) langchain.document_loaders.twitter.TwitterTweetLoader[source]#

Create a TwitterTweetLoader from access tokens and secrets.

load() List[langchain.schema.Document][source]#

Load tweets.

class langchain.document_loaders.UnstructuredAPIFileIOLoader(file: Union[IO, Sequence[IO]], mode: str = 'single', url: str = 'https://api.unstructured.io/general/v0/general', api_key: str = '', **unstructured_kwargs: Any)[source]#

Loader that uses the unstructured web API to load file IO objects.

class langchain.document_loaders.UnstructuredAPIFileLoader(file_path: Union[str, List[str]] = '', mode: str = 'single', url: str = 'https://api.unstructured.io/general/v0/general', api_key: str = '', **unstructured_kwargs: Any)[source]#

Loader that uses the unstructured web API to load files.

class langchain.document_loaders.UnstructuredCSVLoader(file_path: str, mode: str = 'single', **unstructured_kwargs: Any)[source]#

Loader that uses unstructured to load CSV files.

class langchain.document_loaders.UnstructuredEPubLoader(file_path: Union[str, List[str]], mode: str = 'single', **unstructured_kwargs: Any)[source]#

Loader that uses unstructured to load epub files.

class langchain.document_loaders.UnstructuredEmailLoader(file_path: Union[str, List[str]], mode: str = 'single', **unstructured_kwargs: Any)[source]#

Loader that uses unstructured to load email files.

class langchain.document_loaders.UnstructuredExcelLoader(file_path: str, mode: str = 'single', **unstructured_kwargs: Any)[source]#

Loader that uses unstructured to load Microsoft Excel files.

class langchain.document_loaders.UnstructuredFileIOLoader(file: Union[IO, Sequence[IO]], mode: str = 'single', **unstructured_kwargs: Any)[source]#

Loader that uses unstructured to load file IO objects.

class langchain.document_loaders.UnstructuredFileLoader(file_path: Union[str, List[str]], mode: str = 'single', **unstructured_kwargs: Any)[source]#

Loader that uses unstructured to load files.
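Example (an illustrative sketch; the file name is hypothetical and the unstructured package is assumed to be installed):

from langchain.document_loaders import UnstructuredFileLoader

loader = UnstructuredFileLoader("report.pdf", mode="elements")
docs = loader.load()  # one Document per detected element in "elements" mode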

class langchain.document_loaders.UnstructuredHTMLLoader(file_path: Union[str, List[str]], mode: str = 'single', **unstructured_kwargs: Any)[source]#

Loader that uses unstructured to load HTML files.

class langchain.document_loaders.UnstructuredImageLoader(file_path: Union[str, List[str]], mode: str = 'single', **unstructured_kwargs: Any)[source]#

Loader that uses unstructured to load image files, such as PNGs and JPGs.

class langchain.document_loaders.UnstructuredMarkdownLoader(file_path: Union[str, List[str]], mode: str = 'single', **unstructured_kwargs: Any)[source]#

Loader that uses unstructured to load markdown files.

class langchain.document_loaders.UnstructuredODTLoader(file_path: str, mode: str = 'single', **unstructured_kwargs: Any)[source]#

Loader that uses unstructured to load open office ODT files.

class langchain.document_loaders.UnstructuredPDFLoader(file_path: Union[str, List[str]], mode: str = 'single', **unstructured_kwargs: Any)[source]#

Loader that uses unstructured to load PDF files.

class langchain.document_loaders.UnstructuredPowerPointLoader(file_path: Union[str, List[str]], mode: str = 'single', **unstructured_kwargs: Any)[source]#

Loader that uses unstructured to load powerpoint files.

class langchain.document_loaders.UnstructuredRTFLoader(file_path: str, mode: str = 'single', **unstructured_kwargs: Any)[source]#

Loader that uses unstructured to load rtf files.

class langchain.document_loaders.UnstructuredURLLoader(urls: List[str], continue_on_failure: bool = True, mode: str = 'single', **unstructured_kwargs: Any)[source]#

Loader that uses unstructured to load HTML files.

load() List[langchain.schema.Document][source]#

Load file.

class langchain.document_loaders.UnstructuredWordDocumentLoader(file_path: Union[str, List[str]], mode: str = 'single', **unstructured_kwargs: Any)[source]#

Loader that uses unstructured to load word documents.

class langchain.document_loaders.WeatherDataLoader(client: langchain.utilities.openweathermap.OpenWeatherMapAPIWrapper, places: Sequence[str])[source]#

Weather Reader.

Reads the forecast & current weather of any location using OpenWeatherMap's free API. Check out https://openweathermap.org/appid for more on how to generate a free OpenWeatherMap API key.

classmethod from_params(places: Sequence[str], *, openweathermap_api_key: Optional[str] = None) langchain.document_loaders.weather.WeatherDataLoader[source]#
lazy_load() Iterator[langchain.schema.Document][source]#

Lazily load weather data for the given locations.

load() List[langchain.schema.Document][source]#

Load weather data for the given locations.

class langchain.document_loaders.WebBaseLoader(web_path: Union[str, List[str]], header_template: Optional[dict] = None)[source]#

Loader that uses urllib and beautiful soup to load webpages.
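Example (an illustrative sketch; the URLs are hypothetical):

from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader(["https://example.com/a", "https://example.com/b"])
loader.requests_per_second = 1  # throttle concurrent requests
docs = loader.aload()  # or loader.load() for the synchronous version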

aload() List[langchain.schema.Document][source]#

Asynchronously load text from the urls in web_path into Documents.

default_parser: str = 'html.parser'#

Default parser to use for BeautifulSoup.

async fetch_all(urls: List[str]) Any[source]#

Fetch all urls concurrently with rate limiting.

load() List[langchain.schema.Document][source]#

Load text from the url(s) in web_path.

requests_kwargs: Dict[str, Any] = {}#

kwargs for requests

requests_per_second: int = 2#

Max number of concurrent requests to make.

scrape(parser: Optional[str] = None) Any[source]#

Scrape data from webpage and return it in BeautifulSoup format.

scrape_all(urls: List[str], parser: Optional[str] = None) List[Any][source]#

Fetch all urls, then return soups for all results.

property web_path: str#
web_paths: List[str]#
class langchain.document_loaders.WhatsAppChatLoader(path: str)[source]#

Loader that loads WhatsApp messages text file.

load() List[langchain.schema.Document][source]#

Load documents.

class langchain.document_loaders.WikipediaLoader(query: str, lang: str = 'en', load_max_docs: Optional[int] = 100, load_all_available_meta: Optional[bool] = False)[source]#

Loads a query result from www.wikipedia.org into a list of Documents. The hard limit on the number of downloaded Documents is 300 for now.

Each wiki page represents one Document.

load() List[langchain.schema.Document][source]#

Load data into document objects.

class langchain.document_loaders.YoutubeLoader(video_id: str, add_video_info: bool = False, language: Union[str, Sequence[str]] = 'en', translation: str = 'en', continue_on_failure: bool = False)[source]#

Loader that loads Youtube transcripts.
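Example (an illustrative sketch; the video URL is hypothetical, and add_video_info assumes the pytube package is installed in addition to youtube_transcript_api):

from langchain.document_loaders import YoutubeLoader

loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=VIDEO_ID",  # hypothetical URL
    add_video_info=True,
)
docs = loader.load()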

static extract_video_id(youtube_url: str) str[source]#

Extract video id from common YT urls.

classmethod from_youtube_url(youtube_url: str, **kwargs: Any) langchain.document_loaders.youtube.YoutubeLoader[source]#

Given youtube URL, load video.

load() List[langchain.schema.Document][source]#

Load documents.