Source code for langchain_community.document_loaders.needle
from typing import Dict, Iterator, List, Optional
from langchain_core.document_loaders.base import BaseLoader
from langchain_core.documents import Document
[docs]
class NeedleLoader(BaseLoader):
"""
NeedleLoader is a document loader for managing documents stored in a collection.
Setup:
Install the `needle-python` library and set your Needle API key.
.. code-block:: bash
pip install needle-python
export NEEDLE_API_KEY="your-api-key"
Key init args:
- `needle_api_key` (Optional[str]): API key for authenticating with Needle.
- `collection_id` (str): Needle collection to load documents from.
Usage:
.. code-block:: python
from langchain_community.document_loaders.needle import NeedleLoader
loader = NeedleLoader(
needle_api_key="your-api-key",
collection_id="your-collection-id"
)
# Load documents
documents = loader.load()
for doc in documents:
print(doc.metadata)
# Lazy load documents
for doc in loader.lazy_load():
print(doc.metadata)
"""
[docs]
def __init__(
self,
needle_api_key: Optional[str] = None,
collection_id: Optional[str] = None,
) -> None:
"""
Initializes the NeedleLoader with API key and collection ID.
Args:
needle_api_key (Optional[str]): API key for authenticating with Needle.
collection_id (Optional[str]): Identifier for the Needle collection.
Raises:
ImportError: If the `needle-python` library is not installed.
ValueError: If the collection ID is not provided.
"""
try:
from needle.v1 import NeedleClient
except ImportError:
raise ImportError(
"Please install with `pip install needle-python` to use NeedleLoader."
)
super().__init__()
self.needle_api_key = needle_api_key
self.collection_id = collection_id
self.client: Optional[NeedleClient] = None
if self.needle_api_key:
self.client = NeedleClient(api_key=self.needle_api_key)
if not self.collection_id:
raise ValueError("Collection ID must be provided.")
def _get_collection(self) -> None:
"""
Ensures the Needle collection is set and the client is initialized.
Raises:
ValueError: If the Needle client is not initialized or
if the collection ID is missing.
"""
if self.client is None:
raise ValueError(
"NeedleClient is not initialized. Provide a valid API key."
)
if not self.collection_id:
raise ValueError("Collection ID must be provided.")
[docs]
def add_files(self, files: Dict[str, str]) -> None:
"""
Adds files to the Needle collection.
Args:
files (Dict[str, str]): Dictionary where keys are file names and values
are file URLs.
Raises:
ImportError: If the `needle-python` library is not installed.
ValueError: If the collection is not properly initialized.
"""
try:
from needle.v1.models import FileToAdd
except ImportError:
raise ImportError(
"Please install with `pip install needle-python` to add files."
)
self._get_collection()
assert self.client is not None, "NeedleClient must be initialized."
files_to_add = [FileToAdd(name=name, url=url) for name, url in files.items()]
self.client.collections.files.add(
collection_id=self.collection_id, files=files_to_add
)
def _fetch_documents(self) -> List[Document]:
"""
Fetches metadata for documents from the Needle collection.
Returns:
List[Document]: A list of documents with metadata. Content is excluded.
Raises:
ValueError: If the collection is not properly initialized.
"""
self._get_collection()
assert self.client is not None, "NeedleClient must be initialized."
files = self.client.collections.files.list(self.collection_id)
docs = [
Document(
page_content="", # Needle doesn't provide file content fetching
metadata={
"source": file.url,
"title": file.name,
"size": getattr(file, "size", None),
},
)
for file in files
if file.status == "indexed"
]
return docs
[docs]
def load(self) -> List[Document]:
"""
Loads all documents from the Needle collection.
Returns:
List[Document]: A list of documents from the collection.
"""
return self._fetch_documents()
[docs]
def lazy_load(self) -> Iterator[Document]:
"""
Lazily loads documents from the Needle collection.
Yields:
Iterator[Document]: An iterator over the documents.
"""
yield from self._fetch_documents()