Source code for langchain_community.document_transformers.nuclia_text_transform
import asyncio
import json
import uuid
from typing import Any, Sequence
from langchain_core.documents import BaseDocumentTransformer, Document
from langchain_community.tools.nuclia.tool import NucliaUnderstandingAPI
class NucliaTextTransformer(BaseDocumentTransformer):
    """Document transformer backed by the Nuclia Understanding API.

    The Nuclia Understanding API breaks text into paragraphs and
    sentences, identifies entities, produces a summary of the text and
    generates embeddings for every sentence.
    """

    def __init__(self, nua: NucliaUnderstandingAPI):
        """Keep a reference to the Nuclia Understanding API tool.

        Args:
            nua: Tool instance whose ``arun`` coroutine is awaited by
                ``atransform_documents``.
        """
        self.nua = nua

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Synchronous transformation is not provided.

        Raises:
            NotImplementedError: Always; use ``atransform_documents``.
        """
        raise NotImplementedError

    async def atransform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Push every document to Nuclia and attach the parsed results.

        One ``"push"`` request is issued per document, each tagged with a
        freshly generated UUID, and all requests are awaited together.
        Each result is decoded as JSON; the first entry of its
        ``"file_extracted_data"`` and of its ``"field_metadata"`` are
        stored under ``doc.metadata["nuclia"]``.

        Args:
            documents: Documents whose ``page_content`` is sent for
                processing. Their ``metadata`` is updated in place.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            The same ``documents`` sequence that was passed in.
        """
        pending = []
        for document in documents:
            request = {
                "action": "push",
                "id": str(uuid.uuid4()),
                "text": document.page_content,
                "path": None,
            }
            pending.append(self.nua.arun(request))

        # Await all pushes together; gather returns results in the same
        # order as the awaitables, so they line up with `documents`.
        responses = await asyncio.gather(*pending)

        for document, raw in zip(documents, responses):
            parsed = json.loads(raw)
            file_data = parsed["file_extracted_data"][0]
            field_data = parsed["field_metadata"][0]
            document.metadata["nuclia"] = {
                "file": file_data,
                "metadata": field_data,
            }
        return documents