Source code for langchain_community.document_transformers.doctran_text_translate

from __future__ import annotations

import asyncio
from typing import Any, Optional, Sequence

from langchain_core.documents import BaseDocumentTransformer, Document
from langchain_core.runnables.config import run_in_executor
from langchain_core.utils import get_from_env


class DoctranTextTranslator(BaseDocumentTransformer):
    """Translate text documents using doctran.

    Arguments:
        openai_api_key: OpenAI API key. Can also be specified via environment
            variable ``OPENAI_API_KEY``.
        language: The language to translate *to*.
        openai_api_model: OpenAI model name handed to doctran. Can also be
            specified via environment variable ``OPENAI_API_MODEL``.

    Example:
        .. code-block:: python

            from langchain_community.document_transformers import DoctranTextTranslator

            # Pass in openai_api_key or set env var OPENAI_API_KEY
            qa_translator = DoctranTextTranslator(language="spanish")
            translated_document = await qa_translator.atransform_documents(documents)
    """

    def __init__(
        self,
        openai_api_key: Optional[str] = None,
        language: str = "english",
        openai_api_model: Optional[str] = None,
    ) -> None:
        """Store credentials, model name and target language.

        Explicit arguments take precedence; a missing (or empty) value falls
        back to the matching environment variable through ``get_from_env``.

        NOTE(review): ``get_from_env`` is called without a default, so it
        presumably raises when the variable is also unset -- confirm against
        ``langchain_core.utils.get_from_env``.
        """
        self.openai_api_key = openai_api_key or get_from_env(
            "openai_api_key", "OPENAI_API_KEY"
        )
        self.openai_api_model = openai_api_model or get_from_env(
            "openai_api_model", "OPENAI_API_MODEL"
        )
        self.language = language

    def _create_doctran(self) -> Any:
        """Import doctran lazily and build a client from the stored settings.

        Raises:
            ImportError: If the ``doctran`` package is not installed.
        """
        # Only the import is guarded, so errors raised while constructing the
        # client are not misreported as a missing package.
        try:
            from doctran import Doctran
        except ImportError as exc:
            raise ImportError(
                "Install doctran to use this parser. (pip install doctran)"
            ) from exc
        return Doctran(
            openai_api_key=self.openai_api_key, openai_model=self.openai_api_model
        )

    async def _aparse_document(
        self, doctran: Any, index: int, doc: Document
    ) -> tuple[int, Any]:
        """Parse one document via ``run_in_executor``; return ``(index, parsed)``."""
        parsed_doc = await run_in_executor(
            None, doctran.parse, content=doc.page_content, metadata=doc.metadata
        )
        return index, parsed_doc

    async def _atranslate_document(
        self, index: int, doc: Any, language: str
    ) -> tuple[int, Any]:
        """Translate one parsed document via ``run_in_executor``.

        Returns ``(index, translated)``.
        """
        translated_doc = await run_in_executor(
            None, lambda: doc.translate(language=language).execute()
        )
        return index, translated_doc

    async def atransform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Translates text documents using doctran.

        All documents are parsed concurrently, then all parsed documents are
        translated concurrently into ``self.language``.

        Args:
            documents: Documents whose ``page_content`` should be translated.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            New ``Document`` objects, in the same order as ``documents``,
            built from each result's ``transformed_content`` and ``metadata``.

        Raises:
            ImportError: If the ``doctran`` package is not installed.
        """
        doctran = self._create_doctran()

        # asyncio.gather returns results in the order the awaitables were
        # passed, so no re-sorting by index is needed.
        parsed_results = await asyncio.gather(
            *(
                self._aparse_document(doctran, i, doc)
                for i, doc in enumerate(documents)
            )
        )
        translated_results = await asyncio.gather(
            *(
                self._atranslate_document(i, parsed_doc, self.language)
                for i, (_, parsed_doc) in enumerate(parsed_results)
            )
        )
        return [
            Document(page_content=doc.transformed_content, metadata=doc.metadata)
            for _, doc in translated_results
        ]

    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Translates text documents using doctran.

        Every document is parsed first; the parsed documents are then
        translated one after another into ``self.language``.

        Args:
            documents: Documents whose ``page_content`` should be translated.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            New ``Document`` objects, in the same order as ``documents``,
            built from each result's ``transformed_content`` and ``metadata``.

        Raises:
            ImportError: If the ``doctran`` package is not installed.
        """
        doctran = self._create_doctran()
        parsed_docs = [
            doctran.parse(content=doc.page_content, metadata=doc.metadata)
            for doc in documents
        ]
        translated_docs = [
            parsed_doc.translate(language=self.language).execute()
            for parsed_doc in parsed_docs
        ]
        return [
            Document(page_content=doc.transformed_content, metadata=doc.metadata)
            for doc in translated_docs
        ]