Source code for langchain_experimental.tabular_synthetic_data.base

import asyncio
from typing import Any, Dict, List, Optional, Union, cast

from langchain.chains.base import Chain
from langchain.chains.llm import LLMChain
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts.few_shot import FewShotPromptTemplate
from langchain_core.utils.pydantic import is_basemodel_instance
from pydantic import BaseModel, ConfigDict, model_validator
from typing_extensions import Self


[docs] class SyntheticDataGenerator(BaseModel): """Generate synthetic data using the given LLM and few-shot template. Utilizes the provided LLM to produce synthetic data based on the few-shot prompt template. Attributes: template (FewShotPromptTemplate): Template for few-shot prompting. llm (Optional[BaseLanguageModel]): Large Language Model to use for generation. llm_chain (Optional[Chain]): LLM chain with the LLM and few-shot template. example_input_key (str): Key to use for storing example inputs. Usage Example: >>> template = FewShotPromptTemplate(...) >>> llm = BaseLanguageModel(...) >>> generator = SyntheticDataGenerator(template=template, llm=llm) >>> results = generator.generate(subject="climate change", runs=5) """ template: FewShotPromptTemplate llm: Optional[BaseLanguageModel] = None results: list = [] llm_chain: Optional[Chain] = None example_input_key: str = "example" model_config = ConfigDict( validate_assignment=True, ) @model_validator(mode="after") def set_llm_chain(self) -> Self: llm_chain = self.llm_chain llm = self.llm few_shot_template = self.template if not llm_chain: # If llm_chain is None or not present if llm is None or few_shot_template is None: raise ValueError( "Both llm and few_shot_template must be provided if llm_chain is " "not given." ) self.llm_chain = LLMChain(llm=llm, prompt=few_shot_template) return self @staticmethod def _format_dict_to_string(input_dict: Dict) -> str: formatted_str = ", ".join( [f"{key}: {value}" for key, value in input_dict.items()] ) return formatted_str def _update_examples(self, example: Union[BaseModel, Dict[str, Any], str]) -> None: """Prevents duplicates by adding previously generated examples to the few shot list.""" if self.template and self.template.examples: if is_basemodel_instance(example): formatted_example = self._format_dict_to_string( cast(BaseModel, example).dict() ) elif isinstance(example, dict): formatted_example = self._format_dict_to_string(example) else: formatted_example = str(example) self.template.examples.pop(0) self.template.examples.append({self.example_input_key: formatted_example})
[docs] def generate(self, subject: str, runs: int, *args: Any, **kwargs: Any) -> List[str]: """Generate synthetic data using the given subject string. Args: subject (str): The subject the synthetic data will be about. runs (int): Number of times to generate the data. extra (str): Extra instructions for steerability in data generation. Returns: List[str]: List of generated synthetic data. Usage Example: >>> results = generator.generate(subject="climate change", runs=5, extra="Focus on environmental impacts.") """ if self.llm_chain is None: raise ValueError( "llm_chain is none, either set either llm_chain or llm at generator " "construction" ) for _ in range(runs): result = self.llm_chain.run(subject=subject, *args, **kwargs) self.results.append(result) self._update_examples(result) return self.results
[docs] async def agenerate( self, subject: str, runs: int, extra: str = "", *args: Any, **kwargs: Any ) -> List[str]: """Generate synthetic data using the given subject asynchronously. Note: Since the LLM calls run concurrently, you may have fewer duplicates by adding specific instructions to the "extra" keyword argument. Args: subject (str): The subject the synthetic data will be about. runs (int): Number of times to generate the data asynchronously. extra (str): Extra instructions for steerability in data generation. Returns: List[str]: List of generated synthetic data for the given subject. Usage Example: >>> results = await generator.agenerate(subject="climate change", runs=5, extra="Focus on env impacts.") """ async def run_chain( subject: str, extra: str = "", *args: Any, **kwargs: Any ) -> None: if self.llm_chain is not None: result = await self.llm_chain.arun( subject=subject, extra=extra, *args, **kwargs ) self.results.append(result) await asyncio.gather( *(run_chain(subject=subject, extra=extra) for _ in range(runs)) ) return self.results