Source code for langchain_experimental.tabular_synthetic_data.base
import asyncio
from typing import Any, Dict, List, Optional, Union, cast
from langchain.chains.base import Chain
from langchain.chains.llm import LLMChain
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts.few_shot import FewShotPromptTemplate
from langchain_core.utils.pydantic import is_basemodel_instance
from pydantic import BaseModel, ConfigDict, model_validator
from typing_extensions import Self
[docs]
class SyntheticDataGenerator(BaseModel):
"""Generate synthetic data using the given LLM and few-shot template.
Utilizes the provided LLM to produce synthetic data based on the
few-shot prompt template.
Attributes:
template (FewShotPromptTemplate): Template for few-shot prompting.
llm (Optional[BaseLanguageModel]): Large Language Model to use for generation.
llm_chain (Optional[Chain]): LLM chain with the LLM and few-shot template.
example_input_key (str): Key to use for storing example inputs.
Usage Example:
>>> template = FewShotPromptTemplate(...)
>>> llm = BaseLanguageModel(...)
>>> generator = SyntheticDataGenerator(template=template, llm=llm)
>>> results = generator.generate(subject="climate change", runs=5)
"""
template: FewShotPromptTemplate
llm: Optional[BaseLanguageModel] = None
results: list = []
llm_chain: Optional[Chain] = None
example_input_key: str = "example"
model_config = ConfigDict(
validate_assignment=True,
)
@model_validator(mode="after")
def set_llm_chain(self) -> Self:
llm_chain = self.llm_chain
llm = self.llm
few_shot_template = self.template
if not llm_chain: # If llm_chain is None or not present
if llm is None or few_shot_template is None:
raise ValueError(
"Both llm and few_shot_template must be provided if llm_chain is "
"not given."
)
self.llm_chain = LLMChain(llm=llm, prompt=few_shot_template)
return self
@staticmethod
def _format_dict_to_string(input_dict: Dict) -> str:
formatted_str = ", ".join(
[f"{key}: {value}" for key, value in input_dict.items()]
)
return formatted_str
def _update_examples(self, example: Union[BaseModel, Dict[str, Any], str]) -> None:
"""Prevents duplicates by adding previously generated examples to the few shot
list."""
if self.template and self.template.examples:
if is_basemodel_instance(example):
formatted_example = self._format_dict_to_string(
cast(BaseModel, example).dict()
)
elif isinstance(example, dict):
formatted_example = self._format_dict_to_string(example)
else:
formatted_example = str(example)
self.template.examples.pop(0)
self.template.examples.append({self.example_input_key: formatted_example})
[docs]
def generate(self, subject: str, runs: int, *args: Any, **kwargs: Any) -> List[str]:
"""Generate synthetic data using the given subject string.
Args:
subject (str): The subject the synthetic data will be about.
runs (int): Number of times to generate the data.
extra (str): Extra instructions for steerability in data generation.
Returns:
List[str]: List of generated synthetic data.
Usage Example:
>>> results = generator.generate(subject="climate change", runs=5,
extra="Focus on environmental impacts.")
"""
if self.llm_chain is None:
raise ValueError(
"llm_chain is none, either set either llm_chain or llm at generator "
"construction"
)
for _ in range(runs):
result = self.llm_chain.run(subject=subject, *args, **kwargs)
self.results.append(result)
self._update_examples(result)
return self.results
[docs]
async def agenerate(
self, subject: str, runs: int, extra: str = "", *args: Any, **kwargs: Any
) -> List[str]:
"""Generate synthetic data using the given subject asynchronously.
Note: Since the LLM calls run concurrently,
you may have fewer duplicates by adding specific instructions to
the "extra" keyword argument.
Args:
subject (str): The subject the synthetic data will be about.
runs (int): Number of times to generate the data asynchronously.
extra (str): Extra instructions for steerability in data generation.
Returns:
List[str]: List of generated synthetic data for the given subject.
Usage Example:
>>> results = await generator.agenerate(subject="climate change", runs=5,
extra="Focus on env impacts.")
"""
async def run_chain(
subject: str, extra: str = "", *args: Any, **kwargs: Any
) -> None:
if self.llm_chain is not None:
result = await self.llm_chain.arun(
subject=subject, extra=extra, *args, **kwargs
)
self.results.append(result)
await asyncio.gather(
*(run_chain(subject=subject, extra=extra) for _ in range(runs))
)
return self.results