class SyntheticDataGenerator(BaseModel):
    """Generate synthetic data using the given LLM and few-shot template.

    Utilizes the provided LLM to produce synthetic data based on the
    few-shot prompt template.

    Attributes:
        template (FewShotPromptTemplate): Template for few-shot prompting.
        llm (Optional[BaseLanguageModel]): Large Language Model to use for
            generation.
        llm_chain (Optional[Chain]): LLM chain with the LLM and few-shot
            template.
        example_input_key (str): Key to use for storing example inputs.

    Usage Example:
        >>> template = FewShotPromptTemplate(...)
        >>> llm = BaseLanguageModel(...)
        >>> generator = SyntheticDataGenerator(template=template, llm=llm)
        >>> results = generator.generate(subject="climate change", runs=5)
    """

    template: FewShotPromptTemplate
    llm: Optional[BaseLanguageModel] = None
    results: list = []
    llm_chain: Optional[Chain] = None
    example_input_key: str = "example"

    model_config = ConfigDict(
        validate_assignment=True,
    )

    @model_validator(mode="after")
    def set_llm_chain(self) -> Self:
        """Build ``llm_chain`` from ``llm`` + ``template`` when not supplied."""
        if not self.llm_chain:
            # A chain can only be assembled when both pieces are present.
            if self.llm is None or self.template is None:
                raise ValueError(
                    "Both llm and few_shot_template must be provided if llm_chain is "
                    "not given."
                )
            self.llm_chain = LLMChain(llm=self.llm, prompt=self.template)
        return self

    @staticmethod
    def _format_dict_to_string(input_dict: Dict) -> str:
        """Render ``input_dict`` as a comma-separated ``key: value`` string."""
        return ", ".join(f"{key}: {value}" for key, value in input_dict.items())

    def _update_examples(self, example: Union[BaseModel, Dict[str, Any], str]) -> None:
        """Prevents duplicates by adding previously generated examples to the
        few shot list."""
        if self.template and self.template.examples:
            # Normalize the example to a flat string before storing it.
            if is_basemodel_instance(example):
                formatted_example = self._format_dict_to_string(
                    cast(BaseModel, example).dict()
                )
            elif isinstance(example, dict):
                formatted_example = self._format_dict_to_string(example)
            else:
                formatted_example = str(example)
            # Rotate the window: drop the oldest example, append the newest.
            self.template.examples.pop(0)
            self.template.examples.append({self.example_input_key: formatted_example})
[docs]defgenerate(self,subject:str,runs:int,*args:Any,**kwargs:Any)->List[str]:"""Generate synthetic data using the given subject string. Args: subject (str): The subject the synthetic data will be about. runs (int): Number of times to generate the data. extra (str): Extra instructions for steerability in data generation. Returns: List[str]: List of generated synthetic data. Usage Example: >>> results = generator.generate(subject="climate change", runs=5, extra="Focus on environmental impacts.") """ifself.llm_chainisNone:raiseValueError("llm_chain is none, either set either llm_chain or llm at generator ""construction")for_inrange(runs):result=self.llm_chain.run(subject=subject,*args,**kwargs)self.results.append(result)self._update_examples(result)returnself.results
[docs]asyncdefagenerate(self,subject:str,runs:int,extra:str="",*args:Any,**kwargs:Any)->List[str]:"""Generate synthetic data using the given subject asynchronously. Note: Since the LLM calls run concurrently, you may have fewer duplicates by adding specific instructions to the "extra" keyword argument. Args: subject (str): The subject the synthetic data will be about. runs (int): Number of times to generate the data asynchronously. extra (str): Extra instructions for steerability in data generation. Returns: List[str]: List of generated synthetic data for the given subject. Usage Example: >>> results = await generator.agenerate(subject="climate change", runs=5, extra="Focus on env impacts.") """asyncdefrun_chain(subject:str,extra:str="",*args:Any,**kwargs:Any)->None:ifself.llm_chainisnotNone:result=awaitself.llm_chain.arun(subject=subject,extra=extra,*args,**kwargs)self.results.append(result)awaitasyncio.gather(*(run_chain(subject=subject,extra=extra)for_inrange(runs)))returnself.results