class ExLlamaV2(LLM):
    """ExLlamaV2 API.

    - Works only with GPTQ models for now.
    - LoRA models are not supported yet.

    To use, you should have the exllamav2 library installed, and provide the
    path to the Llama model as a named parameter to the constructor.
    Check out: https://github.com/turboderp/exllamav2

    Example:
        .. code-block:: python

            from langchain_community.llms import ExLlamaV2

            llm = ExLlamaV2(model_path="/path/to/llama/model")

    TODO:
        - Add LoRA support
        - Add support for custom settings
        - Add support for custom stop sequences
    """

    client: Any = None
    model_path: str
    exllama_cache: Any = None
    config: Any = None
    generator: Any = None
    tokenizer: Any = None
    # Sampler settings passed to the generator. A value is required for now:
    # passing None raises NotImplementedError during validation.
    settings: Any = None

    # LangChain parameters
    logfunc: Callable = print

    stop_sequences: List[str] = Field([])
    """Sequences that immediately will stop the generator."""

    max_new_tokens: int = Field(150)
    """Maximum number of tokens to generate."""

    streaming: bool = Field(True)
    """Whether to stream the results, token by token."""

    verbose: bool = Field(True)
    """Whether to print debug information."""

    # Generator parameters
    disallowed_tokens: Optional[List[int]] = Field(None)
    """List of tokens to disallow during generation."""
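    # A minimal construction sketch (illustrative, not executed here). A sampler
    # `settings` object must be supplied, since validation below rejects a missing
    # value; the ExLlamaV2Sampler import is assumed to come from the installed
    # exllamav2 package, and the model path is a placeholder.
    #
    #     from exllamav2.generator import ExLlamaV2Sampler
    #
    #     settings = ExLlamaV2Sampler.Settings()
    #     settings.temperature = 0.85
    #     llm = ExLlamaV2(model_path="/path/to/model", settings=settings)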
    @pre_init
    def validate_environment(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Validate that CUDA and the exllamav2 library are available, then
        load the model, cache, tokenizer and generator."""
        try:
            import torch
        except ImportError as e:
            raise ImportError(
                "Unable to import torch, please install with `pip install torch`."
            ) from e

        # ExLlamaV2 requires a CUDA-capable GPU.
        if not torch.cuda.is_available():
            raise EnvironmentError("CUDA is not available. ExLlamaV2 requires CUDA.")

        try:
            from exllamav2 import (
                ExLlamaV2,
                ExLlamaV2Cache,
                ExLlamaV2Config,
                ExLlamaV2Tokenizer,
            )
            from exllamav2.generator import (
                ExLlamaV2BaseGenerator,
                ExLlamaV2StreamingGenerator,
            )
        except ImportError as e:
            raise ImportError(
                "Could not import the exllamav2 library. "
                "Please install it (CUDA 12.1 is required), for example: "
                "python -m pip install https://github.com/turboderp/exllamav2/releases/download/v0.0.12/exllamav2-0.0.12+cu121-cp311-cp311-linux_x86_64.whl"
            ) from e

        # Use the provided logging function when verbose, otherwise silence logging.
        verbose = values["verbose"]
        if not verbose:
            values["logfunc"] = lambda *args, **kwargs: None
        logfunc = values["logfunc"]

        if values["settings"]:
            settings = values["settings"]
            logfunc(settings.__dict__)
        else:
            raise NotImplementedError(
                "settings is required: running without explicit sampler settings "
                "is not supported yet."
            )

        # Load the model configuration from the model directory.
        config = ExLlamaV2Config()
        config.model_dir = values["model_path"]
        config.prepare()

        # Load the model with a lazily allocated cache, auto-splitting across GPUs.
        model = ExLlamaV2(config)
        exllama_cache = ExLlamaV2Cache(model, lazy=True)
        model.load_autosplit(exllama_cache)
        tokenizer = ExLlamaV2Tokenizer(config)

        if values["streaming"]:
            generator = ExLlamaV2StreamingGenerator(model, exllama_cache, tokenizer)
        else:
            generator = ExLlamaV2BaseGenerator(model, exllama_cache, tokenizer)

        # Configure stop sequences and disallowed tokens on the sampler settings.
        values["stop_sequences"] = [
            x.strip().lower() for x in values["stop_sequences"]
        ]
        setattr(settings, "stop_sequences", values["stop_sequences"])
        logfunc(f"stop_sequences {values['stop_sequences']}")

        disallowed = values.get("disallowed_tokens")
        if disallowed:
            settings.disallow_tokens(tokenizer, disallowed)

        values["client"] = model
        values["generator"] = generator
        values["config"] = config
        values["tokenizer"] = tokenizer
        values["exllama_cache"] = exllama_cache

        return values
    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "ExLlamaV2"
    def get_num_tokens(self, text: str) -> int:
        """Get the number of tokens present in the text."""
        return self.generator.tokenizer.num_tokens(text)
    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Generate text from the prompt, streaming internally when enabled."""
        generator = self.generator

        if self.streaming:
            # Stream internally and join the chunks into a single string.
            combined_text_output = ""
            for chunk in self._stream(
                prompt=prompt, stop=stop, run_manager=run_manager, **kwargs
            ):
                combined_text_output += chunk.text
            return combined_text_output
        else:
            output = generator.generate_simple(
                prompt=prompt,
                gen_settings=self.settings,
                num_tokens=self.max_new_tokens,
            )
            # Strip the prompt from the generated text.
            output = output[len(prompt):]
            return output

    def _stream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        """Stream generated text chunk by chunk."""
        input_ids = self.tokenizer.encode(prompt)
        self.generator.warmup()
        self.generator.set_stop_conditions([])
        self.generator.begin_stream(input_ids, self.settings)

        generated_tokens = 0
        while True:
            chunk, eos, _ = self.generator.stream()
            generated_tokens += 1
            if run_manager:
                run_manager.on_llm_new_token(
                    token=chunk,
                    verbose=self.verbose,
                )
            # Wrap the raw text chunk to match the declared return type.
            yield GenerationChunk(text=chunk)
            if eos or generated_tokens == self.max_new_tokens:
                break
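# Usage sketch (illustrative): the wrapper is driven through the standard LangChain
# LLM interface; `stream()` goes through _stream above and `invoke()` through _call.
# The prompts below are placeholders.
#
#     output = llm.invoke("What is the capital of France?")
#
#     for chunk in llm.stream("Write a haiku about GPUs."):
#         print(chunk, end="", flush=True)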