class ChatLlamaCpp(BaseChatModel):
    """llama.cpp model.

    To use, you should have the llama-cpp-python library installed, and provide the
    path to the Llama model as a named parameter to the constructor.
    Check out: https://github.com/abetlen/llama-cpp-python
    """

    client: Any = None  #: :meta private:

    model_path: str
    """The path to the Llama model file."""

    lora_base: Optional[str] = None
    """The path to the Llama LoRA base model."""

    lora_path: Optional[str] = None
    """The path to the Llama LoRA. If None, no LoRA is loaded."""

    n_ctx: int = 512
    """Token context window."""

    n_parts: int = -1
    """Number of parts to split the model into.
    If -1, the number of parts is automatically determined."""

    seed: int = -1
    """Seed. If -1, a random seed is used."""

    f16_kv: bool = True
    """Use half-precision for key/value cache."""

    logits_all: bool = False
    """Return logits for all tokens, not just the last token."""

    vocab_only: bool = False
    """Only load the vocabulary, no weights."""

    use_mlock: bool = False
    """Force system to keep model in RAM."""

    n_threads: Optional[int] = None
    """Number of threads to use.
    If None, the number of threads is automatically determined."""

    n_batch: int = 8
    """Number of tokens to process in parallel.
    Should be a number between 1 and n_ctx."""

    n_gpu_layers: Optional[int] = None
    """Number of layers to be loaded into gpu memory. Default None."""

    suffix: Optional[str] = None
    """A suffix to append to the generated text. If None, no suffix is appended."""

    max_tokens: int = 256
    """The maximum number of tokens to generate."""

    temperature: float = 0.8
    """The temperature to use for sampling."""

    top_p: float = 0.95
    """The top-p value to use for sampling."""

    logprobs: Optional[int] = None
    """The number of logprobs to return. If None, no logprobs are returned."""

    echo: bool = False
    """Whether to echo the prompt."""

    stop: Optional[List[str]] = None
    """A list of strings to stop generation when encountered."""

    repeat_penalty: float = 1.1
    """The penalty to apply to repeated tokens."""

    top_k: int = 40
    """The top-k value to use for sampling."""

    last_n_tokens_size: int = 64
    """The number of tokens to look back when applying the repeat_penalty."""

    use_mmap: bool = True
    """Whether to memory-map the model file instead of loading it fully into RAM."""

    rope_freq_scale: float = 1.0
    """Scale factor for rope sampling."""

    rope_freq_base: float = 10000.0
    """Base frequency for rope sampling."""

    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Any additional parameters to pass to llama_cpp.Llama."""

    streaming: bool = True
    """Whether to stream the results, token by token."""

    grammar_path: Optional[Union[str, Path]] = None
    """
    grammar_path: Path to the .gbnf file that defines formal grammars for
    constraining model outputs. For instance, the grammar can be used to force the
    model to generate valid JSON or to speak exclusively in emojis. At most one of
    grammar_path and grammar should be passed in.
    """

    grammar: Any = None
    """
    grammar: formal grammar for constraining model outputs. For instance, the
    grammar can be used to force the model to generate valid JSON or to speak
    exclusively in emojis. At most one of grammar_path and grammar should be
    passed in.
    """

    verbose: bool = True
    """Print verbose output to stderr."""

    @model_validator(mode="after")
    def validate_environment(self) -> Self:
        """Validate that the llama-cpp-python library is installed."""
        try:
            from llama_cpp import Llama, LlamaGrammar
        except ImportError:
            raise ImportError(
                "Could not import llama-cpp-python library. "
                "Please install the llama-cpp-python library to "
                "use this chat model: pip install llama-cpp-python"
            )
""Please install the llama-cpp-python library to ""use this embedding model: pip install llama-cpp-python")model_path=self.model_pathmodel_param_names=["rope_freq_scale","rope_freq_base","lora_path","lora_base","n_ctx","n_parts","seed","f16_kv","logits_all","vocab_only","use_mlock","n_threads","n_batch","use_mmap","last_n_tokens_size","verbose",]model_params={k:getattr(self,k)forkinmodel_param_names}# For backwards compatibility, only include if non-null.ifself.n_gpu_layersisnotNone:model_params["n_gpu_layers"]=self.n_gpu_layersmodel_params.update(self.model_kwargs)try:self.client=Llama(model_path,**model_params)exceptExceptionase:raiseValueError(f"Could not load Llama model from path: {model_path}. "f"Received error {e}")ifself.grammarandself.grammar_path:grammar=self.grammargrammar_path=self.grammar_pathraiseValueError("Can only pass in one of grammar and grammar_path. Received "f"{grammar=} and {grammar_path=}.")elifisinstance(self.grammar,str):self.grammar=LlamaGrammar.from_string(self.grammar)elifself.grammar_path:self.grammar=LlamaGrammar.from_file(self.grammar_path)else:passreturnselfdef_get_parameters(self,stop:Optional[List[str]])->Dict[str,Any]:""" Performs sanity check, preparing parameters in format needed by llama_cpp. Returns: Dictionary containing the combined parameters. """params=self._default_params# llama_cpp expects the "stop" key not this, so we remove it:stop_sequences=params.pop("stop_sequences")# then sets it as configured, or default to an empty list:params["stop"]=stoporstop_sequencesorself.stopor[]returnparamsdef_create_message_dicts(self,messages:List[BaseMessage])->List[Dict[str,Any]]:message_dicts=[_convert_message_to_dict(m)forminmessages]returnmessage_dictsdef_create_chat_result(self,response:dict)->ChatResult:generations=[]forresinresponse["choices"]:message=_convert_dict_to_message(res["message"])generation_info=dict(finish_reason=res.get("finish_reason"))if"logprobs"inres:generation_info["logprobs"]=res["logprobs"]gen=ChatGeneration(message=message,generation_info=generation_info)generations.append(gen)token_usage=response.get("usage",{})llm_output={"token_usage":token_usage,# "system_fingerprint": response.get("system_fingerprint", ""),}returnChatResult(generations=generations,llm_output=llm_output)def_generate(self,messages:List[BaseMessage],stop:Optional[List[str]]=None,run_manager:Optional[CallbackManagerForLLMRun]=None,**kwargs:Any,)->ChatResult:params={**self._get_parameters(stop),**kwargs}# Check tool_choice is whether available, if yes then run no stream with tool# 
        if self.streaming and not params.get("tool_choice"):
            stream_iter = self._stream(messages, run_manager=run_manager, **kwargs)
            return generate_from_stream(stream_iter)

        message_dicts = self._create_message_dicts(messages)
        response = self.client.create_chat_completion(messages=message_dicts, **params)

        return self._create_chat_result(response)

    def _stream(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[ChatGenerationChunk]:
        params = {**self._get_parameters(stop), **kwargs}
        message_dicts = self._create_message_dicts(messages)

        result = self.client.create_chat_completion(
            messages=message_dicts, stream=True, **params
        )

        default_chunk_class = AIMessageChunk
        count = 0
        for chunk in result:
            count += 1
            if not isinstance(chunk, dict):
                chunk = chunk.model_dump()
            if len(chunk["choices"]) == 0:
                continue
            choice = chunk["choices"][0]
            if choice["delta"] is None:
                continue
            chunk = _convert_delta_to_message_chunk(
                choice["delta"], default_chunk_class
            )
            generation_info = {}
            if finish_reason := choice.get("finish_reason"):
                generation_info["finish_reason"] = finish_reason
            logprobs = choice.get("logprobs")
            if logprobs:
                generation_info["logprobs"] = logprobs
            default_chunk_class = chunk.__class__
            chunk = ChatGenerationChunk(
                message=chunk, generation_info=generation_info or None
            )
            if run_manager:
                run_manager.on_llm_new_token(
                    chunk.text, chunk=chunk, logprobs=logprobs
                )
            yield chunk
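    # Illustrative usage sketch (not part of the original module): constructing the
    # model with a local GGUF file and invoking or streaming it. The model path is a
    # placeholder, and llama-cpp-python must be installed.
    #
    #     llm = ChatLlamaCpp(model_path="/path/to/model.gguf", temperature=0.7)
    #     msg = llm.invoke("Tell me a joke about llamas.")   # returns an AIMessage
    #     for chunk in llm.stream("Tell me a joke about llamas."):
    #         print(chunk.content, end="", flush=True)       # token-by-token chunks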
    def bind_tools(
        self,
        tools: Sequence[Union[Dict[str, Any], Type[BaseModel], Callable, BaseTool]],
        *,
        tool_choice: Optional[Union[dict, bool, str]] = None,
        **kwargs: Any,
    ) -> Runnable[LanguageModelInput, BaseMessage]:
        """Bind tool-like objects to this chat model.

        tool_choice: does not currently support "any" or "auto" choices like the
            OpenAI tool-calling API. To force a tool, pass a dict of the form
            {"type": "function", "function": {"name": <<tool_name>>}}.
        """
        formatted_tools = [convert_to_openai_tool(tool) for tool in tools]
        tool_names = [ft["function"]["name"] for ft in formatted_tools]
        if tool_choice:
            if isinstance(tool_choice, dict):
                if not any(
                    tool_choice["function"]["name"] == name for name in tool_names
                ):
                    raise ValueError(
                        f"Tool choice {tool_choice=} was specified, but the only "
                        f"provided tools were {tool_names}."
                    )
            elif isinstance(tool_choice, str):
                chosen = [
                    f for f in formatted_tools if f["function"]["name"] == tool_choice
                ]
                if not chosen:
                    raise ValueError(
                        f"Tool choice {tool_choice=} was specified, but the only "
                        f"provided tools were {tool_names}."
                    )
            elif isinstance(tool_choice, bool):
                if len(formatted_tools) > 1:
                    raise ValueError(
                        "tool_choice=True can only be specified when a single tool is "
                        f"passed in. Received {len(tools)} tools."
                    )
                tool_choice = formatted_tools[0]
            else:
                raise ValueError(
                    "Unrecognized tool_choice type. Expected dict having format like "
                    'this {"type": "function", "function": {"name": <<tool_name>>}}. '
                    f"Received: {tool_choice}"
                )

            kwargs["tool_choice"] = tool_choice
        return super().bind(tools=formatted_tools, **kwargs)
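    # Illustrative sketch of forcing a specific tool (not part of the original
    # module). `get_weather` is a hypothetical tool; tool_choice must name one of
    # the bound tools.
    #
    #     from langchain_core.tools import tool
    #
    #     @tool
    #     def get_weather(city: str) -> str:
    #         """Return the current weather for a city."""
    #         return f"Sunny in {city}"
    #
    #     llm_with_tools = llm.bind_tools(
    #         [get_weather],
    #         tool_choice={"type": "function", "function": {"name": "get_weather"}},
    #     )
    #     ai_msg = llm_with_tools.invoke("What is the weather in Paris?")
    #     ai_msg.tool_calls  # e.g. [{"name": "get_weather", "args": {"city": "Paris"}, ...}]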
    def with_structured_output(
        self,
        schema: Optional[Union[Dict, Type[BaseModel]]] = None,
        *,
        include_raw: bool = False,
        **kwargs: Any,
    ) -> Runnable[LanguageModelInput, Union[Dict, BaseModel]]:
        """Model wrapper that returns outputs formatted to match the given schema.

        Args:
            schema: The output schema as a dict or a Pydantic class. If a Pydantic
                class then the model output will be an object of that class. If a dict
                then the model output will be a dict. With a Pydantic class the returned
                attributes will be validated, whereas with a dict they will not be. If
                `method` is "function_calling" and `schema` is a dict, then the dict
                must match the OpenAI function-calling spec or be a valid JSON schema
                with top level 'title' and 'description' keys specified.
            include_raw: If False then only the parsed structured output is returned.
                If an error occurs during model output parsing it will be raised. If
                True then both the raw model response (a BaseMessage) and the parsed
                model response will be returned. If an error occurs during output
                parsing it will be caught and returned as well. The final output is
                always a dict with keys "raw", "parsed", and "parsing_error".
            kwargs: Any other args to bind to model, ``self.bind(..., **kwargs)``.

        Returns:
            A Runnable that takes any ChatModel input and returns as output:

                If include_raw is True then a dict with keys:
                    raw: BaseMessage
                    parsed: Optional[_DictOrPydantic]
                    parsing_error: Optional[BaseException]

                If include_raw is False then just _DictOrPydantic is returned,
                where _DictOrPydantic depends on the schema:

                    If schema is a Pydantic class then _DictOrPydantic is the Pydantic
                        class.

                    If schema is a dict then _DictOrPydantic is a dict.

        Example: Pydantic schema (include_raw=False):
            .. code-block:: python

                import multiprocessing

                from langchain_community.chat_models import ChatLlamaCpp
                from pydantic import BaseModel

                class AnswerWithJustification(BaseModel):
                    '''An answer to the user question along with justification for the answer.'''
                    answer: str
                    justification: str

                llm = ChatLlamaCpp(
                    temperature=0.,
                    model_path="./SanctumAI-meta-llama-3-8b-instruct.Q8_0.gguf",
                    n_ctx=10000,
                    n_gpu_layers=4,
                    n_batch=200,
                    max_tokens=512,
                    n_threads=multiprocessing.cpu_count() - 1,
                    repeat_penalty=1.5,
                    top_p=0.5,
                    stop=["<|end_of_text|>", "<|eot_id|>"],
                )
                structured_llm = llm.with_structured_output(AnswerWithJustification)

                structured_llm.invoke("What weighs more a pound of bricks or a pound of feathers")

                # -> AnswerWithJustification(
                #     answer='They weigh the same',
                #     justification='Both a pound of bricks and a pound of feathers weigh one pound. The weight is the same, but the volume or density of the objects may differ.'
                # )

        Example: Pydantic schema (include_raw=True):
            .. code-block:: python

                import multiprocessing

                from langchain_community.chat_models import ChatLlamaCpp
                from pydantic import BaseModel

                class AnswerWithJustification(BaseModel):
                    '''An answer to the user question along with justification for the answer.'''
                    answer: str
                    justification: str

                llm = ChatLlamaCpp(
                    temperature=0.,
                    model_path="./SanctumAI-meta-llama-3-8b-instruct.Q8_0.gguf",
                    n_ctx=10000,
                    n_gpu_layers=4,
                    n_batch=200,
                    max_tokens=512,
                    n_threads=multiprocessing.cpu_count() - 1,
                    repeat_penalty=1.5,
                    top_p=0.5,
                    stop=["<|end_of_text|>", "<|eot_id|>"],
                )
                structured_llm = llm.with_structured_output(AnswerWithJustification, include_raw=True)

                structured_llm.invoke("What weighs more a pound of bricks or a pound of feathers")
                # -> {
                #     'raw': AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_Ao02pnFYXD6GN1yzc0uXPsvF', 'function': {'arguments': '{"answer":"They weigh the same.","justification":"Both a pound of bricks and a pound of feathers weigh one pound. The weight is the same, but the volume or density of the objects may differ."}', 'name': 'AnswerWithJustification'}, 'type': 'function'}]}),
                #     'parsed': AnswerWithJustification(answer='They weigh the same.', justification='Both a pound of bricks and a pound of feathers weigh one pound. The weight is the same, but the volume or density of the objects may differ.'),
                #     'parsing_error': None
                # }

        Example: dict schema (include_raw=False):
            .. code-block:: python

                import multiprocessing

                from langchain_community.chat_models import ChatLlamaCpp
                from langchain_core.utils.function_calling import convert_to_openai_tool
                from pydantic import BaseModel

                class AnswerWithJustification(BaseModel):
                    '''An answer to the user question along with justification for the answer.'''
                    answer: str
                    justification: str

                dict_schema = convert_to_openai_tool(AnswerWithJustification)
                llm = ChatLlamaCpp(
                    temperature=0.,
                    model_path="./SanctumAI-meta-llama-3-8b-instruct.Q8_0.gguf",
                    n_ctx=10000,
                    n_gpu_layers=4,
                    n_batch=200,
                    max_tokens=512,
                    n_threads=multiprocessing.cpu_count() - 1,
                    repeat_penalty=1.5,
                    top_p=0.5,
                    stop=["<|end_of_text|>", "<|eot_id|>"],
                )
                structured_llm = llm.with_structured_output(dict_schema)

                structured_llm.invoke("What weighs more a pound of bricks or a pound of feathers")
                # -> {
                #     'answer': 'They weigh the same',
                #     'justification': 'Both a pound of bricks and a pound of feathers weigh one pound. The weight is the same, but the volume and density of the two substances differ.'
                # }

        """  # noqa: E501
        if kwargs:
            raise ValueError(f"Received unsupported arguments {kwargs}")
        is_pydantic_schema = isinstance(schema, type) and is_basemodel_subclass(schema)
        if schema is None:
            raise ValueError(
                "schema must be specified when method is 'function_calling'. "
                "Received None."
            )
        tool_name = convert_to_openai_tool(schema)["function"]["name"]
        tool_choice = {"type": "function", "function": {"name": tool_name}}
        llm = self.bind_tools([schema], tool_choice=tool_choice)
        if is_pydantic_schema:
            output_parser: OutputParserLike = PydanticToolsParser(
                tools=[cast(Type, schema)], first_tool_only=True
            )
        else:
            output_parser = JsonOutputKeyToolsParser(
                key_name=tool_name, first_tool_only=True
            )

        if include_raw:
            parser_assign = RunnablePassthrough.assign(
                parsed=itemgetter("raw") | output_parser, parsing_error=lambda _: None
            )
            parser_none = RunnablePassthrough.assign(parsed=lambda _: None)
            parser_with_fallback = parser_assign.with_fallbacks(
                [parser_none], exception_key="parsing_error"
            )
            return RunnableMap(raw=llm) | parser_with_fallback
        else:
            return llm | output_parser
    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Return a dictionary of identifying parameters.

        This information is used by the LangChain callback system, which is used
        for tracing purposes and makes it possible to monitor LLMs.
        """
        return {
            # The model name allows users to specify custom token counting
            # rules in LLM monitoring applications (e.g., in LangSmith users
            # can provide per token pricing for their model and monitor
            # costs for the given LLM).
            **{"model_path": self.model_path},
            **self._default_params,
        }

    @property
    def _llm_type(self) -> str:
        """Get the type of language model used by this chat model."""
        return "llama-cpp-python"

    @property
    def _default_params(self) -> Dict[str, Any]:
        """Get the default parameters for calling create_chat_completion."""
        params: Dict = {
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
            "top_p": self.top_p,
            "top_k": self.top_k,
            "logprobs": self.logprobs,
            "stop_sequences": self.stop,  # key here is convention among LLM classes
            "repeat_penalty": self.repeat_penalty,
        }
        if self.grammar:
            params["grammar"] = self.grammar
        return params
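    # Note on stop-sequence resolution (illustrative, not part of the original
    # module): _default_params exposes the configured stop list under the
    # conventional "stop_sequences" key, and _get_parameters renames it back to the
    # "stop" key that llama_cpp expects, preferring a call-time value:
    #
    #     params["stop"] = stop or stop_sequences or self.stop or []
    #
    # so e.g. llm.invoke("...", stop=["\n\n"]) overrides a constructor-level stop
    # list for that call only.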
def _lc_tool_call_to_openai_tool_call(tool_call: ToolCall) -> dict:
    return {
        "type": "function",
        "id": tool_call["id"],
        "function": {
            "name": tool_call["name"],
            "arguments": json.dumps(tool_call["args"]),
        },
    }


def _lc_invalid_tool_call_to_openai_tool_call(
    invalid_tool_call: InvalidToolCall,
) -> dict:
    return {
        "type": "function",
        "id": invalid_tool_call["id"],
        "function": {
            "name": invalid_tool_call["name"],
            "arguments": invalid_tool_call["args"],
        },
    }


def _convert_dict_to_message(_dict: Mapping[str, Any]) -> BaseMessage:
    """Convert a dictionary to a LangChain message.

    Args:
        _dict: The dictionary.

    Returns:
        The LangChain message.
    """
    role = _dict.get("role")
    name = _dict.get("name")
    id_ = _dict.get("id")
    if role == "user":
        return HumanMessage(content=_dict.get("content", ""), id=id_, name=name)
    elif role == "assistant":
        # Fix for azure
        # Also OpenAI returns None for tool invocations
        content = _dict.get("content", "") or ""
        additional_kwargs: Dict = {}
        if function_call := _dict.get("function_call"):
            additional_kwargs["function_call"] = dict(function_call)
        tool_calls = []
        invalid_tool_calls = []
        if raw_tool_calls := _dict.get("tool_calls"):
            additional_kwargs["tool_calls"] = raw_tool_calls
            for raw_tool_call in raw_tool_calls:
                try:
                    tc = parse_tool_call(raw_tool_call, return_id=True)
                except Exception as e:
                    invalid_tc = make_invalid_tool_call(raw_tool_call, str(e))
                    invalid_tool_calls.append(invalid_tc)
                else:
                    if not tc:
                        continue
                    else:
                        tool_calls.append(tc)
        return AIMessage(
            content=content,
            additional_kwargs=additional_kwargs,
            name=name,
            id=id_,
            tool_calls=tool_calls,  # type: ignore[arg-type]
            invalid_tool_calls=invalid_tool_calls,
        )
    elif role == "system":
        return SystemMessage(content=_dict.get("content", ""), name=name, id=id_)
    elif role == "function":
        return FunctionMessage(
            content=_dict.get("content", ""), name=cast(str, _dict.get("name")), id=id_
        )
    elif role == "tool":
        additional_kwargs = {}
        if "name" in _dict:
            additional_kwargs["name"] = _dict["name"]
        return ToolMessage(
            content=_dict.get("content", ""),
            tool_call_id=cast(str, _dict.get("tool_call_id")),
            additional_kwargs=additional_kwargs,
            name=name,
            id=id_,
        )
    else:
        return ChatMessage(
            content=_dict.get("content", ""), role=cast(str, role), id=id_
        )


def _format_message_content(content: Any) -> Any:
    """Format message content."""
    if content and isinstance(content, list):
        # Remove unexpected block types
        formatted_content = []
        for block in content:
            if (
                isinstance(block, dict)
                and "type" in block
                and block["type"] == "tool_use"
            ):
                continue
            else:
                formatted_content.append(block)
    else:
        formatted_content = content

    return formatted_content


def _convert_message_to_dict(message: BaseMessage) -> dict:
    """Convert a LangChain message to a dictionary.

    Args:
        message: The LangChain message.

    Returns:
        The dictionary.
    """
"""message_dict:Dict[str,Any]={"content":_format_message_content(message.content),}if(name:=message.nameormessage.additional_kwargs.get("name"))isnotNone:message_dict["name"]=name# populate role and additional message dataifisinstance(message,ChatMessage):message_dict["role"]=message.roleelifisinstance(message,HumanMessage):message_dict["role"]="user"elifisinstance(message,AIMessage):message_dict["role"]="assistant"if"function_call"inmessage.additional_kwargs:message_dict["function_call"]=message.additional_kwargs["function_call"]ifmessage.tool_callsormessage.invalid_tool_calls:message_dict["tool_calls"]=[_lc_tool_call_to_openai_tool_call(tc)fortcinmessage.tool_calls]+[_lc_invalid_tool_call_to_openai_tool_call(tc)fortcinmessage.invalid_tool_calls]elif"tool_calls"inmessage.additional_kwargs:message_dict["tool_calls"]=message.additional_kwargs["tool_calls"]tool_call_supported_props={"id","type","function"}message_dict["tool_calls"]=[{k:vfork,vintool_call.items()ifkintool_call_supported_props}fortool_callinmessage_dict["tool_calls"]]else:pass# If tool calls present, content null value should be None not empty string.if"function_call"inmessage_dictor"tool_calls"inmessage_dict:message_dict["content"]=message_dict["content"]orNoneelifisinstance(message,SystemMessage):message_dict["role"]="system"elifisinstance(message,FunctionMessage):message_dict["role"]="function"elifisinstance(message,ToolMessage):message_dict["role"]="tool"message_dict["tool_call_id"]=message.tool_call_idsupported_props={"content","role","tool_call_id"}message_dict={k:vfork,vinmessage_dict.items()ifkinsupported_props}else:raiseTypeError(f"Got unknown type {message}")returnmessage_dictdef_convert_delta_to_message_chunk(_dict:Mapping[str,Any],default_class:Type[BaseMessageChunk])->BaseMessageChunk:id_=_dict.get("id")role=cast(str,_dict.get("role"))content=cast(str,_dict.get("content")or"")additional_kwargs:Dict={}if_dict.get("function_call"):function_call=dict(_dict["function_call"])if"name"infunction_callandfunction_call["name"]isNone:function_call["name"]=""additional_kwargs["function_call"]=function_calltool_call_chunks=[]ifraw_tool_calls:=_dict.get("tool_calls"):additional_kwargs["tool_calls"]=raw_tool_callsforrtcinraw_tool_calls:try:tool_call=ToolCallChunk(name=rtc["function"].get("name"),args=rtc["function"].get("arguments"),id=rtc.get("id"),index=rtc["index"],)tool_call_chunks.append(tool_call)exceptKeyError:passifrole=="user"ordefault_class==HumanMessageChunk:returnHumanMessageChunk(content=content,id=id_)elifrole=="assistant"ordefault_class==AIMessageChunk:returnAIMessageChunk(content=content,additional_kwargs=additional_kwargs,id=id_,tool_call_chunks=tool_call_chunks,)elifrole=="system"ordefault_class==SystemMessageChunk:returnSystemMessageChunk(content=content,id=id_)elifrole=="function"ordefault_class==FunctionMessageChunk:returnFunctionMessageChunk(content=content,name=_dict["name"],id=id_)elifrole=="tool"ordefault_class==ToolMessageChunk:returnToolMessageChunk(content=content,tool_call_id=_dict["tool_call_id"],id=id_)elifroleordefault_class==ChatMessageChunk:returnChatMessageChunk(content=content,role=role,id=id_)else:returndefault_class(content=content,id=id_)# type: ignore