Source code for langchain_google_vertexai.model_garden_maas.llama

from __future__ import annotations

import json
import uuid
from typing import (
    Any,
    AsyncIterator,
    Callable,
    Dict,
    Iterator,
    List,
    Literal,
    Optional,
    Sequence,
    Type,
    Union,
    cast,
    overload,
)

from langchain_core.callbacks.manager import (
    AsyncCallbackManagerForLLMRun,
    CallbackManagerForLLMRun,
)
from langchain_core.language_models import LanguageModelInput
from langchain_core.language_models.chat_models import (
    BaseChatModel,
    agenerate_from_stream,
    generate_from_stream,
)
from langchain_core.messages import (
    AIMessage,
    AIMessageChunk,
    BaseMessage,
    HumanMessage,
    SystemMessage,
    ToolMessage,
)
from langchain_core.messages.tool import tool_call as create_tool_call
from langchain_core.messages.tool import tool_call_chunk
from langchain_core.outputs import (
    ChatGeneration,
    ChatGenerationChunk,
    ChatResult,
)
from langchain_core.runnables import Runnable
from langchain_core.tools import BaseTool
from langchain_core.utils.function_calling import (
    convert_to_openai_function,
)

from langchain_google_vertexai.model_garden_maas._base import (
    _BaseVertexMaasModelGarden,
    acompletion_with_retry,
    completion_with_retry,
)


@overload
def _parse_response_candidate_llama(
    response_candidate: Dict[str, str], streaming: Literal[False] = False
) -> AIMessage:
    ...


@overload
def _parse_response_candidate_llama(
    response_candidate: Dict[str, str], streaming: Literal[True]
) -> AIMessageChunk:
    ...


def _parse_response_candidate_llama(
    response_candidate: Dict[str, str], streaming: bool = False
) -> AIMessage:
    content = response_candidate["content"]
    role = response_candidate["role"]
    if role != "assistant":
        raise ValueError(f"Role in response is {role}, expected 'assistant'!")
    tool_calls = []
    tool_call_chunks = []

    response_json = None
    try:
        response_json = json.loads(response_candidate["content"])
    except ValueError:
        pass
    if response_json and "name" in response_json:
        function_name = response_json["name"]
        function_args = response_json.get("parameters", None)
        if streaming:
            tool_call_chunks.append(
                tool_call_chunk(
                    name=function_name, args=function_args, id=str(uuid.uuid4())
                )
            )
        else:
            tool_calls.append(
                create_tool_call(
                    name=function_name, args=function_args, id=str(uuid.uuid4())
                )
            )
        content = ""

    if streaming:
        return AIMessageChunk(
            content=content,
            tool_call_chunks=tool_call_chunks,
        )

    return AIMessage(
        content=content,
        tool_calls=tool_calls,
    )


[docs]class VertexModelGardenLlama(_BaseVertexMaasModelGarden, BaseChatModel): # type: ignore[misc] """Integration for Llama 3.1 on Google Cloud Vertex AI Model-as-a-Service. For more information, see: https://cloud.google.com/blog/products/ai-machine-learning/llama-3-1-on-vertex-ai Setup: You need to enable a corresponding MaaS model (Google Cloud UI console -> Vertex AI -> Model Garden -> search for a model you need and click enable) You must have the langchain-google-vertexai Python package installed .. code-block:: bash pip install -U langchain-google-vertexai And either: - Have credentials configured for your environment (gcloud, workload identity, etc...) - Store the path to a service account JSON file as the GOOGLE_APPLICATION_CREDENTIALS environment variable This codebase uses the google.auth library which first looks for the application credentials variable mentioned above, and then looks for system-level auth. For more information, see: https://cloud.google.com/docs/authentication/application-default-credentials#GAC and https://googleapis.dev/python/google-auth/latest/reference/google.auth.html#module-google.auth. Key init args — completion params: model: str Name of VertexMaaS model to use ("meta/llama3-405b-instruct-maas") append_tools_to_system_message: bool Whether to append tools to a system message Key init args — client params: credentials: Optional[google.auth.credentials.Credentials] The default custom credentials to use when making API calls. If not provided, credentials will be ascertained from the environment. project: Optional[str] The default GCP project to use when making Vertex API calls. location: str = "us-central1" The default location to use when making API calls. See full list of supported init args and their descriptions in the params section. Instantiate: .. code-block:: python from langchain_google_vertexai import VertexMaaS llm = VertexModelGardenLlama( model="meta/llama3-405b-instruct-maas", # other params... ) Invoke: .. code-block:: python messages = [ ("system", "You are a helpful translator. Translate the user sentence to French."), ("human", "I love programming."), ] llm.invoke(messages) .. code-block:: python AIMessage(content="J'adore programmer. \n", id='run-925ce305-2268-44c4-875f-dde9128520ad-0') Stream: .. code-block:: python for chunk in llm.stream(messages): print(chunk) .. code-block:: python AIMessageChunk(content='J', id='run-9df01d73-84d9-42db-9d6b-b1466a019e89') AIMessageChunk(content="'adore programmer. \n", id='run-9df01d73-84d9-42db-9d6b-b1466a019e89') AIMessageChunk(content='', id='run-9df01d73-84d9-42db-9d6b-b1466a019e89') .. code-block:: python stream = llm.stream(messages) full = next(stream) for chunk in stream: full += chunk full .. code-block:: python AIMessageChunk(content="J'adore programmer. \n", id='run-b7f7492c-4cb5-42d0-8fc3-dce9b293b0fb') """ # noqa: E501 def _convert_messages( self, messages: List[BaseMessage], tools: Optional[List[BaseTool]] = None ) -> List[Dict[str, Any]]: converted_messages: List[Dict[str, Any]] = [] if tools and not self.append_tools_to_system_message: raise ValueError( "If providing tools, either format system message yourself or " "append_tools_to_system_message to True!" ) elif tools: tools_str = "\n".join( [json.dumps(convert_to_openai_function(t)) for t in tools] ) formatted_system_message = ( "You are an assistant with access to the following tools:\n\n" f"{tools_str}\n\n" "If you decide to use a tool, please respond with a JSON for a " "function call with its proper arguments that best answers the " "given prompt.\nRespond in the format " '{"name": function name, "parameters": dictionary ' "of argument name and its value}. Do not use variables.\n" "Do not provide any additional comments when calling a tool.\n" "Do not mention tools to the user when preparing the final answer." ) message = messages[0] if not isinstance(message, SystemMessage): converted_messages.append( {"role": "system", "content": formatted_system_message} ) else: converted_messages.append( { "role": "system", "content": str(message.content) + "\n" + formatted_system_message, } ) for i, message in enumerate(messages): if tools and isinstance(message, SystemMessage) and i == 0: continue if isinstance(message, AIMessage): converted_messages.append( {"role": "assistant", "content": message.content} ) elif isinstance(message, HumanMessage): converted_messages.append({"role": "user", "content": message.content}) elif isinstance(message, SystemMessage): converted_messages.append( {"role": "system", "content": message.content} ) elif isinstance(message, ToolMessage): # we also need to format a previous message if we got a tool result prev_message = messages[i - 1] if not isinstance(prev_message, AIMessage): raise ValueError("ToolMessage should follow AIMessage only!") _ = converted_messages[-1].pop("content", None) tool_calls = [] for tool_call in prev_message.tool_calls: tool_calls.append( { "type": "function", "id": tool_call["id"], "function": { "name": tool_call["name"], "arguments": json.dumps(tool_call.get("args", {})), }, } ) converted_messages[-1]["tool_calls"] = tool_calls if len(tool_calls) > 1: raise ValueError( "Only a single function call per turn is supported!" ) converted_messages.append( { "role": "tool", "name": message.name, "content": message.content, "tool_call_id": message.tool_call_id, } ) else: raise ValueError(f"Message type {type(message)} is not yet supported!") return converted_messages def _generate( self, messages: List[BaseMessage], stop: Optional[List[str]] = None, run_manager: Optional[CallbackManagerForLLMRun] = None, stream: Optional[bool] = None, *, tools: Optional[List[BaseTool]] = None, **kwargs: Any, ) -> ChatResult: """Generate next turn in the conversation. Args: messages: The history of the conversation as a list of messages. Code chat does not support context. stop: The list of stop words (optional). run_manager: The CallbackManager for LLM run, it's not used at the moment. stream: Whether to use the streaming endpoint. Returns: The ChatResult that contains outputs generated by the model. Raises: ValueError: if the last message in the list is not from human. """ if stream is True: return generate_from_stream( self._stream( messages, stop=stop, run_manager=run_manager, tools=tools, **kwargs, ) ) converted_messages = self._convert_messages(messages, tools=tools) response = completion_with_retry(self, messages=converted_messages, **kwargs) return self._create_chat_result(response) async def _agenerate( self, messages: List[BaseMessage], stop: Optional[List[str]] = None, run_manager: Optional[AsyncCallbackManagerForLLMRun] = None, stream: Optional[bool] = None, *, tools: Optional[List[BaseTool]] = None, **kwargs: Any, ) -> ChatResult: if stream: stream_iter = self._astream( messages=messages, stop=stop, run_manager=run_manager, **kwargs ) return await agenerate_from_stream(stream_iter) converted_messages = self._convert_messages(messages, tools=tools) response = await acompletion_with_retry( self, messages=converted_messages, run_manager=run_manager, **kwargs ) return self._create_chat_result(response) def _create_chat_result(self, response: Dict) -> ChatResult: generations = [] token_usage = response.get("usage", {}) for candidate in response["choices"]: finish_reason = response.get("finish_reason") message = _parse_response_candidate_llama(candidate["message"]) if token_usage and isinstance(message, AIMessage): message.usage_metadata = { "input_tokens": token_usage.get("prompt_tokens", 0), "output_tokens": token_usage.get("completion_tokens", 0), "total_tokens": token_usage.get("total_tokens", 0), } gen = ChatGeneration( message=message, generation_info={"finish_reason": finish_reason}, ) generations.append(gen) llm_output = {"token_usage": token_usage, "model": self.model_name} return ChatResult(generations=generations, llm_output=llm_output) @property def _llm_type(self) -> str: """Return type of chat model.""" return "vertexai_model_garden_maas_llama" def _parse_chunk(self, chunk: Dict) -> AIMessageChunk: chunk_delta = chunk["choices"][0]["delta"] content = chunk_delta.get("content", "") if chunk_delta.get("role") != "assistant": raise ValueError(f"Got chunk with non-assistant role: {chunk_delta}") additional_kwargs = {} if raw_tool_calls := chunk_delta.get("tool_calls"): additional_kwargs["tool_calls"] = raw_tool_calls try: tool_call_chunks = [] for raw_tool_call in raw_tool_calls: if not raw_tool_call.get("index") and not raw_tool_call.get("id"): tool_call_id = str(uuid.uuid4()) else: tool_call_id = raw_tool_call.get("id") tool_call_chunks.append( tool_call_chunk( name=raw_tool_call["function"].get("name"), args=raw_tool_call["function"].get("arguments"), id=tool_call_id, index=raw_tool_call.get("index"), ) ) except KeyError: pass else: tool_call_chunks = [] if token_usage := chunk.get("usage"): usage_metadata = { "input_tokens": token_usage.get("prompt_tokens", 0), "output_tokens": token_usage.get("completion_tokens", 0), "total_tokens": token_usage.get("total_tokens", 0), } else: usage_metadata = None return AIMessageChunk( content=content, additional_kwargs=additional_kwargs, tool_call_chunks=tool_call_chunks, usage_metadata=usage_metadata, # type: ignore[arg-type] ) def _stream( self, messages: List[BaseMessage], stop: Optional[List[str]] = None, run_manager: Optional[CallbackManagerForLLMRun] = None, *, tools: Optional[List[BaseTool]] = None, **kwargs: Any, ) -> Iterator[ChatGenerationChunk]: converted_messages = self._convert_messages(messages, tools=tools) params = {**kwargs, "stream": True, "headers_content_type": "text/event-stream"} for chunk in completion_with_retry( self, messages=converted_messages, run_manager=run_manager, **params ): if len(chunk["choices"]) == 0: continue message = self._parse_chunk(chunk) gen_chunk = ChatGenerationChunk(message=message) if run_manager: run_manager.on_llm_new_token( token=cast(str, message.content), chunk=gen_chunk ) yield gen_chunk async def _astream( self, messages: List[BaseMessage], stop: Optional[List[str]] = None, run_manager: Optional[AsyncCallbackManagerForLLMRun] = None, *, tools: Optional[List[BaseTool]] = None, **kwargs: Any, ) -> AsyncIterator[ChatGenerationChunk]: converted_messages = self._convert_messages(messages, tools=tools) params = {**kwargs, "stream": True, "headers_content_type": "text/event-stream"} async for chunk in await acompletion_with_retry( self, messages=converted_messages, run_manager=run_manager, **params ): if len(chunk["choices"]) == 0: continue message = self._parse_chunk(chunk) gen_chunk = ChatGenerationChunk(message=message) if run_manager: await run_manager.on_llm_new_token( token=cast(str, message.content), chunk=gen_chunk ) yield gen_chunk
[docs] def bind_tools( self, tools: Sequence[Union[Dict[str, Any], Type, Callable, BaseTool]], **kwargs: Any, ) -> Runnable[LanguageModelInput, BaseMessage]: """Bind tool-like objects to this chat model.""" formatted_tools = [convert_to_openai_function(tool) for tool in tools] return super().bind(tools=formatted_tools, **kwargs)