Source code for langchain_community.embeddings.openai
from__future__importannotationsimportloggingimportosimportwarningsfromtypingimport(Any,Callable,Dict,List,Literal,Mapping,Optional,Sequence,Set,Tuple,Union,cast,)importnumpyasnpfromlangchain_core._api.deprecationimportdeprecatedfromlangchain_core.embeddingsimportEmbeddingsfromlangchain_core.utilsimport(get_from_dict_or_env,get_pydantic_field_names,pre_init,)frompydanticimportBaseModel,ConfigDict,Field,model_validatorfromtenacityimport(AsyncRetrying,before_sleep_log,retry,retry_if_exception_type,stop_after_attempt,wait_exponential,)fromlangchain_community.utils.openaiimportis_openai_v1logger=logging.getLogger(__name__)def_create_retry_decorator(embeddings:OpenAIEmbeddings)->Callable[[Any],Any]:importopenai# Wait 2^x * 1 second between each retry starting with# retry_min_seconds seconds, then up to retry_max_seconds seconds,# then retry_max_seconds seconds afterwards# retry_min_seconds and retry_max_seconds are optional arguments of# OpenAIEmbeddingsreturnretry(reraise=True,stop=stop_after_attempt(embeddings.max_retries),wait=wait_exponential(multiplier=1,min=embeddings.retry_min_seconds,max=embeddings.retry_max_seconds,),retry=(retry_if_exception_type(openai.error.Timeout)# type: ignore[attr-defined]|retry_if_exception_type(openai.error.APIError)# type: ignore[attr-defined]|retry_if_exception_type(openai.error.APIConnectionError)# type: ignore[attr-defined]|retry_if_exception_type(openai.error.RateLimitError)# type: ignore[attr-defined]|retry_if_exception_type(openai.error.ServiceUnavailableError)# type: ignore[attr-defined]),before_sleep=before_sleep_log(logger,logging.WARNING),)def_async_retry_decorator(embeddings:OpenAIEmbeddings)->Any:importopenai# Wait 2^x * 1 second between each retry starting with# retry_min_seconds seconds, then up to retry_max_seconds seconds,# then retry_max_seconds seconds afterwards# retry_min_seconds and retry_max_seconds are optional arguments of# 
OpenAIEmbeddingsasync_retrying=AsyncRetrying(reraise=True,stop=stop_after_attempt(embeddings.max_retries),wait=wait_exponential(multiplier=1,min=embeddings.retry_min_seconds,max=embeddings.retry_max_seconds,),retry=(retry_if_exception_type(openai.error.Timeout)# type: ignore[attr-defined]|retry_if_exception_type(openai.error.APIError)# type: ignore[attr-defined]|retry_if_exception_type(openai.error.APIConnectionError)# type: ignore[attr-defined]|retry_if_exception_type(openai.error.RateLimitError)# type: ignore[attr-defined]|retry_if_exception_type(openai.error.ServiceUnavailableError)# type: ignore[attr-defined]),before_sleep=before_sleep_log(logger,logging.WARNING),)defwrap(func:Callable)->Callable:asyncdefwrapped_f(*args:Any,**kwargs:Any)->Callable:asyncfor_inasync_retrying:returnawaitfunc(*args,**kwargs)raiseAssertionError("this is unreachable")returnwrapped_freturnwrap# https://stackoverflow.com/questions/76469415/getting-embeddings-of-length-1-from-langchain-openaiembeddingsdef_check_response(response:dict,skip_empty:bool=False)->dict:ifany(len(d["embedding"])==1fordinresponse["data"])andnotskip_empty:importopenairaiseopenai.error.APIError("OpenAI API returned an empty embedding")# type: ignore[attr-defined]returnresponse
def embed_with_retry(embeddings: OpenAIEmbeddings, **kwargs: Any) -> Any:
    """Issue a synchronous embedding request for ``embeddings``.

    When ``is_openai_v1()`` is true the request goes straight to
    ``embeddings.client.create``. Otherwise the request is wrapped in the
    tenacity decorator built from ``embeddings`` and the response is passed
    through ``_check_response``.

    Args:
        embeddings: Instance providing the client and the retry settings.
        **kwargs: Forwarded unchanged to ``client.create``.

    Returns:
        Whatever the client returns (validated on the legacy path).
    """
    if not is_openai_v1():

        # The inner name is kept as-is: tenacity's before-sleep log prints it.
        def _embed_with_retry(**call_kwargs: Any) -> Any:
            return _check_response(
                embeddings.client.create(**call_kwargs),
                skip_empty=embeddings.skip_empty,
            )

        guarded_call = _create_retry_decorator(embeddings)(_embed_with_retry)
        return guarded_call(**kwargs)

    return embeddings.client.create(**kwargs)
async def async_embed_with_retry(embeddings: OpenAIEmbeddings, **kwargs: Any) -> Any:
    """Issue an asynchronous embedding request for ``embeddings``.

    When ``is_openai_v1()`` is true the request is awaited directly on
    ``embeddings.async_client.create``. Otherwise ``embeddings.client.acreate``
    is awaited inside the async retry decorator and the response is passed
    through ``_check_response``.

    Args:
        embeddings: Instance providing the clients and the retry settings.
        **kwargs: Forwarded unchanged to the client call.

    Returns:
        Whatever the client returns (validated on the legacy path).
    """
    if not is_openai_v1():

        async def _async_embed_with_retry(**call_kwargs: Any) -> Any:
            raw = await embeddings.client.acreate(**call_kwargs)
            return _check_response(raw, skip_empty=embeddings.skip_empty)

        guarded_call = _async_retry_decorator(embeddings)(_async_embed_with_retry)
        return await guarded_call(**kwargs)

    return await embeddings.async_client.create(**kwargs)
@deprecated(
    since="0.0.9",
    removal="1.0",
    alternative_import="langchain_openai.OpenAIEmbeddings",
)
class OpenAIEmbeddings(BaseModel, Embeddings):
    """OpenAI embedding models.

    To use, you should have the ``openai`` python package installed, and the
    environment variable ``OPENAI_API_KEY`` set with your API key or pass it
    as a named parameter to the constructor.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import OpenAIEmbeddings
            openai = OpenAIEmbeddings(openai_api_key="my-api-key")

    In order to use the library with Microsoft Azure endpoints, you need to set
    the OPENAI_API_TYPE, OPENAI_API_BASE, OPENAI_API_KEY and OPENAI_API_VERSION.
    The OPENAI_API_TYPE must be set to 'azure' and the others correspond to
    the properties of your endpoint.
    In addition, the deployment name must be passed as the model parameter.

    Example:
        .. code-block:: python

            import os

            os.environ["OPENAI_API_TYPE"] = "azure"
            os.environ["OPENAI_API_BASE"] = "https://<your-endpoint>.openai.azure.com/"
            os.environ["OPENAI_API_KEY"] = "your AzureOpenAI key"
            os.environ["OPENAI_API_VERSION"] = "2023-05-15"
            os.environ["OPENAI_PROXY"] = "http://your-corporate-proxy:8080"

            from langchain_community.embeddings.openai import OpenAIEmbeddings
            embeddings = OpenAIEmbeddings(
                deployment="your-embeddings-deployment-name",
                model="your-embeddings-model-name",
                openai_api_base="https://your-endpoint.openai.azure.com/",
                openai_api_type="azure",
            )
            text = "This is a test query."
            query_result = embeddings.embed_query(text)

    """

    client: Any = Field(default=None, exclude=True)  #: :meta private:
    async_client: Any = Field(default=None, exclude=True)  #: :meta private:
    model: str = "text-embedding-ada-002"
    # to support Azure OpenAI Service custom deployment names
    deployment: Optional[str] = model
    # TODO: Move to AzureOpenAIEmbeddings.
    openai_api_version: Optional[str] = Field(default=None, alias="api_version")
    """Automatically inferred from env var `OPENAI_API_VERSION` if not provided."""
    # to support Azure OpenAI Service custom endpoints
    openai_api_base: Optional[str] = Field(default=None, alias="base_url")
    """Base URL path for API requests, leave blank if not using a proxy or service
        emulator."""
    # to support Azure OpenAI Service custom endpoints
    openai_api_type: Optional[str] = None
    # to support explicit proxy for OpenAI
    openai_proxy: Optional[str] = None
    embedding_ctx_length: int = 8191
    """The maximum number of tokens to embed at once."""
    openai_api_key: Optional[str] = Field(default=None, alias="api_key")
    """Automatically inferred from env var `OPENAI_API_KEY` if not provided."""
    openai_organization: Optional[str] = Field(default=None, alias="organization")
    """Automatically inferred from env var `OPENAI_ORG_ID` if not provided."""
    allowed_special: Union[Literal["all"], Set[str]] = set()
    disallowed_special: Union[Literal["all"], Set[str], Sequence[str]] = "all"
    chunk_size: int = 1000
    """Maximum number of texts to embed in each batch"""
    max_retries: int = 2
    """Maximum number of retries to make when generating."""
    request_timeout: Optional[Union[float, Tuple[float, float], Any]] = Field(
        default=None, alias="timeout"
    )
    """Timeout for requests to OpenAI completion API. Can be float, httpx.Timeout or
        None."""
    headers: Any = None
    tiktoken_enabled: bool = True
    """Set this to False for non-OpenAI implementations of the embeddings API, e.g.
    the `--extensions openai` extension for `text-generation-webui`"""
    tiktoken_model_name: Optional[str] = None
    """The model name to pass to tiktoken when using this class.
    Tiktoken is used to count the number of tokens in documents to constrain
    them to be under a certain limit. By default, when set to None, this will
    be the same as the embedding model name. However, there are some cases
    where you may want to use this Embedding class with a model name not
    supported by tiktoken. This can include when using Azure embeddings or
    when using one of the many model providers that expose an OpenAI-like
    API but with different models. In those cases, in order to avoid erroring
    when tiktoken is called, you can specify a model name to use here."""
    show_progress_bar: bool = False
    """Whether to show a progress bar when embedding."""
    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Holds any model parameters valid for `create` call not explicitly specified."""
    skip_empty: bool = False
    """Whether to skip empty strings when embedding or raise an error.
    Defaults to not skipping."""
    default_headers: Union[Mapping[str, str], None] = None
    default_query: Union[Mapping[str, object], None] = None
    retry_min_seconds: int = 4
    """Min number of seconds to wait between retries"""
    retry_max_seconds: int = 20
    """Max number of seconds to wait between retries"""
    # Configure a custom httpx client. See the
    # [httpx documentation](https://www.python-httpx.org/api/#client) for more details.
    http_client: Union[Any, None] = None
    """Optional httpx.Client."""

    model_config = ConfigDict(
        populate_by_name=True, extra="forbid", protected_namespaces=()
    )

    @model_validator(mode="before")
    @classmethod
    def build_extra(cls, values: Dict[str, Any]) -> Any:
        """Build extra kwargs from additional params that were passed in.

        Keys that are not declared fields are moved into ``model_kwargs``
        (with a warning).

        Raises:
            ValueError: If a key is supplied both directly and inside
                ``model_kwargs``, or if ``model_kwargs`` contains the name of
                a declared field.
        """
        all_required_field_names = get_pydantic_field_names(cls)
        extra = values.get("model_kwargs", {})
        # Iterate over a snapshot of the keys because entries are popped below.
        for field_name in list(values):
            if field_name in extra:
                raise ValueError(f"Found {field_name} supplied twice.")
            if field_name not in all_required_field_names:
                warnings.warn(
                    f"""WARNING! {field_name} is not default parameter.
                    {field_name} was transferred to model_kwargs.
                    Please confirm that {field_name} is what you intended."""
                )
                extra[field_name] = values.pop(field_name)

        invalid_model_kwargs = all_required_field_names.intersection(extra.keys())
        if invalid_model_kwargs:
            raise ValueError(
                f"Parameters {invalid_model_kwargs} should be specified explicitly. "
                f"Instead they were passed in as part of `model_kwargs` parameter."
            )

        values["model_kwargs"] = extra
        return values

    @pre_init
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that api key and python package exists in environment.

        Fills credentials and endpoint settings from ``values`` or the
        corresponding environment variables, then creates the OpenAI client
        objects unless they were supplied.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        values["openai_api_key"] = get_from_dict_or_env(
            values, "openai_api_key", "OPENAI_API_KEY"
        )
        values["openai_api_base"] = values["openai_api_base"] or os.getenv(
            "OPENAI_API_BASE"
        )
        values["openai_api_type"] = get_from_dict_or_env(
            values,
            "openai_api_type",
            "OPENAI_API_TYPE",
            default="",
        )
        values["openai_proxy"] = get_from_dict_or_env(
            values,
            "openai_proxy",
            "OPENAI_PROXY",
            default="",
        )
        if values["openai_api_type"] in ("azure", "azure_ad", "azuread"):
            default_api_version = "2023-05-15"
            # Azure OpenAI embedding models allow a maximum of 2048
            # texts at a time in each batch
            # See: https://learn.microsoft.com/en-us/azure/ai-services/openai/reference#embeddings
            values["chunk_size"] = min(values["chunk_size"], 2048)
        else:
            default_api_version = ""
        values["openai_api_version"] = get_from_dict_or_env(
            values,
            "openai_api_version",
            "OPENAI_API_VERSION",
            default=default_api_version,
        )
        # Check OPENAI_ORGANIZATION for backwards compatibility.
        values["openai_organization"] = (
            values["openai_organization"]
            or os.getenv("OPENAI_ORG_ID")
            or os.getenv("OPENAI_ORGANIZATION")
        )
        try:
            import openai
        except ImportError as err:
            raise ImportError(
                "Could not import openai python package. "
                "Please install it with `pip install openai`."
            ) from err

        if is_openai_v1():
            if values["openai_api_type"] in ("azure", "azure_ad", "azuread"):
                warnings.warn(
                    "If you have openai>=1.0.0 installed and are using Azure, "
                    "please use the `AzureOpenAIEmbeddings` class."
                )
            client_params = {
                "api_key": values["openai_api_key"],
                "organization": values["openai_organization"],
                "base_url": values["openai_api_base"],
                "timeout": values["request_timeout"],
                "max_retries": values["max_retries"],
                "default_headers": values["default_headers"],
                "default_query": values["default_query"],
                "http_client": values["http_client"],
            }
            if not values.get("client"):
                values["client"] = openai.OpenAI(**client_params).embeddings
            if not values.get("async_client"):
                values["async_client"] = openai.AsyncOpenAI(
                    **client_params
                ).embeddings
        elif not values.get("client"):
            values["client"] = openai.Embedding  # type: ignore[attr-defined]
        return values

    @property
    def _invocation_params(self) -> Dict[str, Any]:
        """Keyword arguments sent with every embedding request.

        With openai>=1.0 only the model name and ``model_kwargs`` are sent.
        On the legacy path the credentials and endpoint settings are included
        as well, and, as a side effect, the module-level ``openai.proxy`` is
        set when ``openai_proxy`` is configured.

        Raises:
            ImportError: If a proxy is configured on the legacy path and the
                ``openai`` package cannot be imported.
        """
        if is_openai_v1():
            openai_args: Dict = {"model": self.model, **self.model_kwargs}
        else:
            openai_args = {
                "model": self.model,
                "request_timeout": self.request_timeout,
                "headers": self.headers,
                "api_key": self.openai_api_key,
                "organization": self.openai_organization,
                "api_base": self.openai_api_base,
                "api_type": self.openai_api_type,
                "api_version": self.openai_api_version,
                **self.model_kwargs,
            }
            if self.openai_api_type in ("azure", "azure_ad", "azuread"):
                openai_args["engine"] = self.deployment
            # TODO: Look into proxy with openai v1.
            if self.openai_proxy:
                try:
                    import openai
                except ImportError as err:
                    raise ImportError(
                        "Could not import openai python package. "
                        "Please install it with `pip install openai`."
                    ) from err

                openai.proxy = {  # type: ignore[attr-defined]
                    "http": self.openai_proxy,
                    "https": self.openai_proxy,
                }  # type: ignore[assignment]
        return openai_args

    # please refer to
    # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
    def _get_len_safe_embeddings(
        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
    ) -> List[List[float]]:
        """
        Generate length-safe embeddings for a list of texts.

        This method handles tokenization and embedding generation, respecting the
        set embedding context length and chunk size. It supports both tiktoken
        and HuggingFace tokenizer based on the tiktoken_enabled flag.

        Args:
            texts (List[str]): A list of texts to embed.
            engine (str): The engine or model to use for embeddings. Accepted
                for interface compatibility; not read by this method.
            chunk_size (Optional[int]): The size of chunks for processing
                embeddings. A falsy value selects ``self.chunk_size``.

        Returns:
            List[List[float]]: A list of embeddings for each input text.

        Raises:
            ImportError: If the tokenizer package required by the current
                ``tiktoken_enabled`` setting is not installed.
        """
        tokens = []
        indices = []
        model_name = self.tiktoken_model_name or self.model
        _chunk_size = chunk_size or self.chunk_size

        # If tiktoken flag set to False
        if not self.tiktoken_enabled:
            try:
                from transformers import AutoTokenizer
            except ImportError as err:
                raise ImportError(
                    "Could not import transformers python package. "
                    "This is needed in order to for OpenAIEmbeddings without "
                    "`tiktoken`. Please install it with `pip install transformers`. "
                ) from err

            tokenizer = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path=model_name
            )
            for i, text in enumerate(texts):
                # Tokenize the text using HuggingFace transformers
                tokenized = tokenizer.encode(text, add_special_tokens=False)

                # Split tokens into chunks respecting the embedding_ctx_length
                for j in range(0, len(tokenized), self.embedding_ctx_length):
                    token_chunk = tokenized[j : j + self.embedding_ctx_length]

                    # Convert token IDs back to a string
                    chunk_text = tokenizer.decode(token_chunk)
                    tokens.append(chunk_text)
                    indices.append(i)
        else:
            try:
                import tiktoken
            except ImportError as err:
                raise ImportError(
                    "Could not import tiktoken python package. "
                    "This is needed in order to for OpenAIEmbeddings. "
                    "Please install it with `pip install tiktoken`."
                ) from err

            try:
                encoding = tiktoken.encoding_for_model(model_name)
            except KeyError:
                logger.warning("Warning: model not found. Using cl100k_base encoding.")
                encoding = tiktoken.get_encoding("cl100k_base")
            for i, text in enumerate(texts):
                if self.model.endswith("001"):
                    # See: https://github.com/openai/openai-python/
                    #      issues/418#issuecomment-1525939500
                    # replace newlines, which can negatively affect performance.
                    text = text.replace("\n", " ")

                token = encoding.encode(
                    text=text,
                    allowed_special=self.allowed_special,
                    disallowed_special=self.disallowed_special,
                )

                # Split tokens into chunks respecting the embedding_ctx_length
                for j in range(0, len(token), self.embedding_ctx_length):
                    tokens.append(token[j : j + self.embedding_ctx_length])
                    indices.append(i)

        if self.show_progress_bar:
            try:
                from tqdm.auto import tqdm

                _iter = tqdm(range(0, len(tokens), _chunk_size))
            except ImportError:
                # The progress bar is best-effort; fall back to a plain range.
                _iter = range(0, len(tokens), _chunk_size)
        else:
            _iter = range(0, len(tokens), _chunk_size)

        batched_embeddings: List[List[float]] = []
        for i in _iter:
            response = embed_with_retry(
                self,
                input=tokens[i : i + _chunk_size],
                **self._invocation_params,
            )
            if not isinstance(response, dict):
                response = response.dict()
            batched_embeddings.extend(r["embedding"] for r in response["data"])

        # Group the chunk embeddings (and their lengths) by originating text.
        results: List[List[List[float]]] = [[] for _ in range(len(texts))]
        num_tokens_in_batch: List[List[int]] = [[] for _ in range(len(texts))]
        for i, text_index in enumerate(indices):
            if self.skip_empty and len(batched_embeddings[i]) == 1:
                continue
            results[text_index].append(batched_embeddings[i])
            num_tokens_in_batch[text_index].append(len(tokens[i]))

        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
        for i in range(len(texts)):
            _result = results[i]
            if len(_result) == 0:
                # No chunk survived for this text: embed the empty string.
                average_embedded = embed_with_retry(
                    self,
                    input="",
                    **self._invocation_params,
                )
                if not isinstance(average_embedded, dict):
                    average_embedded = average_embedded.dict()
                average = average_embedded["data"][0]["embedding"]
            else:
                # Length-weighted mean of the chunk embeddings.
                average = np.average(_result, axis=0, weights=num_tokens_in_batch[i])
            embeddings[i] = (average / np.linalg.norm(average)).tolist()

        return embeddings

    # please refer to
    # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
    async def _aget_len_safe_embeddings(
        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
    ) -> List[List[float]]:
        """
        Asynchronously generate length-safe embeddings for a list of texts.

        This method handles tokenization and asynchronous embedding generation,
        respecting the set embedding context length and chunk size. It supports both
        `tiktoken` and HuggingFace `tokenizer` based on the tiktoken_enabled flag.

        Args:
            texts (List[str]): A list of texts to embed.
            engine (str): The engine or model to use for embeddings. Accepted
                for interface compatibility; not read by this method.
            chunk_size (Optional[int]): The size of chunks for processing
                embeddings. A falsy value selects ``self.chunk_size``.

        Returns:
            List[List[float]]: A list of embeddings for each input text.

        Raises:
            ImportError: If the tokenizer package required by the current
                ``tiktoken_enabled`` setting is not installed.
        """
        tokens = []
        indices = []
        model_name = self.tiktoken_model_name or self.model
        _chunk_size = chunk_size or self.chunk_size

        # If tiktoken flag set to False
        if not self.tiktoken_enabled:
            try:
                from transformers import AutoTokenizer
            except ImportError as err:
                raise ImportError(
                    "Could not import transformers python package. "
                    "This is needed in order to for OpenAIEmbeddings without "
                    " `tiktoken`. Please install it with `pip install transformers`."
                ) from err

            tokenizer = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path=model_name
            )
            for i, text in enumerate(texts):
                # Tokenize the text using HuggingFace transformers
                tokenized = tokenizer.encode(text, add_special_tokens=False)

                # Split tokens into chunks respecting the embedding_ctx_length
                for j in range(0, len(tokenized), self.embedding_ctx_length):
                    token_chunk = tokenized[j : j + self.embedding_ctx_length]

                    # Convert token IDs back to a string
                    chunk_text = tokenizer.decode(token_chunk)
                    tokens.append(chunk_text)
                    indices.append(i)
        else:
            try:
                import tiktoken
            except ImportError as err:
                raise ImportError(
                    "Could not import tiktoken python package. "
                    "This is needed in order to for OpenAIEmbeddings. "
                    "Please install it with `pip install tiktoken`."
                ) from err

            try:
                encoding = tiktoken.encoding_for_model(model_name)
            except KeyError:
                logger.warning("Warning: model not found. Using cl100k_base encoding.")
                encoding = tiktoken.get_encoding("cl100k_base")
            for i, text in enumerate(texts):
                if self.model.endswith("001"):
                    # See: https://github.com/openai/openai-python/
                    #      issues/418#issuecomment-1525939500
                    # replace newlines, which can negatively affect performance.
                    text = text.replace("\n", " ")

                token = encoding.encode(
                    text=text,
                    allowed_special=self.allowed_special,
                    disallowed_special=self.disallowed_special,
                )

                # Split tokens into chunks respecting the embedding_ctx_length
                for j in range(0, len(token), self.embedding_ctx_length):
                    tokens.append(token[j : j + self.embedding_ctx_length])
                    indices.append(i)

        batched_embeddings: List[List[float]] = []
        for i in range(0, len(tokens), _chunk_size):
            response = await async_embed_with_retry(
                self,
                input=tokens[i : i + _chunk_size],
                **self._invocation_params,
            )

            if not isinstance(response, dict):
                response = response.dict()
            batched_embeddings.extend(r["embedding"] for r in response["data"])

        # Group the chunk embeddings (and their lengths) by originating text.
        results: List[List[List[float]]] = [[] for _ in range(len(texts))]
        num_tokens_in_batch: List[List[int]] = [[] for _ in range(len(texts))]
        for i, text_index in enumerate(indices):
            # Same filter as the synchronous path: drop length-1 embeddings
            # when the caller asked to skip empty results.
            if self.skip_empty and len(batched_embeddings[i]) == 1:
                continue
            results[text_index].append(batched_embeddings[i])
            num_tokens_in_batch[text_index].append(len(tokens[i]))

        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
        for i in range(len(texts)):
            _result = results[i]
            if len(_result) == 0:
                # No chunk survived for this text: embed the empty string.
                average_embedded = await async_embed_with_retry(
                    self,
                    input="",
                    **self._invocation_params,
                )
                if not isinstance(average_embedded, dict):
                    average_embedded = average_embedded.dict()
                average = average_embedded["data"][0]["embedding"]
            else:
                # Length-weighted mean of the chunk embeddings.
                average = np.average(_result, axis=0, weights=num_tokens_in_batch[i])
            embeddings[i] = (average / np.linalg.norm(average)).tolist()

        return embeddings

    def embed_documents(
        self, texts: List[str], chunk_size: Optional[int] = 0
    ) -> List[List[float]]:
        """Call out to OpenAI's embedding endpoint for embedding search docs.

        Args:
            texts: The list of texts to embed.
            chunk_size: The chunk size of embeddings. If None or 0, will use
                the chunk size specified by the class.

        Returns:
            List of embeddings, one for each text.
        """
        # NOTE: to keep things simple, we assume the list may contain texts longer
        #       than the maximum context and use length-safe embedding function.
        engine = cast(str, self.deployment)
        return self._get_len_safe_embeddings(
            texts, engine=engine, chunk_size=chunk_size
        )

    async def aembed_documents(
        self, texts: List[str], chunk_size: Optional[int] = 0
    ) -> List[List[float]]:
        """Call out to OpenAI's embedding endpoint async for embedding search docs.

        Args:
            texts: The list of texts to embed.
            chunk_size: The chunk size of embeddings. If None or 0, will use
                the chunk size specified by the class.

        Returns:
            List of embeddings, one for each text.
        """
        # NOTE: to keep things simple, we assume the list may contain texts longer
        #       than the maximum context and use length-safe embedding function.
        engine = cast(str, self.deployment)
        return await self._aget_len_safe_embeddings(
            texts, engine=engine, chunk_size=chunk_size
        )

    def embed_query(self, text: str) -> List[float]:
        """Call out to OpenAI's embedding endpoint for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embedding for the text.
        """
        return self.embed_documents([text])[0]

    async def aembed_query(self, text: str) -> List[float]:
        """Call out to OpenAI's embedding endpoint async for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embedding for the text.
        """
        embeddings = await self.aembed_documents([text])
        return embeddings[0]