Source code for langchain_community.embeddings.huggingface
import warnings
from typing import Any, Dict, List, Optional

import requests
from langchain_core._api import deprecated, warn_deprecated
from langchain_core.embeddings import Embeddings
from pydantic import BaseModel, ConfigDict, Field, SecretStr

DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
DEFAULT_INSTRUCT_MODEL = "hkunlp/instructor-large"
DEFAULT_BGE_MODEL = "BAAI/bge-large-en"
DEFAULT_EMBED_INSTRUCTION = "Represent the document for retrieval: "
DEFAULT_QUERY_INSTRUCTION = (
    "Represent the question for retrieving supporting documents: "
)
DEFAULT_QUERY_BGE_INSTRUCTION_EN = (
    "Represent this question for searching relevant passages: "
)
DEFAULT_QUERY_BGE_INSTRUCTION_ZH = "为这个句子生成表示以用于检索相关文章:"
[docs]@deprecated(since="0.2.2",removal="1.0",alternative_import="langchain_huggingface.HuggingFaceEmbeddings",)classHuggingFaceEmbeddings(BaseModel,Embeddings):"""HuggingFace sentence_transformers embedding models. To use, you should have the ``sentence_transformers`` python package installed. Example: .. code-block:: python from langchain_community.embeddings import HuggingFaceEmbeddings model_name = "sentence-transformers/all-mpnet-base-v2" model_kwargs = {'device': 'cpu'} encode_kwargs = {'normalize_embeddings': False} hf = HuggingFaceEmbeddings( model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs ) """client:Any=None#: :meta private:model_name:str=DEFAULT_MODEL_NAME"""Model name to use."""cache_folder:Optional[str]=None"""Path to store models. Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable."""model_kwargs:Dict[str,Any]=Field(default_factory=dict)"""Keyword arguments to pass to the Sentence Transformer model, such as `device`, `prompts`, `default_prompt_name`, `revision`, `trust_remote_code`, or `token`. See also the Sentence Transformer documentation: https://sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer"""encode_kwargs:Dict[str,Any]=Field(default_factory=dict)"""Keyword arguments to pass when calling the `encode` method of the Sentence Transformer model, such as `prompt_name`, `prompt`, `batch_size`, `precision`, `normalize_embeddings`, and more. See also the Sentence Transformer documentation: https://sbert.net/docs/package_reference/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode"""multi_process:bool=False"""Run encode() on multiple GPUs."""show_progress:bool=False"""Whether to show a progress bar."""def__init__(self,**kwargs:Any):"""Initialize the sentence_transformer."""super().__init__(**kwargs)if"model_name"notinkwargs:since="0.2.16"removal="0.4.0"warn_deprecated(since=since,removal=removal,message=f"Default values for {self.__class__.__name__}.model_name"+f" were deprecated in LangChain {since} and will be removed in"+f" {removal}. Explicitly pass a model_name to the"+f" {self.__class__.__name__} constructor instead.",)try:importsentence_transformersexceptImportErrorasexc:raiseImportError("Could not import sentence_transformers python package. ""Please install it with `pip install sentence-transformers`.")fromexcself.client=sentence_transformers.SentenceTransformer(self.model_name,cache_folder=self.cache_folder,**self.model_kwargs)model_config=ConfigDict(extra="forbid",protected_namespaces=())

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Compute doc embeddings using a HuggingFace transformer model.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        import sentence_transformers

        texts = list(map(lambda x: x.replace("\n", " "), texts))
        if self.multi_process:
            pool = self.client.start_multi_process_pool()
            embeddings = self.client.encode_multi_process(texts, pool)
            sentence_transformers.SentenceTransformer.stop_multi_process_pool(pool)
        else:
            embeddings = self.client.encode(
                texts, show_progress_bar=self.show_progress, **self.encode_kwargs
            )

        return embeddings.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Compute query embeddings using a HuggingFace transformer model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self.embed_documents([text])[0]
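
A minimal usage sketch for the class above, assuming ``sentence-transformers`` is installed and the default model can be downloaded from the Hugging Face Hub (the class itself is deprecated in favor of ``langchain_huggingface.HuggingFaceEmbeddings``); the texts are illustrative:

from langchain_community.embeddings import HuggingFaceEmbeddings

embedder = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

# embed_documents strips newlines and returns one vector per input text;
# embed_query simply routes a single string through embed_documents.
doc_vectors = embedder.embed_documents(["First document.", "Second document."])
query_vector = embedder.embed_query("What does the first document say?")
print(len(doc_vectors), len(query_vector))  # 2 texts, 768-dim vectors for this model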


class HuggingFaceInstructEmbeddings(BaseModel, Embeddings):
    """Wrapper around sentence_transformers embedding models.

    To use, you should have the ``sentence_transformers``
    and ``InstructorEmbedding`` python packages installed.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import HuggingFaceInstructEmbeddings

            model_name = "hkunlp/instructor-large"
            model_kwargs = {'device': 'cpu'}
            encode_kwargs = {'normalize_embeddings': True}
            hf = HuggingFaceInstructEmbeddings(
                model_name=model_name,
                model_kwargs=model_kwargs,
                encode_kwargs=encode_kwargs
            )
    """

    client: Any = None  #: :meta private:
    model_name: str = DEFAULT_INSTRUCT_MODEL
    """Model name to use."""
    cache_folder: Optional[str] = None
    """Path to store models.
    Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable."""
    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Keyword arguments to pass to the model."""
    encode_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Keyword arguments to pass when calling the `encode` method of the model."""
    embed_instruction: str = DEFAULT_EMBED_INSTRUCTION
    """Instruction to use for embedding documents."""
    query_instruction: str = DEFAULT_QUERY_INSTRUCTION
    """Instruction to use for embedding query."""
    show_progress: bool = False
    """Whether to show a progress bar."""

    def __init__(self, **kwargs: Any):
        """Initialize the sentence_transformer."""
        super().__init__(**kwargs)
        if "model_name" not in kwargs:
            since = "0.2.16"
            removal = "0.4.0"
            warn_deprecated(
                since=since,
                removal=removal,
                message=f"Default values for {self.__class__.__name__}.model_name"
                + f" were deprecated in LangChain {since} and will be removed in"
                + f" {removal}. Explicitly pass a model_name to the"
                + f" {self.__class__.__name__} constructor instead.",
            )
        try:
            from InstructorEmbedding import INSTRUCTOR

            self.client = INSTRUCTOR(
                self.model_name, cache_folder=self.cache_folder, **self.model_kwargs
            )
        except ImportError as e:
            raise ImportError("Dependencies for InstructorEmbedding not found.") from e

        if "show_progress_bar" in self.encode_kwargs:
            warn_deprecated(
                since="0.2.5",
                removal="1.0",
                name="encode_kwargs['show_progress_bar']",
                alternative=f"the show_progress method on {self.__class__.__name__}",
            )
            if self.show_progress:
                warnings.warn(
                    "Both encode_kwargs['show_progress_bar'] and show_progress are "
                    "set; encode_kwargs['show_progress_bar'] takes precedence"
                )
            self.show_progress = self.encode_kwargs.pop("show_progress_bar")

    model_config = ConfigDict(extra="forbid", protected_namespaces=())

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Compute doc embeddings using a HuggingFace instruct model.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        instruction_pairs = [[self.embed_instruction, text] for text in texts]
        embeddings = self.client.encode(
            instruction_pairs,
            show_progress_bar=self.show_progress,
            **self.encode_kwargs,
        )
        return embeddings.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Compute query embeddings using a HuggingFace instruct model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        instruction_pair = [self.query_instruction, text]
        embedding = self.client.encode(
            [instruction_pair],
            show_progress_bar=self.show_progress,
            **self.encode_kwargs,
        )[0]
        return embedding.tolist()
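
Because every input is encoded as an [instruction, text] pair, the same text receives different embeddings when indexed as a document and when used as a query. A short sketch, assuming the ``InstructorEmbedding`` package (and a compatible ``sentence-transformers`` version) is installed; the texts are illustrative:

from langchain_community.embeddings import HuggingFaceInstructEmbeddings

embedder = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

# Documents are paired with embed_instruction, queries with query_instruction,
# so the retrieval-oriented asymmetry is handled by the wrapper itself.
doc_vectors = embedder.embed_documents(["LangChain wraps several embedding backends."])
query_vector = embedder.embed_query("Which embedding backends does LangChain wrap?")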


@deprecated(
    since="0.2.2",
    removal="1.0",
    alternative_import="langchain_huggingface.HuggingFaceEmbeddings",
)
class HuggingFaceBgeEmbeddings(BaseModel, Embeddings):
    """HuggingFace sentence_transformers embedding models.

    To use, you should have the ``sentence_transformers`` python package installed.
    To use Nomic, make sure the version of ``sentence_transformers`` is >= 2.3.0.

    Bge Example:
        .. code-block:: python

            from langchain_community.embeddings import HuggingFaceBgeEmbeddings

            model_name = "BAAI/bge-large-en-v1.5"
            model_kwargs = {'device': 'cpu'}
            encode_kwargs = {'normalize_embeddings': True}
            hf = HuggingFaceBgeEmbeddings(
                model_name=model_name,
                model_kwargs=model_kwargs,
                encode_kwargs=encode_kwargs
            )

    Nomic Example:
        .. code-block:: python

            from langchain_community.embeddings import HuggingFaceBgeEmbeddings

            model_name = "nomic-ai/nomic-embed-text-v1"
            model_kwargs = {
                'device': 'cpu',
                'trust_remote_code': True
            }
            encode_kwargs = {'normalize_embeddings': True}
            hf = HuggingFaceBgeEmbeddings(
                model_name=model_name,
                model_kwargs=model_kwargs,
                encode_kwargs=encode_kwargs,
                query_instruction="search_query:",
                embed_instruction="search_document:"
            )
    """

    client: Any = None  #: :meta private:
    model_name: str = DEFAULT_BGE_MODEL
    """Model name to use."""
    cache_folder: Optional[str] = None
    """Path to store models.
    Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable."""
    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Keyword arguments to pass to the model."""
    encode_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Keyword arguments to pass when calling the `encode` method of the model."""
    query_instruction: str = DEFAULT_QUERY_BGE_INSTRUCTION_EN
    """Instruction to use for embedding query."""
    embed_instruction: str = ""
    """Instruction to use for embedding document."""
    show_progress: bool = False
    """Whether to show a progress bar."""

    def __init__(self, **kwargs: Any):
        """Initialize the sentence_transformer."""
        super().__init__(**kwargs)
        if "model_name" not in kwargs:
            since = "0.2.5"
            removal = "0.4.0"
            warn_deprecated(
                since=since,
                removal=removal,
                message=f"Default values for {self.__class__.__name__}.model_name"
                + f" were deprecated in LangChain {since} and will be removed in"
                + f" {removal}. Explicitly pass a model_name to the"
                + f" {self.__class__.__name__} constructor instead.",
            )
        try:
            import sentence_transformers
        except ImportError as exc:
            raise ImportError(
                "Could not import sentence_transformers python package. "
                "Please install it with `pip install sentence-transformers`."
            ) from exc

        extra_model_kwargs = [
            "torch_dtype",
            "attn_implementation",
            "provider",
            "file_name",
            "export",
        ]
        extra_model_kwargs_dict = {
            k: self.model_kwargs.pop(k)
            for k in extra_model_kwargs
            if k in self.model_kwargs
        }
        self.client = sentence_transformers.SentenceTransformer(
            self.model_name,
            cache_folder=self.cache_folder,
            **self.model_kwargs,
            model_kwargs=extra_model_kwargs_dict,
        )

        if "-zh" in self.model_name:
            self.query_instruction = DEFAULT_QUERY_BGE_INSTRUCTION_ZH

        if "show_progress_bar" in self.encode_kwargs:
            warn_deprecated(
                since="0.2.5",
                removal="1.0",
                name="encode_kwargs['show_progress_bar']",
                alternative=f"the show_progress method on {self.__class__.__name__}",
            )
            if self.show_progress:
                warnings.warn(
                    "Both encode_kwargs['show_progress_bar'] and show_progress are "
                    "set; encode_kwargs['show_progress_bar'] takes precedence"
                )
            self.show_progress = self.encode_kwargs.pop("show_progress_bar")

    model_config = ConfigDict(extra="forbid", protected_namespaces=())

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Compute doc embeddings using a HuggingFace transformer model.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        texts = [self.embed_instruction + t.replace("\n", " ") for t in texts]
        embeddings = self.client.encode(
            texts, show_progress_bar=self.show_progress, **self.encode_kwargs
        )
        return embeddings.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Compute query embeddings using a HuggingFace transformer model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        text = text.replace("\n", " ")
        embedding = self.client.encode(
            self.query_instruction + text,
            show_progress_bar=self.show_progress,
            **self.encode_kwargs,
        )
        return embedding.tolist()
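
The BGE wrapper is deliberately asymmetric: ``embed_instruction`` defaults to an empty string, so documents are embedded as-is, while ``query_instruction`` is prepended to every query (and switches to the Chinese prompt for "-zh" model names). A minimal sketch, assuming ``sentence-transformers`` is installed; the texts are illustrative:

from langchain_community.embeddings import HuggingFaceBgeEmbeddings

embedder = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

# Documents go in unmodified; the query gets the retrieval instruction prefix.
doc_vectors = embedder.embed_documents(["BGE models are tuned for dense retrieval."])
query_vector = embedder.embed_query("What are BGE models tuned for?")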


class HuggingFaceInferenceAPIEmbeddings(BaseModel, Embeddings):
    """Embed texts using the HuggingFace API.

    Requires a HuggingFace Inference API key and a model name.
    """

    api_key: SecretStr
    """Your API key for the HuggingFace Inference API."""
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
    """The name of the model to use for text embeddings."""
    api_url: Optional[str] = None
    """Custom inference endpoint url. None for using default public url."""
    additional_headers: Dict[str, str] = {}
    """Pass additional headers to the requests library if needed."""

    model_config = ConfigDict(extra="forbid", protected_namespaces=())

    @property
    def _api_url(self) -> str:
        return self.api_url or self._default_api_url

    @property
    def _default_api_url(self) -> str:
        return (
            "https://api-inference.huggingface.co"
            "/pipeline"
            "/feature-extraction"
            f"/{self.model_name}"
        )

    @property
    def _headers(self) -> dict:
        return {
            "Authorization": f"Bearer {self.api_key.get_secret_value()}",
            **self.additional_headers,
        }

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Get the embeddings for a list of texts.

        Args:
            texts (Documents): A list of texts to get embeddings for.

        Returns:
            Embedded texts as List[List[float]], where each inner List[float]
                corresponds to a single input text.

        Example:
            .. code-block:: python

                from langchain_community.embeddings import (
                    HuggingFaceInferenceAPIEmbeddings,
                )

                hf_embeddings = HuggingFaceInferenceAPIEmbeddings(
                    api_key="your_api_key",
                    model_name="sentence-transformers/all-MiniLM-L6-v2",
                )
                texts = ["Hello, world!", "How are you?"]
                hf_embeddings.embed_documents(texts)
        """  # noqa: E501
        response = requests.post(
            self._api_url,
            headers=self._headers,
            json={
                "inputs": texts,
                "options": {"wait_for_model": True, "use_cache": True},
            },
        )
        return response.json()

    def embed_query(self, text: str) -> List[float]:
        """Compute query embeddings using a HuggingFace transformer model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self.embed_documents([text])[0]
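
Unlike the local wrappers above, this class sends texts to the hosted feature-extraction pipeline over HTTP and returns the parsed JSON response. A sketch, assuming a valid Hugging Face API token and network access; the environment variable name is illustrative:

import os

from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

embedder = HuggingFaceInferenceAPIEmbeddings(
    # Illustrative: read the token from an environment variable of your choice.
    api_key=os.environ["HF_API_TOKEN"],
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Both calls go through embed_documents, which POSTs to the feature-extraction
# endpoint with wait_for_model and use_cache enabled.
doc_vectors = embedder.embed_documents(["Hello, world!", "How are you?"])
query_vector = embedder.embed_query("Hello?")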