class WeightOnlyQuantPipeline(LLM):
    """Weight only quantized model.

    To use, you should have the `intel-extension-for-transformers` package
    and the `transformers` package installed.
    intel-extension-for-transformers:
        https://github.com/intel/intel-extension-for-transformers

    Example using from_model_id:
        .. code-block:: python

            from langchain_community.llms import WeightOnlyQuantPipeline
            from intel_extension_for_transformers.transformers import (
                WeightOnlyQuantConfig
            )
            config = WeightOnlyQuantConfig()
            hf = WeightOnlyQuantPipeline.from_model_id(
                model_id="google/flan-t5-large",
                task="text2text-generation",
                pipeline_kwargs={"max_new_tokens": 10},
                quantization_config=config,
            )

    Example passing pipeline in directly:
        .. code-block:: python

            from langchain_community.llms import WeightOnlyQuantPipeline
            from intel_extension_for_transformers.transformers import (
                AutoModelForSeq2SeqLM
            )
            from intel_extension_for_transformers.transformers import (
                WeightOnlyQuantConfig
            )
            from transformers import AutoTokenizer, pipeline

            model_id = "google/flan-t5-large"
            tokenizer = AutoTokenizer.from_pretrained(model_id)
            config = WeightOnlyQuantConfig()
            model = AutoModelForSeq2SeqLM.from_pretrained(
                model_id,
                quantization_config=config,
            )
            pipe = pipeline(
                "text2text-generation",
                model=model,
                tokenizer=tokenizer,
                max_new_tokens=10,
            )
            hf = WeightOnlyQuantPipeline(pipeline=pipe)
    """

    pipeline: Any = None  #: :meta private:
    model_id: str = DEFAULT_MODEL_ID
    """Model name or local path to use."""
    model_kwargs: Optional[dict] = None
    """Keyword arguments passed to the model."""
    pipeline_kwargs: Optional[dict] = None
    """Keyword arguments passed to the pipeline."""

    model_config = ConfigDict(
        extra="allow",
    )
    @classmethod
    def from_model_id(
        cls,
        model_id: str,
        task: str,
        device: Optional[int] = -1,
        device_map: Optional[str] = None,
        model_kwargs: Optional[dict] = None,
        pipeline_kwargs: Optional[dict] = None,
        load_in_4bit: Optional[bool] = False,
        load_in_8bit: Optional[bool] = False,
        quantization_config: Optional[Any] = None,
        **kwargs: Any,
    ) -> LLM:
        """Construct the pipeline object from model_id and task."""
        if device_map is not None and (isinstance(device, int) and device > -1):
            raise ValueError(
                "`device` and `device_map` cannot be set simultaneously!"
            )
        if importlib.util.find_spec("torch") is None:
            raise ValueError(
                "Weight only quantization pipeline only supports PyTorch for now!"
            )
        try:
            from intel_extension_for_transformers.transformers import (
                AutoModelForCausalLM,
                AutoModelForSeq2SeqLM,
            )
            from intel_extension_for_transformers.utils.utils import (
                is_ipex_available,
            )
            from transformers import AutoTokenizer
            from transformers import pipeline as hf_pipeline
        except ImportError:
            raise ImportError(
                "Could not import transformers python package. "
                "Please install it with `pip install transformers` "
                "and `pip install intel-extension-for-transformers`."
            )

        # A non-negative device index targets an Intel XPU; a negative index
        # (the default) falls back to CPU execution.
        if isinstance(device, int) and device >= 0:
            if not is_ipex_available():
                raise ValueError("Could not find an Intel GPU on this machine!")
            device_map = "xpu:" + str(device)
        elif isinstance(device, int) and device < 0:
            device = None

        if device is None:
            if device_map is None:
                device_map = "cpu"

        _model_kwargs = model_kwargs or {}
        tokenizer = AutoTokenizer.from_pretrained(model_id, **_model_kwargs)

        try:
            if task == "text-generation":
                model = AutoModelForCausalLM.from_pretrained(
                    model_id,
                    load_in_4bit=load_in_4bit,
                    load_in_8bit=load_in_8bit,
                    quantization_config=quantization_config,
                    use_llm_runtime=False,
                    device_map=device_map,
                    **_model_kwargs,
                )
            elif task in ("text2text-generation", "summarization"):
                model = AutoModelForSeq2SeqLM.from_pretrained(
                    model_id,
                    load_in_4bit=load_in_4bit,
                    load_in_8bit=load_in_8bit,
                    quantization_config=quantization_config,
                    use_llm_runtime=False,
                    device_map=device_map,
                    **_model_kwargs,
                )
            else:
                raise ValueError(
                    f"Got invalid task {task}, "
                    f"currently only {VALID_TASKS} are supported"
                )
        except ImportError as e:
            raise ImportError(
                f"Could not load the {task} model due to missing dependencies."
            ) from e

        if "trust_remote_code" in _model_kwargs:
            _model_kwargs = {
                k: v for k, v in _model_kwargs.items() if k != "trust_remote_code"
            }
        _pipeline_kwargs = pipeline_kwargs or {}
        pipeline = hf_pipeline(
            task=task,
            model=model,
            tokenizer=tokenizer,
            device=device,
            model_kwargs=_model_kwargs,
            **_pipeline_kwargs,
        )
        if pipeline.task not in VALID_TASKS:
            raise ValueError(
                f"Got invalid task {pipeline.task}, "
                f"currently only {VALID_TASKS} are supported"
            )
        return cls(
            pipeline=pipeline,
            model_id=model_id,
            model_kwargs=_model_kwargs,
            pipeline_kwargs=_pipeline_kwargs,
            **kwargs,
        )
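
    # Device-selection summary for `from_model_id` (a descriptive sketch of the
    # branches above, not an exhaustive specification):
    #   device=-1 (default), device_map=None -> runs on CPU (device_map="cpu")
    #   device>=0, device_map=None           -> Intel XPU, e.g. "xpu:0" (requires IPEX)
    #   device=-1, device_map given          -> placement delegated to `device_map`
    #   device>=0 and device_map both set    -> raises ValueError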
    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {
            "model_id": self.model_id,
            "model_kwargs": self.model_kwargs,
            "pipeline_kwargs": self.pipeline_kwargs,
        }

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "weight_only_quantization"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Call the HuggingFace model and return the output.

        Args:
            prompt: The prompt to use for generation.
            stop: A list of strings to stop generation when encountered.

        Returns:
            The generated text.

        Example:
            .. code-block:: python

                from langchain_community.llms import WeightOnlyQuantPipeline
                llm = WeightOnlyQuantPipeline.from_model_id(
                    model_id="google/flan-t5-large",
                    task="text2text-generation",
                )
                llm.invoke("This is a prompt.")
        """
        response = self.pipeline(prompt)
        if self.pipeline.task == "text-generation":
            # Text generation output includes the prompt, so strip it off.
            text = response[0]["generated_text"][len(prompt):]
        elif self.pipeline.task == "text2text-generation":
            text = response[0]["generated_text"]
        elif self.pipeline.task == "summarization":
            text = response[0]["summary_text"]
        else:
            raise ValueError(
                f"Got invalid task {self.pipeline.task}, "
                f"currently only {VALID_TASKS} are supported"
            )
        if stop:
            # The underlying pipeline has no native stop-sequence support, so
            # truncate the generated text at the first stop token instead.
            text = enforce_stop_tokens(text, stop)
        return text
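

# A minimal usage sketch, assuming `transformers` and
# `intel-extension-for-transformers` are installed and the
# "google/flan-t5-large" weights can be downloaded; `WeightOnlyQuantConfig()`
# is used with its library defaults. The second call illustrates the post-hoc
# stop-sequence truncation performed in `_call` via `enforce_stop_tokens`.
if __name__ == "__main__":
    from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig

    llm = WeightOnlyQuantPipeline.from_model_id(
        model_id="google/flan-t5-large",
        task="text2text-generation",
        quantization_config=WeightOnlyQuantConfig(),
        pipeline_kwargs={"max_new_tokens": 32},
    )
    # Plain invocation.
    print(llm.invoke("Translate to German: Hello, how are you?"))
    # Output truncated at the first newline through the `stop` list.
    print(llm.invoke("List three fruits, one per line:", stop=["\n"]))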