from typing import Any, List, Optional, cast

# Assumption: this module lives in the langchain_text_splitters package, where
# TextSplitter, Tokenizer, and split_text_on_tokens are defined in `base`.
from langchain_text_splitters.base import TextSplitter, Tokenizer, split_text_on_tokens


class SentenceTransformersTokenTextSplitter(TextSplitter):
    """Splitting text to tokens using a sentence-transformers model tokenizer."""
    def __init__(
        self,
        chunk_overlap: int = 50,
        model_name: str = "sentence-transformers/all-mpnet-base-v2",
        tokens_per_chunk: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        """Create a new TextSplitter."""
        super().__init__(**kwargs, chunk_overlap=chunk_overlap)

        try:
            from sentence_transformers import SentenceTransformer
        except ImportError:
            raise ImportError(
                "Could not import sentence_transformers python package. "
                "This is needed for SentenceTransformersTokenTextSplitter. "
                "Please install it with `pip install sentence-transformers`."
            )

        self.model_name = model_name
        self._model = SentenceTransformer(self.model_name)
        self.tokenizer = self._model.tokenizer
        self._initialize_chunk_configuration(tokens_per_chunk=tokens_per_chunk)
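    # Construction sketch (a minimal, hedged example; the argument values are
    # assumptions chosen for illustration, and the default model is downloaded
    # from the Hugging Face Hub on first use):
    #
    #     splitter = SentenceTransformersTokenTextSplitter(
    #         model_name="sentence-transformers/all-mpnet-base-v2",
    #         tokens_per_chunk=128,
    #         chunk_overlap=16,
    #     )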
    def _initialize_chunk_configuration(
        self, *, tokens_per_chunk: Optional[int]
    ) -> None:
        self.maximum_tokens_per_chunk = cast(int, self._model.max_seq_length)

        if tokens_per_chunk is None:
            self.tokens_per_chunk = self.maximum_tokens_per_chunk
        else:
            self.tokens_per_chunk = tokens_per_chunk

        if self.tokens_per_chunk > self.maximum_tokens_per_chunk:
            raise ValueError(
                f"The token limit of the model '{self.model_name}'"
                f" is: {self.maximum_tokens_per_chunk}."
                f" Argument tokens_per_chunk={self.tokens_per_chunk}"
                f" > maximum token limit."
            )
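    # For example (an assumption based on the published model card): the default
    # "sentence-transformers/all-mpnet-base-v2" reports max_seq_length == 384,
    # so tokens_per_chunk=512 would raise the ValueError above, while
    # tokens_per_chunk=256 would be accepted.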
    def split_text(self, text: str) -> List[str]:
        """Splits the input text into smaller chunks by splitting it on tokens.

        This method encodes the input text using a private `_encode` method,
        then strips the start and stop token IDs from the encoded result. It
        returns the processed segments as a list of strings.

        Args:
            text (str): The input text to be split.

        Returns:
            List[str]: A list of string components derived from the input text
                after encoding and processing.
        """

        def encode_strip_start_and_stop_token_ids(text: str) -> List[int]:
            return self._encode(text)[1:-1]

        tokenizer = Tokenizer(
            chunk_overlap=self._chunk_overlap,
            tokens_per_chunk=self.tokens_per_chunk,
            decode=self.tokenizer.decode,
            encode=encode_strip_start_and_stop_token_ids,
        )

        return split_text_on_tokens(text=text, tokenizer=tokenizer)
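    # Usage sketch for split_text (illustrative; the input text and sizes are
    # assumptions, and consecutive chunks overlap by `chunk_overlap` tokens):
    #
    #     splitter = SentenceTransformersTokenTextSplitter(
    #         tokens_per_chunk=128, chunk_overlap=16
    #     )
    #     chunks = splitter.split_text("a long document " * 1000)
    #     # each element of `chunks` decodes from at most 128 tokens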
    def count_tokens(self, *, text: str) -> int:
        """Counts the number of tokens in the given text.

        This method encodes the input text using a private `_encode` method and
        calculates the total number of tokens in the encoded result.

        Args:
            text (str): The input text for which the token count is calculated.

        Returns:
            int: The number of tokens in the encoded text.
        """
        return len(self._encode(text))
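# Minimal end-to-end sketch (the sample text and parameter values below are
# assumptions for illustration; assumes `sentence-transformers` is installed
# and the default model can be downloaded on first run):
if __name__ == "__main__":
    splitter = SentenceTransformersTokenTextSplitter(
        chunk_overlap=16,
        tokens_per_chunk=128,
    )
    sample = "Token-based splitting keeps chunks within a model's limit. " * 100
    # count_tokens encodes the full text, including special start/stop tokens.
    print("total tokens:", splitter.count_tokens(text=sample))
    for chunk in splitter.split_text(sample):
        print(splitter.count_tokens(text=chunk), "tokens in chunk")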