class SentenceTransformersTokenTextSplitter(TextSplitter):
    """Splitting text to tokens using sentence model tokenizer.

    Uses the tokenizer of a ``sentence-transformers`` model to count tokens,
    so chunks line up with what the embedding model will actually see.
    """
[docs]def__init__(self,chunk_overlap:int=50,model_name:str="sentence-transformers/all-mpnet-base-v2",tokens_per_chunk:Optional[int]=None,**kwargs:Any,)->None:"""Create a new TextSplitter."""super().__init__(**kwargs,chunk_overlap=chunk_overlap)try:fromsentence_transformersimportSentenceTransformerexceptImportError:raiseImportError("Could not import sentence_transformer python package. ""This is needed in order to for SentenceTransformersTokenTextSplitter. ""Please install it with `pip install sentence-transformers`.")self.model_name=model_nameself._model=SentenceTransformer(self.model_name)self.tokenizer=self._model.tokenizerself._initialize_chunk_configuration(tokens_per_chunk=tokens_per_chunk)
def_initialize_chunk_configuration(self,*,tokens_per_chunk:Optional[int])->None:self.maximum_tokens_per_chunk=cast(int,self._model.max_seq_length)iftokens_per_chunkisNone:self.tokens_per_chunk=self.maximum_tokens_per_chunkelse:self.tokens_per_chunk=tokens_per_chunkifself.tokens_per_chunk>self.maximum_tokens_per_chunk:raiseValueError(f"The token limit of the models '{self.model_name}'"f" is: {self.maximum_tokens_per_chunk}."f" Argument tokens_per_chunk={self.tokens_per_chunk}"f" > maximum token limit.")