import copy
import logging
import re
from typing import Any, Iterable, List, Optional

# Expected import paths for this module's runtime dependencies.
from ai21.models import DocumentType
from langchain_core.documents import Document
from langchain_text_splitters import TextSplitter
from pydantic import SecretStr

from langchain_ai21.ai21_base import AI21Base

logger = logging.getLogger(__name__)


class AI21SemanticTextSplitter(TextSplitter):
    """Splitting text into coherent and readable units,
    based on distinct topics and lines.
    """

    def __init__(
        self,
        chunk_size: int = 0,
        chunk_overlap: int = 0,
        client: Optional[Any] = None,
        api_key: Optional[SecretStr] = None,
        api_host: Optional[str] = None,
        timeout_sec: Optional[float] = None,
        num_retries: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        """Create a new TextSplitter."""
        super().__init__(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            **kwargs,
        )

        self._segmentation = AI21Base(
            client=client,
            api_key=api_key,
            api_host=api_host,
            timeout_sec=timeout_sec,
            num_retries=num_retries,
        ).client.segmentation

    def split_text(self, source: str) -> List[str]:
        """Split text into multiple components.

        Args:
            source: Specifies the text input for text segmentation
        """
        response = self._segmentation.create(
            source=source, source_type=DocumentType.TEXT
        )

        segments = [segment.segment_text for segment in response.segments]

        if self._chunk_size > 0:
            return self._merge_splits_no_separator(segments)

        return segments

    def split_text_to_documents(self, source: str) -> List[Document]:
        """Split text into multiple documents.

        Args:
            source: Specifies the text input for text segmentation
        """
        response = self._segmentation.create(
            source=source, source_type=DocumentType.TEXT
        )

        return [
            Document(
                page_content=segment.segment_text,
                metadata={"source_type": segment.segment_type},
            )
            for segment in response.segments
        ]

    def create_documents(
        self, texts: List[str], metadatas: Optional[List[dict]] = None
    ) -> List[Document]:
        """Create documents from a list of texts."""
        _metadatas = metadatas or [{}] * len(texts)
        documents = []

        for i, text in enumerate(texts):
            normalized_text = self._normalized_text(text)
            index = 0
            previous_chunk_len = 0

            for chunk in self.split_text_to_documents(text):
                # Merge metadata from the user (if any) with metadata
                # returned by the segmentation API.
                metadata = copy.deepcopy(_metadatas[i])
                metadata.update(chunk.metadata)

                if self._add_start_index:
                    # Find the start index of the chunk within the
                    # whitespace-normalized source, searching from just before
                    # the end of the previous chunk to tolerate overlap.
                    offset = index + previous_chunk_len - self._chunk_overlap
                    normalized_chunk = self._normalized_text(chunk.page_content)
                    index = normalized_text.find(normalized_chunk, max(0, offset))
                    metadata["start_index"] = index
                    previous_chunk_len = len(normalized_chunk)

                documents.append(
                    Document(
                        page_content=chunk.page_content,
                        metadata=metadata,
                    )
                )

        return documents

    def _normalized_text(self, string: str) -> str:
        """Use a regular expression to remove all whitespace sequences."""
        return re.sub(r"\s+", "", string)

    def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
        """Override the default TextSplitter implementation;
        the separator is ignored."""
        return self._merge_splits_no_separator(splits)

    def _merge_splits_no_separator(self, splits: Iterable[str]) -> List[str]:
        """Merge splits into chunks.

        If a segment is bigger than chunk_size, it is left as is
        (it won't be cut to fit chunk_size).
        If a segment is smaller than chunk_size, it is merged with the
        following segments until chunk_size is reached.
        """
        chunks = []
        current_chunk = ""

        for split in splits:
            split_len = self._length_function(split)
            if split_len > self._chunk_size:
                logger.warning(
                    f"Split of length {split_len} "
                    f"exceeds chunk size {self._chunk_size}."
                )

            if self._length_function(current_chunk) + split_len > self._chunk_size:
                if current_chunk != "":
                    chunks.append(current_chunk)
                    current_chunk = ""

            current_chunk += split

        if current_chunk != "":
            chunks.append(current_chunk)

        return chunks
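

# --- Usage sketch (illustrative) ---
# A minimal example of the public API above. It assumes a valid AI21 API key
# (the string below is a placeholder) and network access, since segmentation
# is performed by the remote AI21 endpoint; treat it as a sketch rather than
# a guaranteed-runnable test.
splitter = AI21SemanticTextSplitter(api_key=SecretStr("your-ai21-api-key"))

source = (
    "Coffee is a beverage brewed from roasted coffee beans and is usually "
    "served hot. The history of chess, by contrast, can be traced back "
    "roughly 1500 years."
)

# Plain-text chunks, one per detected topic segment.
for chunk in splitter.split_text(source):
    print(chunk)

# The same segments wrapped as Documents, with the segment type recorded
# under metadata["source_type"].
for doc in splitter.split_text_to_documents(source):
    print(doc.metadata, doc.page_content)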
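

# --- Metadata and start-index sketch (illustrative) ---
# create_documents() merges caller-supplied metadata with the metadata
# returned by the segmentation API. When the splitter is constructed with
# add_start_index=True (an option inherited from TextSplitter), each Document
# also carries the chunk's start position within the whitespace-normalized
# source text. Same placeholder-key caveat as above.
indexing_splitter = AI21SemanticTextSplitter(
    api_key=SecretStr("your-ai21-api-key"),
    add_start_index=True,
)
docs = indexing_splitter.create_documents(
    texts=["Some long text spanning several distinct topics."],
    metadatas=[{"origin": "example"}],
)
for doc in docs:
    # Each metadata dict holds "origin", "source_type" and "start_index".
    print(doc.metadata)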
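

# --- Chunk-merging behavior, offline (illustrative) ---
# The merging step makes no API call, so its behavior can be observed
# directly (assuming client construction itself performs no request). With
# chunk_size=20 and the default length function (len), short segments
# accumulate into one chunk until the next segment would push it past 20
# characters; a segment longer than 20 is kept whole and only logs a
# warning. Calling a private method here is purely for illustration.
offline_splitter = AI21SemanticTextSplitter(
    chunk_size=20,
    api_key=SecretStr("dummy-key"),  # never sent: no request in this sketch
)
print(
    offline_splitter._merge_splits_no_separator(
        ["abcde", "fghij", "klmno", "p" * 30]
    )
)
# Expected output: ["abcdefghijklmno", "p" * 30]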