from typing import Any, List

from langchain_text_splitters.base import TextSplitter


class NLTKTextSplitter(TextSplitter):
    """Splitting text using NLTK package."""

    def __init__(
        self,
        separator: str = "\n\n",
        language: str = "english",
        *,
        use_span_tokenize: bool = False,
        **kwargs: Any,
    ) -> None:
        """Initialize the NLTK splitter."""
        super().__init__(**kwargs)
        self._separator = separator
        self._language = language
        self._use_span_tokenize = use_span_tokenize
        if self._use_span_tokenize and self._separator != "":
            raise ValueError("When use_span_tokenize is True, separator should be ''")
        try:
            if self._use_span_tokenize:
                from nltk.tokenize import _get_punkt_tokenizer

                self._tokenizer = _get_punkt_tokenizer(self._language)
            else:
                from nltk.tokenize import sent_tokenize

                self._tokenizer = sent_tokenize
        except ImportError:
            raise ImportError(
                "NLTK is not installed, please install it with `pip install nltk`."
            )

    def split_text(self, text: str) -> List[str]:
        """Split incoming text and return chunks."""
        # First we naively split the large input into a bunch of smaller ones.
        if self._use_span_tokenize:
            spans = list(self._tokenizer.span_tokenize(text))
            splits = []
            for i, (start, end) in enumerate(spans):
                if i > 0:
                    # Carry the whitespace between the previous sentence and
                    # this one, so the splits concatenate back to the source.
                    prev_end = spans[i - 1][1]
                    sentence = text[prev_end:start] + text[start:end]
                else:
                    sentence = text[start:end]
                splits.append(sentence)
        else:
            splits = self._tokenizer(text, language=self._language)
        return self._merge_splits(splits, self._separator)
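The span branch of split_text keeps each sentence together with the whitespace that preceded it, so the raw splits reassemble into the original text without gaps. A minimal sketch of that invariant, using NLTK's public PunktSentenceTokenizer in place of the private _get_punkt_tokenizer helper the class relies on:

# Sketch only; assumes nltk is installed. PunktSentenceTokenizer is a stand-in
# for the private helper used above.
from nltk.tokenize import PunktSentenceTokenizer

text = "First sentence.  Second one!\nThird."
spans = list(PunktSentenceTokenizer().span_tokenize(text))

pieces = []
for i, (start, end) in enumerate(spans):
    # Prepend the inter-sentence whitespace to the sentence that follows it.
    prev_end = spans[i - 1][1] if i > 0 else start
    pieces.append(text[prev_end:start] + text[start:end])

# By construction the pieces tile the source contiguously from the first span.
assert "".join(pieces) == text[spans[0][0] : spans[-1][1]]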
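For context, a minimal usage sketch, not part of the listing above: it assumes `pip install nltk langchain-text-splitters` and a downloaded Punkt model (named punkt or punkt_tab depending on the NLTK version).

# Hedged usage sketch; setup steps are assumptions, not from the listing.
import nltk

nltk.download("punkt")  # newer NLTK releases use the "punkt_tab" resource

from langchain_text_splitters import NLTKTextSplitter

text = "NLTK finds sentence boundaries. Each sentence becomes one split. " * 20

# Default mode: sentences are re-joined with the separator, up to chunk_size.
splitter = NLTKTextSplitter(chunk_size=200, chunk_overlap=0)
print(splitter.split_text(text)[0])

# Span mode: requires separator="" and preserves inter-sentence whitespace,
# so chunks can be mapped back to offsets in the original document.
span_splitter = NLTKTextSplitter(
    separator="", use_span_tokenize=True, chunk_size=200, chunk_overlap=0
)
print(span_splitter.split_text(text)[0])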