"""NLTK text splitter."""

from __future__ import annotations

from typing import Any

from langchain_text_splitters.base import TextSplitter

# NLTK is an optional dependency: record availability at import time so that
# NLTKTextSplitter.__init__ can raise a helpful ImportError only when the
# splitter is actually used.
try:
    import nltk

    _HAS_NLTK = True
except ImportError:
    _HAS_NLTK = False
class NLTKTextSplitter(TextSplitter):
    """Splitting text using NLTK package."""

    def __init__(
        self,
        separator: str = "\n\n",
        language: str = "english",
        *,
        use_span_tokenize: bool = False,
        **kwargs: Any,
    ) -> None:
        """Initialize the NLTK splitter.

        Args:
            separator: Separator inserted between sentences when merging chunks.
                Must be ``""`` when ``use_span_tokenize`` is True.
            language: Language passed to the NLTK tokenizer.
            use_span_tokenize: If True, use the Punkt span tokenizer so that the
                original inter-sentence whitespace is preserved in the output.
            **kwargs: Forwarded to the ``TextSplitter`` base class.

        Raises:
            ValueError: If ``use_span_tokenize`` is True and ``separator`` is
                non-empty (the span tokenizer already keeps original spacing).
            ImportError: If the ``nltk`` package is not installed.
        """
        super().__init__(**kwargs)
        self._separator = separator
        self._language = language
        self._use_span_tokenize = use_span_tokenize
        if self._use_span_tokenize and self._separator:
            msg = "When use_span_tokenize is True, separator should be ''"
            raise ValueError(msg)
        if not _HAS_NLTK:
            msg = "NLTK is not installed, please install it with `pip install nltk`."
            raise ImportError(msg)
        if self._use_span_tokenize:
            # Private NLTK helper returns a cached PunktTokenizer for the
            # language; needed because span_tokenize is an instance method.
            self._tokenizer = nltk.tokenize._get_punkt_tokenizer(self._language)  # noqa: SLF001
        else:
            self._tokenizer = nltk.tokenize.sent_tokenize

    def split_text(self, text: str) -> list[str]:
        """Split incoming text and return chunks."""
        # First we naively split the large input into a bunch of smaller ones.
        if self._use_span_tokenize:
            spans = list(self._tokenizer.span_tokenize(text))
            splits = []
            for i, (start, end) in enumerate(spans):
                if i > 0:
                    # Re-attach the whitespace between the previous sentence
                    # and this one so no characters of the input are lost.
                    prev_end = spans[i - 1][1]
                    sentence = text[prev_end:start] + text[start:end]
                else:
                    sentence = text[start:end]
                splits.append(sentence)
        else:
            splits = self._tokenizer(text, language=self._language)
        # Merge sentence-level splits back up to the configured chunk size.
        return self._merge_splits(splits, self._separator)