from typing import Any, List

from langchain_text_splitters.base import TextSplitter  # base class providing _merge_splits


class SpacyTextSplitter(TextSplitter):
    """Splitting text using the Spacy package.

    By default, Spacy's `en_core_web_sm` model is used, and its default
    max_length is 1_000_000 (the maximum number of characters the model
    can process, which can be increased for large files). For faster but
    potentially less accurate splitting, use `pipeline='sentencizer'`.
    """
    def __init__(
        self,
        separator: str = "\n\n",
        pipeline: str = "en_core_web_sm",
        max_length: int = 1_000_000,
        *,
        strip_whitespace: bool = True,
        **kwargs: Any,
    ) -> None:
        """Initialize the spacy text splitter."""
        super().__init__(**kwargs)
        self._tokenizer = _make_spacy_pipeline_for_splitting(
            pipeline, max_length=max_length
        )
        self._separator = separator
        self._strip_whitespace = strip_whitespace
    def split_text(self, text: str) -> List[str]:
        """Split incoming text and return chunks."""
        splits = (
            s.text if self._strip_whitespace else s.text_with_ws
            for s in self._tokenizer(text).sents
        )
        return self._merge_splits(splits, self._separator)
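
# Illustrative usage (a sketch, not part of the library source). Assumes the
# `en_core_web_sm` model has already been downloaded with
# `python -m spacy download en_core_web_sm`; `chunk_size` and `chunk_overlap`
# are inherited from TextSplitter:
#
#     splitter = SpacyTextSplitter(chunk_size=1000, chunk_overlap=100)
#     chunks = splitter.split_text(some_long_document)
#
# For large inputs where loading a full model is too slow, the rule-based
# path avoids one entirely:
#
#     fast_splitter = SpacyTextSplitter(pipeline="sentencizer")
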
def _make_spacy_pipeline_for_splitting(
    pipeline: str, *, max_length: int = 1_000_000
) -> Any:  # avoid importing spacy
    try:
        import spacy
    except ImportError:
        raise ImportError(
            "Spacy is not installed, please install it with `pip install spacy`."
        )
    if pipeline == "sentencizer":
        # Lightweight rule-based sentence segmentation; no trained model needed.
        from spacy.lang.en import English

        sentencizer: Any = English()
        sentencizer.add_pipe("sentencizer")
    else:
        # Load the named model, excluding components that are unnecessary
        # for sentence segmentation, to keep the pipeline fast.
        sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])
        sentencizer.max_length = max_length
    return sentencizer
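

# A minimal, self-contained demo of the helper above (an illustrative sketch,
# not part of the library source). The "sentencizer" branch is used here
# because it requires no downloaded model; the sample text is arbitrary and
# chosen to show where rule-based splitting can be less accurate.
if __name__ == "__main__":
    nlp = _make_spacy_pipeline_for_splitting("sentencizer")
    doc = nlp(
        "Rule-based splitting is fast. It needs no trained model. "
        "Accuracy can suffer on tricky punctuation, e.g. abbreviations."
    )
    for sent in doc.sents:
        print(repr(sent.text))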