"""Spacy text splitter."""from__future__importannotationsfromtypingimportAnyfromlangchain_text_splitters.baseimportTextSplittertry:importspacyfromspacy.lang.enimportEnglishfromspacy.languageimportLanguage_HAS_SPACY=TrueexceptImportError:_HAS_SPACY=False
class SpacyTextSplitter(TextSplitter):
    """Splitting text using Spacy package.

    Per default, Spacy's `en_core_web_sm` model is used and its default
    max_length is 1000000 (it is the length of maximum character this model
    takes which can be increased for large files). For a faster, but
    potentially less accurate splitting, you can use `pipeline='sentencizer'`.
    """

    def __init__(
        self,
        separator: str = "\n\n",
        pipeline: str = "en_core_web_sm",
        max_length: int = 1_000_000,
        *,
        strip_whitespace: bool = True,
        **kwargs: Any,
    ) -> None:
        """Initialize the spacy text splitter.

        Args:
            separator: String inserted between merged sentence chunks.
            pipeline: Spacy model name, or ``"sentencizer"`` for the fast
                rule-based splitter.
            max_length: Maximum number of characters the spacy pipeline
                will accept in one document.
            strip_whitespace: If True, sentences are emitted without their
                trailing whitespace.
            **kwargs: Forwarded to the ``TextSplitter`` base class.
        """
        super().__init__(**kwargs)
        self._tokenizer = _make_spacy_pipeline_for_splitting(
            pipeline, max_length=max_length
        )
        self._separator = separator
        self._strip_whitespace = strip_whitespace

    def split_text(self, text: str) -> list[str]:
        """Split incoming text and return chunks."""
        doc = self._tokenizer(text)
        # Choose once whether trailing whitespace is kept, then stream the
        # sentences lazily into the base-class merge logic.
        if self._strip_whitespace:
            sentences = (sent.text for sent in doc.sents)
        else:
            sentences = (sent.text_with_ws for sent in doc.sents)
        return self._merge_splits(sentences, self._separator)
def _make_spacy_pipeline_for_splitting(
    pipeline: str, *, max_length: int = 1_000_000
) -> Language:
    """Create the spacy pipeline used to detect sentence boundaries.

    Args:
        pipeline: Either the literal ``"sentencizer"`` (fast, rule-based
            splitting on a blank English pipeline) or the name of a spacy
            model to load.
        max_length: Maximum number of characters the returned pipeline
            will accept in one document.

    Returns:
        A configured spacy ``Language`` object.

    Raises:
        ImportError: If spacy is not installed.
    """
    if not _HAS_SPACY:
        msg = "Spacy is not installed, please install it with `pip install spacy`."
        raise ImportError(msg)
    if pipeline == "sentencizer":
        # Rule-based splitter on a blank English pipeline: much faster,
        # potentially less accurate than a full statistical model.
        nlp: Language = English()
        nlp.add_pipe("sentencizer")
    else:
        # Full model; NER and tagging are irrelevant for sentence
        # splitting, so exclude them to speed up loading and inference.
        nlp = spacy.load(pipeline, exclude=["ner", "tagger"])
    nlp.max_length = max_length
    return nlp