class CharacterTextSplitter(TextSplitter):
    """Split text on one separator, then merge the pieces into chunks.

    The separator is a literal string by default, or a regular expression
    when ``is_separator_regex`` is true. ``_keep_separator`` and
    ``_merge_splits`` are not defined here; they are expected to come from
    the ``TextSplitter`` base class.
    """

    def __init__(
        self, separator: str = "\n\n", is_separator_regex: bool = False, **kwargs: Any
    ) -> None:
        """Create a new CharacterTextSplitter.

        Args:
            separator: The string (or regex pattern) to split on.
            is_separator_regex: When true, ``separator`` is used as a regular
                expression as-is; otherwise it is escaped to a literal.
            **kwargs: Forwarded unchanged to ``TextSplitter.__init__``.
        """
        super().__init__(**kwargs)
        self._separator = separator
        self._is_separator_regex = is_separator_regex

    def split_text(self, text: str) -> List[str]:
        """Split incoming text and return chunks.

        Args:
            text: The text to split.

        Returns:
            Whatever ``self._merge_splits`` produces from the split pieces.
        """
        # First we naively split the large input into a bunch of smaller ones.
        pattern = (
            self._separator if self._is_separator_regex else re.escape(self._separator)
        )
        splits = _split_text_with_regex(text, pattern, self._keep_separator)

        # A pattern that begins with a lookaround assertion matches zero
        # characters, so the split removed nothing from the text. Re-joining
        # with the pattern source would inject regex syntax into the chunks.
        lookaround_prefixes = ("(?=", "(?<=", "(?!", "(?<!")
        is_lookaround = self._is_separator_regex and self._separator.startswith(
            lookaround_prefixes
        )

        # When separators were kept they are already part of the pieces, so
        # nothing must be re-inserted while merging.
        if self._keep_separator or is_lookaround:
            merge_separator = ""
        else:
            merge_separator = self._separator
        return self._merge_splits(splits, merge_separator)
def_split_text_with_regex(text:str,separator:str,keep_separator:Union[bool,Literal["start","end"]])->List[str]:# Now that we have the separator, split the textifseparator:ifkeep_separator:# The parentheses in the pattern keep the delimiters in the result._splits=re.split(f"({separator})",text)splits=(([_splits[i]+_splits[i+1]foriinrange(0,len(_splits)-1,2)])ifkeep_separator=="end"else([_splits[i]+_splits[i+1]foriinrange(1,len(_splits),2)]))iflen(_splits)%2==0:splits+=_splits[-1:]splits=((splits+[_splits[-1]])ifkeep_separator=="end"else([_splits[0]]+splits))else:splits=re.split(separator,text)else:splits=list(text)return[sforsinsplitsifs!=""]
class RecursiveCharacterTextSplitter(TextSplitter):
    """Split text by recursively trying a list of separators.

    Separators are tried in order. The first one found in the text is used
    to split it; any piece that is still too long is split again using only
    the separators that come after it in the list.

    ``_keep_separator``, ``_chunk_size``, ``_length_function`` and
    ``_merge_splits`` are not defined here; they are expected to come from
    the ``TextSplitter`` base class.
    """

    def __init__(
        self,
        separators: Optional[List[str]] = None,
        keep_separator: Union[bool, Literal["start", "end"]] = True,
        is_separator_regex: bool = False,
        **kwargs: Any,
    ) -> None:
        """Create a new RecursiveCharacterTextSplitter.

        Args:
            separators: Separators to try, in priority order. ``None`` or an
                empty list falls back to ``["\\n\\n", "\\n", " ", ""]``.
            keep_separator: Forwarded to ``TextSplitter.__init__``; also
                controls where matched separators end up in the pieces.
            is_separator_regex: When true, separators are used as regular
                expressions as-is; otherwise each one is escaped.
            **kwargs: Forwarded unchanged to ``TextSplitter.__init__``.
        """
        super().__init__(keep_separator=keep_separator, **kwargs)
        self._separators = separators or ["\n\n", "\n", " ", ""]
        self._is_separator_regex = is_separator_regex

    def _split_text(self, text: str, separators: List[str]) -> List[str]:
        """Split ``text`` with the first applicable separator and recurse.

        Args:
            text: The text to split.
            separators: Candidate separators in priority order; must not be
                empty.

        Returns:
            The list of chunks for ``text``.
        """
        final_chunks: List[str] = []

        # Pick the first separator that occurs in the text. The last entry
        # is the default when none match; an empty separator always applies
        # and leaves nothing further to recurse with.
        separator = separators[-1]
        remaining_separators: List[str] = []
        for index, candidate in enumerate(separators):
            if candidate == "":
                separator = candidate
                break
            candidate_pattern = (
                candidate if self._is_separator_regex else re.escape(candidate)
            )
            if re.search(candidate_pattern, text):
                separator = candidate
                remaining_separators = separators[index + 1 :]
                break

        pattern = separator if self._is_separator_regex else re.escape(separator)
        splits = _split_text_with_regex(text, pattern, self._keep_separator)

        # A pattern that begins with a lookaround assertion matches zero
        # characters, so nothing was removed by the split and the pattern
        # source must not be re-inserted while merging. The same holds when
        # separators were kept, because they are already inside the pieces.
        lookaround_prefixes = ("(?=", "(?<=", "(?!", "(?<!")
        is_lookaround = self._is_separator_regex and separator.startswith(
            lookaround_prefixes
        )
        if self._keep_separator or is_lookaround:
            merge_separator = ""
        else:
            merge_separator = separator

        # Now go merging things, recursively splitting longer texts.
        pending: List[str] = []
        for piece in splits:
            if self._length_function(piece) < self._chunk_size:
                pending.append(piece)
                continue
            # An oversized piece ends the current run of small pieces, so
            # flush that run first to preserve the original order.
            if pending:
                final_chunks.extend(self._merge_splits(pending, merge_separator))
                pending = []
            if remaining_separators:
                final_chunks.extend(self._split_text(piece, remaining_separators))
            else:
                # Nothing finer to split on: emit the oversized piece as is.
                final_chunks.append(piece)
        if pending:
            final_chunks.extend(self._merge_splits(pending, merge_separator))
        return final_chunks

    def split_text(self, text: str) -> List[str]:
        """Split the input text into smaller chunks based on predefined separators.

        Args:
            text (str): The input text to be split.

        Returns:
            List[str]: A list of text chunks obtained after splitting.
        """
        return self._split_text(text, self._separators)

    @classmethod
    def from_language(
        cls, language: Language, **kwargs: Any
    ) -> RecursiveCharacterTextSplitter:
        """Return an instance of this class based on a specific language.

        The separators for ``language`` are passed to the constructor with
        ``is_separator_regex=True``, i.e. they are used as regex patterns.

        Args:
            language (Language): The language to configure the text splitter for.
            **kwargs (Any): Additional keyword arguments to customize the splitter.

        Returns:
            RecursiveCharacterTextSplitter: An instance of the text splitter
            configured for the specified language.
        """
        separators = cls.get_separators_for_language(language)
        return cls(separators=separators, is_separator_regex=True, **kwargs)

    @staticmethod
    def get_separators_for_language(language: Language) -> List[str]:
        """Retrieve a list of separators specific to the given language.

        ``from_language`` uses these separators as regular expressions, so
        regex metacharacters that are meant literally are escaped.

        Args:
            language (Language): The language for which to get the separators.

        Returns:
            List[str]: A list of separators appropriate for the specified language.

        Raises:
            ValueError: If ``language`` has no separator list here.
        """
        if language == Language.C or language == Language.CPP:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along function definitions
                "\nvoid ",
                "\nint ",
                "\nfloat ",
                "\ndouble ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.GO:
            return [
                # Split along function definitions
                "\nfunc ",
                "\nvar ",
                "\nconst ",
                "\ntype ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.JAVA:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\nstatic ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.KOTLIN:
            return [
                # Split along class definitions
                "\nclass ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\ninternal ",
                "\ncompanion ",
                "\nfun ",
                "\nval ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nwhen ",
                "\ncase ",
                "\nelse ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.JS:
            return [
                # Split along function definitions
                "\nfunction ",
                "\nconst ",
                "\nlet ",
                "\nvar ",
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                "\ndefault ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.TS:
            return [
                "\nenum ",
                "\ninterface ",
                "\nnamespace ",
                "\ntype ",
                # Split along class definitions
                "\nclass ",
                # Split along function definitions
                "\nfunction ",
                "\nconst ",
                "\nlet ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                "\ncase ",
                "\ndefault ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.PHP:
            return [
                # Split along function definitions
                "\nfunction ",
                # Split along class definitions
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nforeach ",
                "\nwhile ",
                "\ndo ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.PROTO:
            return [
                # Split along message definitions
                "\nmessage ",
                # Split along service definitions
                "\nservice ",
                # Split along enum definitions
                "\nenum ",
                # Split along option definitions
                "\noption ",
                # Split along import statements
                "\nimport ",
                # Split along syntax declarations
                "\nsyntax ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.PYTHON:
            return [
                # First, try to split along class definitions
                "\nclass ",
                "\ndef ",
                "\n\tdef ",
                # Now split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.RST:
            return [
                # Split along section titles
                "\n=+\n",
                "\n-+\n",
                "\n\\*+\n",
                # Split along directive markers
                "\n\n.. *\n\n",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.RUBY:
            return [
                # Split along method definitions
                "\ndef ",
                "\nclass ",
                # Split along control flow statements
                "\nif ",
                "\nunless ",
                "\nwhile ",
                "\nfor ",
                "\ndo ",
                "\nbegin ",
                "\nrescue ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.ELIXIR:
            return [
                # Split along method function and module definition
                "\ndef ",
                "\ndefp ",
                "\ndefmodule ",
                "\ndefprotocol ",
                "\ndefmacro ",
                "\ndefmacrop ",
                # Split along control flow statements
                "\nif ",
                "\nunless ",
                "\nwhile ",
                "\ncase ",
                "\ncond ",
                "\nwith ",
                "\nfor ",
                "\ndo ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.RUST:
            return [
                # Split along function definitions
                "\nfn ",
                "\nconst ",
                "\nlet ",
                # Split along control flow statements
                "\nif ",
                "\nwhile ",
                "\nfor ",
                "\nloop ",
                "\nmatch ",
                "\nconst ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.SCALA:
            return [
                # Split along class definitions
                "\nclass ",
                "\nobject ",
                # Split along method definitions
                "\ndef ",
                "\nval ",
                "\nvar ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nmatch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.SWIFT:
            return [
                # Split along function definitions
                "\nfunc ",
                # Split along class definitions
                "\nclass ",
                "\nstruct ",
                "\nenum ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\ndo ",
                "\nswitch ",
                "\ncase ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.MARKDOWN:
            return [
                # First, try to split along Markdown headings (one to six
                # leading '#' characters)
                "\n#{1,6} ",
                # Note the alternative syntax for headings (below) is not
                # handled here
                # Heading level 2
                # ---------------
                # End of code block
                "```\n",
                # Horizontal lines made of three or more '*', '-' or '_'
                "\n\\*\\*\\*+\n",
                "\n---+\n",
                "\n___+\n",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.LATEX:
            return [
                # First, try to split along Latex sections
                "\n\\\\chapter{",
                "\n\\\\section{",
                "\n\\\\subsection{",
                "\n\\\\subsubsection{",
                # Now split by environments
                "\n\\\\begin{enumerate}",
                "\n\\\\begin{itemize}",
                "\n\\\\begin{description}",
                "\n\\\\begin{list}",
                "\n\\\\begin{quote}",
                "\n\\\\begin{quotation}",
                "\n\\\\begin{verse}",
                "\n\\\\begin{verbatim}",
                # Now split by math environments
                "\n\\\\begin{align}",
                # '$' is escaped: a bare '$' is the regex end-of-string
                # anchor and would never match a literal dollar sign.
                "\\$\\$",
                "\\$",
                # Now split by the normal type of lines
                " ",
                "",
            ]
        elif language == Language.HTML:
            return [
                # First, try to split along HTML tags
                "<body",
                "<div",
                "<p",
                "<br",
                "<li",
                "<h1",
                "<h2",
                "<h3",
                "<h4",
                "<h5",
                "<h6",
                "<span",
                "<table",
                "<tr",
                "<td",
                "<th",
                "<ul",
                "<ol",
                "<header",
                "<footer",
                "<nav",
                # Head
                "<head",
                "<style",
                "<script",
                "<meta",
                "<title",
                "",
            ]
        elif language == Language.CSHARP:
            return [
                "\ninterface ",
                "\nenum ",
                "\nimplements ",
                "\ndelegate ",
                "\nevent ",
                # Split along class definitions
                "\nclass ",
                "\nabstract ",
                # Split along method definitions
                "\npublic ",
                "\nprotected ",
                "\nprivate ",
                "\nstatic ",
                "\nreturn ",
                # Split along control flow statements
                "\nif ",
                "\ncontinue ",
                "\nfor ",
                "\nforeach ",
                "\nwhile ",
                "\nswitch ",
                "\nbreak ",
                "\ncase ",
                "\nelse ",
                # Split by exceptions
                "\ntry ",
                "\nthrow ",
                "\nfinally ",
                "\ncatch ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.SOL:
            return [
                # Split along compiler information definitions
                "\npragma ",
                "\nusing ",
                # Split along contract definitions
                "\ncontract ",
                "\ninterface ",
                "\nlibrary ",
                # Split along method definitions
                "\nconstructor ",
                "\ntype ",
                "\nfunction ",
                "\nevent ",
                "\nmodifier ",
                "\nerror ",
                "\nstruct ",
                "\nenum ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\ndo while ",
                "\nassembly ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.COBOL:
            return [
                # Split along divisions
                "\nIDENTIFICATION DIVISION.",
                "\nENVIRONMENT DIVISION.",
                "\nDATA DIVISION.",
                "\nPROCEDURE DIVISION.",
                # Split along sections within DATA DIVISION
                "\nWORKING-STORAGE SECTION.",
                "\nLINKAGE SECTION.",
                "\nFILE SECTION.",
                # Split along sections within PROCEDURE DIVISION
                "\nINPUT-OUTPUT SECTION.",
                # Split along paragraphs and common statements
                "\nOPEN ",
                "\nCLOSE ",
                "\nREAD ",
                "\nWRITE ",
                "\nIF ",
                "\nELSE ",
                "\nMOVE ",
                "\nPERFORM ",
                "\nUNTIL ",
                "\nVARYING ",
                "\nACCEPT ",
                "\nDISPLAY ",
                "\nSTOP RUN.",
                # Split by the normal type of lines
                "\n",
                " ",
                "",
            ]
        elif language == Language.LUA:
            return [
                # Split along variable and table definitions
                "\nlocal ",
                # Split along function definitions
                "\nfunction ",
                # Split along control flow statements
                "\nif ",
                "\nfor ",
                "\nwhile ",
                "\nrepeat ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.HASKELL:
            return [
                # Split along function definitions
                "\nmain :: ",
                "\nmain = ",
                "\nlet ",
                "\nin ",
                "\ndo ",
                "\nwhere ",
                "\n:: ",
                "\n= ",
                # Split along type declarations
                "\ndata ",
                "\nnewtype ",
                "\ntype ",
                "\n:: ",
                # Split along module declarations
                "\nmodule ",
                # Split along import statements
                "\nimport ",
                "\nqualified ",
                "\nimport qualified ",
                # Split along typeclass declarations
                "\nclass ",
                "\ninstance ",
                # Split along case expressions
                "\ncase ",
                # Split along guards in function definitions. '|' is
                # escaped: unescaped it is regex alternation and the
                # pattern would match any newline or any space.
                "\n\\| ",
                # Split along record field declarations
                "\ndata ",
                "\n= {",
                "\n, ",
                # Split by the normal type of lines
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language == Language.POWERSHELL:
            return [
                # Split along function definitions
                "\nfunction ",
                # Split along parameter declarations (escape parentheses)
                "\nparam ",
                # Split along control flow statements
                "\nif ",
                "\nforeach ",
                "\nfor ",
                "\nwhile ",
                "\nswitch ",
                # Split along class definitions (for PowerShell 5.0 and above)
                "\nclass ",
                # Split along try-catch-finally blocks
                "\ntry ",
                "\ncatch ",
                "\nfinally ",
                # Split by normal lines and empty spaces
                "\n\n",
                "\n",
                " ",
                "",
            ]
        elif language in Language._value2member_map_:
            # A valid Language value that has no separator list above.
            raise ValueError(f"Language {language} is not implemented yet!")
        else:
            raise ValueError(
                f"Language {language} is not supported! "
                f"Please choose from {list(Language)}"
            )