class TextSplitter(BaseDocumentTransformer, ABC):
    """Interface for splitting text into chunks."""
    def __init__(
        self,
        chunk_size: int = 4000,
        chunk_overlap: int = 200,
        length_function: Callable[[str], int] = len,
        keep_separator: Union[bool, Literal["start", "end"]] = False,
        add_start_index: bool = False,
        strip_whitespace: bool = True,
    ) -> None:
        """Create a new TextSplitter.

        Args:
            chunk_size: Maximum size of chunks to return.
            chunk_overlap: Overlap in characters between chunks.
            length_function: Function that measures the length of given chunks.
            keep_separator: Whether to keep the separator and where to place it
                in each corresponding chunk (True='start').
            add_start_index: If `True`, includes chunk's start index in metadata.
            strip_whitespace: If `True`, strips whitespace from the start and end
                of every document.
        """
        if chunk_overlap > chunk_size:
            raise ValueError(
                f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
                f"({chunk_size}), should be smaller."
            )
        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap
        self._length_function = length_function
        self._keep_separator = keep_separator
        self._add_start_index = add_start_index
        self._strip_whitespace = strip_whitespace
    @abstractmethod
    def split_text(self, text: str) -> List[str]:
        """Split text into multiple components."""
    def create_documents(
        self, texts: List[str], metadatas: Optional[List[dict]] = None
    ) -> List[Document]:
        """Create documents from a list of texts."""
        _metadatas = metadatas or [{}] * len(texts)
        documents = []
        for i, text in enumerate(texts):
            index = 0
            previous_chunk_len = 0
            for chunk in self.split_text(text):
                metadata = copy.deepcopy(_metadatas[i])
                if self._add_start_index:
                    # Search for the chunk starting just before the end of the
                    # previous chunk, so repeated substrings resolve to the
                    # correct occurrence.
                    offset = index + previous_chunk_len - self._chunk_overlap
                    index = text.find(chunk, max(0, offset))
                    metadata["start_index"] = index
                    previous_chunk_len = len(chunk)
                new_doc = Document(page_content=chunk, metadata=metadata)
                documents.append(new_doc)
        return documents
    def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
        text = separator.join(docs)
        if self._strip_whitespace:
            text = text.strip()
        if text == "":
            return None
        else:
            return text

    def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
        # We now want to combine these smaller pieces into medium size
        # chunks to send to the LLM.
        separator_len = self._length_function(separator)

        docs = []
        current_doc: List[str] = []
        total = 0
        for d in splits:
            _len = self._length_function(d)
            if (
                total + _len + (separator_len if len(current_doc) > 0 else 0)
                > self._chunk_size
            ):
                if total > self._chunk_size:
                    logger.warning(
                        f"Created a chunk of size {total}, "
                        f"which is longer than the specified {self._chunk_size}"
                    )
                if len(current_doc) > 0:
                    doc = self._join_docs(current_doc, separator)
                    if doc is not None:
                        docs.append(doc)
                    # Keep on popping if:
                    # - we have a larger chunk than in the chunk overlap
                    # - or if we still have any chunks and the length is long
                    while total > self._chunk_overlap or (
                        total + _len + (separator_len if len(current_doc) > 0 else 0)
                        > self._chunk_size
                        and total > 0
                    ):
                        total -= self._length_function(current_doc[0]) + (
                            separator_len if len(current_doc) > 1 else 0
                        )
                        current_doc = current_doc[1:]
            current_doc.append(d)
            total += _len + (separator_len if len(current_doc) > 1 else 0)
        doc = self._join_docs(current_doc, separator)
        if doc is not None:
            docs.append(doc)
        return docs
    @classmethod
    def from_huggingface_tokenizer(cls, tokenizer: Any, **kwargs: Any) -> TextSplitter:
        """Text splitter that uses HuggingFace tokenizer to count length."""
        try:
            from transformers import PreTrainedTokenizerBase

            if not isinstance(tokenizer, PreTrainedTokenizerBase):
                raise ValueError(
                    "Tokenizer received was not an instance of PreTrainedTokenizerBase"
                )

            def _huggingface_tokenizer_length(text: str) -> int:
                return len(tokenizer.encode(text))

        except ImportError:
            raise ValueError(
                "Could not import transformers python package. "
                "Please install it with `pip install transformers`."
            )
        return cls(length_function=_huggingface_tokenizer_length, **kwargs)
    @classmethod
    def from_tiktoken_encoder(
        cls: Type[TS],
        encoding_name: str = "gpt2",
        model_name: Optional[str] = None,
        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
        disallowed_special: Union[Literal["all"], Collection[str]] = "all",
        **kwargs: Any,
    ) -> TS:
        """Text splitter that uses tiktoken encoder to count length."""
        try:
            import tiktoken
        except ImportError:
            raise ImportError(
                "Could not import tiktoken python package. "
                "This is needed in order to calculate max_tokens_for_prompt. "
                "Please install it with `pip install tiktoken`."
            )

        if model_name is not None:
            enc = tiktoken.encoding_for_model(model_name)
        else:
            enc = tiktoken.get_encoding(encoding_name)

        def _tiktoken_encoder(text: str) -> int:
            return len(
                enc.encode(
                    text,
                    allowed_special=allowed_special,
                    disallowed_special=disallowed_special,
                )
            )

        if issubclass(cls, TokenTextSplitter):
            extra_kwargs = {
                "encoding_name": encoding_name,
                "model_name": model_name,
                "allowed_special": allowed_special,
                "disallowed_special": disallowed_special,
            }
            kwargs = {**kwargs, **extra_kwargs}

        return cls(length_function=_tiktoken_encoder, **kwargs)
    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Transform sequence of documents by splitting them."""
        return self.split_documents(list(documents))
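
# --- Illustrative example (not part of the library source). A minimal sketch
# of how the TextSplitter interface above might be used: `SentenceSplitter` is
# a hypothetical subclass written only for this example.
class SentenceSplitter(TextSplitter):
    def split_text(self, text: str) -> List[str]:
        # Naively split on ". ", then let _merge_splits recombine the pieces
        # into chunks no longer than chunk_size (as measured by length_function).
        return self._merge_splits(text.split(". "), separator=". ")


splitter = SentenceSplitter(chunk_size=100, chunk_overlap=0, add_start_index=True)
docs = splitter.create_documents(
    ["First sentence. Second sentence. Third sentence."],
    metadatas=[{"source": "example.txt"}],
)
# Each returned Document carries the original metadata plus a "start_index" key.
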
class TokenTextSplitter(TextSplitter):
    """Splitting text to tokens using model tokenizer."""
    def __init__(
        self,
        encoding_name: str = "gpt2",
        model_name: Optional[str] = None,
        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
        disallowed_special: Union[Literal["all"], Collection[str]] = "all",
        **kwargs: Any,
    ) -> None:
        """Create a new TextSplitter."""
        super().__init__(**kwargs)
        try:
            import tiktoken
        except ImportError:
            raise ImportError(
                "Could not import tiktoken python package. "
                "This is needed for TokenTextSplitter. "
                "Please install it with `pip install tiktoken`."
            )

        if model_name is not None:
            enc = tiktoken.encoding_for_model(model_name)
        else:
            enc = tiktoken.get_encoding(encoding_name)
        self._tokenizer = enc
        self._allowed_special = allowed_special
        self._disallowed_special = disallowed_special
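
# --- Illustrative example (not part of the library source). Chunk size and
# overlap are counted in tiktoken tokens here; `split_text`, defined further
# down in the full module, performs the token-based splitting.
token_splitter = TokenTextSplitter(encoding_name="gpt2", chunk_size=10, chunk_overlap=2)
token_chunks = token_splitter.split_text("a long passage of text to split by tokens")

# Equivalent construction via the base-class classmethod above; because the
# target class is a TokenTextSplitter subclass, the tiktoken arguments are
# forwarded into the constructor as well:
same_splitter = TokenTextSplitter.from_tiktoken_encoder(
    model_name="gpt-3.5-turbo", chunk_size=256, chunk_overlap=32
)
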
class Language(str, Enum):
    """Enum of the programming languages."""

    CPP = "cpp"
    GO = "go"
    JAVA = "java"
    KOTLIN = "kotlin"
    JS = "js"
    TS = "ts"
    PHP = "php"
    PROTO = "proto"
    PYTHON = "python"
    RST = "rst"
    RUBY = "ruby"
    RUST = "rust"
    SCALA = "scala"
    SWIFT = "swift"
    MARKDOWN = "markdown"
    LATEX = "latex"
    HTML = "html"
    SOL = "sol"
    CSHARP = "csharp"
    COBOL = "cobol"
    C = "c"
    LUA = "lua"
    PERL = "perl"
    HASKELL = "haskell"
    ELIXIR = "elixir"
    POWERSHELL = "powershell"
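
# --- Illustrative example (not part of the library source). Language values
# select language-aware separators, e.g. via
# RecursiveCharacterTextSplitter.from_language (defined elsewhere in this module):
#
#     python_splitter = RecursiveCharacterTextSplitter.from_language(
#         language=Language.PYTHON, chunk_size=60, chunk_overlap=0
#     )
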
@dataclass(frozen=True)
class Tokenizer:
    """Tokenizer data class."""

    chunk_overlap: int
    """Overlap in tokens between chunks"""
    tokens_per_chunk: int
    """Maximum number of tokens per chunk"""
    decode: Callable[[List[int]], str]
    """Function to decode a list of token ids to a string"""
    encode: Callable[[str], List[int]]
    """Function to encode a string to a list of token ids"""
def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> List[str]:
    """Split incoming text and return chunks using tokenizer."""
    splits: List[str] = []
    input_ids = tokenizer.encode(text)
    start_idx = 0
    cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
    chunk_ids = input_ids[start_idx:cur_idx]
    while start_idx < len(input_ids):
        splits.append(tokenizer.decode(chunk_ids))
        if cur_idx == len(input_ids):
            break
        # Slide the window forward, keeping chunk_overlap tokens from the
        # previous chunk.
        start_idx += tokenizer.tokens_per_chunk - tokenizer.chunk_overlap
        cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
        chunk_ids = input_ids[start_idx:cur_idx]
    return splits
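
# --- Illustrative example (not part of the library source). Wiring a tiktoken
# encoding into the Tokenizer dataclass and splitting with split_text_on_tokens.
# Assumes the optional `tiktoken` dependency is installed.
import tiktoken

_enc = tiktoken.get_encoding("gpt2")
_example_tokenizer = Tokenizer(
    chunk_overlap=5,
    tokens_per_chunk=50,
    decode=_enc.decode,
    encode=lambda t: _enc.encode(t, disallowed_special=()),
)
_chunks = split_text_on_tokens(
    text="Some long document text ...", tokenizer=_example_tokenizer
)
# Consecutive chunks share 5 tokens of overlap and contain at most 50 tokens each.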