Source code for langchain_experimental.text_splitter
"""Experimental **text splitter** based on semantic similarity."""importcopyimportrefromtypingimportAny,Dict,Iterable,List,Literal,Optional,Sequence,Tuple,castimportnumpyasnpfromlangchain_community.utils.mathimport(cosine_similarity,)fromlangchain_core.documentsimportBaseDocumentTransformer,Documentfromlangchain_core.embeddingsimportEmbeddings
[docs]
def combine_sentences(sentences: List[dict], buffer_size: int = 1) -> List[dict]:
    """Combine sentences based on buffer size.

    Args:
        sentences: List of sentences to combine.
        buffer_size: Number of sentences to combine. Defaults to 1.

    Returns:
        List of sentences with combined sentences.
    """
    # Go through each sentence dict
    for i in range(len(sentences)):
        # Create a string that will hold the sentences which are joined
        combined_sentence = ""

        # Add sentences before the current one, based on the buffer size.
        for j in range(i - buffer_size, i):
            # Check if the index j is not negative
            # (to avoid index out of range like on the first one)
            if j >= 0:
                # Add the sentence at index j to the combined_sentence string
                combined_sentence += sentences[j]["sentence"] + " "

        # Add the current sentence
        combined_sentence += sentences[i]["sentence"]

        # Add sentences after the current one, based on the buffer size
        for j in range(i + 1, i + 1 + buffer_size):
            # Check if the index j is within the range of the sentences list
            if j < len(sentences):
                # Add the sentence at index j to the combined_sentence string
                combined_sentence += " " + sentences[j]["sentence"]

        # Then add the whole thing to your dict
        # Store the combined sentence in the current sentence dict
        sentences[i]["combined_sentence"] = combined_sentence

    return sentences
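
# Illustrative sketch (not part of the library source): with the default
# buffer_size of 1, each sentence dict gains a "combined_sentence" built from
# its immediate neighbours.
_example = [
    {"sentence": "A.", "index": 0},
    {"sentence": "B.", "index": 1},
    {"sentence": "C.", "index": 2},
]
combine_sentences(_example, buffer_size=1)
# _example[0]["combined_sentence"] == "A. B."
# _example[1]["combined_sentence"] == "A. B. C."
# _example[2]["combined_sentence"] == "B. C."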
[docs]
def calculate_cosine_distances(sentences: List[dict]) -> Tuple[List[float], List[dict]]:
    """Calculate cosine distances between sentences.

    Args:
        sentences: List of sentences to calculate distances for.

    Returns:
        Tuple of distances and sentences.
    """
    distances = []
    for i in range(len(sentences) - 1):
        embedding_current = sentences[i]["combined_sentence_embedding"]
        embedding_next = sentences[i + 1]["combined_sentence_embedding"]

        # Calculate cosine similarity
        similarity = cosine_similarity([embedding_current], [embedding_next])[0][0]

        # Convert to cosine distance
        distance = 1 - similarity

        # Append cosine distance to the list
        distances.append(distance)

        # Store distance in the dictionary
        sentences[i]["distance_to_next"] = distance

    # Optionally handle the last sentence
    # sentences[-1]['distance_to_next'] = None  # or a default value

    return distances, sentences
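
# Illustrative sketch (not part of the library source): with toy 2-D
# embeddings, identical directions give a distance near 0.0 and orthogonal
# directions a distance near 1.0.
_toy = [
    {"sentence": "A.", "combined_sentence_embedding": [1.0, 0.0]},
    {"sentence": "B.", "combined_sentence_embedding": [1.0, 0.0]},
    {"sentence": "C.", "combined_sentence_embedding": [0.0, 1.0]},
]
_distances, _ = calculate_cosine_distances(_toy)
# _distances is approximately [0.0, 1.0]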
[docs]
class SemanticChunker(BaseDocumentTransformer):
    """Split the text based on semantic similarity.

    Taken from Greg Kamradt's wonderful notebook:
    https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb

    All credits to him.

    At a high level, this splits into sentences, then groups into groups of 3
    sentences, and then merges ones that are similar in the embedding space.
    """
    def _calculate_breakpoint_threshold(
        self, distances: List[float]
    ) -> Tuple[float, List[float]]:
        if self.breakpoint_threshold_type == "percentile":
            return cast(
                float,
                np.percentile(distances, self.breakpoint_threshold_amount),
            ), distances
        elif self.breakpoint_threshold_type == "standard_deviation":
            return cast(
                float,
                np.mean(distances)
                + self.breakpoint_threshold_amount * np.std(distances),
            ), distances
        elif self.breakpoint_threshold_type == "interquartile":
            q1, q3 = np.percentile(distances, [25, 75])
            iqr = q3 - q1

            return (
                np.mean(distances) + self.breakpoint_threshold_amount * iqr,
                distances,
            )
        elif self.breakpoint_threshold_type == "gradient":
            # Calculate the threshold based on the distribution of gradient of distance array.  # noqa: E501
            distance_gradient = np.gradient(distances, range(0, len(distances)))
            return cast(
                float,
                np.percentile(distance_gradient, self.breakpoint_threshold_amount),
            ), distance_gradient
        else:
            raise ValueError(
                f"Got unexpected `breakpoint_threshold_type`: "
                f"{self.breakpoint_threshold_type}"
            )

    def _threshold_from_clusters(self, distances: List[float]) -> float:
        """
        Calculate the threshold based on the number of chunks.
        Inverse of percentile method.
        """
        if self.number_of_chunks is None:
            raise ValueError(
                "This should never be called if `number_of_chunks` is None."
            )
        x1, y1 = len(distances), 0.0
        x2, y2 = 1.0, 100.0

        x = max(min(self.number_of_chunks, x1), x2)

        # Linear interpolation formula
        if x2 == x1:
            y = y2
        else:
            y = y1 + ((y2 - y1) / (x2 - x1)) * (x - x1)

        y = min(max(y, 0), 100)

        return cast(float, np.percentile(distances, y))

    def _calculate_sentence_distances(
        self, single_sentences_list: List[str]
    ) -> Tuple[List[float], List[dict]]:
        """Split text into multiple components."""

        _sentences = [
            {"sentence": x, "index": i} for i, x in enumerate(single_sentences_list)
        ]
        sentences = combine_sentences(_sentences, self.buffer_size)
        embeddings = self.embeddings.embed_documents(
            [x["combined_sentence"] for x in sentences]
        )
        for i, sentence in enumerate(sentences):
            sentence["combined_sentence_embedding"] = embeddings[i]

        return calculate_cosine_distances(sentences)

    def _get_single_sentences_list(self, text: str) -> List[str]:
        return re.split(self.sentence_split_regex, text)
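
    # Illustrative note (not in the original source): with the default
    # "percentile" threshold type and breakpoint_threshold_amount = 95,
    # distances = [0.05, 0.10, 0.40, 0.80] give
    # np.percentile(distances, 95) of roughly 0.74, so only the largest gap
    # (0.80) becomes a breakpoint.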
[docs]
    def split_text(
        self,
        text: str,
    ) -> List[str]:
        # Splitting the essay (by default on '.', '?', and '!')
        single_sentences_list = self._get_single_sentences_list(text)

        # having len(single_sentences_list) == 1 would cause the following
        # np.percentile to fail.
        if len(single_sentences_list) == 1:
            return single_sentences_list
        # similarly, the following np.gradient would fail
        if (
            self.breakpoint_threshold_type == "gradient"
            and len(single_sentences_list) == 2
        ):
            return single_sentences_list
        distances, sentences = self._calculate_sentence_distances(single_sentences_list)
        if self.number_of_chunks is not None:
            breakpoint_distance_threshold = self._threshold_from_clusters(distances)
            breakpoint_array = distances
        else:
            (
                breakpoint_distance_threshold,
                breakpoint_array,
            ) = self._calculate_breakpoint_threshold(distances)

        indices_above_thresh = [
            i
            for i, x in enumerate(breakpoint_array)
            if x > breakpoint_distance_threshold
        ]

        chunks = []
        start_index = 0

        # Iterate through the breakpoints to slice the sentences
        for index in indices_above_thresh:
            # The end index is the current breakpoint
            end_index = index

            # Slice the sentence_dicts from the current start index to the end index
            group = sentences[start_index : end_index + 1]
            combined_text = " ".join([d["sentence"] for d in group])

            # If specified, merge together small chunks.
            if (
                self.min_chunk_size is not None
                and len(combined_text) < self.min_chunk_size
            ):
                continue
            chunks.append(combined_text)

            # Update the start index for the next group
            start_index = index + 1

        # The last group, if any sentences remain
        if start_index < len(sentences):
            combined_text = " ".join([d["sentence"] for d in sentences[start_index:]])
            chunks.append(combined_text)
        return chunks
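
    # Illustrative note (not in the original source): with 5 sentences,
    # breakpoint_array = [0.1, 0.8, 0.2, 0.9] and a threshold of 0.7,
    # indices_above_thresh == [1, 3], yielding chunks from sentences 0-1,
    # sentences 2-3, and the remaining sentence 4.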
[docs]
    def create_documents(
        self, texts: List[str], metadatas: Optional[List[dict]] = None
    ) -> List[Document]:
        """Create documents from a list of texts."""
        _metadatas = metadatas or [{}] * len(texts)
        documents = []
        for i, text in enumerate(texts):
            start_index = 0
            for chunk in self.split_text(text):
                metadata = copy.deepcopy(_metadatas[i])
                if self._add_start_index:
                    metadata["start_index"] = start_index
                new_doc = Document(page_content=chunk, metadata=metadata)
                documents.append(new_doc)
                start_index += len(chunk)
        return documents
[docs]
    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Transform sequence of documents by splitting them."""
        return self.split_documents(list(documents))
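
# Illustrative usage sketch (not part of the module source): the constructor
# is not shown in this listing, but it takes an Embeddings implementation
# plus the breakpoint settings referenced above; OpenAIEmbeddings from
# langchain_openai is assumed here and requires an API key.
from langchain_openai import OpenAIEmbeddings

text_splitter = SemanticChunker(
    OpenAIEmbeddings(), breakpoint_threshold_type="percentile"
)
docs = text_splitter.create_documents(
    ["First topic. Still the first topic. Now something different."]
)
for doc in docs:
    print(doc.page_content)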