import copy
import json
from typing import Any, Dict, List, Optional

from langchain_core.documents import Document


class RecursiveJsonSplitter:
    """Splits JSON data into smaller, structured chunks while preserving hierarchy.

    This class provides methods to split JSON data into smaller dictionaries or
    JSON-formatted strings based on configurable maximum and minimum chunk sizes.
    It supports nested JSON structures, optionally converts lists into
    dictionaries for better chunking, and allows the creation of document
    objects for further use.

    Attributes:
        max_chunk_size (int): The maximum size for each chunk. Defaults to 2000.
        min_chunk_size (int): The minimum size for each chunk, derived from
            `max_chunk_size` if not explicitly provided.
    """
    def __init__(
        self,
        max_chunk_size: int = 2000,
        min_chunk_size: Optional[int] = None,
    ):
        """Initialize the chunk size configuration for text processing.

        This constructor sets up the maximum and minimum chunk sizes, ensuring
        that `min_chunk_size` defaults to a value slightly smaller than
        `max_chunk_size` if not explicitly provided.

        Args:
            max_chunk_size (int): The maximum size for a chunk. Defaults to 2000.
            min_chunk_size (Optional[int]): The minimum size for a chunk. If None,
                defaults to the maximum chunk size minus 200, with a lower bound
                of 50.

        Attributes:
            max_chunk_size (int): The configured maximum size for each chunk.
            min_chunk_size (int): The configured minimum size for each chunk,
                derived from `max_chunk_size` if not explicitly provided.
        """
        super().__init__()
        self.max_chunk_size = max_chunk_size
        self.min_chunk_size = (
            min_chunk_size
            if min_chunk_size is not None
            else max(max_chunk_size - 200, 50)
        )
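    # Illustrative derivation of the default above (a sketch, not part of the
    # API): with max_chunk_size=2000 the default min_chunk_size is 1800
    # (2000 - 200), while with max_chunk_size=100 the floor applies and
    # min_chunk_size becomes max(100 - 200, 50) == 50.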
    @staticmethod
    def _json_size(data: Dict) -> int:
        """Calculate the size of the serialized JSON object."""
        return len(json.dumps(data))

    @staticmethod
    def _set_nested_dict(d: Dict, path: List[str], value: Any) -> None:
        """Set a value in a nested dictionary based on the given path."""
        for key in path[:-1]:
            d = d.setdefault(key, {})
        d[path[-1]] = value

    def _list_to_dict_preprocessing(self, data: Any) -> Any:
        if isinstance(data, dict):
            # Process each key-value pair in the dictionary
            return {k: self._list_to_dict_preprocessing(v) for k, v in data.items()}
        elif isinstance(data, list):
            # Convert the list to a dictionary with index-based keys
            return {
                str(i): self._list_to_dict_preprocessing(item)
                for i, item in enumerate(data)
            }
        else:
            # Base case: the item is neither a dict nor a list, so return it unchanged
            return data

    def _json_split(
        self,
        data: Dict[str, Any],
        current_path: Optional[List[str]] = None,
        chunks: Optional[List[Dict]] = None,
    ) -> List[Dict]:
        """Split JSON into dictionaries of at most `max_chunk_size` while preserving structure."""
        current_path = current_path or []
        chunks = chunks if chunks is not None else [{}]
        if isinstance(data, dict):
            for key, value in data.items():
                new_path = current_path + [key]
                chunk_size = self._json_size(chunks[-1])
                size = self._json_size({key: value})
                remaining = self.max_chunk_size - chunk_size

                if size < remaining:
                    # Add item to current chunk
                    self._set_nested_dict(chunks[-1], new_path, value)
                else:
                    if chunk_size >= self.min_chunk_size:
                        # Chunk is big enough, start a new chunk
                        chunks.append({})

                    # Recurse into the oversized value
                    self._json_split(value, new_path, chunks)
        else:
            # Handle a single (non-dict) item
            self._set_nested_dict(chunks[-1], current_path, data)
        return chunks
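    # A hedged sketch of the helpers above (comments only, not executed):
    #
    #   _list_to_dict_preprocessing({"a": [1, {"b": 2}]})
    #   -> {"a": {"0": 1, "1": {"b": 2}}}
    #
    # _json_split packs key/value pairs greedily: a pair joins the current
    # chunk while the serialized size stays under max_chunk_size; otherwise a
    # new chunk is started (once the current one has reached min_chunk_size)
    # and the oversized value is recursed into along its key path.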
    def split_json(
        self,
        json_data: Dict[str, Any],
        convert_lists: bool = False,
    ) -> List[Dict]:
        """Splits JSON into a list of JSON chunks."""
        if convert_lists:
            chunks = self._json_split(self._list_to_dict_preprocessing(json_data))
        else:
            chunks = self._json_split(json_data)

        # Remove the last chunk if it's empty
        if not chunks[-1]:
            chunks.pop()
        return chunks
    def split_text(
        self,
        json_data: Dict[str, Any],
        convert_lists: bool = False,
        ensure_ascii: bool = True,
    ) -> List[str]:
        """Splits JSON into a list of JSON-formatted strings."""
        chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)

        # Convert each chunk to a JSON string
        return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks]
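    # Aside on ensure_ascii (an observation about json.dumps, not extra API):
    # with the default ensure_ascii=True a value such as "café" is escaped to
    # "caf\u00e9" in the output strings, while ensure_ascii=False keeps the
    # literal UTF-8 character.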
    def create_documents(
        self,
        texts: List[Dict],
        convert_lists: bool = False,
        ensure_ascii: bool = True,
        metadatas: Optional[List[dict]] = None,
    ) -> List[Document]:
        """Create documents from a list of json objects (Dict)."""
        _metadatas = metadatas or [{}] * len(texts)
        documents = []
        for i, text in enumerate(texts):
            for chunk in self.split_text(
                json_data=text,
                convert_lists=convert_lists,
                ensure_ascii=ensure_ascii,
            ):
                metadata = copy.deepcopy(_metadatas[i])
                new_doc = Document(page_content=chunk, metadata=metadata)
                documents.append(new_doc)
        return documents
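
# A minimal usage sketch (illustrative only; the sample data and chunk sizes
# below are assumptions, not part of the class):
if __name__ == "__main__":
    splitter = RecursiveJsonSplitter(max_chunk_size=60)
    data = {
        "user": {"name": "Ada", "roles": ["admin", "editor"]},
        "settings": {"theme": "dark", "lang": "en"},
    }

    # As dictionaries; convert_lists=True rewrites ["admin", "editor"] as
    # {"0": "admin", "1": "editor"} so list items can be chunked too.
    for chunk in splitter.split_json(data, convert_lists=True):
        print(chunk)

    # As JSON-formatted strings.
    for text in splitter.split_text(data, convert_lists=True):
        print(text)

    # As Document objects, one metadata dict per input object.
    docs = splitter.create_documents([data], metadatas=[{"source": "example"}])
    print(docs[0].page_content)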