    @staticmethod
    def _json_size(data: Dict) -> int:
        """Calculate the size of the serialized JSON object."""
        return len(json.dumps(data))

    @staticmethod
    def _set_nested_dict(d: Dict, path: List[str], value: Any) -> None:
        """Set a value in a nested dictionary based on the given path."""
        for key in path[:-1]:
            d = d.setdefault(key, {})
        d[path[-1]] = value

    def _list_to_dict_preprocessing(self, data: Any) -> Any:
        if isinstance(data, dict):
            # Process each key-value pair in the dictionary
            return {k: self._list_to_dict_preprocessing(v) for k, v in data.items()}
        elif isinstance(data, list):
            # Convert the list to a dictionary with index-based keys
            return {
                str(i): self._list_to_dict_preprocessing(item)
                for i, item in enumerate(data)
            }
        else:
            # Base case: the item is neither a dict nor a list, so return it unchanged
            return data

    def _json_split(
        self,
        data: Dict[str, Any],
        current_path: Optional[List[str]] = None,
        chunks: Optional[List[Dict]] = None,
    ) -> List[Dict]:
        """Split json into maximum size dictionaries while preserving structure."""
        current_path = current_path or []
        chunks = chunks if chunks is not None else [{}]
        if isinstance(data, dict):
            for key, value in data.items():
                new_path = current_path + [key]
                chunk_size = self._json_size(chunks[-1])
                size = self._json_size({key: value})
                remaining = self.max_chunk_size - chunk_size

                if size < remaining:
                    # Add item to current chunk
                    self._set_nested_dict(chunks[-1], new_path, value)
                else:
                    if chunk_size >= self.min_chunk_size:
                        # Chunk is big enough, start a new chunk
                        chunks.append({})

                    # Recurse into the value along the extended key path
                    self._json_split(value, new_path, chunks)
        else:
            # Handle a single non-dict item: write it at the current path
            self._set_nested_dict(chunks[-1], current_path, data)
        return chunks
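    # A minimal sketch of what the helpers above do (hypothetical inputs; the
    # splitter instance and its max_chunk_size / min_chunk_size come from the
    # class constructor, which is assumed here rather than shown in this excerpt):
    #
    #   _list_to_dict_preprocessing({"tags": ["a", "b"]})
    #   # -> {"tags": {"0": "a", "1": "b"}}   lists become index-keyed dicts
    #
    #   _json_size({"k": "value"})
    #   # -> len('{"k": "value"}') == 14      size is measured on the JSON string
    #
    # _json_split then walks the dict depth-first, packing key paths into the
    # current chunk until adding the next item would exceed max_chunk_size.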
    def split_json(
        self,
        json_data: Dict[str, Any],
        convert_lists: bool = False,
    ) -> List[Dict]:
        """Splits JSON into a list of JSON chunks."""
        if convert_lists:
            chunks = self._json_split(self._list_to_dict_preprocessing(json_data))
        else:
            chunks = self._json_split(json_data)

        # Remove the last chunk if it's empty
        if not chunks[-1]:
            chunks.pop()
        return chunks
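    # Hedged usage sketch for split_json. It assumes this method belongs to
    # LangChain's RecursiveJsonSplitter and that the splitter is constructed
    # with a max_chunk_size; the constructor call is an assumption, since it
    # is not part of this excerpt:
    #
    #   splitter = RecursiveJsonSplitter(max_chunk_size=300)
    #   chunks = splitter.split_json(
    #       json_data={"paths": {"/pets": {"get": "..."}, "/owners": {"get": "..."}}},
    #       convert_lists=True,
    #   )
    #   # Each element of `chunks` is a dict whose json.dumps() length stays
    #   # under max_chunk_size, with the original key nesting preserved.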
    def split_text(
        self,
        json_data: Dict[str, Any],
        convert_lists: bool = False,
        ensure_ascii: bool = True,
    ) -> List[str]:
        """Splits JSON into a list of JSON formatted strings."""
        chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)

        # Convert each chunk dict to a JSON-formatted string
        return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks]
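    # Sketch of split_text on the same (assumed) splitter instance: it simply
    # serializes each chunk produced by split_json.
    #
    #   strings = splitter.split_text(json_data=json_data, ensure_ascii=False)
    #   # -> one JSON string per chunk; non-ASCII characters are kept as-is
    #   #    because ensure_ascii=False is passed through to json.dumps.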
    def create_documents(
        self,
        texts: List[Dict],
        convert_lists: bool = False,
        ensure_ascii: bool = True,
        metadatas: Optional[List[dict]] = None,
    ) -> List[Document]:
        """Create documents from a list of json objects (Dict)."""
        _metadatas = metadatas or [{}] * len(texts)
        documents = []
        for i, text in enumerate(texts):
            for chunk in self.split_text(
                json_data=text, convert_lists=convert_lists, ensure_ascii=ensure_ascii
            ):
                metadata = copy.deepcopy(_metadatas[i])
                new_doc = Document(page_content=chunk, metadata=metadata)
                documents.append(new_doc)
        return documents
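    # Sketch of create_documents, again assuming the splitter instance above
    # and a Document class imported from langchain_core.documents:
    #
    #   docs = splitter.create_documents(
    #       texts=[api_spec_dict],                   # one dict per source object
    #       metadatas=[{"source": "openapi.json"}],  # copied onto every chunk
    #   )
    #   # Each Document holds one JSON-string chunk in page_content and a
    #   # deep-copied metadata dict, so chunks never share a metadata object.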