class JSONLoader(BaseLoader):
    """Load a `JSON` file using a `jq` schema.

    Setup:
        .. code-block:: bash

            pip install -U jq

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import JSONLoader
            import json
            from pathlib import Path

            file_path='./sample_quiz.json'
            data = json.loads(Path(file_path).read_text())
            loader = JSONLoader(
                     file_path=file_path,
                     jq_schema='.quiz',
                     text_content=False)

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            {"sport": {"q1": {"question": "Which one is correct team name in NBA?", "options": ["New York Bulls"
            {'source': '/sample_quiz.json', 'seq_num': 1}

    Async load:
        .. code-block:: python

            docs = await loader.aload()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            {"sport": {"q1": {"question": "Which one is correct team name in NBA?", "options": ["New York Bulls"
            {'source': '/sample_quiz.json', 'seq_num': 1}

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            # async variant:
            # docs_lazy = await loader.alazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            {"sport": {"q1": {"question": "Which one is correct team name in NBA?", "options": ["New York Bulls"
            {'source': '/sample_quiz.json', 'seq_num': 1}
    """

    def __init__(
        self,
        file_path: Union[str, PathLike],
        jq_schema: str,
        content_key: Optional[str] = None,
        is_content_key_jq_parsable: Optional[bool] = False,
        metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
        text_content: bool = True,
        json_lines: bool = False,
    ):
        """Initialize the JSONLoader.

        Args:
            file_path (Union[str, PathLike]): The path to the JSON or JSON Lines
                file.
            jq_schema (str): The jq schema to use to extract the data or text
                from the JSON.
            content_key (str): The key to use to extract the content from the
                JSON if the jq_schema results in a list of objects (dict).
                If is_content_key_jq_parsable is True, this has to be a jq
                compatible schema. If is_content_key_jq_parsable is False, this
                should be a simple string key.
            is_content_key_jq_parsable (bool): A flag to determine if
                content_key is parsable by jq or not. If True, content_key is
                treated as a jq schema and compiled accordingly. If False or if
                content_key is None, content_key is used as a simple string.
                Default is False.
            metadata_func (Callable[[Dict, Dict], Dict]): A function that takes
                in the JSON object extracted by the jq_schema and the default
                metadata and returns a dict of the updated metadata.
            text_content (bool): Boolean flag to indicate whether the content
                is in string format, default to True.
            json_lines (bool): Boolean flag to indicate whether the input is in
                JSON Lines format.

        Raises:
            ImportError: If the `jq` package is not installed.
        """
        try:
            import jq
        except ImportError as err:
            raise ImportError(
                "jq package not found, please install it with `pip install jq`"
            ) from err

        # Keep a handle on the module so other methods can compile jq programs.
        self.jq = jq
        self.file_path = Path(file_path).resolve()
        self._jq_schema = jq.compile(jq_schema)
        self._is_content_key_jq_parsable = is_content_key_jq_parsable
        self._content_key = content_key
        self._metadata_func = metadata_func
        self._text_content = text_content
        self._json_lines = json_lines

    def lazy_load(self) -> Iterator[Document]:
        """Load and return documents from the JSON file.

        In JSON Lines mode every non-blank line is parsed on its own; otherwise
        the whole file is parsed as a single JSON value. ``seq_num`` metadata
        keeps counting across lines because ``index`` is advanced once per
        yielded document.

        Yields:
            Document: One document per item produced by the jq schema.
        """
        index = 0
        if self._json_lines:
            with self.file_path.open(encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if line:
                        for doc in self._parse(line, index):
                            yield doc
                            index += 1
        else:
            for doc in self._parse(self.file_path.read_text(encoding="utf-8"), index):
                yield doc
                index += 1

    def _parse(self, content: str, index: int) -> Iterator[Document]:
        """Convert given content to documents.

        Args:
            content: A JSON string (the whole file, or one JSON Lines row).
            index: Number of documents emitted so far; the first document
                produced here gets ``seq_num == index + 1``.

        Yields:
            Document: One document per item produced by the jq schema.
        """
        data = self._jq_schema.input(json.loads(content))

        # Perform some validation
        # This is not a perfect validation, but it should catch most cases
        # and prevent the user from getting a cryptic error later on.
        if self._content_key is not None:
            self._validate_content_key(data)

        for i, sample in enumerate(data, index + 1):
            text = self._get_text(sample=sample)
            metadata = self._get_metadata(
                sample=sample, source=str(self.file_path), seq_num=i
            )
            yield Document(page_content=text, metadata=metadata)

    def _content_key_program(self) -> Any:
        """Return the compiled jq program for ``content_key``, compiling once.

        The compiled program is cached together with the key it was built from,
        so it is rebuilt only if ``_content_key`` changes. ``getattr`` is used
        so instances that never had the cache attribute set still work.
        """
        cached = getattr(self, "_content_key_cache", None)
        if cached is None or cached[0] != self._content_key:
            cached = (self._content_key, self.jq.compile(self._content_key))
            self._content_key_cache = cached
        return cached[1]

    def _get_text(self, sample: Any) -> str:
        """Convert sample to string format.

        Args:
            sample: A single item produced by the jq schema.

        Returns:
            The page content: the string itself, a JSON dump for non-empty
            dicts/lists, ``str(...)`` for other non-None values, and an empty
            string for None or empty dicts/lists.

        Raises:
            ValueError: If ``text_content`` is True and the extracted content
                is neither a string nor None.
        """
        if self._content_key is None:
            content = sample
        elif self._is_content_key_jq_parsable:
            # Reuse the cached program instead of recompiling for every sample.
            content = self._content_key_program().input(sample).first()
        else:
            content = sample[self._content_key]

        if self._text_content and not isinstance(content, str) and content is not None:
            raise ValueError(
                f"Expected page_content is string, got {type(content)} instead. "
                "Set `text_content=False` if the desired input for "
                "`page_content` is not a string"
            )

        if isinstance(content, str):
            return content
        if isinstance(content, (dict, list)):
            return json.dumps(content) if content else ""
        # In case the text is None, set it to an empty string
        return str(content) if content is not None else ""

    def _get_metadata(
        self, sample: Dict[str, Any], **additional_fields: Any
    ) -> Dict[str, Any]:
        """Return a metadata dictionary based on the existence of metadata_func.

        Args:
            sample: Single data payload.
            **additional_fields: Keyword arguments to be added as metadata
                values.

        Returns:
            The result of ``metadata_func(sample, additional_fields)`` when a
            metadata function was supplied, otherwise ``additional_fields``.

        Raises:
            ValueError: If ``metadata_func`` does not return a dict.
        """
        if self._metadata_func is None:
            return additional_fields

        result = self._metadata_func(sample, additional_fields)
        if not isinstance(result, dict):
            raise ValueError(
                "Expected the metadata_func to return a dict but got "
                f"`{type(result)}`"
            )
        return result

    def _validate_content_key(self, data: Any) -> None:
        """Check if a content key is valid.

        Only the first item produced by the jq schema is inspected.

        Args:
            data: The jq program bound to its input, as built in ``_parse``.

        Raises:
            ValueError: If the first item is not a dict, or if a plain
                (non-jq) content key is missing or maps to None in it.
        """
        try:
            sample = data.first()
        except StopIteration:
            # Presumably jq's `first()` raises StopIteration when the schema
            # yields nothing -- confirm against the jq bindings. This method is
            # called from the `_parse` generator, where an escaping
            # StopIteration becomes a RuntimeError (PEP 479). With no items
            # there is nothing to validate, and `_parse` yields no documents.
            return

        if not isinstance(sample, dict):
            raise ValueError(
                "Expected the jq schema to result in a list of objects (dict), "
                f"so sample must be a dict but got `{type(sample)}`"
            )

        if (
            not self._is_content_key_jq_parsable
            and sample.get(self._content_key) is None
        ):
            raise ValueError(
                "Expected the jq schema to result in a list of objects (dict) "
                f"with the key `{self._content_key}`"
            )

        # NOTE(review): jq's `.text()` appears to always return a str, in which
        # case this check can never fire -- confirm the intended validation
        # before tightening it, since doing so would raise for inputs that
        # currently load.
        if (
            self._is_content_key_jq_parsable
            and self._content_key_program().input(sample).text() is None
        ):
            raise ValueError(
                "Expected the jq schema to result in a list of objects (dict) "
                f"with the key `{self._content_key}` which should be parsable by "
                "jq"
            )