Pandas DataFrame Parser

A Pandas DataFrame is a popular data structure in the Python programming language, commonly used for data manipulation and analysis. It provides a comprehensive set of tools for working with structured data, making it a versatile option for tasks such as data cleaning, transformation, and analysis.

This output parser lets you specify an arbitrary Pandas DataFrame and ask an LLM to query it; the result is returned as a formatted dictionary containing the data extracted from that DataFrame. Keep in mind that large language models are leaky abstractions! You'll have to use an LLM with sufficient capacity to generate a well-formed query as per the defined format instructions.

Use Pandas' DataFrame object to declare the DataFrame you wish to perform queries on.

import pprint
from typing import Any, Dict

import pandas as pd
from langchain.output_parsers import PandasDataFrameOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# Chat model shared by every chain built below. temperature=0 is passed
# presumably to keep the generated query as reproducible as possible
# (confirm against the ChatOpenAI documentation).
model = ChatOpenAI(temperature=0)

# Solely for documentation purposes.
def format_parser_output(parser_output: Dict[str, Any]) -> None:
    """Pretty-print a parser result, rendering pandas objects as plain dicts.

    The mapping passed in is left untouched: the conversion is done on a new
    dict, so the caller can keep using the original values afterwards.

    Args:
        parser_output: Mapping from a result label to its value. Values that
            expose a ``to_dict()`` method (for example a pandas Series) are
            converted with it; any other value (for example a float produced
            by an aggregate query) is printed as-is.
    """
    printable = {
        key: value.to_dict() if hasattr(value, "to_dict") else value
        for key, value in parser_output.items()
    }
    # A very narrow width makes pprint break nested dicts across lines, which
    # gives the one-entry-per-line layout shown in the example outputs.
    pprint.PrettyPrinter(width=4, compact=True).pprint(printable)

# Define your desired Pandas DataFrame: four rows and three integer columns.
df = pd.DataFrame(
    {
        "num_legs": [2, 4, 8, 0],
        "num_wings": [2, 0, 0, 0],
        "num_specimen_seen": [10, 2, 1, 8],
    }
)

# Build the output parser around the DataFrame. Its format instructions are
# injected into the prompt template further down.
parser = PandasDataFrameOutputParser(dataframe=df)

# Here's an example of a column operation being performed.
df_query = "Retrieve the num_wings column."

# Set up the prompt. The parser's format instructions are pre-filled as a
# partial variable, so only `query` has to be supplied when the chain runs.
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Pipe the prompt into the model and the model's reply into the parser.
chain = prompt | model | parser
parser_output = chain.invoke({"query": df_query})

# Print the parsed result in a readable form.
format_parser_output(parser_output)

{'num_wings': {0: 2,
               1: 0,
               2: 0,
               3: 0}}

# Here's an example of a row operation being performed.
df_query = "Retrieve the first row."

# Set up the prompt. The parser's format instructions are pre-filled as a
# partial variable, so only `query` has to be supplied when the chain runs.
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Pipe the prompt into the model and the model's reply into the parser.
chain = prompt | model | parser
parser_output = chain.invoke({"query": df_query})

# Print the parsed result in a readable form.
format_parser_output(parser_output)

{'0': {'num_legs': 2,
       'num_specimen_seen': 10,
       'num_wings': 2}}

# Here's an example of an aggregate operation (the mean of a column)
# restricted to a range of rows.
df_query = "Retrieve the average of the num_legs column from rows 1 to 3."

# Set up the prompt. The parser's format instructions are pre-filled as a
# partial variable, so only `query` has to be supplied when the chain runs.
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Pipe the prompt into the model and the model's reply into the parser.
chain = prompt | model | parser
parser_output = chain.invoke({"query": df_query})

# The aggregate result is a plain value, so it is printed directly.
print(parser_output)

{'mean': 4.0}

# Here's an example of a query that cannot be satisfied: the DataFrame has
# no num_fingers column, so parsing fails with an OutputParserException.
df_query = "Retrieve the mean of the num_fingers column."

# Set up the prompt. The parser's format instructions are pre-filled as a
# partial variable, so only `query` has to be supplied when the chain runs.
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Pipe the prompt into the model and the model's reply into the parser.
chain = prompt | model | parser
parser_output = chain.invoke({"query": df_query})

---------------------------------------------------------------------------
``````output
OutputParserException                     Traceback (most recent call last)
``````output
Cell In[23], line 12
prompt = PromptTemplate(
   template="Answer the user query.\n{format_instructions}\n{query}\n",
   input_variables=["query"],
   partial_variables={"format_instructions": parser.get_format_instructions()},
)
chain = prompt | model | parser
---> 12 parser_output = chain.invoke({"query": df_query})
``````output
File ~/workplace/langchain/libs/core/langchain_core/runnables/base.py:1616, in RunnableSequence.invoke(self, input, config)
try:
   for i, step in enumerate(self.steps):
-> 1616         input = step.invoke(
           input,
           # mark each step as a child run
           patch_config(
               config, callbacks=run_manager.get_child(f"seq:step:{i+1}")
           ),
       )
# finish the root run
except BaseException as e:
``````output
File ~/workplace/langchain/libs/core/langchain_core/output_parsers/base.py:170, in BaseOutputParser.invoke(self, input, config)
def invoke(
   self, input: Union[str, BaseMessage], config: Optional[RunnableConfig] = None
) -> T:
   if isinstance(input, BaseMessage):
--> 170         return self._call_with_config(
           lambda inner_input: self.parse_result(
               [ChatGeneration(message=inner_input)]
           ),
           input,
           config,
           run_type="parser",
       )
   else:
       return self._call_with_config(
           lambda inner_input: self.parse_result([Generation(text=inner_input)]),
           input,
           config,
           run_type="parser",
       )
``````output
File ~/workplace/langchain/libs/core/langchain_core/runnables/base.py:906, in Runnable._call_with_config(self, func, input, config, run_type, **kwargs)
run_manager = callback_manager.on_chain_start(
   dumpd(self),
   input,
   run_type=run_type,
   name=config.get("run_name"),
)
try:
--> 906     output = call_func_with_variable_args(
       func, input, config, run_manager, **kwargs
   )
except BaseException as e:
   run_manager.on_chain_error(e)
``````output
File ~/workplace/langchain/libs/core/langchain_core/runnables/config.py:308, in call_func_with_variable_args(func, input, config, run_manager, **kwargs)
if run_manager is not None and accepts_run_manager(func):
   kwargs["run_manager"] = run_manager
--> 308 return func(input, **kwargs)
``````output
File ~/workplace/langchain/libs/core/langchain_core/output_parsers/base.py:171, in BaseOutputParser.invoke.<locals>.<lambda>(inner_input)
def invoke(
   self, input: Union[str, BaseMessage], config: Optional[RunnableConfig] = None
) -> T:
   if isinstance(input, BaseMessage):
       return self._call_with_config(
--> 171             lambda inner_input: self.parse_result(
               [ChatGeneration(message=inner_input)]
           ),
           input,
           config,
           run_type="parser",
       )
   else:
       return self._call_with_config(
           lambda inner_input: self.parse_result([Generation(text=inner_input)]),
           input,
           config,
           run_type="parser",
       )
``````output
File ~/workplace/langchain/libs/core/langchain_core/output_parsers/base.py:222, in BaseOutputParser.parse_result(self, result, partial)
def parse_result(self, result: List[Generation], *, partial: bool = False) -> T:
   """Parse a list of candidate model Generations into a specific format.

   The return value is parsed from only the first Generation in the result, which
   (...)
       Structured output.
   """
--> 222     return self.parse(result[0].text)
``````output
File ~/workplace/langchain/libs/langchain/langchain/output_parsers/pandas_dataframe.py:90, in PandasDataFrameOutputParser.parse(self, request)
request_type, request_params = splitted_request
if request_type in {"Invalid column", "Invalid operation"}:
---> 90     raise OutputParserException(
       f"{request}. Please check the format instructions."
   )
array_exists = re.search(r"(\[.*?\])", request_params)
if array_exists:
``````output
OutputParserException: Invalid column: num_fingers. Please check the format instructions.

See the API documentation for PandasDataFrameOutputParser.

Pandas DataFrame Parser

Help us out by providing feedback on this documentation page: