|
"""Agent for working with pandas objects.""" |
|
from io import IOBase |
|
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union |
|
|
|
from langchain._api import warn_deprecated |
|
from langchain.agents import AgentExecutor, BaseSingleActionAgent |
|
from langchain_experimental.agents.agent_toolkits.pandas.prompt import ( |
|
FUNCTIONS_WITH_DF, |
|
FUNCTIONS_WITH_MULTI_DF, |
|
MULTI_DF_PREFIX, |
|
MULTI_DF_PREFIX_FUNCTIONS, |
|
PREFIX, |
|
PREFIX_FUNCTIONS, |
|
SUFFIX_NO_DF, |
|
SUFFIX_WITH_DF, |
|
SUFFIX_WITH_MULTI_DF, |
|
) |
|
from langchain.agents.mrkl.base import ZeroShotAgent |
|
from langchain.agents.mrkl.prompt import FORMAT_INSTRUCTIONS |
|
from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent |
|
from langchain.agents.types import AgentType |
|
from langchain.callbacks.base import BaseCallbackManager |
|
from langchain.chains.llm import LLMChain |
|
from langchain.schema import BasePromptTemplate |
|
from langchain.schema.language_model import BaseLanguageModel |
|
from langchain.schema.messages import SystemMessage |
|
from langchain.tools import BaseTool |
|
from langchain_experimental.tools.python.tool import PythonAstREPLTool |
|
|
|
|
|
def _get_multi_prompt( |
|
dfs: List[Any], |
|
prefix: Optional[str] = None, |
|
suffix: Optional[str] = None, |
|
input_variables: Optional[List[str]] = None, |
|
include_df_in_prompt: Optional[bool] = True, |
|
number_of_head_rows: int = 5, |
|
) -> Tuple[BasePromptTemplate, List[PythonAstREPLTool]]: |
|
num_dfs = len(dfs) |
|
if suffix is not None: |
|
suffix_to_use = suffix |
|
include_dfs_head = True |
|
elif include_df_in_prompt: |
|
suffix_to_use = SUFFIX_WITH_MULTI_DF |
|
include_dfs_head = True |
|
else: |
|
suffix_to_use = SUFFIX_NO_DF |
|
include_dfs_head = False |
|
if input_variables is None: |
|
input_variables = ["input", "agent_scratchpad", "num_dfs"] |
|
if include_dfs_head: |
|
input_variables += ["dfs_head"] |
|
|
|
if prefix is None: |
|
prefix = MULTI_DF_PREFIX |
|
|
|
df_locals = {} |
|
for i, dataframe in enumerate(dfs): |
|
df_locals[f"df{i + 1}"] = dataframe |
|
tools = [PythonAstREPLTool(locals=df_locals)] |
|
|
|
prompt = ZeroShotAgent.create_prompt( |
|
tools, prefix=prefix, suffix=suffix_to_use, input_variables=input_variables |
|
) |
|
|
|
partial_prompt = prompt.partial() |
|
if "dfs_head" in input_variables: |
|
dfs_head = "\n\n".join([d.head(number_of_head_rows).to_markdown() for d in dfs]) |
|
partial_prompt = partial_prompt.partial(num_dfs=str(num_dfs), dfs_head=dfs_head) |
|
if "num_dfs" in input_variables: |
|
partial_prompt = partial_prompt.partial(num_dfs=str(num_dfs)) |
|
return partial_prompt, tools |
|
|
|
|
|
def _get_single_prompt( |
|
df: Any, |
|
prefix: Optional[str] = None, |
|
suffix: Optional[str] = None, |
|
input_variables: Optional[List[str]] = None, |
|
include_df_in_prompt: Optional[bool] = True, |
|
number_of_head_rows: int = 5, |
|
format_instructions=FORMAT_INSTRUCTIONS, |
|
) -> Tuple[BasePromptTemplate, List[PythonAstREPLTool]]: |
|
if suffix is not None: |
|
suffix_to_use = suffix |
|
include_df_head = True |
|
elif include_df_in_prompt: |
|
suffix_to_use = SUFFIX_WITH_DF |
|
include_df_head = True |
|
else: |
|
suffix_to_use = SUFFIX_NO_DF |
|
include_df_head = False |
|
|
|
if input_variables is None: |
|
input_variables = ["input", "agent_scratchpad"] |
|
if include_df_head: |
|
input_variables += ["df_head"] |
|
|
|
if prefix is None: |
|
prefix = PREFIX |
|
|
|
tools = [PythonAstREPLTool(locals={"df": df})] |
|
|
|
prompt = ZeroShotAgent.create_prompt( |
|
tools, prefix=prefix, suffix=suffix_to_use, input_variables=input_variables, |
|
format_instructions=format_instructions, |
|
) |
|
|
|
partial_prompt = prompt.partial() |
|
if "df_head" in input_variables: |
|
partial_prompt = partial_prompt.partial( |
|
df_head=str(df.head(number_of_head_rows).to_markdown()) |
|
) |
|
return partial_prompt, tools |
|
|
|
|
|
def _get_prompt_and_tools( |
|
df: Any, |
|
prefix: Optional[str] = None, |
|
suffix: Optional[str] = None, |
|
input_variables: Optional[List[str]] = None, |
|
include_df_in_prompt: Optional[bool] = True, |
|
number_of_head_rows: int = 5, |
|
format_instructions=FORMAT_INSTRUCTIONS, |
|
) -> Tuple[BasePromptTemplate, List[PythonAstREPLTool]]: |
|
try: |
|
import pandas as pd |
|
|
|
pd.set_option("display.max_columns", None) |
|
except ImportError: |
|
raise ImportError( |
|
"pandas package not found, please install with `pip install pandas`" |
|
) |
|
|
|
if include_df_in_prompt is not None and suffix is not None: |
|
raise ValueError("If suffix is specified, include_df_in_prompt should not be.") |
|
|
|
if isinstance(df, list): |
|
for item in df: |
|
if not isinstance(item, pd.DataFrame): |
|
raise ValueError(f"Expected pandas object, got {type(df)}") |
|
return _get_multi_prompt( |
|
df, |
|
prefix=prefix, |
|
suffix=suffix, |
|
input_variables=input_variables, |
|
include_df_in_prompt=include_df_in_prompt, |
|
number_of_head_rows=number_of_head_rows, |
|
) |
|
else: |
|
if not isinstance(df, pd.DataFrame): |
|
raise ValueError(f"Expected pandas object, got {type(df)}") |
|
return _get_single_prompt( |
|
df, |
|
prefix=prefix, |
|
suffix=suffix, |
|
input_variables=input_variables, |
|
include_df_in_prompt=include_df_in_prompt, |
|
number_of_head_rows=number_of_head_rows, |
|
format_instructions=format_instructions, |
|
) |
|
|
|
|
|
def _get_functions_single_prompt( |
|
df: Any, |
|
prefix: Optional[str] = None, |
|
suffix: Optional[str] = None, |
|
include_df_in_prompt: Optional[bool] = True, |
|
number_of_head_rows: int = 5, |
|
) -> Tuple[BasePromptTemplate, List[PythonAstREPLTool]]: |
|
if suffix is not None: |
|
suffix_to_use = suffix |
|
if include_df_in_prompt: |
|
suffix_to_use = suffix_to_use.format( |
|
df_head=str(df.head(number_of_head_rows).to_markdown()) |
|
) |
|
elif include_df_in_prompt: |
|
suffix_to_use = FUNCTIONS_WITH_DF.format( |
|
df_head=str(df.head(number_of_head_rows).to_markdown()) |
|
) |
|
else: |
|
suffix_to_use = "" |
|
|
|
if prefix is None: |
|
prefix = PREFIX_FUNCTIONS |
|
|
|
tools = [PythonAstREPLTool(locals={"df": df})] |
|
system_message = SystemMessage(content=prefix + suffix_to_use) |
|
prompt = OpenAIFunctionsAgent.create_prompt(system_message=system_message) |
|
return prompt, tools |
|
|
|
|
|
def _get_functions_multi_prompt( |
|
dfs: Any, |
|
prefix: Optional[str] = None, |
|
suffix: Optional[str] = None, |
|
include_df_in_prompt: Optional[bool] = True, |
|
number_of_head_rows: int = 5, |
|
) -> Tuple[BasePromptTemplate, List[PythonAstREPLTool]]: |
|
if suffix is not None: |
|
suffix_to_use = suffix |
|
if include_df_in_prompt: |
|
dfs_head = "\n\n".join( |
|
[d.head(number_of_head_rows).to_markdown() for d in dfs] |
|
) |
|
suffix_to_use = suffix_to_use.format( |
|
dfs_head=dfs_head, |
|
) |
|
elif include_df_in_prompt: |
|
dfs_head = "\n\n".join([d.head(number_of_head_rows).to_markdown() for d in dfs]) |
|
suffix_to_use = FUNCTIONS_WITH_MULTI_DF.format( |
|
dfs_head=dfs_head, |
|
) |
|
else: |
|
suffix_to_use = "" |
|
|
|
if prefix is None: |
|
prefix = MULTI_DF_PREFIX_FUNCTIONS |
|
prefix = prefix.format(num_dfs=str(len(dfs))) |
|
|
|
df_locals = {} |
|
for i, dataframe in enumerate(dfs): |
|
df_locals[f"df{i + 1}"] = dataframe |
|
tools = [PythonAstREPLTool(locals=df_locals)] |
|
system_message = SystemMessage(content=prefix + suffix_to_use) |
|
prompt = OpenAIFunctionsAgent.create_prompt(system_message=system_message) |
|
return prompt, tools |
|
|
|
|
|
def _get_functions_prompt_and_tools( |
|
df: Any, |
|
prefix: Optional[str] = None, |
|
suffix: Optional[str] = None, |
|
input_variables: Optional[List[str]] = None, |
|
include_df_in_prompt: Optional[bool] = True, |
|
number_of_head_rows: int = 5, |
|
) -> Tuple[BasePromptTemplate, List[PythonAstREPLTool]]: |
|
try: |
|
import pandas as pd |
|
|
|
pd.set_option("display.max_columns", None) |
|
except ImportError: |
|
raise ImportError( |
|
"pandas package not found, please install with `pip install pandas`" |
|
) |
|
if input_variables is not None: |
|
raise ValueError("`input_variables` is not supported at the moment.") |
|
|
|
if include_df_in_prompt is not None and suffix is not None: |
|
raise ValueError("If suffix is specified, include_df_in_prompt should not be.") |
|
|
|
if isinstance(df, list): |
|
for item in df: |
|
if not isinstance(item, pd.DataFrame): |
|
raise ValueError(f"Expected pandas object, got {type(df)}") |
|
return _get_functions_multi_prompt( |
|
df, |
|
prefix=prefix, |
|
suffix=suffix, |
|
include_df_in_prompt=include_df_in_prompt, |
|
number_of_head_rows=number_of_head_rows, |
|
) |
|
else: |
|
if not isinstance(df, pd.DataFrame): |
|
raise ValueError(f"Expected pandas object, got {type(df)}") |
|
return _get_functions_single_prompt( |
|
df, |
|
prefix=prefix, |
|
suffix=suffix, |
|
include_df_in_prompt=include_df_in_prompt, |
|
number_of_head_rows=number_of_head_rows, |
|
) |
|
|
|
|
|
|
|
|
|
def create_pandas_dataframe_agent( |
|
llm: BaseLanguageModel, |
|
df: Any, |
|
agent_type: AgentType = AgentType.ZERO_SHOT_REACT_DESCRIPTION, |
|
callback_manager: Optional[BaseCallbackManager] = None, |
|
prefix: Optional[str] = None, |
|
suffix: Optional[str] = None, |
|
input_variables: Optional[List[str]] = None, |
|
verbose: bool = False, |
|
return_intermediate_steps: bool = False, |
|
max_iterations: Optional[int] = 15, |
|
max_execution_time: Optional[float] = None, |
|
early_stopping_method: str = "force", |
|
agent_executor_kwargs: Optional[Dict[str, Any]] = None, |
|
include_df_in_prompt: Optional[bool] = True, |
|
number_of_head_rows: int = 5, |
|
extra_tools: Sequence[BaseTool] = (), |
|
format_instructions="", |
|
**kwargs: Any, |
|
) -> AgentExecutor: |
|
"""Construct a pandas agent from an LLM and dataframe.""" |
|
warn_deprecated( |
|
since="0.0.314", |
|
message=( |
|
"On 2023-10-27 this module will be be deprecated from langchain, and " |
|
"will be available from the langchain-experimental package." |
|
"This code is already available in langchain-experimental." |
|
"See https://github.com/langchain-ai/langchain/discussions/11680." |
|
), |
|
pending=True, |
|
) |
|
agent: BaseSingleActionAgent |
|
if agent_type == AgentType.ZERO_SHOT_REACT_DESCRIPTION: |
|
prompt, base_tools = _get_prompt_and_tools( |
|
df, |
|
prefix=prefix, |
|
suffix=suffix, |
|
input_variables=input_variables, |
|
include_df_in_prompt=include_df_in_prompt, |
|
number_of_head_rows=number_of_head_rows, |
|
format_instructions=format_instructions, |
|
) |
|
tools = base_tools + list(extra_tools) |
|
llm_chain = LLMChain( |
|
llm=llm, |
|
prompt=prompt, |
|
callback_manager=callback_manager, |
|
) |
|
tool_names = [tool.name for tool in tools] |
|
agent = ZeroShotAgent( |
|
llm_chain=llm_chain, |
|
allowed_tools=tool_names, |
|
callback_manager=callback_manager, |
|
**kwargs, |
|
) |
|
elif agent_type == AgentType.OPENAI_FUNCTIONS: |
|
_prompt, base_tools = _get_functions_prompt_and_tools( |
|
df, |
|
prefix=prefix, |
|
suffix=suffix, |
|
input_variables=input_variables, |
|
include_df_in_prompt=include_df_in_prompt, |
|
number_of_head_rows=number_of_head_rows, |
|
) |
|
tools = base_tools + list(extra_tools) |
|
agent = OpenAIFunctionsAgent( |
|
llm=llm, |
|
prompt=_prompt, |
|
tools=tools, |
|
callback_manager=callback_manager, |
|
**kwargs, |
|
) |
|
else: |
|
raise ValueError(f"Agent type {agent_type} not supported at the moment.") |
|
return AgentExecutor.from_agent_and_tools( |
|
agent=agent, |
|
tools=tools, |
|
callback_manager=callback_manager, |
|
verbose=verbose, |
|
return_intermediate_steps=return_intermediate_steps, |
|
max_iterations=max_iterations, |
|
max_execution_time=max_execution_time, |
|
early_stopping_method=early_stopping_method, |
|
**(agent_executor_kwargs or {}), |
|
) |
|
|
|
|
|
def create_csv_agent( |
|
llm: BaseLanguageModel, |
|
path: Union[str, IOBase, List[Union[str, IOBase]]], |
|
pandas_kwargs: Optional[dict] = None, |
|
**kwargs: Any, |
|
) -> AgentExecutor: |
|
"""Create csv agent by loading to a dataframe and using pandas agent.""" |
|
try: |
|
import pandas as pd |
|
except ImportError: |
|
raise ImportError( |
|
"pandas package not found, please install with `pip install pandas`" |
|
) |
|
|
|
_kwargs = pandas_kwargs or {} |
|
if isinstance(path, (str, IOBase)): |
|
df = pd.read_csv(path, **_kwargs) |
|
elif isinstance(path, list): |
|
df = [] |
|
for item in path: |
|
if not isinstance(item, (str, IOBase)): |
|
raise ValueError(f"Expected str or file-like object, got {type(path)}") |
|
df.append(pd.read_csv(item, **_kwargs)) |
|
else: |
|
raise ValueError(f"Expected str, list, or file-like object, got {type(path)}") |
|
return create_pandas_dataframe_agent(llm, df, **kwargs) |
|
|