|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
|
from functools import wraps |
|
|
from pathlib import Path |
|
|
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union |
|
|
|
|
|
import pandas as pd |
|
|
|
|
|
if TYPE_CHECKING: |
|
|
from pandas import DataFrame |
|
|
from pandasai import SmartDataframe |
|
|
|
|
|
|
|
|
def check_suffix(valid_suffixs: List[str]) -> Callable:
    r"""A decorator to check the file suffix of a given file path.

    The suffix is read with ``Path(file_path).suffix`` and compared
    case-insensitively, so ``data.CSV`` is accepted wherever ``.csv`` is
    allowed.

    Args:
        valid_suffixs (List[str]): The accepted file suffixes, each one
            including its leading dot (e.g. ``[".xlsx", ".xls"]``).

    Returns:
        Callable: The decorator function. The method it wraps must take
            ``self`` and ``file_path`` as its first two parameters.

    Raises:
        ValueError: Raised by the wrapped method, before its body runs,
            when the suffix of ``file_path`` is not an accepted one.
    """
    # Normalise once at decoration time instead of on every call.
    accepted = frozenset(suffix.lower() for suffix in valid_suffixs)

    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(
            self, file_path: str, *args: Any, **kwargs: Any
        ) -> "DataFrame":
            if Path(file_path).suffix.lower() not in accepted:
                # The message lists the suffixes exactly as they were
                # declared by the caller of `check_suffix`.
                raise ValueError(
                    f"Only {', '.join(valid_suffixs)} files are supported"
                )
            return func(self, file_path, *args, **kwargs)

        return wrapper

    return decorator
|
|
|
|
|
|
|
|
class PandaReader:
    r"""Reads tabular data with pandas and wraps it in a pandasai
    ``SmartDataframe``.

    :meth:`load` dispatches on the file suffix to one of the ``read_*``
    methods; each of those is a thin pass-through to the pandas function
    of the same name.
    """

    # URL schemes the pandas reader documentation lists as valid inputs.
    # Paths starting with one of these skip the local existence check.
    _REMOTE_PREFIXES = (
        "http://",
        "https://",
        "ftp://",
        "s3://",
        "gs://",
        "file://",
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        r"""Initializes the PandaReader class.

        Args:
            config (Optional[Dict[str, Any]], optional): The configuration
                dictionary that can include LLM API settings for LLM-based
                processing. If not provided, it will use OpenAI with the API
                key from the OPENAI_API_KEY environment variable. You can
                customize the LLM configuration by providing a 'llm' key in
                the config dictionary. The dictionary is shallow-copied, so
                the caller's object is never modified.
                (default: :obj:`None`)
        """
        from pandasai.llm import OpenAI

        # Copy before injecting the default LLM so the caller's dictionary
        # does not silently gain an "llm" entry.
        self.config = dict(config) if config else {}
        if "llm" not in self.config:
            self.config["llm"] = OpenAI(
                api_token=os.getenv("OPENAI_API_KEY"),
            )

        # Suffix -> bound reader method, used by `load` for dispatch.
        self.__LOADER = {
            ".csv": self.read_csv,
            ".xlsx": self.read_excel,
            ".xls": self.read_excel,
            ".json": self.read_json,
            ".parquet": self.read_parquet,
            ".sql": self.read_sql,
            ".html": self.read_html,
            ".feather": self.read_feather,
            ".dta": self.read_stata,
            ".sas": self.read_sas,
            ".sas7bdat": self.read_sas,
            ".xpt": self.read_sas,
            ".pkl": self.read_pickle,
            ".h5": self.read_hdf,
            ".orc": self.read_orc,
        }

    def load(
        self,
        data: Union["DataFrame", str],
        *args: Any,
        **kwargs: Any,
    ) -> "SmartDataframe":
        r"""Loads a file or DataFrame and returns a SmartDataframe object.

        Args:
            data (Union[DataFrame, str]): The data to load. A DataFrame is
                wrapped directly; anything else is converted with ``str``
                and treated as a file path or URL.
            *args (Any): Additional positional arguments forwarded to the
                reader selected by the file suffix.
            **kwargs (Any): Additional keyword arguments forwarded to the
                reader selected by the file suffix.

        Returns:
            SmartDataframe: The SmartDataframe object.

        Raises:
            FileNotFoundError: If ``data`` is not a URL with a recognised
                scheme and no file exists at that path.
            ValueError: If the file suffix has no registered reader.
        """
        from pandas import DataFrame
        from pandasai import SmartDataframe

        if isinstance(data, DataFrame):
            return SmartDataframe(data, config=self.config)

        file_path = str(data)
        path = Path(file_path)
        is_remote = file_path.startswith(self._REMOTE_PREFIXES)
        if not is_remote and not path.exists():
            raise FileNotFoundError(f"File {file_path} not found")

        reader = self.__LOADER.get(path.suffix)
        if reader is None:
            raise ValueError(f"Unsupported file format: {path.suffix}")
        return SmartDataframe(
            reader(file_path, *args, **kwargs),
            config=self.config,
        )

    @check_suffix([".csv"])
    def read_csv(
        self, file_path: str, *args: Any, **kwargs: Any
    ) -> "DataFrame":
        r"""Reads a CSV file and returns a DataFrame.

        Args:
            file_path (str): The path to the CSV file.
            *args (Any): Additional positional arguments passed to
                ``pandas.read_csv``.
            **kwargs (Any): Additional keyword arguments passed to
                ``pandas.read_csv``.

        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_csv(file_path, *args, **kwargs)

    @check_suffix([".xlsx", ".xls"])
    def read_excel(
        self, file_path: str, *args: Any, **kwargs: Any
    ) -> "DataFrame":
        r"""Reads an Excel file and returns a DataFrame.

        Args:
            file_path (str): The path to the Excel file.
            *args (Any): Additional positional arguments passed to
                ``pandas.read_excel``.
            **kwargs (Any): Additional keyword arguments passed to
                ``pandas.read_excel``.

        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_excel(file_path, *args, **kwargs)

    @check_suffix([".json"])
    def read_json(
        self, file_path: str, *args: Any, **kwargs: Any
    ) -> "DataFrame":
        r"""Reads a JSON file and returns a DataFrame.

        Args:
            file_path (str): The path to the JSON file.
            *args (Any): Additional positional arguments passed to
                ``pandas.read_json``.
            **kwargs (Any): Additional keyword arguments passed to
                ``pandas.read_json``.

        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_json(file_path, *args, **kwargs)

    @check_suffix([".parquet"])
    def read_parquet(
        self, file_path: str, *args: Any, **kwargs: Any
    ) -> "DataFrame":
        r"""Reads a Parquet file and returns a DataFrame.

        Args:
            file_path (str): The path to the Parquet file.
            *args (Any): Additional positional arguments passed to
                ``pandas.read_parquet``.
            **kwargs (Any): Additional keyword arguments passed to
                ``pandas.read_parquet``.

        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_parquet(file_path, *args, **kwargs)

    def read_sql(self, *args: Any, **kwargs: Any) -> "DataFrame":
        r"""Runs ``pandas.read_sql`` and returns a DataFrame.

        Note:
            When :meth:`load` dispatches a ``.sql`` path here, the path
            string itself is forwarded as the first positional argument;
            the contents of the file are not read by this class.

        Args:
            *args (Any): Positional arguments passed to
                ``pandas.read_sql``.
            **kwargs (Any): Keyword arguments passed to
                ``pandas.read_sql``.

        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_sql(*args, **kwargs)

    def read_table(
        self, file_path: str, *args: Any, **kwargs: Any
    ) -> "DataFrame":
        r"""Reads a table and returns a DataFrame.

        Args:
            file_path (str): The path to the table.
            *args (Any): Additional positional arguments passed to
                ``pandas.read_table``.
            **kwargs (Any): Additional keyword arguments passed to
                ``pandas.read_table``.

        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_table(file_path, *args, **kwargs)

    def read_clipboard(self, *args: Any, **kwargs: Any) -> "DataFrame":
        r"""Reads a clipboard and returns a DataFrame.

        Args:
            *args (Any): Additional positional arguments passed to
                ``pandas.read_clipboard``.
            **kwargs (Any): Additional keyword arguments passed to
                ``pandas.read_clipboard``.

        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_clipboard(*args, **kwargs)

    @check_suffix([".html"])
    def read_html(
        self, file_path: str, *args: Any, **kwargs: Any
    ) -> List["DataFrame"]:
        r"""Reads the tables of an HTML file.

        Args:
            file_path (str): The path to the HTML file.
            *args (Any): Additional positional arguments passed to
                ``pandas.read_html``.
            **kwargs (Any): Additional keyword arguments passed to
                ``pandas.read_html``.

        Returns:
            List[DataFrame]: ``pandas.read_html`` returns a list of
                DataFrames rather than a single DataFrame.
        """
        # NOTE(review): `load` hands this list to SmartDataframe unchanged;
        # confirm that SmartDataframe accepts a list of DataFrames.
        return pd.read_html(file_path, *args, **kwargs)

    @check_suffix([".feather"])
    def read_feather(
        self, file_path: str, *args: Any, **kwargs: Any
    ) -> "DataFrame":
        r"""Reads a Feather file and returns a DataFrame.

        Args:
            file_path (str): The path to the Feather file.
            *args (Any): Additional positional arguments passed to
                ``pandas.read_feather``.
            **kwargs (Any): Additional keyword arguments passed to
                ``pandas.read_feather``.

        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_feather(file_path, *args, **kwargs)

    @check_suffix([".dta"])
    def read_stata(
        self, file_path: str, *args: Any, **kwargs: Any
    ) -> "DataFrame":
        r"""Reads a Stata file and returns a DataFrame.

        Args:
            file_path (str): The path to the Stata file.
            *args (Any): Additional positional arguments passed to
                ``pandas.read_stata``.
            **kwargs (Any): Additional keyword arguments passed to
                ``pandas.read_stata``.

        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_stata(file_path, *args, **kwargs)

    @check_suffix([".sas", ".sas7bdat", ".xpt"])
    def read_sas(
        self, file_path: str, *args: Any, **kwargs: Any
    ) -> "DataFrame":
        r"""Reads a SAS file and returns a DataFrame.

        Args:
            file_path (str): The path to the SAS file. pandas infers the
                format from ``.sas7bdat`` and ``.xpt`` file names; for a
                ``.sas`` path pass ``format=`` explicitly.
            *args (Any): Additional positional arguments passed to
                ``pandas.read_sas``.
            **kwargs (Any): Additional keyword arguments passed to
                ``pandas.read_sas``.

        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_sas(file_path, *args, **kwargs)

    @check_suffix([".pkl"])
    def read_pickle(
        self, file_path: str, *args: Any, **kwargs: Any
    ) -> "DataFrame":
        r"""Reads a Pickle file and returns a DataFrame.

        Warning:
            Unpickling can execute arbitrary code. Only load pickle files
            that come from a trusted source.

        Args:
            file_path (str): The path to the Pickle file.
            *args (Any): Additional positional arguments passed to
                ``pandas.read_pickle``.
            **kwargs (Any): Additional keyword arguments passed to
                ``pandas.read_pickle``.

        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_pickle(file_path, *args, **kwargs)

    @check_suffix([".h5"])
    def read_hdf(
        self, file_path: str, *args: Any, **kwargs: Any
    ) -> "DataFrame":
        r"""Reads an HDF file and returns a DataFrame.

        Args:
            file_path (str): The path to the HDF file.
            *args (Any): Additional positional arguments passed to
                ``pandas.read_hdf``.
            **kwargs (Any): Additional keyword arguments passed to
                ``pandas.read_hdf``.

        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_hdf(file_path, *args, **kwargs)

    @check_suffix([".orc"])
    def read_orc(
        self, file_path: str, *args: Any, **kwargs: Any
    ) -> "DataFrame":
        r"""Reads an ORC file and returns a DataFrame.

        Args:
            file_path (str): The path to the ORC file.
            *args (Any): Additional positional arguments passed to
                ``pandas.read_orc``.
            **kwargs (Any): Additional keyword arguments passed to
                ``pandas.read_orc``.

        Returns:
            DataFrame: The DataFrame object.
        """
        return pd.read_orc(file_path, *args, **kwargs)
|
|
|