Update app.py
Browse files
app.py
CHANGED
|
@@ -21,7 +21,6 @@ class SALTAnalytics:
|
|
| 21 |
return "Dataset already loaded!"
|
| 22 |
|
| 23 |
try:
|
| 24 |
-
# Try loading with authentication
|
| 25 |
hf_token = os.getenv('HF_TOKEN')
|
| 26 |
|
| 27 |
if hf_token:
|
|
@@ -43,21 +42,17 @@ class SALTAnalytics:
|
|
| 43 |
|
| 44 |
df = dataset.to_pandas()
|
| 45 |
|
| 46 |
-
# Sample data for demo if too large
|
| 47 |
if len(df) > 100000:
|
| 48 |
df = df.sample(n=50000, random_state=42)
|
| 49 |
|
| 50 |
-
# Load into DuckDB
|
| 51 |
self.con.execute("CREATE TABLE salt_data AS SELECT * FROM df")
|
| 52 |
|
| 53 |
-
# Get schema information and available columns
|
| 54 |
schema_result = self.con.execute("DESCRIBE salt_data").fetchall()
|
| 55 |
self.schema_info = "\n".join([f"{col[0]}: {col[1]}" for col in schema_result])
|
| 56 |
self.available_columns = [col[0] for col in schema_result]
|
| 57 |
|
| 58 |
self.data_loaded = True
|
| 59 |
|
| 60 |
-
# Return success message with column info
|
| 61 |
return f"✅ Successfully loaded {len(df)} records into DuckDB\n\n📊 Available columns:\n" + "\n".join(f"• {col}" for col in self.available_columns[:20]) + ("\n... and more" if len(self.available_columns) > 20 else "")
|
| 62 |
|
| 63 |
except Exception as e:
|
|
@@ -68,116 +63,101 @@ class SALTAnalytics:
|
|
| 68 |
return f"❌ Error loading dataset: {error_msg}"
|
| 69 |
|
| 70 |
def get_predefined_insights(self):
|
| 71 |
-
"""Generate predefined analytical insights
|
| 72 |
if not self.data_loaded:
|
| 73 |
return "Please load the dataset first"
|
| 74 |
|
| 75 |
try:
|
| 76 |
insights = {}
|
| 77 |
|
| 78 |
-
#
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
elif 'SHIP' in col.upper() and 'PARTY' in col.upper():
|
| 88 |
-
customer_col = col # ShipToParty is often used as customer identifier
|
| 89 |
-
break
|
| 90 |
-
|
| 91 |
-
# Look for sales office column
|
| 92 |
-
for col in self.available_columns:
|
| 93 |
-
if 'SALES' in col.upper() and 'OFFICE' in col.upper():
|
| 94 |
-
sales_office_col = col
|
| 95 |
-
break
|
| 96 |
|
| 97 |
-
#
|
| 98 |
-
if sales_office_col:
|
| 99 |
-
if customer_col:
|
| 100 |
-
insights['Sales Office Performance'] = self.con.execute(f"""
|
| 101 |
-
SELECT {sales_office_col},
|
| 102 |
-
COUNT(*) as total_orders,
|
| 103 |
-
COUNT(DISTINCT {customer_col}) as unique_customers
|
| 104 |
-
FROM salt_data
|
| 105 |
-
WHERE {sales_office_col} IS NOT NULL
|
| 106 |
-
GROUP BY {sales_office_col}
|
| 107 |
-
ORDER BY total_orders DESC
|
| 108 |
-
LIMIT 10
|
| 109 |
-
""").fetchdf()
|
| 110 |
-
else:
|
| 111 |
-
insights['Sales Office Performance'] = self.con.execute(f"""
|
| 112 |
-
SELECT {sales_office_col},
|
| 113 |
-
COUNT(*) as total_orders
|
| 114 |
-
FROM salt_data
|
| 115 |
-
WHERE {sales_office_col} IS NOT NULL
|
| 116 |
-
GROUP BY {sales_office_col}
|
| 117 |
-
ORDER BY total_orders DESC
|
| 118 |
-
LIMIT 10
|
| 119 |
-
""").fetchdf()
|
| 120 |
-
|
| 121 |
-
# Payment Terms Distribution (if available)
|
| 122 |
if 'CUSTOMERPAYMENTTERMS' in self.available_columns:
|
| 123 |
insights['Payment Terms Distribution'] = self.con.execute("""
|
| 124 |
SELECT CUSTOMERPAYMENTTERMS,
|
| 125 |
COUNT(*) as frequency,
|
| 126 |
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
|
| 127 |
FROM salt_data
|
| 128 |
-
WHERE CUSTOMERPAYMENTTERMS IS NOT NULL
|
| 129 |
GROUP BY CUSTOMERPAYMENTTERMS
|
| 130 |
ORDER BY frequency DESC
|
| 131 |
LIMIT 10
|
| 132 |
""").fetchdf()
|
| 133 |
|
| 134 |
-
#
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
|
|
|
|
|
|
| 138 |
for col in self.available_columns:
|
| 139 |
if 'SHIPPING' in col.upper() and 'CONDITION' in col.upper():
|
| 140 |
shipping_col = col
|
| 141 |
-
|
| 142 |
-
plant_col = col
|
| 143 |
|
| 144 |
if shipping_col:
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
FROM salt_data
|
| 175 |
-
""").fetchdf()
|
| 176 |
|
| 177 |
return insights
|
| 178 |
|
| 179 |
except Exception as e:
|
| 180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
def clean_sql_response(self, sql_query: str) -> str:
|
| 183 |
"""Clean SQL response - avoiding string literal errors"""
|
|
@@ -205,8 +185,7 @@ class SALTAnalytics:
|
|
| 205 |
try:
|
| 206 |
client = openai.OpenAI(api_key=api_key)
|
| 207 |
|
| 208 |
-
|
| 209 |
-
columns_list = ", ".join(self.available_columns[:30]) # Include first 30 columns
|
| 210 |
|
| 211 |
prompt = f"""
|
| 212 |
You are a SQL expert analyzing SAP SALT dataset. The database has a table called 'salt_data' with these available columns:
|
|
@@ -215,7 +194,7 @@ class SALTAnalytics:
|
|
| 215 |
|
| 216 |
The SALT dataset contains SAP ERP sales order data where each row represents a sales document item.
|
| 217 |
|
| 218 |
-
IMPORTANT: Use only the column names I provided above. Do not assume column names
|
| 219 |
|
| 220 |
Convert this question to a DuckDB SQL query: "{question}"
|
| 221 |
|
|
@@ -261,6 +240,7 @@ def load_dataset_interface():
|
|
| 261 |
return analytics.load_salt_dataset()
|
| 262 |
|
| 263 |
def show_insights_interface():
|
|
|
|
| 264 |
insights = analytics.get_predefined_insights()
|
| 265 |
|
| 266 |
if isinstance(insights, str):
|
|
@@ -270,7 +250,7 @@ def show_insights_interface():
|
|
| 270 |
|
| 271 |
for title, df in insights.items():
|
| 272 |
output += f"## {title}\n\n"
|
| 273 |
-
if len(df) > 0:
|
| 274 |
output += df.to_markdown(index=False)
|
| 275 |
else:
|
| 276 |
output += "*No data available for this analysis*"
|
|
@@ -283,13 +263,12 @@ def qa_interface(question: str, api_key: str):
|
|
| 283 |
return "Please enter a question"
|
| 284 |
return analytics.natural_language_query(question, api_key)
|
| 285 |
|
| 286 |
-
# Updated sample questions based on likely available columns
|
| 287 |
sample_questions = [
|
| 288 |
"Which sales offices process the most orders?",
|
| 289 |
"What are the most common payment terms?",
|
| 290 |
"Show me the distribution of shipping conditions",
|
| 291 |
"What is the date range of orders in the dataset?",
|
| 292 |
-
"Which
|
| 293 |
]
|
| 294 |
|
| 295 |
with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo:
|
|
|
|
| 21 |
return "Dataset already loaded!"
|
| 22 |
|
| 23 |
try:
|
|
|
|
| 24 |
hf_token = os.getenv('HF_TOKEN')
|
| 25 |
|
| 26 |
if hf_token:
|
|
|
|
| 42 |
|
| 43 |
df = dataset.to_pandas()
|
| 44 |
|
|
|
|
| 45 |
if len(df) > 100000:
|
| 46 |
df = df.sample(n=50000, random_state=42)
|
| 47 |
|
|
|
|
| 48 |
self.con.execute("CREATE TABLE salt_data AS SELECT * FROM df")
|
| 49 |
|
|
|
|
| 50 |
schema_result = self.con.execute("DESCRIBE salt_data").fetchall()
|
| 51 |
self.schema_info = "\n".join([f"{col[0]}: {col[1]}" for col in schema_result])
|
| 52 |
self.available_columns = [col[0] for col in schema_result]
|
| 53 |
|
| 54 |
self.data_loaded = True
|
| 55 |
|
|
|
|
| 56 |
return f"✅ Successfully loaded {len(df)} records into DuckDB\n\n📊 Available columns:\n" + "\n".join(f"• {col}" for col in self.available_columns[:20]) + ("\n... and more" if len(self.available_columns) > 20 else "")
|
| 57 |
|
| 58 |
except Exception as e:
|
|
|
|
| 63 |
return f"❌ Error loading dataset: {error_msg}"
|
| 64 |
|
| 65 |
def get_predefined_insights(self):
    """Build the predefined summary tables for the ``salt_data`` table.

    Every insight is produced only when the column it needs is present in
    ``self.available_columns``, so a missing column skips that insight
    instead of aborting the whole report.

    Returns:
        dict: insight title -> ``pandas.DataFrame``, in display order, when
        all executed queries succeed.
        str: ``"Please load the dataset first"`` when no data is loaded, or
        an error message with debug details when anything raises.
    """
    if not self.data_loaded:
        return "Please load the dataset first"

    def quote(identifier):
        """Return *identifier* as a double-quoted SQL identifier."""
        return '"' + identifier.replace('"', '""') + '"'

    def find_exact(name):
        """Return the first column equal to *name* ignoring case, else None."""
        return next(
            (col for col in self.available_columns if col.upper() == name),
            None,
        )

    def find_containing(*keywords):
        """Return the first column whose upper-cased name has every keyword."""
        return next(
            (
                col
                for col in self.available_columns
                if all(keyword in col.upper() for keyword in keywords)
            ),
            None,
        )

    def top_values(column, count_alias, with_percentage=False):
        """Return the 10 most frequent non-empty values of *column*."""
        ident = quote(column)
        percentage = (
            ",\n                   "
            "ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage"
            if with_percentage
            else ""
        )
        # The cast keeps the empty-string filter valid for non-text columns;
        # comparing e.g. a numeric column with '' directly would make the
        # engine try to convert '' to a number.
        query = f"""
            SELECT {ident},
                   COUNT(*) as {count_alias}{percentage}
            FROM salt_data
            WHERE {ident} IS NOT NULL AND CAST({ident} AS VARCHAR) != ''
            GROUP BY {ident}
            ORDER BY {count_alias} DESC
            LIMIT 10
        """
        return self.con.execute(query).fetchdf()

    try:
        insights = {}

        # Dataset overview: the date statistics are included only when the
        # CREATIONDATE column exists; the record count needs no column.
        date_col = find_exact('CREATIONDATE')
        if date_col:
            date_ident = quote(date_col)
            overview_sql = f"""
                SELECT
                    COUNT(*) as total_records,
                    COUNT(DISTINCT {date_ident}) as unique_dates,
                    MIN({date_ident}) as earliest_date,
                    MAX({date_ident}) as latest_date
                FROM salt_data
            """
        else:
            overview_sql = "SELECT COUNT(*) as total_records FROM salt_data"
        insights['Dataset Overview'] = self.con.execute(overview_sql).fetchdf()

        payment_col = find_exact('CUSTOMERPAYMENTTERMS')
        if payment_col:
            insights['Payment Terms Distribution'] = top_values(
                payment_col, 'frequency', with_percentage=True
            )

        sales_office_col = find_containing('SALES', 'OFFICE')
        if sales_office_col:
            insights['Sales Office Performance'] = top_values(
                sales_office_col, 'total_orders'
            )

        shipping_col = find_containing('SHIPPING', 'CONDITION')
        if shipping_col:
            insights['Shipping Conditions'] = top_values(
                shipping_col, 'order_count'
            )

        category_col = find_exact('SALESDOCUMENTITEMCATEGORY')
        if category_col:
            insights['Sales Document Categories'] = top_values(
                category_col, 'frequency', with_percentage=True
            )

        # Listing of the first columns, kept as a debugging aid in the report.
        sample_columns = self.available_columns[:20]
        insights['Available Columns Sample'] = pd.DataFrame({
            'Column Name': sample_columns,
            'Index': range(len(sample_columns)),
        })

        return insights

    except Exception as exc:
        # Callers treat a str return as a message to display, so failures are
        # reported as text (with debug details) rather than re-raised.
        return (
            f"❌ Error generating insights: {exc}\n\n🔍 Debug Info:\n"
            f"Data loaded: {self.data_loaded}\n"
            f"Available columns ({len(self.available_columns)}): "
            f"{', '.join(self.available_columns[:15])}...\n"
            f"Error type: {type(exc).__name__}"
        )
|
| 161 |
|
| 162 |
def clean_sql_response(self, sql_query: str) -> str:
|
| 163 |
"""Clean SQL response - avoiding string literal errors"""
|
|
|
|
| 185 |
try:
|
| 186 |
client = openai.OpenAI(api_key=api_key)
|
| 187 |
|
| 188 |
+
columns_list = ", ".join(self.available_columns[:30])
|
|
|
|
| 189 |
|
| 190 |
prompt = f"""
|
| 191 |
You are a SQL expert analyzing SAP SALT dataset. The database has a table called 'salt_data' with these available columns:
|
|
|
|
| 194 |
|
| 195 |
The SALT dataset contains SAP ERP sales order data where each row represents a sales document item.
|
| 196 |
|
| 197 |
+
IMPORTANT: Use only the column names I provided above. Do not assume column names that don't exist.
|
| 198 |
|
| 199 |
Convert this question to a DuckDB SQL query: "{question}"
|
| 200 |
|
|
|
|
| 240 |
return analytics.load_salt_dataset()
|
| 241 |
|
| 242 |
def show_insights_interface():
|
| 243 |
+
"""Fixed insights interface with better error handling"""
|
| 244 |
insights = analytics.get_predefined_insights()
|
| 245 |
|
| 246 |
if isinstance(insights, str):
|
|
|
|
| 250 |
|
| 251 |
for title, df in insights.items():
|
| 252 |
output += f"## {title}\n\n"
|
| 253 |
+
if isinstance(df, pd.DataFrame) and len(df) > 0:
|
| 254 |
output += df.to_markdown(index=False)
|
| 255 |
else:
|
| 256 |
output += "*No data available for this analysis*"
|
|
|
|
| 263 |
return "Please enter a question"
|
| 264 |
return analytics.natural_language_query(question, api_key)
|
| 265 |
|
|
|
|
| 266 |
sample_questions = [
|
| 267 |
"Which sales offices process the most orders?",
|
| 268 |
"What are the most common payment terms?",
|
| 269 |
"Show me the distribution of shipping conditions",
|
| 270 |
"What is the date range of orders in the dataset?",
|
| 271 |
+
"Which document categories are most frequent?"
|
| 272 |
]
|
| 273 |
|
| 274 |
with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo:
|