PD03 committed on
Commit
0c7416f
·
verified ·
1 Parent(s): 9231df6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -93
app.py CHANGED
@@ -21,7 +21,6 @@ class SALTAnalytics:
21
  return "Dataset already loaded!"
22
 
23
  try:
24
- # Try loading with authentication
25
  hf_token = os.getenv('HF_TOKEN')
26
 
27
  if hf_token:
@@ -43,21 +42,17 @@ class SALTAnalytics:
43
 
44
  df = dataset.to_pandas()
45
 
46
- # Sample data for demo if too large
47
  if len(df) > 100000:
48
  df = df.sample(n=50000, random_state=42)
49
 
50
- # Load into DuckDB
51
  self.con.execute("CREATE TABLE salt_data AS SELECT * FROM df")
52
 
53
- # Get schema information and available columns
54
  schema_result = self.con.execute("DESCRIBE salt_data").fetchall()
55
  self.schema_info = "\n".join([f"{col[0]}: {col[1]}" for col in schema_result])
56
  self.available_columns = [col[0] for col in schema_result]
57
 
58
  self.data_loaded = True
59
 
60
- # Return success message with column info
61
  return f"✅ Successfully loaded {len(df)} records into DuckDB\n\n📋 Available columns:\n" + "\n".join(f"• {col}" for col in self.available_columns[:20]) + ("\n... and more" if len(self.available_columns) > 20 else "")
62
 
63
  except Exception as e:
@@ -68,116 +63,101 @@ class SALTAnalytics:
68
  return f"❌ Error loading dataset: {error_msg}"
69
 
70
  def get_predefined_insights(self):
71
- """Generate predefined analytical insights using correct column names"""
72
  if not self.data_loaded:
73
  return "Please load the dataset first"
74
 
75
  try:
76
  insights = {}
77
 
78
- # Find the right customer and sales office columns
79
- customer_col = None
80
- sales_office_col = None
81
-
82
- # Look for customer-related columns
83
- for col in self.available_columns:
84
- if 'CUSTOMER' in col.upper() and ('ID' in col.upper() or 'NUM' in col.upper()):
85
- customer_col = col
86
- break
87
- elif 'SHIP' in col.upper() and 'PARTY' in col.upper():
88
- customer_col = col # ShipToParty is often used as customer identifier
89
- break
90
-
91
- # Look for sales office column
92
- for col in self.available_columns:
93
- if 'SALES' in col.upper() and 'OFFICE' in col.upper():
94
- sales_office_col = col
95
- break
96
 
97
- # Sales Office Performance (adjusted for available columns)
98
- if sales_office_col:
99
- if customer_col:
100
- insights['Sales Office Performance'] = self.con.execute(f"""
101
- SELECT {sales_office_col},
102
- COUNT(*) as total_orders,
103
- COUNT(DISTINCT {customer_col}) as unique_customers
104
- FROM salt_data
105
- WHERE {sales_office_col} IS NOT NULL
106
- GROUP BY {sales_office_col}
107
- ORDER BY total_orders DESC
108
- LIMIT 10
109
- """).fetchdf()
110
- else:
111
- insights['Sales Office Performance'] = self.con.execute(f"""
112
- SELECT {sales_office_col},
113
- COUNT(*) as total_orders
114
- FROM salt_data
115
- WHERE {sales_office_col} IS NOT NULL
116
- GROUP BY {sales_office_col}
117
- ORDER BY total_orders DESC
118
- LIMIT 10
119
- """).fetchdf()
120
-
121
- # Payment Terms Distribution (if available)
122
  if 'CUSTOMERPAYMENTTERMS' in self.available_columns:
123
  insights['Payment Terms Distribution'] = self.con.execute("""
124
  SELECT CUSTOMERPAYMENTTERMS,
125
  COUNT(*) as frequency,
126
  ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
127
  FROM salt_data
128
- WHERE CUSTOMERPAYMENTTERMS IS NOT NULL
129
  GROUP BY CUSTOMERPAYMENTTERMS
130
  ORDER BY frequency DESC
131
  LIMIT 10
132
  """).fetchdf()
133
 
134
- # Shipping Conditions Analysis (look for shipping-related columns)
135
- shipping_col = None
136
- plant_col = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
 
 
138
  for col in self.available_columns:
139
  if 'SHIPPING' in col.upper() and 'CONDITION' in col.upper():
140
  shipping_col = col
141
- elif 'PLANT' in col.upper():
142
- plant_col = col
143
 
144
  if shipping_col:
145
- if plant_col:
146
- insights['Shipping Conditions'] = self.con.execute(f"""
147
- SELECT {shipping_col},
148
- COUNT(*) as order_count,
149
- COUNT(DISTINCT {plant_col}) as plants_served
150
- FROM salt_data
151
- WHERE {shipping_col} IS NOT NULL
152
- GROUP BY {shipping_col}
153
- ORDER BY order_count DESC
154
- LIMIT 10
155
- """).fetchdf()
156
- else:
157
- insights['Shipping Conditions'] = self.con.execute(f"""
158
- SELECT {shipping_col},
159
- COUNT(*) as order_count
160
- FROM salt_data
161
- WHERE {shipping_col} IS NOT NULL
162
- GROUP BY {shipping_col}
163
- ORDER BY order_count DESC
164
- LIMIT 10
165
- """).fetchdf()
166
-
167
- # General Data Overview
168
- insights['Dataset Overview'] = self.con.execute("""
169
- SELECT
170
- COUNT(*) as total_records,
171
- COUNT(DISTINCT CREATIONDATE) as unique_dates,
172
- MIN(CREATIONDATE) as earliest_date,
173
- MAX(CREATIONDATE) as latest_date
174
- FROM salt_data
175
- """).fetchdf()
176
 
177
  return insights
178
 
179
  except Exception as e:
180
- return f"Error generating insights: {str(e)}\n\nAvailable columns: {', '.join(self.available_columns[:10])}..."
 
 
 
 
181
 
182
  def clean_sql_response(self, sql_query: str) -> str:
183
  """Clean SQL response - avoiding string literal errors"""
@@ -205,8 +185,7 @@ class SALTAnalytics:
205
  try:
206
  client = openai.OpenAI(api_key=api_key)
207
 
208
- # Enhanced prompt with actual available columns
209
- columns_list = ", ".join(self.available_columns[:30]) # Include first 30 columns
210
 
211
  prompt = f"""
212
  You are a SQL expert analyzing SAP SALT dataset. The database has a table called 'salt_data' with these available columns:
@@ -215,7 +194,7 @@ class SALTAnalytics:
215
 
216
  The SALT dataset contains SAP ERP sales order data where each row represents a sales document item.
217
 
218
- IMPORTANT: Use only the column names I provided above. Do not assume column names like 'CUSTOMERID' exist.
219
 
220
  Convert this question to a DuckDB SQL query: "{question}"
221
 
@@ -261,6 +240,7 @@ def load_dataset_interface():
261
  return analytics.load_salt_dataset()
262
 
263
  def show_insights_interface():
 
264
  insights = analytics.get_predefined_insights()
265
 
266
  if isinstance(insights, str):
@@ -270,7 +250,7 @@ def show_insights_interface():
270
 
271
  for title, df in insights.items():
272
  output += f"## {title}\n\n"
273
- if len(df) > 0:
274
  output += df.to_markdown(index=False)
275
  else:
276
  output += "*No data available for this analysis*"
@@ -283,13 +263,12 @@ def qa_interface(question: str, api_key: str):
283
  return "Please enter a question"
284
  return analytics.natural_language_query(question, api_key)
285
 
286
- # Updated sample questions based on likely available columns
287
  sample_questions = [
288
  "Which sales offices process the most orders?",
289
  "What are the most common payment terms?",
290
  "Show me the distribution of shipping conditions",
291
  "What is the date range of orders in the dataset?",
292
- "Which plants are most frequently used?"
293
  ]
294
 
295
  with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo:
 
21
  return "Dataset already loaded!"
22
 
23
  try:
 
24
  hf_token = os.getenv('HF_TOKEN')
25
 
26
  if hf_token:
 
42
 
43
  df = dataset.to_pandas()
44
 
 
45
  if len(df) > 100000:
46
  df = df.sample(n=50000, random_state=42)
47
 
 
48
  self.con.execute("CREATE TABLE salt_data AS SELECT * FROM df")
49
 
 
50
  schema_result = self.con.execute("DESCRIBE salt_data").fetchall()
51
  self.schema_info = "\n".join([f"{col[0]}: {col[1]}" for col in schema_result])
52
  self.available_columns = [col[0] for col in schema_result]
53
 
54
  self.data_loaded = True
55
 
 
56
  return f"✅ Successfully loaded {len(df)} records into DuckDB\n\n📋 Available columns:\n" + "\n".join(f"• {col}" for col in self.available_columns[:20]) + ("\n... and more" if len(self.available_columns) > 20 else "")
57
 
58
  except Exception as e:
 
63
  return f"❌ Error loading dataset: {error_msg}"
64
 
65
  def get_predefined_insights(self):
66
+ """Generate predefined analytical insights - COMPLETELY FIXED"""
67
  if not self.data_loaded:
68
  return "Please load the dataset first"
69
 
70
  try:
71
  insights = {}
72
 
73
+ # Basic Dataset Overview - This always works
74
+ insights['Dataset Overview'] = self.con.execute("""
75
+ SELECT
76
+ COUNT(*) as total_records,
77
+ COUNT(DISTINCT CREATIONDATE) as unique_dates,
78
+ MIN(CREATIONDATE) as earliest_date,
79
+ MAX(CREATIONDATE) as latest_date
80
+ FROM salt_data
81
+ """).fetchdf()
 
 
 
 
 
 
 
 
 
82
 
83
+ # Payment Terms Distribution - Direct column reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  if 'CUSTOMERPAYMENTTERMS' in self.available_columns:
85
  insights['Payment Terms Distribution'] = self.con.execute("""
86
  SELECT CUSTOMERPAYMENTTERMS,
87
  COUNT(*) as frequency,
88
  ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
89
  FROM salt_data
90
+ WHERE CUSTOMERPAYMENTTERMS IS NOT NULL AND CUSTOMERPAYMENTTERMS != ''
91
  GROUP BY CUSTOMERPAYMENTTERMS
92
  ORDER BY frequency DESC
93
  LIMIT 10
94
  """).fetchdf()
95
 
96
+ # Sales Office Performance - Find and use the column
97
+ sales_office_col = None
98
+ for col in self.available_columns:
99
+ if 'SALES' in col.upper() and 'OFFICE' in col.upper():
100
+ sales_office_col = col
101
+ break
102
+
103
+ if sales_office_col:
104
+ query = f"""
105
+ SELECT {sales_office_col},
106
+ COUNT(*) as total_orders
107
+ FROM salt_data
108
+ WHERE {sales_office_col} IS NOT NULL AND {sales_office_col} != ''
109
+ GROUP BY {sales_office_col}
110
+ ORDER BY total_orders DESC
111
+ LIMIT 10
112
+ """
113
+ insights['Sales Office Performance'] = self.con.execute(query).fetchdf()
114
 
115
+ # Shipping Conditions Analysis
116
+ shipping_col = None
117
  for col in self.available_columns:
118
  if 'SHIPPING' in col.upper() and 'CONDITION' in col.upper():
119
  shipping_col = col
120
+ break
 
121
 
122
  if shipping_col:
123
+ query = f"""
124
+ SELECT {shipping_col},
125
+ COUNT(*) as order_count
126
+ FROM salt_data
127
+ WHERE {shipping_col} IS NOT NULL AND {shipping_col} != ''
128
+ GROUP BY {shipping_col}
129
+ ORDER BY order_count DESC
130
+ LIMIT 10
131
+ """
132
+ insights['Shipping Conditions'] = self.con.execute(query).fetchdf()
133
+
134
+ # Sales Document Categories
135
+ if 'SALESDOCUMENTITEMCATEGORY' in self.available_columns:
136
+ insights['Sales Document Categories'] = self.con.execute("""
137
+ SELECT SALESDOCUMENTITEMCATEGORY,
138
+ COUNT(*) as frequency,
139
+ ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
140
+ FROM salt_data
141
+ WHERE SALESDOCUMENTITEMCATEGORY IS NOT NULL AND SALESDOCUMENTITEMCATEGORY != ''
142
+ GROUP BY SALESDOCUMENTITEMCATEGORY
143
+ ORDER BY frequency DESC
144
+ LIMIT 10
145
+ """).fetchdf()
146
+
147
+ # Show available columns for debugging
148
+ insights['Available Columns Sample'] = pd.DataFrame({
149
+ 'Column Name': self.available_columns[:20],
150
+ 'Index': range(len(self.available_columns[:20]))
151
+ })
 
 
152
 
153
  return insights
154
 
155
  except Exception as e:
156
+ # Return detailed error information for debugging
157
+ return f"❌ Error generating insights: {str(e)}\n\n🔍 Debug Info:\n" + \
158
+ f"Data loaded: {self.data_loaded}\n" + \
159
+ f"Available columns ({len(self.available_columns)}): {', '.join(self.available_columns[:15])}...\n" + \
160
+ f"Error type: {type(e).__name__}"
161
 
162
  def clean_sql_response(self, sql_query: str) -> str:
163
  """Clean SQL response - avoiding string literal errors"""
 
185
  try:
186
  client = openai.OpenAI(api_key=api_key)
187
 
188
+ columns_list = ", ".join(self.available_columns[:30])
 
189
 
190
  prompt = f"""
191
  You are a SQL expert analyzing SAP SALT dataset. The database has a table called 'salt_data' with these available columns:
 
194
 
195
  The SALT dataset contains SAP ERP sales order data where each row represents a sales document item.
196
 
197
+ IMPORTANT: Use only the column names I provided above. Do not assume column names that don't exist.
198
 
199
  Convert this question to a DuckDB SQL query: "{question}"
200
 
 
240
  return analytics.load_salt_dataset()
241
 
242
  def show_insights_interface():
243
+ """Fixed insights interface with better error handling"""
244
  insights = analytics.get_predefined_insights()
245
 
246
  if isinstance(insights, str):
 
250
 
251
  for title, df in insights.items():
252
  output += f"## {title}\n\n"
253
+ if isinstance(df, pd.DataFrame) and len(df) > 0:
254
  output += df.to_markdown(index=False)
255
  else:
256
  output += "*No data available for this analysis*"
 
263
  return "Please enter a question"
264
  return analytics.natural_language_query(question, api_key)
265
 
 
266
  sample_questions = [
267
  "Which sales offices process the most orders?",
268
  "What are the most common payment terms?",
269
  "Show me the distribution of shipping conditions",
270
  "What is the date range of orders in the dataset?",
271
+ "Which document categories are most frequent?"
272
  ]
273
 
274
  with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo: