PD03 committed on
Commit
9231df6
·
verified ·
1 Parent(s): 31e4035

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +179 -68
app.py CHANGED
@@ -2,10 +2,10 @@ import gradio as gr
2
  import pandas as pd
3
  import duckdb
4
  from datasets import load_dataset
 
5
  import openai
6
  import os
7
  from typing import Dict, List, Any
8
- import json
9
 
10
  class SALTAnalytics:
11
  def __init__(self):
@@ -13,7 +13,7 @@ class SALTAnalytics:
13
  self.con = duckdb.connect(':memory:')
14
  self.data_loaded = False
15
  self.schema_info = ""
16
- self.openai_client = None
17
 
18
  def load_salt_dataset(self):
19
  """Load SAP SALT dataset from Hugging Face into DuckDB"""
@@ -21,79 +21,176 @@ class SALTAnalytics:
21
  return "Dataset already loaded!"
22
 
23
  try:
24
- dataset = load_dataset("SAP/SALT", "joined_table", split="train", streaming=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  df = dataset.to_pandas()
26
 
 
27
  if len(df) > 100000:
28
  df = df.sample(n=50000, random_state=42)
29
 
 
30
  self.con.execute("CREATE TABLE salt_data AS SELECT * FROM df")
31
 
 
32
  schema_result = self.con.execute("DESCRIBE salt_data").fetchall()
33
  self.schema_info = "\n".join([f"{col[0]}: {col[1]}" for col in schema_result])
 
34
 
35
  self.data_loaded = True
36
- return f"✅ Successfully loaded {len(df)} records into DuckDB"
 
 
37
 
38
  except Exception as e:
39
- return f"❌ Error loading dataset: {str(e)}"
 
 
 
 
40
 
41
  def get_predefined_insights(self):
42
- """Generate predefined analytical insights"""
43
  if not self.data_loaded:
44
  return "Please load the dataset first"
45
 
46
  try:
47
  insights = {}
48
 
49
- insights['Sales Office Performance'] = self.con.execute("""
50
- SELECT SALESOFFICE,
51
- COUNT(*) as total_orders,
52
- COUNT(DISTINCT CUSTOMERID) as unique_customers
53
- FROM salt_data
54
- GROUP BY SALESOFFICE
55
- ORDER BY total_orders DESC
56
- LIMIT 10
57
- """).fetchdf()
58
 
59
- insights['Payment Terms Distribution'] = self.con.execute("""
60
- SELECT CUSTOMERPAYMENTTERMS,
61
- COUNT(*) as frequency,
62
- ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
63
- FROM salt_data
64
- GROUP BY CUSTOMERPAYMENTTERMS
65
- ORDER BY frequency DESC
66
- """).fetchdf()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
- insights['Shipping Conditions'] = self.con.execute("""
69
- SELECT SHIPPINGCONDITION,
70
- COUNT(*) as order_count,
71
- COUNT(DISTINCT PLANT) as plants_served
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  FROM salt_data
73
- GROUP BY SHIPPINGCONDITION
74
- ORDER BY order_count DESC
75
  """).fetchdf()
76
 
77
  return insights
78
 
79
  except Exception as e:
80
- return f"Error generating insights: {str(e)}"
81
 
82
  def clean_sql_response(self, sql_query: str) -> str:
83
- """Clean SQL response - COMPLETELY FIXED"""
84
- # Use string concatenation to avoid syntax errors
85
  backticks = "`" + "`" + "`"
86
  sql_marker = backticks + "sql"
87
 
88
- # Remove start markers
89
  if sql_query.startswith(sql_marker):
90
- sql_query = sql_query[6:] # Remove ```
91
  elif sql_query.startswith(backticks):
92
- sql_query = sql_query[3:] # Remove ```
93
 
94
- # Remove end markers
95
  if sql_query.endswith(backticks):
96
- sql_query = sql_query[:-3] # Remove trailing ```
97
 
98
  return sql_query.strip()
99
 
@@ -108,13 +205,21 @@ class SALTAnalytics:
108
  try:
109
  client = openai.OpenAI(api_key=api_key)
110
 
 
 
 
111
  prompt = f"""
112
- You are a SQL expert analyzing SAP SALT dataset. The database has a table called 'salt_data' with this schema:
113
 
114
- {self.schema_info}
 
 
 
 
115
 
116
  Convert this question to a DuckDB SQL query: "{question}"
117
- Return ONLY the SQL query, no explanation. Limit results to 20 rows.
 
118
  """
119
 
120
  response = client.chat.completions.create(
@@ -123,7 +228,7 @@ class SALTAnalytics:
123
  temperature=0.1
124
  )
125
 
126
- sql_query = response.choices.message.content.strip()
127
  sql_query = self.clean_sql_response(sql_query)
128
 
129
  result_df = self.con.execute(sql_query).fetchdf()
@@ -132,7 +237,7 @@ class SALTAnalytics:
132
  Question: {question}
133
  Results: {result_df.head(10).to_string()}
134
 
135
- Provide a clear business explanation of these SAP ERP results in 2-3 sentences.
136
  """
137
 
138
  explanation_response = client.chat.completions.create(
@@ -141,14 +246,13 @@ class SALTAnalytics:
141
  temperature=0.3
142
  )
143
 
144
- explanation = explanation_response.choices.message.content
145
 
146
- # Safe output formatting
147
  code_block = "`" + "`" + "`"
148
  return f"**SQL Query:**\n{code_block}sql\n{sql_query}\n{code_block}\n\n**Results:**\n{result_df.to_string(index=False)}\n\n**Explanation:**\n{explanation}"
149
 
150
  except Exception as e:
151
- return f"Error: {str(e)}"
152
 
153
  # Initialize analytics
154
  analytics = SALTAnalytics()
@@ -166,7 +270,10 @@ def show_insights_interface():
166
 
167
  for title, df in insights.items():
168
  output += f"## {title}\n\n"
169
- output += df.to_markdown(index=False)
 
 
 
170
  output += "\n\n---\n\n"
171
 
172
  return output
@@ -176,12 +283,13 @@ def qa_interface(question: str, api_key: str):
176
  return "Please enter a question"
177
  return analytics.natural_language_query(question, api_key)
178
 
 
179
  sample_questions = [
180
- "Which sales office has the most customers?",
181
  "What are the most common payment terms?",
182
- "Show me shipping conditions by plant",
183
- "Which customers have the highest number of orders?",
184
- "What's the distribution of sales groups?"
185
  ]
186
 
187
  with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo:
@@ -190,14 +298,14 @@ with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo:
190
  # 🚀 SAP SALT Dataset Analytics Demo
191
  ## Open Source Analytics + AI for SAP ERP
192
 
193
- This demo showcases how open source tools (DuckDB + OpenAI) can generate massive value for enterprises running SAP ERP systems.
194
  """)
195
 
196
  with gr.Tab("📥 Load Dataset"):
197
  gr.Markdown("### Load SAP SALT Dataset from Hugging Face")
198
 
199
  load_btn = gr.Button("Load SALT Dataset", variant="primary")
200
- load_output = gr.Textbox(label="Status", lines=3)
201
 
202
  load_btn.click(fn=load_dataset_interface, outputs=load_output)
203
 
@@ -222,7 +330,7 @@ with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo:
222
 
223
  question_input = gr.Textbox(
224
  label="Your Question",
225
- placeholder="e.g., Which sales office handles the most customers?",
226
  lines=2
227
  )
228
 
@@ -251,24 +359,27 @@ with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo:
251
 
252
  with gr.Tab("ℹ️ About"):
253
  gr.Markdown("""
254
- ### About This Demo
 
 
 
 
 
 
255
 
256
- **Dataset**: SAP SALT (Sales Autocompletion Linked Business Tables)
257
- - Real SAP S/4HANA sales order data
258
- - 4 linked tables: Sales Documents, Items, Customers, Addresses
259
- - 8 classification targets for ML models
 
260
 
261
- **Technology Stack**:
262
- - **DuckDB**: High-performance analytics database
263
- - **OpenAI GPT-4**: Natural language to SQL conversion
264
- - **Hugging Face**: Dataset hosting and deployment
265
- - **Gradio 4.44**: Secure interactive web interface
266
 
267
- **Business Value**:
268
- - Automate sales order completion (70-80% accuracy)
269
- - Optimize customer-to-sales office assignments
270
- - Predict shipping and payment preferences
271
- - Generate actionable business insights
272
  """)
273
 
274
  if __name__ == "__main__":
 
2
  import pandas as pd
3
  import duckdb
4
  from datasets import load_dataset
5
+ from huggingface_hub import login
6
  import openai
7
  import os
8
  from typing import Dict, List, Any
 
9
 
10
  class SALTAnalytics:
11
  def __init__(self):
 
13
  self.con = duckdb.connect(':memory:')
14
  self.data_loaded = False
15
  self.schema_info = ""
16
+ self.available_columns = []
17
 
18
  def load_salt_dataset(self):
19
  """Load SAP SALT dataset from Hugging Face into DuckDB"""
 
21
  return "Dataset already loaded!"
22
 
23
  try:
24
+ # Try loading with authentication
25
+ hf_token = os.getenv('HF_TOKEN')
26
+
27
+ if hf_token:
28
+ dataset = load_dataset(
29
+ "SAP/SALT",
30
+ "joined_table",
31
+ split="train",
32
+ token=hf_token,
33
+ streaming=False
34
+ )
35
+ else:
36
+ dataset = load_dataset(
37
+ "SAP/SALT",
38
+ "joined_table",
39
+ split="train",
40
+ use_auth_token=True,
41
+ streaming=False
42
+ )
43
+
44
  df = dataset.to_pandas()
45
 
46
+ # Sample data for demo if too large
47
  if len(df) > 100000:
48
  df = df.sample(n=50000, random_state=42)
49
 
50
+ # Load into DuckDB
51
  self.con.execute("CREATE TABLE salt_data AS SELECT * FROM df")
52
 
53
+ # Get schema information and available columns
54
  schema_result = self.con.execute("DESCRIBE salt_data").fetchall()
55
  self.schema_info = "\n".join([f"{col[0]}: {col[1]}" for col in schema_result])
56
+ self.available_columns = [col[0] for col in schema_result]
57
 
58
  self.data_loaded = True
59
+
60
+ # Return success message with column info
61
+ return f"✅ Successfully loaded {len(df)} records into DuckDB\n\n📋 Available columns:\n" + "\n".join(f"• {col}" for col in self.available_columns[:20]) + ("\n... and more" if len(self.available_columns) > 20 else "")
62
 
63
  except Exception as e:
64
+ error_msg = str(e)
65
+ if "gated dataset" in error_msg or "authentication" in error_msg.lower():
66
+ return f"❌ Authentication Error: {error_msg}\n\nTo fix this:\n1. Go to https://huggingface.co/datasets/SAP/SALT\n2. Request access to the dataset\n3. Wait for approval\n4. Set HF_TOKEN in your Space secrets"
67
+ else:
68
+ return f"❌ Error loading dataset: {error_msg}"
69
 
70
  def get_predefined_insights(self):
71
+ """Generate predefined analytical insights using correct column names"""
72
  if not self.data_loaded:
73
  return "Please load the dataset first"
74
 
75
  try:
76
  insights = {}
77
 
78
+ # Find the right customer and sales office columns
79
+ customer_col = None
80
+ sales_office_col = None
 
 
 
 
 
 
81
 
82
+ # Look for customer-related columns
83
+ for col in self.available_columns:
84
+ if 'CUSTOMER' in col.upper() and ('ID' in col.upper() or 'NUM' in col.upper()):
85
+ customer_col = col
86
+ break
87
+ elif 'SHIP' in col.upper() and 'PARTY' in col.upper():
88
+ customer_col = col # ShipToParty is often used as customer identifier
89
+ break
90
+
91
+ # Look for sales office column
92
+ for col in self.available_columns:
93
+ if 'SALES' in col.upper() and 'OFFICE' in col.upper():
94
+ sales_office_col = col
95
+ break
96
+
97
+ # Sales Office Performance (adjusted for available columns)
98
+ if sales_office_col:
99
+ if customer_col:
100
+ insights['Sales Office Performance'] = self.con.execute(f"""
101
+ SELECT {sales_office_col},
102
+ COUNT(*) as total_orders,
103
+ COUNT(DISTINCT {customer_col}) as unique_customers
104
+ FROM salt_data
105
+ WHERE {sales_office_col} IS NOT NULL
106
+ GROUP BY {sales_office_col}
107
+ ORDER BY total_orders DESC
108
+ LIMIT 10
109
+ """).fetchdf()
110
+ else:
111
+ insights['Sales Office Performance'] = self.con.execute(f"""
112
+ SELECT {sales_office_col},
113
+ COUNT(*) as total_orders
114
+ FROM salt_data
115
+ WHERE {sales_office_col} IS NOT NULL
116
+ GROUP BY {sales_office_col}
117
+ ORDER BY total_orders DESC
118
+ LIMIT 10
119
+ """).fetchdf()
120
 
121
+ # Payment Terms Distribution (if available)
122
+ if 'CUSTOMERPAYMENTTERMS' in self.available_columns:
123
+ insights['Payment Terms Distribution'] = self.con.execute("""
124
+ SELECT CUSTOMERPAYMENTTERMS,
125
+ COUNT(*) as frequency,
126
+ ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
127
+ FROM salt_data
128
+ WHERE CUSTOMERPAYMENTTERMS IS NOT NULL
129
+ GROUP BY CUSTOMERPAYMENTTERMS
130
+ ORDER BY frequency DESC
131
+ LIMIT 10
132
+ """).fetchdf()
133
+
134
+ # Shipping Conditions Analysis (look for shipping-related columns)
135
+ shipping_col = None
136
+ plant_col = None
137
+
138
+ for col in self.available_columns:
139
+ if 'SHIPPING' in col.upper() and 'CONDITION' in col.upper():
140
+ shipping_col = col
141
+ elif 'PLANT' in col.upper():
142
+ plant_col = col
143
+
144
+ if shipping_col:
145
+ if plant_col:
146
+ insights['Shipping Conditions'] = self.con.execute(f"""
147
+ SELECT {shipping_col},
148
+ COUNT(*) as order_count,
149
+ COUNT(DISTINCT {plant_col}) as plants_served
150
+ FROM salt_data
151
+ WHERE {shipping_col} IS NOT NULL
152
+ GROUP BY {shipping_col}
153
+ ORDER BY order_count DESC
154
+ LIMIT 10
155
+ """).fetchdf()
156
+ else:
157
+ insights['Shipping Conditions'] = self.con.execute(f"""
158
+ SELECT {shipping_col},
159
+ COUNT(*) as order_count
160
+ FROM salt_data
161
+ WHERE {shipping_col} IS NOT NULL
162
+ GROUP BY {shipping_col}
163
+ ORDER BY order_count DESC
164
+ LIMIT 10
165
+ """).fetchdf()
166
+
167
+ # General Data Overview
168
+ insights['Dataset Overview'] = self.con.execute("""
169
+ SELECT
170
+ COUNT(*) as total_records,
171
+ COUNT(DISTINCT CREATIONDATE) as unique_dates,
172
+ MIN(CREATIONDATE) as earliest_date,
173
+ MAX(CREATIONDATE) as latest_date
174
  FROM salt_data
 
 
175
  """).fetchdf()
176
 
177
  return insights
178
 
179
  except Exception as e:
180
+ return f"Error generating insights: {str(e)}\n\nAvailable columns: {', '.join(self.available_columns[:10])}..."
181
 
182
  def clean_sql_response(self, sql_query: str) -> str:
183
+ """Clean SQL response - avoiding string literal errors"""
 
184
  backticks = "`" + "`" + "`"
185
  sql_marker = backticks + "sql"
186
 
 
187
  if sql_query.startswith(sql_marker):
188
+ sql_query = sql_query[6:]
189
  elif sql_query.startswith(backticks):
190
+ sql_query = sql_query[3:]
191
 
 
192
  if sql_query.endswith(backticks):
193
+ sql_query = sql_query[:-3]
194
 
195
  return sql_query.strip()
196
 
 
205
  try:
206
  client = openai.OpenAI(api_key=api_key)
207
 
208
+ # Enhanced prompt with actual available columns
209
+ columns_list = ", ".join(self.available_columns[:30]) # Include first 30 columns
210
+
211
  prompt = f"""
212
+ You are a SQL expert analyzing SAP SALT dataset. The database has a table called 'salt_data' with these available columns:
213
 
214
+ {columns_list}
215
+
216
+ The SALT dataset contains SAP ERP sales order data where each row represents a sales document item.
217
+
218
+ IMPORTANT: Use only the column names I provided above. Do not assume column names like 'CUSTOMERID' exist.
219
 
220
  Convert this question to a DuckDB SQL query: "{question}"
221
+
222
+ Return ONLY the SQL query, no explanation. Limit results to 20 rows and use WHERE clauses to filter out NULL values.
223
  """
224
 
225
  response = client.chat.completions.create(
 
228
  temperature=0.1
229
  )
230
 
231
+ sql_query = response.choices[0].message.content.strip()
232
  sql_query = self.clean_sql_response(sql_query)
233
 
234
  result_df = self.con.execute(sql_query).fetchdf()
 
237
  Question: {question}
238
  Results: {result_df.head(10).to_string()}
239
 
240
+ Provide a clear business explanation of these SAP ERP results in 2-3 sentences, focusing on actionable insights for sales operations.
241
  """
242
 
243
  explanation_response = client.chat.completions.create(
 
246
  temperature=0.3
247
  )
248
 
249
+ explanation = explanation_response.choices[0].message.content
250
 
 
251
  code_block = "`" + "`" + "`"
252
  return f"**SQL Query:**\n{code_block}sql\n{sql_query}\n{code_block}\n\n**Results:**\n{result_df.to_string(index=False)}\n\n**Explanation:**\n{explanation}"
253
 
254
  except Exception as e:
255
+ return f"Error: {str(e)}\n\nTry rephrasing your question. Available columns: {', '.join(self.available_columns[:10])}..."
256
 
257
  # Initialize analytics
258
  analytics = SALTAnalytics()
 
270
 
271
  for title, df in insights.items():
272
  output += f"## {title}\n\n"
273
+ if len(df) > 0:
274
+ output += df.to_markdown(index=False)
275
+ else:
276
+ output += "*No data available for this analysis*"
277
  output += "\n\n---\n\n"
278
 
279
  return output
 
283
  return "Please enter a question"
284
  return analytics.natural_language_query(question, api_key)
285
 
286
+ # Updated sample questions based on likely available columns
287
  sample_questions = [
288
+ "Which sales offices process the most orders?",
289
  "What are the most common payment terms?",
290
+ "Show me the distribution of shipping conditions",
291
+ "What is the date range of orders in the dataset?",
292
+ "Which plants are most frequently used?"
293
  ]
294
 
295
  with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo:
 
298
  # 🚀 SAP SALT Dataset Analytics Demo
299
  ## Open Source Analytics + AI for SAP ERP
300
 
301
+ This demo uses the **authentic SAP SALT dataset** - real ERP data from sales orders, items, customers, and addresses.
302
  """)
303
 
304
  with gr.Tab("📥 Load Dataset"):
305
  gr.Markdown("### Load SAP SALT Dataset from Hugging Face")
306
 
307
  load_btn = gr.Button("Load SALT Dataset", variant="primary")
308
+ load_output = gr.Textbox(label="Status", lines=8)
309
 
310
  load_btn.click(fn=load_dataset_interface, outputs=load_output)
311
 
 
330
 
331
  question_input = gr.Textbox(
332
  label="Your Question",
333
+ placeholder="e.g., Which sales offices process the most orders?",
334
  lines=2
335
  )
336
 
 
359
 
360
  with gr.Tab("ℹ️ About"):
361
  gr.Markdown("""
362
+ ### About the SALT Dataset
363
+
364
+ **SAP SALT** (Sales Autocompletion Linked Business Tables) contains:
365
+ - **500,908 sales orders** from real SAP S/4HANA system
366
+ - **2.3M sales order line items**
367
+ - **139,611 unique customers**
368
+ - **Data from 2018-2020** with full business context
369
 
370
+ **Key Use Cases:**
371
+ - Sales process automation (70-80% accuracy)
372
+ - Customer behavior analysis
373
+ - Shipping and logistics optimization
374
+ - Payment terms prediction
375
 
376
+ **Technology Stack:**
377
+ - **DuckDB**: High-performance analytics
378
+ - **OpenAI GPT-4**: Natural language to SQL
379
+ - **Gradio**: Interactive interface
380
+ - **Real ERP Data**: Authentic business scenarios
381
 
382
+ This demonstrates how **open source tools** can unlock massive value from enterprise SAP systems at zero licensing cost.
 
 
 
 
383
  """)
384
 
385
  if __name__ == "__main__":