|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import duckdb |
|
|
from datasets import load_dataset |
|
|
import openai |
|
|
import os |
|
|
from typing import Dict, List, Any |
|
|
import json |
|
|
|
|
|
class SALTAnalytics: |
|
|
def __init__(self): |
|
|
"""Initialize SALT Analytics""" |
|
|
self.con = duckdb.connect(':memory:') |
|
|
self.data_loaded = False |
|
|
self.schema_info = "" |
|
|
self.openai_client = None |
|
|
|
|
|
def setup_openai(self, api_key: str): |
|
|
"""Setup OpenAI client with API key""" |
|
|
try: |
|
|
self.openai_client = openai.OpenAI(api_key=api_key) |
|
|
return True |
|
|
except Exception as e: |
|
|
return False |
|
|
|
|
|
def load_salt_dataset(self): |
|
|
"""Load SAP SALT dataset from Hugging Face into DuckDB""" |
|
|
if self.data_loaded: |
|
|
return "Dataset already loaded!" |
|
|
|
|
|
try: |
|
|
|
|
|
dataset = load_dataset("SAP/SALT", "joined_table", split="train", streaming=False) |
|
|
df = dataset.to_pandas() |
|
|
|
|
|
|
|
|
if len(df) > 100000: |
|
|
df = df.sample(n=50000, random_state=42) |
|
|
|
|
|
|
|
|
self.con.execute("CREATE TABLE salt_data AS SELECT * FROM df") |
|
|
|
|
|
|
|
|
schema_result = self.con.execute("DESCRIBE salt_data").fetchall() |
|
|
self.schema_info = "\n".join([f"{col}: {col}" for col in schema_result])[1] |
|
|
|
|
|
self.data_loaded = True |
|
|
return f"β
Successfully loaded {len(df)} records into DuckDB" |
|
|
|
|
|
except Exception as e: |
|
|
return f"β Error loading dataset: {str(e)}" |
|
|
|
|
|
def get_predefined_insights(self): |
|
|
"""Generate predefined analytical insights""" |
|
|
if not self.data_loaded: |
|
|
return "Please load the dataset first" |
|
|
|
|
|
try: |
|
|
insights = {} |
|
|
|
|
|
|
|
|
insights['Sales Office Performance'] = self.con.execute(""" |
|
|
SELECT SALESOFFICE, |
|
|
COUNT(*) as total_orders, |
|
|
COUNT(DISTINCT CUSTOMERID) as unique_customers |
|
|
FROM salt_data |
|
|
GROUP BY SALESOFFICE |
|
|
ORDER BY total_orders DESC |
|
|
LIMIT 10 |
|
|
""").fetchdf() |
|
|
|
|
|
|
|
|
insights['Payment Terms Distribution'] = self.con.execute(""" |
|
|
SELECT CUSTOMERPAYMENTTERMS, |
|
|
COUNT(*) as frequency, |
|
|
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage |
|
|
FROM salt_data |
|
|
GROUP BY CUSTOMERPAYMENTTERMS |
|
|
ORDER BY frequency DESC |
|
|
""").fetchdf() |
|
|
|
|
|
|
|
|
insights['Shipping Conditions'] = self.con.execute(""" |
|
|
SELECT SHIPPINGCONDITION, |
|
|
COUNT(*) as order_count, |
|
|
COUNT(DISTINCT PLANT) as plants_served |
|
|
FROM salt_data |
|
|
GROUP BY SHIPPINGCONDITION |
|
|
ORDER BY order_count DESC |
|
|
""").fetchdf() |
|
|
|
|
|
return insights |
|
|
|
|
|
except Exception as e: |
|
|
return f"Error generating insights: {str(e)}" |
|
|
|
|
|
def natural_language_query(self, question: str, api_key: str): |
|
|
"""Convert natural language to SQL and execute""" |
|
|
if not self.data_loaded: |
|
|
return "Please load the dataset first" |
|
|
|
|
|
if not api_key: |
|
|
return "Please provide OpenAI API key" |
|
|
|
|
|
try: |
|
|
|
|
|
client = openai.OpenAI(api_key=api_key) |
|
|
|
|
|
prompt = f""" |
|
|
You are a SQL expert analyzing SAP SALT dataset. The database has a table called 'salt_data' with this schema: |
|
|
|
|
|
{self.schema_info} |
|
|
|
|
|
The SALT dataset contains SAP ERP sales order data. Key fields: |
|
|
- SALESOFFICE, SALESGROUP: Sales organization |
|
|
- CUSTOMERID: Customer identifier |
|
|
- CUSTOMERPAYMENTTERMS: Payment terms (Net30, Net45, etc.) |
|
|
- SHIPPINGCONDITION, SHIPPINGPOINT: Shipping logistics |
|
|
- PLANT: Manufacturing location |
|
|
- HEADERINCOTERMSCLASSIFICATION, ITEMINCOTERMSCLASSIFICATION: Trade terms |
|
|
|
|
|
Convert this question to a DuckDB SQL query: "{question}" |
|
|
|
|
|
Return ONLY the SQL query, no explanation. Limit results to 20 rows. |
|
|
""" |
|
|
|
|
|
response = client.chat.completions.create( |
|
|
model="gpt-4", |
|
|
messages=[{"role": "user", "content": prompt}], |
|
|
temperature=0.1 |
|
|
) |
|
|
|
|
|
sql_query = response.choices.message.content.strip() |
|
|
|
|
|
|
|
|
if sql_query.startswith('```sql'): |
|
|
sql_query = sql_query[6:-3] |
|
|
elif sql_query.startswith('``` |
|
|
sql_query = sql_query[3:-3] |
|
|
|
|
|
# Execute query |
|
|
result_df = self.con.execute(sql_query).fetchdf() |
|
|
|
|
|
# Get explanation |
|
|
explanation_prompt = f""" |
|
|
Question: {question} |
|
|
Results: {result_df.head(10).to_string()} |
|
|
|
|
|
Provide a clear business explanation of these SAP ERP results in 2-3 sentences. |
|
|
""" |
|
|
|
|
|
explanation_response = client.chat.completions.create( |
|
|
model="gpt-4", |
|
|
messages=[{"role": "user", "content": explanation_prompt}], |
|
|
temperature=0.3 |
|
|
) |
|
|
|
|
|
explanation = explanation_response.choices.message.content |
|
|
|
|
|
return f"**SQL Query:**\n```sql\n{sql_query}\n``` |
|
|
|
|
|
except Exception as e: |
|
|
return f"Error: {str(e)}" |
|
|
|
|
|
# Initialize analytics |
|
|
analytics = SALTAnalytics() |
|
|
|
|
|
def load_dataset_interface(): |
|
|
"""Interface for loading dataset""" |
|
|
result = analytics.load_salt_dataset() |
|
|
return result |
|
|
|
|
|
def show_insights_interface(): |
|
|
"""Interface for showing insights""" |
|
|
insights = analytics.get_predefined_insights() |
|
|
|
|
|
if isinstance(insights, str): |
|
|
return insights |
|
|
|
|
|
output = "# π SAP SALT Dataset Insights\n\n" |
|
|
|
|
|
for title, df in insights.items(): |
|
|
output += f"## {title}\n\n" |
|
|
output += df.to_markdown(index=False) |
|
|
output += "\n\n---\n\n" |
|
|
|
|
|
return output |
|
|
|
|
|
def qa_interface(question: str, api_key: str): |
|
|
"""Interface for Q&A functionality""" |
|
|
if not question.strip(): |
|
|
return "Please enter a question" |
|
|
|
|
|
result = analytics.natural_language_query(question, api_key) |
|
|
return result |
|
|
|
|
|
# Sample questions for the interface |
|
|
sample_questions = [ |
|
|
"Which sales office has the most customers?", |
|
|
"What are the most common payment terms?", |
|
|
"Show me shipping conditions by plant", |
|
|
"Which customers have the highest number of orders?", |
|
|
"What's the distribution of sales groups?" |
|
|
] |
|
|
|
|
|
# Create Gradio interface |
|
|
with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo: |
|
|
|
|
|
gr.Markdown(""" |
|
|
|
|
|
|
|
|
|
|
|
This demo showcases how open source tools (DuckDB + OpenAI) can generate massive value for enterprises running SAP ERP systems. |
|
|
""") |
|
|
|
|
|
with gr.Tab("π₯ Load Dataset"): |
|
|
gr.Markdown("### Load SAP SALT Dataset from Hugging Face") |
|
|
|
|
|
load_btn = gr.Button("Load SALT Dataset", variant="primary") |
|
|
load_output = gr.Textbox(label="Status", lines=3) |
|
|
|
|
|
load_btn.click( |
|
|
fn=load_dataset_interface, |
|
|
outputs=load_output |
|
|
) |
|
|
|
|
|
with gr.Tab("π Insights"): |
|
|
gr.Markdown("### Pre-built Analytics Insights") |
|
|
|
|
|
insights_btn = gr.Button("Generate Insights", variant="primary") |
|
|
insights_output = gr.Markdown() |
|
|
|
|
|
insights_btn.click( |
|
|
fn=show_insights_interface, |
|
|
outputs=insights_output |
|
|
) |
|
|
|
|
|
with gr.Tab("π€ AI Q&A"): |
|
|
gr.Markdown("### Ask Questions in Natural Language") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(scale=3): |
|
|
api_key_input = gr.Textbox( |
|
|
label="OpenAI API Key", |
|
|
type="password", |
|
|
placeholder="Enter your OpenAI API key" |
|
|
) |
|
|
|
|
|
question_input = gr.Textbox( |
|
|
label="Your Question", |
|
|
placeholder="e.g., Which sales office handles the most customers?", |
|
|
lines=2 |
|
|
) |
|
|
|
|
|
sample_dropdown = gr.Dropdown( |
|
|
choices=sample_questions, |
|
|
label="Or choose a sample question", |
|
|
value=None |
|
|
) |
|
|
|
|
|
ask_btn = gr.Button("Get Answer", variant="primary") |
|
|
|
|
|
with gr.Column(scale=4): |
|
|
qa_output = gr.Markdown() |
|
|
|
|
|
# Update question input when sample is selected |
|
|
sample_dropdown.change( |
|
|
fn=lambda x: x if x else "", |
|
|
inputs=sample_dropdown, |
|
|
outputs=question_input |
|
|
) |
|
|
|
|
|
ask_btn.click( |
|
|
fn=qa_interface, |
|
|
inputs=[question_input, api_key_input], |
|
|
outputs=qa_output |
|
|
) |
|
|
|
|
|
with gr.Tab("βΉοΈ About"): |
|
|
gr.Markdown(""" |
|
|
|
|
|
|
|
|
**Dataset**: SAP SALT (Sales Autocompletion Linked Business Tables) |
|
|
- Real SAP S/4HANA sales order data |
|
|
- 4 linked tables: Sales Documents, Items, Customers, Addresses |
|
|
- 8 classification targets for ML models |
|
|
|
|
|
**Technology Stack**: |
|
|
- **DuckDB**: High-performance analytics database |
|
|
- **OpenAI GPT-4**: Natural language to SQL conversion |
|
|
- **Hugging Face**: Dataset hosting and deployment |
|
|
- **Gradio**: Interactive web interface |
|
|
|
|
|
**Business Value**: |
|
|
- Automate sales order completion (70-80% accuracy) |
|
|
- Optimize customer-to-sales office assignments |
|
|
- Predict shipping and payment preferences |
|
|
- Generate actionable business insights |
|
|
|
|
|
**Open Source Benefits**: |
|
|
- Zero licensing costs vs. proprietary SAP analytics |
|
|
- Full customization and control |
|
|
- Community-driven improvements |
|
|
- Easy integration with existing systems |
|
|
""") |
|
|
|
|
|
if __name__ == "__main__": |
|
|
demo.launch() |
|
|
|