Spaces:

PD03
/

Salt

Sleeping

App Files Community

PD03 committed on Sep 1

Commit

9231df6

verified ·

1 Parent(s): 31e4035

Update app.py

Browse files

Files changed (1) hide show

app.py +179 -68

app.py CHANGED Viewed

@@ -2,10 +2,10 @@ import gradio as gr
 import pandas as pd
 import duckdb
 from datasets import load_dataset
 import openai
 import os
 from typing import Dict, List, Any
-import json
 class SALTAnalytics:
     def __init__(self):
@@ -13,7 +13,7 @@ class SALTAnalytics:
         self.con = duckdb.connect(':memory:')
         self.data_loaded = False
         self.schema_info = ""
-        self.openai_client = None
     def load_salt_dataset(self):
         """Load SAP SALT dataset from Hugging Face into DuckDB"""
@@ -21,79 +21,176 @@ class SALTAnalytics:
             return "Dataset already loaded!"
         try:
-            dataset = load_dataset("SAP/SALT", "joined_table", split="train", streaming=False)
             df = dataset.to_pandas()
             if len(df) > 100000:
                 df = df.sample(n=50000, random_state=42)
             self.con.execute("CREATE TABLE salt_data AS SELECT * FROM df")
             schema_result = self.con.execute("DESCRIBE salt_data").fetchall()
             self.schema_info = "\n".join([f"{col[0]}: {col[1]}" for col in schema_result])
             self.data_loaded = True
-            return f"✅ Successfully loaded {len(df)} records into DuckDB"
         except Exception as e:
-            return f"❌ Error loading dataset: {str(e)}"
     def get_predefined_insights(self):
-        """Generate predefined analytical insights"""
         if not self.data_loaded:
             return "Please load the dataset first"
         try:
             insights = {}
-            insights['Sales Office Performance'] = self.con.execute("""
-                SELECT SALESOFFICE,
-                       COUNT(*) as total_orders,
-                       COUNT(DISTINCT CUSTOMERID) as unique_customers
-                FROM salt_data
-                GROUP BY SALESOFFICE
-                ORDER BY total_orders DESC
-                LIMIT 10
-            """).fetchdf()
-            insights['Payment Terms Distribution'] = self.con.execute("""
-                SELECT CUSTOMERPAYMENTTERMS,
-                       COUNT(*) as frequency,
-                       ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
-                FROM salt_data
-                GROUP BY CUSTOMERPAYMENTTERMS
-                ORDER BY frequency DESC
-            """).fetchdf()
-            insights['Shipping Conditions'] = self.con.execute("""
-                SELECT SHIPPINGCONDITION,
-                       COUNT(*) as order_count,
-                       COUNT(DISTINCT PLANT) as plants_served
                 FROM salt_data
-                GROUP BY SHIPPINGCONDITION
-                ORDER BY order_count DESC
             """).fetchdf()
             return insights
         except Exception as e:
-            return f"Error generating insights: {str(e)}"
     def clean_sql_response(self, sql_query: str) -> str:
-        """Clean SQL response - COMPLETELY FIXED"""
-        # Use string concatenation to avoid syntax errors
         backticks = "`" + "`" + "`"
         sql_marker = backticks + "sql"
-        # Remove start markers
         if sql_query.startswith(sql_marker):
-            sql_query = sql_query[6:]  # Remove ```
         elif sql_query.startswith(backticks):
-            sql_query = sql_query[3:]  # Remove ```
-        # Remove end markers
         if sql_query.endswith(backticks):
-            sql_query = sql_query[:-3]  # Remove trailing ```
         return sql_query.strip()
@@ -108,13 +205,21 @@ class SALTAnalytics:
         try:
             client = openai.OpenAI(api_key=api_key)
             prompt = f"""
-            You are a SQL expert analyzing SAP SALT dataset. The database has a table called 'salt_data' with this schema:
-            {self.schema_info}
             Convert this question to a DuckDB SQL query: "{question}"
-            Return ONLY the SQL query, no explanation. Limit results to 20 rows.
             """
             response = client.chat.completions.create(
@@ -123,7 +228,7 @@ class SALTAnalytics:
                 temperature=0.1
             )
-            sql_query = response.choices.message.content.strip()
             sql_query = self.clean_sql_response(sql_query)
             result_df = self.con.execute(sql_query).fetchdf()
@@ -132,7 +237,7 @@ class SALTAnalytics:
             Question: {question}
             Results: {result_df.head(10).to_string()}
-            Provide a clear business explanation of these SAP ERP results in 2-3 sentences.
             """
             explanation_response = client.chat.completions.create(
@@ -141,14 +246,13 @@ class SALTAnalytics:
                 temperature=0.3
             )
-            explanation = explanation_response.choices.message.content
-            # Safe output formatting
             code_block = "`" + "`" + "`"
             return f"**SQL Query:**\n{code_block}sql\n{sql_query}\n{code_block}\n\n**Results:**\n{result_df.to_string(index=False)}\n\n**Explanation:**\n{explanation}"
         except Exception as e:
-            return f"Error: {str(e)}"
 # Initialize analytics
 analytics = SALTAnalytics()
@@ -166,7 +270,10 @@ def show_insights_interface():
     for title, df in insights.items():
         output += f"## {title}\n\n"
-        output += df.to_markdown(index=False)
         output += "\n\n---\n\n"
     return output
@@ -176,12 +283,13 @@ def qa_interface(question: str, api_key: str):
         return "Please enter a question"
     return analytics.natural_language_query(question, api_key)
 sample_questions = [
-    "Which sales office has the most customers?",
     "What are the most common payment terms?",
-    "Show me shipping conditions by plant",
-    "Which customers have the highest number of orders?",
-    "What's the distribution of sales groups?"
 ]
 with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo:
@@ -190,14 +298,14 @@ with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo:
     # 🚀 SAP SALT Dataset Analytics Demo
     ## Open Source Analytics + AI for SAP ERP
-    This demo showcases how open source tools (DuckDB + OpenAI) can generate massive value for enterprises running SAP ERP systems.
     """)
     with gr.Tab("📥 Load Dataset"):
         gr.Markdown("### Load SAP SALT Dataset from Hugging Face")
         load_btn = gr.Button("Load SALT Dataset", variant="primary")
-        load_output = gr.Textbox(label="Status", lines=3)
         load_btn.click(fn=load_dataset_interface, outputs=load_output)
@@ -222,7 +330,7 @@ with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo:
                 question_input = gr.Textbox(
                     label="Your Question",
-                    placeholder="e.g., Which sales office handles the most customers?",
                     lines=2
                 )
@@ -251,24 +359,27 @@ with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo:
     with gr.Tab("ℹ️ About"):
         gr.Markdown("""
-        ### About This Demo
-        **Dataset**: SAP SALT (Sales Autocompletion Linked Business Tables)
-        - Real SAP S/4HANA sales order data
-        - 4 linked tables: Sales Documents, Items, Customers, Addresses
-        - 8 classification targets for ML models
-        **Technology Stack**:
-        - **DuckDB**: High-performance analytics database
-        - **OpenAI GPT-4**: Natural language to SQL conversion
-        - **Hugging Face**: Dataset hosting and deployment
-        - **Gradio 4.44**: Secure interactive web interface
-        **Business Value**:
-        - Automate sales order completion (70-80% accuracy)
-        - Optimize customer-to-sales office assignments
-        - Predict shipping and payment preferences
-        - Generate actionable business insights
         """)
 if __name__ == "__main__":

 import pandas as pd
 import duckdb
 from datasets import load_dataset
+from huggingface_hub import login
 import openai
 import os
 from typing import Dict, List, Any
 class SALTAnalytics:
     def __init__(self):
         self.con = duckdb.connect(':memory:')
         self.data_loaded = False
         self.schema_info = ""
+        self.available_columns = []
     def load_salt_dataset(self):
         """Load SAP SALT dataset from Hugging Face into DuckDB"""
             return "Dataset already loaded!"
         try:
+            # Try loading with authentication
+            hf_token = os.getenv('HF_TOKEN')
+            if hf_token:
+                dataset = load_dataset(
+                    "SAP/SALT",
+                    "joined_table",
+                    split="train",
+                    token=hf_token,
+                    streaming=False
+                )
+            else:
+                dataset = load_dataset(
+                    "SAP/SALT",
+                    "joined_table",
+                    split="train",
+                    use_auth_token=True,
+                    streaming=False
+                )
             df = dataset.to_pandas()
+            # Sample data for demo if too large
             if len(df) > 100000:
                 df = df.sample(n=50000, random_state=42)
+            # Load into DuckDB
             self.con.execute("CREATE TABLE salt_data AS SELECT * FROM df")
+            # Get schema information and available columns
             schema_result = self.con.execute("DESCRIBE salt_data").fetchall()
             self.schema_info = "\n".join([f"{col[0]}: {col[1]}" for col in schema_result])
+            self.available_columns = [col[0] for col in schema_result]
             self.data_loaded = True
+            # Return success message with column info
+            return f"✅ Successfully loaded {len(df)} records into DuckDB\n\n📋 Available columns:\n" + "\n".join(f"• {col}" for col in self.available_columns[:20]) + ("\n... and more" if len(self.available_columns) > 20 else "")
         except Exception as e:
+            error_msg = str(e)
+            if "gated dataset" in error_msg or "authentication" in error_msg.lower():
+                return f"❌ Authentication Error: {error_msg}\n\nTo fix this:\n1. Go to https://huggingface.co/datasets/SAP/SALT\n2. Request access to the dataset\n3. Wait for approval\n4. Set HF_TOKEN in your Space secrets"
+            else:
+                return f"❌ Error loading dataset: {error_msg}"
     def get_predefined_insights(self):
+        """Generate predefined analytical insights using correct column names"""
         if not self.data_loaded:
             return "Please load the dataset first"
         try:
             insights = {}
+            # Find the right customer and sales office columns
+            customer_col = None
+            sales_office_col = None
+            # Look for customer-related columns
+            for col in self.available_columns:
+                if 'CUSTOMER' in col.upper() and ('ID' in col.upper() or 'NUM' in col.upper()):
+                    customer_col = col
+                    break
+                elif 'SHIP' in col.upper() and 'PARTY' in col.upper():
+                    customer_col = col  # ShipToParty is often used as customer identifier
+                    break
+            # Look for sales office column
+            for col in self.available_columns:
+                if 'SALES' in col.upper() and 'OFFICE' in col.upper():
+                    sales_office_col = col
+                    break
+            # Sales Office Performance (adjusted for available columns)
+            if sales_office_col:
+                if customer_col:
+                    insights['Sales Office Performance'] = self.con.execute(f"""
+                        SELECT {sales_office_col},
+                               COUNT(*) as total_orders,
+                               COUNT(DISTINCT {customer_col}) as unique_customers
+                        FROM salt_data
+                        WHERE {sales_office_col} IS NOT NULL
+                        GROUP BY {sales_office_col}
+                        ORDER BY total_orders DESC
+                        LIMIT 10
+                    """).fetchdf()
+                else:
+                    insights['Sales Office Performance'] = self.con.execute(f"""
+                        SELECT {sales_office_col},
+                               COUNT(*) as total_orders
+                        FROM salt_data
+                        WHERE {sales_office_col} IS NOT NULL
+                        GROUP BY {sales_office_col}
+                        ORDER BY total_orders DESC
+                        LIMIT 10
+                    """).fetchdf()
+            # Payment Terms Distribution (if available)
+            if 'CUSTOMERPAYMENTTERMS' in self.available_columns:
+                insights['Payment Terms Distribution'] = self.con.execute("""
+                    SELECT CUSTOMERPAYMENTTERMS,
+                           COUNT(*) as frequency,
+                           ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
+                    FROM salt_data
+                    WHERE CUSTOMERPAYMENTTERMS IS NOT NULL
+                    GROUP BY CUSTOMERPAYMENTTERMS
+                    ORDER BY frequency DESC
+                    LIMIT 10
+                """).fetchdf()
+            # Shipping Conditions Analysis (look for shipping-related columns)
+            shipping_col = None
+            plant_col = None
+            for col in self.available_columns:
+                if 'SHIPPING' in col.upper() and 'CONDITION' in col.upper():
+                    shipping_col = col
+                elif 'PLANT' in col.upper():
+                    plant_col = col
+            if shipping_col:
+                if plant_col:
+                    insights['Shipping Conditions'] = self.con.execute(f"""
+                        SELECT {shipping_col},
+                               COUNT(*) as order_count,
+                               COUNT(DISTINCT {plant_col}) as plants_served
+                        FROM salt_data
+                        WHERE {shipping_col} IS NOT NULL
+                        GROUP BY {shipping_col}
+                        ORDER BY order_count DESC
+                        LIMIT 10
+                    """).fetchdf()
+                else:
+                    insights['Shipping Conditions'] = self.con.execute(f"""
+                        SELECT {shipping_col},
+                               COUNT(*) as order_count
+                        FROM salt_data
+                        WHERE {shipping_col} IS NOT NULL
+                        GROUP BY {shipping_col}
+                        ORDER BY order_count DESC
+                        LIMIT 10
+                    """).fetchdf()
+            # General Data Overview
+            insights['Dataset Overview'] = self.con.execute("""
+                SELECT
+                    COUNT(*) as total_records,
+                    COUNT(DISTINCT CREATIONDATE) as unique_dates,
+                    MIN(CREATIONDATE) as earliest_date,
+                    MAX(CREATIONDATE) as latest_date
                 FROM salt_data
             """).fetchdf()
             return insights
         except Exception as e:
+            return f"Error generating insights: {str(e)}\n\nAvailable columns: {', '.join(self.available_columns[:10])}..."
     def clean_sql_response(self, sql_query: str) -> str:
+        """Clean SQL response - avoiding string literal errors"""
         backticks = "`" + "`" + "`"
         sql_marker = backticks + "sql"
         if sql_query.startswith(sql_marker):
+            sql_query = sql_query[6:]
         elif sql_query.startswith(backticks):
+            sql_query = sql_query[3:]
         if sql_query.endswith(backticks):
+            sql_query = sql_query[:-3]
         return sql_query.strip()
         try:
             client = openai.OpenAI(api_key=api_key)
+            # Enhanced prompt with actual available columns
+            columns_list = ", ".join(self.available_columns[:30])  # Include first 30 columns
             prompt = f"""
+            You are a SQL expert analyzing SAP SALT dataset. The database has a table called 'salt_data' with these available columns:
+            {columns_list}
+            The SALT dataset contains SAP ERP sales order data where each row represents a sales document item.
+            IMPORTANT: Use only the column names I provided above. Do not assume column names like 'CUSTOMERID' exist.
             Convert this question to a DuckDB SQL query: "{question}"
+            Return ONLY the SQL query, no explanation. Limit results to 20 rows and use WHERE clauses to filter out NULL values.
             """
             response = client.chat.completions.create(
                 temperature=0.1
             )
+            sql_query = response.choices[0].message.content.strip()
             sql_query = self.clean_sql_response(sql_query)
             result_df = self.con.execute(sql_query).fetchdf()
             Question: {question}
             Results: {result_df.head(10).to_string()}
+            Provide a clear business explanation of these SAP ERP results in 2-3 sentences, focusing on actionable insights for sales operations.
             """
             explanation_response = client.chat.completions.create(
                 temperature=0.3
             )
+            explanation = explanation_response.choices[0].message.content
             code_block = "`" + "`" + "`"
             return f"**SQL Query:**\n{code_block}sql\n{sql_query}\n{code_block}\n\n**Results:**\n{result_df.to_string(index=False)}\n\n**Explanation:**\n{explanation}"
         except Exception as e:
+            return f"Error: {str(e)}\n\nTry rephrasing your question. Available columns: {', '.join(self.available_columns[:10])}..."
 # Initialize analytics
 analytics = SALTAnalytics()
     for title, df in insights.items():
         output += f"## {title}\n\n"
+        if len(df) > 0:
+            output += df.to_markdown(index=False)
+        else:
+            output += "*No data available for this analysis*"
         output += "\n\n---\n\n"
     return output
         return "Please enter a question"
     return analytics.natural_language_query(question, api_key)
+# Updated sample questions based on likely available columns
 sample_questions = [
+    "Which sales offices process the most orders?",
     "What are the most common payment terms?",
+    "Show me the distribution of shipping conditions",
+    "What is the date range of orders in the dataset?",
+    "Which plants are most frequently used?"
 ]
 with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo:
     # 🚀 SAP SALT Dataset Analytics Demo
     ## Open Source Analytics + AI for SAP ERP
+    This demo uses the **authentic SAP SALT dataset** - real ERP data from sales orders, items, customers, and addresses.
     """)
     with gr.Tab("📥 Load Dataset"):
         gr.Markdown("### Load SAP SALT Dataset from Hugging Face")
         load_btn = gr.Button("Load SALT Dataset", variant="primary")
+        load_output = gr.Textbox(label="Status", lines=8)
         load_btn.click(fn=load_dataset_interface, outputs=load_output)
                 question_input = gr.Textbox(
                     label="Your Question",
+                    placeholder="e.g., Which sales offices process the most orders?",
                     lines=2
                 )
     with gr.Tab("ℹ️ About"):
         gr.Markdown("""
+        ### About the SALT Dataset
+        **SAP SALT** (Sales Autocompletion Linked Business Tables) contains:
+        - **500,908 sales orders** from real SAP S/4HANA system
+        - **2.3M sales order line items**
+        - **139,611 unique customers**
+        - **Data from 2018-2020** with full business context
+        **Key Use Cases:**
+        - Sales process automation (70-80% accuracy)
+        - Customer behavior analysis
+        - Shipping and logistics optimization
+        - Payment terms prediction
+        **Technology Stack:**
+        - **DuckDB**: High-performance analytics
+        - **OpenAI GPT-4**: Natural language to SQL
+        - **Gradio**: Interactive interface
+        - **Real ERP Data**: Authentic business scenarios
+        This demonstrates how **open source tools** can unlock massive value from enterprise SAP systems at zero licensing cost.
         """)
 if __name__ == "__main__":