Spaces:

PD03
/

Salt

Sleeping

App Files Community

PD03 committed on Sep 1

Commit

0c7416f

verified ·

1 Parent(s): 9231df6

Update app.py

Browse files

Files changed (1) hide show

app.py +72 -93

app.py CHANGED Viewed

@@ -21,7 +21,6 @@ class SALTAnalytics:
             return "Dataset already loaded!"
         try:
-            # Try loading with authentication
             hf_token = os.getenv('HF_TOKEN')
             if hf_token:
@@ -43,21 +42,17 @@ class SALTAnalytics:
             df = dataset.to_pandas()
-            # Sample data for demo if too large
             if len(df) > 100000:
                 df = df.sample(n=50000, random_state=42)
-            # Load into DuckDB
             self.con.execute("CREATE TABLE salt_data AS SELECT * FROM df")
-            # Get schema information and available columns
             schema_result = self.con.execute("DESCRIBE salt_data").fetchall()
             self.schema_info = "\n".join([f"{col[0]}: {col[1]}" for col in schema_result])
             self.available_columns = [col[0] for col in schema_result]
             self.data_loaded = True
-            # Return success message with column info
             return f"✅ Successfully loaded {len(df)} records into DuckDB\n\n📋 Available columns:\n" + "\n".join(f"• {col}" for col in self.available_columns[:20]) + ("\n... and more" if len(self.available_columns) > 20 else "")
         except Exception as e:
@@ -68,116 +63,101 @@ class SALTAnalytics:
                 return f"❌ Error loading dataset: {error_msg}"
     def get_predefined_insights(self):
-        """Generate predefined analytical insights using correct column names"""
         if not self.data_loaded:
             return "Please load the dataset first"
         try:
             insights = {}
-            # Find the right customer and sales office columns
-            customer_col = None
-            sales_office_col = None
-            # Look for customer-related columns
-            for col in self.available_columns:
-                if 'CUSTOMER' in col.upper() and ('ID' in col.upper() or 'NUM' in col.upper()):
-                    customer_col = col
-                    break
-                elif 'SHIP' in col.upper() and 'PARTY' in col.upper():
-                    customer_col = col  # ShipToParty is often used as customer identifier
-                    break
-            # Look for sales office column
-            for col in self.available_columns:
-                if 'SALES' in col.upper() and 'OFFICE' in col.upper():
-                    sales_office_col = col
-                    break
-            # Sales Office Performance (adjusted for available columns)
-            if sales_office_col:
-                if customer_col:
-                    insights['Sales Office Performance'] = self.con.execute(f"""
-                        SELECT {sales_office_col},
-                               COUNT(*) as total_orders,
-                               COUNT(DISTINCT {customer_col}) as unique_customers
-                        FROM salt_data
-                        WHERE {sales_office_col} IS NOT NULL
-                        GROUP BY {sales_office_col}
-                        ORDER BY total_orders DESC
-                        LIMIT 10
-                    """).fetchdf()
-                else:
-                    insights['Sales Office Performance'] = self.con.execute(f"""
-                        SELECT {sales_office_col},
-                               COUNT(*) as total_orders
-                        FROM salt_data
-                        WHERE {sales_office_col} IS NOT NULL
-                        GROUP BY {sales_office_col}
-                        ORDER BY total_orders DESC
-                        LIMIT 10
-                    """).fetchdf()
-            # Payment Terms Distribution (if available)
             if 'CUSTOMERPAYMENTTERMS' in self.available_columns:
                 insights['Payment Terms Distribution'] = self.con.execute("""
                     SELECT CUSTOMERPAYMENTTERMS,
                            COUNT(*) as frequency,
                            ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
                     FROM salt_data
-                    WHERE CUSTOMERPAYMENTTERMS IS NOT NULL
                     GROUP BY CUSTOMERPAYMENTTERMS
                     ORDER BY frequency DESC
                     LIMIT 10
                 """).fetchdf()
-            # Shipping Conditions Analysis (look for shipping-related columns)
-            shipping_col = None
-            plant_col = None
             for col in self.available_columns:
                 if 'SHIPPING' in col.upper() and 'CONDITION' in col.upper():
                     shipping_col = col
-                elif 'PLANT' in col.upper():
-                    plant_col = col
             if shipping_col:
-                if plant_col:
-                    insights['Shipping Conditions'] = self.con.execute(f"""
-                        SELECT {shipping_col},
-                               COUNT(*) as order_count,
-                               COUNT(DISTINCT {plant_col}) as plants_served
-                        FROM salt_data
-                        WHERE {shipping_col} IS NOT NULL
-                        GROUP BY {shipping_col}
-                        ORDER BY order_count DESC
-                        LIMIT 10
-                    """).fetchdf()
-                else:
-                    insights['Shipping Conditions'] = self.con.execute(f"""
-                        SELECT {shipping_col},
-                               COUNT(*) as order_count
-                        FROM salt_data
-                        WHERE {shipping_col} IS NOT NULL
-                        GROUP BY {shipping_col}
-                        ORDER BY order_count DESC
-                        LIMIT 10
-                    """).fetchdf()
-            # General Data Overview
-            insights['Dataset Overview'] = self.con.execute("""
-                SELECT
-                    COUNT(*) as total_records,
-                    COUNT(DISTINCT CREATIONDATE) as unique_dates,
-                    MIN(CREATIONDATE) as earliest_date,
-                    MAX(CREATIONDATE) as latest_date
-                FROM salt_data
-            """).fetchdf()
             return insights
         except Exception as e:
-            return f"Error generating insights: {str(e)}\n\nAvailable columns: {', '.join(self.available_columns[:10])}..."
     def clean_sql_response(self, sql_query: str) -> str:
         """Clean SQL response - avoiding string literal errors"""
@@ -205,8 +185,7 @@ class SALTAnalytics:
         try:
             client = openai.OpenAI(api_key=api_key)
-            # Enhanced prompt with actual available columns
-            columns_list = ", ".join(self.available_columns[:30])  # Include first 30 columns
             prompt = f"""
             You are a SQL expert analyzing SAP SALT dataset. The database has a table called 'salt_data' with these available columns:
@@ -215,7 +194,7 @@ class SALTAnalytics:
             The SALT dataset contains SAP ERP sales order data where each row represents a sales document item.
-            IMPORTANT: Use only the column names I provided above. Do not assume column names like 'CUSTOMERID' exist.
             Convert this question to a DuckDB SQL query: "{question}"
@@ -261,6 +240,7 @@ def load_dataset_interface():
     return analytics.load_salt_dataset()
 def show_insights_interface():
     insights = analytics.get_predefined_insights()
     if isinstance(insights, str):
@@ -270,7 +250,7 @@ def show_insights_interface():
     for title, df in insights.items():
         output += f"## {title}\n\n"
-        if len(df) > 0:
             output += df.to_markdown(index=False)
         else:
             output += "*No data available for this analysis*"
@@ -283,13 +263,12 @@ def qa_interface(question: str, api_key: str):
         return "Please enter a question"
     return analytics.natural_language_query(question, api_key)
-# Updated sample questions based on likely available columns
 sample_questions = [
     "Which sales offices process the most orders?",
     "What are the most common payment terms?",
     "Show me the distribution of shipping conditions",
     "What is the date range of orders in the dataset?",
-    "Which plants are most frequently used?"
 ]
 with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo:

             return "Dataset already loaded!"
         try:
             hf_token = os.getenv('HF_TOKEN')
             if hf_token:
             df = dataset.to_pandas()
             if len(df) > 100000:
                 df = df.sample(n=50000, random_state=42)
             self.con.execute("CREATE TABLE salt_data AS SELECT * FROM df")
             schema_result = self.con.execute("DESCRIBE salt_data").fetchall()
             self.schema_info = "\n".join([f"{col[0]}: {col[1]}" for col in schema_result])
             self.available_columns = [col[0] for col in schema_result]
             self.data_loaded = True
             return f"✅ Successfully loaded {len(df)} records into DuckDB\n\n📋 Available columns:\n" + "\n".join(f"• {col}" for col in self.available_columns[:20]) + ("\n... and more" if len(self.available_columns) > 20 else "")
         except Exception as e:
                 return f"❌ Error loading dataset: {error_msg}"
     def get_predefined_insights(self):
+        """Generate predefined analytical insights - COMPLETELY FIXED"""
         if not self.data_loaded:
             return "Please load the dataset first"
         try:
             insights = {}
+            # Basic Dataset Overview - This always works
+            insights['Dataset Overview'] = self.con.execute("""
+                SELECT
+                    COUNT(*) as total_records,
+                    COUNT(DISTINCT CREATIONDATE) as unique_dates,
+                    MIN(CREATIONDATE) as earliest_date,
+                    MAX(CREATIONDATE) as latest_date
+                FROM salt_data
+            """).fetchdf()
+            # Payment Terms Distribution - Direct column reference
             if 'CUSTOMERPAYMENTTERMS' in self.available_columns:
                 insights['Payment Terms Distribution'] = self.con.execute("""
                     SELECT CUSTOMERPAYMENTTERMS,
                            COUNT(*) as frequency,
                            ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
                     FROM salt_data
+                    WHERE CUSTOMERPAYMENTTERMS IS NOT NULL AND CUSTOMERPAYMENTTERMS != ''
                     GROUP BY CUSTOMERPAYMENTTERMS
                     ORDER BY frequency DESC
                     LIMIT 10
                 """).fetchdf()
+            # Sales Office Performance - Find and use the column
+            sales_office_col = None
+            for col in self.available_columns:
+                if 'SALES' in col.upper() and 'OFFICE' in col.upper():
+                    sales_office_col = col
+                    break
+            if sales_office_col:
+                query = f"""
+                    SELECT {sales_office_col},
+                           COUNT(*) as total_orders
+                    FROM salt_data
+                    WHERE {sales_office_col} IS NOT NULL AND {sales_office_col} != ''
+                    GROUP BY {sales_office_col}
+                    ORDER BY total_orders DESC
+                    LIMIT 10
+                """
+                insights['Sales Office Performance'] = self.con.execute(query).fetchdf()
+            # Shipping Conditions Analysis
+            shipping_col = None
             for col in self.available_columns:
                 if 'SHIPPING' in col.upper() and 'CONDITION' in col.upper():
                     shipping_col = col
+                    break
             if shipping_col:
+                query = f"""
+                    SELECT {shipping_col},
+                           COUNT(*) as order_count
+                    FROM salt_data
+                    WHERE {shipping_col} IS NOT NULL AND {shipping_col} != ''
+                    GROUP BY {shipping_col}
+                    ORDER BY order_count DESC
+                    LIMIT 10
+                """
+                insights['Shipping Conditions'] = self.con.execute(query).fetchdf()
+            # Sales Document Categories
+            if 'SALESDOCUMENTITEMCATEGORY' in self.available_columns:
+                insights['Sales Document Categories'] = self.con.execute("""
+                    SELECT SALESDOCUMENTITEMCATEGORY,
+                           COUNT(*) as frequency,
+                           ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage
+                    FROM salt_data
+                    WHERE SALESDOCUMENTITEMCATEGORY IS NOT NULL AND SALESDOCUMENTITEMCATEGORY != ''
+                    GROUP BY SALESDOCUMENTITEMCATEGORY
+                    ORDER BY frequency DESC
+                    LIMIT 10
+                """).fetchdf()
+            # Show available columns for debugging
+            insights['Available Columns Sample'] = pd.DataFrame({
+                'Column Name': self.available_columns[:20],
+                'Index': range(len(self.available_columns[:20]))
+            })
             return insights
         except Exception as e:
+            # Return detailed error information for debugging
+            return f"❌ Error generating insights: {str(e)}\n\n🔍 Debug Info:\n" + \
+                   f"Data loaded: {self.data_loaded}\n" + \
+                   f"Available columns ({len(self.available_columns)}): {', '.join(self.available_columns[:15])}...\n" + \
+                   f"Error type: {type(e).__name__}"
     def clean_sql_response(self, sql_query: str) -> str:
         """Clean SQL response - avoiding string literal errors"""
         try:
             client = openai.OpenAI(api_key=api_key)
+            columns_list = ", ".join(self.available_columns[:30])
             prompt = f"""
             You are a SQL expert analyzing SAP SALT dataset. The database has a table called 'salt_data' with these available columns:
             The SALT dataset contains SAP ERP sales order data where each row represents a sales document item.
+            IMPORTANT: Use only the column names I provided above. Do not assume column names that don't exist.
             Convert this question to a DuckDB SQL query: "{question}"
     return analytics.load_salt_dataset()
 def show_insights_interface():
+    """Fixed insights interface with better error handling"""
     insights = analytics.get_predefined_insights()
     if isinstance(insights, str):
     for title, df in insights.items():
         output += f"## {title}\n\n"
+        if isinstance(df, pd.DataFrame) and len(df) > 0:
             output += df.to_markdown(index=False)
         else:
             output += "*No data available for this analysis*"
         return "Please enter a question"
     return analytics.natural_language_query(question, api_key)
 sample_questions = [
     "Which sales offices process the most orders?",
     "What are the most common payment terms?",
     "Show me the distribution of shipping conditions",
     "What is the date range of orders in the dataset?",
+    "Which document categories are most frequent?"
 ]
 with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo: