Spaces:

monsimas
/

spam-detection-osp

Sleeping

App Files Files Community

monsimas committed on Feb 26

Commit

c4d21e2

verified ·

1 Parent(s): 9bd9e1e

Update app.py

Browse files

Files changed (1) hide show

app.py +175 -79

app.py CHANGED Viewed

@@ -6,7 +6,12 @@ from typing import Dict, List, Tuple
 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
 import numpy as np
 import re
 AVAILABLE_MODELS = [
     "llama-3.3-70b-instruct",
     "llama-3.1-70b-instruct",
@@ -17,6 +22,7 @@ AVAILABLE_MODELS = [
     "deepseek-r1-distill-llama-70b"
 ]
 CSV_PATH = "evaluation.csv"
 TEXT_COLUMN = "Contribution"
 LABEL_COLUMN = "Etat"
@@ -43,24 +49,25 @@ def create_client(api_key: str) -> OpenAI:
     )
 def parse_model_output(output: str) -> str:
-    """Parse and normalize model output to match expected labels."""
-    cleaned = output.strip()
-    if cleaned == "SPAM":
         return "Spam"
-    elif cleaned == "NOT_SPAM":
-        return "Pas spam"
-    cleaned_lower = cleaned.lower().replace('_', ' ')
-    if cleaned_lower in ['spam', 'yes', 'true', 'is spam']:
         return "Spam"
-    elif 'not spam' in cleaned_lower or cleaned_lower in ['no', 'false', 'clean', 'ham', 'legitimate']:
         return "Pas spam"
-    else:
-        # Log unexpected responses for debugging
-        print(f"Warning: Unexpected model output: {output}")
-        return "Pas spam"  # Default to not spam for unrecognized responses
 def process_single_text(
     text: str,
@@ -70,12 +77,14 @@ def process_single_text(
     max_tokens: int,
     top_p: float,
     api_key: str
-) -> Tuple[str, str]:
-    """Process a single text input through the model."""
     client = create_client(api_key)
     formatted_prompt = prompt_template.format(text=text)
     try:
         response = client.chat.completions.create(
             model=model,
@@ -91,9 +100,14 @@ def process_single_text(
         )
         raw_output = response.choices[0].message.content.strip()
         parsed_output = parse_model_output(raw_output)
-        return raw_output, parsed_output
     except Exception as e:
-        return f"Error: {str(e)}", "Pas spam"
 def evaluate_performance(
     df: pd.DataFrame,
@@ -116,23 +130,83 @@ def evaluate_performance(
     # Convert any numpy values to Python floats
     return {k: float(v) if isinstance(v, (np.floating, np.integer)) else v for k, v in metrics.items()}
 def process_benchmark(
     prompt_template: str,
     model: str,
     temperature: float,
     max_tokens: int,
     top_p: float,
-    api_key: str
-) -> Tuple[pd.DataFrame, Dict[str, float]]:
-    """Process benchmark dataset and return results with metrics."""
     # Read CSV file
     df = pd.read_csv(CSV_PATH)
     # Process each text
     raw_predictions = []
     parsed_predictions = []
-    for text in df[TEXT_COLUMN]:
-        raw_output, parsed_output = process_single_text(
             text,
             prompt_template,
             model,
@@ -143,70 +217,85 @@ def process_benchmark(
         )
         raw_predictions.append(raw_output)
         parsed_predictions.append(parsed_output)
     # Add predictions to DataFrame
     df['model_raw_output'] = raw_predictions
     df['model_prediction'] = parsed_predictions
     # Calculate metrics
     metrics = evaluate_performance(df, parsed_predictions)
-    return df, metrics
 def create_interface():
-    """Create Gradio interface."""
-    with gr.Blocks() as interface:
         gr.Markdown("# Moderation Model Testing Interface")
-        with gr.Row():
-            with gr.Column():
-                api_key = gr.Textbox(
-                    label="Scaleway API Key",
-                    placeholder="Enter your API key",
-                    type="password"
-                )
-                model = gr.Dropdown(
-                    choices=AVAILABLE_MODELS,
-                    label="Model",
-                    value=AVAILABLE_MODELS[0]
-                )
-                prompt = gr.Textbox(
-                    label="Prompt Template",
-                    value=DEFAULT_PROMPT,
-                    lines=5
-                )
-            with gr.Column():
-                temperature = gr.Slider(
-                    minimum=0,
-                    maximum=1,
-                    value=0.3,
-                    label="Temperature"
-                )
-                max_tokens = gr.Slider(
-                    minimum=1,
-                    maximum=2048,
-                    value=512,
-                    step=1,
-                    label="Max Tokens"
-                )
-                top_p = gr.Slider(
-                    minimum=0,
-                    maximum=1,
-                    value=1,
-                    label="Top P"
-                )
-        run_button = gr.Button("Run Benchmark")
-        with gr.Row():
-            with gr.Column():
-                results_df = gr.Dataframe(
-                    label="Results",
-                    headers=[TEXT_COLUMN, LABEL_COLUMN, "Raw Model Output", "Model Prediction"]
-                )
-            with gr.Column():
-                metrics_json = gr.JSON(label="Performance Metrics")
         def run_benchmark_fn(
             prompt,
@@ -214,17 +303,24 @@ def create_interface():
             temperature,
             max_tokens,
             top_p,
-            api_key
         ):
-            df, metrics = process_benchmark(
                 prompt,
                 model,
                 temperature,
                 max_tokens,
                 top_p,
-                api_key
             )
-            return df, metrics
         run_button.click(
             run_benchmark_fn,
@@ -236,7 +332,7 @@ def create_interface():
                 top_p,
                 api_key
             ],
-            outputs=[results_df, metrics_json]
         )
     return interface

 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
 import numpy as np
 import re
+import time
+import matplotlib.pyplot as plt
+import matplotlib
+matplotlib.use('Agg')
+# Constants
 AVAILABLE_MODELS = [
     "llama-3.3-70b-instruct",
     "llama-3.1-70b-instruct",
     "deepseek-r1-distill-llama-70b"
 ]
+# File and column names
 CSV_PATH = "evaluation.csv"
 TEXT_COLUMN = "Contribution"
 LABEL_COLUMN = "Etat"
     )
 def parse_model_output(output: str) -> str:
+    """Parse and normalize model output to match expected labels with improved pattern matching."""
+    # Store original output for transparency
+    cleaned = output.strip().lower()
+    # Enhanced pattern matching with regex
+    if re.search(r'\bspam\b', cleaned) and not re.search(r'\bnot\s+spam\b|\bpas\s+spam\b', cleaned):
         return "Spam"
+    elif re.search(r'\bnot[\s_-]*spam\b|\bpas[\s_-]*spam\b|\blegitimate\b|\bham\b|\bclean\b', cleaned):
+        return "Non spam"
+    # Additional backup checks for specific formats
+    if cleaned == "spam":
         return "Spam"
+    elif cleaned in ["not_spam", "not spam", "pas spam"]:
         return "Pas spam"
+    # Log unexpected responses and default to not spam
+    print(f"Warning: Unexpected model output: {output}")
+    return "Pas spam"  # Default to not spam for unrecognized responses
 def process_single_text(
     text: str,
     max_tokens: int,
     top_p: float,
     api_key: str
+) -> Tuple[str, str, float]:
+    """Process a single text input through the model and measure response time."""
     client = create_client(api_key)
+    # Format the prompt
     formatted_prompt = prompt_template.format(text=text)
+    start_time = time.time()
     try:
         response = client.chat.completions.create(
             model=model,
         )
         raw_output = response.choices[0].message.content.strip()
         parsed_output = parse_model_output(raw_output)
+        # Calculate response time
+        response_time = time.time() - start_time
+        return raw_output, parsed_output, response_time
     except Exception as e:
+        response_time = time.time() - start_time
+        return f"Error: {str(e)}", "Pas spam", response_time
 def evaluate_performance(
     df: pd.DataFrame,
     # Convert any numpy values to Python floats
     return {k: float(v) if isinstance(v, (np.floating, np.integer)) else v for k, v in metrics.items()}
def create_metrics_plot(metrics: Dict[str, float]) -> plt.Figure:
    """Draw the metrics as a labelled bar chart and return the figure.

    Every entry of ``metrics`` except ``'avg_response_time'`` becomes one bar,
    in the mapping's iteration order. The y-axis is fixed to the range 0-1 and
    each bar is annotated with its value to three decimals.

    Args:
        metrics: Mapping of metric name to numeric value.

    Returns:
        The matplotlib figure holding the chart.
    """
    figure, axes = plt.subplots(figsize=(10, 6))

    # The response-time entry is left out of this chart; only the remaining
    # entries are plotted against the fixed 0-1 axis set below.
    labels = []
    scores = []
    for name, value in metrics.items():
        if name == 'avg_response_time':
            continue
        labels.append(name)
        scores.append(value)

    rects = axes.bar(labels, scores, color='skyblue')

    # Print each bar's value just above its top edge.
    for rect in rects:
        top = rect.get_height()
        center_x = rect.get_x() + rect.get_width() / 2
        axes.annotate(
            f'{top:.3f}',
            xy=(center_x, top),
            xytext=(0, 3),  # 3 points vertical offset
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

    axes.set_ylim(0, 1.0)
    axes.set_title('Model Performance Metrics')
    axes.set_ylabel('Score')
    plt.tight_layout()
    return figure
def create_confusion_matrix_plot(df: pd.DataFrame) -> plt.Figure:
    """Render a 2x2 spam / not-spam confusion matrix as a heatmap.

    Both the ground-truth column (``LABEL_COLUMN``) and the
    ``'model_prediction'`` column are binarized: the exact string "Spam" maps
    to 1 and every other value maps to 0.

    Args:
        df: DataFrame containing ``LABEL_COLUMN`` and ``'model_prediction'``.

    Returns:
        The matplotlib figure holding the heatmap; rows are true labels and
        columns are predicted labels, each ordered (Not Spam, Spam).
    """
    from sklearn.metrics import confusion_matrix
    import seaborn as sns

    # Get true and predicted labels
    y_true = [1 if label == "Spam" else 0 for label in df[LABEL_COLUMN]]
    y_pred = [1 if pred == "Spam" else 0 for pred in df['model_prediction']]

    # Pin both classes explicitly. Without `labels`, scikit-learn only keeps
    # the classes that actually occur, so a run where everything is one class
    # would yield a 1x1 matrix that does not match the two tick labels below.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    # Plot
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=['Not Spam', 'Spam'],
                yticklabels=['Not Spam', 'Spam'])
    ax.set_title('Confusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.tight_layout()
    return fig
 def process_benchmark(
     prompt_template: str,
     model: str,
     temperature: float,
     max_tokens: int,
     top_p: float,
+    api_key: str,
+    progress=None
+) -> Tuple[pd.DataFrame, Dict[str, float], plt.Figure, plt.Figure]:
+    """Process benchmark dataset and return results with metrics and visualizations."""
     # Read CSV file
     df = pd.read_csv(CSV_PATH)
     # Process each text
     raw_predictions = []
     parsed_predictions = []
+    response_times = []
+    total = len(df)
+    for i, text in enumerate(df[TEXT_COLUMN]):
+        if progress is not None:
+            progress(i / total, f"Processing {i+1}/{total}")
+        raw_output, parsed_output, response_time = process_single_text(
             text,
             prompt_template,
             model,
         )
         raw_predictions.append(raw_output)
         parsed_predictions.append(parsed_output)
+        response_times.append(response_time)
     # Add predictions to DataFrame
     df['model_raw_output'] = raw_predictions
     df['model_prediction'] = parsed_predictions
+    df['response_time'] = response_times
     # Calculate metrics
     metrics = evaluate_performance(df, parsed_predictions)
+    # Add average response time metric
+    metrics['avg_response_time'] = sum(response_times) / len(response_times)
+    # Create visualizations
+    metrics_plot = create_metrics_plot(metrics)
+    confusion_matrix_plot = create_confusion_matrix_plot(df)
+    return df, metrics, metrics_plot, confusion_matrix_plot
 def create_interface():
+    """Create Gradio interface with enhanced UI and visualizations."""
+    with gr.Blocks(theme=gr.themes.Soft()) as interface:
         gr.Markdown("# Moderation Model Testing Interface")
+        with gr.Tabs():
+            with gr.TabItem("Model Configuration"):
+                with gr.Row():
+                    with gr.Column():
+                        api_key = gr.Textbox(
+                            label="Scaleway API Key",
+                            placeholder="Enter your API key",
+                            type="password"
+                        )
+                        model = gr.Dropdown(
+                            choices=AVAILABLE_MODELS,
+                            label="Model",
+                            value=AVAILABLE_MODELS[0]
+                        )
+                        prompt = gr.Textbox(
+                            label="Prompt Template",
+                            value=DEFAULT_PROMPT,
+                            lines=5
+                        )
+                    with gr.Column():
+                        temperature = gr.Slider(
+                            minimum=0,
+                            maximum=1,
+                            value=0.3,
+                            label="Temperature"
+                        )
+                        max_tokens = gr.Slider(
+                            minimum=1,
+                            maximum=2048,
+                            value=512,
+                            step=1,
+                            label="Max Tokens"
+                        )
+                        top_p = gr.Slider(
+                            minimum=0,
+                            maximum=1,
+                            value=1,
+                            label="Top P"
+                        )
+                        run_button = gr.Button("Run Benchmark", variant="primary")
+            with gr.TabItem("Results"):
+                with gr.Row():
+                    with gr.Column(scale=2):
+                        results_df = gr.Dataframe(
+                            label="Results Table",
+                            headers=[TEXT_COLUMN, LABEL_COLUMN, "Raw Model Output", "Model Prediction", "Response Time (s)"]
+                        )
+                    with gr.Column(scale=1):
+                        metrics_json = gr.JSON(label="Performance Metrics")
+                with gr.Row():
+                    metrics_plot = gr.Plot(label="Performance Metrics Visualization")
+                    confusion_matrix_vis = gr.Plot(label="Confusion Matrix")
         def run_benchmark_fn(
             prompt,
             temperature,
             max_tokens,
             top_p,
+            api_key,
+            progress=gr.Progress()
         ):
+            df, metrics, metrics_vis, confusion_vis = process_benchmark(
                 prompt,
                 model,
                 temperature,
                 max_tokens,
                 top_p,
+                api_key,
+                progress
             )
+            # Format dataframe for display
+            display_df = df[[TEXT_COLUMN, LABEL_COLUMN, 'model_raw_output', 'model_prediction', 'response_time']].copy()
+            # Format response time to 3 decimal places
+            display_df['response_time'] = display_df['response_time'].apply(lambda x: f"{x:.3f}")
+            return display_df, metrics, metrics_vis, confusion_vis
         run_button.click(
             run_benchmark_fn,
                 top_p,
                 api_key
             ],
+            outputs=[results_df, metrics_json, metrics_plot, confusion_matrix_vis]
         )
     return interface