Spaces:

codelion
/

LogProbsVisualizer

Sleeping

App Files Files Community

codelion committed on Feb 26

Commit

181b7be

verified ·

1 Parent(s): f2687d8

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -113

app.py CHANGED Viewed

@@ -4,91 +4,51 @@ import matplotlib.pyplot as plt
 import pandas as pd
 import io
 import base64
-import ast
 import math
-# Function to safely convert string representations of infinity
-def parse_infinity(value):
-    if isinstance(value, str):
-        if value.lower() == '-infinity' or value.lower() == '-inf':
-            return float('-inf')
-        elif value.lower() == 'infinity' or value.lower() == 'inf':
-            return float('inf')
-    return value
 # Function to process and visualize log probs
 def visualize_logprobs(json_input):
     try:
-        # Try to parse as JSON first, handling string representations of infinity
-        try:
-            # Attempt to load JSON, replacing -inf with "-Infinity" if needed
-            def replace_inf(s):
-                import re
-                return re.sub(r'-inf', '"-Infinity"', re.sub(r'inf', '"Infinity"', s))
-            data = json.loads(replace_inf(json_input))
-            # Convert string "Infinity" or "-Infinity" back to float if needed
-            if isinstance(data, dict) and 'content' in data:
-                for entry in data['content']:
-                    if 'logprob' in entry:
-                        entry['logprob'] = parse_infinity(entry['logprob'])
-                    if 'top_logprobs' in entry:
-                        entry['top_logprobs'] = {k: parse_infinity(v) for k, v in entry['top_logprobs'].items()}
-            elif isinstance(data, list):
-                for entry in data:
-                    if 'logprob' in entry:
-                        entry['logprob'] = parse_infinity(entry['logprob'])
-                    if 'top_logprobs' in entry:
-                        entry['top_logprobs'] = {k: parse_infinity(v) for k, v in entry['top_logprobs'].items()}
-        except json.JSONDecodeError:
-            # If JSON fails, try to parse as Python literal (e.g., with single quotes)
-            try:
-                data = ast.literal_eval(json_input)
-                # Ensure -inf is handled as float('-inf')
-                if isinstance(data, dict) and 'content' in data:
-                    for entry in data['content']:
-                        if 'logprob' in entry and isinstance(entry['logprob'], str):
-                            entry['logprob'] = parse_infinity(entry['logprob'])
-                        if 'top_logprobs' in entry:
-                            entry['top_logprobs'] = {k: parse_infinity(v) for k, v in entry['top_logprobs'].items()}
-                elif isinstance(data, list):
-                    for entry in data:
-                        if 'logprob' in entry and isinstance(entry['logprob'], str):
-                            entry['logprob'] = parse_infinity(entry['logprob'])
-                        if 'top_logprobs' in entry:
-                            entry['top_logprobs'] = {k: parse_infinity(v) for k, v in entry['top_logprobs'].items()}
-            except (SyntaxError, ValueError) as e:
-                raise ValueError(f"Malformed input: {str(e)}")
-        # Ensure data is a list or dictionary with 'content'
-        if isinstance(data, dict) and 'content' in data:
-            content = data['content']
         elif isinstance(data, list):
             content = data
         else:
             raise ValueError("Input must be a list or dictionary with 'content' key")
-        # Extract tokens and log probs, skipping None values and handling non-finite values
         tokens = []
         logprobs = []
         for entry in content:
-            if 'logprob' in entry and entry['logprob'] is not None and math.isfinite(entry['logprob']):
-                tokens.append(entry['token'])
-                logprobs.append(entry['logprob'])
-        # Prepare data for the table
         table_data = []
         for entry in content:
-            if 'logprob' in entry and entry['logprob'] is not None and math.isfinite(entry['logprob']):
-                token = entry['token']
-                logprob = entry['logprob']
-                top_logprobs = entry.get('top_logprobs', {})
-                # Filter out non-finite (e.g., -inf, inf, nan) log probs from top_logprobs
-                finite_top_logprobs = {k: v for k, v in top_logprobs.items() if math.isfinite(v)}
-                # Extract top 3 finite alternatives, sorted by log prob (most probable first)
-                top_3 = sorted(finite_top_logprobs.items(), key=lambda x: x[1], reverse=True)[:3]
                 row = [token, f"{logprob:.4f}"]
                 for alt_token, alt_logprob in top_3:
                     row.append(f"{alt_token}: {alt_logprob:.4f}")
@@ -96,88 +56,98 @@ def visualize_logprobs(json_input):
                 while len(row) < 5:
                     row.append("")
                 table_data.append(row)
-        # Create the plot (only for finite log probs)
         if logprobs:
             plt.figure(figsize=(10, 5))
-            plt.plot(range(len(logprobs)), logprobs, marker='o', linestyle='-', color='b')
             plt.title("Log Probabilities of Generated Tokens")
             plt.xlabel("Token Position")
             plt.ylabel("Log Probability")
             plt.grid(True)
-            plt.xticks(range(len(logprobs)), tokens, rotation=45, ha='right')
             plt.tight_layout()
             # Save plot to a bytes buffer
             buf = io.BytesIO()
-            plt.savefig(buf, format='png', bbox_inches='tight')
             buf.seek(0)
             plt.close()
-            # Convert buffer to base64 for Gradio
             img_bytes = buf.getvalue()
-            img_base64 = base64.b64encode(img_bytes).decode('utf-8')
             img_html = f'<img src="data:image/png;base64,{img_base64}" style="max-width: 100%; height: auto;">'
         else:
             img_html = "No finite log probabilities to plot."
-        # Create a DataFrame for the table
-        df = pd.DataFrame(
-            table_data,
-            columns=["Token", "Log Prob", "Top 1 Alternative", "Top 2 Alternative", "Top 3 Alternative"]
-        ) if table_data else None
-        # Generate colored text based on log probabilities
         if logprobs:
-            # Normalize log probs to [0, 1] for color scaling (0 = most uncertain, 1 = most confident)
             min_logprob = min(logprobs)
             max_logprob = max(logprobs)
             if max_logprob == min_logprob:
-                normalized_probs = [0.5] * len(logprobs)  # Avoid division by zero
             else:
-                normalized_probs = [(lp - min_logprob) / (max_logprob - min_logprob) for lp in logprobs]
-            # Create HTML for colored text
             colored_text = ""
             for i, (token, norm_prob) in enumerate(zip(tokens, normalized_probs)):
-                # Map normalized probability to RGB color (green for high confidence, red for low)
-                r = int(255 * (1 - norm_prob))  # Red increases as uncertainty increases
-                g = int(255 * norm_prob)        # Green decreases as uncertainty increases
-                b = 0                           # Blue stays 0 for simplicity
-                color = f'rgb({r}, {g}, {b})'
                 colored_text += f'<span style="color: {color}; font-weight: bold;">{token}</span>'
                 if i < len(tokens) - 1:
-                    colored_text += " "  # Add space between tokens
-            colored_text_html = f'<p>{colored_text}</p>'
         else:
             colored_text_html = "No finite log probabilities to display."
         return img_html, df, colored_text_html
     except Exception as e:
         return f"Error: {str(e)}", None, None
 # Gradio interface
 with gr.Blocks(title="Log Probability Visualizer") as app:
     gr.Markdown("# Log Probability Visualizer")
-    gr.Markdown("Paste your JSON or Python dictionary log prob data below to visualize the tokens and their probabilities.")
-    # Input
-    json_input = gr.Textbox(label="JSON Input", lines=10, placeholder="Paste your JSON or Python dict here...")
-    # Outputs
     plot_output = gr.HTML(label="Log Probability Plot")
     table_output = gr.Dataframe(label="Token Log Probabilities and Top Alternatives")
     text_output = gr.HTML(label="Colored Text (Confidence Visualization)")
-    # Button to trigger visualization
     btn = gr.Button("Visualize")
     btn.click(
         fn=visualize_logprobs,
         inputs=json_input,
-        outputs=[plot_output, table_output, text_output]
     )
-# Launch the app
 app.launch()

 import pandas as pd
 import io
 import base64
 import math
 # Function to process and visualize log probs
 def visualize_logprobs(json_input):
     try:
+        # Parse the JSON input
+        data = json.loads(json_input)
+        if isinstance(data, dict) and "content" in data:
+            content = data["content"]
         elif isinstance(data, list):
             content = data
         else:
             raise ValueError("Input must be a list or dictionary with 'content' key")
+        # Extract tokens and log probs, skipping None or non-finite values
         tokens = []
         logprobs = []
         for entry in content:
+            if (
+                "logprob" in entry
+                and entry["logprob"] is not None
+                and math.isfinite(entry["logprob"])
+            ):
+                tokens.append(entry["token"])
+                logprobs.append(entry["logprob"])
+        # Prepare table data, handling None in top_logprobs
         table_data = []
         for entry in content:
+            # Only include entries with finite logprob and non-None top_logprobs
+            if (
+                "logprob" in entry
+                and entry["logprob"] is not None
+                and math.isfinite(entry["logprob"])
+                and "top_logprobs" in entry
+                and entry["top_logprobs"] is not None
+            ):
+                token = entry["token"]
+                logprob = entry["logprob"]
+                top_logprobs = entry["top_logprobs"]
+                # Extract top 3 alternatives from top_logprobs
+                top_3 = sorted(
+                    top_logprobs.items(), key=lambda x: x[1], reverse=True
+                )[:3]
                 row = [token, f"{logprob:.4f}"]
                 for alt_token, alt_logprob in top_3:
                     row.append(f"{alt_token}: {alt_logprob:.4f}")
                 while len(row) < 5:
                     row.append("")
                 table_data.append(row)
+        # Create the plot
         if logprobs:
             plt.figure(figsize=(10, 5))
+            plt.plot(range(len(logprobs)), logprobs, marker="o", linestyle="-", color="b")
             plt.title("Log Probabilities of Generated Tokens")
             plt.xlabel("Token Position")
             plt.ylabel("Log Probability")
             plt.grid(True)
+            plt.xticks(range(len(logprobs)), tokens, rotation=45, ha="right")
             plt.tight_layout()
             # Save plot to a bytes buffer
             buf = io.BytesIO()
+            plt.savefig(buf, format="png", bbox_inches="tight")
             buf.seek(0)
             plt.close()
+            # Convert to base64 for Gradio
             img_bytes = buf.getvalue()
+            img_base64 = base64.b64encode(img_bytes).decode("utf-8")
             img_html = f'<img src="data:image/png;base64,{img_base64}" style="max-width: 100%; height: auto;">'
         else:
             img_html = "No finite log probabilities to plot."
+        # Create DataFrame for the table
+        df = (
+            pd.DataFrame(
+                table_data,
+                columns=[
+                    "Token",
+                    "Log Prob",
+                    "Top 1 Alternative",
+                    "Top 2 Alternative",
+                    "Top 3 Alternative",
+                ],
+            )
+            if table_data
+            else None
+        )
+        # Generate colored text
         if logprobs:
             min_logprob = min(logprobs)
             max_logprob = max(logprobs)
             if max_logprob == min_logprob:
+                normalized_probs = [0.5] * len(logprobs)
             else:
+                normalized_probs = [
+                    (lp - min_logprob) / (max_logprob - min_logprob) for lp in logprobs
+                ]
             colored_text = ""
             for i, (token, norm_prob) in enumerate(zip(tokens, normalized_probs)):
+                r = int(255 * (1 - norm_prob))  # Red for low confidence
+                g = int(255 * norm_prob)        # Green for high confidence
+                b = 0
+                color = f"rgb({r}, {g}, {b})"
                 colored_text += f'<span style="color: {color}; font-weight: bold;">{token}</span>'
                 if i < len(tokens) - 1:
+                    colored_text += " "
+            colored_text_html = f"<p>{colored_text}</p>"
         else:
             colored_text_html = "No finite log probabilities to display."
         return img_html, df, colored_text_html
     except Exception as e:
         return f"Error: {str(e)}", None, None
 # Gradio interface
 with gr.Blocks(title="Log Probability Visualizer") as app:
     gr.Markdown("# Log Probability Visualizer")
+    gr.Markdown(
+        "Paste your JSON or Python dictionary log prob data below to visualize the tokens and their probabilities."
+    )
+    json_input = gr.Textbox(
+        label="JSON Input",
+        lines=10,
+        placeholder="Paste your JSON or Python dict here...",
+    )
     plot_output = gr.HTML(label="Log Probability Plot")
     table_output = gr.Dataframe(label="Token Log Probabilities and Top Alternatives")
     text_output = gr.HTML(label="Colored Text (Confidence Visualization)")
     btn = gr.Button("Visualize")
     btn.click(
         fn=visualize_logprobs,
         inputs=json_input,
+        outputs=[plot_output, table_output, text_output],
     )
 app.launch()