Update app.py
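Rework analyze_confidence_signature: extract top-token log probabilities up front, smooth them with a wider 30-token moving average (up from 20), flag drops below the 5th percentile of all changes, cluster drops within 10 tokens of one another, and report at most the three largest drops that occur near reasoning-transition markers such as "wait" or "however".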
app.py CHANGED
@@ -238,15 +238,58 @@ def visualize_logprobs(json_input, chunk=0, chunk_size=100):
 def analyze_confidence_signature(logprobs, tokens):
     if not logprobs or not tokens:
         return "No data for confidence signature analysis.", None
-
+
+    # Extract top probabilities
+    top_probs = [lps[0][1] if lps and lps[0][1] is not None else -float('inf') for lps in logprobs]
     if not any(p != -float('inf') for p in top_probs):
         return "No valid log probabilities for confidence analysis.", None
-
-
-
-
-
-
+
+    # Use a larger window for smoother trends
+    window_size = 30  # Increased from 20
+    moving_avg = np.convolve(top_probs, np.ones(window_size) / window_size, mode='valid')
+
+    # Calculate drop magnitudes
+    drops = np.diff(moving_avg)
+
+    # Use adaptive thresholding - only flag drops in the bottom 5% of all changes
+    drop_threshold = np.percentile(drops, 5)  # More selective
+    significant_drops = np.where(drops < drop_threshold)[0]
+
+    # Cluster nearby drops (within 10 tokens) to avoid reporting multiple points in the same reasoning shift
+    if len(significant_drops) > 0:
+        clustered_drops = [significant_drops[0]]
+        for drop in significant_drops[1:]:
+            if drop - clustered_drops[-1] > 10:  # At least 10 tokens apart
+                clustered_drops.append(drop)
+    else:
+        clustered_drops = []
+
+    # Look for context markers near drops
+    filtered_drops = []
+    reasoning_markers = ["therefore", "thus", "so", "hence", "wait", "but", "however", "actually"]
+
+    for drop in clustered_drops:
+        # Adjust index for convolution window
+        token_idx = drop + window_size - 1
+
+        # Check surrounding context (10 tokens before and after)
+        start_idx = max(0, token_idx - 10)
+        end_idx = min(len(tokens), token_idx + 10)
+        context = " ".join(tokens[start_idx:end_idx])
+
+        # Only keep drops near reasoning transition markers
+        if any(marker in context.lower() for marker in reasoning_markers):
+            drop_magnitude = drops[drop]
+            filtered_drops.append((token_idx, drop_magnitude, tokens[token_idx] if token_idx < len(tokens) else "End of trace"))
+
+    # Sort by drop magnitude (largest drops first)
+    filtered_drops.sort(key=lambda x: x[1])
+
+    if not filtered_drops:
+        return "No significant confidence shifts at reasoning transitions detected.", None
+
+    # Return at most 3 most significant drops
+    return "Significant confidence shifts detected at reasoning transitions:", filtered_drops[:3]
 
 def detect_interpretation_pivots(logprobs, tokens):
     if not logprobs or not tokens: