# participatory-planner / analyze_submissions_for_sentences.py
# Author: thadillo
# Commit 71797a4 — Phases 1-3: Database schema, text processing, analyzer updates
#!/usr/bin/env python3
"""
Analyze existing submissions to determine if sentence-level categorization is worth implementing.
This script:
1. Segments submissions into sentences
2. Categorizes each sentence using current AI model
3. Compares sentence-level vs submission-level categories
4. Shows statistics to inform decision
Run: python analyze_submissions_for_sentences.py
"""
import sys
import os
import re
from collections import Counter, defaultdict
from app import create_app, db
from app.models.models import Submission
from app.analyzer import get_analyzer
import nltk
# Ensure the NLTK "punkt" sentence-tokenizer data is present; download it on
# first run so segment_sentences() can use nltk's sent_tokenize.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # data.find raises LookupError when the resource is not installed
    print("Downloading NLTK punkt tokenizer...")
    nltk.download('punkt', quiet=True)
def segment_sentences(text):
    """Split *text* into sentences, dropping fragments shorter than 3 words.

    Tries NLTK's ``sent_tokenize`` first; if NLTK is not installed
    (ImportError) or its punkt data is missing (LookupError), falls back to
    a regex that splits after ``.``/``!``/``?`` followed by whitespace and
    an uppercase letter.

    Args:
        text: Raw submission text (str).

    Returns:
        list[str]: Stripped sentences, each with at least 3
        whitespace-separated words. Empty list for empty/short input.
    """
    try:
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text)
    except (ImportError, LookupError):
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. Fallback: regex-based segmentation.
        pattern = r'(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])$'
        sentences = re.split(pattern, text)
    # Strip whitespace, drop empties, then filter out very short fragments
    # that are unlikely to be real sentences.
    sentences = [s.strip() for s in sentences if s.strip()]
    return [s for s in sentences if len(s.split()) >= 3]
def analyze_submissions():
    """Analyze submissions to see if sentence-level categorization is beneficial.

    Workflow:
      1. Load every already-categorized submission from the database.
      2. Segment each submission into sentences and categorize each sentence
         with the current analyzer.
      3. Print statistics (sentence counts, multi-category rate, category
         distribution) plus a go/no-go recommendation.
      4. Export a detailed per-submission breakdown to a UTF-8 text file.

    Returns:
        None. All results go to stdout and ``sentence_analysis_results.txt``.
    """
    app = create_app()
    with app.app_context():
        # Only submissions that were already analyzed (category is set).
        # .isnot(None) is the canonical SQLAlchemy NULL test.
        submissions = Submission.query.filter(Submission.category.isnot(None)).all()
        if not submissions:
            print("❌ No analyzed submissions found. Please run AI analysis first.")
            return

        print(f"\n{'='*70}")
        print(f"πŸ“Š SENTENCE-LEVEL CATEGORIZATION ANALYSIS")
        print(f"{'='*70}\n")
        print(f"Analyzing {len(submissions)} submissions...\n")

        analyzer = get_analyzer()

        # Aggregate statistics
        total_submissions = len(submissions)
        total_sentences = 0
        multi_sentence_count = 0   # submissions with more than one sentence
        multi_category_count = 0   # submissions whose sentences disagree on category
        sentence_counts = []       # per-submission sentence count (for histogram)
        category_changes = []      # detail records for multi-category submissions

        for submission in submissions:
            sentences = segment_sentences(submission.message)
            sentence_count = len(sentences)
            total_sentences += sentence_count
            sentence_counts.append(sentence_count)
            if sentence_count > 1:
                multi_sentence_count += 1

            # Categorize each sentence independently; None marks a failure so
            # positions still line up with `sentences`.
            sentence_categories = []
            for sentence in sentences:
                try:
                    category = analyzer.analyze(sentence)
                    sentence_categories.append(category)
                except Exception as e:
                    print(f"Error analyzing sentence: {e}")
                    sentence_categories.append(None)

            # A submission is "multi-category" when its sentences span more
            # than one distinct (non-None) category.
            unique_categories = {c for c in sentence_categories if c}
            if len(unique_categories) > 1:
                multi_category_count += 1
                category_changes.append({
                    'id': submission.id,
                    'text': submission.message,
                    'submission_category': submission.category,
                    'sentence_categories': sentence_categories,
                    'sentences': sentences,
                    'contributor_type': submission.contributor_type
                })

        # ---- Statistics -------------------------------------------------
        print(f"{'─'*70}")
        print(f"πŸ“ˆ STATISTICS")
        print(f"{'─'*70}\n")
        print(f"Total Submissions: {total_submissions}")
        print(f"Total Sentences: {total_sentences}")
        print(f"Avg Sentences/Submission: {total_sentences/total_submissions:.1f}")
        print(f"Multi-sentence (>1): {multi_sentence_count} ({multi_sentence_count/total_submissions*100:.1f}%)")
        print(f"Multi-category: {multi_category_count} ({multi_category_count/total_submissions*100:.1f}%)")

        # Histogram of sentences-per-submission (bar scaled to 50 chars max).
        print(f"\nπŸ“Š Sentence Count Distribution:")
        sentence_dist = Counter(sentence_counts)
        for count in sorted(sentence_dist.keys()):
            bar = 'β–ˆ' * int(sentence_dist[count] / total_submissions * 50)
            print(f" {count} sentence(s): {sentence_dist[count]:3d} {bar}")

        # ---- Multi-category examples ------------------------------------
        if category_changes:
            print(f"\n{'─'*70}")
            print(f"πŸ”„ SUBMISSIONS WITH MULTIPLE CATEGORIES ({len(category_changes)})")
            print(f"{'─'*70}\n")
            for idx, item in enumerate(category_changes[:10], 1):  # show first 10 only
                print(f"\n{idx}. Submission #{item['id']} ({item['contributor_type']})")
                print(f" Submission-level: {item['submission_category']}")
                print(f" Text: \"{item['text'][:100]}{'...' if len(item['text']) > 100 else ''}\"")
                print(f" Sentence breakdown:")
                for i, (sentence, category) in enumerate(zip(item['sentences'], item['sentence_categories']), 1):
                    marker = "⚠️" if category != item['submission_category'] else "βœ“"
                    # str() guards against category being None (failed
                    # analysis): the ':12s' spec raises TypeError on None.
                    print(f" {marker} S{i} [{str(category):12s}] \"{sentence[:60]}{'...' if len(sentence) > 60 else ''}\"")
            if len(category_changes) > 10:
                print(f"\n ... and {len(category_changes) - 10} more")

        # ---- Category distribution comparison ---------------------------
        print(f"\n{'─'*70}")
        print(f"πŸ“Š CATEGORY DISTRIBUTION COMPARISON")
        print(f"{'─'*70}\n")
        # Submission-level counts across all submissions.
        submission_cats = Counter(s.category for s in submissions if s.category)
        # Sentence-level counts, restricted to multi-category submissions.
        sentence_cats = Counter()
        for item in category_changes:
            for cat in item['sentence_categories']:
                if cat:
                    sentence_cats[cat] += 1
        print(f"{'Category':<15} {'Submission-Level':<20} {'Sentence-Level (multi-cat only)':<30}")
        print(f"{'-'*15} {'-'*20} {'-'*30}")
        categories = ['Vision', 'Problem', 'Objectives', 'Directives', 'Values', 'Actions']
        for cat in categories:
            sub_count = submission_cats.get(cat, 0)
            sen_count = sentence_cats.get(cat, 0)
            sub_bar = 'β–ˆ' * int(sub_count / total_submissions * 20)
            sen_bar = 'β–ˆ' * int(sen_count / multi_category_count * 20) if multi_category_count > 0 else ''
            print(f"{cat:<15} {sub_count:3d} {sub_bar:<15} {sen_count:3d} {sen_bar:<15}")

        # ---- Recommendation ---------------------------------------------
        print(f"\n{'='*70}")
        print(f"πŸ’‘ RECOMMENDATION")
        print(f"{'='*70}\n")
        multi_cat_percentage = (multi_category_count / total_submissions * 100) if total_submissions > 0 else 0
        # Thresholds: >40% strong signal, >20% moderate, else optional.
        if multi_cat_percentage > 40:
            print(f"βœ… STRONGLY RECOMMEND sentence-level categorization")
            print(f" {multi_cat_percentage:.1f}% of submissions contain multiple categories.")
            print(f" Current system is losing significant semantic detail.")
            print(f"\n πŸ“ˆ Expected benefits:")
            print(f" β€’ {multi_category_count} submissions will have richer categorization")
            print(f" β€’ Training data will be ~{total_sentences - total_submissions} examples richer")
            print(f" β€’ Analytics will be more accurate")
        elif multi_cat_percentage > 20:
            print(f"⚠️ RECOMMEND sentence-level categorization (or proof of concept)")
            print(f" {multi_cat_percentage:.1f}% of submissions contain multiple categories.")
            print(f" Moderate benefit expected.")
            print(f"\n πŸ’‘ Suggestion: Start with proof of concept (display only)")
            print(f" Then decide if full implementation is worth it.")
        else:
            print(f"ℹ️ OPTIONAL - Multi-label might be sufficient")
            print(f" Only {multi_cat_percentage:.1f}% of submissions contain multiple categories.")
            print(f" Sentence-level might be overkill.")
            print(f"\n πŸ’‘ Consider:")
            print(f" β€’ Multi-label classification (simpler)")
            print(f" β€’ Or keep current system if working well")

        print(f"\nπŸ“‹ Implementation Effort:")
        print(f" β€’ Full sentence-level: 13-20 hours")
        print(f" β€’ Proof of concept: 4-6 hours")
        print(f" β€’ Multi-label: 4-6 hours")
        print(f"\n{'='*70}\n")

        # ---- Export detailed results ------------------------------------
        export_path = "sentence_analysis_results.txt"
        # Explicit UTF-8: the export contains user text and box-drawing
        # characters; the platform default (e.g. cp1252) could raise.
        with open(export_path, 'w', encoding='utf-8') as f:
            f.write("DETAILED SENTENCE-LEVEL ANALYSIS RESULTS\n")
            f.write("="*70 + "\n\n")
            f.write(f"Total Submissions: {total_submissions}\n")
            f.write(f"Multi-category Submissions: {multi_category_count} ({multi_cat_percentage:.1f}%)\n\n")
            f.write("\nDETAILED BREAKDOWN:\n\n")
            for idx, item in enumerate(category_changes, 1):
                f.write(f"\n{idx}. Submission #{item['id']}\n")
                f.write(f" Contributor: {item['contributor_type']}\n")
                f.write(f" Submission Category: {item['submission_category']}\n")
                f.write(f" Full Text: {item['text']}\n")
                f.write(f" Sentences:\n")
                for i, (sentence, category) in enumerate(zip(item['sentences'], item['sentence_categories']), 1):
                    f.write(f" {i}. [{category}] {sentence}\n")
                f.write("\n")
        print(f"πŸ“„ Detailed results exported to: {export_path}")
if __name__ == '__main__':
    # Run the analysis; report any failure with a traceback and exit non-zero
    # so shell callers can detect the error.
    try:
        analyze_submissions()
    except Exception as err:
        import traceback
        print(f"\n❌ Error: {err}")
        traceback.print_exc()
        sys.exit(1)