File size: 10,448 Bytes
71797a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
#!/usr/bin/env python3
"""
Analyze existing submissions to determine if sentence-level categorization is worth implementing.

This script:
1. Segments submissions into sentences
2. Categorizes each sentence using the current AI model
3. Compares sentence-level vs. submission-level categories
4. Shows statistics to inform the decision

Run: python analyze_submissions_for_sentences.py
"""

import sys
import os
import re
from collections import Counter, defaultdict
from app import create_app, db
from app.models.models import Submission
from app.analyzer import get_analyzer
import nltk

# Make sure the Punkt sentence-tokenizer data is available before it is used.
# Older NLTK releases load the pickled 'punkt' resource, while newer releases
# load 'punkt_tab' instead, so both are checked. A failed download is not
# fatal: nltk.download() reports failure via its return value, and
# segment_sentences() falls back to a regex split if tokenization fails.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        print(f"Downloading NLTK {_resource} tokenizer...")
        nltk.download(_resource, quiet=True)

def segment_sentences(text, min_words=3):
    """Split *text* into cleaned sentences, dropping very short fragments.

    NLTK's ``sent_tokenize`` is tried first; if it cannot be imported or
    fails for any reason, a regex split on sentence-ending punctuation is
    used instead.

    Args:
        text: Raw text to segment. ``None`` or an empty string yields ``[]``.
        min_words: Minimum number of whitespace-separated words a fragment
            must contain to be kept. Defaults to 3.

    Returns:
        A list of stripped sentence strings in their original order.
    """
    if not text:
        return []

    try:
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text)
    except Exception:
        # Deliberate best-effort fallback (NLTK missing, tokenizer data not
        # installed, ...). `Exception` rather than a bare `except` so that
        # KeyboardInterrupt / SystemExit still propagate.
        # Split after '.', '!' or '?' when followed by whitespace and a
        # capital letter, or at the very end of the text.
        pattern = r'(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])$'
        sentences = re.split(pattern, text)

    # Strip whitespace, drop empties, and filter out very short "sentences".
    cleaned = (s.strip() for s in sentences)
    return [s for s in cleaned if s and len(s.split()) >= min_words]

def analyze_submissions():
    """Report whether sentence-level categorization is worth implementing.

    Loads every submission that already has a category, splits each message
    into sentences, categorizes the sentences of multi-sentence submissions
    with the analyzer returned by ``get_analyzer()``, and compares the
    per-sentence categories with the stored submission-level category.

    Side effects:
        * Prints statistics, up to 10 example submissions, a category
          distribution comparison and a recommendation to stdout.
        * Writes the full breakdown of multi-category submissions to
          ``sentence_analysis_results.txt`` (relative to the current working
          directory), encoded as UTF-8.

    Returns:
        None. Returns early, after printing a notice, when no categorized
        submissions exist.
    """
    app = create_app()

    with app.app_context():
        # Get all analyzed submissions.
        # NOTE(review): presumably a SQLAlchemy column expression, where
        # `!= None` is translated to "IS NOT NULL"; Python's `is not None`
        # would not build a query filter, so the comparison is intentional.
        submissions = Submission.query.filter(Submission.category != None).all()  # noqa: E711

        if not submissions:
            print("❌ No analyzed submissions found. Please run AI analysis first.")
            return

        print(f"\n{'='*70}")
        print(f"πŸ“Š SENTENCE-LEVEL CATEGORIZATION ANALYSIS")
        print(f"{'='*70}\n")

        print(f"Analyzing {len(submissions)} submissions...\n")

        # Load analyzer
        analyzer = get_analyzer()

        # Statistics
        total_submissions = len(submissions)
        total_sentences = 0
        multi_sentence_count = 0
        multi_category_count = 0

        sentence_counts = []
        category_changes = []

        # Analyze each submission
        for submission in submissions:
            # Segment into sentences; a missing message counts as no text.
            sentences = segment_sentences(submission.message or '')
            sentence_count = len(sentences)

            total_sentences += sentence_count
            sentence_counts.append(sentence_count)

            # Single-sentence submissions cannot contain mixed categories.
            if sentence_count <= 1:
                continue

            multi_sentence_count += 1

            # Categorize each sentence; a failed sentence is recorded as None
            # so that sentences and categories stay aligned by position.
            sentence_categories = []
            for sentence in sentences:
                try:
                    category = analyzer.analyze(sentence)
                    sentence_categories.append(category)
                except Exception as e:
                    print(f"Error analyzing sentence: {e}")
                    sentence_categories.append(None)

            # Check if categories differ (failed/empty results are ignored)
            unique_categories = {c for c in sentence_categories if c}

            if len(unique_categories) > 1:
                multi_category_count += 1
                category_changes.append({
                    'id': submission.id,
                    'text': submission.message,
                    'submission_category': submission.category,
                    'sentence_categories': sentence_categories,
                    'sentences': sentences,
                    'contributor_type': submission.contributor_type
                })

        # Print Statistics
        print(f"{'─'*70}")
        print(f"πŸ“ˆ STATISTICS")
        print(f"{'─'*70}\n")

        print(f"Total Submissions:        {total_submissions}")
        print(f"Total Sentences:          {total_sentences}")
        print(f"Avg Sentences/Submission: {total_sentences/total_submissions:.1f}")
        print(f"Multi-sentence (>1):      {multi_sentence_count} ({multi_sentence_count/total_submissions*100:.1f}%)")
        print(f"Multi-category:           {multi_category_count} ({multi_category_count/total_submissions*100:.1f}%)")

        # Sentence distribution (bar scaled to 50 chars = 100% of submissions)
        print(f"\nπŸ“Š Sentence Count Distribution:")
        sentence_dist = Counter(sentence_counts)
        for count in sorted(sentence_dist.keys()):
            bar = 'β–ˆ' * int(sentence_dist[count] / total_submissions * 50)
            print(f"  {count} sentence(s): {sentence_dist[count]:3d} {bar}")

        # Category changes
        if category_changes:
            print(f"\n{'─'*70}")
            print(f"πŸ”„ SUBMISSIONS WITH MULTIPLE CATEGORIES ({len(category_changes)})")
            print(f"{'─'*70}\n")

            for idx, item in enumerate(category_changes[:10], 1):  # Show first 10
                print(f"\n{idx}. Submission #{item['id']} ({item['contributor_type']})")
                print(f"   Submission-level: {item['submission_category']}")
                print(f"   Text: \"{item['text'][:100]}{'...' if len(item['text']) > 100 else ''}\"")
                print(f"   Sentence breakdown:")

                for i, (sentence, category) in enumerate(zip(item['sentences'], item['sentence_categories']), 1):
                    marker = "⚠️" if category != item['submission_category'] else "βœ“"
                    # A category can be None (analysis failed for that
                    # sentence); None does not accept the '12s' format spec,
                    # so convert to text before padding.
                    label = str(category)
                    print(f"      {marker} S{i} [{label:12s}] \"{sentence[:60]}{'...' if len(sentence) > 60 else ''}\"")

            if len(category_changes) > 10:
                print(f"\n   ... and {len(category_changes) - 10} more")

        # Category distribution comparison
        print(f"\n{'─'*70}")
        print(f"πŸ“Š CATEGORY DISTRIBUTION COMPARISON")
        print(f"{'─'*70}\n")

        # Submission-level counts
        submission_cats = Counter(s.category for s in submissions if s.category)

        # Sentence-level counts (only sentences of multi-category submissions)
        sentence_cats = Counter()
        for item in category_changes:
            for cat in item['sentence_categories']:
                if cat:
                    sentence_cats[cat] += 1
        # Denominator for the sentence-level bars: the number of categorized
        # sentences, so each bar is a share of sentences and never exceeds
        # the 20-character scale.
        categorized_sentences = sum(sentence_cats.values())

        print(f"{'Category':<15} {'Submission-Level':<20} {'Sentence-Level (multi-cat only)':<30}")
        print(f"{'-'*15} {'-'*20} {'-'*30}")

        categories = ['Vision', 'Problem', 'Objectives', 'Directives', 'Values', 'Actions']
        for cat in categories:
            sub_count = submission_cats.get(cat, 0)
            sen_count = sentence_cats.get(cat, 0)
            sub_bar = 'β–ˆ' * int(sub_count / total_submissions * 20)
            sen_bar = 'β–ˆ' * int(sen_count / categorized_sentences * 20) if categorized_sentences > 0 else ''
            print(f"{cat:<15} {sub_count:3d} {sub_bar:<15} {sen_count:3d} {sen_bar:<15}")

        # Recommendation
        print(f"\n{'='*70}")
        print(f"πŸ’‘ RECOMMENDATION")
        print(f"{'='*70}\n")

        multi_cat_percentage = (multi_category_count / total_submissions * 100) if total_submissions > 0 else 0

        if multi_cat_percentage > 40:
            print(f"βœ… STRONGLY RECOMMEND sentence-level categorization")
            print(f"   {multi_cat_percentage:.1f}% of submissions contain multiple categories.")
            print(f"   Current system is losing significant semantic detail.")
            print(f"\n   πŸ“ˆ Expected benefits:")
            print(f"   β€’ {multi_category_count} submissions will have richer categorization")
            print(f"   β€’ Training data will be ~{total_sentences - total_submissions} examples richer")
            print(f"   β€’ Analytics will be more accurate")
        elif multi_cat_percentage > 20:
            print(f"⚠️ RECOMMEND sentence-level categorization (or proof of concept)")
            print(f"   {multi_cat_percentage:.1f}% of submissions contain multiple categories.")
            print(f"   Moderate benefit expected.")
            print(f"\n   πŸ’‘ Suggestion: Start with proof of concept (display only)")
            print(f"   Then decide if full implementation is worth it.")
        else:
            print(f"ℹ️ OPTIONAL - Multi-label might be sufficient")
            print(f"   Only {multi_cat_percentage:.1f}% of submissions contain multiple categories.")
            print(f"   Sentence-level might be overkill.")
            print(f"\n   πŸ’‘ Consider:")
            print(f"   β€’ Multi-label classification (simpler)")
            print(f"   β€’ Or keep current system if working well")

        # Implementation effort
        print(f"\nπŸ“‹ Implementation Effort:")
        print(f"   β€’ Full sentence-level: 13-20 hours")
        print(f"   β€’ Proof of concept:     4-6 hours")
        print(f"   β€’ Multi-label:          4-6 hours")

        print(f"\n{'='*70}\n")

        # Export detailed results. The encoding is explicit because the
        # submission text is user-written and may contain characters the
        # platform's default encoding cannot represent.
        export_path = "sentence_analysis_results.txt"
        with open(export_path, 'w', encoding='utf-8') as f:
            f.write("DETAILED SENTENCE-LEVEL ANALYSIS RESULTS\n")
            f.write("="*70 + "\n\n")
            f.write(f"Total Submissions: {total_submissions}\n")
            f.write(f"Multi-category Submissions: {multi_category_count} ({multi_cat_percentage:.1f}%)\n\n")

            f.write("\nDETAILED BREAKDOWN:\n\n")
            for idx, item in enumerate(category_changes, 1):
                f.write(f"\n{idx}. Submission #{item['id']}\n")
                f.write(f"   Contributor: {item['contributor_type']}\n")
                f.write(f"   Submission Category: {item['submission_category']}\n")
                f.write(f"   Full Text: {item['text']}\n")
                f.write(f"   Sentences:\n")
                for i, (sentence, category) in enumerate(zip(item['sentences'], item['sentence_categories']), 1):
                    f.write(f"      {i}. [{category}] {sentence}\n")
                f.write("\n")

        print(f"πŸ“„ Detailed results exported to: {export_path}")

if __name__ == '__main__':
    # Script entry point: run the analysis and turn any unexpected failure
    # into a printed message, a full traceback and a non-zero exit status.
    import traceback

    try:
        analyze_submissions()
    except Exception as exc:
        print(f"\n❌ Error: {exc}")
        traceback.print_exc()
        raise SystemExit(1)