mahmoudsaber0 committed · Commit 290af78 · verified · 1 Parent(s): 6101878

Update app.py

Files changed (1):
  1. app.py +841 -779

app.py CHANGED
@@ -1,933 +1,995 @@
--- app.py (before this commit)
- import os
- import re
- import torch
  import logging
  import gc
  import sys
- import numpy as np
  from fastapi import FastAPI, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
  from pydantic import BaseModel
  from typing import Dict, List, Optional
- from transformers import (
-     AutoTokenizer,
-     AutoModelForSequenceClassification,
-     AutoModelForCausalLM,
-     pipeline
- )
  from tokenizers.normalizers import Sequence, Replace, Strip
  from tokenizers import Regex
- import math
- from collections import Counter

  # =====================================================
  # 🔧 Environment configuration and settings
  # =====================================================
- logging.basicConfig(
-     level=logging.INFO,
-     format='%(asctime)s - %(levelname)s - %(message)s'
- )
- logger = logging.getLogger(__name__)

- # Memory and cache settings
- CACHE_DIR = "/tmp/huggingface_cache"
- os.makedirs(CACHE_DIR, exist_ok=True)

- # Hugging Face environment variables
- os.environ.update({
-     "HF_HOME": CACHE_DIR,
-     "TRANSFORMERS_CACHE": CACHE_DIR,
-     "HF_DATASETS_CACHE": CACHE_DIR,
-     "HUGGINGFACE_HUB_CACHE": CACHE_DIR,
-     "TORCH_HOME": CACHE_DIR,
-     "TOKENIZERS_PARALLELISM": "false",
-     "TRANSFORMERS_OFFLINE": "0",
- })

- # PyTorch memory settings
- if torch.cuda.is_available():
-     os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
-     torch.backends.cudnn.benchmark = True

- # =====================================================
- # 🚀 Device selection (GPU or CPU)
- # =====================================================
- device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
- logger.info(f"🖥️ Using device: {device}")
- if torch.cuda.is_available():
-     logger.info(f"🎮 CUDA Device: {torch.cuda.get_device_name(0)}")
-     logger.info(f"💾 CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

- # =====================================================
- # 📊 Model label map
- # =====================================================
- label_mapping = {
-     0: '13B', 1: '30B', 2: '65B', 3: '7B', 4: 'GLM130B', 5: 'bloom_7b',
-     6: 'bloomz', 7: 'cohere', 8: 'davinci', 9: 'dolly', 10: 'dolly-v2-12b',
-     11: 'flan_t5_base', 12: 'flan_t5_large', 13: 'flan_t5_small',
-     14: 'flan_t5_xl', 15: 'flan_t5_xxl', 16: 'gemma-7b-it', 17: 'gemma2-9b-it',
-     18: 'gpt-3.5-turbo', 19: 'gpt-35', 20: 'gpt4', 21: 'gpt4o',
-     22: 'gpt_j', 23: 'gpt_neox', 24: 'human', 25: 'llama3-70b', 26: 'llama3-8b',
-     27: 'mixtral-8x7b', 28: 'opt_1.3b', 29: 'opt_125m', 30: 'opt_13b',
-     31: 'opt_2.7b', 32: 'opt_30b', 33: 'opt_350m', 34: 'opt_6.7b',
-     35: 'opt_iml_30b', 36: 'opt_iml_max_1.3b', 37: 't0_11b', 38: 't0_3b',
-     39: 'text-davinci-002', 40: 'text-davinci-003'
- }

- # =====================================================
- # 📈 Perplexity and Burstiness calculations
- # =====================================================
- class TextMetrics:
-     """Statistical metrics for a text"""
-
-     @staticmethod
-     def calculate_perplexity(text: str, model=None, tokenizer=None):
-         """
-         Perplexity measures how "surprised" a model is by the text.
-         AI text usually has lower perplexity (it is more predictable).
-         """
-         try:
-             if model is None or tokenizer is None:
-                 # Approximation based on word frequencies
-                 words = text.lower().split()
-                 word_freq = Counter(words)
-                 total_words = len(words)
-
-                 # Compute entropy
-                 entropy = 0
-                 for count in word_freq.values():
-                     probability = count / total_words
-                     if probability > 0:
-                         entropy -= probability * math.log2(probability)
-
-                 # Approximate perplexity
-                 perplexity = 2 ** entropy
-                 return min(perplexity, 1000)  # Cap at 1000
-             else:
-                 # Real computation using a language model
-                 inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
-                 with torch.no_grad():
-                     outputs = model(**inputs, labels=inputs["input_ids"])
-                     loss = outputs.loss
-                 perplexity = torch.exp(loss).item()
-                 return min(perplexity, 1000)
-         except Exception as e:
-             logger.warning(f"Error calculating perplexity: {e}")
-             return 50.0  # Default value
-
-     @staticmethod
-     def calculate_burstiness(text: str):
-         """
-         Burstiness measures variation in sentence length.
-         Humans have higher burstiness (sentences of varied length);
-         AI is usually more uniform.
-         """
-         try:
-             # Split the text into sentences
-             sentences = re.split(r'[.!?]+', text)
-             sentences = [s.strip() for s in sentences if s.strip()]
-
-             if len(sentences) < 2:
-                 return 0.0
-
-             # Length of each sentence
-             sentence_lengths = [len(s.split()) for s in sentences]
-
-             # Mean and standard deviation
-             mean_length = np.mean(sentence_lengths)
-             std_length = np.std(sentence_lengths)
-
-             # Burstiness = standard deviation / mean
-             if mean_length > 0:
-                 burstiness = std_length / mean_length
-             else:
-                 burstiness = 0.0
-
-             return round(burstiness, 4)
-         except Exception as e:
-             logger.warning(f"Error calculating burstiness: {e}")
-             return 0.5
-
-     @staticmethod
-     def calculate_vocabulary_diversity(text: str):
-         """
-         Vocabulary diversity: humans tend to use a more varied vocabulary.
-         """
-         words = text.lower().split()
-         unique_words = set(words)
-         if len(words) > 0:
-             diversity = len(unique_words) / len(words)
-         else:
-             diversity = 0
-         return round(diversity, 4)
-
-     @staticmethod
-     def detect_ai_patterns(text: str):
-         """
-         Detect phrases common in AI-generated text.
-         """
-         ai_patterns = [
-             r"it['\s]+s important to note",
-             r"in conclusion",
-             r"furthermore",
-             r"comprehensive understanding",
-             r"it is worth noting",
-             r"however, it should be noted",
-             r"on the other hand",
-             r"in summary",
-             r"to begin with",
-             r"first and foremost"
-         ]
-
-         pattern_count = 0
-         for pattern in ai_patterns:
-             if re.search(pattern, text.lower()):
-                 pattern_count += 1
-
-         return pattern_count
-
-     @staticmethod
-     def detect_human_patterns(text: str):
-         """
-         Detect patterns common in human writing.
-         """
-         human_patterns = [
-             r"kinda|sorta|gonna|wanna|gotta",
-             r"tbh|idk|lol|omg|btw",
-             r"!{2,}|\?{2,}|\.{3,}",
-             r"i think|i feel|i believe",
-             r"like,|you know,|i mean,",
-             r"anyway|anyhow|whatever"
-         ]
-
-         pattern_count = 0
-         for pattern in human_patterns:
-             if re.search(pattern, text.lower()):
-                 pattern_count += 1
-
-         return pattern_count
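For reference, the fallback branch of calculate_perplexity above reduces to 2 raised to the unigram entropy of the text. A minimal standalone sketch of the same approximation (the function name and sample strings are ours, not part of the app):

import math
from collections import Counter

def approx_perplexity(text: str) -> float:
    # Same approximation as the fallback path above: 2 ** unigram entropy.
    words = text.lower().split()
    if not words:
        return 0.0
    entropy = 0.0
    for count in Counter(words).values():
        p = count / len(words)
        entropy -= p * math.log2(p)
    return min(2 ** entropy, 1000)  # capped at 1000, as in the app

print(approx_perplexity("the cat sat on the mat the cat sat"))          # ~4.6 (repetition lowers entropy)
print(approx_perplexity("every word here is completely different today"))  # 7.0 (7 unique words)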
 
- # =====================================================
- # 🤖 Model Manager - enhanced model management
- # =====================================================
- class EnhancedModelManager:
-     def __init__(self):
-         self.modernbert_tokenizer = None
-         self.modernbert_models = []
-         self.additional_models = {}
-         self.additional_tokenizers = {}
-         self.models_loaded = False
-         self.metrics = TextMetrics()
-
-         # ModernBERT URLs
-         self.modernbert_urls = [
-             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12",
-             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
-         ]
-
-         # Additional models to try
-         self.additional_model_configs = [
-             {
-                 "name": "chatgpt-detector-roberta",
-                 "model_id": "Hello-SimpleAI/chatgpt-detector-roberta",
-                 "type": "classification"
-             },
-             {
-                 "name": "openai-detector",
-                 "model_id": "roberta-base-openai-detector",
-                 "type": "classification"
-             },
-             {
-                 "name": "ai-content-detector",
-                 "model_id": "PirateXX/AI-Content-Detector",
-                 "type": "classification"
-             }
-         ]
-
-     def load_modernbert_tokenizer(self):
-         """Load the ModernBERT tokenizer"""
-         try:
-             logger.info("📝 Loading ModernBERT tokenizer...")
-             self.modernbert_tokenizer = AutoTokenizer.from_pretrained(
-                 "answerdotai/ModernBERT-base",
-                 cache_dir=CACHE_DIR,
-                 use_fast=True,
-                 trust_remote_code=False
-             )
-
-             # Configure the text normalizer
-             try:
-                 newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
-                 join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
-                 self.modernbert_tokenizer.backend_tokenizer.normalizer = Sequence([
-                     self.modernbert_tokenizer.backend_tokenizer.normalizer,
-                     join_hyphen_break,
-                     newline_to_space,
-                     Strip()
-                 ])
-             except Exception as e:
-                 logger.warning(f"⚠️ Could not set custom normalizer: {e}")
-
-             logger.info("✅ ModernBERT tokenizer loaded")
-             return True
-         except Exception as e:
-             logger.error(f"❌ Failed to load tokenizer: {e}")
-             return False
-
-     def load_modernbert_model(self, model_url=None, model_path=None, model_name="ModernBERT"):
-         """Load a single ModernBERT model"""
-         try:
-             logger.info(f"🤖 Loading {model_name}...")
-
-             base_model = AutoModelForSequenceClassification.from_pretrained(
-                 "answerdotai/ModernBERT-base",
-                 num_labels=41,
-                 cache_dir=CACHE_DIR,
-                 torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-                 low_cpu_mem_usage=True,
-                 trust_remote_code=False
-             )
-
-             if model_path and os.path.exists(model_path):
-                 logger.info(f"📁 Loading from local file: {model_path}")
-                 state_dict = torch.load(model_path, map_location=device, weights_only=True)
-                 base_model.load_state_dict(state_dict, strict=False)
-             elif model_url:
-                 logger.info(f"🌐 Downloading weights from URL...")
-                 try:
-                     state_dict = torch.hub.load_state_dict_from_url(
-                         model_url,
-                         map_location=device,
-                         progress=True,
-                         check_hash=False,
-                         file_name=f"{model_name}.pt"
-                     )
-                     base_model.load_state_dict(state_dict, strict=False)
-                 except Exception as e:
-                     logger.warning(f"⚠️ Could not load weights: {e}")
-                     logger.info("📊 Using model with random initialization")
-
-             model = base_model.to(device)
-             model.eval()
-
-             if 'state_dict' in locals():
-                 del state_dict
-             gc.collect()
-             if torch.cuda.is_available():
-                 torch.cuda.empty_cache()
-
-             logger.info(f"✅ {model_name} loaded")
-             return model
-
-         except Exception as e:
-             logger.error(f"❌ Failed to load {model_name}: {e}")
-             return None
-
-     def load_additional_model(self, model_config):
-         """Load additional AI-detection models"""
-         try:
-             model_name = model_config["name"]
-             model_id = model_config["model_id"]
-
-             logger.info(f"🔧 Loading {model_name}...")
-
-             # Try loading as a pipeline first (easier)
-             try:
-                 classifier = pipeline(
-                     "text-classification",
-                     model=model_id,
-                     device=0 if torch.cuda.is_available() else -1,
-                     model_kwargs={"cache_dir": CACHE_DIR}
-                 )
-                 self.additional_models[model_name] = classifier
-                 logger.info(f"✅ {model_name} loaded as pipeline")
-                 return True
-             except:
-                 # Try loading manually
-                 tokenizer = AutoTokenizer.from_pretrained(
-                     model_id,
-                     cache_dir=CACHE_DIR
-                 )
-                 model = AutoModelForSequenceClassification.from_pretrained(
-                     model_id,
-                     cache_dir=CACHE_DIR,
-                     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
-                 ).to(device)
-                 model.eval()
-
-                 self.additional_tokenizers[model_name] = tokenizer
-                 self.additional_models[model_name] = model
-                 logger.info(f"✅ {model_name} loaded manually")
-                 return True
-
-         except Exception as e:
-             logger.warning(f"⚠️ Could not load {model_config['name']}: {e}")
-             return False
-
-     def load_all_models(self, max_modernbert=2, load_additional=True):
-         """Load all models"""
-         if self.models_loaded:
-             logger.info("✨ Models already loaded")
-             return True
-
-         # Load ModernBERT tokenizer
-         if not self.load_modernbert_tokenizer():
-             return False
-
-         # Load ModernBERT models
-         logger.info(f"🚀 Loading up to {max_modernbert} ModernBERT models...")
-
-         # Try local file first
-         local_path = "modernbert.bin"
-         if os.path.exists(local_path):
-             model = self.load_modernbert_model(
-                 model_path=local_path,
-                 model_name="ModernBERT-Local"
-             )
-             if model is not None:
-                 self.modernbert_models.append(model)
-
-         # Load from URLs
-         for i, url in enumerate(self.modernbert_urls[:max_modernbert - len(self.modernbert_models)]):
-             if len(self.modernbert_models) >= max_modernbert:
-                 break
-
-             model = self.load_modernbert_model(
-                 model_url=url,
-                 model_name=f"ModernBERT-{i+1}"
-             )
-             if model is not None:
-                 self.modernbert_models.append(model)
-
-         # Load additional models
-         if load_additional:
-             logger.info("🎯 Loading additional AI detection models...")
-             for config in self.additional_model_configs:
-                 self.load_additional_model(config)
-
-         # Check success
-         total_models = len(self.modernbert_models) + len(self.additional_models)
-         if total_models > 0:
-             self.models_loaded = True
-             logger.info(f"✅ Loaded {len(self.modernbert_models)} ModernBERT + {len(self.additional_models)} additional models")
-             return True
-         else:
-             logger.error("❌ No models could be loaded")
-             return False
-
-     def classify_with_modernbert(self, text: str, model_index: int):
-         """Classify text with a single ModernBERT model"""
-         try:
-             if model_index >= len(self.modernbert_models):
-                 return None
-
-             model = self.modernbert_models[model_index]
-             cleaned_text = clean_text(text)
-
-             inputs = self.modernbert_tokenizer(
-                 cleaned_text,
-                 return_tensors="pt",
-                 truncation=True,
-                 max_length=512,
-                 padding=True
-             ).to(device)
-
-             with torch.no_grad():
-                 logits = model(**inputs).logits
-                 probs = torch.softmax(logits[0], dim=0)
-
-             human_prob = probs[24].item()
-             ai_probs = probs.clone()
-             ai_probs[24] = 0
-             ai_total = ai_probs.sum().item()
-
-             total = human_prob + ai_total
-             if total > 0:
-                 human_pct = (human_prob / total) * 100
-                 ai_pct = (ai_total / total) * 100
-             else:
-                 human_pct = ai_pct = 50
-
-             ai_model_idx = torch.argmax(ai_probs).item()
-
-             return {
-                 "model_name": f"ModernBERT-{model_index+1}",
-                 "human_score": round(human_pct, 2),
-                 "ai_score": round(ai_pct, 2),
-                 "predicted_model": label_mapping.get(ai_model_idx, "Unknown"),
-                 "confidence": round(max(human_pct, ai_pct), 2)
-             }
-         except Exception as e:
-             logger.error(f"Error in ModernBERT {model_index}: {e}")
-             return None
-
-     def classify_with_additional(self, text: str, model_name: str):
-         """Classify text with an additional model"""
-         try:
-             if model_name not in self.additional_models:
-                 return None
-
-             model = self.additional_models[model_name]
-
-             # Check if it's a pipeline or model
-             if hasattr(model, '__call__'):
-                 # It's a pipeline
-                 result = model(text, truncation=True, max_length=512)
-
-                 # Parse results based on model output format
-                 ai_score = 0
-                 human_score = 0
-
-                 for item in result:
-                     label = item['label'].lower()
-                     score = item['score'] * 100
-
-                     if 'fake' in label or 'ai' in label or 'gpt' in label:
-                         ai_score = max(ai_score, score)
-                     elif 'real' in label or 'human' in label:
-                         human_score = max(human_score, score)
-
-                 # Normalize if needed
-                 if ai_score == 0 and human_score == 0:
-                     ai_score = human_score = 50
-
-                 return {
-                     "model_name": model_name,
-                     "human_score": round(human_score, 2),
-                     "ai_score": round(ai_score, 2),
-                     "predicted_model": "AI" if ai_score > human_score else "Human",
-                     "confidence": round(max(ai_score, human_score), 2)
-                 }
-             else:
-                 # It's a model, use tokenizer
-                 tokenizer = self.additional_tokenizers.get(model_name)
-                 if tokenizer is None:
-                     return None
-
-                 inputs = tokenizer(
-                     text,
-                     return_tensors="pt",
-                     truncation=True,
-                     max_length=512,
-                     padding=True
-                 ).to(device)
-
-                 with torch.no_grad():
-                     outputs = model(**inputs)
-                     probs = torch.softmax(outputs.logits[0], dim=0)
-
-                 # Assuming binary classification (AI vs Human)
-                 if len(probs) == 2:
-                     human_score = probs[0].item() * 100
-                     ai_score = probs[1].item() * 100
-                 else:
-                     # Handle multi-class
-                     ai_score = human_score = 50
-
-                 return {
-                     "model_name": model_name,
-                     "human_score": round(human_score, 2),
-                     "ai_score": round(ai_score, 2),
-                     "predicted_model": "AI" if ai_score > human_score else "Human",
-                     "confidence": round(max(ai_score, human_score), 2)
-                 }
-
-         except Exception as e:
-             logger.warning(f"Error in {model_name}: {e}")
-             return None
-
-     def comprehensive_analysis(self, text: str):
-         """Comprehensive analysis using all models and metrics"""
-         if not self.models_loaded:
-             raise ValueError("No models loaded")
-
-         results = {
-             "individual_models": [],
-             "ensemble_result": {},
-             "metrics": {},
-             "pattern_analysis": {}
-         }
-
-         # 1. Calculate text metrics
-         logger.info("📊 Calculating text metrics...")
-         results["metrics"] = {
-             "perplexity": self.metrics.calculate_perplexity(text),
-             "burstiness": self.metrics.calculate_burstiness(text),
-             "vocabulary_diversity": self.metrics.calculate_vocabulary_diversity(text),
-             "text_length": len(text.split()),
-             "sentence_count": len(re.split(r'[.!?]+', text))
-         }

-         # 2. Pattern detection
-         results["pattern_analysis"] = {
-             "ai_patterns_found": self.metrics.detect_ai_patterns(text),
-             "human_patterns_found": self.metrics.detect_human_patterns(text)
          }
-
-         # 3. Run ModernBERT models
-         modernbert_results = []
-         for i in range(len(self.modernbert_models)):
-             result = self.classify_with_modernbert(text, i)
-             if result:
-                 results["individual_models"].append(result)
-                 modernbert_results.append(result)
-
-         # 4. Run additional models
-         for model_name in self.additional_models.keys():
-             result = self.classify_with_additional(text, model_name)
-             if result:
-                 results["individual_models"].append(result)
-
-         # 5. Calculate ensemble result (weighted average)
-         if results["individual_models"]:
-             total_ai = 0
-             total_human = 0
-             weights_sum = 0
-
-             for i, result in enumerate(results["individual_models"]):
-                 # Give ModernBERT models higher weight
-                 weight = 1.5 if i < len(modernbert_results) else 1.0
-                 total_ai += result["ai_score"] * weight
-                 total_human += result["human_score"] * weight
-                 weights_sum += weight
-
-             if weights_sum > 0:
-                 ensemble_ai = total_ai / weights_sum
-                 ensemble_human = total_human / weights_sum
-             else:
-                 ensemble_ai = ensemble_human = 50
-
-             # Adjust based on metrics
-             # High perplexity suggests human text
-             if results["metrics"]["perplexity"] > 100:
-                 ensemble_human += 5
-                 ensemble_ai -= 5
-             elif results["metrics"]["perplexity"] < 30:
-                 ensemble_ai += 5
-                 ensemble_human -= 5
-
-             # High burstiness suggests human text
-             if results["metrics"]["burstiness"] > 0.8:
-                 ensemble_human += 5
-                 ensemble_ai -= 5
-             elif results["metrics"]["burstiness"] < 0.3:
-                 ensemble_ai += 5
-                 ensemble_human -= 5
-
-             # Pattern analysis adjustment
-             pattern_adjustment = (results["pattern_analysis"]["ai_patterns_found"] -
-                                   results["pattern_analysis"]["human_patterns_found"]) * 3
-             ensemble_ai += pattern_adjustment
-             ensemble_human -= pattern_adjustment
-
-             # Normalize to 100%
-             total = ensemble_ai + ensemble_human
-             if total > 0:
-                 ensemble_ai = (ensemble_ai / total) * 100
-                 ensemble_human = (ensemble_human / total) * 100
-
-             # Determine most likely AI model
-             if ensemble_ai > ensemble_human and modernbert_results:
-                 predicted_model = modernbert_results[0]["predicted_model"]
-             else:
-                 predicted_model = "Human"
-
-             results["ensemble_result"] = {
-                 "ai_percentage": round(min(max(ensemble_ai, 0), 100), 2),
-                 "human_percentage": round(min(max(ensemble_human, 0), 100), 2),
-                 "predicted_model": predicted_model,
-                 "confidence": round(max(ensemble_ai, ensemble_human), 2),
-                 "is_human": ensemble_human > ensemble_ai,
-                 "models_used": len(results["individual_models"])
-             }
-
-         return results

  # =====================================================
  # 🧹 Cleaning and preprocessing helpers
- # =====================================================
- def clean_text(text: str) -> str:
-     """Strip redundant whitespace from the text"""
-     text = re.sub(r'\s{2,}', ' ', text)
-     text = re.sub(r'\s+([,.;:?!])', r'\1', text)
-     return text.strip()
-
- def split_into_paragraphs(text: str) -> List[str]:
-     """Split the text into paragraphs"""
-     paragraphs = re.split(r'\n\s*\n', text.strip())
-     return [p.strip() for p in paragraphs if p.strip()]
-
- # =====================================================
  # 🌐 FastAPI Application
  # =====================================================
  app = FastAPI(
-     title="Enhanced ModernBERT AI Detector",
-     description="Advanced AI detection with multiple models, perplexity, and burstiness analysis",
-     version="3.0.0"
  )

- # Add CORS
  app.add_middleware(
      CORSMiddleware,
      allow_origins=["*"],
-     allow_credentials=True,
-     allow_methods=["*"],
      allow_headers=["*"],
  )

- # Create the enhanced model manager
- model_manager = EnhancedModelManager()

  # =====================================================
  # 📝 Data models (Pydantic)
- # =====================================================
  class TextInput(BaseModel):
      text: str
      analyze_paragraphs: Optional[bool] = False
-     return_individual_scores: Optional[bool] = True

  class SimpleTextInput(BaseModel):
      text: str

- class EnhancedDetectionResult(BaseModel):
      success: bool
      code: int
      message: str
-     data: Dict
-
- # =====================================================
- # 🎯 API Endpoints
- # =====================================================
- @app.on_event("startup")
  async def startup_event():
      """Load models at startup"""
      logger.info("=" * 50)
-     logger.info("🚀 Starting Enhanced ModernBERT AI Detector...")
      logger.info(f"🐍 Python version: {sys.version}")
      logger.info(f"🔥 PyTorch version: {torch.__version__}")
      logger.info("=" * 50)

-     # Load models
-     max_modernbert = int(os.environ.get("MAX_MODERNBERT_MODELS", "2"))
-     load_additional = os.environ.get("LOAD_ADDITIONAL_MODELS", "true").lower() == "true"
-
-     success = model_manager.load_all_models(
-         max_modernbert=max_modernbert,
-         load_additional=load_additional
-     )

      if success:
-         logger.info("✅ Application ready with enhanced features!")
      else:
          logger.error("⚠️ Failed to load models - API will return errors")

  @app.get("/")
  async def root():
      """Home page"""
-     models_info = {
-         "modernbert_models": len(model_manager.modernbert_models),
-         "additional_models": list(model_manager.additional_models.keys())
-     }
-
      return {
-         "message": "Enhanced ModernBERT AI Text Detector API",
          "status": "online" if model_manager.models_loaded else "initializing",
-         "models": models_info,
          "device": str(device),
-         "features": [
-             "Multiple AI detection models",
-             "Perplexity analysis",
-             "Burstiness analysis",
-             "Pattern detection",
-             "Individual model scores",
-             "Ensemble predictions"
-         ],
          "endpoints": {
              "analyze": "/analyze",
              "simple": "/analyze-simple",
-             "health": "/health",
-             "docs": "/docs"
-         }
-     }
-
- @app.get("/health")
- async def health_check():
-     """Service health check"""
-     memory_info = {}
-     if torch.cuda.is_available():
-         memory_info = {
-             "gpu_allocated_gb": round(torch.cuda.memory_allocated() / 1024**3, 2),
-             "gpu_reserved_gb": round(torch.cuda.memory_reserved() / 1024**3, 2)
-         }

      return {
          "status": "healthy" if model_manager.models_loaded else "unhealthy",
-         "modernbert_models": len(model_manager.modernbert_models),
-         "additional_models": len(model_manager.additional_models),
-         "total_models": len(model_manager.modernbert_models) + len(model_manager.additional_models),
          "device": str(device),
          "cuda_available": torch.cuda.is_available(),
          "memory_info": memory_info
      }

- @app.post("/analyze", response_model=EnhancedDetectionResult)
- async def analyze_text_enhanced(data: TextInput):
      """
-     Enhanced analysis with multiple models and metrics
      """
      try:
-         # Validate input
          text = data.text.strip()
          if not text:
-             return EnhancedDetectionResult(
                  success=False,
                  code=400,
                  message="Empty input text",
                  data={}
              )

-         # Ensure models are loaded
          if not model_manager.models_loaded:
-             if not model_manager.load_all_models():
-                 return EnhancedDetectionResult(
                      success=False,
                      code=503,
-                     message="Models not available",
                      data={}
                  )

-         # Comprehensive analysis
-         analysis_result = model_manager.comprehensive_analysis(text)
-
-         # Basic stats
          total_words = len(text.split())
-         ai_percentage = analysis_result["ensemble_result"]["ai_percentage"]
-         human_percentage = analysis_result["ensemble_result"]["human_percentage"]
          ai_words = int(total_words * (ai_percentage / 100))

-         # Paragraph analysis if requested
          paragraphs_analysis = []
-         if data.analyze_paragraphs:
              paragraphs = split_into_paragraphs(text)
-             for para in paragraphs[:10]:
                  if para.strip():
                      try:
-                         para_result = model_manager.comprehensive_analysis(para)
                          para_words = len(para.split())

                          paragraphs_analysis.append({
                              "paragraph": para[:200] + "..." if len(para) > 200 else para,
-                             "ai_generated_score": para_result["ensemble_result"]["ai_percentage"] / 100,
-                             "human_written_score": para_result["ensemble_result"]["human_percentage"] / 100,
-                             "predicted_model": para_result["ensemble_result"]["predicted_model"],
-                             "metrics": {
-                                 "perplexity": para_result["metrics"]["perplexity"],
-                                 "burstiness": para_result["metrics"]["burstiness"]
-                             }
                          })
                      except Exception as e:
                          logger.warning(f"Failed to analyze paragraph: {e}")
-
-         # Prepare response
-         response_data = {
-             "fakePercentage": ai_percentage,
-             "isHuman": human_percentage,
-             "textWords": total_words,
-             "aiWords": ai_words,
-             "predicted_model": analysis_result["ensemble_result"]["predicted_model"],
-             "feedback": "Most of Your Text is AI/GPT Generated" if ai_percentage > 50 else "Most of Your Text Appears Human-Written",
-             "confidence": analysis_result["ensemble_result"]["confidence"],
-             "models_used": analysis_result["ensemble_result"]["models_used"],
-
-             # New: Metrics
-             "metrics": analysis_result["metrics"],
-
-             # New: Pattern analysis
-             "pattern_analysis": analysis_result["pattern_analysis"],
-
-             # Paragraphs if requested
-             "paragraphs": paragraphs_analysis,

-             # Text preview
-             "input_text": text[:500] + "..." if len(text) > 500 else text,
-             "detected_language": "en"
-         }

-         # Add individual model scores if requested
-         if data.return_individual_scores:
-             response_data["individual_models"] = analysis_result["individual_models"]

-         return EnhancedDetectionResult(
              success=True,
              code=200,
-             message="Enhanced analysis completed",
-             data=response_data
          )

      except Exception as e:
          logger.error(f"Analysis error: {e}", exc_info=True)
-         return EnhancedDetectionResult(
              success=False,
              code=500,
              message=f"Analysis failed: {str(e)}",
-             data={}
-         )
-
  @app.post("/analyze-simple")
  async def analyze_simple(data: SimpleTextInput):
      """
-     Simple analysis - returns basic results only
      """
      try:
          text = data.text.strip()
-         if not text:
              raise HTTPException(status_code=400, detail="Empty text")

          if not model_manager.models_loaded:
-             if not model_manager.load_all_models():
                  raise HTTPException(status_code=503, detail="Models not available")

-         result = model_manager.comprehensive_analysis(text)
-         ensemble = result["ensemble_result"]

          return {
-             "is_ai": ensemble["ai_percentage"] > 50,
-             "ai_score": ensemble["ai_percentage"],
-             "human_score": ensemble["human_percentage"],
-             "detected_model": ensemble["predicted_model"],
-             "confidence": ensemble["confidence"],
-             "perplexity": result["metrics"]["perplexity"],
-             "burstiness": result["metrics"]["burstiness"]
          }

      except HTTPException:
-         raise
-     except Exception as e:
-         logger.error(f"Simple analysis error: {e}")
-         raise HTTPException(status_code=500, detail=str(e))
-
- # =====================================================
- # 🏃 Run the application
- # =====================================================
  if __name__ == "__main__":
      import uvicorn

      port = int(os.environ.get("PORT", 8000))
      host = os.environ.get("HOST", "0.0.0.0")
      workers = int(os.environ.get("WORKERS", 1))

      logger.info("=" * 50)
-     logger.info(f"🌐 Starting enhanced server on {host}:{port}")
      logger.info(f"👷 Workers: {workers}")
      logger.info(f"📚 Documentation: http://{host}:{port}/docs")
      logger.info("=" * 50)

      uvicorn.run(
-         "app_enhanced:app",
          host=host,
          port=port,
-         reload=False,
          workers=workers,
-         log_level="info"
      )

+++ app.py (after this commit)

  import logging
  import gc
  import sys
+ import pwd  # Added for monkey patch
+ import os     # os/re/torch are used throughout below (patch, caches, models)
+ import re
+ import torch
  from fastapi import FastAPI, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
  from pydantic import BaseModel
  from typing import Dict, List, Optional
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
  from tokenizers.normalizers import Sequence, Replace, Strip
  from tokenizers import Regex
+ from huggingface_hub import hf_hub_download  # Added for reliable HF downloads
+
+ # =====================================================
+ # 🛠️ Monkey Patch for Docker/Container UID Issue
+ # =====================================================
+ # Fix for 'getpwuid(): uid not found: 1000' in containerized environments
+ def patched_getpwuid(uid_num):
+     try:
+         return original_getpwuid(uid_num)
+     except KeyError:
+         if uid_num == os.getuid():
+             # Create a fake user entry; pwd.struct_passwd takes the 7 passwd
+             # fields as a sequence: (name, passwd, uid, gid, gecos, dir, shell)
+             return pwd.struct_passwd((
+                 'dockeruser',
+                 'x',
+                 uid_num,
+                 os.getgid(),
+                 'Docker User',
+                 '/tmp',
+                 '/bin/sh'
+             ))
+         raise
+
+ original_getpwuid = pwd.getpwuid
+ pwd.getpwuid = patched_getpwuid
+
+ # Set fallback env vars to avoid user-dependent paths
+ os.environ.setdefault('HOME', '/tmp')
+ os.environ.setdefault('USER', 'dockeruser')
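A quick way to sanity-check the patch above from inside a container (a hedged sketch; the fallback name only appears when the running UID genuinely has no /etc/passwd entry):

import os
import pwd

# With the patch installed, looking up the current UID no longer raises
# KeyError inside containers that run as an unmapped user (e.g. uid 1000).
entry = pwd.getpwuid(os.getuid())
print(entry.pw_name, entry.pw_dir)  # -> "dockeruser /tmp" only when the UID is unmapped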
 
  # =====================================================
  # 🔧 Environment configuration and settings
+ CACHE_DIR = "/tmp/huggingface_cache"
+ os.makedirs(CACHE_DIR, exist_ok=True)
+
+ # Hugging Face environment variables (TRANSFORMERS_CACHE dropped to avoid a deprecation warning)
+ os.environ.update({
+     "HF_HOME": CACHE_DIR,
+     "HF_DATASETS_CACHE": CACHE_DIR,
+     "HUGGINGFACE_HUB_CACHE": CACHE_DIR,
+     "TORCH_HOME": CACHE_DIR,
+     "TOKENIZERS_PARALLELISM": "false",  # Avoid threading issues
+     "TRANSFORMERS_OFFLINE": "0",  # Allow downloads from the internet
+ })
+
+ # PyTorch memory settings
+ if torch.cuda.is_available():
+     os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
+     torch.backends.cudnn.benchmark = True

+ # Logging, device and label map (required by everything below)
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ # 🚀 Device selection (GPU or CPU)
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ logger.info(f"🖥️ Using device: {device}")
+
+ # 📊 Model label map (index 24 = 'human')
+ label_mapping = {
+     0: '13B', 1: '30B', 2: '65B', 3: '7B', 4: 'GLM130B', 5: 'bloom_7b',
+     6: 'bloomz', 7: 'cohere', 8: 'davinci', 9: 'dolly', 10: 'dolly-v2-12b',
+     11: 'flan_t5_base', 12: 'flan_t5_large', 13: 'flan_t5_small',
+     14: 'flan_t5_xl', 15: 'flan_t5_xxl', 16: 'gemma-7b-it', 17: 'gemma2-9b-it',
+     18: 'gpt-3.5-turbo', 19: 'gpt-35', 20: 'gpt4', 21: 'gpt4o',
+     22: 'gpt_j', 23: 'gpt_neox', 24: 'human', 25: 'llama3-70b', 26: 'llama3-8b',
+     27: 'mixtral-8x7b', 28: 'opt_1.3b', 29: 'opt_125m', 30: 'opt_13b',
+     31: 'opt_2.7b', 32: 'opt_30b', 33: 'opt_350m', 34: 'opt_6.7b',
+     35: 'opt_iml_30b', 36: 'opt_iml_max_1.3b', 37: 't0_11b', 38: 't0_3b',
+     39: 'text-davinci-002', 40: 'text-davinci-003'
+ }
+
  # =====================================================
+ # 🤖 Model Manager - model management
+ # =====================================================
+ class ModelManager:
+     def __init__(self):
+         self.tokenizer = None
+         self.models = []
+
+         self.models_loaded = False
+         self.model_urls = [
+             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12",
+             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
+         ]
+         self.base_model_id = "answerdotai/ModernBERT-base"  # Primary
+         self.fallback_model_id = "bert-base-uncased"  # Fallback if ModernBERT fails
+         self.using_fallback = False
+
+     def load_tokenizer(self):
+         """Load the tokenizer, with a fallback"""
+         try:
+             logger.info(f"📝 Loading tokenizer from {self.base_model_id}...")
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 self.base_model_id,
+                 cache_dir=CACHE_DIR,
+                 use_fast=True,
+                 trust_remote_code=False
+             )
+             logger.info("✅ Primary tokenizer loaded successfully")
+
+         except Exception as e:
+             logger.warning(f"⚠️ Failed to load primary tokenizer: {e}")
+             try:
+                 logger.info(f"🔄 Falling back to {self.fallback_model_id}...")
+                 self.tokenizer = AutoTokenizer.from_pretrained(
+                     self.fallback_model_id,
+                     cache_dir=CACHE_DIR,
+                     use_fast=True,
+                     trust_remote_code=False
+                 )
+                 self.using_fallback = True
+                 logger.info("✅ Fallback tokenizer loaded successfully")
+             except Exception as fallback_e:
+                 logger.error(f"❌ Failed to load fallback tokenizer: {fallback_e}")
+                 return False
+
+         # Configure the text normalizer
+         try:
+             newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
+             join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
+             self.tokenizer.backend_tokenizer.normalizer = Sequence([
+                 self.tokenizer.backend_tokenizer.normalizer,
+                 join_hyphen_break,
+                 newline_to_space,
+                 Strip()
+             ])
+         except Exception as e:
+             logger.warning(f"⚠️ Could not set custom normalizer: {e}")
+
+         return True

+     def load_single_model(self, model_url=None, model_path=None, model_name="Model"):
+         """Load a single model, with a fallback and full error handling"""
+         base_model = None
+         try:
+             logger.info(f"🤖 Loading base {model_name} from {self.base_model_id}...")
+
+             # Try the primary base model first
+             base_model = AutoModelForSequenceClassification.from_pretrained(
+                 self.base_model_id,
+                 num_labels=41,
+                 cache_dir=CACHE_DIR,
+                 dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # Updated from torch_dtype
+                 low_cpu_mem_usage=True,
+                 trust_remote_code=False
+             )
+             logger.info("✅ Primary base model loaded")
+
+         except Exception as e:
+             logger.warning(f"⚠️ Failed to load primary base model: {e}")
+             try:
+                 logger.info(f"🔄 Falling back to {self.fallback_model_id}...")
+                 base_model = AutoModelForSequenceClassification.from_pretrained(
+                     self.fallback_model_id,
+                     num_labels=41,
+                     cache_dir=CACHE_DIR,
+                     dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # Updated from torch_dtype
+                     low_cpu_mem_usage=True,
+                     trust_remote_code=False
+                 )
+                 self.using_fallback = True
+                 logger.info("✅ Fallback base model loaded (note: weights may not be compatible)")
+             except Exception as fallback_e:
+                 logger.error(f"❌ Failed to load fallback base model: {fallback_e}")
+                 return None
+
+         # Try to load the weights (only if not on the fallback, or if they are compatible)
+         try:
+             if model_path and os.path.exists(model_path):
+                 logger.info(f"📁 Loading from local file: {model_path}")
+                 state_dict = torch.load(model_path, map_location=device, weights_only=True)
+                 base_model.load_state_dict(state_dict, strict=False)
+             elif model_url:
+                 # Use hf_hub_download instead of torch.hub for HF repos
+                 logger.info("🌐 Downloading weights from HF repo...")
+                 repo_id = "mihalykiss/modernbert_2"
+                 filename = model_url.split('/')[-1]  # Extract filename like "Model_groups_3class_seed12"
+                 pt_file = hf_hub_download(
+                     repo_id=repo_id,
+                     filename=filename,
+                     cache_dir=CACHE_DIR,
+                     local_dir_use_symlinks=False
+                 )
+                 state_dict = torch.load(pt_file, map_location=device, weights_only=True)
+
+                 # Only load the weights when not in fallback mode
+                 # (ModernBERT weights may not match plain BERT)
+                 if not self.using_fallback:
+                     base_model.load_state_dict(state_dict, strict=False)
+                     logger.info("✅ Weights loaded successfully")
+                 else:
+                     logger.warning("⚠️ Skipping weight load in fallback mode (incompatible architecture)")
+             else:
+                 logger.info("📊 Using model with random initialization")
+         except Exception as weight_error:
+             logger.warning(f"⚠️ Could not load weights: {weight_error}")
+             logger.info("📊 Continuing with base model (random or pre-trained init)")
+
+         # Move the model to the appropriate device
+         model = base_model.to(device)
+         model.eval()
+
+         # Free memory
+         if 'state_dict' in locals():
+             del state_dict
+         gc.collect()
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+
+         logger.info(f"✅ {model_name} loaded successfully (fallback: {self.using_fallback})")
+         return model

+     def load_models(self, max_models=3):  # Default raised to 3 to load local + 2 URLs
+         """Load models, with a memory cap"""
+         if self.models_loaded:
+             logger.info("✨ Models already loaded")
+             return True
+
+         # Load the tokenizer first
+         if not self.load_tokenizer():
+             logger.error("❌ Tokenizer load failed - cannot proceed")
+             return False
+
+         # Load the models
+         logger.info(f"🚀 Loading up to {max_models} models...")
+
+         # Try the local file first
+         local_model_path = "modernbert.bin"
+         if os.path.exists(local_model_path):
+             model = self.load_single_model(
+                 model_path=local_model_path,
+                 model_name="Model 1 (Local)"
+             )
+             if model is not None:
+                 self.models.append(model)
+
+         # Load models from URLs (load_single_model extracts the filenames)
+         for i, full_url in enumerate(self.model_urls[:max_models - len(self.models)]):
+             if len(self.models) >= max_models:
+                 break
+
+             # Pass full_url as-is; load_single_model extracts the filename
+             model = self.load_single_model(
+                 model_url=full_url,
+                 model_name=f"Model {len(self.models) + 1}"
+             )
+             if model is not None:
+                 self.models.append(model)
+
+             # Check available memory
+             if torch.cuda.is_available():
+                 mem_allocated = torch.cuda.memory_allocated() / 1024**3
+                 mem_reserved = torch.cuda.memory_reserved() / 1024**3
+                 logger.info(f"💾 GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")
+
+                 # Stop loading if memory is full
+                 if mem_allocated > 6:  # 6GB cap
+                     logger.warning("⚠️ Memory limit reached, stopping model loading")
+                     break
+
+         # Verify that loading succeeded
+         if len(self.models) > 0:
+             self.models_loaded = True
+             logger.info(f"✅ Successfully loaded {len(self.models)} models (using fallback: {self.using_fallback})")
+             return True
+         else:
+             logger.error("❌ No models could be loaded")
+             return False

+     def classify_text(self, text: str) -> Dict:
+         """Analyze the text using the loaded models"""
+         if not self.models_loaded or len(self.models) == 0:
+             raise ValueError("No models loaded")
+
+         # Clean the text
+         cleaned_text = clean_text(text)
+         if not cleaned_text.strip():
+             raise ValueError("Empty text after cleaning")
+
+         # Tokenization (both ModernBERT as configured here and the fallback BERT cap at 512)
+         max_len = 512
+         try:
+             inputs = self.tokenizer(
+                 cleaned_text,
+                 return_tensors="pt",
+                 truncation=True,
+                 max_length=max_len,
+                 padding=True
+             ).to(device)
+         except Exception as e:
+             logger.error(f"Tokenization error: {e}")
+             raise ValueError(f"Failed to tokenize text: {e}")
+
+         # Collect the predictions
+         all_probabilities = []
+
+         with torch.no_grad():
+             for i, model in enumerate(self.models):
+                 try:
+                     logits = model(**inputs).logits
+                     probs = torch.softmax(logits, dim=1)
+                     all_probabilities.append(probs)
+                 except Exception as e:
+                     logger.warning(f"Model {i+1} prediction failed: {e}")
+                     continue
+
+         if not all_probabilities:
+             raise ValueError("All models failed to make predictions")
+
+         # Average the distributions (soft voting)
+         averaged_probs = torch.mean(torch.stack(all_probabilities), dim=0)
+         probabilities = averaged_probs[0]
+
+         # Human vs AI percentages
+         human_prob = probabilities[24].item()
+         ai_probs = probabilities.clone()
+         ai_probs[24] = 0  # Remove the Human probability
+         ai_total_prob = ai_probs.sum().item()
+
+         # Normalize
+         total = human_prob + ai_total_prob
+         if total > 0:
+             human_percentage = (human_prob / total) * 100
+             ai_percentage = (ai_total_prob / total) * 100
+         else:
+             human_percentage = 50
+             ai_percentage = 50
+
+         # Most likely source model
+         ai_model_idx = torch.argmax(ai_probs).item()
+         predicted_model = label_mapping.get(ai_model_idx, "Unknown")
+
+         # Top-5 predictions
+         top_5_probs, top_5_indices = torch.topk(probabilities, 5)
+         top_5_results = []
+         for prob, idx in zip(top_5_probs, top_5_indices):
+             top_5_results.append({
+                 "model": label_mapping.get(idx.item(), "Unknown"),
+                 "probability": round(prob.item() * 100, 2)
+             })
+
+         return {
+             "human_percentage": round(human_percentage, 2),
+             "ai_percentage": round(ai_percentage, 2),
+             "predicted_model": predicted_model,
+             "top_5_predictions": top_5_results,
+             "is_human": human_percentage > ai_percentage,
+             "models_used": len(all_probabilities),
+             "using_fallback": self.using_fallback
          }
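The soft-voting step in classify_text above simply averages each model's softmax distribution before thresholding. A minimal sketch with dummy tensors (shapes match the 41-label setup; the random inputs are illustrative only):

import torch

# Two fake model outputs over 41 classes (batch size 1), as in classify_text.
probs_a = torch.softmax(torch.randn(1, 41), dim=1)
probs_b = torch.softmax(torch.randn(1, 41), dim=1)

# Soft voting: stack along a new model axis, then average it away.
averaged = torch.mean(torch.stack([probs_a, probs_b]), dim=0)  # shape (1, 41)
probabilities = averaged[0]

human_prob = probabilities[24].item()               # index 24 = 'human'
ai_prob = probabilities.sum().item() - human_prob   # mass on all other labels
print(f"human={human_prob:.3f} ai={ai_prob:.3f}")   # still sums to ~1.0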

  # =====================================================
  # 🧹 Cleaning and preprocessing helpers
  # =====================================================
  def clean_text(text: str) -> str:
      """Strip redundant whitespace from the text"""
      text = re.sub(r'\s{2,}', ' ', text)
      text = re.sub(r'\s+([,.;:?!])', r'\1', text)
      return text.strip()

  def split_into_paragraphs(text: str) -> List[str]:
      """Split the text into paragraphs"""
      paragraphs = re.split(r'\n\s*\n', text.strip())
      return [p.strip() for p in paragraphs if p.strip()]

  # =====================================================
  # 🌐 FastAPI Application
  # =====================================================
  app = FastAPI(
+     title="ModernBERT AI Text Detector",
+     description="Detect text written by AI",
+     version="2.3.0"  # Updated version with 3 models and deprecation fixes
  )

+ # Add CORS to allow use from the browser
  app.add_middleware(
      CORSMiddleware,
      allow_origins=["*"],
+     allow_methods=["*"],  # browsers need POST allowed for /analyze
      allow_headers=["*"],
  )

+ # Create the model manager
+ model_manager = ModelManager()

  # =====================================================
  # 📝 Data models (Pydantic)
  class TextInput(BaseModel):
      text: str
      analyze_paragraphs: Optional[bool] = False

  class SimpleTextInput(BaseModel):
      text: str

+ class DetectionResult(BaseModel):
      success: bool
      code: int
      message: str
      data: Dict

  # =====================================================
  # 🎯 API Endpoints
  # =====================================================
  @app.on_event("startup")
  async def startup_event():
      """Load models at startup"""
      logger.info("=" * 50)
+     logger.info("🚀 Starting ModernBERT AI Detector...")
      logger.info(f"🐍 Python version: {sys.version}")
      logger.info(f"🔥 PyTorch version: {torch.__version__}")
+     import transformers
+     logger.info(f"🔧 Transformers version: {transformers.__version__}")
+     logger.info("🛡️ UID Monkey Patch Applied (for Docker/Container)")
      logger.info("=" * 50)

+     # Try to load the models
+     max_models = int(os.environ.get("MAX_MODELS", "3"))  # Default raised to 3
+     success = model_manager.load_models(max_models=max_models)

      if success:
+         logger.info(f"✅ Application ready! (Fallback mode: {model_manager.using_fallback})")
      else:
          logger.error("⚠️ Failed to load models - API will return errors")
+         logger.info("💡 Tip: Ensure 'transformers>=4.45.0' and 'huggingface_hub' are installed. Run: pip install --upgrade transformers huggingface_hub")

  @app.get("/")
  async def root():
      """Home page"""
      return {
+         "message": "ModernBERT AI Text Detector API",
          "status": "online" if model_manager.models_loaded else "initializing",
+         "models_loaded": len(model_manager.models),
+         "using_fallback": model_manager.using_fallback,
          "device": str(device),
          "endpoints": {
              "analyze": "/analyze",
              "simple": "/analyze-simple",
              "health": "/health",
              "docs": "/docs"
          }
      }

  @app.get("/health")
  async def health_check():
      """Service health check"""
      memory_info = {}
      if torch.cuda.is_available():
          memory_info = {
              "gpu_allocated_gb": round(torch.cuda.memory_allocated() / 1024**3, 2),
              "gpu_reserved_gb": round(torch.cuda.memory_reserved() / 1024**3, 2)
          }

      return {
          "status": "healthy" if model_manager.models_loaded else "unhealthy",
+         "models_loaded": len(model_manager.models),
+         "using_fallback": model_manager.using_fallback,
          "device": str(device),
          "cuda_available": torch.cuda.is_available(),
          "memory_info": memory_info
      }

+ @app.post("/analyze", response_model=DetectionResult)
+ async def analyze_text(data: TextInput):
      """
+     Analyze text for AI detection.
+     Mirrors the behavior of the Gradio classify_text function.
      """
      try:
+         # Validate the text
          text = data.text.strip()
          if not text:
+             return DetectionResult(
                  success=False,
                  code=400,
                  message="Empty input text",
                  data={}
              )

+         # Make sure the models are loaded
          if not model_manager.models_loaded:
+             # Try to load the models
+             if not model_manager.load_models():
+                 return DetectionResult(
                      success=False,
                      code=503,
+                     message="Models not available. Check logs for details.",
                      data={}
                  )

+         # Word count
          total_words = len(text.split())
+
+         # Core analysis
+         result = model_manager.classify_text(text)
+
+         # Base scores
+         ai_percentage = result["ai_percentage"]
+         human_percentage = result["human_percentage"]
+
          ai_words = int(total_words * (ai_percentage / 100))

+         # Paragraph analysis, if requested
          paragraphs_analysis = []
+         if data.analyze_paragraphs and ai_percentage > 50:
              paragraphs = split_into_paragraphs(text)
+             recalc_ai_words = 0
+             recalc_total_words = 0
+
+             for para in paragraphs[:10]:  # At most 10 paragraphs
                  if para.strip():
                      try:
+                         para_result = model_manager.classify_text(para)
                          para_words = len(para.split())
+                         recalc_total_words += para_words
+                         recalc_ai_words += para_words * (para_result["ai_percentage"] / 100)

                          paragraphs_analysis.append({
                              "paragraph": para[:200] + "..." if len(para) > 200 else para,
+                             "ai_generated_score": para_result["ai_percentage"] / 100,
+                             "human_written_score": para_result["human_percentage"] / 100,
+                             "predicted_model": para_result["predicted_model"]
                          })
                      except Exception as e:
                          logger.warning(f"Failed to analyze paragraph: {e}")

+             # Recompute the percentages from the paragraphs
+             if recalc_total_words > 0:
+                 ai_percentage = round((recalc_ai_words / recalc_total_words) * 100, 2)
+                 human_percentage = round(100 - ai_percentage, 2)
+                 ai_words = int(recalc_ai_words)

+         # Build the feedback message
+         if ai_percentage > 50:
+             feedback = "Most of Your Text is AI/GPT Generated"
+         else:
+             feedback = "Most of Your Text Appears Human-Written"

+         # Return results in the same format as the original code
+         return DetectionResult(
              success=True,
              code=200,
+             message="analysis completed",
+             data={
+                 "fakePercentage": ai_percentage,
+                 "isHuman": human_percentage,
+                 "textWords": total_words,
+                 "aiWords": ai_words,
+                 "paragraphs": paragraphs_analysis,
+                 "predicted_model": result["predicted_model"],
+                 "feedback": feedback,
+                 "input_text": text[:500] + "..." if len(text) > 500 else text,
+                 "detected_language": "en",
+                 "top_5_predictions": result.get("top_5_predictions", []),
+                 "models_used": result.get("models_used", 1),
+                 "using_fallback": result.get("using_fallback", False)
+             }
          )

      except Exception as e:
          logger.error(f"Analysis error: {e}", exc_info=True)
+         return DetectionResult(
              success=False,
              code=500,
              message=f"Analysis failed: {str(e)}",
+             data={}
+         )

  @app.post("/analyze-simple")
  async def analyze_simple(data: SimpleTextInput):
      """
+     Simplified analysis - returns only the basic results
      """
      try:
          text = data.text.strip()
          if not text:
              raise HTTPException(status_code=400, detail="Empty text")

          if not model_manager.models_loaded:
+             if not model_manager.load_models():
                  raise HTTPException(status_code=503, detail="Models not available")

+         result = model_manager.classify_text(text)

          return {
+             "is_ai": result["ai_percentage"] > 50,
+             "ai_score": result["ai_percentage"],
+             "human_score": result["human_percentage"],
+             "detected_model": result["predicted_model"] if result["ai_percentage"] > 50 else None,
+             "confidence": max(result["ai_percentage"], result["human_percentage"]),
+             "using_fallback": result.get("using_fallback", False)
          }

      except HTTPException:
          raise
      except Exception as e:
          logger.error(f"Simple analysis error: {e}")
          raise HTTPException(status_code=500, detail=str(e))

  if __name__ == "__main__":
      import uvicorn

+     # Read the settings from the environment
      port = int(os.environ.get("PORT", 8000))
      host = os.environ.get("HOST", "0.0.0.0")
      workers = int(os.environ.get("WORKERS", 1))

      logger.info("=" * 50)
+     logger.info(f"🌐 Starting server on {host}:{port}")
      logger.info(f"👷 Workers: {workers}")
      logger.info(f"📚 Documentation: http://{host}:{port}/docs")
      logger.info("=" * 50)

      uvicorn.run(
+         "app:app",  # this file is app.py, so the import string is "app:app"
          host=host,
          port=port,
          workers=workers,
+         reload=False  # Set to True for dev
      )
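
For reference, a minimal client for the endpoints defined in this file (a sketch assuming a local deployment on port 8000; the requests dependency and sample texts are ours):

import requests

BASE = "http://localhost:8000"  # assumed local deployment

# Quick check: is the service up and are the models loaded?
print(requests.get(f"{BASE}/health").json())

# Simple analysis: just the scores.
simple = requests.post(f"{BASE}/analyze-simple", json={"text": "Some text to check."}).json()
print(simple["is_ai"], simple["ai_score"], simple["detected_model"])

# Full analysis, with the optional per-paragraph breakdown.
full = requests.post(
    f"{BASE}/analyze",
    json={"text": "Some longer text...\n\nWith multiple paragraphs.", "analyze_paragraphs": True},
).json()
print(full["data"]["fakePercentage"], full["data"]["feedback"])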