AI_Detector

Running

App Files Files Community

mahmoudsaber0 committed on Oct 22

Commit

4edb764

verified ·

1 Parent(s): 290af78

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -441

app.py CHANGED Viewed

@@ -1,3 +1,6 @@
 import logging
 import gc
 import sys
@@ -7,11 +10,6 @@ from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from typing import Dict, List, Optional
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from tokenizers.normalizers import Sequence, Replace, Strip
 from tokenizers import Regex
 from huggingface_hub import hf_hub_download  # Added for reliable HF downloads
@@ -46,13 +44,21 @@ os.environ.setdefault('USER', 'dockeruser')
 # =====================================================
 # 🔧 تكوين البيئة والإعدادات
 CACHE_DIR = "/tmp/huggingface_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
-# تكوين متغيرات البيئة لـ Hugging Face (removed TRANSFORMERS_CACHE to avoid deprecation warning)
 os.environ.update({
     "HF_HOME": CACHE_DIR,
     "HF_DATASETS_CACHE": CACHE_DIR,
     "HUGGINGFACE_HUB_CACHE": CACHE_DIR,
     "TORCH_HOME": CACHE_DIR,
@@ -61,156 +67,44 @@ os.environ.update({
 })
 # إعدادات PyTorch للذاكرة
-}
 # =====================================================
-# 🤖 Model Manager - إدارة الموديلات
 # =====================================================
 class ModelManager:
     def __init__(self):
         self.tokenizer = None
         self.models = []
         self.models_loaded = False
         self.model_urls = [
             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12",
             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
         ]
@@ -220,25 +114,6 @@ class ModelManager:
     def load_tokenizer(self):
         """تحميل الـ Tokenizer مع fallback"""
         try:
             logger.info(f"📝 Loading tokenizer from {self.base_model_id}...")
             self.tokenizer = AutoTokenizer.from_pretrained(
@@ -291,7 +166,7 @@ class ModelManager:
                 self.base_model_id,
                 num_labels=41,
                 cache_dir=CACHE_DIR,
-                dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # Updated from torch_dtype
                 low_cpu_mem_usage=True,
                 trust_remote_code=False
             )
@@ -305,7 +180,7 @@ class ModelManager:
                     self.fallback_model_id,
                     num_labels=41,
                     cache_dir=CACHE_DIR,
-                    dtype=torch.float16 if torch.cuda.is_available() else torch.float32,  # Updated from torch_dtype
                     low_cpu_mem_usage=True,
                     trust_remote_code=False
                 )
@@ -331,58 +206,8 @@ class ModelManager:
                     filename=filename,
                     cache_dir=CACHE_DIR,
                     local_dir_use_symlinks=False
                 )
                 state_dict = torch.load(pt_file, map_location=device, weights_only=True)
                 # تحميل الأوزان فقط إذا لم نكن في وضع fallback (لأن ModernBERT weights قد لا تتوافق مع BERT القياسي)
                 if not self.using_fallback:
@@ -410,7 +235,7 @@ class ModelManager:
         logger.info(f"✅ {model_name} loaded successfully (fallback: {self.using_fallback})")
         return model
-    def load_models(self, max_models=3):  # Increased default to 3 to load local + 2 URLs
         """تحميل الموديلات بحد أقصى للذاكرة"""
         if self.models_loaded:
             logger.info("✨ Models already loaded")
@@ -460,7 +285,6 @@ class ModelManager:
         # التحقق من نجاح التحميل
         if len(self.models) > 0:
             self.models_loaded = True
             logger.info(f"✅ Successfully loaded {len(self.models)} models (using fallback: {self.using_fallback})")
             return True
@@ -482,44 +306,12 @@ class ModelManager:
         max_len = 512 if not self.using_fallback else 512  # BERT max is 512
         try:
             inputs = self.tokenizer(
                 cleaned_text,
                 return_tensors="pt",
                 truncation=True,
                 max_length=max_len,
                 padding=True
             ).to(device)
         except Exception as e:
             logger.error(f"Tokenization error: {e}")
             raise ValueError(f"Failed to tokenize text: {e}")
@@ -543,81 +335,12 @@ class ModelManager:
             # حساب المتوسط (Soft Voting)
             averaged_probs = torch.mean(torch.stack(all_probabilities), dim=0)
             probabilities = averaged_probs[0]
         # حساب نسب Human vs AI
         human_prob = probabilities[24].item()
         ai_probs = probabilities.clone()
         ai_probs[24] = 0  # إزالة احتمالية Human
         ai_total_prob = ai_probs.sum().item()
         # التطبيع
         total = human_prob + ai_total_prob
@@ -627,13 +350,10 @@ class ModelManager:
         else:
             human_percentage = 50
             ai_percentage = 50
         # تحديد الموديل الأكثر احتمالاً
         ai_model_idx = torch.argmax(ai_probs).item()
         predicted_model = label_mapping.get(ai_model_idx, "Unknown")
         # أعلى 5 تنبؤات
         top_5_probs, top_5_indices = torch.topk(probabilities, 5)
@@ -643,74 +363,6 @@ class ModelManager:
                 "model": label_mapping.get(idx.item(), "Unknown"),
                 "probability": round(prob.item() * 100, 2)
             })
         return {
             "human_percentage": round(human_percentage, 2),
@@ -724,18 +376,33 @@ class ModelManager:
 # =====================================================
 # 🧹 دوال التنظيف والمعالجة
 # 🌐 FastAPI Application
 # =====================================================
 app = FastAPI(
     title="ModernBERT AI Text Detector",
     description="كشف النصوص المكتوبة بواسطة الذكاء الاصطناعي",
-    version="2.3.0"  # Updated version with 3 models and deprecation fixes
 )
 # إضافة CORS للسماح بالاستخدام من المتصفح
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
@@ -744,11 +411,11 @@ model_manager = ModelManager()
 # =====================================================
 # 📝 نماذج البيانات (Pydantic Models)
 class TextInput(BaseModel):
     text: str
     analyze_paragraphs: Optional[bool] = False
 class SimpleTextInput(BaseModel):
     text: str
@@ -756,6 +423,12 @@ class DetectionResult(BaseModel):
     success: bool
     code: int
     message: str
 async def startup_event():
     """تحميل الموديلات عند بداية التشغيل"""
     logger.info("=" * 50)
@@ -768,16 +441,11 @@ async def startup_event():
     logger.info("=" * 50)
     # محاولة تحميل الموديلات
-    max_models = int(os.environ.get("MAX_MODELS", "3"))  # Updated default to 3
     success = model_manager.load_models(max_models=max_models)
     if success:
-        logger.info(f"✅ Application ready! (Fallback mode: {model_manager.using_fallback})")
     else:
         logger.error("⚠️ Failed to load models - API will return errors")
         logger.info("💡 Tip: Ensure 'transformers>=4.45.0' and 'huggingface_hub' are installed. Run: pip install --upgrade transformers huggingface_hub")
@@ -785,34 +453,34 @@ async def startup_event():
 @app.get("/")
 async def root():
     """الصفحة الرئيسية"""
     return {
         "message": "ModernBERT AI Text Detector API",
         "status": "online" if model_manager.models_loaded else "initializing",
         "models_loaded": len(model_manager.models),
         "using_fallback": model_manager.using_fallback,
         "device": str(device),
         "endpoints": {
             "analyze": "/analyze",
             "simple": "/analyze-simple",
     return {
         "status": "healthy" if model_manager.models_loaded else "unhealthy",
         "models_loaded": len(model_manager.models),
         "using_fallback": model_manager.using_fallback,
         "device": str(device),
         "cuda_available": torch.cuda.is_available(),
         "memory_info": memory_info
@@ -855,7 +523,6 @@ async def analyze_text(data: TextInput):
         # النتائج الأساسية
         ai_percentage = result["ai_percentage"]
         human_percentage = result["human_percentage"]
         ai_words = int(total_words * (ai_percentage / 100))
         # تحليل الفقرات إذا طُلب ذلك
@@ -878,38 +545,15 @@ async def analyze_text(data: TextInput):
                             "ai_generated_score": para_result["ai_percentage"] / 100,
                             "human_written_score": para_result["human_percentage"] / 100,
                             "predicted_model": para_result["predicted_model"]
                         })
                     except Exception as e:
                         logger.warning(f"Failed to analyze paragraph: {e}")
             # إعادة حساب النسب بناءً على الفقرات
             if recalc_total_words > 0:
                 ai_percentage = round((recalc_ai_words / recalc_total_words) * 100, 2)
                 human_percentage = round(100 - ai_percentage, 2)
                 ai_words = int(recalc_ai_words)
         # إنشاء رسالة التغذية الراجعة
         if ai_percentage > 50:
@@ -944,6 +588,9 @@ async def analyze_text(data: TextInput):
             success=False,
             code=500,
             message=f"Analysis failed: {str(e)}",
 @app.post("/analyze-simple")
 async def analyze_simple(data: SimpleTextInput):
     """
@@ -951,6 +598,7 @@ async def analyze_simple(data: SimpleTextInput):
     """
     try:
         text = data.text.strip()
             raise HTTPException(status_code=400, detail="Empty text")
         if not model_manager.models_loaded:
@@ -958,7 +606,6 @@ async def analyze_simple(data: SimpleTextInput):
                 raise HTTPException(status_code=503, detail="Models not available")
         result = model_manager.classify_text(text)
         return {
             "is_ai": result["ai_percentage"] > 50,
@@ -967,10 +614,17 @@ async def analyze_simple(data: SimpleTextInput):
             "detected_model": result["predicted_model"] if result["ai_percentage"] > 50 else None,
             "confidence": max(result["ai_percentage"], result["human_percentage"]),
             "using_fallback": result.get("using_fallback", False)
         }
     except HTTPException:
 if __name__ == "__main__":
     import uvicorn
@@ -989,7 +643,6 @@ if __name__ == "__main__":
         "main:app",  # Assuming this file is named main.py
         host=host,
         port=port,
         workers=workers,
         reload=False  # Set to True for dev
     )

+import os
+import re
+import torch
 import logging
 import gc
 import sys
 from pydantic import BaseModel
 from typing import Dict, List, Optional
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from tokenizers.normalizers import Sequence, Replace, Strip
 from tokenizers import Regex
 from huggingface_hub import hf_hub_download  # Added for reliable HF downloads
 # =====================================================
 # 🔧 تكوين البيئة والإعدادات
+# =====================================================
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+# إعدادات الذاكرة والكاش
 CACHE_DIR = "/tmp/huggingface_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
+# تكوين متغيرات البيئة لـ Hugging Face
 os.environ.update({
     "HF_HOME": CACHE_DIR,
+    "TRANSFORMERS_CACHE": CACHE_DIR,
     "HF_DATASETS_CACHE": CACHE_DIR,
     "HUGGINGFACE_HUB_CACHE": CACHE_DIR,
     "TORCH_HOME": CACHE_DIR,
 })
 # إعدادات PyTorch للذاكرة
+if torch.cuda.is_available():
+    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
+    torch.backends.cudnn.benchmark = True
 # =====================================================
+# 🚀 تحديد الجهاز (GPU أو CPU)
+# =====================================================
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+logger.info(f"🖥️ Using device: {device}")
+if torch.cuda.is_available():
+    logger.info(f"🎮 CUDA Device: {torch.cuda.get_device_name(0)}")
+    logger.info(f"💾 CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
+# =====================================================
+# 📊 خريطة الموديلات
+# =====================================================
+label_mapping = {
+    0: '13B', 1: '30B', 2: '65B', 3: '7B', 4: 'GLM130B', 5: 'bloom_7b',
+    6: 'bloomz', 7: 'cohere', 8: 'davinci', 9: 'dolly', 10: 'dolly-v2-12b',
+    11: 'flan_t5_base', 12: 'flan_t5_large', 13: 'flan_t5_small',
+    14: 'flan_t5_xl', 15: 'flan_t5_xxl', 16: 'gemma-7b-it', 17: 'gemma2-9b-it',
+    18: 'gpt-3.5-turbo', 19: 'gpt-35', 20: 'gpt4', 21: 'gpt4o',
+    22: 'gpt_j', 23: 'gpt_neox', 24: 'human', 25: 'llama3-70b', 26: 'llama3-8b',
+    27: 'mixtral-8x7b', 28: 'opt_1.3b', 29: 'opt_125m', 30: 'opt_13b',
+    31: 'opt_2.7b', 32: 'opt_30b', 33: 'opt_350m', 34: 'opt_6.7b',
+    35: 'opt_iml_30b', 36: 'opt_iml_max_1.3b', 37: 't0_11b', 38: 't0_3b',
+    39: 'text-davinci-002', 40: 'text-davinci-003'
+}
+# =====================================================
+# 🤖 Model Manager - إدارة الموديلات
 # =====================================================
 class ModelManager:
     def __init__(self):
         self.tokenizer = None
         self.models = []
         self.models_loaded = False
         self.model_urls = [
             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12",
             "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
         ]
     def load_tokenizer(self):
         """تحميل الـ Tokenizer مع fallback"""
         try:
             logger.info(f"📝 Loading tokenizer from {self.base_model_id}...")
             self.tokenizer = AutoTokenizer.from_pretrained(
                 self.base_model_id,
                 num_labels=41,
                 cache_dir=CACHE_DIR,
+                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                 low_cpu_mem_usage=True,
                 trust_remote_code=False
             )
                     self.fallback_model_id,
                     num_labels=41,
                     cache_dir=CACHE_DIR,
+                    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                     low_cpu_mem_usage=True,
                     trust_remote_code=False
                 )
                     filename=filename,
                     cache_dir=CACHE_DIR,
                     local_dir_use_symlinks=False
                 )
                 state_dict = torch.load(pt_file, map_location=device, weights_only=True)
                 # تحميل الأوزان فقط إذا لم نكن في وضع fallback (لأن ModernBERT weights قد لا تتوافق مع BERT القياسي)
                 if not self.using_fallback:
         logger.info(f"✅ {model_name} loaded successfully (fallback: {self.using_fallback})")
         return model
+    def load_models(self, max_models=2):
         """تحميل الموديلات بحد أقصى للذاكرة"""
         if self.models_loaded:
             logger.info("✨ Models already loaded")
         # التحقق من نجاح التحميل
         if len(self.models) > 0:
             self.models_loaded = True
             logger.info(f"✅ Successfully loaded {len(self.models)} models (using fallback: {self.using_fallback})")
             return True
         max_len = 512 if not self.using_fallback else 512  # BERT max is 512
         try:
             inputs = self.tokenizer(
                 cleaned_text,
                 return_tensors="pt",
                 truncation=True,
                 max_length=max_len,
                 padding=True
             ).to(device)
         except Exception as e:
             logger.error(f"Tokenization error: {e}")
             raise ValueError(f"Failed to tokenize text: {e}")
             # حساب المتوسط (Soft Voting)
             averaged_probs = torch.mean(torch.stack(all_probabilities), dim=0)
             probabilities = averaged_probs[0]
         # حساب نسب Human vs AI
         human_prob = probabilities[24].item()
         ai_probs = probabilities.clone()
         ai_probs[24] = 0  # إزالة احتمالية Human
         ai_total_prob = ai_probs.sum().item()
         # التطبيع
         total = human_prob + ai_total_prob
         else:
             human_percentage = 50
             ai_percentage = 50
         # تحديد الموديل الأكثر احتمالاً
         ai_model_idx = torch.argmax(ai_probs).item()
         predicted_model = label_mapping.get(ai_model_idx, "Unknown")
         # أعلى 5 تنبؤات
         top_5_probs, top_5_indices = torch.topk(probabilities, 5)
                 "model": label_mapping.get(idx.item(), "Unknown"),
                 "probability": round(prob.item() * 100, 2)
             })
         return {
             "human_percentage": round(human_percentage, 2),
 # =====================================================
 # 🧹 دوال التنظيف والمعالجة
+# =====================================================
+def clean_text(text: str) -> str:
+    """تنظيف النص من المسافات الزائدة"""
+    text = re.sub(r'\s{2,}', ' ', text)
+    text = re.sub(r'\s+([,.;:?!])', r'\1', text)
+    return text.strip()
+def split_into_paragraphs(text: str) -> List[str]:
+    """تقسيم النص إلى فقرات"""
+    paragraphs = re.split(r'\n\s*\n', text.strip())
+    return [p.strip() for p in paragraphs if p.strip()]
+# =====================================================
 # 🌐 FastAPI Application
 # =====================================================
 app = FastAPI(
     title="ModernBERT AI Text Detector",
     description="كشف النصوص المكتوبة بواسطة الذكاء الاصطناعي",
+    version="2.2.0"  # Updated version with UID fix
 )
 # إضافة CORS للسماح بالاستخدام من المتصفح
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
     allow_headers=["*"],
 )
 # =====================================================
 # 📝 نماذج البيانات (Pydantic Models)
+# =====================================================
 class TextInput(BaseModel):
     text: str
     analyze_paragraphs: Optional[bool] = False
 class SimpleTextInput(BaseModel):
     text: str
     success: bool
     code: int
     message: str
+    data: Dict
+# =====================================================
+# 🎯 API Endpoints
+# =====================================================
+@app.on_event("startup")
 async def startup_event():
     """تحميل الموديلات عند بداية التشغيل"""
     logger.info("=" * 50)
     logger.info("=" * 50)
     # محاولة تحميل الموديلات
+    max_models = int(os.environ.get("MAX_MODELS", "2"))
     success = model_manager.load_models(max_models=max_models)
     if success:
+        logger.info("✅ Application ready! (Fallback mode: %s)", model_manager.using_fallback)
     else:
         logger.error("⚠️ Failed to load models - API will return errors")
         logger.info("💡 Tip: Ensure 'transformers>=4.45.0' and 'huggingface_hub' are installed. Run: pip install --upgrade transformers huggingface_hub")
 @app.get("/")
 async def root():
     """الصفحة الرئيسية"""
     return {
         "message": "ModernBERT AI Text Detector API",
         "status": "online" if model_manager.models_loaded else "initializing",
         "models_loaded": len(model_manager.models),
         "using_fallback": model_manager.using_fallback,
         "device": str(device),
         "endpoints": {
             "analyze": "/analyze",
             "simple": "/analyze-simple",
+            "health": "/health",
+            "docs": "/docs"
+        }
+    }
+@app.get("/health")
+async def health_check():
+    """فحص صحة الخدمة"""
+    memory_info = {}
+    if torch.cuda.is_available():
+        memory_info = {
+            "gpu_allocated_gb": round(torch.cuda.memory_allocated() / 1024**3, 2),
+            "gpu_reserved_gb": round(torch.cuda.memory_reserved() / 1024**3, 2)
+        }
     return {
         "status": "healthy" if model_manager.models_loaded else "unhealthy",
         "models_loaded": len(model_manager.models),
         "using_fallback": model_manager.using_fallback,
         "device": str(device),
         "cuda_available": torch.cuda.is_available(),
         "memory_info": memory_info
         # النتائج الأساسية
         ai_percentage = result["ai_percentage"]
         human_percentage = result["human_percentage"]
         ai_words = int(total_words * (ai_percentage / 100))
         # تحليل الفقرات إذا طُلب ذلك
                             "ai_generated_score": para_result["ai_percentage"] / 100,
                             "human_written_score": para_result["human_percentage"] / 100,
                             "predicted_model": para_result["predicted_model"]
                         })
                     except Exception as e:
                         logger.warning(f"Failed to analyze paragraph: {e}")
             # إعادة حساب النسب بناءً على الفقرات
             if recalc_total_words > 0:
                 ai_percentage = round((recalc_ai_words / recalc_total_words) * 100, 2)
                 human_percentage = round(100 - ai_percentage, 2)
                 ai_words = int(recalc_ai_words)
         # إنشاء رسالة التغذية الراجعة
         if ai_percentage > 50:
             success=False,
             code=500,
             message=f"Analysis failed: {str(e)}",
+            data={}
+        )
 @app.post("/analyze-simple")
 async def analyze_simple(data: SimpleTextInput):
     """
     """
     try:
         text = data.text.strip()
+        if not text:
             raise HTTPException(status_code=400, detail="Empty text")
         if not model_manager.models_loaded:
                 raise HTTPException(status_code=503, detail="Models not available")
         result = model_manager.classify_text(text)
         return {
             "is_ai": result["ai_percentage"] > 50,
             "detected_model": result["predicted_model"] if result["ai_percentage"] > 50 else None,
             "confidence": max(result["ai_percentage"], result["human_percentage"]),
             "using_fallback": result.get("using_fallback", False)
         }
     except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Simple analysis error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+# =====================================================
+# 🏃 تشغيل التطبيق
+# =====================================================
 if __name__ == "__main__":
     import uvicorn
         "main:app",  # Assuming this file is named main.py
         host=host,
         port=port,
         workers=workers,
         reload=False  # Set to True for dev
     )