AI_Detector

Running

App Files Community

mahmoudsaber0 committed on Oct 24

Commit

d23c0fb

verified ·

1 Parent(s): 4edb764

Update app.py

Browse files

Files changed (1) hide show

app.py +231 -192

app.py CHANGED Viewed

@@ -200,204 +200,264 @@ class ModelManager:
                 # استخدام hf_hub_download بدلاً من torch.hub للـ HF repos
                 logger.info(f"🌐 Downloading weights from HF repo...")
                 repo_id = "mihalykiss/modernbert_2"
-                filename = model_url.split('/')[-1]  # Extract filename like "Model_groups_3class_seed12"
-                pt_file = hf_hub_download(
                     repo_id=repo_id,
                     filename=filename,
-                    cache_dir=CACHE_DIR,
-                    local_dir_use_symlinks=False
                 )
-                state_dict = torch.load(pt_file, map_location=device, weights_only=True)
-                # تحميل الأوزان فقط إذا لم نكن في وضع fallback (لأن ModernBERT weights قد لا تتوافق مع BERT القياسي)
-                if not self.using_fallback:
-                    base_model.load_state_dict(state_dict, strict=False)
-                    logger.info("✅ Weights loaded successfully")
-                else:
-                    logger.warning("⚠️ Skipping weight load in fallback mode (incompatible architecture)")
-            else:
-                logger.info("📊 Using model with random initialization")
-        except Exception as weight_error:
-            logger.warning(f"⚠️ Could not load weights: {weight_error}")
-            logger.info("📊 Continuing with base model (random or pre-trained init)")
-        # نقل الموديل للجهاز المناسب
-        model = base_model.to(device)
-        model.eval()
-        # تنظيف الذاكرة
-        if 'state_dict' in locals():
-            del state_dict
-        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-        logger.info(f"✅ {model_name} loaded successfully (fallback: {self.using_fallback})")
-        return model
-    def load_models(self, max_models=2):
-        """تحميل الموديلات بحد أقصى للذاكرة"""
         if self.models_loaded:
-            logger.info("✨ Models already loaded")
             return True
-        # تحميل الـ Tokenizer أولاً
-        if not self.load_tokenizer():
-            logger.error("❌ Tokenizer load failed - cannot proceed")
-            return False
-        # تحميل الموديلات
-        logger.info(f"🚀 Loading up to {max_models} models...")
-        # محاولة تحميل الملف المحلي أولاً
-        local_model_path = "modernbert.bin"
-        if os.path.exists(local_model_path):
-            model = self.load_single_model(
-                model_path=local_model_path,
-                model_name="Model 1 (Local)"
-            )
-            if model is not None:
-                self.models.append(model)
-        # تحميل الموديلات من URLs (استخراج filenames)
-        for i, full_url in enumerate(self.model_urls[:max_models - len(self.models)]):
-            if len(self.models) >= max_models:
-                break
-            # استخدام full_url كما هو، لكن في load_single_model نستخرج filename
-            model = self.load_single_model(
-                model_url=full_url,
-                model_name=f"Model {len(self.models) + 1}"
-            )
-            if model is not None:
                 self.models.append(model)
-            # التحقق من الذاكرة المتاحة
-            if torch.cuda.is_available():
-                mem_allocated = torch.cuda.memory_allocated() / 1024**3
-                mem_reserved = torch.cuda.memory_reserved() / 1024**3
-                logger.info(f"💾 GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")
-                # إيقاف التحميل إذا كانت الذاكرة ممتلئة
-                if mem_allocated > 6:  # حد أقصى 6GB
-                    logger.warning("⚠️ Memory limit reached, stopping model loading")
-                    break
-        # التحقق من نجاح التحميل
-        if len(self.models) > 0:
             self.models_loaded = True
-            logger.info(f"✅ Successfully loaded {len(self.models)} models (using fallback: {self.using_fallback})")
             return True
-        else:
-            logger.error("❌ No models could be loaded")
             return False
-    def classify_text(self, text: str) -> Dict:
-        """تحليل النص باستخدام الموديلات المحملة"""
-        if not self.models_loaded or len(self.models) == 0:
-            raise ValueError("No models loaded")
-        # تنظيف النص
-        cleaned_text = clean_text(text)
-        if not cleaned_text.strip():
-            raise ValueError("Empty text after cleaning")
-        # Tokenization (max_length adjusted for fallback BERT if needed)
-        max_len = 512 if not self.using_fallback else 512  # BERT max is 512
         try:
             inputs = self.tokenizer(
-                cleaned_text,
                 return_tensors="pt",
                 truncation=True,
-                max_length=max_len,
                 padding=True
             ).to(device)
-        except Exception as e:
-            logger.error(f"Tokenization error: {e}")
-            raise ValueError(f"Failed to tokenize text: {e}")
-        # الحصول على التنبؤات
-        all_probabilities = []
-        with torch.no_grad():
-            for i, model in enumerate(self.models):
-                try:
-                    logits = model(**inputs).logits
-                    probs = torch.softmax(logits, dim=1)
-                    all_probabilities.append(probs)
-                except Exception as e:
-                    logger.warning(f"Model {i+1} prediction failed: {e}")
-                    continue
-            if not all_probabilities:
-                raise ValueError("All models failed to make predictions")
-            # حساب المتوسط (Soft Voting)
-            averaged_probs = torch.mean(torch.stack(all_probabilities), dim=0)
-            probabilities = averaged_probs[0]
-        # حساب نسب Human vs AI
-        human_prob = probabilities[24].item()
-        ai_probs = probabilities.clone()
-        ai_probs[24] = 0  # إزالة احتمالية Human
-        ai_total_prob = ai_probs.sum().item()
-        # التطبيع
-        total = human_prob + ai_total_prob
-        if total > 0:
-            human_percentage = (human_prob / total) * 100
-            ai_percentage = (ai_total_prob / total) * 100
-        else:
-            human_percentage = 50
-            ai_percentage = 50
-        # تحديد الموديل الأكثر احتمالاً
-        ai_model_idx = torch.argmax(ai_probs).item()
-        predicted_model = label_mapping.get(ai_model_idx, "Unknown")
-        # أعلى 5 تنبؤات
-        top_5_probs, top_5_indices = torch.topk(probabilities, 5)
-        top_5_results = []
-        for prob, idx in zip(top_5_probs, top_5_indices):
-            top_5_results.append({
-                "model": label_mapping.get(idx.item(), "Unknown"),
-                "probability": round(prob.item() * 100, 2)
-            })
         return {
-            "human_percentage": round(human_percentage, 2),
-            "ai_percentage": round(ai_percentage, 2),
-            "predicted_model": predicted_model,
-            "top_5_predictions": top_5_results,
-            "is_human": human_percentage > ai_percentage,
-            "models_used": len(all_probabilities),
-            "using_fallback": self.using_fallback
         }
 # =====================================================
-# 🧹 دوال التنظيف والمعالجة
 # =====================================================
-def clean_text(text: str) -> str:
-    """تنظيف النص من المسافات الزائدة"""
-    text = re.sub(r'\s{2,}', ' ', text)
-    text = re.sub(r'\s+([,.;:?!])', r'\1', text)
-    return text.strip()
-def split_into_paragraphs(text: str) -> List[str]:
     """تقسيم النص إلى فقرات"""
-    paragraphs = re.split(r'\n\s*\n', text.strip())
-    return [p.strip() for p in paragraphs if p.strip()]
 # =====================================================
 # 🌐 FastAPI Application
 # =====================================================
 app = FastAPI(
-    title="ModernBERT AI Text Detector",
-    description="كشف النصوص المكتوبة بواسطة الذكاء الاصطناعي",
-    version="2.2.0"  # Updated version with UID fix
 )
-# إضافة CORS للسماح بالاستخدام من المتصفح
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -406,44 +466,18 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# إنشاء مدير الموديلات
 model_manager = ModelManager()
 # =====================================================
-# 📝 نماذج البيانات (Pydantic Models)
-# =====================================================
-class TextInput(BaseModel):
-    text: str
-    analyze_paragraphs: Optional[bool] = False
-class SimpleTextInput(BaseModel):
-    text: str
-class DetectionResult(BaseModel):
-    success: bool
-    code: int
-    message: str
-    data: Dict
-# =====================================================
-# 🎯 API Endpoints
 # =====================================================
 @app.on_event("startup")
 async def startup_event():
-    """تحميل الموديلات عند بداية التشغيل"""
-    logger.info("=" * 50)
-    logger.info("🚀 Starting ModernBERT AI Detector...")
-    logger.info(f"🐍 Python version: {sys.version}")
-    logger.info(f"🔥 PyTorch version: {torch.__version__}")
-    import transformers
-    logger.info(f"🔧 Transformers version: {transformers.__version__}")
-    logger.info("🛡️ UID Monkey Patch Applied (for Docker/Container)")
-    logger.info("=" * 50)
-    # محاولة تحميل الموديلات
-    max_models = int(os.environ.get("MAX_MODELS", "2"))
-    success = model_manager.load_models(max_models=max_models)
     if success:
         logger.info("✅ Application ready! (Fallback mode: %s)", model_manager.using_fallback)
     else:
@@ -555,13 +589,16 @@ async def analyze_text(data: TextInput):
                 human_percentage = round(100 - ai_percentage, 2)
                 ai_words = int(recalc_ai_words)
         # إنشاء رسالة التغذية الراجعة
         if ai_percentage > 50:
             feedback = "Most of Your Text is AI/GPT Generated"
         else:
             feedback = "Most of Your Text Appears Human-Written"
-        # إرجاع النتائج بنفس تنسيق الكود الأصلي
         return DetectionResult(
             success=True,
             code=200,
@@ -578,7 +615,9 @@ async def analyze_text(data: TextInput):
                 "detected_language": "en",
                 "top_5_predictions": result.get("top_5_predictions", []),
                 "models_used": result.get("models_used", 1),
-                "using_fallback": result.get("using_fallback", False)
             }
         )
@@ -645,4 +684,4 @@ if __name__ == "__main__":
         port=port,
         workers=workers,
         reload=False  # Set to True for dev
-    )

                 # استخدام hf_hub_download بدلاً من torch.hub للـ HF repos
                 logger.info(f"🌐 Downloading weights from HF repo...")
                 repo_id = "mihalykiss/modernbert_2"
+                filename = model_url.split("/")[-1]
+                local_path = hf_hub_download(
                     repo_id=repo_id,
                     filename=filename,
+                    cache_dir=CACHE_DIR
                 )
+                logger.info(f"✅ Downloaded to {local_path}")
+                state_dict = torch.load(local_path, map_location=device, weights_only=True)
+                base_model.load_state_dict(state_dict, strict=False)
+            logger.info(f"✅ {model_name} weights loaded successfully")
+        except Exception as e:
+            logger.warning(f"⚠️ Could not load custom weights for {model_name}: {e}")
+            logger.info("📌 Using base model without fine-tuned weights")
+        # نقل للجهاز وضبط الوضع
+        try:
+            base_model = base_model.to(device)
+            base_model.eval()
+            logger.info(f"✅ {model_name} moved to {device} and set to eval mode")
+            return base_model
+        except Exception as e:
+            logger.error(f"❌ Failed to prepare {model_name}: {e}")
+            return None
+    def load_models(self):
+        """تحميل جميع الموديلات"""
         if self.models_loaded:
             return True
+        try:
+            # تحميل tokenizer
+            if not self.load_tokenizer():
+                return False
+            # تحميل كل موديل
+            for i, model_url in enumerate(self.model_urls):
+                model = self.load_single_model(
+                    model_url=model_url,
+                    model_name=f"Model {i+1}"
+                )
+                if model is None:
+                    logger.warning(f"⚠️ Failed to load model {i+1}")
+                    continue
                 self.models.append(model)
+            if len(self.models) == 0:
+                logger.error("❌ No models loaded successfully")
+                return False
             self.models_loaded = True
+            logger.info(f"✅ Successfully loaded {len(self.models)} model(s)")
             return True
+        except Exception as e:
+            logger.error(f"❌ Model loading error: {e}", exc_info=True)
             return False
+    def classify_text(self, text: str, max_length: int = 512) -> Dict:
+        """تصنيف النص"""
+        if not self.models_loaded or not self.tokenizer:
+            raise RuntimeError("Models or tokenizer not loaded")
         try:
+            # Tokenization
             inputs = self.tokenizer(
+                text,
                 return_tensors="pt",
                 truncation=True,
+                max_length=max_length,
                 padding=True
             ).to(device)
+            # التنبؤ باستخدام جميع الموديلات
+            all_logits = []
+            with torch.no_grad():
+                for model in self.models:
+                    outputs = model(**inputs)
+                    all_logits.append(outputs.logits)
+            # حساب المتوسط
+            avg_logits = torch.mean(torch.stack(all_logits), dim=0)
+            probabilities = torch.nn.functional.softmax(avg_logits, dim=-1)
+            # الحصول على أعلى التنبؤات
+            top_probs, top_indices = torch.topk(probabilities[0], k=5)
+            # حساب احتمالات AI vs Human
+            ai_prob = 1.0 - probabilities[0][24].item()  # 24 = human
+            human_prob = probabilities[0][24].item()
+            # الموديل المتوقع
+            predicted_idx = top_indices[0].item()
+            predicted_model = label_mapping.get(predicted_idx, "unknown")
+            # Top 5 predictions
+            top_5 = [
+                {
+                    "model": label_mapping.get(idx.item(), "unknown"),
+                    "probability": prob.item()
+                }
+                for prob, idx in zip(top_probs, top_indices)
+            ]
+            return {
+                "ai_percentage": round(ai_prob * 100, 2),
+                "human_percentage": round(human_prob * 100, 2),
+                "predicted_model": predicted_model,
+                "top_5_predictions": top_5,
+                "models_used": len(self.models),
+                "using_fallback": self.using_fallback
+            }
+        except Exception as e:
+            logger.error(f"Classification error: {e}", exc_info=True)
+            raise
+# =====================================================
+# 🆕 NEW HELPER FUNCTIONS - Content Cleaning & Splitting
+# =====================================================
+def clean_content_for_analysis(text: str, min_line_length: int = 30) -> str:
+    """
+    Clean content by removing short lines (headlines, etc.)
+    Args:
+        text: Original text
+        min_line_length: Minimum character length for a line to be kept (default: 30)
+    Returns:
+        Cleaned text with only substantial content lines
+    """
+    lines = text.split('\n')
+    cleaned_lines = []
+    for line in lines:
+        stripped = line.strip()
+        # Keep lines that are longer than min_line_length
+        if len(stripped) >= min_line_length:
+            cleaned_lines.append(stripped)
+    return ' '.join(cleaned_lines)
+def split_content_in_half(text: str) -> tuple:
+    """
+    Split cleaned content into two halves
+    Args:
+        text: Cleaned text
+    Returns:
+        Tuple of (first_half, second_half)
+    """
+    words = text.split()
+    mid_point = len(words) // 2
+    first_half = ' '.join(words[:mid_point])
+    second_half = ' '.join(words[mid_point:])
+    return first_half, second_half
+def analyze_content_halves(model_manager, text: str) -> Dict:
+    """
+    Analyze text by splitting it into two halves after cleaning
+    Args:
+        model_manager: The ModelManager instance
+        text: Original text to analyze
+    Returns:
+        Dictionary with analysis of both halves
+    """
+    try:
+        # Clean the content first
+        cleaned_text = clean_content_for_analysis(text)
+        if not cleaned_text or len(cleaned_text.split()) < 10:
+            return {
+                "halves_analysis_available": False,
+                "reason": "Content too short after cleaning"
+            }
+        # Split into halves
+        first_half, second_half = split_content_in_half(cleaned_text)
+        # Analyze first half
+        first_half_result = model_manager.classify_text(first_half)
+        first_half_words = len(first_half.split())
+        # Analyze second half
+        second_half_result = model_manager.classify_text(second_half)
+        second_half_words = len(second_half.split())
         return {
+            "halves_analysis_available": True,
+            "cleaned_content": {
+                "total_words": len(cleaned_text.split()),
+                "first_half_words": first_half_words,
+                "second_half_words": second_half_words
+            },
+            "first_half": {
+                "ai_percentage": first_half_result["ai_percentage"],
+                "human_percentage": first_half_result["human_percentage"],
+                "predicted_model": first_half_result["predicted_model"],
+                "word_count": first_half_words,
+                "preview": first_half[:200] + "..." if len(first_half) > 200 else first_half
+            },
+            "second_half": {
+                "ai_percentage": second_half_result["ai_percentage"],
+                "human_percentage": second_half_result["human_percentage"],
+                "predicted_model": second_half_result["predicted_model"],
+                "word_count": second_half_words,
+                "preview": second_half[:200] + "..." if len(second_half) > 200 else second_half
+            }
+        }
+    except Exception as e:
+        logger.error(f"Error in halves analysis: {e}", exc_info=True)
+        return {
+            "halves_analysis_available": False,
+            "error": str(e)
         }
 # =====================================================
+# 📝 Pydantic Models
 # =====================================================
+class TextInput(BaseModel):
+    text: str
+    analyze_paragraphs: bool = False
+class SimpleTextInput(BaseModel):
+    text: str
+class DetectionResult(BaseModel):
+    success: bool
+    code: int
+    message: str
+    data: Dict
+# =====================================================
+# 🔧 مساعدات
+# =====================================================
+def split_into_paragraphs(text: str, min_length: int = 100) -> List[str]:
     """تقسيم النص إلى فقرات"""
+    paragraphs = re.split(r'\n\s*\n', text)
+    return [p.strip() for p in paragraphs if len(p.strip()) >= min_length]
 # =====================================================
 # 🌐 FastAPI Application
 # =====================================================
 app = FastAPI(
+    title="ModernBERT AI Text Detector API",
+    description="API for detecting AI-generated text using ModernBERT",
+    version="2.0.0"
 )
+# CORS
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
+# Model Manager Instance
 model_manager = ModelManager()
 # =====================================================
+# 🚀 Startup Event
 # =====================================================
 @app.on_event("startup")
 async def startup_event():
+    """تحميل الموديلات عند بدء التطبيق"""
+    logger.info("🚀 Starting application...")
+    logger.info("📦 Loading models...")
+    success = model_manager.load_models()
     if success:
         logger.info("✅ Application ready! (Fallback mode: %s)", model_manager.using_fallback)
     else:
                 human_percentage = round(100 - ai_percentage, 2)
                 ai_words = int(recalc_ai_words)
+        # 🆕 NEW FEATURE: Analyze content by halves
+        halves_analysis = analyze_content_halves(model_manager, text)
         # إنشاء رسالة التغذية الراجعة
         if ai_percentage > 50:
             feedback = "Most of Your Text is AI/GPT Generated"
         else:
             feedback = "Most of Your Text Appears Human-Written"
+        # إرجاع النتائج بنفس تنسيق الكود الأصلي + إضافة تحليل النصفين
         return DetectionResult(
             success=True,
             code=200,
                 "detected_language": "en",
                 "top_5_predictions": result.get("top_5_predictions", []),
                 "models_used": result.get("models_used", 1),
+                "using_fallback": result.get("using_fallback", False),
+                # 🆕 NEW: Halves analysis appended to response
+                "halves_analysis": halves_analysis
             }
         )
         port=port,
         workers=workers,
         reload=False  # Set to True for dev
+    )