Spaces:

chaos4455
/

CySecBERT-IT-Event-Triage-Classification

Runtime error

CySecBERT-IT-Event-Triage-Classification

File size: 20,936 Bytes

import os
import random
import sqlite3
import numpy as np
import joblib # Para carregar os modelos salvos
import streamlit as st # Para a interface de usuário
import matplotlib.pyplot as plt # Para plotagem
import torch # Necessário para SentenceTransformer, mesmo que não explícito

# Importe SentenceTransformer para embeddings otimizados
from sentence_transformers import SentenceTransformer 

# --- Initial configuration ---
# File name and table name of the pre-generated database.
# NOTE(review): neither constant is referenced by the code visible in this file.
DB_NAME = "training_data_large.db"
TABLE_NAME = "events"

# Model identifier handed to SentenceTransformer when the encoder is loaded.
MODEL_NAME = "markusbayer/CySecBERT"

RANDOM_SEED = 42
# Final scores at or above this value are reported as "Risco".
RISK_THRESHOLD = 50.0

# --- Global seeding (Python, NumPy and torch RNGs share the same seed) ---
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

# --- Module-level slots for the models and tools ---
# They start out empty and are filled once the loader has run, so the
# inference function can read them as globals.
model_base = mlp_regressor = scaler = tfidf_vectorizer = tfidf_regressor = None

# --- Keyword vocabulary for head 3 (rule-based scoring) ---
# Each entry maps a lowercase keyword or phrase to the number of points it adds
# to the keyword score when it is found in the event text.
HIGH_RISK_KEYWORDS = {
    'failed': 15,
    'unauthorized': 20,
    'invalid': 15,
    'blocked': 25,
    'mfa_failed': 30,
    'brute_force': 40,
    'attack': 40,
    'threat': 30,
    'compromise': 30,
    'malicious': 35,
    'lockout': 25,
    'critical': 20,
    'urgent': 20,
    'severe': 25,
    'breach': 40,
    'exfiltration': 40,
    'injection': 35,
    'malware': 35,
    'vulnerability': 25,
    'exploit': 30,
    'lateral movement': 40,
    'dns tunneling': 35,
    'obfuscated': 25,
    'anomaly': 20,
    'misconfigured': 30,
    'ransomware': 50,
    'phishing': 45,
    'insider threat': 40,
    'zero-day': 50,
    'unauthorized access': 35,
    'data integrity': 30,
    'compromised credential': 40,
    'vulnerable library': 30,
    'sql injection': 35,
    'privilege escalation': 45,
}

# Same structure with negative values: these entries lower the keyword score.
LOW_RISK_KEYWORDS = {
    'success': -20,
    'successful': -20,
    'normal': -15,
    'routine': -15,
    'authorized': -10,
    'benign': -15,
    'secure': -10,
    'safe': -15,
    'approved': -10,
    'expected': -5,
    'completed': -10,
    'scan completed': -25,
    'validated': -15,
    'patched': -20,
    'renewed': -15,
    'posture confirmed': -30,
    'performance improved': -10,
    'functionality rolled out': -10,
    'resources optimized': -15,
    'backup completed': -20,
    'schema migration successful': -15,
    'network policy updated': -10,
}

# Vocabulary lists for the "Gerar Evento Aleatório" button.
# One item is drawn from each list and the four pieces are joined into a
# sentence of the form "Audit log: <actor> <action> <target>. Status: <outcome>."

# Sentence subjects: who or what performed the action.
ADVERSARIAL_RISK_ACTORS = [
    "Unsandboxed process", "Leaked API key", "Misconfigured service account", "Shadow IT application", 
    "Dormant user account", "Ransomware payload", "Phishing attempt", "Insider threat", 
    "Zero-day exploit", "Malicious actor", "Compromised credential", "Vulnerable third-party library",
    "Compromised Kubernetes pod", "Malicious Docker container", "AWS IAM role escalation", 
    "Azure AD privilege escalation", "GCP service account abuse", "Container escape attempt",
    "Serverless function injection", "Cloud storage bucket enumeration", "API gateway bypass",
    "Microservice lateral movement", "Container registry poisoning", "Cloud metadata exploitation",
    "CI/CD pipeline compromise", "Git repository poisoning", "Build artifact tampering",
    "Deployment script injection", "Infrastructure as Code attack", "Secret scanning bypass",
    "Dependency confusion attack", "Supply chain compromise", "Code signing certificate theft",
    "Pipeline privilege escalation", "Artifact repository poisoning", "Build environment escape",
    "Compromised IoT device", "Edge computing exploit", "Industrial control system breach",
    "SCADA system compromise", "Smart city infrastructure attack", "Medical device exploitation",
    "Automotive system breach", "Home automation compromise", "Sensor data manipulation",
    "Edge gateway exploitation", "Industrial protocol abuse", "IoT botnet recruitment",
    "Mobile app sandbox escape", "iOS jailbreak exploitation", "Android rootkit installation",
    "Mobile banking trojan", "Enterprise device compromise", "BYOD policy violation",
    "Mobile device management bypass", "App store poisoning", "Mobile certificate pinning bypass",
    "Endpoint detection evasion", "Mobile phishing campaign", "Device fingerprinting abuse",
    "Network segmentation bypass", "Firewall rule manipulation", "VPN tunnel exploitation",
    "DNS hijacking attempt", "BGP route hijacking", "Network protocol abuse",
    "Wireless network compromise", "Bluetooth attack vector", "NFC exploitation",
    "Network monitoring evasion", "Traffic analysis bypass", "Protocol fuzzing attack"
]
# Verb phrases placed between the actor and the target.
ADVERSARIAL_RISK_ACTIONS = [
    "attempted lateral movement via", "initiated a DNS tunneling request to", 
    "executed a living-off-the-land binary on", "was flagged for unusual API call patterns against", 
    "triggered a data access anomaly in", "exfiltrated data from", "modified critical system files in", 
    "gained unauthorized access to", "deployed malicious code on", "brute-forced login for", 
    "injected SQL into", "exploited a vulnerability in",
    "attempted container escape from", "escalated privileges in Kubernetes cluster", 
    "abused IAM role permissions for", "enumerated cloud storage buckets through",
    "bypassed API gateway authentication to", "injected malicious code into serverless function",
    "compromised container registry access for", "exploited cloud metadata service to",
    "performed lateral movement across microservices in", "poisoned container image in",
    "abused cloud resource tagging for", "exploited cloud logging service to",
    "compromised CI/CD pipeline to", "injected malicious code into build process for",
    "poisoned dependency repository to", "tampered with build artifacts in",
    "escalated privileges in deployment pipeline for", "bypassed security scanning in",
    "abused infrastructure automation to", "compromised secret management system for",
    "injected malicious code into deployment scripts for", "exploited build environment to",
    "abused artifact signing process for", "compromised code repository access to",
    "compromised IoT device firmware to", "exploited edge computing vulnerability in",
    "breached industrial control system through", "manipulated sensor data from",
    "exploited SCADA system vulnerability to", "compromised smart city infrastructure via",
    "abused industrial protocol to", "exploited edge gateway vulnerability in",
    "recruited device into botnet through", "compromised medical device firmware to",
    "exploited automotive system vulnerability in", "breached home automation system via",
    "escaped mobile app sandbox to", "exploited iOS jailbreak vulnerability in",
    "installed rootkit on Android device to", "compromised enterprise mobile device through",
    "bypassed mobile device management to", "poisoned mobile app store listing for",
    "exploited mobile certificate pinning in", "compromised mobile banking app through",
    "abused device fingerprinting to", "exploited mobile phishing vulnerability in",
    "breached BYOD policy through", "compromised mobile endpoint security via",
    "bypassed network segmentation to", "manipulated firewall rules for",
    "exploited VPN tunnel vulnerability in", "hijacked DNS resolution for",
    "abused BGP routing protocol to", "compromised wireless network through",
    "exploited Bluetooth vulnerability in", "abused NFC communication to",
    "evaded network monitoring through", "bypassed traffic analysis via",
    "exploited network protocol vulnerability in", "compromised network infrastructure through"
]
# Objects of the action: the system or asset that was targeted.
ADVERSARIAL_RISK_TARGETS = [
    "a code repository", "the CI/CD pipeline", "a cloud storage bucket", "the internal DNS server", 
    "the virtual machine hypervisor", "sensitive customer data", "financial databases", 
    "intellectual property servers", "critical infrastructure controls", "user authentication service", 
    "production web server", "database backup storage",
    "Kubernetes cluster control plane", "Docker container registry", "AWS S3 bucket with sensitive data",
    "Azure Active Directory tenant", "GCP Cloud Storage bucket", "container orchestration system",
    "serverless function environment", "cloud API gateway", "microservice mesh network",
    "container security scanning service", "cloud logging and monitoring system", "infrastructure as code repository",
    "Git repository with production secrets", "Jenkins build pipeline", "Docker image registry",
    "artifact repository with signed packages", "infrastructure provisioning system", "secret management vault",
    "code signing certificate store", "dependency management system", "deployment automation platform",
    "build environment with elevated privileges", "CI/CD security scanning tools", "infrastructure monitoring system",
    "industrial control system network", "SCADA system database", "IoT device management platform",
    "edge computing gateway", "smart city infrastructure", "medical device network",
    "automotive system bus", "home automation hub", "sensor data collection system",
    "industrial protocol gateway", "edge security monitoring system", "IoT device firmware repository",
    "enterprise mobile device fleet", "mobile app store backend", "mobile device management system",
    "mobile banking infrastructure", "mobile certificate authority", "mobile security scanning service",
    "BYOD policy enforcement system", "mobile endpoint detection system", "mobile app security testing platform",
    "mobile device fingerprinting database", "mobile phishing detection system", "mobile app code signing service",
    "network segmentation firewall", "VPN concentrator", "DNS authoritative server",
    "BGP route reflector", "wireless access point controller", "network monitoring system",
    "traffic analysis platform", "network security scanning tool", "protocol analysis system",
    "network infrastructure management", "security information system", "network forensics platform"
]
# Clauses appended after "Status:" describing what happened as a result.
ADVERSARIAL_RISK_OUTCOMES = [
    "the action was obfuscated", "a low-and-slow data transfer was detected", 
    "the process terminated abnormally after execution", "security controls were temporarily disabled", 
    "alert thresholds were bypassed", "data integrity was compromised", "system uptime was impacted", 
    "a backdoor was established", "a privilege escalation was achieved", "system resources were depleted", 
    "data encryption initiated",
    "container escape was successful", "Kubernetes RBAC was bypassed", "cloud IAM policies were circumvented",
    "container registry was compromised", "serverless function was weaponized", "cloud logging was manipulated",
    "microservice communication was intercepted", "container security scanning was evaded",
    "cloud resource tagging was abused", "container orchestration was compromised",
    "cloud metadata service was exploited", "container networking was hijacked",
    "build pipeline was compromised", "dependency repository was poisoned", "artifact signing was bypassed",
    "infrastructure automation was weaponized", "secret management was breached", "code repository was compromised",
    "deployment process was hijacked", "build environment was escaped", "CI/CD security was bypassed",
    "infrastructure monitoring was disabled", "artifact integrity was compromised", "deployment approval was bypassed",
    "IoT device was recruited into botnet", "industrial control system was compromised", "edge gateway was breached",
    "sensor data was manipulated", "SCADA system was taken offline", "smart city infrastructure was disrupted",
    "medical device was compromised", "automotive system was hijacked", "home automation was breached",
    "industrial protocol was abused", "edge security was bypassed", "IoT device firmware was modified",
    "mobile device was rooted/jailbroken", "enterprise mobile security was bypassed", "mobile app was compromised",
    "mobile device management was evaded", "mobile banking was breached", "mobile certificate pinning was bypassed",
    "BYOD policy was violated", "mobile endpoint detection was evaded", "mobile app store was poisoned",
    "mobile device fingerprinting was spoofed", "mobile phishing was successful", "mobile security scanning was bypassed",
    "network segmentation was bypassed", "firewall rules were manipulated", "VPN tunnel was compromised",
    "DNS resolution was hijacked", "BGP routing was manipulated", "wireless network was compromised",
    "Bluetooth security was bypassed", "NFC communication was intercepted", "network monitoring was evaded",
    "traffic analysis was bypassed", "network protocol was abused", "network infrastructure was compromised"
]


# --- Funções para Inferência ---

# Streamlit resource cache: the heavy objects are built on the first call and
# the same instances are handed back on later script reruns.
@st.cache_resource
def load_all_models_and_tokenizer():
    """
    Load the SentenceTransformer encoder and the four serialized estimators.

    Returns:
        A 5-tuple, in this order: the SentenceTransformer built from
        ``MODEL_NAME``, then the objects stored in ``mlp_regressor.joblib``,
        ``scaler.joblib``, ``tfidf_vectorizer.joblib`` and
        ``tfidf_regressor.joblib``. The joblib paths are relative, so they are
        resolved against the current working directory.
    """
    st.info("Cargando SentenceTransformer (CySecBERT)...")
    # NOTE(review): presumably downloads the weights when they are not cached
    # locally -- confirm against the sentence-transformers documentation.
    encoder = SentenceTransformer(MODEL_NAME)

    st.info("Cargando modelos de scikit-learn treinados...")
    artifact_files = (
        "mlp_regressor.joblib",
        "scaler.joblib",
        "tfidf_vectorizer.joblib",
        "tfidf_regressor.joblib",
    )
    # Loaded strictly in the order above; the unpacking below relies on it.
    regressor, feature_scaler, vectorizer, tfidf_model = [
        joblib.load(file_name) for file_name in artifact_files
    ]

    st.success("Modelos carregados com sucesso!")
    return encoder, regressor, feature_scaler, vectorizer, tfidf_model

def _contains_keyword(text: str, keyword: str) -> bool:
    """Return True when *keyword* occurs in *text* and is not preceded by a letter.

    Suffixes are still accepted ("exploit" matches "exploited"), and so are
    non-letter prefixes ("failed" matches "mfa_failed"). A letter immediately
    before the match rejects it, so "authorized" does not match inside
    "unauthorized" and "normal" does not match inside "abnormally".
    """
    start = text.find(keyword)
    while start != -1:
        if start == 0 or not text[start - 1].isalpha():
            return True
        start = text.find(keyword, start + 1)
    return False


def score_keywords(event_text: str, high_risk=None, low_risk=None) -> float:
    """Compute the rule-based keyword score (head 3) for one event text.

    The score starts at 50.0, every keyword found in the lowercased text adds
    its (positive or negative) value once, and the result is clipped to
    [0, 100].

    Args:
        event_text: Raw event description.
        high_risk: Mapping keyword -> points; defaults to ``HIGH_RISK_KEYWORDS``.
        low_risk: Mapping keyword -> points; defaults to ``LOW_RISK_KEYWORDS``.

    Returns:
        The clipped keyword score as a float.
    """
    # Resolved at call time so the module-level tables are only needed when
    # the caller does not supply its own.
    high_table = HIGH_RISK_KEYWORDS if high_risk is None else high_risk
    low_table = LOW_RISK_KEYWORDS if low_risk is None else low_risk

    text_lower = event_text.lower()
    score = 50.0  # neutral base score
    for table in (high_table, low_table):
        for keyword, value in table.items():
            if _contains_keyword(text_lower, keyword):
                score += value
    return float(np.clip(score, 0, 100))


def classify_and_score_risk(event_text: str) -> tuple[float, dict]:
    """
    Score a single security event by combining three independent heads.

    Head 1 encodes the text with ``model_base``, scales the embedding with
    ``scaler`` and predicts with ``mlp_regressor``. Head 2 vectorizes the text
    with ``tfidf_vectorizer`` and predicts with ``tfidf_regressor``. Head 3 is
    the rule-based keyword score. The final score is the weighted sum
    0.5 * head 1 + 0.3 * head 2 + 0.2 * head 3, clipped to [0, 100].

    Args:
        event_text: Raw event description.

    Returns:
        ``(final_score, individual_scores)`` where ``individual_scores`` has
        the keys ``'embedding_score'``, ``'tfidf_score'`` and
        ``'keyword_score'``, each clipped to [0, 100].

    Raises:
        RuntimeError: If any of the module-level models is still ``None``.
    """
    # The models are only read here, so no ``global`` declaration is required.
    required = (model_base, mlp_regressor, scaler, tfidf_vectorizer, tfidf_regressor)
    if any(component is None for component in required):
        raise RuntimeError(
            "Models are not loaded; call load_all_models_and_tokenizer() first."
        )

    # Head 1: deep embedding (CySecBERT + MLP).
    # encode() is given a one-element batch; the single vector is reshaped to
    # 2D (1, n_features) before it is passed to the scaler.
    embedding = model_base.encode([event_text], convert_to_numpy=True, show_progress_bar=False)[0]
    embedding_scaled = scaler.transform(embedding.reshape(1, -1))
    score_embedding = mlp_regressor.predict(embedding_scaled)[0]

    # Head 2: classic vector model (TF-IDF + regressor).
    tfidf_vector = tfidf_vectorizer.transform([event_text])
    score_tfidf = tfidf_regressor.predict(tfidf_vector)[0]

    # Head 3: rule-based keyword analysis, already clipped to [0, 100].
    score_keyword = score_keywords(event_text)

    # Weighted combination. The raw (unclipped) regressor outputs are used
    # here; only the final sum and the reported per-head values are clipped.
    weights = {'embedding': 0.5, 'tfidf': 0.3, 'keyword': 0.2}
    final_score = (score_embedding * weights['embedding'] +
                   score_tfidf * weights['tfidf'] +
                   score_keyword * weights['keyword'])
    final_score = np.clip(final_score, 0, 100)

    individual_scores = {
        'embedding_score': float(np.clip(score_embedding, 0, 100)),
        'tfidf_score': float(np.clip(score_tfidf, 0, 100)),
        'keyword_score': score_keyword
    }
    return float(final_score), individual_scores

# --- Streamlit interface ---
# Page-level settings are collected in one mapping and passed on as keywords.
page_settings = {
    "page_title": "Detecção de Risco de Segurança Ciber",
    "page_icon": "🛡️",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**page_settings)

st.title("🛡️ Sistema de Detecção de Risco de Segurança Ciber")
st.markdown("---")

# Sidebar: short description of the three scoring heads.
with st.sidebar:
    st.header("Sobre")
    st.write("""
Este aplicativo analisa logs de eventos de segurança usando um modelo híbrido com:
- **Embedding Profundo:** CySecBERT + MLP Regressor
- **Vetorial Clássico:** TF-IDF + Ridge Regressor
- **Regras Heurísticas:** Análise de palavras-chave
""")
    st.info("Desenvolvido para demonstração no Hugging Face Spaces.")

# Fill the module-level model slots at startup.
# The loader's status messages are drawn inside a throw-away placeholder so
# they can be wiped from the page once loading has finished.
loading_message_placeholder = st.empty()
with loading_message_placeholder.container():
    loaded_components = load_all_models_and_tokenizer()
(
    model_base,
    mlp_regressor,
    scaler,
    tfidf_vectorizer,
    tfidf_regressor,
) = loaded_components
loading_message_placeholder.empty()  # clear the loading messages

st.subheader("Insira o Evento de Segurança para Análise:")
# The text area is bound to st.session_state["event_text_input"] through its
# key, so the default text is seeded once per session, before the widget is
# created.
if 'event_text_input' not in st.session_state:
    st.session_state.event_text_input = "Audit log: Unsandboxed process attempted lateral movement via a code repository. Status: the action was obfuscated."


def _fill_random_risk_event() -> None:
    """Button callback: put a randomly composed high-risk event in the text area.

    Streamlit runs callbacks before the next script run, i.e. before the text
    area is instantiated again, which is the only point where a widget-bound
    session-state key may be written. Assigning the key after the widget was
    created raises StreamlitAPIException.
    """
    # The module reseeds the global ``random`` generator on every script run,
    # which would make every click yield the same event. A fresh, unseeded
    # generator keeps the demo events varied and leaves the global RNG state
    # untouched.
    rng = random.Random()
    actor = rng.choice(ADVERSARIAL_RISK_ACTORS)
    action = rng.choice(ADVERSARIAL_RISK_ACTIONS)
    target = rng.choice(ADVERSARIAL_RISK_TARGETS)
    outcome = rng.choice(ADVERSARIAL_RISK_OUTCOMES)
    st.session_state.event_text_input = f"Audit log: {actor} {action} {target}. Status: {outcome}."


# No ``value=`` argument: the widget takes its content from session state via
# ``key``; passing both triggers a Streamlit warning about conflicting sources.
event_text = st.text_area(
    "Descrição do Evento",
    height=200,
    key="event_text_input"
)

# "Generate random event (risk)" button. The click itself already triggers a
# rerun after the callback, so no explicit rerun call is needed
# (st.experimental_rerun is deprecated in favour of st.rerun).
st.button(
    "Gerar Evento Aleatório (Risco)",
    help="Gera um exemplo de evento de alto risco para demonstração.",
    on_click=_fill_random_risk_event,
)

# "Analyze risk" button: score the current text and render the result.
if st.button("Analisar Risco", type="primary"):
    if event_text.strip():
        # All result elements live in one placeholder container; the spinner
        # stays visible while the block below runs.
        results_placeholder = st.empty()
        with results_placeholder.container(), st.spinner("Analisando o evento..."):
            final_score, individual_scores = classify_and_score_risk(event_text)

            is_risky = final_score >= RISK_THRESHOLD
            prediction = "Risco" if is_risky else "Seguro"

            # Confidence: distance from the midpoint 50, doubled and clamped
            # to the 0-100 range.
            confidence = min(100, max(0, round(abs(final_score - 50) * 2)))

            st.markdown("---")
            st.subheader("📊 Resultado da Análise:")

            # Red banner for "Risco", green banner for "Seguro".
            show_verdict = st.error if is_risky else st.success
            show_verdict(f"**PREDIÇÃO:** {prediction} (Score: {final_score:.2f})")
            st.info(f"**CONFIANÇA:** {confidence}%")

            st.markdown("---")
            st.subheader("Scores por Componente (0-100):")
            metric_specs = (
                ("Embedding Score", 'embedding_score'),
                ("TF-IDF Score", 'tfidf_score'),
                ("Keyword Score", 'keyword_score'),
            )
            for column, (title, score_key) in zip(st.columns(3), metric_specs):
                column.metric(title, f"{individual_scores[score_key]:.2f}")

            st.markdown("---")
            st.subheader("📈 Visualização dos Scores:")

            # One (label, value, colour) triple per bar, in display order.
            bar_specs = [
                ('Embedding', individual_scores['embedding_score'], 'skyblue'),
                ('TF-IDF', individual_scores['tfidf_score'], 'lightcoral'),
                ('Keyword', individual_scores['keyword_score'], 'lightgreen'),
                ('Final', final_score, 'darkblue'),
            ]
            labels, scores, colors = (list(series) for series in zip(*bar_specs))

            fig, ax = plt.subplots(figsize=(10, 5))
            ax.bar(labels, scores, color=colors)
            ax.axhline(RISK_THRESHOLD, color='red', linestyle='--', label=f'Limiar de Risco ({RISK_THRESHOLD:.2f})')
            ax.set_ylim(0, 100)
            ax.set_ylabel("Score de Risco")
            ax.set_title("Scores Componentes e Final do Evento")
            ax.legend()
            st.pyplot(fig)  # render the Matplotlib figure
            plt.close(fig)  # close the figure to free its memory

            st.markdown("---")
            st.markdown("Para uma nova análise, modifique o texto do evento acima ou gere um aleatório e clique em 'Analisar Risco' novamente.")
    else:
        st.warning("Por favor, insira um texto de evento para análise.")