Spaces:
Runtime error
Runtime error
import os
import random
import re
import sqlite3

import joblib  # Loads the persisted scikit-learn models
import matplotlib.pyplot as plt  # Plotting
import numpy as np
import streamlit as st  # User interface
import torch  # Required by SentenceTransformer, even when not used explicitly
# SentenceTransformer provides the optimized sentence embeddings
from sentence_transformers import SentenceTransformer
# --- Initial configuration ---
# DB_NAME and TABLE_NAME identify where the pre-generated database lives.
DB_NAME, TABLE_NAME = "training_data_large.db", "events"
MODEL_NAME = "markusbayer/CySecBERT"
RANDOM_SEED = 42
RISK_THRESHOLD = 50.0

# --- Global seeding (keeps inference reproducible if any step is stochastic) ---
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    # Seed every visible CUDA device as well.
    torch.cuda.manual_seed_all(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
# --- Module-level handles for the models and tools ---
# They start out empty and are rebound once the artifacts have been loaded,
# so the inference functions can reach them as globals.
model_base = mlp_regressor = scaler = tfidf_vectorizer = tfidf_regressor = None
# --- Keyword vocabulary for head 3 (rule-based scoring) ---
# Each entry maps a lowercase keyword/phrase to the number of points it adds
# to (positive) or removes from (negative) the keyword score.
HIGH_RISK_KEYWORDS = {
    "failed": 15,
    "unauthorized": 20,
    "invalid": 15,
    "blocked": 25,
    "mfa_failed": 30,
    "brute_force": 40,
    "attack": 40,
    "threat": 30,
    "compromise": 30,
    "malicious": 35,
    "lockout": 25,
    "critical": 20,
    "urgent": 20,
    "severe": 25,
    "breach": 40,
    "exfiltration": 40,
    "injection": 35,
    "malware": 35,
    "vulnerability": 25,
    "exploit": 30,
    "lateral movement": 40,
    "dns tunneling": 35,
    "obfuscated": 25,
    "anomaly": 20,
    "misconfigured": 30,
    "ransomware": 50,
    "phishing": 45,
    "insider threat": 40,
    "zero-day": 50,
    "unauthorized access": 35,
    "data integrity": 30,
    "compromised credential": 40,
    "vulnerable library": 30,
    "sql injection": 35,
    "privilege escalation": 45,
}

LOW_RISK_KEYWORDS = {
    "success": -20,
    "successful": -20,
    "normal": -15,
    "routine": -15,
    "authorized": -10,
    "benign": -15,
    "secure": -10,
    "safe": -15,
    "approved": -10,
    "expected": -5,
    "completed": -10,
    "scan completed": -25,
    "validated": -15,
    "patched": -20,
    "renewed": -15,
    "posture confirmed": -30,
    "performance improved": -10,
    "functionality rolled out": -10,
    "resources optimized": -15,
    "backup completed": -20,
    "schema migration successful": -15,
    "network policy updated": -10,
}
# Vocabulary lists used by the "Gerar Evento Aleatório" button to assemble
# synthetic high-risk events of the form "<actor> <action> <target>. Status: <outcome>."
ADVERSARIAL_RISK_ACTORS = [
    # General
    "Unsandboxed process",
    "Leaked API key",
    "Misconfigured service account",
    "Shadow IT application",
    "Dormant user account",
    "Ransomware payload",
    "Phishing attempt",
    "Insider threat",
    "Zero-day exploit",
    "Malicious actor",
    "Compromised credential",
    "Vulnerable third-party library",
    # Cloud and containers
    "Compromised Kubernetes pod",
    "Malicious Docker container",
    "AWS IAM role escalation",
    "Azure AD privilege escalation",
    "GCP service account abuse",
    "Container escape attempt",
    "Serverless function injection",
    "Cloud storage bucket enumeration",
    "API gateway bypass",
    "Microservice lateral movement",
    "Container registry poisoning",
    "Cloud metadata exploitation",
    # CI/CD and supply chain
    "CI/CD pipeline compromise",
    "Git repository poisoning",
    "Build artifact tampering",
    "Deployment script injection",
    "Infrastructure as Code attack",
    "Secret scanning bypass",
    "Dependency confusion attack",
    "Supply chain compromise",
    "Code signing certificate theft",
    "Pipeline privilege escalation",
    "Artifact repository poisoning",
    "Build environment escape",
    # IoT, edge and industrial systems
    "Compromised IoT device",
    "Edge computing exploit",
    "Industrial control system breach",
    "SCADA system compromise",
    "Smart city infrastructure attack",
    "Medical device exploitation",
    "Automotive system breach",
    "Home automation compromise",
    "Sensor data manipulation",
    "Edge gateway exploitation",
    "Industrial protocol abuse",
    "IoT botnet recruitment",
    # Mobile and endpoint
    "Mobile app sandbox escape",
    "iOS jailbreak exploitation",
    "Android rootkit installation",
    "Mobile banking trojan",
    "Enterprise device compromise",
    "BYOD policy violation",
    "Mobile device management bypass",
    "App store poisoning",
    "Mobile certificate pinning bypass",
    "Endpoint detection evasion",
    "Mobile phishing campaign",
    "Device fingerprinting abuse",
    # Network
    "Network segmentation bypass",
    "Firewall rule manipulation",
    "VPN tunnel exploitation",
    "DNS hijacking attempt",
    "BGP route hijacking",
    "Network protocol abuse",
    "Wireless network compromise",
    "Bluetooth attack vector",
    "NFC exploitation",
    "Network monitoring evasion",
    "Traffic analysis bypass",
    "Protocol fuzzing attack",
]
# Verb phrases placed between the actor and the target of a generated event.
ADVERSARIAL_RISK_ACTIONS = [
    # General
    "attempted lateral movement via",
    "initiated a DNS tunneling request to",
    "executed a living-off-the-land binary on",
    "was flagged for unusual API call patterns against",
    "triggered a data access anomaly in",
    "exfiltrated data from",
    "modified critical system files in",
    "gained unauthorized access to",
    "deployed malicious code on",
    "brute-forced login for",
    "injected SQL into",
    "exploited a vulnerability in",
    # Cloud and containers
    "attempted container escape from",
    "escalated privileges in Kubernetes cluster",
    "abused IAM role permissions for",
    "enumerated cloud storage buckets through",
    "bypassed API gateway authentication to",
    "injected malicious code into serverless function",
    "compromised container registry access for",
    "exploited cloud metadata service to",
    "performed lateral movement across microservices in",
    "poisoned container image in",
    "abused cloud resource tagging for",
    "exploited cloud logging service to",
    # CI/CD and supply chain
    "compromised CI/CD pipeline to",
    "injected malicious code into build process for",
    "poisoned dependency repository to",
    "tampered with build artifacts in",
    "escalated privileges in deployment pipeline for",
    "bypassed security scanning in",
    "abused infrastructure automation to",
    "compromised secret management system for",
    "injected malicious code into deployment scripts for",
    "exploited build environment to",
    "abused artifact signing process for",
    "compromised code repository access to",
    # IoT, edge and industrial systems
    "compromised IoT device firmware to",
    "exploited edge computing vulnerability in",
    "breached industrial control system through",
    "manipulated sensor data from",
    "exploited SCADA system vulnerability to",
    "compromised smart city infrastructure via",
    "abused industrial protocol to",
    "exploited edge gateway vulnerability in",
    "recruited device into botnet through",
    "compromised medical device firmware to",
    "exploited automotive system vulnerability in",
    "breached home automation system via",
    # Mobile and endpoint
    "escaped mobile app sandbox to",
    "exploited iOS jailbreak vulnerability in",
    "installed rootkit on Android device to",
    "compromised enterprise mobile device through",
    "bypassed mobile device management to",
    "poisoned mobile app store listing for",
    "exploited mobile certificate pinning in",
    "compromised mobile banking app through",
    "abused device fingerprinting to",
    "exploited mobile phishing vulnerability in",
    "breached BYOD policy through",
    "compromised mobile endpoint security via",
    # Network
    "bypassed network segmentation to",
    "manipulated firewall rules for",
    "exploited VPN tunnel vulnerability in",
    "hijacked DNS resolution for",
    "abused BGP routing protocol to",
    "compromised wireless network through",
    "exploited Bluetooth vulnerability in",
    "abused NFC communication to",
    "evaded network monitoring through",
    "bypassed traffic analysis via",
    "exploited network protocol vulnerability in",
    "compromised network infrastructure through",
]
# Systems and assets that a generated event is directed at.
ADVERSARIAL_RISK_TARGETS = [
    # General
    "a code repository",
    "the CI/CD pipeline",
    "a cloud storage bucket",
    "the internal DNS server",
    "the virtual machine hypervisor",
    "sensitive customer data",
    "financial databases",
    "intellectual property servers",
    "critical infrastructure controls",
    "user authentication service",
    "production web server",
    "database backup storage",
    # Cloud and containers
    "Kubernetes cluster control plane",
    "Docker container registry",
    "AWS S3 bucket with sensitive data",
    "Azure Active Directory tenant",
    "GCP Cloud Storage bucket",
    "container orchestration system",
    "serverless function environment",
    "cloud API gateway",
    "microservice mesh network",
    "container security scanning service",
    "cloud logging and monitoring system",
    "infrastructure as code repository",
    # CI/CD and supply chain
    "Git repository with production secrets",
    "Jenkins build pipeline",
    "Docker image registry",
    "artifact repository with signed packages",
    "infrastructure provisioning system",
    "secret management vault",
    "code signing certificate store",
    "dependency management system",
    "deployment automation platform",
    "build environment with elevated privileges",
    "CI/CD security scanning tools",
    "infrastructure monitoring system",
    # IoT, edge and industrial systems
    "industrial control system network",
    "SCADA system database",
    "IoT device management platform",
    "edge computing gateway",
    "smart city infrastructure",
    "medical device network",
    "automotive system bus",
    "home automation hub",
    "sensor data collection system",
    "industrial protocol gateway",
    "edge security monitoring system",
    "IoT device firmware repository",
    # Mobile and endpoint
    "enterprise mobile device fleet",
    "mobile app store backend",
    "mobile device management system",
    "mobile banking infrastructure",
    "mobile certificate authority",
    "mobile security scanning service",
    "BYOD policy enforcement system",
    "mobile endpoint detection system",
    "mobile app security testing platform",
    "mobile device fingerprinting database",
    "mobile phishing detection system",
    "mobile app code signing service",
    # Network
    "network segmentation firewall",
    "VPN concentrator",
    "DNS authoritative server",
    "BGP route reflector",
    "wireless access point controller",
    "network monitoring system",
    "traffic analysis platform",
    "network security scanning tool",
    "protocol analysis system",
    "network infrastructure management",
    "security information system",
    "network forensics platform",
]
# Status clauses appended after "Status:" in a generated event.
ADVERSARIAL_RISK_OUTCOMES = [
    # General
    "the action was obfuscated",
    "a low-and-slow data transfer was detected",
    "the process terminated abnormally after execution",
    "security controls were temporarily disabled",
    "alert thresholds were bypassed",
    "data integrity was compromised",
    "system uptime was impacted",
    "a backdoor was established",
    "a privilege escalation was achieved",
    "system resources were depleted",
    "data encryption initiated",
    # Cloud and containers
    "container escape was successful",
    "Kubernetes RBAC was bypassed",
    "cloud IAM policies were circumvented",
    "container registry was compromised",
    "serverless function was weaponized",
    "cloud logging was manipulated",
    "microservice communication was intercepted",
    "container security scanning was evaded",
    "cloud resource tagging was abused",
    "container orchestration was compromised",
    "cloud metadata service was exploited",
    "container networking was hijacked",
    # CI/CD and supply chain
    "build pipeline was compromised",
    "dependency repository was poisoned",
    "artifact signing was bypassed",
    "infrastructure automation was weaponized",
    "secret management was breached",
    "code repository was compromised",
    "deployment process was hijacked",
    "build environment was escaped",
    "CI/CD security was bypassed",
    "infrastructure monitoring was disabled",
    "artifact integrity was compromised",
    "deployment approval was bypassed",
    # IoT, edge and industrial systems
    "IoT device was recruited into botnet",
    "industrial control system was compromised",
    "edge gateway was breached",
    "sensor data was manipulated",
    "SCADA system was taken offline",
    "smart city infrastructure was disrupted",
    "medical device was compromised",
    "automotive system was hijacked",
    "home automation was breached",
    "industrial protocol was abused",
    "edge security was bypassed",
    "IoT device firmware was modified",
    # Mobile and endpoint
    "mobile device was rooted/jailbroken",
    "enterprise mobile security was bypassed",
    "mobile app was compromised",
    "mobile device management was evaded",
    "mobile banking was breached",
    "mobile certificate pinning was bypassed",
    "BYOD policy was violated",
    "mobile endpoint detection was evaded",
    "mobile app store was poisoned",
    "mobile device fingerprinting was spoofed",
    "mobile phishing was successful",
    "mobile security scanning was bypassed",
    # Network
    "network segmentation was bypassed",
    "firewall rules were manipulated",
    "VPN tunnel was compromised",
    "DNS resolution was hijacked",
    "BGP routing was manipulated",
    "wireless network was compromised",
    "Bluetooth security was bypassed",
    "NFC communication was intercepted",
    "network monitoring was evaded",
    "traffic analysis was bypassed",
    "network protocol was abused",
    "network infrastructure was compromised",
]
# --- Inference functions ---
# Streamlit re-executes this script on every UI interaction; caching the loader
# as a resource keeps the embedding model and regressors in memory so they are
# loaded once per server process instead of once per rerun.
@st.cache_resource
def load_all_models_and_tokenizer():
    """Load the SentenceTransformer and the persisted scikit-learn artifacts.

    The embedding model is created from ``MODEL_NAME``; the four scikit-learn
    objects are read with ``joblib.load`` from files resolved relative to the
    current working directory. Progress messages are written to the page
    through ``st.info`` / ``st.success``.

    Returns:
        tuple: ``(sbert_model, mlp, scl, tfidf_vec, tfidf_reg)`` in that order:
        the SentenceTransformer, the regressor applied to scaled embeddings,
        the scaler for the embeddings, the TF-IDF vectorizer and the regressor
        applied to TF-IDF vectors.

    Raises:
        Whatever ``SentenceTransformer`` or ``joblib.load`` raise when the
        model or one of the ``.joblib`` files cannot be loaded.
    """
    st.info("Cargando SentenceTransformer (CySecBERT)...")
    # SentenceTransformer downloads the model when it is not cached locally.
    sbert_model = SentenceTransformer(MODEL_NAME)

    st.info("Cargando modelos de scikit-learn treinados...")
    # NOTE: joblib files are pickle-based; only load artifacts from a trusted source.
    mlp = joblib.load("mlp_regressor.joblib")
    scl = joblib.load("scaler.joblib")
    tfidf_vec = joblib.load("tfidf_vectorizer.joblib")
    tfidf_reg = joblib.load("tfidf_regressor.joblib")

    st.success("Modelos carregados com sucesso!")
    return sbert_model, mlp, scl, tfidf_vec, tfidf_reg
def classify_and_score_risk(event_text: str) -> tuple[float, dict]:
    """Score a single security event by blending three independent heads.

    Heads:
        1. Deep embedding: ``model_base`` encodes the text, ``scaler``
           transforms the embedding and ``mlp_regressor`` predicts a score.
        2. Classic vector: ``tfidf_vectorizer`` + ``tfidf_regressor``.
        3. Keyword rules: starts at 50 and adds the weight of every entry of
           ``HIGH_RISK_KEYWORDS`` / ``LOW_RISK_KEYWORDS`` found in the
           lowercased text, then clips to [0, 100].

    The final score is ``0.5 * embedding + 0.3 * tfidf + 0.2 * keyword``
    computed from the raw head outputs and clipped to [0, 100].

    Args:
        event_text: Free-text description of the event.

    Returns:
        ``(final_score, individual_scores)`` where ``individual_scores`` has
        the keys ``'embedding_score'``, ``'tfidf_score'`` and
        ``'keyword_score'``, each clipped to [0, 100] and returned as a float.

    Note:
        Relies on the module-level model handles having been populated by the
        loader before the first call.
    """
    # Head 1: deep embedding. The scaler expects a 2-D array, hence the
    # reshape of the single embedding vector to (1, n_features).
    embedding = model_base.encode(
        [event_text], convert_to_numpy=True, show_progress_bar=False
    )[0]
    embedding_scaled = scaler.transform(embedding.reshape(1, -1))
    score_embedding = mlp_regressor.predict(embedding_scaled)[0]

    # Head 2: TF-IDF vector fed to its regressor.
    tfidf_vector = tfidf_vectorizer.transform([event_text])
    score_tfidf = tfidf_regressor.predict(tfidf_vector)[0]

    # Head 3: keyword rules.
    # A keyword only counts when it is NOT preceded by a letter or digit, so
    # that "authorized" does not fire inside "unauthorized", "normal" inside
    # "abnormally" or "secure" inside "insecure". Suffixes are still allowed
    # ("exploit" matches "exploited"), and "_" does not block a match
    # ("failed" still matches inside "mfa_failed").
    text_lower = event_text.lower()
    score_keyword = 50.0  # neutral starting point
    for keyword_weights in (HIGH_RISK_KEYWORDS, LOW_RISK_KEYWORDS):
        for keyword, weight in keyword_weights.items():
            if re.search(r"(?<![^\W_])" + re.escape(keyword), text_lower):
                score_keyword += weight
    score_keyword = float(np.clip(score_keyword, 0, 100))

    # Weighted blend of the three heads.
    weights = {'embedding': 0.5, 'tfidf': 0.3, 'keyword': 0.2}
    final_score = (
        score_embedding * weights['embedding']
        + score_tfidf * weights['tfidf']
        + score_keyword * weights['keyword']
    )
    final_score = float(np.clip(final_score, 0, 100))

    # Clip the per-head scores only for reporting; the blend above uses the
    # raw regressor outputs.
    individual_scores = {
        'embedding_score': float(np.clip(score_embedding, 0, 100)),
        'tfidf_score': float(np.clip(score_tfidf, 0, 100)),
        'keyword_score': score_keyword,
    }
    return final_score, individual_scores
# --- Streamlit interface ---
st.set_page_config(
    page_title="Detecção de Risco de Segurança Ciber",
    page_icon="🛡️",
    layout="wide",
    initial_sidebar_state="expanded",
)
st.title("🛡️ Sistema de Detecção de Risco de Segurança Ciber")
st.markdown("---")

# Sidebar: short description of the hybrid model.
with st.sidebar:
    st.header("Sobre")
    st.write("""
Este aplicativo analisa logs de eventos de segurança usando um modelo híbrido com:
- **Embedding Profundo:** CySecBERT + MLP Regressor
- **Vetorial Clássico:** TF-IDF + Ridge Regressor
- **Regras Heurísticas:** Análise de palavras-chave
""")
    st.info("Desenvolvido para demonstração no Hugging Face Spaces.")

# Load the models at start-up. The loader's status messages are rendered inside
# a placeholder container so they can be wiped once loading has finished.
loading_message_placeholder = st.empty()
with loading_message_placeholder.container():
    (
        model_base,
        mlp_regressor,
        scaler,
        tfidf_vectorizer,
        tfidf_regressor,
    ) = load_all_models_and_tokenizer()
loading_message_placeholder.empty()  # drop the loading messages
def _fill_random_event() -> None:
    """Button callback: put a freshly generated high-risk event into the text area.

    Streamlit runs ``on_click`` callbacks before the next script run, i.e.
    before the text area is instantiated, which is when the value of a keyed
    widget may be changed through ``st.session_state``. Assigning to the key
    after the widget has been created raises a ``StreamlitAPIException``.
    """
    # Use an independent, OS-seeded generator: the module re-seeds the global
    # `random` state with RANDOM_SEED on every script run, which would make
    # every "random" event identical.
    rng = random.Random()
    actor = rng.choice(ADVERSARIAL_RISK_ACTORS)
    action = rng.choice(ADVERSARIAL_RISK_ACTIONS)
    target = rng.choice(ADVERSARIAL_RISK_TARGETS)
    outcome = rng.choice(ADVERSARIAL_RISK_OUTCOMES)
    st.session_state.event_text_input = (
        f"Audit log: {actor} {action} {target}. Status: {outcome}."
    )


st.subheader("Insira o Evento de Segurança para Análise:")

# Seed the text area once per session; afterwards the widget's own state
# (keyed by "event_text_input") is the single source of truth.
if 'event_text_input' not in st.session_state:
    st.session_state.event_text_input = "Audit log: Unsandboxed process attempted lateral movement via a code repository. Status: the action was obfuscated."

# No `value=` here: the initial content comes from session state, and passing
# both a default value and a session-state value for the same key is redundant.
event_text = st.text_area(
    "Descrição do Evento",
    height=200,
    key="event_text_input",
)

# "Generate random (risky) event" button. The click triggers the callback and
# then a normal rerun, so no explicit rerun call is needed.
st.button(
    "Gerar Evento Aleatório (Risco)",
    help="Gera um exemplo de evento de alto risco para demonstração.",
    on_click=_fill_random_event,
)
# "Analyse risk" button: score the current text and render the results.
if st.button("Analisar Risco", type="primary"):
    if event_text.strip():
        # All result widgets live inside one placeholder container.
        results_placeholder = st.empty()
        with results_placeholder.container():
            with st.spinner("Analisando o evento..."):
                final_score, individual_scores = classify_and_score_risk(event_text)

            prediction = "Risco" if final_score >= RISK_THRESHOLD else "Seguro"
            # Distance from the midpoint 50, stretched to 0-100 and clamped.
            confidence = min(100, max(0, round(abs(final_score - 50) * 2)))

            st.markdown("---")
            st.subheader("📊 Resultado da Análise:")
            show_prediction = st.error if prediction == "Risco" else st.success
            show_prediction(f"**PREDIÇÃO:** {prediction} (Score: {final_score:.2f})")
            st.info(f"**CONFIANÇA:** {confidence}%")

            st.markdown("---")
            st.subheader("Scores por Componente (0-100):")
            metric_specs = (
                ("Embedding Score", 'embedding_score'),
                ("TF-IDF Score", 'tfidf_score'),
                ("Keyword Score", 'keyword_score'),
            )
            for column, (metric_label, score_key) in zip(st.columns(3), metric_specs):
                column.metric(metric_label, f"{individual_scores[score_key]:.2f}")

            st.markdown("---")
            st.subheader("📈 Visualização dos Scores:")
            bar_labels = ['Embedding', 'TF-IDF', 'Keyword', 'Final']
            bar_heights = [
                individual_scores['embedding_score'],
                individual_scores['tfidf_score'],
                individual_scores['keyword_score'],
                final_score,
            ]
            bar_colors = ['skyblue', 'lightcoral', 'lightgreen', 'darkblue']

            fig, ax = plt.subplots(figsize=(10, 5))
            ax.bar(bar_labels, bar_heights, color=bar_colors)
            ax.axhline(
                RISK_THRESHOLD,
                color='red',
                linestyle='--',
                label=f'Limiar de Risco ({RISK_THRESHOLD:.2f})',
            )
            ax.set_ylim(0, 100)
            ax.set_ylabel("Score de Risco")
            ax.set_title("Scores Componentes e Final do Evento")
            ax.legend()
            st.pyplot(fig)  # render the Matplotlib figure
            plt.close(fig)  # release the figure's memory
    else:
        st.warning("Por favor, insira um texto de evento para análise.")

# Footer hint. NOTE(review): nesting was reconstructed from a source without
# indentation; this is assumed to be top-level (always shown) — confirm.
st.markdown("---")
st.markdown("Para uma nova análise, modifique o texto do evento acima ou gere um aleatório e clique em 'Analisar Risco' novamente.")