Spaces:
Runtime error
Runtime error
File size: 20,936 Bytes
80a1a21 e5c4cbf 80a1a21 e5c4cbf 80a1a21 e5c4cbf 80a1a21 e5c4cbf 80a1a21 e5c4cbf 80a1a21 e5c4cbf 80a1a21 e5c4cbf 80a1a21 e5c4cbf 80a1a21 e5c4cbf 80a1a21 e5c4cbf 80a1a21 e5c4cbf 80a1a21 e5c4cbf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 |
import os
import random
import sqlite3
import numpy as np
import joblib # Para carregar os modelos salvos
import streamlit as st # Para a interface de usuário
import matplotlib.pyplot as plt # Para plotagem
import torch # Necessário para SentenceTransformer, mesmo que não explícito
# Importe SentenceTransformer para embeddings otimizados
from sentence_transformers import SentenceTransformer
# --- Initial configuration ---
# DB_NAME and TABLE_NAME record where the pre-generated database lives.
DB_NAME = "training_data_large.db"
TABLE_NAME = "events"
MODEL_NAME = "markusbayer/CySecBERT"
RANDOM_SEED = 42
RISK_THRESHOLD = 50.0


# --- Global seeding (keeps inference reproducible should any randomness be involved) ---
def _seed_everything(seed: int) -> None:
    """Seed Python's ``random``, NumPy and PyTorch (plus every CUDA device when available)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


_seed_everything(RANDOM_SEED)

# --- Module-level slots for the models and tools (loaded ONCE) ---
# Declared at module scope so the inference functions can read them.
model_base = None
mlp_regressor = None
scaler = None
tfidf_vectorizer = None
tfidf_regressor = None
# --- Keyword vocabulary for head 3 (rule-based) ---
# Each entry maps a lowercase keyword or phrase to the number of points it
# contributes to the keyword score; high-risk entries carry positive weights.
HIGH_RISK_KEYWORDS = {
    "failed": 15,
    "unauthorized": 20,
    "invalid": 15,
    "blocked": 25,
    "mfa_failed": 30,
    "brute_force": 40,
    "attack": 40,
    "threat": 30,
    "compromise": 30,
    "malicious": 35,
    "lockout": 25,
    "critical": 20,
    "urgent": 20,
    "severe": 25,
    "breach": 40,
    "exfiltration": 40,
    "injection": 35,
    "malware": 35,
    "vulnerability": 25,
    "exploit": 30,
    "lateral movement": 40,
    "dns tunneling": 35,
    "obfuscated": 25,
    "anomaly": 20,
    "misconfigured": 30,
    "ransomware": 50,
    "phishing": 45,
    "insider threat": 40,
    "zero-day": 50,
    "unauthorized access": 35,
    "data integrity": 30,
    "compromised credential": 40,
    "vulnerable library": 30,
    "sql injection": 35,
    "privilege escalation": 45,
}

# Low-risk entries carry negative weights, so they pull the keyword score down.
LOW_RISK_KEYWORDS = {
    "success": -20,
    "successful": -20,
    "normal": -15,
    "routine": -15,
    "authorized": -10,
    "benign": -15,
    "secure": -10,
    "safe": -15,
    "approved": -10,
    "expected": -5,
    "completed": -10,
    "scan completed": -25,
    "validated": -15,
    "patched": -20,
    "renewed": -15,
    "posture confirmed": -30,
    "performance improved": -10,
    "functionality rolled out": -10,
    "resources optimized": -15,
    "backup completed": -20,
    "schema migration successful": -15,
    "network policy updated": -10,
}
# Vocabulary lists used by the "Gerar Evento Aleatório" button in app.py.
# Actors: the subject of a generated audit-log sentence.
ADVERSARIAL_RISK_ACTORS = [
    # General
    "Unsandboxed process",
    "Leaked API key",
    "Misconfigured service account",
    "Shadow IT application",
    "Dormant user account",
    "Ransomware payload",
    "Phishing attempt",
    "Insider threat",
    "Zero-day exploit",
    "Malicious actor",
    "Compromised credential",
    "Vulnerable third-party library",
    # Cloud and containers
    "Compromised Kubernetes pod",
    "Malicious Docker container",
    "AWS IAM role escalation",
    "Azure AD privilege escalation",
    "GCP service account abuse",
    "Container escape attempt",
    "Serverless function injection",
    "Cloud storage bucket enumeration",
    "API gateway bypass",
    "Microservice lateral movement",
    "Container registry poisoning",
    "Cloud metadata exploitation",
    # CI/CD and supply chain
    "CI/CD pipeline compromise",
    "Git repository poisoning",
    "Build artifact tampering",
    "Deployment script injection",
    "Infrastructure as Code attack",
    "Secret scanning bypass",
    "Dependency confusion attack",
    "Supply chain compromise",
    "Code signing certificate theft",
    "Pipeline privilege escalation",
    "Artifact repository poisoning",
    "Build environment escape",
    # IoT, edge and industrial systems
    "Compromised IoT device",
    "Edge computing exploit",
    "Industrial control system breach",
    "SCADA system compromise",
    "Smart city infrastructure attack",
    "Medical device exploitation",
    "Automotive system breach",
    "Home automation compromise",
    "Sensor data manipulation",
    "Edge gateway exploitation",
    "Industrial protocol abuse",
    "IoT botnet recruitment",
    # Mobile and endpoint
    "Mobile app sandbox escape",
    "iOS jailbreak exploitation",
    "Android rootkit installation",
    "Mobile banking trojan",
    "Enterprise device compromise",
    "BYOD policy violation",
    "Mobile device management bypass",
    "App store poisoning",
    "Mobile certificate pinning bypass",
    "Endpoint detection evasion",
    "Mobile phishing campaign",
    "Device fingerprinting abuse",
    # Network
    "Network segmentation bypass",
    "Firewall rule manipulation",
    "VPN tunnel exploitation",
    "DNS hijacking attempt",
    "BGP route hijacking",
    "Network protocol abuse",
    "Wireless network compromise",
    "Bluetooth attack vector",
    "NFC exploitation",
    "Network monitoring evasion",
    "Traffic analysis bypass",
    "Protocol fuzzing attack",
]
# Actions: the verb phrase placed between the actor and the target.
ADVERSARIAL_RISK_ACTIONS = [
    # General
    "attempted lateral movement via",
    "initiated a DNS tunneling request to",
    "executed a living-off-the-land binary on",
    "was flagged for unusual API call patterns against",
    "triggered a data access anomaly in",
    "exfiltrated data from",
    "modified critical system files in",
    "gained unauthorized access to",
    "deployed malicious code on",
    "brute-forced login for",
    "injected SQL into",
    "exploited a vulnerability in",
    # Cloud and containers
    "attempted container escape from",
    "escalated privileges in Kubernetes cluster",
    "abused IAM role permissions for",
    "enumerated cloud storage buckets through",
    "bypassed API gateway authentication to",
    "injected malicious code into serverless function",
    "compromised container registry access for",
    "exploited cloud metadata service to",
    "performed lateral movement across microservices in",
    "poisoned container image in",
    "abused cloud resource tagging for",
    "exploited cloud logging service to",
    # CI/CD and supply chain
    "compromised CI/CD pipeline to",
    "injected malicious code into build process for",
    "poisoned dependency repository to",
    "tampered with build artifacts in",
    "escalated privileges in deployment pipeline for",
    "bypassed security scanning in",
    "abused infrastructure automation to",
    "compromised secret management system for",
    "injected malicious code into deployment scripts for",
    "exploited build environment to",
    "abused artifact signing process for",
    "compromised code repository access to",
    # IoT, edge and industrial systems
    "compromised IoT device firmware to",
    "exploited edge computing vulnerability in",
    "breached industrial control system through",
    "manipulated sensor data from",
    "exploited SCADA system vulnerability to",
    "compromised smart city infrastructure via",
    "abused industrial protocol to",
    "exploited edge gateway vulnerability in",
    "recruited device into botnet through",
    "compromised medical device firmware to",
    "exploited automotive system vulnerability in",
    "breached home automation system via",
    # Mobile and endpoint
    "escaped mobile app sandbox to",
    "exploited iOS jailbreak vulnerability in",
    "installed rootkit on Android device to",
    "compromised enterprise mobile device through",
    "bypassed mobile device management to",
    "poisoned mobile app store listing for",
    "exploited mobile certificate pinning in",
    "compromised mobile banking app through",
    "abused device fingerprinting to",
    "exploited mobile phishing vulnerability in",
    "breached BYOD policy through",
    "compromised mobile endpoint security via",
    # Network
    "bypassed network segmentation to",
    "manipulated firewall rules for",
    "exploited VPN tunnel vulnerability in",
    "hijacked DNS resolution for",
    "abused BGP routing protocol to",
    "compromised wireless network through",
    "exploited Bluetooth vulnerability in",
    "abused NFC communication to",
    "evaded network monitoring through",
    "bypassed traffic analysis via",
    "exploited network protocol vulnerability in",
    "compromised network infrastructure through",
]
# Targets: the object the generated action is directed at.
ADVERSARIAL_RISK_TARGETS = [
    # General
    "a code repository",
    "the CI/CD pipeline",
    "a cloud storage bucket",
    "the internal DNS server",
    "the virtual machine hypervisor",
    "sensitive customer data",
    "financial databases",
    "intellectual property servers",
    "critical infrastructure controls",
    "user authentication service",
    "production web server",
    "database backup storage",
    # Cloud and containers
    "Kubernetes cluster control plane",
    "Docker container registry",
    "AWS S3 bucket with sensitive data",
    "Azure Active Directory tenant",
    "GCP Cloud Storage bucket",
    "container orchestration system",
    "serverless function environment",
    "cloud API gateway",
    "microservice mesh network",
    "container security scanning service",
    "cloud logging and monitoring system",
    "infrastructure as code repository",
    # CI/CD and supply chain
    "Git repository with production secrets",
    "Jenkins build pipeline",
    "Docker image registry",
    "artifact repository with signed packages",
    "infrastructure provisioning system",
    "secret management vault",
    "code signing certificate store",
    "dependency management system",
    "deployment automation platform",
    "build environment with elevated privileges",
    "CI/CD security scanning tools",
    "infrastructure monitoring system",
    # IoT, edge and industrial systems
    "industrial control system network",
    "SCADA system database",
    "IoT device management platform",
    "edge computing gateway",
    "smart city infrastructure",
    "medical device network",
    "automotive system bus",
    "home automation hub",
    "sensor data collection system",
    "industrial protocol gateway",
    "edge security monitoring system",
    "IoT device firmware repository",
    # Mobile and endpoint
    "enterprise mobile device fleet",
    "mobile app store backend",
    "mobile device management system",
    "mobile banking infrastructure",
    "mobile certificate authority",
    "mobile security scanning service",
    "BYOD policy enforcement system",
    "mobile endpoint detection system",
    "mobile app security testing platform",
    "mobile device fingerprinting database",
    "mobile phishing detection system",
    "mobile app code signing service",
    # Network
    "network segmentation firewall",
    "VPN concentrator",
    "DNS authoritative server",
    "BGP route reflector",
    "wireless access point controller",
    "network monitoring system",
    "traffic analysis platform",
    "network security scanning tool",
    "protocol analysis system",
    "network infrastructure management",
    "security information system",
    "network forensics platform",
]
# Outcomes: the text that follows "Status:" in a generated event.
ADVERSARIAL_RISK_OUTCOMES = [
    # General
    "the action was obfuscated",
    "a low-and-slow data transfer was detected",
    "the process terminated abnormally after execution",
    "security controls were temporarily disabled",
    "alert thresholds were bypassed",
    "data integrity was compromised",
    "system uptime was impacted",
    "a backdoor was established",
    "a privilege escalation was achieved",
    "system resources were depleted",
    "data encryption initiated",
    # Cloud and containers
    "container escape was successful",
    "Kubernetes RBAC was bypassed",
    "cloud IAM policies were circumvented",
    "container registry was compromised",
    "serverless function was weaponized",
    "cloud logging was manipulated",
    "microservice communication was intercepted",
    "container security scanning was evaded",
    "cloud resource tagging was abused",
    "container orchestration was compromised",
    "cloud metadata service was exploited",
    "container networking was hijacked",
    # CI/CD and supply chain
    "build pipeline was compromised",
    "dependency repository was poisoned",
    "artifact signing was bypassed",
    "infrastructure automation was weaponized",
    "secret management was breached",
    "code repository was compromised",
    "deployment process was hijacked",
    "build environment was escaped",
    "CI/CD security was bypassed",
    "infrastructure monitoring was disabled",
    "artifact integrity was compromised",
    "deployment approval was bypassed",
    # IoT, edge and industrial systems
    "IoT device was recruited into botnet",
    "industrial control system was compromised",
    "edge gateway was breached",
    "sensor data was manipulated",
    "SCADA system was taken offline",
    "smart city infrastructure was disrupted",
    "medical device was compromised",
    "automotive system was hijacked",
    "home automation was breached",
    "industrial protocol was abused",
    "edge security was bypassed",
    "IoT device firmware was modified",
    # Mobile and endpoint
    "mobile device was rooted/jailbroken",
    "enterprise mobile security was bypassed",
    "mobile app was compromised",
    "mobile device management was evaded",
    "mobile banking was breached",
    "mobile certificate pinning was bypassed",
    "BYOD policy was violated",
    "mobile endpoint detection was evaded",
    "mobile app store was poisoned",
    "mobile device fingerprinting was spoofed",
    "mobile phishing was successful",
    "mobile security scanning was bypassed",
    # Network
    "network segmentation was bypassed",
    "firewall rules were manipulated",
    "VPN tunnel was compromised",
    "DNS resolution was hijacked",
    "BGP routing was manipulated",
    "wireless network was compromised",
    "Bluetooth security was bypassed",
    "NFC communication was intercepted",
    "network monitoring was evaded",
    "traffic analysis was bypassed",
    "network protocol was abused",
    "network infrastructure was compromised",
]
# --- Inference functions ---
# st.cache_resource memoizes the heavy model loading, so the models are loaded
# only once even though the UI triggers further executions of the script.
@st.cache_resource
def load_all_models_and_tokenizer():
    """
    Load the SentenceTransformer encoder and the four joblib-serialized artifacts.

    The call is cached by Streamlit, so the body executes a single time.

    Returns:
        A 5-tuple ``(sbert_model, mlp, scaler, tfidf_vectorizer, tfidf_regressor)``
        in exactly that order.
    """
    st.info("Cargando SentenceTransformer (CySecBERT)...")
    # SentenceTransformer fetches the model when it is not in the local cache.
    encoder = SentenceTransformer(MODEL_NAME)

    st.info("Cargando modelos de scikit-learn treinados...")
    artifact_files = (
        "mlp_regressor.joblib",
        "scaler.joblib",
        "tfidf_vectorizer.joblib",
        "tfidf_regressor.joblib",
    )
    # The generator is consumed left to right, so files load in the listed order.
    mlp, scl, tfidf_vec, tfidf_reg = (joblib.load(path) for path in artifact_files)

    st.success("Modelos carregados com sucesso!")
    return encoder, mlp, scl, tfidf_vec, tfidf_reg
def classify_and_score_risk(event_text: str) -> tuple[float, dict]:
    """
    Score a single security event on a 0-100 risk scale.

    Three heads are evaluated and blended with fixed weights
    (embedding 0.5, TF-IDF 0.3, keyword 0.2):

    1. ``model_base`` sentence embedding -> ``scaler`` -> ``mlp_regressor``.
    2. ``tfidf_vectorizer`` -> ``tfidf_regressor``.
    3. A rule-based score that starts at 50 and adds the weight of every
       ``HIGH_RISK_KEYWORDS`` / ``LOW_RISK_KEYWORDS`` entry found in the text.

    Keywords are matched case-insensitively and must *begin* a word: a keyword
    preceded by a letter or digit is ignored. This stops low-risk words from
    firing inside their own negations ("authorized" in "unauthorized",
    "normal" in "abnormally", "expected" in "unexpected"), which previously
    lowered the score of risky events. Suffixes are still allowed, so
    "compromise" keeps matching "compromised", and an underscore counts as a
    separator, so "failed" keeps matching inside "mfa_failed".

    Args:
        event_text: Free-text description of the event.

    Returns:
        ``(final_score, individual_scores)`` where ``final_score`` is the
        weighted blend clipped to [0, 100] and ``individual_scores`` holds the
        clipped per-head values under the keys ``'embedding_score'``,
        ``'tfidf_score'`` and ``'keyword_score'``.
    """
    import re  # local import keeps this block self-contained

    # Head 1: deep embedding (CySecBERT + MLP).
    # encode() returns one vector per input; the scaler needs a 2-D array.
    embedding = model_base.encode([event_text], convert_to_numpy=True, show_progress_bar=False)[0]
    embedding_scaled = scaler.transform(embedding.reshape(1, -1))
    score_embedding = mlp_regressor.predict(embedding_scaled)[0]

    # Head 2: classic vector model (TF-IDF + regressor).
    tfidf_vector = tfidf_vectorizer.transform([event_text])
    score_tfidf = tfidf_regressor.predict(tfidf_vector)[0]

    # Head 3: rule-based keyword analysis.
    text_lower = event_text.lower()

    def _starts_a_word(keyword: str) -> bool:
        # [^\W_] is "letter or digit"; the negative lookbehind rejects matches
        # that sit in the middle of a longer word.
        return re.search(r"(?<![^\W_])" + re.escape(keyword), text_lower) is not None

    score_keyword = 50.0  # base score
    for vocabulary in (HIGH_RISK_KEYWORDS, LOW_RISK_KEYWORDS):
        score_keyword += sum(
            weight for keyword, weight in vocabulary.items() if _starts_a_word(keyword)
        )
    score_keyword = np.clip(score_keyword, 0, 100)  # keep the score within 0-100

    # Weighted combination of the three heads.
    weights = {'embedding': 0.5, 'tfidf': 0.3, 'keyword': 0.2}
    final_score = (score_embedding * weights['embedding'] +
                   score_tfidf * weights['tfidf'] +
                   score_keyword * weights['keyword'])
    final_score = np.clip(final_score, 0, 100)

    individual_scores = {
        'embedding_score': np.clip(score_embedding, 0, 100),
        'tfidf_score': np.clip(score_tfidf, 0, 100),
        'keyword_score': score_keyword
    }
    return float(final_score), individual_scores
# --- Streamlit interface ---
# Page-level settings: title, icon, wide layout, sidebar expanded at start.
st.set_page_config(
    page_title="Detecção de Risco de Segurança Ciber",
    page_icon="🛡️",
    layout="wide",
    initial_sidebar_state="expanded",
)

st.title("🛡️ Sistema de Detecção de Risco de Segurança Ciber")
st.markdown("---")

# Sidebar: short description of the three scoring heads.
with st.sidebar:
    st.header("Sobre")
    st.write(
        "\n"
        "Este aplicativo analisa logs de eventos de segurança usando um modelo híbrido com:\n"
        "- **Embedding Profundo:** CySecBERT + MLP Regressor\n"
        "- **Vetorial Clássico:** TF-IDF + Ridge Regressor\n"
        "- **Regras Heurísticas:** Análise de palavras-chave\n"
    )
    st.info("Desenvolvido para demonstração no Hugging Face Spaces.")
# Load the models when the app starts. The status messages emitted while
# loading are drawn inside an st.empty() placeholder so they can be cleared
# as soon as loading finishes.
loading_message_placeholder = st.empty()
with loading_message_placeholder.container():
    loaded_models = load_all_models_and_tokenizer()
# Bind the loaded objects to the module-level names read by the inference code.
model_base, mlp_regressor, scaler, tfidf_vectorizer, tfidf_regressor = loaded_models
loading_message_placeholder.empty()  # remove the loading messages once done
st.subheader("Insira o Evento de Segurança para Análise:")

# The text area is bound to st.session_state["event_text_input"] through its
# key. Seed that entry once, BEFORE the widget is created; passing `value=` as
# well would make Streamlit warn about a widget that has both a default value
# and a Session State value.
if 'event_text_input' not in st.session_state:
    st.session_state.event_text_input = "Audit log: Unsandboxed process attempted lateral movement via a code repository. Status: the action was obfuscated."


def _generate_random_event() -> None:
    """
    Button callback: write a randomly composed high-risk event into the text area.

    Streamlit runs callbacks before the script re-executes, which is the only
    point where a widget-bound Session State key may be assigned; assigning it
    after the widget has been instantiated raises StreamlitAPIException.

    A dedicated SystemRandom is used because the module re-seeds the global
    ``random`` generator with a fixed seed on every script run, which would
    make every click produce the same event.
    """
    rng = random.SystemRandom()
    actor = rng.choice(ADVERSARIAL_RISK_ACTORS)
    action = rng.choice(ADVERSARIAL_RISK_ACTIONS)
    target = rng.choice(ADVERSARIAL_RISK_TARGETS)
    outcome = rng.choice(ADVERSARIAL_RISK_OUTCOMES)
    st.session_state.event_text_input = f"Audit log: {actor} {action} {target}. Status: {outcome}."


event_text = st.text_area(
    "Descrição do Evento",
    height=200,
    key="event_text_input",  # ties the widget to the Session State entry above
)

# "Generate random (risky) event" button. The click itself triggers the rerun,
# so no explicit st.experimental_rerun()/st.rerun() call is needed.
st.button(
    "Gerar Evento Aleatório (Risco)",
    help="Gera um exemplo de evento de alto risco para demonstração.",
    on_click=_generate_random_event,
)
def _plot_score_bars(component_scores: dict, overall_score: float) -> None:
    """Render a bar chart of the three head scores plus the final score, with the risk threshold line."""
    figure, axis = plt.subplots(figsize=(10, 5))
    bar_labels = ['Embedding', 'TF-IDF', 'Keyword', 'Final']
    bar_heights = [
        component_scores['embedding_score'],
        component_scores['tfidf_score'],
        component_scores['keyword_score'],
        overall_score,
    ]
    bar_colors = ['skyblue', 'lightcoral', 'lightgreen', 'darkblue']
    axis.bar(bar_labels, bar_heights, color=bar_colors)
    axis.axhline(RISK_THRESHOLD, color='red', linestyle='--', label=f'Limiar de Risco ({RISK_THRESHOLD:.2f})')
    axis.set_ylim(0, 100)
    axis.set_ylabel("Score de Risco")
    axis.set_title("Scores Componentes e Final do Evento")
    axis.legend()
    st.pyplot(figure)  # show the Matplotlib figure
    plt.close(figure)  # close it to release memory


if st.button("Analisar Risco", type="primary"):
    if event_text.strip():
        # All results are drawn inside a single placeholder container.
        results_placeholder = st.empty()
        with results_placeholder.container():
            with st.spinner("Analisando o evento..."):
                final_score, individual_scores = classify_and_score_risk(event_text)

            is_risky = final_score >= RISK_THRESHOLD
            prediction = "Risco" if is_risky else "Seguro"
            # Confidence grows with the distance from the midpoint 50, clamped to 0-100.
            confidence = min(100, max(0, round(abs(final_score - 50) * 2)))

            st.markdown("---")
            st.subheader("📊 Resultado da Análise:")
            # Risky events are shown as an error box, safe ones as a success box.
            show_verdict = st.error if is_risky else st.success
            show_verdict(f"**PREDIÇÃO:** {prediction} (Score: {final_score:.2f})")
            st.info(f"**CONFIANÇA:** {confidence}%")

            st.markdown("---")
            st.subheader("Scores por Componente (0-100):")
            metric_specs = (
                ("Embedding Score", 'embedding_score'),
                ("TF-IDF Score", 'tfidf_score'),
                ("Keyword Score", 'keyword_score'),
            )
            for column, (title, score_key) in zip(st.columns(3), metric_specs):
                column.metric(title, f"{individual_scores[score_key]:.2f}")

            st.markdown("---")
            st.subheader("📈 Visualização dos Scores:")
            _plot_score_bars(individual_scores, final_score)

            st.markdown("---")
            st.markdown("Para uma nova análise, modifique o texto do evento acima ou gere um aleatório e clique em 'Analisar Risco' novamente.")
    else:
        st.warning("Por favor, insira um texto de evento para análise.")