# health.py
import time
from datetime import datetime, timedelta, timezone
from threading import Thread

import psutil
from sqlalchemy import text


def _utc_now_naive():
    """Return the current UTC time as a naive datetime (no tzinfo).

    Equivalent to the deprecated ``datetime.utcnow()``; kept naive so the
    SQL parameter and the ISO timestamp in the payload keep their format.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


def _collect_db_metrics(engine):
    """Ping the database and gather row estimates plus 1-hour traffic.

    Runs ``SELECT 1``, reads planner row estimates for the ``knowledge`` and
    ``user_memory`` tables from ``pg_class``, and counts ``user_memory`` rows
    whose ``created_at`` falls within the last hour.

    Returns:
        dict with keys ``knowledge_count``, ``user_memory_count`` and
        ``recent_traffic_1h``.

    Raises:
        Whatever the engine/connection raises; the caller converts it into
        a "fail" status.
    """
    with engine.begin() as conn:
        # Quick DB ping.
        conn.execute(text("SELECT 1"))

        # Planner estimates are cheap compared to COUNT(*). PostgreSQL 14+
        # reports -1 for tables that were never vacuumed/analyzed, so clamp
        # to zero instead of surfacing a negative row count.
        knowledge_count = max(
            conn.execute(
                text("SELECT reltuples::BIGINT AS estimate FROM pg_class WHERE relname='knowledge'")
            ).scalar() or 0,
            0,
        )
        user_memory_count = max(
            conn.execute(
                text("SELECT reltuples::BIGINT AS estimate FROM pg_class WHERE relname='user_memory'")
            ).scalar() or 0,
            0,
        )

        # NOTE(review): the cutoff is naive UTC; presumably created_at is
        # stored as UTC without a time zone - confirm against the schema.
        one_hour_ago = _utc_now_naive() - timedelta(hours=1)
        recent_traffic = conn.execute(
            text("SELECT COUNT(*) FROM user_memory WHERE created_at >= :t"),
            {"t": one_hour_ago},
        ).scalar() or 0

    return {
        "knowledge_count": knowledge_count,
        "user_memory_count": user_memory_count,
        "recent_traffic_1h": recent_traffic,
    }


def _rate_health(latency_ms, recent_traffic, cpu, memory, total_rows):
    """Turn load indicators into a star rating for a healthy database.

    Starts at 5 stars, deducts for latency/traffic/CPU/memory pressure, then
    caps the result according to the total estimated row count.
    """
    stars = 5

    if latency_ms > 500 or recent_traffic > 2000 or cpu > 90 or memory > 90:
        stars = max(stars - 2, 1)
    elif latency_ms > 300 or recent_traffic > 1000 or cpu > 75 or memory > 75:
        stars = max(stars - 1, 2)

    if total_rows > 20000:
        stars = min(stars, 1)
    elif total_rows > 10000:
        stars = min(stars, 2)
    elif total_rows > 5000:
        stars = min(stars, 3)

    return stars


def get_health_status(engine, timeout=10):
    """Return system and database health metrics with a star rating.

    Includes:
    - CPU/memory usage
    - DB connectivity & row estimates
    - Heartbeat alive flag
    - Runtime duration (monitor)

    Args:
        engine: SQLAlchemy engine used for the database probe.
        timeout: Seconds to wait for the database probe before the check is
            reported as timed out.

    Returns:
        dict with keys ``heartbeat``, ``status`` ("ok", "timeout" or "fail"),
        ``db_status``, ``db_metrics``, ``latency_ms``, ``cpu_percent``,
        ``memory_percent``, ``stars``, ``response_time_s`` and ``time``
        (naive UTC, ISO 8601). ``latency_ms`` is ``None`` unless the probe
        succeeded; ``cpu_percent``/``memory_percent`` are ``None`` when an
        unexpected error aborted the check.
    """
    # Monotonic clock: durations must not be affected by wall-clock jumps.
    started = time.monotonic()
    status = "ok"
    db_status = "ok"
    db_metrics = {}
    stars = 5
    latency_ms = None
    cpu = memory = None

    try:
        result = {}

        def run_check():
            try:
                result["db_metrics"] = _collect_db_metrics(engine)
            except Exception as e:
                result["error"] = str(e)

        # The probe runs in a worker thread so the timeout can be enforced
        # with join(). Daemon, so a probe that stays stuck after the timeout
        # cannot keep the interpreter from exiting.
        worker = Thread(target=run_check, daemon=True)
        worker.start()
        worker.join(timeout)
        timed_out = worker.is_alive()
        probe_ms = (time.monotonic() - started) * 1000

        # Sample system stats exactly once and reuse them for both the
        # rating and the payload. A second non-blocking cpu_percent() call
        # right after the first would measure a near-zero interval and
        # report a meaningless value.
        cpu = psutil.cpu_percent()
        memory = psutil.virtual_memory().percent

        if timed_out:
            status = "timeout"
            db_status = f"timeout > {timeout}s"
            stars = 1
        elif "error" in result:
            status = "fail"
            db_status = f"fail: {result['error']}"
            stars = 0
        else:
            latency_ms = probe_ms
            db_metrics = result["db_metrics"]
            total_rows = db_metrics["knowledge_count"] + db_metrics["user_memory_count"]
            stars = _rate_health(
                latency_ms,
                db_metrics["recent_traffic_1h"],
                cpu,
                memory,
                total_rows,
            )
    except Exception as e:
        # Top-level boundary: a health endpoint should report failure
        # rather than raise.
        db_status = f"fail: {str(e)}"
        status = "fail"
        stars = 0
        latency_ms = None
        cpu = memory = None
        db_metrics = {}

    elapsed = round(time.monotonic() - started, 2)

    return {
        "heartbeat": "alive",
        "status": status,
        "db_status": db_status,
        "db_metrics": db_metrics,
        # Explicit None check so a legitimate 0.0 latency is not dropped.
        "latency_ms": round(latency_ms, 2) if latency_ms is not None else None,
        "cpu_percent": cpu,
        "memory_percent": memory,
        "stars": stars,
        "response_time_s": elapsed,
        "time": _utc_now_naive().isoformat(),
    }