Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 87 additions & 28 deletions tools/health_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,23 +150,70 @@ def check_disk_usage(path: str = "/") -> Tuple[str, str, float]:


def check_memory_usage() -> Tuple[str, str, float]:
"""Check memory usage with cross-platform fallbacks."""
try:
with open("/proc/meminfo") as f:
meminfo = {}
for line in f:
parts = line.split(":")
if len(parts) == 2:
key = parts[0].strip()
value = parts[1].strip().replace(" kB", "")
try:
meminfo[key] = int(value) * 1024
except ValueError:
pass

total = meminfo.get("MemTotal", 0)
available = meminfo.get("MemAvailable", 0)
used = total - available
pct = (used / total) * 100 if total > 0 else 0
# Try Linux /proc/meminfo first
if os.path.exists("/proc/meminfo"):
with open("/proc/meminfo") as f:
meminfo = {}
for line in f:
parts = line.split(":")
if len(parts) == 2:
key = parts[0].strip()
value = parts[1].strip().replace(" kB", "")
try:
meminfo[key] = int(value) * 1024
except ValueError:
pass

total = meminfo.get("MemTotal", 0)
available = meminfo.get("MemAvailable", 0)
used = total - available
pct = (used / total) * 100 if total > 0 else 0
else:
# Cross-platform fallback for macOS/BSD
# On macOS, we can use subprocess to get memory info
try:
result = subprocess.run(
["sysctl", "-n", "hw.memsize"],
capture_output=True, text=True, timeout=5
)
total = int(result.stdout.strip())
except (subprocess.TimeoutExpired, ValueError, FileNotFoundError):
# Fallback: estimate from resource limits
# RLIMIT_AS is virtual memory limit, not actual total
# Use a reasonable default if we can't determine
total = 0

if total > 0:
# Get page size and free pages
try:
result = subprocess.run(
["vm_stat"],
capture_output=True, text=True, timeout=5
)
# Parse vm_stat output for page size and free pages
lines = result.stdout.strip().split("\n")
page_size = 4096 # Default page size
free_pages = 0
for line in lines:
if "page size of" in line:
page_size = int(line.split()[-1])
elif "Pages free" in line:
free_pages = int(line.split()[-1].rstrip("."))
available = free_pages * page_size
used = total - available
pct = (used / total) * 100 if total > 0 else 0
except (subprocess.TimeoutExpired, ValueError, FileNotFoundError):
pct = 0
used = 0
total = 0
else:
pct = 0
used = 0

if total == 0:
return "WARNING", "Cannot determine memory usage on this platform", 0

if pct < MEMORY_THRESHOLD_WARNING:
return "OK", f"{pct:.1f}% used ({used // (1024**3)}GB/{total // (1024**3)}GB)", pct
Expand All @@ -179,19 +226,31 @@ def check_memory_usage() -> Tuple[str, str, float]:


def check_load_average() -> Tuple[str, str, float]:
    """Check the 1-minute load average with cross-platform fallbacks.

    The load is read from ``/proc/loadavg`` when that file can be read and
    parsed (Linux), and from ``os.getloadavg()`` otherwise (macOS, BSD and
    most Unix-like systems). It is then expressed as a percentage of the CPU
    count reported by ``os.cpu_count()`` (treated as 1 when unknown).

    Returns:
        A ``(status, message, load)`` tuple. ``status`` is ``"OK"`` when the
        load is below 70% of the CPU count, ``"WARNING"`` below 90%, and
        ``"CRITICAL"`` otherwise; ``load`` is the raw 1-minute load average.
        When the load cannot be determined, ``status`` is ``"WARNING"``,
        ``message`` explains why, and ``load`` is ``0``.
    """
    try:
        try:
            # EAFP: open the file directly instead of checking for existence
            # first, which avoids a check-then-use race.
            with open("/proc/loadavg") as f:
                # The first whitespace-separated field is the 1-minute load.
                load = float(f.read().split()[0])
        except (OSError, ValueError, IndexError):
            # Missing, unreadable, or malformed /proc/loadavg: fall back to
            # the portable API rather than giving up immediately.
            try:
                load = os.getloadavg()[0]
            except (AttributeError, OSError):
                # AttributeError: os.getloadavg is absent on this platform
                # (e.g. Windows). OSError: the load average is unobtainable.
                return "WARNING", "Cannot determine load average on this platform", 0

        cpu_count = os.cpu_count() or 1
        load_pct = (load / cpu_count) * 100
        message = f"Load: {load} ({load_pct:.0f}% of {cpu_count} cores)"

        if load_pct < 70:
            status = "OK"
        elif load_pct < 90:
            status = "WARNING"
        else:
            status = "CRITICAL"
        return status, message, load
    except Exception as e:
        # Last-resort boundary: a health check must report, never crash.
        return "WARNING", f"Cannot check: {e}", 0

Expand Down