Initial commit: llama.cpp watchdog service
Monitors llama-server health with multi-phase checks (zombie detection, health endpoint, loaded model probing) and auto-restarts via systemd or manual relaunch on consecutive failures. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
216
llamacpp-watchdog.py
Normal file
216
llamacpp-watchdog.py
Normal file
@@ -0,0 +1,216 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
llama.cpp Watchdog Service
|
||||
Monitors llama-server health and restarts on failure.
|
||||
|
||||
Detects:
|
||||
- Router health endpoint failures
|
||||
- Zombie child model-server processes
|
||||
- Loaded models that are unreachable through the router
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import requests
|
||||
import time
|
||||
import signal
|
||||
from datetime import datetime
|
||||
|
||||
# Configuration
#
# Each entry in SERVERS describes one llama-server router to monitor:
#   name    - label used in log messages
#   port    - local port of the router's HTTP API
#   service - systemd unit to restart on failure, or None to restart
#             manually (kill + relaunch); an optional "host" key may set
#             the bind address used for a manual relaunch
SERVERS = [
    {"name": "llama-main", "port": 11434, "service": "llama-cpp"},
    {"name": "llama-alt", "port": 8082, "service": None},
]

# Paths used only when relaunching a server manually (no systemd unit).
LLAMA_SERVER_BIN = "/home/aj/llama.cpp/build/bin/llama-server"
MODELS_DIR = "/home/aj/models"
MODELS_PRESET = "/home/aj/models/models.ini"

CHECK_INTERVAL = 30  # seconds between health checks
HEALTH_TIMEOUT = 10  # seconds to wait for health response
DEEP_CHECK_TIMEOUT = 30  # seconds to wait for model probe
MAX_CONSECUTIVE_FAILURES = 2  # restart after this many failures

# Track failures per server, keyed by port; populated by run_watchdog().
failure_counts = {}
|
||||
|
||||
|
||||
def log(message):
    """Print *message* to stdout prefixed with a timestamp, flushing immediately."""
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{stamp}] {message}", flush=True)
|
||||
|
||||
|
||||
def check_health(port):
    """Return True when the router on *port* answers /health with HTTP 200.

    Any connection error or timeout counts as unhealthy.
    """
    url = f"http://localhost:{port}/health"
    try:
        resp = requests.get(url, timeout=HEALTH_TIMEOUT)
    except requests.exceptions.RequestException:
        return False
    return resp.status_code == 200
|
||||
|
||||
|
||||
def get_loaded_models(port):
    """Return ids of models the router on *port* reports with status 'loaded'.

    Best-effort: any HTTP error, timeout, non-200 status, or malformed payload
    yields an empty list so the caller simply skips the deep check this cycle.
    """
    loaded = []
    try:
        resp = requests.get(f"http://localhost:{port}/v1/models", timeout=HEALTH_TIMEOUT)
        if resp.status_code == 200:
            for entry in resp.json().get("data", []):
                if entry.get("status", {}).get("value") == "loaded":
                    loaded.append(entry["id"])
    except Exception:
        return []
    return loaded
|
||||
|
||||
|
||||
def probe_model(port, model_name):
    """Return True when *model_name* answers a 1-token chat completion on *port*.

    This verifies the model is actually reachable through the router, not just
    listed as loaded. Connection errors and timeouts count as failure.
    """
    payload = {
        "model": model_name,
        "messages": [{"role": "user", "content": "hi"}],
        "max_tokens": 1,
    }
    url = f"http://localhost:{port}/v1/chat/completions"
    try:
        resp = requests.post(url, json=payload, timeout=DEEP_CHECK_TIMEOUT)
    except requests.exceptions.RequestException:
        return False
    return resp.status_code == 200
|
||||
|
||||
|
||||
def check_zombies():
    """Return the PIDs (as strings) of defunct llama-server processes.

    Scans `ps aux` output for rows mentioning both "llama-server" and
    "<defunct>"; the PID is the second whitespace-separated field.
    """
    ps = subprocess.run(["ps", "aux"], capture_output=True, text=True)
    pids = []
    for row in ps.stdout.splitlines():
        if "llama-server" not in row or "<defunct>" not in row:
            continue
        fields = row.split()
        if len(fields) >= 2:
            pids.append(fields[1])
    return pids
|
||||
|
||||
|
||||
def restart_via_systemd(service_name):
    """Restart *service_name* with `sudo systemctl restart` and log the outcome.

    Always sleeps a few seconds afterwards so the unit has a chance to come
    up before the next health check.
    """
    log(f"Restarting {service_name} via systemd...")
    proc = subprocess.run(
        ["sudo", "systemctl", "restart", service_name],
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        log(f"{service_name} restart failed: {proc.stderr.strip()}")
    else:
        log(f"{service_name} restart command succeeded")
    # Give it time to come up
    time.sleep(5)
|
||||
|
||||
|
||||
def restart_manual(server):
    """Restart a server that has no systemd service by killing and re-launching.

    Kills any llama-server bound to the server's port, then spawns a detached
    replacement pointing at the shared models directory/preset, and logs
    whether the new process passes a health check after a startup grace period.

    *server* is a SERVERS-style dict; "port" and "name" are required, and an
    optional "host" key sets the bind address (default 0.0.0.0).
    """
    port = server["port"]
    host = server.get("host", "0.0.0.0")
    name = server["name"]

    log(f"Restarting {name} (manual, port {port})...")

    # Kill existing. Anchor the port number with a trailing space or
    # end-of-line so a short port cannot match a longer one (e.g. the old
    # unanchored pattern for port 80 would also have matched port 8082).
    # Assumes the target was launched with "--port <n>" as separate args,
    # which is how this function launches it below.
    subprocess.run(
        ["pkill", "-9", "-f", f"llama-server.*--port {port}( |$)"],
        capture_output=True,
    )
    time.sleep(2)

    cmd = [
        LLAMA_SERVER_BIN,
        "--host", host,
        "--port", str(port),
        "--models-dir", MODELS_DIR,
        "--models-preset", MODELS_PRESET,
    ]
    # Detach into its own session so the child survives watchdog restarts;
    # its output is discarded (the server has its own logging).
    subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, start_new_session=True)
    time.sleep(5)

    if check_health(port):
        log(f"{name} started successfully")
    else:
        log(f"{name} failed to start")
|
||||
|
||||
|
||||
def restart_server(server):
    """Restart *server* via systemd when it has a service name, else manually."""
    unit = server.get("service")
    if not unit:
        restart_manual(server)
    else:
        restart_via_systemd(unit)
|
||||
|
||||
|
||||
def run_watchdog():
    """Main watchdog loop.

    Runs forever, sleeping CHECK_INTERVAL seconds between passes. Each pass:
      Phase 1: scan for zombie llama-server children; if any are found,
               restart every configured server (the zombie PIDs are not
               attributed to a specific port).
      Phase 2: hit each router's /health endpoint.
      Phase 3: for healthy routers, probe each model reported as loaded
               with a minimal completion request.
    A server is restarted once its failure counter reaches
    MAX_CONSECUTIVE_FAILURES; the counter resets after a restart or after
    a fully clean pass. Only KeyboardInterrupt exits the loop; any other
    exception is logged and the loop continues after a sleep.
    """
    log("llama.cpp watchdog starting...")

    # Initialize per-server failure counters, keyed by port.
    for server in SERVERS:
        failure_counts[server["port"]] = 0

    while True:
        try:
            # --- Phase 1: Check for zombie child processes ---
            zombies = check_zombies()
            if zombies:
                log(f"Found {len(zombies)} zombie llama-server process(es): {zombies}")
                # Restart everything: a zombie is a symptom of a wedged
                # parent, and we can't map a zombie PID to one server.
                for server in SERVERS:
                    restart_server(server)
                    failure_counts[server["port"]] = 0
                time.sleep(CHECK_INTERVAL)
                continue

            # --- Phase 2: Basic health checks ---
            for server in SERVERS:
                port = server["port"]
                name = server["name"]

                if not check_health(port):
                    failure_counts[port] += 1
                    log(f"{name} health check failed ({failure_counts[port]}/{MAX_CONSECUTIVE_FAILURES})")

                    if failure_counts[port] >= MAX_CONSECUTIVE_FAILURES:
                        restart_server(server)
                        failure_counts[port] = 0
                    # Skip the deep check for a server that already failed
                    # the basic health check this pass.
                    continue

                # --- Phase 3: Deep check - probe loaded models ---
                loaded = get_loaded_models(port)
                if loaded:
                    all_ok = True
                    for model in loaded:
                        if not probe_model(port, model):
                            log(f"{name}: loaded model '{model}' is unreachable!")
                            all_ok = False
                            # One unreachable model is enough to fail the
                            # pass; no need to probe the rest.
                            break

                    if not all_ok:
                        failure_counts[port] += 1
                        log(f"{name} deep check failed ({failure_counts[port]}/{MAX_CONSECUTIVE_FAILURES})")

                        if failure_counts[port] >= MAX_CONSECUTIVE_FAILURES:
                            restart_server(server)
                            failure_counts[port] = 0
                        continue

                # All checks passed
                if failure_counts[port] > 0:
                    log(f"{name} recovered")
                    failure_counts[port] = 0

            time.sleep(CHECK_INTERVAL)

        except KeyboardInterrupt:
            log("Watchdog stopping...")
            break
        except Exception as e:
            # Keep the watchdog alive on unexpected errors; log and retry
            # after the normal interval.
            log(f"Watchdog error: {e}")
            time.sleep(CHECK_INTERVAL)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Runs forever; the accompanying systemd unit (Restart=always) relaunches
    # the watchdog itself if it ever exits.
    run_watchdog()
|
||||
18
llamacpp-watchdog.service
Normal file
18
llamacpp-watchdog.service
Normal file
@@ -0,0 +1,18 @@
|
||||
[Unit]
Description=llama.cpp Watchdog Service
# Start only after basic networking is up; the watchdog polls localhost HTTP.
After=network.target

[Service]
Type=simple
User=aj
ExecStart=/usr/bin/python3 /home/aj/llamacpp-watchdog/llamacpp-watchdog.py
# Relaunch the watchdog itself if it ever exits or crashes.
Restart=always
RestartSec=10

# Logging
StandardOutput=journal
StandardError=journal
SyslogIdentifier=llamacpp-watchdog

[Install]
WantedBy=multi-user.target
|
||||
Reference in New Issue
Block a user