commit 321a43ac81f0f366099a8891f883be9d77b05d0e
Author: AJ Isaacs
Date:   Fri Feb 6 22:47:12 2026 -0500

    Initial commit: llama.cpp watchdog service

    Monitors llama-server health with multi-phase checks (zombie detection,
    health endpoint, loaded-model probing) and auto-restarts via systemd or
    manual relaunch after consecutive failures.

    Co-Authored-By: Claude Opus 4.6

diff --git a/llamacpp-watchdog.py b/llamacpp-watchdog.py
new file mode 100644
index 0000000..0ba4fc6
--- /dev/null
+++ b/llamacpp-watchdog.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+"""
+llama.cpp Watchdog Service
+Monitors llama-server health and restarts on failure.
+
+Detects:
+  - Router health endpoint failures
+  - Zombie child model-server processes
+  - Loaded models that are unreachable through the router
+"""
+
+import subprocess
+import time
+from datetime import datetime
+
+import requests
+
+# Configuration
+SERVERS = [
+    {"name": "llama-main", "port": 11434, "service": "llama-cpp"},
+    {"name": "llama-alt", "port": 8082, "service": None},
+]
+
+LLAMA_SERVER_BIN = "/home/aj/llama.cpp/build/bin/llama-server"
+MODELS_DIR = "/home/aj/models"
+MODELS_PRESET = "/home/aj/models/models.ini"
+
+CHECK_INTERVAL = 30  # seconds between health checks
+HEALTH_TIMEOUT = 10  # seconds to wait for health response
+DEEP_CHECK_TIMEOUT = 30  # seconds to wait for model probe
+MAX_CONSECUTIVE_FAILURES = 2  # restart after this many failures
+
+# Track failures per server
+failure_counts = {}
+
+
+def log(message):
+    """Log with timestamp."""
+    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {message}", flush=True)
+
+
+def check_health(port):
+    """Check if the router process is responding."""
+    try:
+        response = requests.get(f"http://localhost:{port}/health", timeout=HEALTH_TIMEOUT)
+        return response.status_code == 200
+    except requests.exceptions.RequestException:
+        return False
+
+
+def get_loaded_models(port):
+    """Get the list of models the router reports as loaded."""
+    try:
+        response = requests.get(f"http://localhost:{port}/v1/models", timeout=HEALTH_TIMEOUT)
+        if response.status_code != 200:
+            return []
+        data = response.json()
+        return [
+            m["id"] for m in data.get("data", [])
+            if m.get("status", {}).get("value") == "loaded"
+        ]
+    except Exception:
+        return []
+
+
+def probe_model(port, model_name):
+    """Send a minimal chat completion request to verify a loaded model actually works."""
+    try:
+        response = requests.post(
+            f"http://localhost:{port}/v1/chat/completions",
+            json={
+                "model": model_name,
+                "messages": [{"role": "user", "content": "hi"}],
+                "max_tokens": 1,
+            },
+            timeout=DEEP_CHECK_TIMEOUT,
+        )
+        return response.status_code == 200
+    except requests.exceptions.RequestException:
+        return False
+
+
+def check_zombies():
+    """Check for zombie llama-server processes."""
+    result = subprocess.run(["ps", "aux"], capture_output=True, text=True)
+    zombies = []
+    for line in result.stdout.split("\n"):
+        if "llama-server" in line and "<defunct>" in line:  # ps marks zombies as <defunct>
+            parts = line.split()
+            if len(parts) >= 2:
+                zombies.append(parts[1])  # PID is the second column of ps aux output
+    return zombies
+
+
+def restart_via_systemd(service_name):
+    """Restart a server using systemd."""
+    log(f"Restarting {service_name} via systemd...")
+    result = subprocess.run(
+        ["sudo", "systemctl", "restart", service_name],
+        capture_output=True,
+        text=True,
+    )
+    if result.returncode == 0:
+        log(f"{service_name} restart command succeeded")
+    else:
+        log(f"{service_name} restart failed: {result.stderr.strip()}")
+    # Give it time to come up
+    time.sleep(5)
+
+
+def restart_manual(server):
+    """Restart a server that has no systemd service by killing and re-launching."""
+    port = server["port"]
+    host = server.get("host", "0.0.0.0")
+    name = server["name"]
+
+    log(f"Restarting {name} (manual, port {port})...")
+
+    # Kill existing
+    subprocess.run(["pkill", "-9", "-f", f"llama-server.*--port.*{port}"], capture_output=True)
+    time.sleep(2)
+
+    cmd = [
+        LLAMA_SERVER_BIN,
+        "--host", host,
+        "--port", str(port),
+        "--models-dir", MODELS_DIR,
+        "--models-preset", MODELS_PRESET,
+    ]
+    subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, start_new_session=True)
+    time.sleep(5)
+
+    if check_health(port):
+        log(f"{name} started successfully")
+    else:
+        log(f"{name} failed to start")
+
+
+def restart_server(server):
+    """Restart a server using the appropriate method."""
+    service = server.get("service")
+    if service:
+        restart_via_systemd(service)
+    else:
+        restart_manual(server)
+
+
+def run_watchdog():
+    """Main watchdog loop."""
+    log("llama.cpp watchdog starting...")
+
+    for server in SERVERS:
+        failure_counts[server["port"]] = 0
+
+    while True:
+        try:
+            # --- Phase 1: Check for zombie child processes ---
+            zombies = check_zombies()
+            if zombies:
+                log(f"Found {len(zombies)} zombie llama-server process(es): {zombies}")
+                for server in SERVERS:
+                    restart_server(server)
+                    failure_counts[server["port"]] = 0
+                time.sleep(CHECK_INTERVAL)
+                continue
+
+            # --- Phase 2: Basic health checks ---
+            for server in SERVERS:
+                port = server["port"]
+                name = server["name"]
+
+                if not check_health(port):
+                    failure_counts[port] += 1
+                    log(f"{name} health check failed ({failure_counts[port]}/{MAX_CONSECUTIVE_FAILURES})")
+
+                    if failure_counts[port] >= MAX_CONSECUTIVE_FAILURES:
+                        restart_server(server)
+                        failure_counts[port] = 0
+                    continue
+
+                # --- Phase 3: Deep check - probe loaded models ---
+                loaded = get_loaded_models(port)
+                if loaded:
+                    all_ok = True
+                    for model in loaded:
+                        if not probe_model(port, model):
+                            log(f"{name}: loaded model '{model}' is unreachable!")
+                            all_ok = False
+                            break
+
+                    if not all_ok:
+                        failure_counts[port] += 1
+                        log(f"{name} deep check failed ({failure_counts[port]}/{MAX_CONSECUTIVE_FAILURES})")
+
+                        if failure_counts[port] >= MAX_CONSECUTIVE_FAILURES:
+                            restart_server(server)
+                            failure_counts[port] = 0
+                        continue
+
+                # All checks passed
+                if failure_counts[port] > 0:
+                    log(f"{name} recovered")
+                    failure_counts[port] = 0
+
+            time.sleep(CHECK_INTERVAL)
+
+        except KeyboardInterrupt:
+            log("Watchdog stopping...")
+            break
+        except Exception as e:
+            log(f"Watchdog error: {e}")
+            time.sleep(CHECK_INTERVAL)
+
+
+if __name__ == "__main__":
+    run_watchdog()

diff --git a/llamacpp-watchdog.service b/llamacpp-watchdog.service
new file mode 100644
index 0000000..1d49b92
--- /dev/null
+++ b/llamacpp-watchdog.service
@@ -0,0 +1,18 @@
+[Unit]
+Description=llama.cpp Watchdog Service
+After=network.target
+
+[Service]
+Type=simple
+User=aj
+ExecStart=/usr/bin/python3 /home/aj/llamacpp-watchdog/llamacpp-watchdog.py
+Restart=always
+RestartSec=10
+
+# Logging
+StandardOutput=journal
+StandardError=journal
+SyslogIdentifier=llamacpp-watchdog
+
+[Install]
+WantedBy=multi-user.target
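
A minimal deployment sketch, assuming the repo is checked out at /home/aj/llamacpp-watchdog as the unit's ExecStart expects:

    # Install the unit, reload systemd, and start the watchdog at boot and now
    sudo cp llamacpp-watchdog.service /etc/systemd/system/
    sudo systemctl daemon-reload
    sudo systemctl enable --now llamacpp-watchdog
    # Follow the watchdog's output under its SyslogIdentifier
    journalctl -u llamacpp-watchdog -f

Note that the watchdog runs as User=aj but invokes "sudo systemctl restart" non-interactively, so a passwordless sudoers rule covering systemctl restart of the managed services is presumably required for automatic restarts to succeed.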