feat: per-model failure tracking to avoid unnecessary full restarts

Individual model probe failures are now tracked separately from router
health failures. Models get a grace period after loading, and persistently
failing models are unloaded and put in cooldown rather than triggering a
full service restart. Only router-level health failures cause restarts.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-08 12:09:07 -05:00
parent 321a43ac81
commit 174db1e5db

View File

@@ -7,6 +7,12 @@ Detects:
- Router health endpoint failures
- Zombie child model-server processes
- Loaded models that are unreachable through the router
Per-model tracking:
- Individual model failures never trigger a full service restart
- Newly loaded models get a grace period before probing
- Persistently failing models are unloaded and put in cooldown instead of causing restarts
- Only router-level health failures trigger service restarts
"""
import subprocess
@@ -14,6 +20,7 @@ import requests
import time
import signal
from datetime import datetime
from collections import defaultdict
# Configuration
SERVERS = [
@@ -25,13 +32,25 @@ LLAMA_SERVER_BIN = "/home/aj/llama.cpp/build/bin/llama-server"
MODELS_DIR = "/home/aj/models"
MODELS_PRESET = "/home/aj/models/models.ini"
CHECK_INTERVAL = 30  # seconds between checks
HEALTH_TIMEOUT = 10  # router health check timeout
DEEP_CHECK_TIMEOUT = 30  # model probe timeout
MAX_CONSECUTIVE_FAILURES = 2  # legacy alias; superseded by MAX_HEALTH_FAILURES
MAX_HEALTH_FAILURES = 2  # restart after N router health failures
MAX_MODEL_PROBE_FAILURES = 5  # ignore model after N probe failures
MODEL_LOAD_GRACE_PERIOD = 300  # skip probing models loaded within last 5 min
MODEL_FAILURE_COOLDOWN = 600  # stop probing a failed model for 10 min

# Legacy per-server failure counter; superseded by health_failures below.
failure_counts = {}
# Per-server router-level health failure tracking: {port: consecutive_failures}
health_failures = {}
# Per-model probe failure tracking: {port: {model_name: count}}
model_failures = defaultdict(lambda: defaultdict(int))
# Per-model first-seen timestamps (for the load grace period): {port: {model_name: timestamp}}
model_first_seen = defaultdict(dict)
# Per-model cooldown start timestamps: {port: {model_name: timestamp}}
model_cooldowns = defaultdict(dict)
def log(message):
@@ -49,16 +68,20 @@ def check_health(port):
def get_loaded_models(port):
    """Get list of models the router reports as loaded, with status info.

    Queries the router's /v1/models endpoint on localhost:<port>.

    Returns a list of dicts: [{"id": "model-name", "status": "loaded"}, ...].
    The status defaults to "unknown" when the router omits status info.
    Returns an empty list on a non-200 response or any request/parse error.
    """
    try:
        response = requests.get(f"http://localhost:{port}/v1/models", timeout=HEALTH_TIMEOUT)
        if response.status_code != 200:
            return []
        data = response.json()
        models = []
        for m in data.get("data", []):
            # Preserve the router-reported status so callers can skip
            # models that are still "loading" rather than probing them.
            status_value = m.get("status", {}).get("value", "unknown")
            models.append({"id": m["id"], "status": status_value})
        return models
    except Exception:
        # Router unreachable or malformed JSON - treat as "no models loaded";
        # the router-level health check handles actual outages.
        return []
@@ -80,6 +103,19 @@ def probe_model(port, model_name):
return False
def unload_model(port, model_name):
    """Ask the router to unload a specific model.

    Issues a POST to the router's /models/unload endpoint on localhost:<port>.
    Returns True only when the router answers HTTP 200; returns False on any
    other status code or on a request-level failure (timeout, connection error).
    """
    url = f"http://localhost:{port}/models/unload"
    payload = {"model": model_name}
    try:
        resp = requests.post(url, json=payload, timeout=HEALTH_TIMEOUT)
    except requests.exceptions.RequestException:
        return False
    return resp.status_code == 200
def check_zombies():
"""Check for zombie llama-server processes."""
result = subprocess.run(["ps", "aux"], capture_output=True, text=True)
@@ -145,62 +181,140 @@ def restart_server(server):
restart_manual(server)
def clear_model_tracking(port):
    """Clear all per-model tracking state for a server after restart.

    Resets probe-failure counts, first-seen timestamps, and cooldowns for
    the given port so freshly restarted models start with a clean slate.
    """
    # Clear the inner dicts in place (rather than deleting the port key)
    # so any existing references to them remain valid.
    for tracking in (model_failures, model_first_seen, model_cooldowns):
        tracking[port].clear()
def run_watchdog():
"""Main watchdog loop."""
log("llama.cpp watchdog starting...")
log(f"Config: health_failures_threshold={MAX_HEALTH_FAILURES}, "
f"model_probe_failures_threshold={MAX_MODEL_PROBE_FAILURES}, "
f"grace_period={MODEL_LOAD_GRACE_PERIOD}s, "
f"cooldown={MODEL_FAILURE_COOLDOWN}s")
for server in SERVERS:
failure_counts[server["port"]] = 0
health_failures[server["port"]] = 0
while True:
try:
now = time.time()
# --- Phase 1: Check for zombie child processes ---
zombies = check_zombies()
if zombies:
log(f"Found {len(zombies)} zombie llama-server process(es): {zombies}")
for server in SERVERS:
restart_server(server)
failure_counts[server["port"]] = 0
health_failures[server["port"]] = 0
clear_model_tracking(server["port"])
time.sleep(CHECK_INTERVAL)
continue
# --- Phase 2: Basic health checks ---
# --- Phase 2: Router health checks ---
for server in SERVERS:
port = server["port"]
name = server["name"]
if not check_health(port):
failure_counts[port] += 1
log(f"{name} health check failed ({failure_counts[port]}/{MAX_CONSECUTIVE_FAILURES})")
health_failures[port] += 1
log(f"{name} health check failed ({health_failures[port]}/{MAX_HEALTH_FAILURES})")
if failure_counts[port] >= MAX_CONSECUTIVE_FAILURES:
if health_failures[port] >= MAX_HEALTH_FAILURES:
restart_server(server)
failure_counts[port] = 0
health_failures[port] = 0
clear_model_tracking(port)
continue
# --- Phase 3: Deep check - probe loaded models ---
loaded = get_loaded_models(port)
if loaded:
all_ok = True
for model in loaded:
if not probe_model(port, model):
log(f"{name}: loaded model '{model}' is unreachable!")
all_ok = False
break
# Health check passed - reset health failure counter
if health_failures[port] > 0:
log(f"{name} router recovered after {health_failures[port]} failure(s)")
health_failures[port] = 0
if not all_ok:
failure_counts[port] += 1
log(f"{name} deep check failed ({failure_counts[port]}/{MAX_CONSECUTIVE_FAILURES})")
# --- Phase 3: Deep check - probe loaded models individually ---
models = get_loaded_models(port)
if not models:
continue
if failure_counts[port] >= MAX_CONSECUTIVE_FAILURES:
restart_server(server)
failure_counts[port] = 0
# Track which models are currently loaded so we can clean up stale entries
current_model_ids = {m["id"] for m in models}
# Clean up tracking for models that are no longer loaded
for tracking_dict in [model_first_seen, model_failures, model_cooldowns]:
if port in tracking_dict:
stale = [m for m in tracking_dict[port] if m not in current_model_ids]
for m in stale:
del tracking_dict[port][m]
for model_info in models:
model_name = model_info["id"]
model_status = model_info["status"]
# Skip models that are still loading
if model_status == "loading":
log(f"{name}: model '{model_name}' is still loading, skipping probe")
continue
# All checks passed
if failure_counts[port] > 0:
log(f"{name} recovered")
failure_counts[port] = 0
# Skip models that aren't fully loaded
if model_status != "loaded":
continue
# Track first-seen time for grace period
if model_name not in model_first_seen[port]:
model_first_seen[port][model_name] = now
log(f"{name}: new model '{model_name}' detected, "
f"grace period {MODEL_LOAD_GRACE_PERIOD}s before probing")
continue
# Skip if within grace period
age = now - model_first_seen[port][model_name]
if age < MODEL_LOAD_GRACE_PERIOD:
remaining = int(MODEL_LOAD_GRACE_PERIOD - age)
# Only log occasionally to avoid spam
if int(age) % 60 < CHECK_INTERVAL:
log(f"{name}: model '{model_name}' in grace period ({remaining}s remaining)")
continue
# Skip if in cooldown from previous failures
if model_name in model_cooldowns[port]:
cooldown_elapsed = now - model_cooldowns[port][model_name]
if cooldown_elapsed < MODEL_FAILURE_COOLDOWN:
remaining = int(MODEL_FAILURE_COOLDOWN - cooldown_elapsed)
if int(cooldown_elapsed) % 120 < CHECK_INTERVAL:
log(f"{name}: model '{model_name}' in cooldown ({remaining}s remaining)")
continue
else:
# Cooldown expired, give it another chance
log(f"{name}: model '{model_name}' cooldown expired, resuming probes")
del model_cooldowns[port][model_name]
model_failures[port][model_name] = 0
# Probe the model
if probe_model(port, model_name):
# Probe succeeded - reset failure counter
if model_failures[port][model_name] > 0:
log(f"{name}: model '{model_name}' recovered after "
f"{model_failures[port][model_name]} failure(s)")
model_failures[port][model_name] = 0
else:
# Probe failed
model_failures[port][model_name] += 1
fail_count = model_failures[port][model_name]
log(f"{name}: model '{model_name}' probe failed "
f"({fail_count}/{MAX_MODEL_PROBE_FAILURES})")
if fail_count >= MAX_MODEL_PROBE_FAILURES:
# Try to unload the bad model to free resources
if unload_model(port, model_name):
log(f"{name}: model '{model_name}' persistently unreachable, "
f"unloaded successfully, cooldown {MODEL_FAILURE_COOLDOWN}s (NO restart)")
else:
log(f"{name}: model '{model_name}' persistently unreachable, "
f"unload failed, cooldown {MODEL_FAILURE_COOLDOWN}s (NO restart)")
model_cooldowns[port][model_name] = now
time.sleep(CHECK_INTERVAL)