Support hybrid LLM: local Qwen triage + OpenAI escalation

Triage analysis runs on Qwen3 8B (athena.lan) as a free first pass.
Escalation, chat, image roasts, and commands use GPT-4o via OpenAI.

Each tier gets its own base URL, API key, and concurrency settings.
Local models get /no_think and serialized requests automatically.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
commit 8a06ddbd6e (parent b5e401f036)
Date: 2026-02-23 12:20:07 -05:00
3 changed files with 49 additions and 18 deletions


@@ -1,7 +1,12 @@
 DISCORD_BOT_TOKEN=your_token_here
-LLM_BASE_URL=
-LLM_MODEL=gpt-4o-mini
+# Triage model (local llama.cpp / Ollama — leave BASE_URL empty for OpenAI)
+LLM_BASE_URL=http://athena.lan:11434
+LLM_MODEL=Qwen3-8B-Q6_K
+LLM_API_KEY=not-needed
+# Escalation model (OpenAI — leave BASE_URL empty for OpenAI)
+LLM_ESCALATION_BASE_URL=
+LLM_ESCALATION_MODEL=gpt-4o
-LLM_API_KEY=your_openai_api_key_here
+LLM_ESCALATION_API_KEY=your_openai_api_key_here
 # Database
 MSSQL_SA_PASSWORD=YourStrong!Passw0rd
 DB_CONNECTION_STRING=DRIVER={ODBC Driver 18 for SQL Server};SERVER=localhost,1433;DATABASE=BreehaviorMonitor;UID=sa;PWD=YourStrong!Passw0rd;TrustServerCertificate=yes
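The changed client code itself is not shown in this diff, but the per-tier behavior the commit message describes — separate base URL/key per tier, plus `/no_think` and serialized requests for the local model — could be sketched roughly like this. All names here (`LLMTier`, `load_tiers`, `run_serialized`) are hypothetical, not taken from the repository:

```python
import threading
from dataclasses import dataclass

@dataclass
class LLMTier:
    """One model tier resolved from the env keys in the diff above."""
    base_url: str
    model: str
    api_key: str

    @property
    def is_local(self) -> bool:
        # Assumption: a non-empty base URL means a self-hosted endpoint;
        # an empty one falls through to the OpenAI default.
        return bool(self.base_url)

def load_tiers(env: dict) -> dict:
    """Build the triage and escalation tiers from an env-style mapping."""
    return {
        "triage": LLMTier(
            base_url=env.get("LLM_BASE_URL", ""),
            model=env.get("LLM_MODEL", ""),
            api_key=env.get("LLM_API_KEY", ""),
        ),
        "escalation": LLMTier(
            base_url=env.get("LLM_ESCALATION_BASE_URL", ""),
            model=env.get("LLM_ESCALATION_MODEL", ""),
            api_key=env.get("LLM_ESCALATION_API_KEY", ""),
        ),
    }

def prepare_prompt(tier: LLMTier, prompt: str) -> str:
    """Append Qwen's /no_think directive only when targeting a local model."""
    return f"{prompt} /no_think" if tier.is_local else prompt

# Single lock: requests to the one local GPU box run one at a time,
# while OpenAI calls stay fully concurrent.
_local_lock = threading.Lock()

def run_serialized(tier: LLMTier, call):
    """Run `call` under the lock for local tiers, directly otherwise."""
    if tier.is_local:
        with _local_lock:
            return call()
    return call()
```

The key design point from the commit is that tier selection is pure configuration: leaving a tier's `BASE_URL` empty routes it to OpenAI, while pointing it at athena.lan makes it local and opts it into the `/no_think` and serialization behavior.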