Support hybrid LLM: local Qwen triage + OpenAI escalation

Triage analysis runs on Qwen 8B (athena.lan) as a free first pass.
Escalation, chat, image roasts, and commands use GPT-4o via OpenAI.

Each tier gets its own base URL, API key, and concurrency settings.
Local models automatically get /no_think appended and have their requests serialized.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-23 12:20:07 -05:00
parent b5e401f036
commit 8a06ddbd6e
3 changed files with 49 additions and 18 deletions

22
bot.py
View File

@@ -68,15 +68,25 @@ class BCSBot(commands.Bot):
# Database (initialized async in setup_hook)
self.db = Database()
# LLM clients (OpenAI — set LLM_BASE_URL to override for local models)
# Triage LLM (local Qwen on athena for cheap first-pass analysis)
llm_base_url = os.getenv("LLM_BASE_URL", "")
llm_model = os.getenv("LLM_MODEL", "gpt-4o-mini")
llm_api_key = os.getenv("LLM_API_KEY", "")
self.llm = LLMClient(llm_base_url, llm_model, llm_api_key, db=self.db)
llm_api_key = os.getenv("LLM_API_KEY", "not-needed")
is_local = bool(llm_base_url)
self.llm = LLMClient(
llm_base_url, llm_model, llm_api_key, db=self.db,
no_think=is_local, concurrency=1 if is_local else 4,
)
# Heavy/escalation model for re-analysis, chat, and manual commands
llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", "gpt-4o")
self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key, db=self.db)
# Heavy/escalation LLM (OpenAI for re-analysis, chat, image roasts, commands)
esc_base_url = os.getenv("LLM_ESCALATION_BASE_URL", "")
esc_model = os.getenv("LLM_ESCALATION_MODEL", "gpt-4o")
esc_api_key = os.getenv("LLM_ESCALATION_API_KEY", llm_api_key)
esc_is_local = bool(esc_base_url)
self.llm_heavy = LLMClient(
esc_base_url, esc_model, esc_api_key, db=self.db,
no_think=esc_is_local, concurrency=1 if esc_is_local else 4,
)
# Active mode (server-wide)
modes_config = config.get("modes", {})