Switch LLM backend from llama.cpp/Qwen to OpenAI

- Default models: gpt-4o-mini (triage), gpt-4o (escalation)
- Remove Qwen-specific /no_think hacks
- Reduce timeout from 600s to 120s, increase concurrency semaphore to 4
- Treat an empty LLM_BASE_URL (now the default) as "use the OpenAI API directly"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-23 12:07:53 -05:00
parent a9bc24e48e
commit 28fb66d5f9
3 changed files with 24 additions and 31 deletions

19
bot.py
View File

@@ -68,14 +68,14 @@ class BCSBot(commands.Bot):
# Database (initialized async in setup_hook)
self.db = Database()
# LLM clients (OpenAI-compatible — works with llama.cpp, Ollama, or OpenAI)
llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434")
llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0")
llm_api_key = os.getenv("LLM_API_KEY", "not-needed")
# LLM clients (OpenAI — set LLM_BASE_URL to override for local models)
llm_base_url = os.getenv("LLM_BASE_URL", "")
llm_model = os.getenv("LLM_MODEL", "gpt-4o-mini")
llm_api_key = os.getenv("LLM_API_KEY", "")
self.llm = LLMClient(llm_base_url, llm_model, llm_api_key, db=self.db)
# Heavy/escalation model for re-analysis, chat, and manual commands
llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", llm_model)
llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", "gpt-4o")
self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key, db=self.db)
# Active mode (server-wide)
@@ -118,17 +118,16 @@ class BCSBot(commands.Bot):
await self.tree.sync()
logger.info("Slash commands synced.")
# Warm up the LLM so the model is loaded into VRAM before messages arrive
logger.info("Warming up LLM model...")
# Quick connectivity check
try:
resp = await self.llm._client.chat.completions.create(
await self.llm._client.chat.completions.create(
model=self.llm.model,
messages=[{"role": "user", "content": "hi"}],
max_tokens=1,
)
logger.info("LLM warm-up complete.")
logger.info("LLM connectivity check passed.")
except Exception as e:
logger.warning("LLM warm-up failed — first messages may be slow: %s", e)
logger.warning("LLM connectivity check failed: %s", e)
async def on_message(self, message: discord.Message):
logger.info(