Switch LLM backend from llama.cpp/Qwen to OpenAI

- Default models: gpt-4o-mini (triage), gpt-4o (escalation)
- Remove Qwen-specific /no_think hacks
- Reduce timeout from 600s to 120s, increase concurrency semaphore to 4
- Support empty LLM_BASE_URL to use OpenAI directly

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 12:07:53 -05:00
parent a9bc24e48e
commit 28fb66d5f9
3 changed files with 24 additions and 31 deletions
--- a/bot.py
+++ b/bot.py
@@ -68,14 +68,14 @@ class BCSBot(commands.Bot):
        # Database (initialized async in setup_hook)
        self.db = Database()

-        # LLM clients (OpenAI-compatible — works with llama.cpp, Ollama, or OpenAI)
-        llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434")
-        llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0")
-        llm_api_key = os.getenv("LLM_API_KEY", "not-needed")
+        # LLM clients (OpenAI — set LLM_BASE_URL to override for local models)
+        llm_base_url = os.getenv("LLM_BASE_URL", "")
+        llm_model = os.getenv("LLM_MODEL", "gpt-4o-mini")
+        llm_api_key = os.getenv("LLM_API_KEY", "")
        self.llm = LLMClient(llm_base_url, llm_model, llm_api_key, db=self.db)

        # Heavy/escalation model for re-analysis, chat, and manual commands
-        llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", llm_model)
+        llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", "gpt-4o")
        self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key, db=self.db)

        # Active mode (server-wide)
@@ -118,17 +118,16 @@ class BCSBot(commands.Bot):
        await self.tree.sync()
        logger.info("Slash commands synced.")

-        # Warm up the LLM so the model is loaded into VRAM before messages arrive
-        logger.info("Warming up LLM model...")
+        # Quick connectivity check
        try:
-            resp = await self.llm._client.chat.completions.create(
+            await self.llm._client.chat.completions.create(
                model=self.llm.model,
                messages=[{"role": "user", "content": "hi"}],
                max_tokens=1,
            )
-            logger.info("LLM warm-up complete.")
+            logger.info("LLM connectivity check passed.")
        except Exception as e:
-            logger.warning("LLM warm-up failed — first messages may be slow: %s", e)
+            logger.warning("LLM connectivity check failed: %s", e)

    async def on_message(self, message: discord.Message):
        logger.info(