Add LLM warm-up request at startup to preload model into VRAM

Sends a minimal 1-token completion during setup_hook so the model is
ready before Discord messages start arriving, avoiding connection
errors and slow first responses after a restart.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-21 15:16:52 -05:00
parent b410200146
commit cf88f003ba

12
bot.py
View File

@@ -97,6 +97,18 @@ class BCSBot(commands.Bot):
await self.tree.sync()
logger.info("Slash commands synced.")
# Warm up the LLM so the model is loaded into VRAM before messages arrive
logger.info("Warming up LLM model...")
try:
resp = await self.llm._client.chat.completions.create(
model=self.llm.model,
messages=[{"role": "user", "content": "hi"}],
max_tokens=1,
)
logger.info("LLM warm-up complete.")
except Exception as e:
logger.warning("LLM warm-up failed — first messages may be slow: %s", e)
async def on_message(self, message: discord.Message):
logger.info(
"EVENT on_message from %s in #%s: %s",