From cf88f003ba09bb27707d866dfcc2fa687a2857d8 Mon Sep 17 00:00:00 2001 From: AJ Isaacs Date: Sat, 21 Feb 2026 15:16:52 -0500 Subject: [PATCH] Add LLM warm-up request at startup to preload model into VRAM Sends a minimal 1-token completion during setup_hook so the model is ready before Discord messages start arriving, avoiding connection errors and slow first responses after a restart. Co-Authored-By: Claude Opus 4.6 --- bot.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/bot.py b/bot.py index 6e1b1be..fc79ade 100644 --- a/bot.py +++ b/bot.py @@ -97,6 +97,18 @@ class BCSBot(commands.Bot): await self.tree.sync() logger.info("Slash commands synced.") + # Warm up the LLM so the model is loaded into VRAM before messages arrive + logger.info("Warming up LLM model...") + try: + resp = await self.llm._client.chat.completions.create( + model=self.llm.model, + messages=[{"role": "user", "content": "hi"}], + max_tokens=1, + ) + logger.info("LLM warm-up complete.") + except Exception as e: + logger.warning("LLM warm-up failed — first messages may be slow: %s", e) + async def on_message(self, message: discord.Message): logger.info( "EVENT on_message from %s in #%s: %s",