From cf88f003ba09bb27707d866dfcc2fa687a2857d8 Mon Sep 17 00:00:00 2001
From: AJ Isaacs <ajisaacs27@gmail.com>
Date: Sat, 21 Feb 2026 15:16:52 -0500
Subject: [PATCH] Add LLM warm-up request at startup to preload model into VRAM

Sends a minimal 1-token completion during setup_hook so the model is
ready before Discord messages start arriving, avoiding connection
errors and slow first responses after a restart.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 bot.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/bot.py b/bot.py
index 6e1b1be..fc79ade 100644
--- a/bot.py
+++ b/bot.py
@@ -97,6 +97,18 @@ class BCSBot(commands.Bot):
         await self.tree.sync()
         logger.info("Slash commands synced.")
 
+        # Warm up the LLM so the model is loaded into VRAM before messages arrive
+        logger.info("Warming up LLM model...")
+        try:
+            resp = await self.llm._client.chat.completions.create(
+                model=self.llm.model,
+                messages=[{"role": "user", "content": "hi"}],
+                max_tokens=1,
+            )
+            logger.info("LLM warm-up complete.")
+        except Exception as e:
+            logger.warning("LLM warm-up failed — first messages may be slow: %s", e)
+
     async def on_message(self, message: discord.Message):
         logger.info(
             "EVENT on_message from %s in #%s: %s",