Add two-tier LLM analysis with triage/escalation

The triage model (LLM_MODEL) handles every message cheaply. If the triage result has toxicity >= 0.25 (configurable via `escalation_threshold`), is flagged off_topic, or has coherence < 0.6, the message is re-analyzed with the heavy model (LLM_ESCALATION_MODEL, which falls back to LLM_MODEL when unset). Chat, image analysis, /bcs-test, and /bcs-scan always use the heavy model.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Add message batching (debounce) for rapid-fire senders
2026-02-21 18:33:36 -05:00 · 2026-02-21 18:19:01 -05:00
5 changed files with 108 additions and 16 deletions
--- a/bot.py
+++ b/bot.py
@@ -65,12 +65,16 @@ class BCSBot(commands.Bot):

        self.config = config

-        # LLM client (OpenAI-compatible — works with llama.cpp, Ollama, or OpenAI)
+        # LLM clients (OpenAI-compatible — works with llama.cpp, Ollama, or OpenAI)
        llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434")
        llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0")
        llm_api_key = os.getenv("LLM_API_KEY", "not-needed")
        self.llm = LLMClient(llm_base_url, llm_model, llm_api_key)

+        # Heavy/escalation model for re-analysis, chat, and manual commands
+        llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", llm_model)
+        self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key)
+
        # Drama tracker
        sentiment = config.get("sentiment", {})
        timeouts = config.get("timeouts", {})
@@ -167,6 +171,7 @@ class BCSBot(commands.Bot):
    async def close(self):
        await self.db.close()
        await self.llm.close()
+        await self.llm_heavy.close()
        await super().close()


--- a/cogs/chat.py
+++ b/cogs/chat.py
@@ -84,7 +84,7 @@ class ChatCog(commands.Cog):
                image_attachment.filename,
                user_text[:80],
            )
-            response = await self.bot.llm.analyze_image(
+            response = await self.bot.llm_heavy.analyze_image(
                image_bytes,
                SCOREBOARD_ROAST,
                user_text=user_text,
@@ -108,7 +108,7 @@ class ChatCog(commands.Cog):
                {"role": "user", "content": f"{score_context}\n{message.author.display_name}: {content}"}
            )

-            response = await self.bot.llm.chat(
+            response = await self.bot.llm_heavy.chat(
                list(self._chat_history[ch_id]),
                CHAT_PERSONALITY,
                on_first_token=start_typing,
--- a/cogs/commands.py
+++ b/cogs/commands.py
@@ -126,9 +126,19 @@ class CommandsCog(commands.Cog):
            inline=True,
        )
        embed.add_field(
-            name="LLM",
-            value=f"`{self.bot.llm.model}` @ `{self.bot.llm.host}`",
-            inline=False,
+            name="Triage Model",
+            value=f"`{self.bot.llm.model}`",
+            inline=True,
+        )
+        embed.add_field(
+            name="Escalation Model",
+            value=f"`{self.bot.llm_heavy.model}`",
+            inline=True,
+        )
+        embed.add_field(
+            name="LLM Host",
+            value=f"`{self.bot.llm.host}`",
+            inline=True,
        )

        await interaction.response.send_message(embed=embed, ephemeral=True)
@@ -301,7 +311,7 @@ class CommandsCog(commands.Cog):
                else "(no prior context)"
            )

-            result = await self.bot.llm.analyze_message(msg.content, context)
+            result = await self.bot.llm_heavy.analyze_message(msg.content, context)
            if result is None:
                embed = discord.Embed(
                    title=f"Analysis: {msg.author.display_name}",
@@ -374,7 +384,7 @@ class CommandsCog(commands.Cog):
            channel_context = "\n".join(lines)

        user_notes = self.bot.drama_tracker.get_user_notes(interaction.user.id)
-        raw, parsed = await self.bot.llm.raw_analyze(
+        raw, parsed = await self.bot.llm_heavy.raw_analyze(
            message, user_notes=user_notes, channel_context=channel_context,
        )

--- a/cogs/sentiment.py
+++ b/cogs/sentiment.py
@@ -21,12 +21,22 @@ class SentimentCog(commands.Cog):
        self._dirty_users: set[int] = set()
        # Per-user redirect cooldown: {user_id: last_redirect_datetime}
        self._redirect_cooldowns: dict[int, datetime] = {}
+        # Debounce buffer: keyed by (channel_id, user_id), stores list of messages
+        self._message_buffer: dict[tuple[int, int], list[discord.Message]] = {}
+        # Pending debounce timer tasks
+        self._debounce_tasks: dict[tuple[int, int], asyncio.Task] = {}

    async def cog_load(self):
        self._flush_states.start()

    async def cog_unload(self):
        self._flush_states.cancel()
+        # Cancel all pending debounce timers and process remaining buffers
+        for task in self._debounce_tasks.values():
+            task.cancel()
+        self._debounce_tasks.clear()
+        for key in list(self._message_buffer):
+            await self._process_buffered(key)
        # Final flush on shutdown
        await self._flush_dirty_states()

@@ -75,27 +85,93 @@ class SentimentCog(commands.Cog):
        if not message.content or not message.content.strip():
            return

-        # Check per-user analysis cooldown
-        sentiment_config = config.get("sentiment", {})
-        cooldown = sentiment_config.get("cooldown_between_analyses", 2)
-        if not self.bot.drama_tracker.can_analyze(message.author.id, cooldown):
+        # Buffer the message and start/reset debounce timer
+        key = (message.channel.id, message.author.id)
+        if key not in self._message_buffer:
+            self._message_buffer[key] = []
+        self._message_buffer[key].append(message)
+
+        # Cancel existing debounce timer for this user+channel
+        existing_task = self._debounce_tasks.get(key)
+        if existing_task and not existing_task.done():
+            existing_task.cancel()
+
+        # Start new debounce timer
+        batch_window = config.get("sentiment", {}).get("batch_window_seconds", 3)
+        self._debounce_tasks[key] = asyncio.create_task(
+            self._debounce_then_process(key, batch_window)
+        )
+
+    async def _debounce_then_process(self, key: tuple[int, int], delay: float):
+        """Sleep for the debounce window, then process the buffered messages."""
+        try:
+            await asyncio.sleep(delay)
+            await self._process_buffered(key)
+        except asyncio.CancelledError:
+            pass  # Timer was reset by a new message — expected
+
+    async def _process_buffered(self, key: tuple[int, int]):
+        """Combine buffered messages and run the analysis pipeline once."""
+        messages = self._message_buffer.pop(key, [])
+        self._debounce_tasks.pop(key, None)
+
+        if not messages:
            return

+        # Use the last message as the reference for channel, author, guild, etc.
+        message = messages[-1]
+        combined_content = "\n".join(m.content for m in messages if m.content and m.content.strip())
+
+        if not combined_content.strip():
+            return
+
+        batch_count = len(messages)
+        if batch_count > 1:
+            logger.info(
+                "Batched %d messages from %s in #%s",
+                batch_count, message.author.display_name,
+                getattr(message.channel, 'name', 'unknown'),
+            )
+
+        config = self.bot.config
+        monitoring = config.get("monitoring", {})
+        sentiment_config = config.get("sentiment", {})
+
        # Build channel context for game detection
        game_channels = config.get("game_channels", {})
        channel_context = self._build_channel_context(message, game_channels)

-        # Analyze the message
+        # Analyze the combined message (triage with lightweight model)
        context = self._get_context(message)
        user_notes = self.bot.drama_tracker.get_user_notes(message.author.id)
        result = await self.bot.llm.analyze_message(
-            message.content, context, user_notes=user_notes,
+            combined_content, context, user_notes=user_notes,
            channel_context=channel_context,
        )

        if result is None:
            return

+        # Escalation: re-analyze with heavy model if triage flags something
+        escalation_threshold = sentiment_config.get("escalation_threshold", 0.25)
+        needs_escalation = (
+            result["toxicity_score"] >= escalation_threshold
+            or result.get("off_topic", False)
+            or result.get("coherence_score", 1.0) < 0.6
+        )
+        if needs_escalation:
+            triage_score = result["toxicity_score"]
+            heavy_result = await self.bot.llm_heavy.analyze_message(
+                combined_content, context, user_notes=user_notes,
+                channel_context=channel_context,
+            )
+            if heavy_result is not None:
+                logger.info(
+                    "Escalated to heavy model (triage_score=%.2f) for %s",
+                    triage_score, message.author.display_name,
+                )
+                result = heavy_result
+
        score = result["toxicity_score"]
        categories = result["categories"]
        reasoning = result["reasoning"]
@@ -128,7 +204,7 @@ class SentimentCog(commands.Cog):
            channel_id=message.channel.id,
            user_id=message.author.id,
            username=message.author.display_name,
-            content=message.content,
+            content=combined_content,
            message_ts=message.created_at.replace(tzinfo=timezone.utc),
            toxicity_score=score,
            drama_score=drama_score,
--- a/config.yaml
+++ b/config.yaml
@@ -17,7 +17,8 @@ sentiment:
  context_messages: 3  # Number of previous messages to include as context
  rolling_window_size: 10  # Number of messages to track per user
  rolling_window_minutes: 15  # Time window for tracking
-  cooldown_between_analyses: 2  # Seconds between analyzing same user's messages
+  batch_window_seconds: 3  # Wait this long for more messages before analyzing (debounce)
+  escalation_threshold: 0.25  # Triage toxicity score that triggers re-analysis with heavy model

 game_channels:
  gta-online: "GTA Online"