From 7417908142e1921cbd9b6507940a3490c6e90ada Mon Sep 17 00:00:00 2001 From: AJ Isaacs Date: Wed, 25 Feb 2026 15:48:02 -0500 Subject: [PATCH] fix: separate context from new messages so prior-cycle chat doesn't inflate scores The conversation analysis was re-scoring old messages alongside new ones, causing users to get penalized repeatedly for already-scored messages. A "--- NEW MESSAGES ---" separator now marks which messages are new, and the prompt instructs the LLM to score only those. Also fixes bot-mention detection to require an explicit @mention in message text rather than treating reply-pings as scans (so toxic replies to bot warnings aren't silently skipped). Co-Authored-By: Claude Opus 4.6 --- cogs/sentiment.py | 28 +++++++++++++++------------- prompts/analysis.txt | 6 ++++-- utils/llm_client.py | 14 ++++++++++++-- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/cogs/sentiment.py b/cogs/sentiment.py index 2e90d2a..8612f51 100644 --- a/cogs/sentiment.py +++ b/cogs/sentiment.py @@ -87,19 +87,18 @@ class SentimentCog(commands.Cog): if self.bot.drama_tracker.is_immune(message.author.id): return - # Messages directed at the bot (mentions, replies) shouldn't be scored - # for toxicity — but @mentions can trigger a scan of recent chat - directed_at_bot = self.bot.user in message.mentions - if not directed_at_bot and message.reference and message.reference.message_id: - ref = message.reference.cached_message - if ref and ref.author.id == self.bot.user.id: - directed_at_bot = True - if directed_at_bot: - # @mention (not just reply-to-bot) triggers a mention scan - if self.bot.user in message.mentions: - mention_config = config.get("mention_scan", {}) - if mention_config.get("enabled", True): - await self._maybe_start_mention_scan(message, mention_config) + # Explicit @mention of the bot triggers a mention scan instead of scoring. + # Reply-pings (Discord auto-adds replied-to user to mentions) should NOT + # trigger scans — and reply-to-bot messages should still be scored normally + # so toxic replies to bot warnings aren't silently skipped. + bot_mentioned_in_text = ( + f"<@{self.bot.user.id}>" in (message.content or "") + or f"<@!{self.bot.user.id}>" in (message.content or "") + ) + if bot_mentioned_in_text: + mention_config = config.get("mention_scan", {}) + if mention_config.get("enabled", True): + await self._maybe_start_mention_scan(message, mention_config) return # Skip if empty @@ -166,6 +165,7 @@ class SentimentCog(commands.Cog): history_messages.reverse() # chronological order # Combine: history (context) + buffered (new messages to analyze) + new_message_start = len(history_messages) all_messages = history_messages + messages # Build msg_id_to_author lookup for reply resolution @@ -215,6 +215,7 @@ class SentimentCog(commands.Cog): conversation, channel_context=channel_context, user_notes_map=user_notes_map, + new_message_start=new_message_start, ) if result is None: @@ -233,6 +234,7 @@ class SentimentCog(commands.Cog): conversation, channel_context=channel_context, user_notes_map=user_notes_map, + new_message_start=new_message_start, ) if heavy_result is not None: logger.info( diff --git a/prompts/analysis.txt b/prompts/analysis.txt index bf8d165..e49a7ad 100644 --- a/prompts/analysis.txt +++ b/prompts/analysis.txt @@ -40,8 +40,10 @@ Use the report_analysis tool to report your analysis of the TARGET MESSAGE only. CONVERSATION-LEVEL ANALYSIS (when given a CONVERSATION BLOCK instead of a single TARGET MESSAGE): When you receive a full conversation block with multiple users, use the report_conversation_scan tool instead: -- Provide ONE finding per user (not per message) — aggregate their behavior across the conversation. -- Weight their average tone and worst message equally when determining the toxicity_score. +- The conversation block may contain a "--- NEW MESSAGES (score only these) ---" separator. Messages ABOVE the separator are CONTEXT ONLY (already scored in a prior cycle) — do NOT let them inflate scores. Messages BELOW the separator are the NEW messages to score. +- Provide ONE finding per user who has NEW messages (not per message). +- Score based ONLY on the user's NEW messages. Use context messages to understand tone and relationships, but do NOT penalize a user for something they said in the context section. +- If a user's only new message is benign (e.g. "I got the 17.."), score it low regardless of what they said in context. - Use the same scoring bands (0.0-1.0) as for single messages. - Quote the worst/most problematic snippet in worst_message (max 100 chars, exact quote). - Flag off_topic if user's messages are primarily personal drama, not gaming. diff --git a/utils/llm_client.py b/utils/llm_client.py index 8108cb1..8ad5357 100644 --- a/utils/llm_client.py +++ b/utils/llm_client.py @@ -383,12 +383,16 @@ class LLMClient: def _format_conversation_block( messages: list[tuple[str, str, datetime, str | None]], now: datetime | None = None, + new_message_start: int | None = None, ) -> str: """Format messages as a compact timestamped chat block. Each tuple is (username, content, timestamp, reply_to_username). Consecutive messages from the same user collapse to indented lines. Replies shown as ``username → replied_to:``. + + If *new_message_start* is given, a separator is inserted before that + index so the LLM can distinguish context from new messages. """ if now is None: now = datetime.now(timezone.utc) @@ -396,7 +400,12 @@ class LLMClient: lines = [f"[Current time: {now.strftime('%I:%M %p')}]", ""] last_user = None - for username, content, ts, reply_to in messages: + for idx, (username, content, ts, reply_to) in enumerate(messages): + if new_message_start is not None and idx == new_message_start: + lines.append("") + lines.append("--- NEW MESSAGES (score only these) ---") + lines.append("") + last_user = None # reset collapse so first new msg gets full header delta = now - ts.replace(tzinfo=timezone.utc) if ts.tzinfo is None else now - ts rel = LLMClient._format_relative_time(delta) @@ -425,12 +434,13 @@ class LLMClient: mention_context: str = "", channel_context: str = "", user_notes_map: dict[str, str] | None = None, + new_message_start: int | None = None, ) -> dict | None: """Analyze a conversation block in one call, returning per-user findings.""" if not messages: return None - convo_block = self._format_conversation_block(messages) + convo_block = self._format_conversation_block(messages, new_message_start=new_message_start) user_content = f"=== CONVERSATION BLOCK ===\n{convo_block}\n\n" if user_notes_map: