fix: anonymize usernames before LLM analysis to prevent name-based scoring bias

Display names like "Calm your tits" were causing the LLM to inflate toxicity scores on completely benign messages. Usernames are now replaced with User1, User2, etc. before the conversation is sent to the LLM, then mapped back to the real display names in the results.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-25 22:20:53 -05:00
parent cb8ef8542b
commit c63913cf14
2 changed files with 73 additions and 9 deletions
@@ -204,6 +204,55 @@ class SentimentCog(commands.Cog):
                    notes_map[name] = notes
        return notes_map

+    @staticmethod
+    def _build_anon_map(
+        conversation: list[tuple[str, str, datetime, str | None]],
+    ) -> dict[str, str]:
+        """Build display_name -> 'User1', 'User2', ... mapping for all participants."""
+        seen: dict[str, str] = {}
+        counter = 1
+        for username, _, _, reply_to in conversation:
+            if username not in seen:
+                seen[username] = f"User{counter}"
+                counter += 1
+            if reply_to and reply_to not in seen:
+                seen[reply_to] = f"User{counter}"
+                counter += 1
+        return seen
+
+    @staticmethod
+    def _anonymize_conversation(
+        conversation: list[tuple[str, str, datetime, str | None]],
+        anon_map: dict[str, str],
+    ) -> list[tuple[str, str, datetime, str | None]]:
+        """Replace display names with anonymous keys in conversation tuples."""
+        return [
+            (
+                anon_map.get(username, username),
+                content,
+                ts,
+                anon_map.get(reply_to, reply_to) if reply_to else None,
+            )
+            for username, content, ts, reply_to in conversation
+        ]
+
+    @staticmethod
+    def _anonymize_notes(
+        user_notes_map: dict[str, str],
+        anon_map: dict[str, str],
+    ) -> dict[str, str]:
+        """Replace display name keys with anonymous keys in user notes map."""
+        return {anon_map.get(name, name): notes for name, notes in user_notes_map.items()}
+
+    @staticmethod
+    def _deanonymize_findings(result: dict, anon_map: dict[str, str]) -> None:
+        """Replace anonymous keys back to display names in LLM findings (in-place)."""
+        reverse_map = {v: k for k, v in anon_map.items()}
+        for finding in result.get("user_findings", []):
+            anon_name = finding.get("username", "")
+            if anon_name in reverse_map:
+                finding["username"] = reverse_map[anon_name]
+
    @staticmethod
    def _build_conversation(
        messages: list[discord.Message],
@@ -393,6 +442,11 @@ class SentimentCog(commands.Cog):

        user_notes_map = self._build_user_notes_map(messages)

+        # Anonymize usernames before sending to LLM to prevent name-based bias
+        anon_map = self._build_anon_map(conversation)
+        anon_conversation = self._anonymize_conversation(conversation, anon_map)
+        anon_notes = self._anonymize_notes(user_notes_map, anon_map) if user_notes_map else user_notes_map
+
        channel_context = build_channel_context(ref_message, game_channels)

        logger.info(
@@ -403,9 +457,9 @@ class SentimentCog(commands.Cog):

        # TRIAGE: Lightweight model — conversation-level analysis
        result = await self.bot.llm.analyze_conversation(
-            conversation,
+            anon_conversation,
            channel_context=channel_context,
-            user_notes_map=user_notes_map,
+            user_notes_map=anon_notes,
            new_message_start=new_message_start,
        )

@@ -422,9 +476,9 @@ class SentimentCog(commands.Cog):
        )
        if needs_escalation:
            heavy_result = await self.bot.llm_heavy.analyze_conversation(
-                conversation,
+                anon_conversation,
                channel_context=channel_context,
-                user_notes_map=user_notes_map,
+                user_notes_map=anon_notes,
                new_message_start=new_message_start,
            )
            if heavy_result is not None:
@@ -434,6 +488,9 @@ class SentimentCog(commands.Cog):
                )
                result = heavy_result

+        # De-anonymize findings back to real display names
+        self._deanonymize_findings(result, anon_map)
+
        user_lookup = self._build_user_lookup(messages)

        # Mark all buffered messages as analyzed (for mention scan dedup)
@@ -557,6 +614,11 @@ class SentimentCog(commands.Cog):
        conversation = self._build_conversation(raw_messages)
        user_notes_map = self._build_user_notes_map(raw_messages)

+        # Anonymize usernames before sending to LLM
+        anon_map = self._build_anon_map(conversation)
+        anon_conversation = self._anonymize_conversation(conversation, anon_map)
+        anon_notes = self._anonymize_notes(user_notes_map, anon_map) if user_notes_map else user_notes_map
+
        channel_context = build_channel_context(raw_messages[0], game_channels)
        mention_context = (
            f"A user flagged this conversation and said: \"{mention_text}\"\n"
@@ -565,10 +627,10 @@ class SentimentCog(commands.Cog):

        # Single LLM call
        result = await self.bot.llm.analyze_conversation(
-            conversation,
+            anon_conversation,
            mention_context=mention_context,
            channel_context=channel_context,
-            user_notes_map=user_notes_map,
+            user_notes_map=anon_notes,
        )

        if result is None:
@@ -576,6 +638,9 @@ class SentimentCog(commands.Cog):
            self._mention_scan_results[trigger_message.id] = "Analysis failed."
            return

+        # De-anonymize findings back to real display names
+        self._deanonymize_findings(result, anon_map)
+
        user_lookup = self._build_user_lookup(raw_messages)
        findings: list[str] = []
        dry_run = config.get("monitoring", {}).get("dry_run", False)
@@ -2,7 +2,7 @@ You are a Discord chat moderator AI for a gaming server. You will be given a TAR

 CRITICAL: Only score the TARGET MESSAGE. The context section contains recent messages from ALL users in the channel (including the target user's own prior messages) — it is ONLY for understanding tone, conversation flow, and escalation patterns. Do NOT score the context messages — they are already being analyzed separately.

-CONTEXT — This is a friend group who use crude nicknames and display names. Usernames/display names (the text before the colon in chat lines, e.g. "Calm your tits") are chosen by each user and are NOT part of the message content. NEVER factor a username into the toxicity score — only score the actual message text after the colon. However, you must still flag genuinely aggressive language in message content.
+CONTEXT — This is a friend group's gaming server. Usernames in the chat log are anonymized (User1, User2, etc.) and are NOT part of the message content. Only score the actual message text after the colon.

 SCORING GUIDE — Be precise with scores:
 - 0.0-0.1: Completely harmless. Casual chat, jokes, "lmao", greetings, game talk, nicknames.
@@ -12,11 +12,10 @@ SCORING GUIDE — Be precise with scores:
 - 0.8-1.0: Severely toxic. Threats, targeted harassment, telling someone to leave, attacking insecurities, sustained personal attacks.

 IMPORTANT RULES:
- Usernames/display names (e.g. "Calm your tits", "tits") = ALWAYS IGNORE. Score 0.0 for the username itself. Only score the message content.
+- Nicknames and in-group shorthand used in message content (e.g. shortened names friends call each other) are NOT toxic. Only score actual hostile intent.
 - Profanity ALONE (just "fuck" or "shit" with no target) = low score (0.0-0.1).
 - Profanity DIRECTED AT someone ("fuck you", "you piece of shit") = moderate-to-high score (0.5-0.7) even among friends.
 - Do NOT let friendly context excuse clearly aggressive language. Friends can still cross lines.
- If a message contains BOTH a nickname AND an insult ("fuck you tits you piece of shit"), score the insult, not the nickname.
 - If the target message is just "lmao", "lol", an emoji, or a short neutral reaction, it is ALWAYS 0.0 regardless of what other people said before it.
 - If a user is QUOTING or REPORTING what someone else said (e.g. "you called them X", "he said Y to her"), score based on the user's own intent, NOT the quoted words. Tattling, reporting, or referencing someone else's language is not the same as using that language aggressively. These should score 0.0-0.2 unless the user is clearly weaponizing the quote to attack someone.
 - Sexually crude or vulgar remarks DIRECTED AT someone (e.g. "you watch that to cum", "bet you get off to that") = 0.5-0.7 and category "sexual_vulgar". Adding "lol" or "lmao" does NOT soften sexual content aimed at a person — it's still degrading. General sexual jokes not targeting anyone specific can score lower (0.2-0.3).