fix: anonymize usernames before LLM analysis to prevent name-based scoring bias

Display names like "Calm your tits" were causing the LLM to inflate toxicity scores on completely benign messages. Usernames are now replaced with User1, User2, etc. before being sent to the LLM, then mapped back to the real display names in the results.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-25 22:20:53 -05:00
parent cb8ef8542b
commit c63913cf14
2 changed files with 73 additions and 9 deletions
@@ -204,6 +204,55 @@ class SentimentCog(commands.Cog):
                    notes_map[name] = notes
        return notes_map

+    @staticmethod
+    def _build_anon_map(
+        conversation: list[tuple[str, str, datetime, str | None]],
+    ) -> dict[str, str]:
+        """Build display_name -> 'User1', 'User2', ... mapping for all participants."""
+        seen: dict[str, str] = {}
+        counter = 1
+        for username, _, _, reply_to in conversation:
+            if username not in seen:
+                seen[username] = f"User{counter}"
+                counter += 1
+            if reply_to and reply_to not in seen:
+                seen[reply_to] = f"User{counter}"
+                counter += 1
+        return seen
+
+    @staticmethod
+    def _anonymize_conversation(
+        conversation: list[tuple[str, str, datetime, str | None]],
+        anon_map: dict[str, str],
+    ) -> list[tuple[str, str, datetime, str | None]]:
+        """Replace display names with anonymous keys in conversation tuples."""
+        return [
+            (
+                anon_map.get(username, username),
+                content,
+                ts,
+                anon_map.get(reply_to, reply_to) if reply_to else None,
+            )
+            for username, content, ts, reply_to in conversation
+        ]
+
+    @staticmethod
+    def _anonymize_notes(
+        user_notes_map: dict[str, str],
+        anon_map: dict[str, str],
+    ) -> dict[str, str]:
+        """Replace display name keys with anonymous keys in user notes map."""
+        return {anon_map.get(name, name): notes for name, notes in user_notes_map.items()}
+
+    @staticmethod
+    def _deanonymize_findings(result: dict, anon_map: dict[str, str]) -> None:
+        """Replace anonymous keys back to display names in LLM findings (in-place)."""
+        reverse_map = {v: k for k, v in anon_map.items()}
+        for finding in result.get("user_findings", []):
+            anon_name = finding.get("username", "")
+            if anon_name in reverse_map:
+                finding["username"] = reverse_map[anon_name]
+
    @staticmethod
    def _build_conversation(
        messages: list[discord.Message],
@@ -393,6 +442,11 @@ class SentimentCog(commands.Cog):

        user_notes_map = self._build_user_notes_map(messages)

+        # Anonymize usernames before sending to LLM to prevent name-based bias
+        anon_map = self._build_anon_map(conversation)
+        anon_conversation = self._anonymize_conversation(conversation, anon_map)
+        anon_notes = self._anonymize_notes(user_notes_map, anon_map) if user_notes_map else user_notes_map
+
        channel_context = build_channel_context(ref_message, game_channels)

        logger.info(
@@ -403,9 +457,9 @@ class SentimentCog(commands.Cog):

        # TRIAGE: Lightweight model — conversation-level analysis
        result = await self.bot.llm.analyze_conversation(
-            conversation,
+            anon_conversation,
            channel_context=channel_context,
-            user_notes_map=user_notes_map,
+            user_notes_map=anon_notes,
            new_message_start=new_message_start,
        )

@@ -422,9 +476,9 @@ class SentimentCog(commands.Cog):
        )
        if needs_escalation:
            heavy_result = await self.bot.llm_heavy.analyze_conversation(
-                conversation,
+                anon_conversation,
                channel_context=channel_context,
-                user_notes_map=user_notes_map,
+                user_notes_map=anon_notes,
                new_message_start=new_message_start,
            )
            if heavy_result is not None:
@@ -434,6 +488,9 @@ class SentimentCog(commands.Cog):
                )
                result = heavy_result

+        # De-anonymize findings back to real display names
+        self._deanonymize_findings(result, anon_map)
+
        user_lookup = self._build_user_lookup(messages)

        # Mark all buffered messages as analyzed (for mention scan dedup)
@@ -557,6 +614,11 @@ class SentimentCog(commands.Cog):
        conversation = self._build_conversation(raw_messages)
        user_notes_map = self._build_user_notes_map(raw_messages)

+        # Anonymize usernames before sending to LLM
+        anon_map = self._build_anon_map(conversation)
+        anon_conversation = self._anonymize_conversation(conversation, anon_map)
+        anon_notes = self._anonymize_notes(user_notes_map, anon_map) if user_notes_map else user_notes_map
+
        channel_context = build_channel_context(raw_messages[0], game_channels)
        mention_context = (
            f"A user flagged this conversation and said: \"{mention_text}\"\n"
@@ -565,10 +627,10 @@ class SentimentCog(commands.Cog):

        # Single LLM call
        result = await self.bot.llm.analyze_conversation(
-            conversation,
+            anon_conversation,
            mention_context=mention_context,
            channel_context=channel_context,
-            user_notes_map=user_notes_map,
+            user_notes_map=anon_notes,
        )

        if result is None:
@@ -576,6 +638,9 @@ class SentimentCog(commands.Cog):
            self._mention_scan_results[trigger_message.id] = "Analysis failed."
            return

+        # De-anonymize findings back to real display names
+        self._deanonymize_findings(result, anon_map)
+
        user_lookup = self._build_user_lookup(raw_messages)
        findings: list[str] = []
        dry_run = config.get("monitoring", {}).get("dry_run", False)