fix: anonymize usernames before LLM analysis to prevent name-based scoring bias

Display names like "Calm your tits" were causing the LLM to inflate toxicity
scores on completely benign messages. Usernames are now replaced with User1,
User2, etc. before sending to the LLM, then mapped back to real names in the
results.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-25 22:20:53 -05:00
parent cb8ef8542b
commit c63913cf14
2 changed files with 73 additions and 9 deletions

View File

@@ -204,6 +204,55 @@ class SentimentCog(commands.Cog):
notes_map[name] = notes
return notes_map
@staticmethod
def _build_anon_map(
conversation: list[tuple[str, str, datetime, str | None]],
) -> dict[str, str]:
"""Build display_name -> 'User1', 'User2', ... mapping for all participants."""
seen: dict[str, str] = {}
counter = 1
for username, _, _, reply_to in conversation:
if username not in seen:
seen[username] = f"User{counter}"
counter += 1
if reply_to and reply_to not in seen:
seen[reply_to] = f"User{counter}"
counter += 1
return seen
@staticmethod
def _anonymize_conversation(
conversation: list[tuple[str, str, datetime, str | None]],
anon_map: dict[str, str],
) -> list[tuple[str, str, datetime, str | None]]:
"""Replace display names with anonymous keys in conversation tuples."""
return [
(
anon_map.get(username, username),
content,
ts,
anon_map.get(reply_to, reply_to) if reply_to else None,
)
for username, content, ts, reply_to in conversation
]
@staticmethod
def _anonymize_notes(
    user_notes_map: dict[str, str],
    anon_map: dict[str, str],
) -> dict[str, str]:
    """Re-key the user notes map by placeholder name instead of display name.

    Args:
        user_notes_map: Mapping of display name -> notes.
        anon_map: Mapping of display name -> placeholder.

    Returns:
        A new dict with the same notes values; names missing from
        ``anon_map`` keep their original key.
    """
    rekeyed: dict[str, str] = {}
    for real_name, note_text in user_notes_map.items():
        placeholder = anon_map.get(real_name, real_name)
        rekeyed[placeholder] = note_text
    return rekeyed
@staticmethod
def _deanonymize_findings(result: dict, anon_map: dict[str, str]) -> None:
    """Replace anonymous keys back to display names in LLM findings (in-place).

    Only the ``"username"`` field of each entry under ``"user_findings"`` is
    rewritten. Because ``result`` originates from model output, malformed
    shapes are tolerated rather than raised on: a missing or null
    ``"user_findings"``, entries that are not dicts, and usernames that are
    not strings are all skipped.

    Args:
        result: Analysis result dict; mutated in place.
        anon_map: Mapping of display name -> placeholder that was used to
            anonymize the request.

    Returns:
        None. Usernames that match no placeholder are left untouched.
    """
    real_names = {alias: name for name, alias in anon_map.items()}

    # `.get(key, [])` would still return None for an explicit JSON null.
    findings = result.get("user_findings")
    if not isinstance(findings, list):
        return

    for finding in findings:
        if not isinstance(finding, dict):
            continue
        alias = finding.get("username")
        if not isinstance(alias, str):
            # Also guards the dict lookup below against unhashable values.
            continue
        # Models sometimes echo the placeholder with stray whitespace.
        real_name = real_names.get(alias.strip())
        if real_name is not None:
            finding["username"] = real_name
@staticmethod
def _build_conversation(
messages: list[discord.Message],
@@ -393,6 +442,11 @@ class SentimentCog(commands.Cog):
user_notes_map = self._build_user_notes_map(messages)
# Anonymize usernames before sending to LLM to prevent name-based bias
anon_map = self._build_anon_map(conversation)
anon_conversation = self._anonymize_conversation(conversation, anon_map)
anon_notes = self._anonymize_notes(user_notes_map, anon_map) if user_notes_map else user_notes_map
channel_context = build_channel_context(ref_message, game_channels)
logger.info(
@@ -403,9 +457,9 @@ class SentimentCog(commands.Cog):
# TRIAGE: Lightweight model — conversation-level analysis
result = await self.bot.llm.analyze_conversation(
conversation,
anon_conversation,
channel_context=channel_context,
user_notes_map=user_notes_map,
user_notes_map=anon_notes,
new_message_start=new_message_start,
)
@@ -422,9 +476,9 @@ class SentimentCog(commands.Cog):
)
if needs_escalation:
heavy_result = await self.bot.llm_heavy.analyze_conversation(
conversation,
anon_conversation,
channel_context=channel_context,
user_notes_map=user_notes_map,
user_notes_map=anon_notes,
new_message_start=new_message_start,
)
if heavy_result is not None:
@@ -434,6 +488,9 @@ class SentimentCog(commands.Cog):
)
result = heavy_result
# De-anonymize findings back to real display names
self._deanonymize_findings(result, anon_map)
user_lookup = self._build_user_lookup(messages)
# Mark all buffered messages as analyzed (for mention scan dedup)
@@ -557,6 +614,11 @@ class SentimentCog(commands.Cog):
conversation = self._build_conversation(raw_messages)
user_notes_map = self._build_user_notes_map(raw_messages)
# Anonymize usernames before sending to LLM
anon_map = self._build_anon_map(conversation)
anon_conversation = self._anonymize_conversation(conversation, anon_map)
anon_notes = self._anonymize_notes(user_notes_map, anon_map) if user_notes_map else user_notes_map
channel_context = build_channel_context(raw_messages[0], game_channels)
mention_context = (
f"A user flagged this conversation and said: \"{mention_text}\"\n"
@@ -565,10 +627,10 @@ class SentimentCog(commands.Cog):
# Single LLM call
result = await self.bot.llm.analyze_conversation(
conversation,
anon_conversation,
mention_context=mention_context,
channel_context=channel_context,
user_notes_map=user_notes_map,
user_notes_map=anon_notes,
)
if result is None:
@@ -576,6 +638,9 @@ class SentimentCog(commands.Cog):
self._mention_scan_results[trigger_message.id] = "Analysis failed."
return
# De-anonymize findings back to real display names
self._deanonymize_findings(result, anon_map)
user_lookup = self._build_user_lookup(raw_messages)
findings: list[str] = []
dry_run = config.get("monitoring", {}).get("dry_run", False)