fix: anonymize usernames before LLM analysis to prevent name-based scoring bias

Display names like "Calm your tits" were causing the LLM to inflate toxicity
scores on completely benign messages. Usernames are now replaced with User1,
User2, etc. before sending to the LLM, then mapped back to real names in the
results.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-25 22:20:53 -05:00
parent cb8ef8542b
commit c63913cf14
2 changed files with 73 additions and 9 deletions

View File

@@ -204,6 +204,55 @@ class SentimentCog(commands.Cog):
notes_map[name] = notes
return notes_map
@staticmethod
def _build_anon_map(
conversation: list[tuple[str, str, datetime, str | None]],
) -> dict[str, str]:
"""Build display_name -> 'User1', 'User2', ... mapping for all participants."""
seen: dict[str, str] = {}
counter = 1
for username, _, _, reply_to in conversation:
if username not in seen:
seen[username] = f"User{counter}"
counter += 1
if reply_to and reply_to not in seen:
seen[reply_to] = f"User{counter}"
counter += 1
return seen
@staticmethod
def _anonymize_conversation(
conversation: list[tuple[str, str, datetime, str | None]],
anon_map: dict[str, str],
) -> list[tuple[str, str, datetime, str | None]]:
"""Replace display names with anonymous keys in conversation tuples."""
return [
(
anon_map.get(username, username),
content,
ts,
anon_map.get(reply_to, reply_to) if reply_to else None,
)
for username, content, ts, reply_to in conversation
]
@staticmethod
def _anonymize_notes(
user_notes_map: dict[str, str],
anon_map: dict[str, str],
) -> dict[str, str]:
"""Replace display name keys with anonymous keys in user notes map."""
return {anon_map.get(name, name): notes for name, notes in user_notes_map.items()}
@staticmethod
def _deanonymize_findings(result: dict, anon_map: dict[str, str]) -> None:
"""Replace anonymous keys back to display names in LLM findings (in-place)."""
reverse_map = {v: k for k, v in anon_map.items()}
for finding in result.get("user_findings", []):
anon_name = finding.get("username", "")
if anon_name in reverse_map:
finding["username"] = reverse_map[anon_name]
@staticmethod
def _build_conversation(
messages: list[discord.Message],
@@ -393,6 +442,11 @@ class SentimentCog(commands.Cog):
user_notes_map = self._build_user_notes_map(messages)
# Anonymize usernames before sending to LLM to prevent name-based bias
anon_map = self._build_anon_map(conversation)
anon_conversation = self._anonymize_conversation(conversation, anon_map)
anon_notes = self._anonymize_notes(user_notes_map, anon_map) if user_notes_map else user_notes_map
channel_context = build_channel_context(ref_message, game_channels)
logger.info(
@@ -403,9 +457,9 @@ class SentimentCog(commands.Cog):
# TRIAGE: Lightweight model — conversation-level analysis
result = await self.bot.llm.analyze_conversation(
conversation,
anon_conversation,
channel_context=channel_context,
user_notes_map=user_notes_map,
user_notes_map=anon_notes,
new_message_start=new_message_start,
)
@@ -422,9 +476,9 @@ class SentimentCog(commands.Cog):
)
if needs_escalation:
heavy_result = await self.bot.llm_heavy.analyze_conversation(
conversation,
anon_conversation,
channel_context=channel_context,
user_notes_map=user_notes_map,
user_notes_map=anon_notes,
new_message_start=new_message_start,
)
if heavy_result is not None:
@@ -434,6 +488,9 @@ class SentimentCog(commands.Cog):
)
result = heavy_result
# De-anonymize findings back to real display names
self._deanonymize_findings(result, anon_map)
user_lookup = self._build_user_lookup(messages)
# Mark all buffered messages as analyzed (for mention scan dedup)
@@ -557,6 +614,11 @@ class SentimentCog(commands.Cog):
conversation = self._build_conversation(raw_messages)
user_notes_map = self._build_user_notes_map(raw_messages)
# Anonymize usernames before sending to LLM
anon_map = self._build_anon_map(conversation)
anon_conversation = self._anonymize_conversation(conversation, anon_map)
anon_notes = self._anonymize_notes(user_notes_map, anon_map) if user_notes_map else user_notes_map
channel_context = build_channel_context(raw_messages[0], game_channels)
mention_context = (
f"A user flagged this conversation and said: \"{mention_text}\"\n"
@@ -565,10 +627,10 @@ class SentimentCog(commands.Cog):
# Single LLM call
result = await self.bot.llm.analyze_conversation(
conversation,
anon_conversation,
mention_context=mention_context,
channel_context=channel_context,
user_notes_map=user_notes_map,
user_notes_map=anon_notes,
)
if result is None:
@@ -576,6 +638,9 @@ class SentimentCog(commands.Cog):
self._mention_scan_results[trigger_message.id] = "Analysis failed."
return
# De-anonymize findings back to real display names
self._deanonymize_findings(result, anon_map)
user_lookup = self._build_user_lookup(raw_messages)
findings: list[str] = []
dry_run = config.get("monitoring", {}).get("dry_run", False)

View File

@@ -2,7 +2,7 @@ You are a Discord chat moderator AI for a gaming server. You will be given a TAR
CRITICAL: Only score the TARGET MESSAGE. The context section contains recent messages from ALL users in the channel (including the target user's own prior messages) — it is ONLY for understanding tone, conversation flow, and escalation patterns. Do NOT score the context messages — they are already being analyzed separately.
CONTEXT — This is a friend group who use crude nicknames and display names. Usernames/display names (the text before the colon in chat lines, e.g. "Calm your tits") are chosen by each user and are NOT part of the message content. NEVER factor a username into the toxicity score — only score the actual message text after the colon. However, you must still flag genuinely aggressive language in message content.
CONTEXT — This is a friend group's gaming server. Usernames in the chat log are anonymized (User1, User2, etc.) and are NOT part of the message content. Only score the actual message text after the colon.
SCORING GUIDE — Be precise with scores:
- 0.0-0.1: Completely harmless. Casual chat, jokes, "lmao", greetings, game talk, nicknames.
@@ -12,11 +12,10 @@ SCORING GUIDE — Be precise with scores:
- 0.8-1.0: Severely toxic. Threats, targeted harassment, telling someone to leave, attacking insecurities, sustained personal attacks.
IMPORTANT RULES:
- Usernames/display names (e.g. "Calm your tits", "tits") = ALWAYS IGNORE. Score 0.0 for the username itself. Only score the message content.
- Nicknames and in-group shorthand used in message content (e.g. shortened names friends call each other) are NOT toxic. Only score actual hostile intent.
- Profanity ALONE (just "fuck" or "shit" with no target) = low score (0.0-0.1).
- Profanity DIRECTED AT someone ("fuck you", "you piece of shit") = moderate-to-high score (0.5-0.7) even among friends.
- Do NOT let friendly context excuse clearly aggressive language. Friends can still cross lines.
- If a message contains BOTH a nickname AND an insult ("fuck you tits you piece of shit"), score the insult, not the nickname.
- If the target message is just "lmao", "lol", an emoji, or a short neutral reaction, it is ALWAYS 0.0 regardless of what other people said before it.
- If a user is QUOTING or REPORTING what someone else said (e.g. "you called them X", "he said Y to her"), score based on the user's own intent, NOT the quoted words. Tattling, reporting, or referencing someone else's language is not the same as using that language aggressively. These should score 0.0-0.2 unless the user is clearly weaponizing the quote to attack someone.
- Sexually crude or vulgar remarks DIRECTED AT someone (e.g. "you watch that to cum", "bet you get off to that") = 0.5-0.7 and category "sexual_vulgar". Adding "lol" or "lmao" does NOT soften sexual content aimed at a person — it's still degrading. General sexual jokes not targeting anyone specific can score lower (0.2-0.3).