fix: anonymize usernames before LLM analysis to prevent name-based scoring bias
Display names like "Calm your tits" were causing the LLM to inflate toxicity scores on completely benign messages. Usernames are now replaced with User1, User2, etc. before being sent to the LLM, and are mapped back to the real display names in the results.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -204,6 +204,55 @@ class SentimentCog(commands.Cog):
|
||||
notes_map[name] = notes
|
||||
return notes_map
|
||||
|
||||
@staticmethod
|
||||
def _build_anon_map(
|
||||
conversation: list[tuple[str, str, datetime, str | None]],
|
||||
) -> dict[str, str]:
|
||||
"""Build display_name -> 'User1', 'User2', ... mapping for all participants."""
|
||||
seen: dict[str, str] = {}
|
||||
counter = 1
|
||||
for username, _, _, reply_to in conversation:
|
||||
if username not in seen:
|
||||
seen[username] = f"User{counter}"
|
||||
counter += 1
|
||||
if reply_to and reply_to not in seen:
|
||||
seen[reply_to] = f"User{counter}"
|
||||
counter += 1
|
||||
return seen
|
||||
|
||||
@staticmethod
|
||||
def _anonymize_conversation(
|
||||
conversation: list[tuple[str, str, datetime, str | None]],
|
||||
anon_map: dict[str, str],
|
||||
) -> list[tuple[str, str, datetime, str | None]]:
|
||||
"""Replace display names with anonymous keys in conversation tuples."""
|
||||
return [
|
||||
(
|
||||
anon_map.get(username, username),
|
||||
content,
|
||||
ts,
|
||||
anon_map.get(reply_to, reply_to) if reply_to else None,
|
||||
)
|
||||
for username, content, ts, reply_to in conversation
|
||||
]
|
||||
|
||||
@staticmethod
def _anonymize_notes(
    user_notes_map: dict[str, str],
    anon_map: dict[str, str],
) -> dict[str, str]:
    """Re-key the user notes map from real names to anonymous placeholders.

    Args:
        user_notes_map: Notes keyed by the original name.
        anon_map: Original name -> placeholder mapping.

    Returns:
        A new dict keyed by placeholder. Entries whose name has no
        placeholder in ``anon_map`` are omitted.
    """
    # Never fall back to the raw name: forwarding it would put a real
    # display name in front of the LLM (the exact bias this anonymization
    # exists to remove), and a raw name that happens to look like a
    # placeholder (e.g. "User1") could overwrite another user's notes.
    return {
        anon_map[name]: notes
        for name, notes in user_notes_map.items()
        if name in anon_map
    }
|
||||
|
||||
@staticmethod
def _deanonymize_findings(result: dict, anon_map: dict[str, str]) -> None:
    """Map placeholder usernames in the findings back to real names, in place.

    Each dict in ``result["user_findings"]`` whose ``"username"`` is one of
    the placeholders in ``anon_map`` has that field replaced by the original
    name. Usernames that are not known placeholders are left unchanged.

    The findings come from model output, so the structure is not trusted:
    a non-dict ``result``, a missing or non-list ``"user_findings"`` value,
    non-dict entries, and non-string usernames are all skipped instead of
    raising.

    Args:
        result: Analysis result; mutated in place.
        anon_map: Original name -> placeholder mapping used for anonymizing.
    """
    if not isinstance(result, dict):
        return
    findings = result.get("user_findings")
    if not isinstance(findings, (list, tuple)):
        # Covers both a missing key and an explicit null/None value.
        return

    real_names = {alias: name for name, alias in anon_map.items()}
    for finding in findings:
        if not isinstance(finding, dict):
            continue
        alias = finding.get("username")
        if not isinstance(alias, str):
            # Unhashable or null usernames cannot be looked up; leave as-is.
            continue
        # Tolerate stray surrounding whitespace around the placeholder.
        key = alias.strip()
        if key in real_names:
            finding["username"] = real_names[key]
|
||||
|
||||
@staticmethod
|
||||
def _build_conversation(
|
||||
messages: list[discord.Message],
|
||||
@@ -393,6 +442,11 @@ class SentimentCog(commands.Cog):
|
||||
|
||||
user_notes_map = self._build_user_notes_map(messages)
|
||||
|
||||
# Anonymize usernames before sending to LLM to prevent name-based bias
|
||||
anon_map = self._build_anon_map(conversation)
|
||||
anon_conversation = self._anonymize_conversation(conversation, anon_map)
|
||||
anon_notes = self._anonymize_notes(user_notes_map, anon_map) if user_notes_map else user_notes_map
|
||||
|
||||
channel_context = build_channel_context(ref_message, game_channels)
|
||||
|
||||
logger.info(
|
||||
@@ -403,9 +457,9 @@ class SentimentCog(commands.Cog):
|
||||
|
||||
# TRIAGE: Lightweight model — conversation-level analysis
|
||||
result = await self.bot.llm.analyze_conversation(
|
||||
conversation,
|
||||
anon_conversation,
|
||||
channel_context=channel_context,
|
||||
user_notes_map=user_notes_map,
|
||||
user_notes_map=anon_notes,
|
||||
new_message_start=new_message_start,
|
||||
)
|
||||
|
||||
@@ -422,9 +476,9 @@ class SentimentCog(commands.Cog):
|
||||
)
|
||||
if needs_escalation:
|
||||
heavy_result = await self.bot.llm_heavy.analyze_conversation(
|
||||
conversation,
|
||||
anon_conversation,
|
||||
channel_context=channel_context,
|
||||
user_notes_map=user_notes_map,
|
||||
user_notes_map=anon_notes,
|
||||
new_message_start=new_message_start,
|
||||
)
|
||||
if heavy_result is not None:
|
||||
@@ -434,6 +488,9 @@ class SentimentCog(commands.Cog):
|
||||
)
|
||||
result = heavy_result
|
||||
|
||||
# De-anonymize findings back to real display names
|
||||
self._deanonymize_findings(result, anon_map)
|
||||
|
||||
user_lookup = self._build_user_lookup(messages)
|
||||
|
||||
# Mark all buffered messages as analyzed (for mention scan dedup)
|
||||
@@ -557,6 +614,11 @@ class SentimentCog(commands.Cog):
|
||||
conversation = self._build_conversation(raw_messages)
|
||||
user_notes_map = self._build_user_notes_map(raw_messages)
|
||||
|
||||
# Anonymize usernames before sending to LLM
|
||||
anon_map = self._build_anon_map(conversation)
|
||||
anon_conversation = self._anonymize_conversation(conversation, anon_map)
|
||||
anon_notes = self._anonymize_notes(user_notes_map, anon_map) if user_notes_map else user_notes_map
|
||||
|
||||
channel_context = build_channel_context(raw_messages[0], game_channels)
|
||||
mention_context = (
|
||||
f"A user flagged this conversation and said: \"{mention_text}\"\n"
|
||||
@@ -565,10 +627,10 @@ class SentimentCog(commands.Cog):
|
||||
|
||||
# Single LLM call
|
||||
result = await self.bot.llm.analyze_conversation(
|
||||
conversation,
|
||||
anon_conversation,
|
||||
mention_context=mention_context,
|
||||
channel_context=channel_context,
|
||||
user_notes_map=user_notes_map,
|
||||
user_notes_map=anon_notes,
|
||||
)
|
||||
|
||||
if result is None:
|
||||
@@ -576,6 +638,9 @@ class SentimentCog(commands.Cog):
|
||||
self._mention_scan_results[trigger_message.id] = "Analysis failed."
|
||||
return
|
||||
|
||||
# De-anonymize findings back to real display names
|
||||
self._deanonymize_findings(result, anon_map)
|
||||
|
||||
user_lookup = self._build_user_lookup(raw_messages)
|
||||
findings: list[str] = []
|
||||
dry_run = config.get("monitoring", {}).get("dry_run", False)
|
||||
|
||||
Reference in New Issue
Block a user