From f75a3ca3f4043e4d5d3eb4dc24b086ad529283bf Mon Sep 17 00:00:00 2001 From: AJ Isaacs Date: Mon, 2 Mar 2026 21:58:04 -0500 Subject: [PATCH] fix: instruct LLM to never quote toxic content in note_updates Co-Authored-By: Claude Opus 4.6 --- prompts/analysis.txt | 2 +- utils/llm_client.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/prompts/analysis.txt b/prompts/analysis.txt index 97ad06f..21a78f0 100644 --- a/prompts/analysis.txt +++ b/prompts/analysis.txt @@ -26,7 +26,7 @@ TOPIC: Flag off_topic if the message is personal drama (relationship issues, feu GAME DETECTION: If CHANNEL INFO is provided, set detected_game to the matching channel name from that list, or null if unsure/not game-specific. -USER NOTES: If provided, use to calibrate (e.g. if notes say "uses heavy profanity casually", profanity alone should score lower). Add a note_update only for genuinely new behavioral observations; null otherwise. +USER NOTES: If provided, use to calibrate (e.g. if notes say "uses heavy profanity casually", profanity alone should score lower). Add a note_update only for genuinely new behavioral observations; null otherwise. NEVER quote or repeat toxic/offensive language in note_update — describe patterns abstractly (e.g. "directed a personal insult at another user", NOT "called someone a [slur]"). RULE ENFORCEMENT: If SERVER RULES are provided, report clearly violated rule numbers in violated_rules. Only flag clear violations, not borderline. diff --git a/utils/llm_client.py b/utils/llm_client.py index c975b21..f67898a 100644 --- a/utils/llm_client.py +++ b/utils/llm_client.py @@ -86,7 +86,7 @@ ANALYSIS_TOOL = { }, "note_update": { "type": ["string", "null"], - "description": "Brief new observation about this user's style/behavior for future reference, or null if nothing new.", + "description": "Brief new observation about this user's style/behavior for future reference, or null if nothing new. NEVER quote toxic language — describe patterns abstractly (e.g. 'uses personal insults when frustrated').", }, "detected_game": { "type": ["string", "null"], @@ -189,7 +189,7 @@ CONVERSATION_TOOL = { }, "note_update": { "type": ["string", "null"], - "description": "New observation about this user's pattern, or null.", + "description": "New observation about this user's pattern, or null. NEVER quote toxic language — describe patterns abstractly.", }, "detected_game": { "type": ["string", "null"],