fix: sanitize note_updates before storing in sentiment pipeline

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
fix: sanitize profile updates before storing in chat memory pipeline
2026-03-02 22:04:00 -05:00 · 2026-03-02 22:03:59 -05:00 · 2026-03-02 22:03:59 -05:00 · 2026-03-02 22:03:59 -05:00
4 changed files with 54 additions and 6 deletions
@@ -171,6 +171,8 @@ class ChatCog(commands.Cog):
            # Update profile if warranted
            profile_update = result.get("profile_update")
            if profile_update:
                # Sanitize before storing — strips any quoted toxic language
                profile_update = await self.bot.llm.sanitize_notes(profile_update)
                self.bot.drama_tracker.set_user_profile(user_id, profile_update)
                self._dirty_users.add(user_id)
@@ -469,13 +469,14 @@ class SentimentCog(commands.Cog):
        # Note update — route to memory system
        if note_update:
-            # Still update the legacy notes for backward compat with analysis prompt
+            # Sanitize before storing — strips any quoted toxic language
-            self.bot.drama_tracker.update_user_notes(user_id, note_update)
+            sanitized = await self.bot.llm.sanitize_notes(note_update)
            self.bot.drama_tracker.update_user_notes(user_id, sanitized)
            self._dirty_users.add(user_id)
            # Also save as an expiring memory (7d default for passive observations)
            asyncio.create_task(self.bot.db.save_memory(
                user_id=user_id,
-                memory=note_update[:500],
+                memory=sanitized[:500],
                topics=db_topic_category or "general",
                importance="medium",
                expires_at=datetime.now(timezone.utc) + timedelta(days=7),
@@ -26,7 +26,7 @@ TOPIC: Flag off_topic if the message is personal drama (relationship issues, feu
 GAME DETECTION: If CHANNEL INFO is provided, set detected_game to the matching channel name from that list, or null if unsure/not game-specific.
-USER NOTES: If provided, use to calibrate (e.g. if notes say "uses heavy profanity casually", profanity alone should score lower). Add a note_update only for genuinely new behavioral observations; null otherwise.
+USER NOTES: If provided, use to calibrate (e.g. if notes say "uses heavy profanity casually", profanity alone should score lower). Add a note_update only for genuinely new behavioral observations; null otherwise. NEVER quote or repeat toxic/offensive language in note_update — describe patterns abstractly (e.g. "directed a personal insult at another user", NOT "called someone a [slur]").
 RULE ENFORCEMENT: If SERVER RULES are provided, report clearly violated rule numbers in violated_rules. Only flag clear violations, not borderline.
@@ -86,7 +86,7 @@ ANALYSIS_TOOL = {
                },
                "note_update": {
                    "type": ["string", "null"],
-                    "description": "Brief new observation about this user's style/behavior for future reference, or null if nothing new.",
+                    "description": "Brief new observation about this user's style/behavior for future reference, or null if nothing new. NEVER quote toxic language — describe patterns abstractly (e.g. 'uses personal insults when frustrated').",
                },
                "detected_game": {
                    "type": ["string", "null"],
@@ -189,7 +189,7 @@ CONVERSATION_TOOL = {
                            },
                            "note_update": {
                                "type": ["string", "null"],
-                                "description": "New observation about this user's pattern, or null.",
+                                "description": "New observation about this user's pattern, or null. NEVER quote toxic language — describe patterns abstractly.",
                            },
                            "detected_game": {
                                "type": ["string", "null"],
@@ -977,6 +977,51 @@ class LLMClient:
            "profile_update": profile_update,
        }
    async def sanitize_notes(self, notes: str) -> str:
        """Ask the model to rewrite behavior notes without quoted toxic text.

        The notes are sent as the user message alongside a fixed system
        prompt that requests abstract descriptions in place of quoted
        offensive language.

        Args:
            notes: The raw notes text to clean up.

        Returns:
            The model's rewrite with surrounding whitespace stripped. The
            input is returned unchanged when it is empty or whitespace-only,
            when the model replies with empty content, or when the request
            raises any exception (best-effort: failures are logged, never
            propagated).
        """
        # Nothing to rewrite: hand back falsy / whitespace-only input as-is.
        if not notes or not notes.strip():
            return notes

        instructions = (
            "Rewrite the following user behavior notes. Remove any quoted offensive language, "
            "slurs, or profanity. Replace toxic quotes with abstract descriptions of the behavior "
            "(e.g. 'directed a personal insult at another user' instead of quoting the insult). "
            "Preserve all non-toxic observations, timestamps, and behavioral patterns exactly. "
            "Return ONLY the rewritten notes, nothing else."
        )

        payload = notes
        if self._no_think:
            payload += "\n/no_think"

        # The clock starts before the semaphore is acquired, so the logged
        # duration includes any time spent waiting for a free slot.
        started = time.monotonic()

        def elapsed_ms() -> int:
            return int((time.monotonic() - started) * 1000)

        async with self._semaphore:
            try:
                # Only pass a temperature when the client is flagged as supporting it.
                sampling = {"temperature": 0.1} if self._supports_temperature else {}
                completion = await self._client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {"role": "system", "content": instructions},
                        {"role": "user", "content": payload},
                    ],
                    **sampling,
                    max_completion_tokens=1024,
                )
                duration = elapsed_ms()
                rewritten = completion.choices[0].message.content

                # Empty or blank reply: record the failure and keep the original.
                if not rewritten or not rewritten.strip():
                    self._log_llm("sanitize_notes", duration, False, notes[:300], error="Empty response")
                    return notes

                self._log_llm("sanitize_notes", duration, True, notes[:300], rewritten[:300])
                return rewritten.strip()
            except Exception as exc:
                # Best-effort: any failure falls back to the unmodified notes.
                duration = elapsed_ms()
                logger.error("LLM sanitize_notes error: %s", exc)
                self._log_llm("sanitize_notes", duration, False, notes[:300], error=str(exc))
                return notes
    async def analyze_image(
        self,
        image_bytes: bytes,
Author	SHA1	Message	Date
aj	53803d920f	fix: sanitize note_updates before storing in sentiment pipeline Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-02 22:04:00 -05:00
aj	b7076dffe2	fix: sanitize profile updates before storing in chat memory pipeline Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-02 22:03:59 -05:00
aj	c5316b98d1	feat: add sanitize_notes() method to LLMClient Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-02 22:03:59 -05:00
aj	f75a3ca3f4	fix: instruct LLM to never quote toxic content in note_updates Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-03-02 22:03:59 -05:00