Compare commits

4 Commits

Author  SHA1        Message / Date

aj      53803d920f  fix: sanitize note_updates before storing in sentiment pipeline
                    Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
                    2026-03-02 22:04:00 -05:00

aj      b7076dffe2  fix: sanitize profile updates before storing in chat memory pipeline
                    Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
                    2026-03-02 22:03:59 -05:00

aj      c5316b98d1  feat: add sanitize_notes() method to LLMClient
                    Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
                    2026-03-02 22:03:59 -05:00

aj      f75a3ca3f4  fix: instruct LLM to never quote toxic content in note_updates
                    Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
                    2026-03-02 22:03:59 -05:00
4 changed files with 54 additions and 6 deletions
+2
@@ -171,6 +171,8 @@ class ChatCog(commands.Cog):
         # Update profile if warranted
         profile_update = result.get("profile_update")
         if profile_update:
+            # Sanitize before storing — strips any quoted toxic language
+            profile_update = await self.bot.llm.sanitize_notes(profile_update)
             self.bot.drama_tracker.set_user_profile(user_id, profile_update)
             self._dirty_users.add(user_id)
+4 -3
@@ -469,13 +469,14 @@ class SentimentCog(commands.Cog):
             # Note update — route to memory system
             if note_update:
-                # Still update the legacy notes for backward compat with analysis prompt
-                self.bot.drama_tracker.update_user_notes(user_id, note_update)
+                # Sanitize before storing — strips any quoted toxic language
+                sanitized = await self.bot.llm.sanitize_notes(note_update)
+                self.bot.drama_tracker.update_user_notes(user_id, sanitized)
                 self._dirty_users.add(user_id)
                 # Also save as an expiring memory (7d default for passive observations)
                 asyncio.create_task(self.bot.db.save_memory(
                     user_id=user_id,
-                    memory=note_update[:500],
+                    memory=sanitized[:500],
                     topics=db_topic_category or "general",
                     importance="medium",
                     expires_at=datetime.now(timezone.utc) + timedelta(days=7),
                 ))
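
Note on the hunk above: asyncio.create_task() schedules save_memory() fire-and-forget, and the event loop keeps only a weak reference to the task, so the asyncio docs recommend holding a strong reference until the task completes. A minimal sketch of that pattern (illustrative only; the spawn() helper and _background_tasks set are not part of this changeset):

    import asyncio

    # Illustrative only; not part of this changeset. Keep strong references
    # to fire-and-forget tasks so they cannot be garbage-collected mid-flight.
    _background_tasks: set[asyncio.Task] = set()

    def spawn(coro) -> asyncio.Task:
        task = asyncio.create_task(coro)
        _background_tasks.add(task)
        task.add_done_callback(_background_tasks.discard)  # drop the reference once done
        return task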
+1 -1
@@ -26,7 +26,7 @@ TOPIC: Flag off_topic if the message is personal drama (relationship issues, feu
 GAME DETECTION: If CHANNEL INFO is provided, set detected_game to the matching channel name from that list, or null if unsure/not game-specific.
-USER NOTES: If provided, use to calibrate (e.g. if notes say "uses heavy profanity casually", profanity alone should score lower). Add a note_update only for genuinely new behavioral observations; null otherwise.
+USER NOTES: If provided, use to calibrate (e.g. if notes say "uses heavy profanity casually", profanity alone should score lower). Add a note_update only for genuinely new behavioral observations; null otherwise. NEVER quote or repeat toxic/offensive language in note_update — describe patterns abstractly (e.g. "directed a personal insult at another user", NOT "called someone a [slur]").
 RULE ENFORCEMENT: If SERVER RULES are provided, report clearly violated rule numbers in violated_rules. Only flag clear violations, not borderline.
+47 -2
@@ -86,7 +86,7 @@ ANALYSIS_TOOL = {
         },
         "note_update": {
             "type": ["string", "null"],
-            "description": "Brief new observation about this user's style/behavior for future reference, or null if nothing new.",
+            "description": "Brief new observation about this user's style/behavior for future reference, or null if nothing new. NEVER quote toxic language — describe patterns abstractly (e.g. 'uses personal insults when frustrated').",
         },
         "detected_game": {
             "type": ["string", "null"],
@@ -189,7 +189,7 @@ CONVERSATION_TOOL = {
         },
         "note_update": {
             "type": ["string", "null"],
-            "description": "New observation about this user's pattern, or null.",
+            "description": "New observation about this user's pattern, or null. NEVER quote toxic language — describe patterns abstractly.",
         },
         "detected_game": {
             "type": ["string", "null"],
@@ -977,6 +977,51 @@
             "profile_update": profile_update,
         }
 
+    async def sanitize_notes(self, notes: str) -> str:
+        """Rewrite user notes to remove any quoted toxic/offensive language.
+
+        Returns the sanitized notes string, or the original on failure.
+        """
+        if not notes or len(notes.strip()) == 0:
+            return notes
+
+        system_prompt = (
+            "Rewrite the following user behavior notes. Remove any quoted offensive language, "
+            "slurs, or profanity. Replace toxic quotes with abstract descriptions of the behavior "
+            "(e.g. 'directed a personal insult at another user' instead of quoting the insult). "
+            "Preserve all non-toxic observations, timestamps, and behavioral patterns exactly. "
+            "Return ONLY the rewritten notes, nothing else."
+        )
+        user_content = notes
+        if self._no_think:
+            user_content += "\n/no_think"
+
+        t0 = time.monotonic()
+        async with self._semaphore:
+            try:
+                temp_kwargs = {"temperature": 0.1} if self._supports_temperature else {}
+                response = await self._client.chat.completions.create(
+                    model=self.model,
+                    messages=[
+                        {"role": "system", "content": system_prompt},
+                        {"role": "user", "content": user_content},
+                    ],
+                    **temp_kwargs,
+                    max_completion_tokens=1024,
+                )
+                elapsed = int((time.monotonic() - t0) * 1000)
+                result = response.choices[0].message.content
+                if result and result.strip():
+                    self._log_llm("sanitize_notes", elapsed, True, notes[:300], result[:300])
+                    return result.strip()
+                self._log_llm("sanitize_notes", elapsed, False, notes[:300], error="Empty response")
+                return notes
+            except Exception as e:
+                elapsed = int((time.monotonic() - t0) * 1000)
+                logger.error("LLM sanitize_notes error: %s", e)
+                self._log_llm("sanitize_notes", elapsed, False, notes[:300], error=str(e))
+                return notes
+
     async def analyze_image(
         self,
         image_bytes: bytes,
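
For illustration, a minimal, runnable sketch of the contract sanitize_notes() implements: empty input passes through, failures fall back to the original string, and toxic quotes come back as abstract descriptions. FakeLLMClient below is a hypothetical stand-in, not part of this changeset:

    import asyncio

    class FakeLLMClient:
        # Hypothetical stand-in mirroring LLMClient.sanitize_notes: empty input
        # passes through unchanged; otherwise an abstract rewrite is returned
        # (the real method falls back to the original notes on API failure).
        async def sanitize_notes(self, notes: str) -> str:
            if not notes or len(notes.strip()) == 0:
                return notes
            return "directed a personal insult at another user during a ranked match"

    async def main() -> None:
        llm = FakeLLMClient()
        raw = 'Called another user a "[slur]" during a ranked match.'
        print(await llm.sanitize_notes(raw))        # abstract description, no quoted toxicity
        print(repr(await llm.sanitize_notes("")))   # '' (empty input passes through)

    asyncio.run(main())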