feat: require warning before mute + sustained toxicity escalation

Gate mutes behind a prior warning: a first offense always gets a warning, and a mute only fires if warned_since_reset is True. The warned flag is persisted to the DB (new Warned column on UserState) and survives restarts.

Add a post-warning escalation boost to drama_score: each high-scoring message (toxicity >= 0.5) after a warning adds +0.04 (configurable via escalation_boost), so sustained bad behavior ramps toward the mute threshold instead of plateauing. The boosted score is capped at 1.0.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-25 11:07:57 -05:00
parent f02a4ab49d
commit 71c7b45e9a
4 changed files with 56 additions and 16 deletions
@@ -292,7 +292,8 @@ class SentimentCog(commands.Cog):

            # Track the result in DramaTracker
            self.bot.drama_tracker.add_entry(user_id, score, categories, reasoning)
-            drama_score = self.bot.drama_tracker.get_drama_score(user_id)
+            escalation_boost = sentiment_config.get("escalation_boost", 0.04)
+            drama_score = self.bot.drama_tracker.get_drama_score(user_id, escalation_boost=escalation_boost)

            logger.info(
                "User %s (%d) | msg_score=%.2f | drama_score=%.2f | categories=%s | %s",
@@ -358,10 +359,16 @@ class SentimentCog(commands.Cog):
                mute_threshold = self.bot.drama_tracker.get_mute_threshold(
                    user_id, base_mute_threshold
                )
+                user_data = self.bot.drama_tracker.get_user(user_id)
                # Mute: rolling average OR single message spike
                if drama_score >= mute_threshold or score >= spike_mute:
                    effective_score = max(drama_score, score)
-                    await self._mute_user(user_ref_msg, effective_score, categories, db_message_id)
+                    if user_data.warned_since_reset:
+                        await self._mute_user(user_ref_msg, effective_score, categories, db_message_id)
+                    else:
+                        # Downgrade to warning — require a warning before muting
+                        logger.info("Downgrading mute to warning for %s (no prior warning)", user_ref_msg.author)
+                        await self._warn_user(user_ref_msg, effective_score, db_message_id)
                # Warn: rolling average OR single message spike
                elif drama_score >= warning_threshold or score >= spike_warn:
                    effective_score = max(drama_score, score)
@@ -556,7 +563,8 @@ class SentimentCog(commands.Cog):
                self._mark_analyzed(m.id)

            self.bot.drama_tracker.add_entry(user_id, score, categories, reasoning)
-            drama_score = self.bot.drama_tracker.get_drama_score(user_id)
+            escalation_boost = sentiment_config.get("escalation_boost", 0.04)
+            drama_score = self.bot.drama_tracker.get_drama_score(user_id, escalation_boost=escalation_boost)

            # Save to DB
            content_summary = f"[Mention scan] {worst_msg}" if worst_msg else "[Mention scan] See conversation"
@@ -599,9 +607,14 @@ class SentimentCog(commands.Cog):
                mute_threshold = self.bot.drama_tracker.get_mute_threshold(
                    user_id, base_mute_threshold
                )
+                user_data = self.bot.drama_tracker.get_user(user_id)
                if drama_score >= mute_threshold or score >= spike_mute:
                    effective_score = max(drama_score, score)
-                    await self._mute_user(ref_msg, effective_score, categories, db_message_id)
+                    if user_data.warned_since_reset:
+                        await self._mute_user(ref_msg, effective_score, categories, db_message_id)
+                    else:
+                        logger.info("Downgrading mute to warning for %s (no prior warning)", ref_msg.author)
+                        await self._warn_user(ref_msg, effective_score, db_message_id)
                elif drama_score >= warning_threshold or score >= spike_warn:
                    effective_score = max(drama_score, score)
                    await self._warn_user(ref_msg, effective_score, db_message_id)
@@ -747,6 +760,8 @@ class SentimentCog(commands.Cog):
            message_id=db_message_id,
            details=f"score={score:.2f}",
        ))
+        # Persist warned flag immediately so it survives restarts
+        self._save_user_state(message.author.id)

    async def _handle_topic_drift(
        self, message: discord.Message, topic_category: str, topic_reasoning: str,
@@ -897,6 +912,7 @@ class SentimentCog(commands.Cog):
            off_topic_count=user_data.off_topic_count,
            baseline_coherence=user_data.baseline_coherence,
            user_notes=user_data.notes or None,
+            warned=user_data.warned_since_reset,
        ))
        self._dirty_users.discard(user_id)

@@ -923,6 +939,7 @@ class SentimentCog(commands.Cog):
                off_topic_count=user_data.off_topic_count,
                baseline_coherence=user_data.baseline_coherence,
                user_notes=user_data.notes or None,
+                warned=user_data.warned_since_reset,
            )
        logger.info("Flushed %d dirty user states to DB.", len(dirty))

@@ -17,8 +17,9 @@ sentiment:
  context_messages: 8  # Number of previous messages to include as context
  rolling_window_size: 10  # Number of messages to track per user
  rolling_window_minutes: 15  # Time window for tracking
-  batch_window_seconds: 10  # Wait this long for more messages before analyzing (debounce)
+  batch_window_seconds: 4  # Wait this long for more messages before analyzing (debounce)
  escalation_threshold: 0.25  # Triage toxicity score that triggers re-analysis with heavy model
+  escalation_boost: 0.04  # Per-message drama boost after warning (sustained toxicity ramps toward mute)

 game_channels:
  gta-online: "GTA Online"
@@ -126,6 +126,12 @@ class Database:
                ALTER TABLE UserState ADD UserNotes NVARCHAR(MAX) NULL
        """)

+        # --- Schema migration for warned flag (require warning before mute) ---
+        cursor.execute("""
+            IF COL_LENGTH('UserState', 'Warned') IS NULL
+                ALTER TABLE UserState ADD Warned BIT NOT NULL DEFAULT 0
+        """)
+
        cursor.execute("""
            IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = 'BotSettings')
            CREATE TABLE BotSettings (
@@ -284,19 +290,20 @@ class Database:
        off_topic_count: int,
        baseline_coherence: float = 0.85,
        user_notes: str | None = None,
+        warned: bool = False,
    ) -> None:
-        """Upsert user state (offense count, immunity, off-topic count, coherence baseline, notes)."""
+        """Upsert user state (offense count, immunity, off-topic count, coherence baseline, notes, warned)."""
        if not self._available:
            return
        try:
            await asyncio.to_thread(
                self._save_user_state_sync,
-                user_id, offense_count, immune, off_topic_count, baseline_coherence, user_notes,
+                user_id, offense_count, immune, off_topic_count, baseline_coherence, user_notes, warned,
            )
        except Exception:
            logger.exception("Failed to save user state")

-    def _save_user_state_sync(self, user_id, offense_count, immune, off_topic_count, baseline_coherence, user_notes):
+    def _save_user_state_sync(self, user_id, offense_count, immune, off_topic_count, baseline_coherence, user_notes, warned):
        conn = self._connect()
        try:
            cursor = conn.cursor()
@@ -306,14 +313,14 @@ class Database:
                   ON target.UserId = source.UserId
                   WHEN MATCHED THEN
                       UPDATE SET OffenseCount = ?, Immune = ?, OffTopicCount = ?,
-                                  BaselineCoherence = ?, UserNotes = ?,
+                                  BaselineCoherence = ?, UserNotes = ?, Warned = ?,
                                  UpdatedAt = SYSUTCDATETIME()
                   WHEN NOT MATCHED THEN
-                       INSERT (UserId, OffenseCount, Immune, OffTopicCount, BaselineCoherence, UserNotes)
-                       VALUES (?, ?, ?, ?, ?, ?);""",
+                       INSERT (UserId, OffenseCount, Immune, OffTopicCount, BaselineCoherence, UserNotes, Warned)
+                       VALUES (?, ?, ?, ?, ?, ?, ?);""",
                user_id,
-                offense_count, 1 if immune else 0, off_topic_count, baseline_coherence, user_notes,
-                user_id, offense_count, 1 if immune else 0, off_topic_count, baseline_coherence, user_notes,
+                offense_count, 1 if immune else 0, off_topic_count, baseline_coherence, user_notes, 1 if warned else 0,
+                user_id, offense_count, 1 if immune else 0, off_topic_count, baseline_coherence, user_notes, 1 if warned else 0,
            )
            cursor.close()
        finally:
@@ -356,7 +363,7 @@ class Database:
        try:
            cursor = conn.cursor()
            cursor.execute(
-                "SELECT UserId, OffenseCount, Immune, OffTopicCount, BaselineCoherence, UserNotes FROM UserState"
+                "SELECT UserId, OffenseCount, Immune, OffTopicCount, BaselineCoherence, UserNotes, Warned FROM UserState"
            )
            rows = cursor.fetchall()
            cursor.close()
@@ -368,6 +375,7 @@ class Database:
                    "off_topic_count": row[3],
                    "baseline_coherence": float(row[4]),
                    "user_notes": row[5] or "",
+                    "warned": bool(row[6]),
                }
                for row in rows
            ]
@@ -70,7 +70,7 @@ class DramaTracker:
        user.last_analysis_time = now
        self._prune_entries(user, now)

-    def get_drama_score(self, user_id: int) -> float:
+    def get_drama_score(self, user_id: int, escalation_boost: float = 0.04) -> float:
        user = self.get_user(user_id)
        now = time.time()
        self._prune_entries(user, now)
@@ -86,7 +86,19 @@ class DramaTracker:
            weighted_sum += entry.toxicity_score * weight
            total_weight += weight

-        return weighted_sum / total_weight if total_weight > 0 else 0.0
+        base_score = weighted_sum / total_weight if total_weight > 0 else 0.0
+
+        # Escalation: if warned, each high-scoring message AFTER the warning
+        # adds a boost so sustained bad behavior ramps toward mute threshold
+        if user.warned_since_reset and user.last_warning_time > 0:
+            post_warn_high = sum(
+                1 for e in user.entries
+                if e.timestamp > user.last_warning_time and e.toxicity_score >= 0.5
+            )
+            if post_warn_high > 0:
+                base_score += escalation_boost * post_warn_high
+
+        return min(base_score, 1.0)

    def get_mute_threshold(self, user_id: int, base_threshold: float) -> float:
        """Lower the mute threshold if user was already warned."""
@@ -272,6 +284,8 @@ class DramaTracker:
                user.baseline_coherence = state["baseline_coherence"]
            if "user_notes" in state and state["user_notes"]:
                user.notes = state["user_notes"]
+            if state.get("warned"):
+                user.warned_since_reset = True
            count += 1
        return count