feat: require warning before mute + sustained toxicity escalation

Gate mutes behind a prior warning: a first offense always gets a warning, and a mute only fires if warned_since_reset is True. The warned flag is persisted to the DB (new Warned column on UserState) and survives restarts.

Add a post-warning escalation boost to get_drama_score: each high-scoring message (toxicity_score >= 0.5) recorded after a warning adds +0.04 (configurable via the escalation_boost parameter), so sustained bad behavior ramps toward the mute threshold instead of plateauing. The boosted score is capped at 1.0.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-25 11:07:57 -05:00
parent f02a4ab49d
commit 71c7b45e9a
4 changed files with 56 additions and 16 deletions
@@ -70,7 +70,7 @@ class DramaTracker:
        user.last_analysis_time = now
        self._prune_entries(user, now)

-    def get_drama_score(self, user_id: int) -> float:
+    def get_drama_score(self, user_id: int, escalation_boost: float = 0.04) -> float:
        user = self.get_user(user_id)
        now = time.time()
        self._prune_entries(user, now)
@@ -86,7 +86,19 @@ class DramaTracker:
            weighted_sum += entry.toxicity_score * weight
            total_weight += weight

-        return weighted_sum / total_weight if total_weight > 0 else 0.0
+        base_score = weighted_sum / total_weight if total_weight > 0 else 0.0
+
+        # Escalation: if warned, each high-scoring message AFTER the warning
+        # adds a boost so sustained bad behavior ramps toward mute threshold
+        if user.warned_since_reset and user.last_warning_time > 0:
+            post_warn_high = sum(
+                1 for e in user.entries
+                if e.timestamp > user.last_warning_time and e.toxicity_score >= 0.5
+            )
+            if post_warn_high > 0:
+                base_score += escalation_boost * post_warn_high
+
+        return min(base_score, 1.0)

    def get_mute_threshold(self, user_id: int, base_threshold: float) -> float:
        """Lower the mute threshold if user was already warned."""
@@ -272,6 +284,8 @@ class DramaTracker:
                user.baseline_coherence = state["baseline_coherence"]
            if "user_notes" in state and state["user_notes"]:
                user.notes = state["user_notes"]
+            if state.get("warned"):
+                user.warned_since_reset = True
            count += 1
        return count