feat: require warning before mute + sustained toxicity escalation
Gate mutes behind a prior warning: a first offense always gets a warning, and a mute only fires if warned_since_reset is True. The warned flag is persisted to the DB (new Warned column on UserState) and survives restarts. Add a post-warning escalation boost to get_drama_score: each high-scoring message (toxicity_score >= 0.5) logged after a warning adds +0.04 (configurable via the escalation_boost parameter) to the score, which is capped at 1.0, so sustained bad behavior ramps toward the mute threshold instead of plateauing. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -126,6 +126,12 @@ class Database:
|
||||
ALTER TABLE UserState ADD UserNotes NVARCHAR(MAX) NULL
|
||||
""")
|
||||
|
||||
# --- Schema migration for warned flag (require warning before mute) ---
|
||||
cursor.execute("""
|
||||
IF COL_LENGTH('UserState', 'Warned') IS NULL
|
||||
ALTER TABLE UserState ADD Warned BIT NOT NULL DEFAULT 0
|
||||
""")
|
||||
|
||||
cursor.execute("""
|
||||
IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = 'BotSettings')
|
||||
CREATE TABLE BotSettings (
|
||||
@@ -284,19 +290,20 @@ class Database:
|
||||
off_topic_count: int,
|
||||
baseline_coherence: float = 0.85,
|
||||
user_notes: str | None = None,
|
||||
warned: bool = False,
|
||||
) -> None:
|
||||
"""Upsert user state (offense count, immunity, off-topic count, coherence baseline, notes)."""
|
||||
"""Upsert user state (offense count, immunity, off-topic count, coherence baseline, notes, warned)."""
|
||||
if not self._available:
|
||||
return
|
||||
try:
|
||||
await asyncio.to_thread(
|
||||
self._save_user_state_sync,
|
||||
user_id, offense_count, immune, off_topic_count, baseline_coherence, user_notes,
|
||||
user_id, offense_count, immune, off_topic_count, baseline_coherence, user_notes, warned,
|
||||
)
|
||||
except Exception:
|
||||
logger.exception("Failed to save user state")
|
||||
|
||||
def _save_user_state_sync(self, user_id, offense_count, immune, off_topic_count, baseline_coherence, user_notes):
|
||||
def _save_user_state_sync(self, user_id, offense_count, immune, off_topic_count, baseline_coherence, user_notes, warned):
|
||||
conn = self._connect()
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
@@ -306,14 +313,14 @@ class Database:
|
||||
ON target.UserId = source.UserId
|
||||
WHEN MATCHED THEN
|
||||
UPDATE SET OffenseCount = ?, Immune = ?, OffTopicCount = ?,
|
||||
BaselineCoherence = ?, UserNotes = ?,
|
||||
BaselineCoherence = ?, UserNotes = ?, Warned = ?,
|
||||
UpdatedAt = SYSUTCDATETIME()
|
||||
WHEN NOT MATCHED THEN
|
||||
INSERT (UserId, OffenseCount, Immune, OffTopicCount, BaselineCoherence, UserNotes)
|
||||
VALUES (?, ?, ?, ?, ?, ?);""",
|
||||
INSERT (UserId, OffenseCount, Immune, OffTopicCount, BaselineCoherence, UserNotes, Warned)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?);""",
|
||||
user_id,
|
||||
offense_count, 1 if immune else 0, off_topic_count, baseline_coherence, user_notes,
|
||||
user_id, offense_count, 1 if immune else 0, off_topic_count, baseline_coherence, user_notes,
|
||||
offense_count, 1 if immune else 0, off_topic_count, baseline_coherence, user_notes, 1 if warned else 0,
|
||||
user_id, offense_count, 1 if immune else 0, off_topic_count, baseline_coherence, user_notes, 1 if warned else 0,
|
||||
)
|
||||
cursor.close()
|
||||
finally:
|
||||
@@ -356,7 +363,7 @@ class Database:
|
||||
try:
|
||||
cursor = conn.cursor()
|
||||
cursor.execute(
|
||||
"SELECT UserId, OffenseCount, Immune, OffTopicCount, BaselineCoherence, UserNotes FROM UserState"
|
||||
"SELECT UserId, OffenseCount, Immune, OffTopicCount, BaselineCoherence, UserNotes, Warned FROM UserState"
|
||||
)
|
||||
rows = cursor.fetchall()
|
||||
cursor.close()
|
||||
@@ -368,6 +375,7 @@ class Database:
|
||||
"off_topic_count": row[3],
|
||||
"baseline_coherence": float(row[4]),
|
||||
"user_notes": row[5] or "",
|
||||
"warned": bool(row[6]),
|
||||
}
|
||||
for row in rows
|
||||
]
|
||||
|
||||
@@ -70,7 +70,7 @@ class DramaTracker:
|
||||
user.last_analysis_time = now
|
||||
self._prune_entries(user, now)
|
||||
|
||||
def get_drama_score(self, user_id: int) -> float:
|
||||
def get_drama_score(self, user_id: int, escalation_boost: float = 0.04) -> float:
|
||||
user = self.get_user(user_id)
|
||||
now = time.time()
|
||||
self._prune_entries(user, now)
|
||||
@@ -86,7 +86,19 @@ class DramaTracker:
|
||||
weighted_sum += entry.toxicity_score * weight
|
||||
total_weight += weight
|
||||
|
||||
return weighted_sum / total_weight if total_weight > 0 else 0.0
|
||||
base_score = weighted_sum / total_weight if total_weight > 0 else 0.0
|
||||
|
||||
# Escalation: if warned, each high-scoring message AFTER the warning
|
||||
# adds a boost so sustained bad behavior ramps toward mute threshold
|
||||
if user.warned_since_reset and user.last_warning_time > 0:
|
||||
post_warn_high = sum(
|
||||
1 for e in user.entries
|
||||
if e.timestamp > user.last_warning_time and e.toxicity_score >= 0.5
|
||||
)
|
||||
if post_warn_high > 0:
|
||||
base_score += escalation_boost * post_warn_high
|
||||
|
||||
return min(base_score, 1.0)
|
||||
|
||||
def get_mute_threshold(self, user_id: int, base_threshold: float) -> float:
|
||||
"""Lower the mute threshold if user was already warned."""
|
||||
@@ -272,6 +284,8 @@ class DramaTracker:
|
||||
user.baseline_coherence = state["baseline_coherence"]
|
||||
if "user_notes" in state and state["user_notes"]:
|
||||
user.notes = state["user_notes"]
|
||||
if state.get("warned"):
|
||||
user.warned_since_reset = True
|
||||
count += 1
|
||||
return count
|
||||
|
||||
|
||||
Reference in New Issue
Block a user