feat: add warning expiration and exclude moderated messages from context

Warning flag now auto-expires after a configurable duration
(warning_expiration_minutes, default 30m). After expiry, the user must
be re-warned before a mute can be issued.

Messages that triggered moderation actions (warnings/mutes) are now
excluded from the LLM context window in both buffered analysis and
mention scans, preventing already-actioned content from influencing
future scoring. Moderated message IDs are tracked in memory; after a
restart, the bot's own ⚠️ reaction on a message serves as the fallback
marker.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-28 13:39:49 -05:00
parent 36df4cf5a6
commit eb7eb81621
6 changed files with 86 additions and 18 deletions
+1
View File
@@ -112,6 +112,7 @@ class BCSBot(commands.Bot):
window_size=sentiment.get("rolling_window_size", 10), window_size=sentiment.get("rolling_window_size", 10),
window_minutes=sentiment.get("rolling_window_minutes", 15), window_minutes=sentiment.get("rolling_window_minutes", 15),
offense_reset_minutes=timeouts.get("offense_reset_minutes", 120), offense_reset_minutes=timeouts.get("offense_reset_minutes", 120),
warning_expiration_minutes=timeouts.get("warning_expiration_minutes", 30),
) )
def get_mode_config(self) -> dict: def get_mode_config(self) -> dict:
+33 -8
View File
@@ -66,6 +66,7 @@ class SentimentCog(commands.Cog):
self._mention_scan_results: dict[int, str] = {} # {trigger_message_id: findings_summary} self._mention_scan_results: dict[int, str] = {} # {trigger_message_id: findings_summary}
self._analyzed_message_ids: set[int] = set() # Discord message IDs already analyzed self._analyzed_message_ids: set[int] = set() # Discord message IDs already analyzed
self._max_analyzed_ids = 500 self._max_analyzed_ids = 500
self._moderated_message_ids: set[int] = set() # Message IDs that triggered moderation
async def cog_load(self): async def cog_load(self):
@@ -206,21 +207,24 @@ class SentimentCog(commands.Cog):
thresholds: dict, thresholds: dict,
db_message_id: int | None, db_message_id: int | None,
violated_rules: list[int] | None = None, violated_rules: list[int] | None = None,
) -> None: ) -> bool:
"""Issue a warning or mute based on scores and thresholds.""" """Issue a warning or mute based on scores and thresholds.
Returns True if any moderation action was taken."""
rules_config = _RULES_DICT rules_config = _RULES_DICT
mute_threshold = self.bot.drama_tracker.get_mute_threshold(user_id, thresholds["mute"]) mute_threshold = self.bot.drama_tracker.get_mute_threshold(user_id, thresholds["mute"])
user_data = self.bot.drama_tracker.get_user(user_id)
if drama_score >= mute_threshold or score >= thresholds["spike_mute"]: if drama_score >= mute_threshold or score >= thresholds["spike_mute"]:
effective_score = max(drama_score, score) effective_score = max(drama_score, score)
if user_data.warned_since_reset: if self.bot.drama_tracker.is_warned(user_id):
await mute_user(self.bot, message, effective_score, categories, db_message_id, self._dirty_users, violated_rules=violated_rules, rules_config=rules_config) await mute_user(self.bot, message, effective_score, categories, db_message_id, self._dirty_users, violated_rules=violated_rules, rules_config=rules_config)
else: else:
logger.info("Downgrading mute to warning for %s (no prior warning)", message.author) logger.info("Downgrading mute to warning for %s (no prior warning)", message.author)
await warn_user(self.bot, message, effective_score, db_message_id, self._dirty_users, violated_rules=violated_rules, rules_config=rules_config) await warn_user(self.bot, message, effective_score, db_message_id, self._dirty_users, violated_rules=violated_rules, rules_config=rules_config)
return True
elif drama_score >= thresholds["warning"] or score >= thresholds["spike_warn"]: elif drama_score >= thresholds["warning"] or score >= thresholds["spike_warn"]:
effective_score = max(drama_score, score) effective_score = max(drama_score, score)
await warn_user(self.bot, message, effective_score, db_message_id, self._dirty_users, violated_rules=violated_rules, rules_config=rules_config) await warn_user(self.bot, message, effective_score, db_message_id, self._dirty_users, violated_rules=violated_rules, rules_config=rules_config)
return True
return False
@staticmethod @staticmethod
def _build_rules_context() -> str: def _build_rules_context() -> str:
@@ -484,10 +488,14 @@ class SentimentCog(commands.Cog):
# Moderation # Moderation
if not dry_run: if not dry_run:
await self._apply_moderation( acted = await self._apply_moderation(
user_ref_msg, user_id, score, drama_score, categories, thresholds, db_message_id, user_ref_msg, user_id, score, drama_score, categories, thresholds, db_message_id,
violated_rules=violated_rules, violated_rules=violated_rules,
) )
if acted:
for m in user_msgs:
self._moderated_message_ids.add(m.id)
self._prune_moderated_ids()
return (username, score, drama_score, categories) return (username, score, drama_score, categories)
@@ -514,11 +522,13 @@ class SentimentCog(commands.Cog):
oldest_buffered = messages[0] oldest_buffered = messages[0]
history_messages: list[discord.Message] = [] history_messages: list[discord.Message] = []
try: try:
async for msg in channel.history(limit=context_count + 5, before=oldest_buffered): async for msg in channel.history(limit=context_count + 10, before=oldest_buffered):
if msg.author.bot: if msg.author.bot:
continue continue
if not msg.content or not msg.content.strip(): if not msg.content or not msg.content.strip():
continue continue
if self._was_moderated(msg):
continue
history_messages.append(msg) history_messages.append(msg)
if len(history_messages) >= context_count: if len(history_messages) >= context_count:
break break
@@ -636,6 +646,19 @@ class SentimentCog(commands.Cog):
sorted_ids = sorted(self._analyzed_message_ids) sorted_ids = sorted(self._analyzed_message_ids)
self._analyzed_message_ids = set(sorted_ids[len(sorted_ids) // 2:]) self._analyzed_message_ids = set(sorted_ids[len(sorted_ids) // 2:])
def _prune_moderated_ids(self):
"""Cap the moderated message ID set to avoid unbounded growth."""
if len(self._moderated_message_ids) > self._max_analyzed_ids:
sorted_ids = sorted(self._moderated_message_ids)
self._moderated_message_ids = set(sorted_ids[len(sorted_ids) // 2:])
def _was_moderated(self, msg: discord.Message) -> bool:
"""Check if a message already triggered moderation (in-memory or via reaction)."""
if msg.id in self._moderated_message_ids:
return True
# Fall back to checking for bot's warning reaction (survives restarts)
return any(str(r.emoji) == "\u26a0\ufe0f" and r.me for r in msg.reactions)
async def _maybe_start_mention_scan( async def _maybe_start_mention_scan(
self, trigger_message: discord.Message, mention_config: dict self, trigger_message: discord.Message, mention_config: dict
): ):
@@ -683,14 +706,16 @@ class SentimentCog(commands.Cog):
sentiment_config = config.get("sentiment", {}) sentiment_config = config.get("sentiment", {})
game_channels = config.get("game_channels", {}) game_channels = config.get("game_channels", {})
# Fetch recent messages (before the trigger, skip bots/empty) # Fetch recent messages (before the trigger, skip bots/empty/moderated)
raw_messages: list[discord.Message] = [] raw_messages: list[discord.Message] = []
try: try:
async for msg in channel.history(limit=scan_count + 10, before=trigger_message): async for msg in channel.history(limit=scan_count + 20, before=trigger_message):
if msg.author.bot: if msg.author.bot:
continue continue
if not msg.content or not msg.content.strip(): if not msg.content or not msg.content.strip():
continue continue
if self._was_moderated(msg):
continue
raw_messages.append(msg) raw_messages.append(msg)
if len(raw_messages) >= scan_count: if len(raw_messages) >= scan_count:
break break
+2
View File
@@ -22,6 +22,7 @@ def save_user_state(bot, dirty_users: set[int], user_id: int) -> None:
warned=user_data.warned_since_reset, warned=user_data.warned_since_reset,
last_offense_at=user_data.last_offense_time or None, last_offense_at=user_data.last_offense_time or None,
aliases=_aliases_csv(user_data), aliases=_aliases_csv(user_data),
warning_expires_at=user_data.warning_expires_at or None,
)) ))
dirty_users.discard(user_id) dirty_users.discard(user_id)
@@ -44,5 +45,6 @@ async def flush_dirty_states(bot, dirty_users: set[int]) -> None:
warned=user_data.warned_since_reset, warned=user_data.warned_since_reset,
last_offense_at=user_data.last_offense_time or None, last_offense_at=user_data.last_offense_time or None,
aliases=_aliases_csv(user_data), aliases=_aliases_csv(user_data),
warning_expires_at=user_data.warning_expires_at or None,
) )
logger.info("Flushed %d dirty user states to DB.", len(dirty)) logger.info("Flushed %d dirty user states to DB.", len(dirty))
+1
View File
@@ -44,6 +44,7 @@ timeouts:
escalation_minutes: [30, 60, 120, 240] # Escalating timeout durations escalation_minutes: [30, 60, 120, 240] # Escalating timeout durations
offense_reset_minutes: 1440 # Reset offense counter after this much good behavior (24h) offense_reset_minutes: 1440 # Reset offense counter after this much good behavior (24h)
warning_cooldown_minutes: 5 # Don't warn same user more than once per this window warning_cooldown_minutes: 5 # Don't warn same user more than once per this window
warning_expiration_minutes: 30 # Warning expires after this long — user must be re-warned before mute (0 or less = warning never expires)
messages: messages:
warning: "Easy there, {username}. The Breehavior Monitor is watching. \U0001F440" warning: "Easy there, {username}. The Breehavior Monitor is watching. \U0001F440"
+17 -9
View File
@@ -144,6 +144,12 @@ class Database:
ALTER TABLE UserState ADD Aliases NVARCHAR(500) NULL ALTER TABLE UserState ADD Aliases NVARCHAR(500) NULL
""") """)
# --- Schema migration for warning expiration ---
cursor.execute("""
IF COL_LENGTH('UserState', 'WarningExpiresAt') IS NULL
ALTER TABLE UserState ADD WarningExpiresAt FLOAT NULL
""")
cursor.execute(""" cursor.execute("""
IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = 'BotSettings') IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = 'BotSettings')
CREATE TABLE BotSettings ( CREATE TABLE BotSettings (
@@ -321,19 +327,20 @@ class Database:
warned: bool = False, warned: bool = False,
last_offense_at: float | None = None, last_offense_at: float | None = None,
aliases: str | None = None, aliases: str | None = None,
warning_expires_at: float | None = None,
) -> None: ) -> None:
"""Upsert user state (offense count, immunity, off-topic count, coherence baseline, notes, warned, last offense time, aliases).""" """Upsert user state (offense count, immunity, off-topic count, coherence baseline, notes, warned, last offense time, aliases, warning expiration)."""
if not self._available: if not self._available:
return return
try: try:
await asyncio.to_thread( await asyncio.to_thread(
self._save_user_state_sync, self._save_user_state_sync,
user_id, offense_count, immune, off_topic_count, baseline_coherence, user_notes, warned, last_offense_at, aliases, user_id, offense_count, immune, off_topic_count, baseline_coherence, user_notes, warned, last_offense_at, aliases, warning_expires_at,
) )
except Exception: except Exception:
logger.exception("Failed to save user state") logger.exception("Failed to save user state")
def _save_user_state_sync(self, user_id, offense_count, immune, off_topic_count, baseline_coherence, user_notes, warned, last_offense_at, aliases): def _save_user_state_sync(self, user_id, offense_count, immune, off_topic_count, baseline_coherence, user_notes, warned, last_offense_at, aliases, warning_expires_at):
conn = self._connect() conn = self._connect()
try: try:
cursor = conn.cursor() cursor = conn.cursor()
@@ -344,14 +351,14 @@ class Database:
WHEN MATCHED THEN WHEN MATCHED THEN
UPDATE SET OffenseCount = ?, Immune = ?, OffTopicCount = ?, UPDATE SET OffenseCount = ?, Immune = ?, OffTopicCount = ?,
BaselineCoherence = ?, UserNotes = ?, Warned = ?, BaselineCoherence = ?, UserNotes = ?, Warned = ?,
LastOffenseAt = ?, Aliases = ?, LastOffenseAt = ?, Aliases = ?, WarningExpiresAt = ?,
UpdatedAt = SYSUTCDATETIME() UpdatedAt = SYSUTCDATETIME()
WHEN NOT MATCHED THEN WHEN NOT MATCHED THEN
INSERT (UserId, OffenseCount, Immune, OffTopicCount, BaselineCoherence, UserNotes, Warned, LastOffenseAt, Aliases) INSERT (UserId, OffenseCount, Immune, OffTopicCount, BaselineCoherence, UserNotes, Warned, LastOffenseAt, Aliases, WarningExpiresAt)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?);""", VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""",
user_id, user_id,
offense_count, 1 if immune else 0, off_topic_count, baseline_coherence, user_notes, 1 if warned else 0, last_offense_at, aliases, offense_count, 1 if immune else 0, off_topic_count, baseline_coherence, user_notes, 1 if warned else 0, last_offense_at, aliases, warning_expires_at,
user_id, offense_count, 1 if immune else 0, off_topic_count, baseline_coherence, user_notes, 1 if warned else 0, last_offense_at, aliases, user_id, offense_count, 1 if immune else 0, off_topic_count, baseline_coherence, user_notes, 1 if warned else 0, last_offense_at, aliases, warning_expires_at,
) )
cursor.close() cursor.close()
finally: finally:
@@ -394,7 +401,7 @@ class Database:
try: try:
cursor = conn.cursor() cursor = conn.cursor()
cursor.execute( cursor.execute(
"SELECT UserId, OffenseCount, Immune, OffTopicCount, BaselineCoherence, UserNotes, Warned, LastOffenseAt, Aliases FROM UserState" "SELECT UserId, OffenseCount, Immune, OffTopicCount, BaselineCoherence, UserNotes, Warned, LastOffenseAt, Aliases, WarningExpiresAt FROM UserState"
) )
rows = cursor.fetchall() rows = cursor.fetchall()
cursor.close() cursor.close()
@@ -409,6 +416,7 @@ class Database:
"warned": bool(row[6]), "warned": bool(row[6]),
"last_offense_at": float(row[7]) if row[7] is not None else 0.0, "last_offense_at": float(row[7]) if row[7] is not None else 0.0,
"aliases": row[8] or "", "aliases": row[8] or "",
"warning_expires_at": float(row[9]) if row[9] is not None else 0.0,
} }
for row in rows for row in rows
] ]
+32 -1
View File
@@ -19,6 +19,7 @@ class UserDrama:
last_warning_time: float = 0.0 last_warning_time: float = 0.0
last_analysis_time: float = 0.0 last_analysis_time: float = 0.0
warned_since_reset: bool = False warned_since_reset: bool = False
warning_expires_at: float = 0.0
immune: bool = False immune: bool = False
# Topic drift tracking # Topic drift tracking
off_topic_count: int = 0 off_topic_count: int = 0
@@ -40,10 +41,12 @@ class DramaTracker:
window_size: int = 10, window_size: int = 10,
window_minutes: int = 15, window_minutes: int = 15,
offense_reset_minutes: int = 120, offense_reset_minutes: int = 120,
warning_expiration_minutes: int = 30,
): ):
self.window_size = window_size self.window_size = window_size
self.window_seconds = window_minutes * 60 self.window_seconds = window_minutes * 60
self.offense_reset_seconds = offense_reset_minutes * 60 self.offense_reset_seconds = offense_reset_minutes * 60
self.warning_expiration_seconds = warning_expiration_minutes * 60
self._users: dict[int, UserDrama] = {} self._users: dict[int, UserDrama] = {}
def get_user(self, user_id: int) -> UserDrama: def get_user(self, user_id: int) -> UserDrama:
@@ -74,6 +77,7 @@ class DramaTracker:
def get_drama_score(self, user_id: int, escalation_boost: float = 0.04) -> float: def get_drama_score(self, user_id: int, escalation_boost: float = 0.04) -> float:
user = self.get_user(user_id) user = self.get_user(user_id)
self._expire_warning(user)
now = time.time() now = time.time()
self._prune_entries(user, now) self._prune_entries(user, now)
@@ -105,6 +109,7 @@ class DramaTracker:
def get_mute_threshold(self, user_id: int, base_threshold: float) -> float: def get_mute_threshold(self, user_id: int, base_threshold: float) -> float:
"""Lower the mute threshold if user was already warned.""" """Lower the mute threshold if user was already warned."""
user = self.get_user(user_id) user = self.get_user(user_id)
self._expire_warning(user)
if user.warned_since_reset: if user.warned_since_reset:
return base_threshold - 0.05 return base_threshold - 0.05
return base_threshold return base_threshold
@@ -123,12 +128,34 @@ class DramaTracker:
user.offense_count += 1 user.offense_count += 1
user.last_offense_time = now user.last_offense_time = now
user.warned_since_reset = False user.warned_since_reset = False
user.warning_expires_at = 0.0
return user.offense_count return user.offense_count
def record_warning(self, user_id: int) -> None: def record_warning(self, user_id: int) -> None:
user = self.get_user(user_id) user = self.get_user(user_id)
user.last_warning_time = time.time() now = time.time()
user.last_warning_time = now
user.warned_since_reset = True user.warned_since_reset = True
if self.warning_expiration_seconds > 0:
user.warning_expires_at = now + self.warning_expiration_seconds
else:
user.warning_expires_at = 0.0 # Never expires
def _expire_warning(self, user: UserDrama) -> None:
"""Clear warned flag if the warning has expired."""
if (
user.warned_since_reset
and user.warning_expires_at > 0
and time.time() >= user.warning_expires_at
):
user.warned_since_reset = False
user.warning_expires_at = 0.0
def is_warned(self, user_id: int) -> bool:
    """Report whether ``user_id`` currently holds an unexpired warning.

    Fetches the user's record via ``get_user``, runs ``_expire_warning`` on
    it, and then returns the record's ``warned_since_reset`` flag.
    """
    record = self.get_user(user_id)
    # Apply expiry first so the flag read below is current.
    self._expire_warning(record)
    return record.warned_since_reset
def can_warn(self, user_id: int, cooldown_minutes: int) -> bool: def can_warn(self, user_id: int, cooldown_minutes: int) -> bool:
user = self.get_user(user_id) user = self.get_user(user_id)
@@ -303,12 +330,16 @@ class DramaTracker:
user.notes = state["user_notes"] user.notes = state["user_notes"]
if state.get("warned"): if state.get("warned"):
user.warned_since_reset = True user.warned_since_reset = True
user.warning_expires_at = state.get("warning_expires_at", 0.0) or 0.0
# Expire warning at load time if it's past due
self._expire_warning(user)
if state.get("last_offense_at"): if state.get("last_offense_at"):
user.last_offense_time = state["last_offense_at"] user.last_offense_time = state["last_offense_at"]
# Apply time-based offense reset at load time # Apply time-based offense reset at load time
if time.time() - user.last_offense_time > self.offense_reset_seconds: if time.time() - user.last_offense_time > self.offense_reset_seconds:
user.offense_count = 0 user.offense_count = 0
user.warned_since_reset = False user.warned_since_reset = False
user.warning_expires_at = 0.0
user.last_offense_time = 0.0 user.last_offense_time = 0.0
if state.get("aliases"): if state.get("aliases"):
user.aliases = [a.strip() for a in state["aliases"].split(",") if a.strip()] user.aliases = [a.strip() for a in state["aliases"].split(",") if a.strip()]