feat: add server rule violation detection and compress prompts

- LLM now evaluates messages against numbered server rules and reports
  violated_rules in analysis output
- Warnings and mutes cite the specific rule(s) broken
- Rules extracted to prompts/rules.txt so they can be interpolated into
  LLM prompts at analysis time
- Personality prompts moved to prompts/personalities/ and compressed
  (~63% reduction across all prompt files)
- All prompt files tightened: removed redundancy, consolidated Do NOT
  sections, trimmed examples while preserving behavioral instructions

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-27 22:14:35 -05:00
parent ed51db527c
commit bf32a9536a
22 changed files with 230 additions and 293 deletions

View File

@@ -92,6 +92,11 @@ ANALYSIS_TOOL = {
"type": ["string", "null"],
"description": "The game channel name this message is about (e.g. 'gta-online', 'warzone'), or null if not game-specific.",
},
"violated_rules": {
"type": "array",
"items": {"type": "integer"},
"description": "Rule numbers violated (empty array if none).",
},
},
"required": ["toxicity_score", "categories", "reasoning", "off_topic", "topic_category", "topic_reasoning", "coherence_score", "coherence_flag"],
},
@@ -190,6 +195,11 @@ CONVERSATION_TOOL = {
"type": ["string", "null"],
"description": "The game channel name this user's messages are about, or null.",
},
"violated_rules": {
"type": "array",
"items": {"type": "integer"},
"description": "Rule numbers violated (empty array if none).",
},
},
"required": ["username", "toxicity_score", "categories", "reasoning", "off_topic", "topic_category", "topic_reasoning", "coherence_score", "coherence_flag"],
},
@@ -299,12 +309,15 @@ class LLMClient:
async def analyze_message(
self, message: str, context: str = "", user_notes: str = "",
channel_context: str = "", mention_context: str = "",
rules_context: str = "",
) -> dict | None:
user_content = f"=== RECENT CHANNEL MESSAGES (for background context only) ===\n{context}\n\n"
if user_notes:
user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
if channel_context:
user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
if rules_context:
user_content += f"=== SERVER RULES ===\n{rules_context}\n\n"
if mention_context:
user_content += f"=== USER REPORT (a user flagged this conversation — focus on this concern) ===\n{mention_context}\n\n"
user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
@@ -382,6 +395,8 @@ class LLMClient:
result.setdefault("note_update", None)
result.setdefault("detected_game", None)
if not isinstance(result.get("violated_rules"), list):
result["violated_rules"] = []
return result
@@ -490,6 +505,7 @@ class LLMClient:
user_notes_map: dict[str, str] | None = None,
new_message_start: int | None = None,
user_aliases: str = "",
rules_context: str = "",
) -> dict | None:
"""Analyze a conversation block in one call, returning per-user findings."""
if not messages:
@@ -506,6 +522,8 @@ class LLMClient:
user_content += "=== USER NOTES (from prior analysis) ===\n" + "\n".join(notes_lines) + "\n\n"
if channel_context:
user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
if rules_context:
user_content += f"=== SERVER RULES ===\n{rules_context}\n\n"
if mention_context:
user_content += f"=== USER REPORT (a user flagged this conversation — focus on this concern) ===\n{mention_context}\n\n"
user_content += "Analyze the conversation block above and report findings for each user."
@@ -587,6 +605,8 @@ class LLMClient:
finding.setdefault("coherence_flag", "normal")
finding.setdefault("note_update", None)
finding.setdefault("detected_game", None)
if not isinstance(finding.get("violated_rules"), list):
finding["violated_rules"] = []
result["user_findings"] = findings
result.setdefault("conversation_summary", "")
return result