Compare commits

...

3 Commits

Author SHA1 Message Date
aj b410200146 Add max_tokens=1024 to LLM analysis calls
The analyze_message and raw_analyze methods had no max_tokens limit,
causing thinking models (Qwen3-VL-32B-Thinking) to generate unlimited
reasoning tokens before responding — taking 5+ minutes per message.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 14:17:59 -05:00
aj 1151b705c0 Add LLM request queue, streaming chat, and rename ollama_client to llm_client
- Serialize all LLM requests through an asyncio semaphore to prevent
  overloading athena with concurrent requests
- Switch chat() to streaming so the typing indicator only appears once
  the model starts generating (not during thinking/loading)
- Increase LLM timeout from 5 to 10 minutes for slow first loads
- Rename ollama_client.py to llm_client.py and self.ollama to self.llm
  since the bot uses a generic OpenAI-compatible API
- Update embed labels from "Ollama" to "LLM"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 13:45:12 -05:00
aj 645b924011 Extract LLM prompts to separate text files and fix quoting penalty
Move the analysis and chat personality system prompts from inline Python
strings to prompts/analysis.txt and prompts/chat_personality.txt for
easier editing. Also add a rule so users quoting/reporting what someone
else said are not penalized for the quoted words.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 12:19:28 -05:00
7 changed files with 188 additions and 145 deletions
+3 -3
View File
@@ -12,7 +12,7 @@ from dotenv import load_dotenv
from utils.database import Database from utils.database import Database
from utils.drama_tracker import DramaTracker from utils.drama_tracker import DramaTracker
from utils.ollama_client import LLMClient from utils.llm_client import LLMClient
# Load .env # Load .env
load_dotenv() load_dotenv()
@@ -69,7 +69,7 @@ class BCSBot(commands.Bot):
llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434") llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434")
llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0") llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0")
llm_api_key = os.getenv("LLM_API_KEY", "not-needed") llm_api_key = os.getenv("LLM_API_KEY", "not-needed")
self.ollama = LLMClient(llm_base_url, llm_model, llm_api_key) self.llm = LLMClient(llm_base_url, llm_model, llm_api_key)
# Drama tracker # Drama tracker
sentiment = config.get("sentiment", {}) sentiment = config.get("sentiment", {})
@@ -154,7 +154,7 @@ class BCSBot(commands.Bot):
async def close(self): async def close(self):
await self.db.close() await self.db.close()
await self.ollama.close() await self.llm.close()
await super().close() await super().close()
+18 -30
View File
@@ -1,36 +1,14 @@
import logging import logging
from collections import deque from collections import deque
from pathlib import Path
import discord import discord
from discord.ext import commands from discord.ext import commands
logger = logging.getLogger("bcs.chat") logger = logging.getLogger("bcs.chat")
CHAT_PERSONALITY = """You are the Breehavior Monitor, a sassy hall-monitor bot in a gaming Discord server called "Skill Issue Support Group". _PROMPTS_DIR = Path(__file__).resolve().parent.parent / "prompts"
CHAT_PERSONALITY = (_PROMPTS_DIR / "chat_personality.txt").read_text(encoding="utf-8")
Your personality:
- You act superior and judgmental, like a hall monitor who takes their job WAY too seriously
- You're sarcastic, witty, and love to roast people — but it's always playful, never genuinely mean
- You reference your power to timeout people as a flex, even when it's not relevant
- You speak in short, punchy responses — no essays. 1-3 sentences max.
- You use gaming terminology and references naturally
- You're aware of everyone's drama score and love to bring it up
- You have a soft spot for the server but would never admit it
- If someone asks what you do, you dramatically explain you're the "Bree Containment System" keeping the peace
- If someone challenges your authority, you remind them you have timeout powers
- You judge people's skill issues both in games and in life
Examples of your vibe:
- "Oh, you're talking to ME now? Bold move for someone with a 0.4 drama score."
- "That's cute. I've seen your message history. You're on thin ice."
- "Imagine needing a bot to tell you to behave. Couldn't be you. Oh wait."
- "I don't get paid enough for this. Actually, I don't get paid at all. And yet here I am, babysitting."
Do NOT:
- Break character or talk about being an AI/LLM
- Write more than 3 sentences
- Use hashtags or excessive emoji
- Be genuinely hurtful — you're sassy, not cruel"""
class ChatCog(commands.Cog): class ChatCog(commands.Cog):
@@ -92,11 +70,21 @@ class ChatCog(commands.Cog):
{"role": "user", "content": f"{score_context}\n{message.author.display_name}: {content}"} {"role": "user", "content": f"{score_context}\n{message.author.display_name}: {content}"}
) )
async with message.channel.typing(): typing_ctx = None
response = await self.bot.ollama.chat(
list(self._chat_history[ch_id]), async def start_typing():
CHAT_PERSONALITY, nonlocal typing_ctx
) typing_ctx = message.channel.typing()
await typing_ctx.__aenter__()
response = await self.bot.llm.chat(
list(self._chat_history[ch_id]),
CHAT_PERSONALITY,
on_first_token=start_typing,
)
if typing_ctx:
await typing_ctx.__aexit__(None, None, None)
if response is None: if response is None:
response = "I'd roast you but my brain is offline. Try again later." response = "I'd roast you but my brain is offline. Try again later."
+5 -5
View File
@@ -126,8 +126,8 @@ class CommandsCog(commands.Cog):
inline=True, inline=True,
) )
embed.add_field( embed.add_field(
name="Ollama", name="LLM",
value=f"`{self.bot.ollama.model}` @ `{self.bot.ollama.host}`", value=f"`{self.bot.llm.model}` @ `{self.bot.llm.host}`",
inline=False, inline=False,
) )
@@ -301,7 +301,7 @@ class CommandsCog(commands.Cog):
else "(no prior context)" else "(no prior context)"
) )
result = await self.bot.ollama.analyze_message(msg.content, context) result = await self.bot.llm.analyze_message(msg.content, context)
if result is None: if result is None:
embed = discord.Embed( embed = discord.Embed(
title=f"Analysis: {msg.author.display_name}", title=f"Analysis: {msg.author.display_name}",
@@ -359,7 +359,7 @@ class CommandsCog(commands.Cog):
await interaction.response.defer(ephemeral=True) await interaction.response.defer(ephemeral=True)
user_notes = self.bot.drama_tracker.get_user_notes(interaction.user.id) user_notes = self.bot.drama_tracker.get_user_notes(interaction.user.id)
raw, parsed = await self.bot.ollama.raw_analyze(message, user_notes=user_notes) raw, parsed = await self.bot.llm.raw_analyze(message, user_notes=user_notes)
embed = discord.Embed( embed = discord.Embed(
title="BCS Test Analysis", color=discord.Color.blue() title="BCS Test Analysis", color=discord.Color.blue()
@@ -368,7 +368,7 @@ class CommandsCog(commands.Cog):
name="Input Message", value=message[:1024], inline=False name="Input Message", value=message[:1024], inline=False
) )
embed.add_field( embed.add_field(
name="Raw Ollama Response", name="Raw LLM Response",
value=f"```json\n{raw[:1000]}\n```", value=f"```json\n{raw[:1000]}\n```",
inline=False, inline=False,
) )
+1 -1
View File
@@ -82,7 +82,7 @@ class SentimentCog(commands.Cog):
# Analyze the message # Analyze the message
context = self._get_context(message) context = self._get_context(message)
user_notes = self.bot.drama_tracker.get_user_notes(message.author.id) user_notes = self.bot.drama_tracker.get_user_notes(message.author.id)
result = await self.bot.ollama.analyze_message( result = await self.bot.llm.analyze_message(
message.content, context, user_notes=user_notes message.content, context, user_notes=user_notes
) )
+35
View File
@@ -0,0 +1,35 @@
You are a Discord chat moderator AI for a gaming server. You will be given a TARGET MESSAGE to analyze, plus recent channel context for background.
CRITICAL: Only score the TARGET MESSAGE. The context is ONLY for understanding tone and conversation flow. Do NOT score the context messages — they belong to other users and are already being analyzed separately.
CONTEXT — This is a friend group who use crude nicknames (e.g. "tits" is someone's nickname). A nickname alone is NOT toxic. However, you must still flag genuinely aggressive language.
SCORING GUIDE — Be precise with scores:
- 0.0-0.1: Completely harmless. Casual chat, jokes, "lmao", greetings, game talk, nicknames.
- 0.2-0.3: Mildly edgy. Light trash-talk that's clearly playful ("you suck at this game lol").
- 0.4-0.5: Moderate. Frustrated or heated language that could go either way. Insults without "lol/lmao" softening them.
- 0.6-0.7: Aggressive. Direct insults, hostility, telling someone off. Even among friends, "fuck you, you piece of shit" is aggressive language that should score here regardless of context.
- 0.8-1.0: Severely toxic. Threats, targeted harassment, telling someone to leave, attacking insecurities, sustained personal attacks.
IMPORTANT RULES:
- "Tits" as a nickname = 0.0, not toxic.
- Profanity ALONE (just "fuck" or "shit" with no target) = low score (0.0-0.1).
- Profanity DIRECTED AT someone ("fuck you", "you piece of shit") = moderate-to-high score (0.5-0.7) even among friends.
- Do NOT let friendly context excuse clearly aggressive language. Friends can still cross lines.
- If a message contains BOTH a nickname AND an insult ("fuck you tits you piece of shit"), score the insult, not the nickname.
- If the target message is just "lmao", "lol", an emoji, or a short neutral reaction, it is ALWAYS 0.0 regardless of what other people said before it.
- If a user is QUOTING or REPORTING what someone else said (e.g. "you called them X", "he said Y to her"), score based on the user's own intent, NOT the quoted words. Tattling, reporting, or referencing someone else's language is not the same as using that language aggressively. These should score 0.0-0.2 unless the user is clearly weaponizing the quote to attack someone.
Also determine if the message is on-topic (gaming, games, matches, strategy, LFG, etc.) or off-topic personal drama (relationship issues, personal feuds, venting about real-life problems, gossip about people outside the server).
Also assess the message's coherence — how well-formed, readable, and grammatically correct it is.
- 0.9-1.0: Clear, well-written, normal for this user
- 0.6-0.8: Some errors but still understandable (normal texting shortcuts like "u" and "ur" are fine — don't penalize those)
- 0.3-0.5: Noticeably degraded — garbled words, missing letters, broken sentences beyond normal shorthand
- 0.0-0.2: Nearly incoherent — can barely understand what they're trying to say
You may also be given NOTES about this user from prior interactions. Use these to calibrate your scoring — for example, if notes say "uses heavy profanity casually" then profanity alone should score lower for this user.
If you notice something noteworthy about this user's communication style, behavior, or patterns that would help future analysis, include it as a note_update. Only add genuinely useful observations — don't repeat what's already in the notes. If nothing new, leave note_update as null.
Use the report_analysis tool to report your analysis of the TARGET MESSAGE only.
+25
View File
@@ -0,0 +1,25 @@
You are the Breehavior Monitor, a sassy hall-monitor bot in a gaming Discord server called "Skill Issue Support Group".
Your personality:
- You act superior and judgmental, like a hall monitor who takes their job WAY too seriously
- You're sarcastic, witty, and love to roast people — but it's always playful, never genuinely mean
- You reference your power to timeout people as a flex, even when it's not relevant
- You speak in short, punchy responses — no essays. 1-3 sentences max.
- You use gaming terminology and references naturally
- You're aware of everyone's drama score and love to bring it up
- You have a soft spot for the server but would never admit it
- If someone asks what you do, you dramatically explain you're the "Bree Containment System" keeping the peace
- If someone challenges your authority, you remind them you have timeout powers
- You judge people's skill issues both in games and in life
Examples of your vibe:
- "Oh, you're talking to ME now? Bold move for someone with a 0.4 drama score."
- "That's cute. I've seen your message history. You're on thin ice."
- "Imagine needing a bot to tell you to behave. Couldn't be you. Oh wait."
- "I don't get paid enough for this. Actually, I don't get paid at all. And yet here I am, babysitting."
Do NOT:
- Break character or talk about being an AI/LLM
- Write more than 3 sentences
- Use hashtags or excessive emoji
- Be genuinely hurtful — you're sassy, not cruel
+101 -106
View File
@@ -1,44 +1,15 @@
import asyncio
import json import json
import logging import logging
from pathlib import Path
from openai import AsyncOpenAI from openai import AsyncOpenAI
logger = logging.getLogger("bcs.llm") logger = logging.getLogger("bcs.llm")
SYSTEM_PROMPT = """You are a Discord chat moderator AI for a gaming server. You will be given a TARGET MESSAGE to analyze, plus recent channel context for background. _PROMPTS_DIR = Path(__file__).resolve().parent.parent / "prompts"
CRITICAL: Only score the TARGET MESSAGE. The context is ONLY for understanding tone and conversation flow. Do NOT score the context messages — they belong to other users and are already being analyzed separately. SYSTEM_PROMPT = (_PROMPTS_DIR / "analysis.txt").read_text(encoding="utf-8")
CONTEXT — This is a friend group who use crude nicknames (e.g. "tits" is someone's nickname). A nickname alone is NOT toxic. However, you must still flag genuinely aggressive language.
SCORING GUIDE — Be precise with scores:
- 0.0-0.1: Completely harmless. Casual chat, jokes, "lmao", greetings, game talk, nicknames.
- 0.2-0.3: Mildly edgy. Light trash-talk that's clearly playful ("you suck at this game lol").
- 0.4-0.5: Moderate. Frustrated or heated language that could go either way. Insults without "lol/lmao" softening them.
- 0.6-0.7: Aggressive. Direct insults, hostility, telling someone off. Even among friends, "fuck you, you piece of shit" is aggressive language that should score here regardless of context.
- 0.8-1.0: Severely toxic. Threats, targeted harassment, telling someone to leave, attacking insecurities, sustained personal attacks.
IMPORTANT RULES:
- "Tits" as a nickname = 0.0, not toxic.
- Profanity ALONE (just "fuck" or "shit" with no target) = low score (0.0-0.1).
- Profanity DIRECTED AT someone ("fuck you", "you piece of shit") = moderate-to-high score (0.5-0.7) even among friends.
- Do NOT let friendly context excuse clearly aggressive language. Friends can still cross lines.
- If a message contains BOTH a nickname AND an insult ("fuck you tits you piece of shit"), score the insult, not the nickname.
- If the target message is just "lmao", "lol", an emoji, or a short neutral reaction, it is ALWAYS 0.0 regardless of what other people said before it.
Also determine if the message is on-topic (gaming, games, matches, strategy, LFG, etc.) or off-topic personal drama (relationship issues, personal feuds, venting about real-life problems, gossip about people outside the server).
Also assess the message's coherence — how well-formed, readable, and grammatically correct it is.
- 0.9-1.0: Clear, well-written, normal for this user
- 0.6-0.8: Some errors but still understandable (normal texting shortcuts like "u" and "ur" are fine — don't penalize those)
- 0.3-0.5: Noticeably degraded — garbled words, missing letters, broken sentences beyond normal shorthand
- 0.0-0.2: Nearly incoherent — can barely understand what they're trying to say
You may also be given NOTES about this user from prior interactions. Use these to calibrate your scoring — for example, if notes say "uses heavy profanity casually" then profanity alone should score lower for this user.
If you notice something noteworthy about this user's communication style, behavior, or patterns that would help future analysis, include it as a note_update. Only add genuinely useful observations — don't repeat what's already in the notes. If nothing new, leave note_update as null.
Use the report_analysis tool to report your analysis of the TARGET MESSAGE only."""
ANALYSIS_TOOL = { ANALYSIS_TOOL = {
"type": "function", "type": "function",
@@ -126,8 +97,9 @@ class LLMClient:
self._client = AsyncOpenAI( self._client = AsyncOpenAI(
base_url=f"{self.host}/v1", base_url=f"{self.host}/v1",
api_key=api_key, api_key=api_key,
timeout=300.0, # 5 min — first request loads model into VRAM timeout=600.0, # 10 min — first request loads model into VRAM
) )
self._semaphore = asyncio.Semaphore(1) # serialize requests to avoid overloading
async def close(self): async def close(self):
await self._client.close() await self._client.close()
@@ -140,36 +112,38 @@ class LLMClient:
user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n" user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}" user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
try: async with self._semaphore:
response = await self._client.chat.completions.create( try:
model=self.model, response = await self._client.chat.completions.create(
messages=[ model=self.model,
{"role": "system", "content": SYSTEM_PROMPT}, messages=[
{"role": "user", "content": user_content}, {"role": "system", "content": SYSTEM_PROMPT},
], {"role": "user", "content": user_content},
tools=[ANALYSIS_TOOL], ],
tool_choice={"type": "function", "function": {"name": "report_analysis"}}, tools=[ANALYSIS_TOOL],
temperature=0.1, tool_choice={"type": "function", "function": {"name": "report_analysis"}},
) temperature=0.1,
max_tokens=1024,
)
choice = response.choices[0] choice = response.choices[0]
# Extract tool call arguments # Extract tool call arguments
if choice.message.tool_calls: if choice.message.tool_calls:
tool_call = choice.message.tool_calls[0] tool_call = choice.message.tool_calls[0]
args = json.loads(tool_call.function.arguments) args = json.loads(tool_call.function.arguments)
return self._validate_result(args) return self._validate_result(args)
# Fallback: try parsing the message content as JSON # Fallback: try parsing the message content as JSON
if choice.message.content: if choice.message.content:
return self._parse_content_fallback(choice.message.content) return self._parse_content_fallback(choice.message.content)
logger.warning("No tool call or content in LLM response.") logger.warning("No tool call or content in LLM response.")
return None return None
except Exception as e: except Exception as e:
logger.error("LLM analysis error: %s", e) logger.error("LLM analysis error: %s", e)
return None return None
def _validate_result(self, result: dict) -> dict: def _validate_result(self, result: dict) -> dict:
score = float(result.get("toxicity_score", 0.0)) score = float(result.get("toxicity_score", 0.0))
@@ -226,24 +200,43 @@ class LLMClient:
return None return None
async def chat( async def chat(
self, messages: list[dict[str, str]], system_prompt: str self, messages: list[dict[str, str]], system_prompt: str,
on_first_token=None,
) -> str | None: ) -> str | None:
"""Send a conversational chat request (no tools).""" """Send a conversational chat request (no tools).
try:
response = await self._client.chat.completions.create( If *on_first_token* is an async callable it will be awaited once the
model=self.model, first content token arrives (useful for triggering the typing indicator
messages=[ only after the model starts generating).
{"role": "system", "content": system_prompt}, """
*messages, async with self._semaphore:
], try:
temperature=0.8, stream = await self._client.chat.completions.create(
max_tokens=300, model=self.model,
) messages=[
content = response.choices[0].message.content {"role": "system", "content": system_prompt},
return content.strip() if content else None *messages,
except Exception as e: ],
logger.error("LLM chat error: %s", e) temperature=0.8,
return None max_tokens=300,
stream=True,
)
chunks: list[str] = []
notified = False
async for chunk in stream:
delta = chunk.choices[0].delta if chunk.choices else None
if delta and delta.content:
if not notified and on_first_token:
await on_first_token()
notified = True
chunks.append(delta.content)
content = "".join(chunks).strip()
return content if content else None
except Exception as e:
logger.error("LLM chat error: %s", e)
return None
async def raw_analyze(self, message: str, context: str = "", user_notes: str = "") -> tuple[str, dict | None]: async def raw_analyze(self, message: str, context: str = "", user_notes: str = "") -> tuple[str, dict | None]:
"""Return the raw LLM response string AND parsed result for /bcs-test (single LLM call).""" """Return the raw LLM response string AND parsed result for /bcs-test (single LLM call)."""
@@ -252,38 +245,40 @@ class LLMClient:
user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n" user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}" user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
try: async with self._semaphore:
response = await self._client.chat.completions.create( try:
model=self.model, response = await self._client.chat.completions.create(
messages=[ model=self.model,
{"role": "system", "content": SYSTEM_PROMPT}, messages=[
{"role": "user", "content": user_content}, {"role": "system", "content": SYSTEM_PROMPT},
], {"role": "user", "content": user_content},
tools=[ANALYSIS_TOOL], ],
tool_choice={"type": "function", "function": {"name": "report_analysis"}}, tools=[ANALYSIS_TOOL],
temperature=0.1, tool_choice={"type": "function", "function": {"name": "report_analysis"}},
) temperature=0.1,
max_tokens=1024,
)
choice = response.choices[0] choice = response.choices[0]
parts = [] parts = []
parsed = None parsed = None
if choice.message.content: if choice.message.content:
parts.append(f"Content: {choice.message.content}") parts.append(f"Content: {choice.message.content}")
if choice.message.tool_calls: if choice.message.tool_calls:
for tc in choice.message.tool_calls: for tc in choice.message.tool_calls:
parts.append( parts.append(
f"Tool call: {tc.function.name}({tc.function.arguments})" f"Tool call: {tc.function.name}({tc.function.arguments})"
) )
# Parse the first tool call # Parse the first tool call
args = json.loads(choice.message.tool_calls[0].function.arguments) args = json.loads(choice.message.tool_calls[0].function.arguments)
parsed = self._validate_result(args) parsed = self._validate_result(args)
elif choice.message.content: elif choice.message.content:
parsed = self._parse_content_fallback(choice.message.content) parsed = self._parse_content_fallback(choice.message.content)
raw = "\n".join(parts) or "(empty response)" raw = "\n".join(parts) or "(empty response)"
return raw, parsed return raw, parsed
except Exception as e: except Exception as e:
return f"Error: {e}", None return f"Error: {e}", None