Add LLM request queue, streaming chat, and rename ollama_client to llm_client

- Serialize all LLM requests through an asyncio semaphore to prevent
  overloading the LLM server (athena) with concurrent requests
- Switch chat() to streaming so the typing indicator appears only once
  the model emits its first content token (not while it is thinking or
  loading)
- Increase LLM timeout from 5 to 10 minutes for slow first loads
- Rename ollama_client.py to llm_client.py and self.ollama to self.llm
  since the bot uses a generic OpenAI-compatible API
- Update embed labels from "Ollama" to "LLM"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-21 13:45:12 -05:00
parent 645b924011
commit 1151b705c0
5 changed files with 120 additions and 87 deletions
+3 -3
View File
@@ -12,7 +12,7 @@ from dotenv import load_dotenv
from utils.database import Database from utils.database import Database
from utils.drama_tracker import DramaTracker from utils.drama_tracker import DramaTracker
from utils.ollama_client import LLMClient from utils.llm_client import LLMClient
# Load .env # Load .env
load_dotenv() load_dotenv()
@@ -69,7 +69,7 @@ class BCSBot(commands.Bot):
llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434") llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434")
llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0") llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0")
llm_api_key = os.getenv("LLM_API_KEY", "not-needed") llm_api_key = os.getenv("LLM_API_KEY", "not-needed")
self.ollama = LLMClient(llm_base_url, llm_model, llm_api_key) self.llm = LLMClient(llm_base_url, llm_model, llm_api_key)
# Drama tracker # Drama tracker
sentiment = config.get("sentiment", {}) sentiment = config.get("sentiment", {})
@@ -154,7 +154,7 @@ class BCSBot(commands.Bot):
async def close(self): async def close(self):
await self.db.close() await self.db.close()
await self.ollama.close() await self.llm.close()
await super().close() await super().close()
+15 -5
View File
@@ -70,11 +70,21 @@ class ChatCog(commands.Cog):
{"role": "user", "content": f"{score_context}\n{message.author.display_name}: {content}"} {"role": "user", "content": f"{score_context}\n{message.author.display_name}: {content}"}
) )
async with message.channel.typing(): typing_ctx = None
response = await self.bot.ollama.chat(
list(self._chat_history[ch_id]), async def start_typing():
CHAT_PERSONALITY, nonlocal typing_ctx
) typing_ctx = message.channel.typing()
await typing_ctx.__aenter__()
response = await self.bot.llm.chat(
list(self._chat_history[ch_id]),
CHAT_PERSONALITY,
on_first_token=start_typing,
)
if typing_ctx:
await typing_ctx.__aexit__(None, None, None)
if response is None: if response is None:
response = "I'd roast you but my brain is offline. Try again later." response = "I'd roast you but my brain is offline. Try again later."
+5 -5
View File
@@ -126,8 +126,8 @@ class CommandsCog(commands.Cog):
inline=True, inline=True,
) )
embed.add_field( embed.add_field(
name="Ollama", name="LLM",
value=f"`{self.bot.ollama.model}` @ `{self.bot.ollama.host}`", value=f"`{self.bot.llm.model}` @ `{self.bot.llm.host}`",
inline=False, inline=False,
) )
@@ -301,7 +301,7 @@ class CommandsCog(commands.Cog):
else "(no prior context)" else "(no prior context)"
) )
result = await self.bot.ollama.analyze_message(msg.content, context) result = await self.bot.llm.analyze_message(msg.content, context)
if result is None: if result is None:
embed = discord.Embed( embed = discord.Embed(
title=f"Analysis: {msg.author.display_name}", title=f"Analysis: {msg.author.display_name}",
@@ -359,7 +359,7 @@ class CommandsCog(commands.Cog):
await interaction.response.defer(ephemeral=True) await interaction.response.defer(ephemeral=True)
user_notes = self.bot.drama_tracker.get_user_notes(interaction.user.id) user_notes = self.bot.drama_tracker.get_user_notes(interaction.user.id)
raw, parsed = await self.bot.ollama.raw_analyze(message, user_notes=user_notes) raw, parsed = await self.bot.llm.raw_analyze(message, user_notes=user_notes)
embed = discord.Embed( embed = discord.Embed(
title="BCS Test Analysis", color=discord.Color.blue() title="BCS Test Analysis", color=discord.Color.blue()
@@ -368,7 +368,7 @@ class CommandsCog(commands.Cog):
name="Input Message", value=message[:1024], inline=False name="Input Message", value=message[:1024], inline=False
) )
embed.add_field( embed.add_field(
name="Raw Ollama Response", name="Raw LLM Response",
value=f"```json\n{raw[:1000]}\n```", value=f"```json\n{raw[:1000]}\n```",
inline=False, inline=False,
) )
+1 -1
View File
@@ -82,7 +82,7 @@ class SentimentCog(commands.Cog):
# Analyze the message # Analyze the message
context = self._get_context(message) context = self._get_context(message)
user_notes = self.bot.drama_tracker.get_user_notes(message.author.id) user_notes = self.bot.drama_tracker.get_user_notes(message.author.id)
result = await self.bot.ollama.analyze_message( result = await self.bot.llm.analyze_message(
message.content, context, user_notes=user_notes message.content, context, user_notes=user_notes
) )
+96 -73
View File
@@ -1,3 +1,4 @@
import asyncio
import json import json
import logging import logging
from pathlib import Path from pathlib import Path
@@ -96,8 +97,9 @@ class LLMClient:
self._client = AsyncOpenAI( self._client = AsyncOpenAI(
base_url=f"{self.host}/v1", base_url=f"{self.host}/v1",
api_key=api_key, api_key=api_key,
timeout=300.0, # 5 min — first request loads model into VRAM timeout=600.0, # 10 min — first request loads model into VRAM
) )
self._semaphore = asyncio.Semaphore(1) # serialize requests to avoid overloading
async def close(self): async def close(self):
await self._client.close() await self._client.close()
@@ -110,36 +112,37 @@ class LLMClient:
user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n" user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}" user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
try: async with self._semaphore:
response = await self._client.chat.completions.create( try:
model=self.model, response = await self._client.chat.completions.create(
messages=[ model=self.model,
{"role": "system", "content": SYSTEM_PROMPT}, messages=[
{"role": "user", "content": user_content}, {"role": "system", "content": SYSTEM_PROMPT},
], {"role": "user", "content": user_content},
tools=[ANALYSIS_TOOL], ],
tool_choice={"type": "function", "function": {"name": "report_analysis"}}, tools=[ANALYSIS_TOOL],
temperature=0.1, tool_choice={"type": "function", "function": {"name": "report_analysis"}},
) temperature=0.1,
)
choice = response.choices[0] choice = response.choices[0]
# Extract tool call arguments # Extract tool call arguments
if choice.message.tool_calls: if choice.message.tool_calls:
tool_call = choice.message.tool_calls[0] tool_call = choice.message.tool_calls[0]
args = json.loads(tool_call.function.arguments) args = json.loads(tool_call.function.arguments)
return self._validate_result(args) return self._validate_result(args)
# Fallback: try parsing the message content as JSON # Fallback: try parsing the message content as JSON
if choice.message.content: if choice.message.content:
return self._parse_content_fallback(choice.message.content) return self._parse_content_fallback(choice.message.content)
logger.warning("No tool call or content in LLM response.") logger.warning("No tool call or content in LLM response.")
return None return None
except Exception as e: except Exception as e:
logger.error("LLM analysis error: %s", e) logger.error("LLM analysis error: %s", e)
return None return None
def _validate_result(self, result: dict) -> dict: def _validate_result(self, result: dict) -> dict:
score = float(result.get("toxicity_score", 0.0)) score = float(result.get("toxicity_score", 0.0))
@@ -196,24 +199,43 @@ class LLMClient:
return None return None
async def chat( async def chat(
self, messages: list[dict[str, str]], system_prompt: str self, messages: list[dict[str, str]], system_prompt: str,
on_first_token=None,
) -> str | None: ) -> str | None:
"""Send a conversational chat request (no tools).""" """Send a conversational chat request (no tools).
try:
response = await self._client.chat.completions.create( If *on_first_token* is an async callable it will be awaited once the
model=self.model, first content token arrives (useful for triggering the typing indicator
messages=[ only after the model starts generating).
{"role": "system", "content": system_prompt}, """
*messages, async with self._semaphore:
], try:
temperature=0.8, stream = await self._client.chat.completions.create(
max_tokens=300, model=self.model,
) messages=[
content = response.choices[0].message.content {"role": "system", "content": system_prompt},
return content.strip() if content else None *messages,
except Exception as e: ],
logger.error("LLM chat error: %s", e) temperature=0.8,
return None max_tokens=300,
stream=True,
)
chunks: list[str] = []
notified = False
async for chunk in stream:
delta = chunk.choices[0].delta if chunk.choices else None
if delta and delta.content:
if not notified and on_first_token:
await on_first_token()
notified = True
chunks.append(delta.content)
content = "".join(chunks).strip()
return content if content else None
except Exception as e:
logger.error("LLM chat error: %s", e)
return None
async def raw_analyze(self, message: str, context: str = "", user_notes: str = "") -> tuple[str, dict | None]: async def raw_analyze(self, message: str, context: str = "", user_notes: str = "") -> tuple[str, dict | None]:
"""Return the raw LLM response string AND parsed result for /bcs-test (single LLM call).""" """Return the raw LLM response string AND parsed result for /bcs-test (single LLM call)."""
@@ -222,38 +244,39 @@ class LLMClient:
user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n" user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}" user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
try: async with self._semaphore:
response = await self._client.chat.completions.create( try:
model=self.model, response = await self._client.chat.completions.create(
messages=[ model=self.model,
{"role": "system", "content": SYSTEM_PROMPT}, messages=[
{"role": "user", "content": user_content}, {"role": "system", "content": SYSTEM_PROMPT},
], {"role": "user", "content": user_content},
tools=[ANALYSIS_TOOL], ],
tool_choice={"type": "function", "function": {"name": "report_analysis"}}, tools=[ANALYSIS_TOOL],
temperature=0.1, tool_choice={"type": "function", "function": {"name": "report_analysis"}},
) temperature=0.1,
)
choice = response.choices[0] choice = response.choices[0]
parts = [] parts = []
parsed = None parsed = None
if choice.message.content: if choice.message.content:
parts.append(f"Content: {choice.message.content}") parts.append(f"Content: {choice.message.content}")
if choice.message.tool_calls: if choice.message.tool_calls:
for tc in choice.message.tool_calls: for tc in choice.message.tool_calls:
parts.append( parts.append(
f"Tool call: {tc.function.name}({tc.function.arguments})" f"Tool call: {tc.function.name}({tc.function.arguments})"
) )
# Parse the first tool call # Parse the first tool call
args = json.loads(choice.message.tool_calls[0].function.arguments) args = json.loads(choice.message.tool_calls[0].function.arguments)
parsed = self._validate_result(args) parsed = self._validate_result(args)
elif choice.message.content: elif choice.message.content:
parsed = self._parse_content_fallback(choice.message.content) parsed = self._parse_content_fallback(choice.message.content)
raw = "\n".join(parts) or "(empty response)" raw = "\n".join(parts) or "(empty response)"
return raw, parsed return raw, parsed
except Exception as e: except Exception as e:
return f"Error: {e}", None return f"Error: {e}", None