Add LLM request queue, streaming chat, and rename ollama_client to llm_client
- Serialize all LLM requests through an asyncio semaphore to prevent overloading the athena LLM host with concurrent requests
- Switch chat() to streaming so the typing indicator only appears once the model starts generating (not during thinking/loading)
- Increase the LLM timeout from 5 to 10 minutes for slow first loads
- Rename ollama_client.py to llm_client.py and self.ollama to self.llm, since the bot uses a generic OpenAI-compatible API
- Update embed labels from "Ollama" to "LLM"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -12,7 +12,7 @@ from dotenv import load_dotenv
|
|||||||
|
|
||||||
from utils.database import Database
|
from utils.database import Database
|
||||||
from utils.drama_tracker import DramaTracker
|
from utils.drama_tracker import DramaTracker
|
||||||
from utils.ollama_client import LLMClient
|
from utils.llm_client import LLMClient
|
||||||
|
|
||||||
# Load .env
|
# Load .env
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
@@ -69,7 +69,7 @@ class BCSBot(commands.Bot):
|
|||||||
llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434")
|
llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434")
|
||||||
llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0")
|
llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0")
|
||||||
llm_api_key = os.getenv("LLM_API_KEY", "not-needed")
|
llm_api_key = os.getenv("LLM_API_KEY", "not-needed")
|
||||||
self.ollama = LLMClient(llm_base_url, llm_model, llm_api_key)
|
self.llm = LLMClient(llm_base_url, llm_model, llm_api_key)
|
||||||
|
|
||||||
# Drama tracker
|
# Drama tracker
|
||||||
sentiment = config.get("sentiment", {})
|
sentiment = config.get("sentiment", {})
|
||||||
@@ -154,7 +154,7 @@ class BCSBot(commands.Bot):
|
|||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
await self.db.close()
|
await self.db.close()
|
||||||
await self.ollama.close()
|
await self.llm.close()
|
||||||
await super().close()
|
await super().close()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
+15
-5
@@ -70,11 +70,21 @@ class ChatCog(commands.Cog):
|
|||||||
{"role": "user", "content": f"{score_context}\n{message.author.display_name}: {content}"}
|
{"role": "user", "content": f"{score_context}\n{message.author.display_name}: {content}"}
|
||||||
)
|
)
|
||||||
|
|
||||||
async with message.channel.typing():
|
typing_ctx = None
|
||||||
response = await self.bot.ollama.chat(
|
|
||||||
list(self._chat_history[ch_id]),
|
async def start_typing():
|
||||||
CHAT_PERSONALITY,
|
nonlocal typing_ctx
|
||||||
)
|
typing_ctx = message.channel.typing()
|
||||||
|
await typing_ctx.__aenter__()
|
||||||
|
|
||||||
|
response = await self.bot.llm.chat(
|
||||||
|
list(self._chat_history[ch_id]),
|
||||||
|
CHAT_PERSONALITY,
|
||||||
|
on_first_token=start_typing,
|
||||||
|
)
|
||||||
|
|
||||||
|
if typing_ctx:
|
||||||
|
await typing_ctx.__aexit__(None, None, None)
|
||||||
|
|
||||||
if response is None:
|
if response is None:
|
||||||
response = "I'd roast you but my brain is offline. Try again later."
|
response = "I'd roast you but my brain is offline. Try again later."
|
||||||
|
|||||||
+5
-5
@@ -126,8 +126,8 @@ class CommandsCog(commands.Cog):
|
|||||||
inline=True,
|
inline=True,
|
||||||
)
|
)
|
||||||
embed.add_field(
|
embed.add_field(
|
||||||
name="Ollama",
|
name="LLM",
|
||||||
value=f"`{self.bot.ollama.model}` @ `{self.bot.ollama.host}`",
|
value=f"`{self.bot.llm.model}` @ `{self.bot.llm.host}`",
|
||||||
inline=False,
|
inline=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -301,7 +301,7 @@ class CommandsCog(commands.Cog):
|
|||||||
else "(no prior context)"
|
else "(no prior context)"
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await self.bot.ollama.analyze_message(msg.content, context)
|
result = await self.bot.llm.analyze_message(msg.content, context)
|
||||||
if result is None:
|
if result is None:
|
||||||
embed = discord.Embed(
|
embed = discord.Embed(
|
||||||
title=f"Analysis: {msg.author.display_name}",
|
title=f"Analysis: {msg.author.display_name}",
|
||||||
@@ -359,7 +359,7 @@ class CommandsCog(commands.Cog):
|
|||||||
await interaction.response.defer(ephemeral=True)
|
await interaction.response.defer(ephemeral=True)
|
||||||
|
|
||||||
user_notes = self.bot.drama_tracker.get_user_notes(interaction.user.id)
|
user_notes = self.bot.drama_tracker.get_user_notes(interaction.user.id)
|
||||||
raw, parsed = await self.bot.ollama.raw_analyze(message, user_notes=user_notes)
|
raw, parsed = await self.bot.llm.raw_analyze(message, user_notes=user_notes)
|
||||||
|
|
||||||
embed = discord.Embed(
|
embed = discord.Embed(
|
||||||
title="BCS Test Analysis", color=discord.Color.blue()
|
title="BCS Test Analysis", color=discord.Color.blue()
|
||||||
@@ -368,7 +368,7 @@ class CommandsCog(commands.Cog):
|
|||||||
name="Input Message", value=message[:1024], inline=False
|
name="Input Message", value=message[:1024], inline=False
|
||||||
)
|
)
|
||||||
embed.add_field(
|
embed.add_field(
|
||||||
name="Raw Ollama Response",
|
name="Raw LLM Response",
|
||||||
value=f"```json\n{raw[:1000]}\n```",
|
value=f"```json\n{raw[:1000]}\n```",
|
||||||
inline=False,
|
inline=False,
|
||||||
)
|
)
|
||||||
|
|||||||
+1
-1
@@ -82,7 +82,7 @@ class SentimentCog(commands.Cog):
|
|||||||
# Analyze the message
|
# Analyze the message
|
||||||
context = self._get_context(message)
|
context = self._get_context(message)
|
||||||
user_notes = self.bot.drama_tracker.get_user_notes(message.author.id)
|
user_notes = self.bot.drama_tracker.get_user_notes(message.author.id)
|
||||||
result = await self.bot.ollama.analyze_message(
|
result = await self.bot.llm.analyze_message(
|
||||||
message.content, context, user_notes=user_notes
|
message.content, context, user_notes=user_notes
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -96,8 +97,9 @@ class LLMClient:
|
|||||||
self._client = AsyncOpenAI(
|
self._client = AsyncOpenAI(
|
||||||
base_url=f"{self.host}/v1",
|
base_url=f"{self.host}/v1",
|
||||||
api_key=api_key,
|
api_key=api_key,
|
||||||
timeout=300.0, # 5 min — first request loads model into VRAM
|
timeout=600.0, # 10 min — first request loads model into VRAM
|
||||||
)
|
)
|
||||||
|
self._semaphore = asyncio.Semaphore(1) # serialize requests to avoid overloading
|
||||||
|
|
||||||
async def close(self):
|
async def close(self):
|
||||||
await self._client.close()
|
await self._client.close()
|
||||||
@@ -110,36 +112,37 @@ class LLMClient:
|
|||||||
user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
|
user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
|
||||||
user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
|
user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
|
||||||
|
|
||||||
try:
|
async with self._semaphore:
|
||||||
response = await self._client.chat.completions.create(
|
try:
|
||||||
model=self.model,
|
response = await self._client.chat.completions.create(
|
||||||
messages=[
|
model=self.model,
|
||||||
{"role": "system", "content": SYSTEM_PROMPT},
|
messages=[
|
||||||
{"role": "user", "content": user_content},
|
{"role": "system", "content": SYSTEM_PROMPT},
|
||||||
],
|
{"role": "user", "content": user_content},
|
||||||
tools=[ANALYSIS_TOOL],
|
],
|
||||||
tool_choice={"type": "function", "function": {"name": "report_analysis"}},
|
tools=[ANALYSIS_TOOL],
|
||||||
temperature=0.1,
|
tool_choice={"type": "function", "function": {"name": "report_analysis"}},
|
||||||
)
|
temperature=0.1,
|
||||||
|
)
|
||||||
|
|
||||||
choice = response.choices[0]
|
choice = response.choices[0]
|
||||||
|
|
||||||
# Extract tool call arguments
|
# Extract tool call arguments
|
||||||
if choice.message.tool_calls:
|
if choice.message.tool_calls:
|
||||||
tool_call = choice.message.tool_calls[0]
|
tool_call = choice.message.tool_calls[0]
|
||||||
args = json.loads(tool_call.function.arguments)
|
args = json.loads(tool_call.function.arguments)
|
||||||
return self._validate_result(args)
|
return self._validate_result(args)
|
||||||
|
|
||||||
# Fallback: try parsing the message content as JSON
|
# Fallback: try parsing the message content as JSON
|
||||||
if choice.message.content:
|
if choice.message.content:
|
||||||
return self._parse_content_fallback(choice.message.content)
|
return self._parse_content_fallback(choice.message.content)
|
||||||
|
|
||||||
logger.warning("No tool call or content in LLM response.")
|
logger.warning("No tool call or content in LLM response.")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error("LLM analysis error: %s", e)
|
logger.error("LLM analysis error: %s", e)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _validate_result(self, result: dict) -> dict:
|
def _validate_result(self, result: dict) -> dict:
|
||||||
score = float(result.get("toxicity_score", 0.0))
|
score = float(result.get("toxicity_score", 0.0))
|
||||||
@@ -196,24 +199,43 @@ class LLMClient:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
async def chat(
|
async def chat(
|
||||||
self, messages: list[dict[str, str]], system_prompt: str
|
self, messages: list[dict[str, str]], system_prompt: str,
|
||||||
|
on_first_token=None,
|
||||||
) -> str | None:
|
) -> str | None:
|
||||||
"""Send a conversational chat request (no tools)."""
|
"""Send a conversational chat request (no tools).
|
||||||
try:
|
|
||||||
response = await self._client.chat.completions.create(
|
If *on_first_token* is an async callable it will be awaited once the
|
||||||
model=self.model,
|
first content token arrives (useful for triggering the typing indicator
|
||||||
messages=[
|
only after the model starts generating).
|
||||||
{"role": "system", "content": system_prompt},
|
"""
|
||||||
*messages,
|
async with self._semaphore:
|
||||||
],
|
try:
|
||||||
temperature=0.8,
|
stream = await self._client.chat.completions.create(
|
||||||
max_tokens=300,
|
model=self.model,
|
||||||
)
|
messages=[
|
||||||
content = response.choices[0].message.content
|
{"role": "system", "content": system_prompt},
|
||||||
return content.strip() if content else None
|
*messages,
|
||||||
except Exception as e:
|
],
|
||||||
logger.error("LLM chat error: %s", e)
|
temperature=0.8,
|
||||||
return None
|
max_tokens=300,
|
||||||
|
stream=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
chunks: list[str] = []
|
||||||
|
notified = False
|
||||||
|
async for chunk in stream:
|
||||||
|
delta = chunk.choices[0].delta if chunk.choices else None
|
||||||
|
if delta and delta.content:
|
||||||
|
if not notified and on_first_token:
|
||||||
|
await on_first_token()
|
||||||
|
notified = True
|
||||||
|
chunks.append(delta.content)
|
||||||
|
|
||||||
|
content = "".join(chunks).strip()
|
||||||
|
return content if content else None
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("LLM chat error: %s", e)
|
||||||
|
return None
|
||||||
|
|
||||||
async def raw_analyze(self, message: str, context: str = "", user_notes: str = "") -> tuple[str, dict | None]:
|
async def raw_analyze(self, message: str, context: str = "", user_notes: str = "") -> tuple[str, dict | None]:
|
||||||
"""Return the raw LLM response string AND parsed result for /bcs-test (single LLM call)."""
|
"""Return the raw LLM response string AND parsed result for /bcs-test (single LLM call)."""
|
||||||
@@ -222,38 +244,39 @@ class LLMClient:
|
|||||||
user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
|
user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
|
||||||
user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
|
user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
|
||||||
|
|
||||||
try:
|
async with self._semaphore:
|
||||||
response = await self._client.chat.completions.create(
|
try:
|
||||||
model=self.model,
|
response = await self._client.chat.completions.create(
|
||||||
messages=[
|
model=self.model,
|
||||||
{"role": "system", "content": SYSTEM_PROMPT},
|
messages=[
|
||||||
{"role": "user", "content": user_content},
|
{"role": "system", "content": SYSTEM_PROMPT},
|
||||||
],
|
{"role": "user", "content": user_content},
|
||||||
tools=[ANALYSIS_TOOL],
|
],
|
||||||
tool_choice={"type": "function", "function": {"name": "report_analysis"}},
|
tools=[ANALYSIS_TOOL],
|
||||||
temperature=0.1,
|
tool_choice={"type": "function", "function": {"name": "report_analysis"}},
|
||||||
)
|
temperature=0.1,
|
||||||
|
)
|
||||||
|
|
||||||
choice = response.choices[0]
|
choice = response.choices[0]
|
||||||
parts = []
|
parts = []
|
||||||
parsed = None
|
parsed = None
|
||||||
|
|
||||||
if choice.message.content:
|
if choice.message.content:
|
||||||
parts.append(f"Content: {choice.message.content}")
|
parts.append(f"Content: {choice.message.content}")
|
||||||
|
|
||||||
if choice.message.tool_calls:
|
if choice.message.tool_calls:
|
||||||
for tc in choice.message.tool_calls:
|
for tc in choice.message.tool_calls:
|
||||||
parts.append(
|
parts.append(
|
||||||
f"Tool call: {tc.function.name}({tc.function.arguments})"
|
f"Tool call: {tc.function.name}({tc.function.arguments})"
|
||||||
)
|
)
|
||||||
# Parse the first tool call
|
# Parse the first tool call
|
||||||
args = json.loads(choice.message.tool_calls[0].function.arguments)
|
args = json.loads(choice.message.tool_calls[0].function.arguments)
|
||||||
parsed = self._validate_result(args)
|
parsed = self._validate_result(args)
|
||||||
elif choice.message.content:
|
elif choice.message.content:
|
||||||
parsed = self._parse_content_fallback(choice.message.content)
|
parsed = self._parse_content_fallback(choice.message.content)
|
||||||
|
|
||||||
raw = "\n".join(parts) or "(empty response)"
|
raw = "\n".join(parts) or "(empty response)"
|
||||||
return raw, parsed
|
return raw, parsed
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return f"Error: {e}", None
|
return f"Error: {e}", None
|
||||||
Reference in New Issue
Block a user