Add LLM request queue and streaming chat; rename ollama_client to llm_client

- Serialize all LLM requests through an asyncio semaphore to prevent
  overloading athena with concurrent requests
- Switch chat() to streaming so the typing indicator appears only after
  the model starts generating (not during thinking/loading)
- Increase LLM timeout from 5 to 10 minutes for slow first loads
- Rename ollama_client.py to llm_client.py and self.ollama to self.llm
  since the bot uses a generic OpenAI-compatible API
- Update embed labels from "Ollama" to "LLM"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-21 13:45:12 -05:00
parent 645b924011
commit 1151b705c0
5 changed files with 120 additions and 87 deletions

View File

@@ -126,8 +126,8 @@ class CommandsCog(commands.Cog):
inline=True,
)
embed.add_field(
name="Ollama",
value=f"`{self.bot.ollama.model}` @ `{self.bot.ollama.host}`",
name="LLM",
value=f"`{self.bot.llm.model}` @ `{self.bot.llm.host}`",
inline=False,
)
@@ -301,7 +301,7 @@ class CommandsCog(commands.Cog):
else "(no prior context)"
)
result = await self.bot.ollama.analyze_message(msg.content, context)
result = await self.bot.llm.analyze_message(msg.content, context)
if result is None:
embed = discord.Embed(
title=f"Analysis: {msg.author.display_name}",
@@ -359,7 +359,7 @@ class CommandsCog(commands.Cog):
await interaction.response.defer(ephemeral=True)
user_notes = self.bot.drama_tracker.get_user_notes(interaction.user.id)
raw, parsed = await self.bot.ollama.raw_analyze(message, user_notes=user_notes)
raw, parsed = await self.bot.llm.raw_analyze(message, user_notes=user_notes)
embed = discord.Embed(
title="BCS Test Analysis", color=discord.Color.blue()
@@ -368,7 +368,7 @@ class CommandsCog(commands.Cog):
name="Input Message", value=message[:1024], inline=False
)
embed.add_field(
name="Raw Ollama Response",
name="Raw LLM Response",
value=f"```json\n{raw[:1000]}\n```",
inline=False,
)