Add LLM request queue, streaming chat, and rename ollama_client to llm_client
- Serialize all LLM requests through an asyncio semaphore to prevent overloading athena with concurrent requests
- Switch chat() to streaming so the typing indicator only appears once the model starts generating (not during thinking/loading)
- Increase LLM timeout from 5 to 10 minutes for slow first loads
- Rename ollama_client.py to llm_client.py and self.ollama to self.llm since the bot uses a generic OpenAI-compatible API
- Update embed labels from "Ollama" to "LLM"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
20
cogs/chat.py
20
cogs/chat.py
@@ -70,11 +70,21 @@ class ChatCog(commands.Cog):
|
||||
{"role": "user", "content": f"{score_context}\n{message.author.display_name}: {content}"}
|
||||
)
|
||||
|
||||
async with message.channel.typing():
|
||||
response = await self.bot.ollama.chat(
|
||||
list(self._chat_history[ch_id]),
|
||||
CHAT_PERSONALITY,
|
||||
)
|
||||
typing_ctx = None
|
||||
|
||||
async def start_typing():
|
||||
nonlocal typing_ctx
|
||||
typing_ctx = message.channel.typing()
|
||||
await typing_ctx.__aenter__()
|
||||
|
||||
response = await self.bot.llm.chat(
|
||||
list(self._chat_history[ch_id]),
|
||||
CHAT_PERSONALITY,
|
||||
on_first_token=start_typing,
|
||||
)
|
||||
|
||||
if typing_ctx:
|
||||
await typing_ctx.__aexit__(None, None, None)
|
||||
|
||||
if response is None:
|
||||
response = "I'd roast you but my brain is offline. Try again later."
|
||||
|
||||
Reference in New Issue
Block a user