Add LLM request queue, streaming chat, and rename ollama_client to llm_client

- Serialize all LLM requests through an asyncio semaphore to prevent
  overloading athena with concurrent requests
- Switch chat() to streaming so the typing indicator only appears once
  the model starts generating (not during thinking/loading)
- Increase LLM timeout from 5 to 10 minutes for slow first loads
- Rename ollama_client.py to llm_client.py and self.ollama to self.llm
  since the bot uses a generic OpenAI-compatible API
- Update embed labels from "Ollama" to "LLM"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 13:45:12 -05:00
parent 645b924011
commit 1151b705c0
5 changed files with 120 additions and 87 deletions

@@ -70,11 +70,21 @@ class ChatCog(commands.Cog):
             {"role": "user", "content": f"{score_context}\n{message.author.display_name}: {content}"}
         )
-        async with message.channel.typing():
-            response = await self.bot.ollama.chat(
-                list(self._chat_history[ch_id]),
-                CHAT_PERSONALITY,
-            )
+        typing_ctx = None
+        async def start_typing():
+            nonlocal typing_ctx
+            typing_ctx = message.channel.typing()
+            await typing_ctx.__aenter__()
+        response = await self.bot.llm.chat(
+            list(self._chat_history[ch_id]),
+            CHAT_PERSONALITY,
+            on_first_token=start_typing,
+        )
+        if typing_ctx:
+            await typing_ctx.__aexit__(None, None, None)
         if response is None:
             response = "I'd roast you but my brain is offline. Try again later."
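The `on_first_token` hook passed in the diff above implies that `chat()` now consumes a stream and fires the callback when the first chunk arrives, so the typing indicator only appears once generation actually starts. A minimal sketch of that pattern, with `fake_stream` standing in for the real streaming response:

```python
import asyncio


async def chat(chunks, on_first_token=None):
    """Accumulate streamed chunks; fire on_first_token when generation starts."""
    parts = []
    async for chunk in chunks:
        if not parts and on_first_token is not None:
            # Only now does the caller start the typing indicator, so it
            # doesn't show while the model is still thinking or loading.
            await on_first_token()
        parts.append(chunk)
    return "".join(parts) if parts else None


async def fake_stream():
    # Stand-in for the streamed completion chunks.
    for piece in ("Hello", ", ", "world"):
        await asyncio.sleep(0)
        yield piece


async def main():
    fired = []

    async def start_typing():
        fired.append(True)

    text = await chat(fake_stream(), on_first_token=start_typing)
    return text, fired
```

Running `asyncio.run(main())` yields the joined text and a single callback firing; an empty stream returns `None`, matching the fallback branch in the diff.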