Add LLM request queue, streaming chat, and rename ollama_client to llm_client
- Serialize all LLM requests through an asyncio semaphore to prevent overloading athena with concurrent requests
- Switch chat() to streaming so the typing indicator only appears once the model starts generating (not during thinking/loading)
- Increase LLM timeout from 5 to 10 minutes for slow first loads
- Rename ollama_client.py to llm_client.py and self.ollama to self.llm since the bot uses a generic OpenAI-compatible API
- Update embed labels from "Ollama" to "LLM"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
6
bot.py
6
bot.py
@@ -12,7 +12,7 @@ from dotenv import load_dotenv
 from utils.database import Database
 from utils.drama_tracker import DramaTracker
-from utils.ollama_client import LLMClient
+from utils.llm_client import LLMClient

 # Load .env
 load_dotenv()
@@ -69,7 +69,7 @@ class BCSBot(commands.Bot):
         llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434")
         llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0")
         llm_api_key = os.getenv("LLM_API_KEY", "not-needed")
-        self.ollama = LLMClient(llm_base_url, llm_model, llm_api_key)
+        self.llm = LLMClient(llm_base_url, llm_model, llm_api_key)

         # Drama tracker
         sentiment = config.get("sentiment", {})
@@ -154,7 +154,7 @@ class BCSBot(commands.Bot):

     async def close(self):
         await self.db.close()
-        await self.ollama.close()
+        await self.llm.close()
         await super().close()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user