Switch LLM backend from llama.cpp/Qwen to OpenAI
- Default models: gpt-4o-mini (triage), gpt-4o (escalation)
- Remove Qwen-specific /no_think hacks
- Reduce timeout from 600s to 120s, increase concurrency semaphore to 4
- Support empty LLM_BASE_URL to use OpenAI directly

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
19
bot.py
19
bot.py
@@ -68,14 +68,14 @@ class BCSBot(commands.Bot):
|
||||
# Database (initialized async in setup_hook)
|
||||
self.db = Database()
|
||||
|
||||
# LLM clients (OpenAI-compatible — works with llama.cpp, Ollama, or OpenAI)
|
||||
llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434")
|
||||
llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0")
|
||||
llm_api_key = os.getenv("LLM_API_KEY", "not-needed")
|
||||
# LLM clients (OpenAI — set LLM_BASE_URL to override for local models)
|
||||
llm_base_url = os.getenv("LLM_BASE_URL", "")
|
||||
llm_model = os.getenv("LLM_MODEL", "gpt-4o-mini")
|
||||
llm_api_key = os.getenv("LLM_API_KEY", "")
|
||||
self.llm = LLMClient(llm_base_url, llm_model, llm_api_key, db=self.db)
|
||||
|
||||
# Heavy/escalation model for re-analysis, chat, and manual commands
|
||||
llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", llm_model)
|
||||
llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", "gpt-4o")
|
||||
self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key, db=self.db)
|
||||
|
||||
# Active mode (server-wide)
|
||||
@@ -118,17 +118,16 @@ class BCSBot(commands.Bot):
|
||||
await self.tree.sync()
|
||||
logger.info("Slash commands synced.")
|
||||
|
||||
# Warm up the LLM so the model is loaded into VRAM before messages arrive
|
||||
logger.info("Warming up LLM model...")
|
||||
# Quick connectivity check
|
||||
try:
|
||||
resp = await self.llm._client.chat.completions.create(
|
||||
await self.llm._client.chat.completions.create(
|
||||
model=self.llm.model,
|
||||
messages=[{"role": "user", "content": "hi"}],
|
||||
max_tokens=1,
|
||||
)
|
||||
logger.info("LLM warm-up complete.")
|
||||
logger.info("LLM connectivity check passed.")
|
||||
except Exception as e:
|
||||
logger.warning("LLM warm-up failed — first messages may be slow: %s", e)
|
||||
logger.warning("LLM connectivity check failed: %s", e)
|
||||
|
||||
async def on_message(self, message: discord.Message):
|
||||
logger.info(
|
||||
|
||||
Reference in New Issue
Block a user