Switch LLM backend from llama.cpp/Qwen to OpenAI

- Default models: gpt-4o-mini (triage), gpt-4o (escalation)
- Remove Qwen-specific /no_think hacks
- Reduce request timeout from 600s to 120s; raise the concurrency semaphore limit from 1 to 4
- Treat an empty LLM_BASE_URL as "use the default OpenAI endpoint" (base_url is only overridden when it is set)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-23 12:07:53 -05:00
parent a9bc24e48e
commit 28fb66d5f9
3 changed files with 24 additions and 31 deletions

View File

@@ -1,6 +1,7 @@
DISCORD_BOT_TOKEN=your_token_here DISCORD_BOT_TOKEN=your_token_here
LLM_BASE_URL=http://athena.lan:11434 LLM_BASE_URL=
LLM_MODEL=Qwen3-VL-32B-Thinking-Q8_0 LLM_MODEL=gpt-4o-mini
LLM_API_KEY=not-needed LLM_ESCALATION_MODEL=gpt-4o
LLM_API_KEY=your_openai_api_key_here
MSSQL_SA_PASSWORD=YourStrong!Passw0rd MSSQL_SA_PASSWORD=YourStrong!Passw0rd
DB_CONNECTION_STRING=DRIVER={ODBC Driver 18 for SQL Server};SERVER=localhost,1433;DATABASE=BreehaviorMonitor;UID=sa;PWD=YourStrong!Passw0rd;TrustServerCertificate=yes DB_CONNECTION_STRING=DRIVER={ODBC Driver 18 for SQL Server};SERVER=localhost,1433;DATABASE=BreehaviorMonitor;UID=sa;PWD=YourStrong!Passw0rd;TrustServerCertificate=yes

19
bot.py
View File

@@ -68,14 +68,14 @@ class BCSBot(commands.Bot):
# Database (initialized async in setup_hook) # Database (initialized async in setup_hook)
self.db = Database() self.db = Database()
# LLM clients (OpenAI-compatible — works with llama.cpp, Ollama, or OpenAI) # LLM clients (OpenAI — set LLM_BASE_URL to override for local models)
llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434") llm_base_url = os.getenv("LLM_BASE_URL", "")
llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0") llm_model = os.getenv("LLM_MODEL", "gpt-4o-mini")
llm_api_key = os.getenv("LLM_API_KEY", "not-needed") llm_api_key = os.getenv("LLM_API_KEY", "")
self.llm = LLMClient(llm_base_url, llm_model, llm_api_key, db=self.db) self.llm = LLMClient(llm_base_url, llm_model, llm_api_key, db=self.db)
# Heavy/escalation model for re-analysis, chat, and manual commands # Heavy/escalation model for re-analysis, chat, and manual commands
llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", llm_model) llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", "gpt-4o")
self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key, db=self.db) self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key, db=self.db)
# Active mode (server-wide) # Active mode (server-wide)
@@ -118,17 +118,16 @@ class BCSBot(commands.Bot):
await self.tree.sync() await self.tree.sync()
logger.info("Slash commands synced.") logger.info("Slash commands synced.")
# Warm up the LLM so the model is loaded into VRAM before messages arrive # Quick connectivity check
logger.info("Warming up LLM model...")
try: try:
resp = await self.llm._client.chat.completions.create( await self.llm._client.chat.completions.create(
model=self.llm.model, model=self.llm.model,
messages=[{"role": "user", "content": "hi"}], messages=[{"role": "user", "content": "hi"}],
max_tokens=1, max_tokens=1,
) )
logger.info("LLM warm-up complete.") logger.info("LLM connectivity check passed.")
except Exception as e: except Exception as e:
logger.warning("LLM warm-up failed — first messages may be slow: %s", e) logger.warning("LLM connectivity check failed: %s", e)
async def on_message(self, message: discord.Message): async def on_message(self, message: discord.Message):
logger.info( logger.info(

View File

@@ -132,12 +132,11 @@ class LLMClient:
self.model = model self.model = model
self.host = base_url.rstrip("/") self.host = base_url.rstrip("/")
self._db = db self._db = db
self._client = AsyncOpenAI( client_kwargs = {"api_key": api_key, "timeout": 120.0}
base_url=f"{self.host}/v1", if self.host:
api_key=api_key, client_kwargs["base_url"] = f"{self.host}/v1"
timeout=600.0, # 10 min — first request loads model into VRAM self._client = AsyncOpenAI(**client_kwargs)
) self._semaphore = asyncio.Semaphore(4)
self._semaphore = asyncio.Semaphore(1) # serialize requests to avoid overloading
def _log_llm(self, request_type: str, duration_ms: int, success: bool, def _log_llm(self, request_type: str, duration_ms: int, success: bool,
request: str, response: str | None = None, error: str | None = None, request: str, response: str | None = None, error: str | None = None,
@@ -169,7 +168,7 @@ class LLMClient:
user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n" user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
if channel_context: if channel_context:
user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n" user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n/no_think" user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n"
req_json = json.dumps([ req_json = json.dumps([
{"role": "system", "content": SYSTEM_PROMPT[:500]}, {"role": "system", "content": SYSTEM_PROMPT[:500]},
@@ -300,16 +299,9 @@ class LLMClient:
first content token arrives (useful for triggering the typing indicator first content token arrives (useful for triggering the typing indicator
only after the model starts generating). only after the model starts generating).
""" """
# Append /no_think to the last user message to disable thinking on Qwen3
patched = []
for m in messages:
patched.append(m)
if patched and patched[-1].get("role") == "user":
patched[-1] = {**patched[-1], "content": patched[-1]["content"] + "\n/no_think"}
req_json = json.dumps([ req_json = json.dumps([
{"role": "system", "content": system_prompt[:500]}, {"role": "system", "content": system_prompt[:500]},
*[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in patched], *[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in messages],
], default=str) ], default=str)
t0 = time.monotonic() t0 = time.monotonic()
@@ -319,7 +311,7 @@ class LLMClient:
model=self.model, model=self.model,
messages=[ messages=[
{"role": "system", "content": system_prompt}, {"role": "system", "content": system_prompt},
*patched, *messages,
], ],
temperature=0.8, temperature=0.8,
max_tokens=2048, max_tokens=2048,
@@ -363,7 +355,8 @@ class LLMClient:
user_content: list[dict] = [ user_content: list[dict] = [
{"type": "image_url", "image_url": {"url": data_url}}, {"type": "image_url", "image_url": {"url": data_url}},
] ]
user_content.append({"type": "text", "text": (user_text or "") + "\n/no_think"}) if user_text:
user_content.append({"type": "text", "text": user_text})
req_json = json.dumps([ req_json = json.dumps([
{"role": "system", "content": system_prompt[:500]}, {"role": "system", "content": system_prompt[:500]},
@@ -422,7 +415,7 @@ class LLMClient:
user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n" user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
if channel_context: if channel_context:
user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n" user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n/no_think" user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n"
req_json = json.dumps([ req_json = json.dumps([
{"role": "system", "content": SYSTEM_PROMPT[:500]}, {"role": "system", "content": SYSTEM_PROMPT[:500]},