Support hybrid LLM: local Qwen triage + OpenAI escalation

Triage analysis runs on Qwen 8B (athena.lan) as a free first pass.
Escalation, chat, image roasts, and commands use GPT-4o via OpenAI.

Each tier gets its own base URL, API key, and concurrency settings.
Local models get /no_think and serialized requests automatically.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-23 12:20:07 -05:00
parent b5e401f036
commit 8a06ddbd6e
3 changed files with 49 additions and 18 deletions

View File

@@ -1,7 +1,12 @@
DISCORD_BOT_TOKEN=your_token_here DISCORD_BOT_TOKEN=your_token_here
LLM_BASE_URL= # Triage model (local llama.cpp / Ollama — leave BASE_URL empty for OpenAI)
LLM_MODEL=gpt-4o-mini LLM_BASE_URL=http://athena.lan:11434
LLM_MODEL=Qwen3-8B-Q6_K
LLM_API_KEY=not-needed
# Escalation model (leave BASE_URL empty to use OpenAI)
LLM_ESCALATION_BASE_URL=
LLM_ESCALATION_MODEL=gpt-4o LLM_ESCALATION_MODEL=gpt-4o
LLM_API_KEY=your_openai_api_key_here LLM_ESCALATION_API_KEY=your_openai_api_key_here
# Database
MSSQL_SA_PASSWORD=YourStrong!Passw0rd MSSQL_SA_PASSWORD=YourStrong!Passw0rd
DB_CONNECTION_STRING=DRIVER={ODBC Driver 18 for SQL Server};SERVER=localhost,1433;DATABASE=BreehaviorMonitor;UID=sa;PWD=YourStrong!Passw0rd;TrustServerCertificate=yes DB_CONNECTION_STRING=DRIVER={ODBC Driver 18 for SQL Server};SERVER=localhost,1433;DATABASE=BreehaviorMonitor;UID=sa;PWD=YourStrong!Passw0rd;TrustServerCertificate=yes

22
bot.py
View File

@@ -68,15 +68,25 @@ class BCSBot(commands.Bot):
# Database (initialized async in setup_hook) # Database (initialized async in setup_hook)
self.db = Database() self.db = Database()
# LLM clients (OpenAI — set LLM_BASE_URL to override for local models) # Triage LLM (local Qwen on athena for cheap first-pass analysis)
llm_base_url = os.getenv("LLM_BASE_URL", "") llm_base_url = os.getenv("LLM_BASE_URL", "")
llm_model = os.getenv("LLM_MODEL", "gpt-4o-mini") llm_model = os.getenv("LLM_MODEL", "gpt-4o-mini")
llm_api_key = os.getenv("LLM_API_KEY", "") llm_api_key = os.getenv("LLM_API_KEY", "not-needed")
self.llm = LLMClient(llm_base_url, llm_model, llm_api_key, db=self.db) is_local = bool(llm_base_url)
self.llm = LLMClient(
llm_base_url, llm_model, llm_api_key, db=self.db,
no_think=is_local, concurrency=1 if is_local else 4,
)
# Heavy/escalation model for re-analysis, chat, and manual commands # Heavy/escalation LLM (OpenAI for re-analysis, chat, image roasts, commands)
llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", "gpt-4o") esc_base_url = os.getenv("LLM_ESCALATION_BASE_URL", "")
self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key, db=self.db) esc_model = os.getenv("LLM_ESCALATION_MODEL", "gpt-4o")
esc_api_key = os.getenv("LLM_ESCALATION_API_KEY", llm_api_key)
esc_is_local = bool(esc_base_url)
self.llm_heavy = LLMClient(
esc_base_url, esc_model, esc_api_key, db=self.db,
no_think=esc_is_local, concurrency=1 if esc_is_local else 4,
)
# Active mode (server-wide) # Active mode (server-wide)
modes_config = config.get("modes", {}) modes_config = config.get("modes", {})

View File

@@ -128,15 +128,18 @@ ANALYSIS_TOOL = {
class LLMClient: class LLMClient:
def __init__(self, base_url: str, model: str, api_key: str = "not-needed", db=None): def __init__(self, base_url: str, model: str, api_key: str = "not-needed",
db=None, no_think: bool = False, concurrency: int = 4):
self.model = model self.model = model
self.host = base_url.rstrip("/") self.host = base_url.rstrip("/")
self._db = db self._db = db
client_kwargs = {"api_key": api_key, "timeout": 120.0} self._no_think = no_think
timeout = 600.0 if self.host else 120.0 # local models need longer for VRAM load
client_kwargs = {"api_key": api_key, "timeout": timeout}
if self.host: if self.host:
client_kwargs["base_url"] = f"{self.host}/v1" client_kwargs["base_url"] = f"{self.host}/v1"
self._client = AsyncOpenAI(**client_kwargs) self._client = AsyncOpenAI(**client_kwargs)
self._semaphore = asyncio.Semaphore(4) self._semaphore = asyncio.Semaphore(concurrency)
def _log_llm(self, request_type: str, duration_ms: int, success: bool, def _log_llm(self, request_type: str, duration_ms: int, success: bool,
request: str, response: str | None = None, error: str | None = None, request: str, response: str | None = None, error: str | None = None,
@@ -156,6 +159,9 @@ class LLMClient:
output_tokens=output_tokens, output_tokens=output_tokens,
)) ))
def _append_no_think(self, text: str) -> str:
return text + "\n/no_think" if self._no_think else text
async def close(self): async def close(self):
await self._client.close() await self._client.close()
@@ -168,7 +174,8 @@ class LLMClient:
user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n" user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
if channel_context: if channel_context:
user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n" user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n" user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
user_content = self._append_no_think(user_content)
req_json = json.dumps([ req_json = json.dumps([
{"role": "system", "content": SYSTEM_PROMPT[:500]}, {"role": "system", "content": SYSTEM_PROMPT[:500]},
@@ -299,9 +306,14 @@ class LLMClient:
first content token arrives (useful for triggering the typing indicator first content token arrives (useful for triggering the typing indicator
only after the model starts generating). only after the model starts generating).
""" """
# Append /no_think to the last user message for local Qwen models
patched = list(messages)
if self._no_think and patched and patched[-1].get("role") == "user":
patched[-1] = {**patched[-1], "content": self._append_no_think(patched[-1]["content"])}
req_json = json.dumps([ req_json = json.dumps([
{"role": "system", "content": system_prompt[:500]}, {"role": "system", "content": system_prompt[:500]},
*[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in messages], *[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in patched],
], default=str) ], default=str)
t0 = time.monotonic() t0 = time.monotonic()
@@ -311,7 +323,7 @@ class LLMClient:
model=self.model, model=self.model,
messages=[ messages=[
{"role": "system", "content": system_prompt}, {"role": "system", "content": system_prompt},
*messages, *patched,
], ],
temperature=0.8, temperature=0.8,
max_tokens=2048, max_tokens=2048,
@@ -355,8 +367,11 @@ class LLMClient:
user_content: list[dict] = [ user_content: list[dict] = [
{"type": "image_url", "image_url": {"url": data_url}}, {"type": "image_url", "image_url": {"url": data_url}},
] ]
if user_text: text_part = user_text or ""
user_content.append({"type": "text", "text": user_text}) if self._no_think:
text_part = (text_part + "\n/no_think").strip()
if text_part:
user_content.append({"type": "text", "text": text_part})
req_json = json.dumps([ req_json = json.dumps([
{"role": "system", "content": system_prompt[:500]}, {"role": "system", "content": system_prompt[:500]},
@@ -415,7 +430,8 @@ class LLMClient:
user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n" user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
if channel_context: if channel_context:
user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n" user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n" user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
user_content = self._append_no_think(user_content)
req_json = json.dumps([ req_json = json.dumps([
{"role": "system", "content": SYSTEM_PROMPT[:500]}, {"role": "system", "content": SYSTEM_PROMPT[:500]},