From 8a06ddbd6e6dd2b30529095bff12725b834285e7 Mon Sep 17 00:00:00 2001 From: AJ Isaacs Date: Mon, 23 Feb 2026 12:20:07 -0500 Subject: [PATCH] Support hybrid LLM: local Qwen triage + OpenAI escalation Triage analysis runs on a local Qwen 8B model (athena.lan) as a free first pass. Escalation, chat, image roasts, and commands use GPT-4o via OpenAI. Each tier gets its own base URL, API key, and concurrency settings. Local models get /no_think and serialized requests automatically. Co-Authored-By: Claude Opus 4.6 --- .env.example | 11 ++++++++--- bot.py | 22 ++++++++++++++++------ utils/llm_client.py | 34 +++++++++++++++++++++++++--------- 3 files changed, 49 insertions(+), 18 deletions(-) diff --git a/.env.example b/.env.example index 22ea6c3..fbe79e2 100644 --- a/.env.example +++ b/.env.example @@ -1,7 +1,12 @@ DISCORD_BOT_TOKEN=your_token_here -LLM_BASE_URL= -LLM_MODEL=gpt-4o-mini +# Triage model (local llama.cpp / Ollama — leave BASE_URL empty for OpenAI) +LLM_BASE_URL=http://athena.lan:11434 +LLM_MODEL=Qwen3-8B-Q6_K +LLM_API_KEY=not-needed +# Escalation model (OpenAI — leave BASE_URL empty for OpenAI) +LLM_ESCALATION_BASE_URL= LLM_ESCALATION_MODEL=gpt-4o -LLM_API_KEY=your_openai_api_key_here +LLM_ESCALATION_API_KEY=your_openai_api_key_here +# Database MSSQL_SA_PASSWORD=YourStrong!Passw0rd DB_CONNECTION_STRING=DRIVER={ODBC Driver 18 for SQL Server};SERVER=localhost,1433;DATABASE=BreehaviorMonitor;UID=sa;PWD=YourStrong!Passw0rd;TrustServerCertificate=yes diff --git a/bot.py b/bot.py index df65f0a..d9c88c1 100644 --- a/bot.py +++ b/bot.py @@ -68,15 +68,25 @@ class BCSBot(commands.Bot): # Database (initialized async in setup_hook) self.db = Database() - # LLM clients (OpenAI — set LLM_BASE_URL to override for local models) + # Triage LLM (local Qwen on athena for cheap first-pass analysis) llm_base_url = os.getenv("LLM_BASE_URL", "") llm_model = os.getenv("LLM_MODEL", "gpt-4o-mini") - llm_api_key = os.getenv("LLM_API_KEY", "") - self.llm = LLMClient(llm_base_url, llm_model, 
llm_api_key, db=self.db) + llm_api_key = os.getenv("LLM_API_KEY", "not-needed") + is_local = bool(llm_base_url) + self.llm = LLMClient( + llm_base_url, llm_model, llm_api_key, db=self.db, + no_think=is_local, concurrency=1 if is_local else 4, + ) - # Heavy/escalation model for re-analysis, chat, and manual commands - llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", "gpt-4o") - self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key, db=self.db) + # Heavy/escalation LLM (OpenAI for re-analysis, chat, image roasts, commands) + esc_base_url = os.getenv("LLM_ESCALATION_BASE_URL", "") + esc_model = os.getenv("LLM_ESCALATION_MODEL", "gpt-4o") + esc_api_key = os.getenv("LLM_ESCALATION_API_KEY", llm_api_key) + esc_is_local = bool(esc_base_url) + self.llm_heavy = LLMClient( + esc_base_url, esc_model, esc_api_key, db=self.db, + no_think=esc_is_local, concurrency=1 if esc_is_local else 4, + ) # Active mode (server-wide) modes_config = config.get("modes", {}) diff --git a/utils/llm_client.py b/utils/llm_client.py index dc5bbea..d04eab4 100644 --- a/utils/llm_client.py +++ b/utils/llm_client.py @@ -128,15 +128,18 @@ ANALYSIS_TOOL = { class LLMClient: - def __init__(self, base_url: str, model: str, api_key: str = "not-needed", db=None): + def __init__(self, base_url: str, model: str, api_key: str = "not-needed", + db=None, no_think: bool = False, concurrency: int = 4): self.model = model self.host = base_url.rstrip("/") self._db = db - client_kwargs = {"api_key": api_key, "timeout": 120.0} + self._no_think = no_think + timeout = 600.0 if self.host else 120.0 # local models need longer for VRAM load + client_kwargs = {"api_key": api_key, "timeout": timeout} if self.host: client_kwargs["base_url"] = f"{self.host}/v1" self._client = AsyncOpenAI(**client_kwargs) - self._semaphore = asyncio.Semaphore(4) + self._semaphore = asyncio.Semaphore(concurrency) def _log_llm(self, request_type: str, duration_ms: int, success: bool, request: str, response: str | None = None, 
error: str | None = None, @@ -156,6 +159,9 @@ class LLMClient: output_tokens=output_tokens, )) + def _append_no_think(self, text: str) -> str: + return text + "\n/no_think" if self._no_think else text + async def close(self): await self._client.close() @@ -168,7 +174,8 @@ class LLMClient: user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n" if channel_context: user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n" - user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n" + user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}" + user_content = self._append_no_think(user_content) req_json = json.dumps([ {"role": "system", "content": SYSTEM_PROMPT[:500]}, @@ -299,9 +306,14 @@ class LLMClient: first content token arrives (useful for triggering the typing indicator only after the model starts generating). """ + # Append /no_think to the last user message for local Qwen models + patched = list(messages) + if self._no_think and patched and patched[-1].get("role") == "user": + patched[-1] = {**patched[-1], "content": self._append_no_think(patched[-1]["content"])} + req_json = json.dumps([ {"role": "system", "content": system_prompt[:500]}, - *[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in messages], + *[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in patched], ], default=str) t0 = time.monotonic() @@ -311,7 +323,7 @@ class LLMClient: model=self.model, messages=[ {"role": "system", "content": system_prompt}, - *messages, + *patched, ], temperature=0.8, max_tokens=2048, @@ -355,8 +367,11 @@ class LLMClient: user_content: list[dict] = [ {"type": "image_url", "image_url": {"url": data_url}}, ] - if user_text: - user_content.append({"type": "text", "text": user_text}) + text_part = user_text or "" + if self._no_think: + text_part = (text_part + "\n/no_think").strip() + if text_part: + user_content.append({"type": "text", "text": 
text_part}) req_json = json.dumps([ {"role": "system", "content": system_prompt[:500]}, @@ -415,7 +430,8 @@ class LLMClient: user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n" if channel_context: user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n" - user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n" + user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}" + user_content = self._append_no_think(user_content) req_json = json.dumps([ {"role": "system", "content": SYSTEM_PROMPT[:500]},