From 8a06ddbd6e6dd2b30529095bff12725b834285e7 Mon Sep 17 00:00:00 2001
From: AJ Isaacs <ajisaacs27@gmail.com>
Date: Mon, 23 Feb 2026 12:20:07 -0500
Subject: [PATCH] Support hybrid LLM: local Qwen triage + OpenAI escalation

Triage analysis runs on a local Qwen3 8B model (athena.lan) as a free first pass.
Escalation, chat, image roasts, and commands use GPT-4o via OpenAI.

Each tier gets its own base URL, API key, and concurrency settings.
A tier is treated as local when its base URL is set; local tiers automatically
get /no_think appended, serialized requests (concurrency 1), and a longer timeout.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .env.example        | 11 ++++++++---
 bot.py              | 22 ++++++++++++++++------
 utils/llm_client.py | 34 +++++++++++++++++++++++++---------
 3 files changed, 49 insertions(+), 18 deletions(-)

diff --git a/.env.example b/.env.example
index 22ea6c3..fbe79e2 100644
--- a/.env.example
+++ b/.env.example
@@ -1,7 +1,12 @@
 DISCORD_BOT_TOKEN=your_token_here
-LLM_BASE_URL=
-LLM_MODEL=gpt-4o-mini
+# Triage model (local llama.cpp / Ollama — leave LLM_BASE_URL empty to use OpenAI)
+LLM_BASE_URL=http://athena.lan:11434
+LLM_MODEL=Qwen3-8B-Q6_K
+LLM_API_KEY=not-needed
+# Escalation model (leave LLM_ESCALATION_BASE_URL empty to use OpenAI)
+LLM_ESCALATION_BASE_URL=
 LLM_ESCALATION_MODEL=gpt-4o
-LLM_API_KEY=your_openai_api_key_here
+LLM_ESCALATION_API_KEY=your_openai_api_key_here
+# Database
 MSSQL_SA_PASSWORD=YourStrong!Passw0rd
 DB_CONNECTION_STRING=DRIVER={ODBC Driver 18 for SQL Server};SERVER=localhost,1433;DATABASE=BreehaviorMonitor;UID=sa;PWD=YourStrong!Passw0rd;TrustServerCertificate=yes
diff --git a/bot.py b/bot.py
index df65f0a..d9c88c1 100644
--- a/bot.py
+++ b/bot.py
@@ -68,15 +68,25 @@ class BCSBot(commands.Bot):
         # Database (initialized async in setup_hook)
         self.db = Database()
 
-        # LLM clients (OpenAI — set LLM_BASE_URL to override for local models)
+        # Triage LLM for cheap first-pass analysis (treated as local, e.g. Qwen on athena, when LLM_BASE_URL is set)
         llm_base_url = os.getenv("LLM_BASE_URL", "")
         llm_model = os.getenv("LLM_MODEL", "gpt-4o-mini")
-        llm_api_key = os.getenv("LLM_API_KEY", "")
-        self.llm = LLMClient(llm_base_url, llm_model, llm_api_key, db=self.db)
+        llm_api_key = os.getenv("LLM_API_KEY", "not-needed")
+        is_local = bool(llm_base_url)
+        self.llm = LLMClient(
+            llm_base_url, llm_model, llm_api_key, db=self.db,
+            no_think=is_local, concurrency=1 if is_local else 4,
+        )
 
-        # Heavy/escalation model for re-analysis, chat, and manual commands
-        llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", "gpt-4o")
-        self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key, db=self.db)
+        # Heavy/escalation LLM (OpenAI for re-analysis, chat, image roasts, commands)
+        esc_base_url = os.getenv("LLM_ESCALATION_BASE_URL", "")
+        esc_model = os.getenv("LLM_ESCALATION_MODEL", "gpt-4o")
+        esc_api_key = os.getenv("LLM_ESCALATION_API_KEY", llm_api_key)
+        esc_is_local = bool(esc_base_url)
+        self.llm_heavy = LLMClient(
+            esc_base_url, esc_model, esc_api_key, db=self.db,
+            no_think=esc_is_local, concurrency=1 if esc_is_local else 4,
+        )
 
         # Active mode (server-wide)
         modes_config = config.get("modes", {})
diff --git a/utils/llm_client.py b/utils/llm_client.py
index dc5bbea..d04eab4 100644
--- a/utils/llm_client.py
+++ b/utils/llm_client.py
@@ -128,15 +128,18 @@ ANALYSIS_TOOL = {
 
 
 class LLMClient:
-    def __init__(self, base_url: str, model: str, api_key: str = "not-needed", db=None):
+    def __init__(self, base_url: str, model: str, api_key: str = "not-needed",
+                 db=None, no_think: bool = False, concurrency: int = 4):
         self.model = model
         self.host = base_url.rstrip("/")
         self._db = db
-        client_kwargs = {"api_key": api_key, "timeout": 120.0}
+        self._no_think = no_think
+        timeout = 600.0 if self.host else 120.0  # local models need longer for VRAM load
+        client_kwargs = {"api_key": api_key, "timeout": timeout}
         if self.host:
             client_kwargs["base_url"] = f"{self.host}/v1"
         self._client = AsyncOpenAI(**client_kwargs)
-        self._semaphore = asyncio.Semaphore(4)
+        self._semaphore = asyncio.Semaphore(concurrency)
 
     def _log_llm(self, request_type: str, duration_ms: int, success: bool,
                  request: str, response: str | None = None, error: str | None = None,
@@ -156,6 +159,9 @@ class LLMClient:
             output_tokens=output_tokens,
         ))
 
+    def _append_no_think(self, text: str) -> str:
+        return text + "\n/no_think" if self._no_think else text
+
     async def close(self):
         await self._client.close()
 
@@ -168,7 +174,8 @@ class LLMClient:
             user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
         if channel_context:
             user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
-        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n"
+        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
+        user_content = self._append_no_think(user_content)
 
         req_json = json.dumps([
             {"role": "system", "content": SYSTEM_PROMPT[:500]},
@@ -299,9 +306,14 @@ class LLMClient:
         first content token arrives (useful for triggering the typing indicator
         only after the model starts generating).
         """
+        # Append /no_think to the last user message for local Qwen models
+        patched = list(messages)
+        if self._no_think and patched and patched[-1].get("role") == "user":
+            patched[-1] = {**patched[-1], "content": self._append_no_think(patched[-1]["content"])}
+
         req_json = json.dumps([
             {"role": "system", "content": system_prompt[:500]},
-            *[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in messages],
+            *[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in patched],
         ], default=str)
         t0 = time.monotonic()
 
@@ -311,7 +323,7 @@ class LLMClient:
                     model=self.model,
                     messages=[
                         {"role": "system", "content": system_prompt},
-                        *messages,
+                        *patched,
                     ],
                     temperature=0.8,
                     max_tokens=2048,
@@ -355,8 +367,11 @@ class LLMClient:
         user_content: list[dict] = [
             {"type": "image_url", "image_url": {"url": data_url}},
         ]
-        if user_text:
-            user_content.append({"type": "text", "text": user_text})
+        text_part = user_text or ""
+        if self._no_think:
+            text_part = (text_part + "\n/no_think").strip()
+        if text_part:
+            user_content.append({"type": "text", "text": text_part})
 
         req_json = json.dumps([
             {"role": "system", "content": system_prompt[:500]},
@@ -415,7 +430,8 @@ class LLMClient:
             user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
         if channel_context:
             user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
-        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n"
+        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
+        user_content = self._append_no_think(user_content)
 
         req_json = json.dumps([
             {"role": "system", "content": SYSTEM_PROMPT[:500]},