Support hybrid LLM: local Qwen triage + OpenAI escalation

Triage analysis runs first on Qwen 8B (athena.lan) as a free first pass. Escalation, chat, image roasts, and commands use GPT-4o via OpenAI. Each tier gets its own base URL, API key, and concurrency settings. Local models automatically get /no_think appended to their prompts and have their requests serialized.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 12:20:07 -05:00
parent b5e401f036
commit 8a06ddbd6e
3 changed files with 49 additions and 18 deletions
@@ -128,15 +128,18 @@ ANALYSIS_TOOL = {


 class LLMClient:
-    def __init__(self, base_url: str, model: str, api_key: str = "not-needed", db=None):
+    def __init__(self, base_url: str, model: str, api_key: str = "not-needed",
+                 db=None, no_think: bool = False, concurrency: int = 4):
        self.model = model
        self.host = base_url.rstrip("/")
        self._db = db
-        client_kwargs = {"api_key": api_key, "timeout": 120.0}
+        self._no_think = no_think
+        timeout = 600.0 if self.host else 120.0  # local models need longer for VRAM load
+        client_kwargs = {"api_key": api_key, "timeout": timeout}
        if self.host:
            client_kwargs["base_url"] = f"{self.host}/v1"
        self._client = AsyncOpenAI(**client_kwargs)
-        self._semaphore = asyncio.Semaphore(4)
+        self._semaphore = asyncio.Semaphore(concurrency)

    def _log_llm(self, request_type: str, duration_ms: int, success: bool,
                 request: str, response: str | None = None, error: str | None = None,
@@ -156,6 +159,9 @@ class LLMClient:
            output_tokens=output_tokens,
        ))

+    def _append_no_think(self, text: str) -> str:
+        return text + "\n/no_think" if self._no_think else text
+
    async def close(self):
        await self._client.close()

@@ -168,7 +174,8 @@ class LLMClient:
            user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
        if channel_context:
            user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
-        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n"
+        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
+        user_content = self._append_no_think(user_content)

        req_json = json.dumps([
            {"role": "system", "content": SYSTEM_PROMPT[:500]},
@@ -299,9 +306,14 @@ class LLMClient:
        first content token arrives (useful for triggering the typing indicator
        only after the model starts generating).
        """
+        # Append /no_think to the last user message for local Qwen models
+        patched = list(messages)
+        if self._no_think and patched and patched[-1].get("role") == "user":
+            patched[-1] = {**patched[-1], "content": self._append_no_think(patched[-1]["content"])}
+
        req_json = json.dumps([
            {"role": "system", "content": system_prompt[:500]},
-            *[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in messages],
+            *[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in patched],
        ], default=str)
        t0 = time.monotonic()

@@ -311,7 +323,7 @@ class LLMClient:
                    model=self.model,
                    messages=[
                        {"role": "system", "content": system_prompt},
-                        *messages,
+                        *patched,
                    ],
                    temperature=0.8,
                    max_tokens=2048,
@@ -355,8 +367,11 @@ class LLMClient:
        user_content: list[dict] = [
            {"type": "image_url", "image_url": {"url": data_url}},
        ]
-        if user_text:
-            user_content.append({"type": "text", "text": user_text})
+        text_part = user_text or ""
+        if self._no_think:
+            text_part = (text_part + "\n/no_think").strip()
+        if text_part:
+            user_content.append({"type": "text", "text": text_part})

        req_json = json.dumps([
            {"role": "system", "content": system_prompt[:500]},
@@ -415,7 +430,8 @@ class LLMClient:
            user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
        if channel_context:
            user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
-        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n"
+        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
+        user_content = self._append_no_think(user_content)

        req_json = json.dumps([
            {"role": "system", "content": SYSTEM_PROMPT[:500]},