Switch LLM backend from llama.cpp/Qwen to OpenAI

- Default models: gpt-4o-mini (triage), gpt-4o (escalation)
- Remove Qwen-specific /no_think hacks
- Reduce timeout from 600s to 120s, increase concurrency semaphore from 1 to 4
- Support empty LLM_BASE_URL to use OpenAI directly

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-23 12:07:53 -05:00
parent a9bc24e48e
commit 28fb66d5f9
3 changed files with 24 additions and 31 deletions
@@ -132,12 +132,11 @@ class LLMClient:
        self.model = model
        self.host = base_url.rstrip("/")
        self._db = db
-        self._client = AsyncOpenAI(
-            base_url=f"{self.host}/v1",
-            api_key=api_key,
-            timeout=600.0,  # 10 min — first request loads model into VRAM
-        )
-        self._semaphore = asyncio.Semaphore(1)  # serialize requests to avoid overloading
+        client_kwargs = {"api_key": api_key, "timeout": 120.0}
+        if self.host:
+            client_kwargs["base_url"] = f"{self.host}/v1"
+        self._client = AsyncOpenAI(**client_kwargs)
+        self._semaphore = asyncio.Semaphore(4)

    def _log_llm(self, request_type: str, duration_ms: int, success: bool,
                 request: str, response: str | None = None, error: str | None = None,
@@ -169,7 +168,7 @@ class LLMClient:
            user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
        if channel_context:
            user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
-        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n/no_think"
+        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n"

        req_json = json.dumps([
            {"role": "system", "content": SYSTEM_PROMPT[:500]},
@@ -300,16 +299,9 @@ class LLMClient:
        first content token arrives (useful for triggering the typing indicator
        only after the model starts generating).
        """
-        # Append /no_think to the last user message to disable thinking on Qwen3
-        patched = []
-        for m in messages:
-            patched.append(m)
-        if patched and patched[-1].get("role") == "user":
-            patched[-1] = {**patched[-1], "content": patched[-1]["content"] + "\n/no_think"}
-
        req_json = json.dumps([
            {"role": "system", "content": system_prompt[:500]},
-            *[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in patched],
+            *[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in messages],
        ], default=str)
        t0 = time.monotonic()

@@ -319,7 +311,7 @@ class LLMClient:
                    model=self.model,
                    messages=[
                        {"role": "system", "content": system_prompt},
-                        *patched,
+                        *messages,
                    ],
                    temperature=0.8,
                    max_tokens=2048,
@@ -363,7 +355,8 @@ class LLMClient:
        user_content: list[dict] = [
            {"type": "image_url", "image_url": {"url": data_url}},
        ]
-        user_content.append({"type": "text", "text": (user_text or "") + "\n/no_think"})
+        if user_text:
+            user_content.append({"type": "text", "text": user_text})

        req_json = json.dumps([
            {"role": "system", "content": system_prompt[:500]},
@@ -422,7 +415,7 @@ class LLMClient:
            user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
        if channel_context:
            user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
-        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n/no_think"
+        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n"

        req_json = json.dumps([
            {"role": "system", "content": SYSTEM_PROMPT[:500]},