From b04d3da2bf80f83d87c5bef3c8c3851335be9d6e Mon Sep 17 00:00:00 2001
From: AJ Isaacs <ajisaacs27@gmail.com>
Date: Sun, 22 Feb 2026 22:55:19 -0500
Subject: [PATCH] Add LLM request/response logging to database

Log every LLM call (analysis, chat, image, raw_analyze) to a new
LlmLog table with request type, model, token counts, duration,
success/failure, and truncated request/response payloads. Enables
debugging prompt issues and tracking usage.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 bot.py              | 10 ++---
 utils/database.py   | 66 ++++++++++++++++++++++++++++++
 utils/llm_client.py | 99 ++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 159 insertions(+), 16 deletions(-)

diff --git a/bot.py b/bot.py
index 5650d96..de639f2 100644
--- a/bot.py
+++ b/bot.py
@@ -65,15 +65,18 @@ class BCSBot(commands.Bot):
 
         self.config = config
 
+        # Database (initialized async in setup_hook)
+        self.db = Database()
+
         # LLM clients (OpenAI-compatible — works with llama.cpp, Ollama, or OpenAI)
         llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434")
         llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0")
         llm_api_key = os.getenv("LLM_API_KEY", "not-needed")
-        self.llm = LLMClient(llm_base_url, llm_model, llm_api_key)
+        self.llm = LLMClient(llm_base_url, llm_model, llm_api_key, db=self.db)
 
         # Heavy/escalation model for re-analysis, chat, and manual commands
         llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", llm_model)
-        self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key)
+        self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key, db=self.db)
 
         # Drama tracker
         sentiment = config.get("sentiment", {})
@@ -84,9 +87,6 @@ class BCSBot(commands.Bot):
             offense_reset_minutes=timeouts.get("offense_reset_minutes", 120),
         )
 
-        # Database (initialized async in setup_hook)
-        self.db = Database()
-
     async def setup_hook(self):
         # Initialize database and hydrate DramaTracker
         db_ok = await self.db.init()
diff --git a/utils/database.py b/utils/database.py
index 93e1f17..67ee0de 100644
--- a/utils/database.py
+++ b/utils/database.py
@@ -126,6 +126,23 @@ class Database:
                 ALTER TABLE UserState ADD UserNotes NVARCHAR(MAX) NULL
         """)
 
+        cursor.execute("""
+            IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = 'LlmLog')
+            CREATE TABLE LlmLog (
+                Id            BIGINT IDENTITY(1,1) PRIMARY KEY,
+                RequestType   NVARCHAR(50) NOT NULL,
+                Model         NVARCHAR(100) NOT NULL,
+                InputTokens   INT NULL,
+                OutputTokens  INT NULL,
+                DurationMs    INT NOT NULL,
+                Success       BIT NOT NULL,
+                Request       NVARCHAR(MAX) NOT NULL,
+                Response      NVARCHAR(MAX) NULL,
+                Error         NVARCHAR(MAX) NULL,
+                CreatedAt     DATETIME2 NOT NULL DEFAULT SYSUTCDATETIME()
+            )
+        """)
+
         cursor.close()
 
     def _parse_database_name(self) -> str:
@@ -348,6 +365,55 @@ class Database:
         finally:
             conn.close()
 
+    # ------------------------------------------------------------------
+    # LLM Log (fire-and-forget via asyncio.create_task)
+    # ------------------------------------------------------------------
+    async def save_llm_log(
+        self,
+        request_type: str,
+        model: str,
+        duration_ms: int,
+        success: bool,
+        request: str,
+        response: str | None = None,
+        error: str | None = None,
+        input_tokens: int | None = None,
+        output_tokens: int | None = None,
+    ) -> None:
+        """Save an LLM request/response log entry."""
+        if not self._available:
+            return
+        try:
+            await asyncio.to_thread(
+                self._save_llm_log_sync,
+                request_type, model, duration_ms, success, request,
+                response, error, input_tokens, output_tokens,
+            )
+        except Exception:
+            logger.exception("Failed to save LLM log")
+
+    def _save_llm_log_sync(
+        self, request_type, model, duration_ms, success, request,
+        response, error, input_tokens, output_tokens,
+    ):
+        conn = self._connect()
+        try:
+            cursor = conn.cursor()
+            cursor.execute(
+                """INSERT INTO LlmLog
+                   (RequestType, Model, InputTokens, OutputTokens, DurationMs,
+                    Success, Request, Response, Error)
+                   VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
+                request_type, model, input_tokens, output_tokens, duration_ms,
+                1 if success else 0,
+                request[:4000] if request else "",
+                response[:4000] if response else None,
+                error[:4000] if error else None,
+            )
+            cursor.close()
+        finally:
+            conn.close()
+
     async def close(self):
         """No persistent connection to close (connections are per-operation)."""
         pass
diff --git a/utils/llm_client.py b/utils/llm_client.py
index c54e9b9..e6b2c27 100644
--- a/utils/llm_client.py
+++ b/utils/llm_client.py
@@ -2,6 +2,7 @@ import asyncio
 import base64
 import json
 import logging
+import time
 from pathlib import Path
 
 from openai import AsyncOpenAI
@@ -97,9 +98,10 @@ ANALYSIS_TOOL = {
 
 
 class LLMClient:
-    def __init__(self, base_url: str, model: str, api_key: str = "not-needed"):
+    def __init__(self, base_url: str, model: str, api_key: str = "not-needed", db=None):
         self.model = model
         self.host = base_url.rstrip("/")
+        self._db = db
         self._client = AsyncOpenAI(
             base_url=f"{self.host}/v1",
             api_key=api_key,
@@ -107,6 +109,24 @@ class LLMClient:
         )
         self._semaphore = asyncio.Semaphore(1)  # serialize requests to avoid overloading
 
+    def _log_llm(self, request_type: str, duration_ms: int, success: bool,
+                 request: str, response: str | None = None, error: str | None = None,
+                 input_tokens: int | None = None, output_tokens: int | None = None):
+        """Fire-and-forget LLM log entry to the database."""
+        if not self._db:
+            return
+        asyncio.create_task(self._db.save_llm_log(
+            request_type=request_type,
+            model=self.model,
+            duration_ms=duration_ms,
+            success=success,
+            request=request,
+            response=response,
+            error=error,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+        ))
+
     async def close(self):
         await self._client.close()
 
@@ -119,7 +139,13 @@ class LLMClient:
             user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
         if channel_context:
             user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
-        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
+        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n/no_think"
+
+        req_json = json.dumps([
+            {"role": "system", "content": SYSTEM_PROMPT[:500]},
+            {"role": "user", "content": user_content},
+        ], default=str)
+        t0 = time.monotonic()
 
         async with self._semaphore:
             try:
@@ -132,26 +158,38 @@ class LLMClient:
                     tools=[ANALYSIS_TOOL],
                     tool_choice={"type": "function", "function": {"name": "report_analysis"}},
                     temperature=0.1,
-                    max_tokens=1024,
+                    max_tokens=2048,
                 )
 
+                elapsed = int((time.monotonic() - t0) * 1000)
                 choice = response.choices[0]
+                usage = response.usage
 
                 # Extract tool call arguments
                 if choice.message.tool_calls:
                     tool_call = choice.message.tool_calls[0]
-                    args = json.loads(tool_call.function.arguments)
+                    resp_text = tool_call.function.arguments
+                    args = json.loads(resp_text)
+                    self._log_llm("analysis", elapsed, True, req_json, resp_text,
+                                  input_tokens=usage.prompt_tokens if usage else None,
+                                  output_tokens=usage.completion_tokens if usage else None)
                     return self._validate_result(args)
 
                 # Fallback: try parsing the message content as JSON
                 if choice.message.content:
+                    self._log_llm("analysis", elapsed, True, req_json, choice.message.content,
+                                  input_tokens=usage.prompt_tokens if usage else None,
+                                  output_tokens=usage.completion_tokens if usage else None)
                     return self._parse_content_fallback(choice.message.content)
 
                 logger.warning("No tool call or content in LLM response.")
+                self._log_llm("analysis", elapsed, False, req_json, error="Empty response")
                 return None
 
             except Exception as e:
+                elapsed = int((time.monotonic() - t0) * 1000)
                 logger.error("LLM analysis error: %s", e)
+                self._log_llm("analysis", elapsed, False, req_json, error=str(e))
                 return None
 
     def _validate_result(self, result: dict) -> dict:
@@ -219,16 +257,29 @@ class LLMClient:
         first content token arrives (useful for triggering the typing indicator
         only after the model starts generating).
         """
+        # Append /no_think to the last user message to disable thinking on Qwen3
+        patched = []
+        for m in messages:
+            patched.append(m)
+        if patched and patched[-1].get("role") == "user":
+            patched[-1] = {**patched[-1], "content": patched[-1]["content"] + "\n/no_think"}
+
+        req_json = json.dumps([
+            {"role": "system", "content": system_prompt[:500]},
+            *[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in patched],
+        ], default=str)
+        t0 = time.monotonic()
+
         async with self._semaphore:
             try:
                 stream = await self._client.chat.completions.create(
                     model=self.model,
                     messages=[
                         {"role": "system", "content": system_prompt},
-                        *messages,
+                        *patched,
                     ],
                     temperature=0.8,
-                    max_tokens=300,
+                    max_tokens=2048,
                     stream=True,
                 )
 
@@ -243,9 +294,13 @@ class LLMClient:
                         chunks.append(delta.content)
 
                 content = "".join(chunks).strip()
+                elapsed = int((time.monotonic() - t0) * 1000)
+                self._log_llm("chat", elapsed, bool(content), req_json, content or None)
                 return content if content else None
             except Exception as e:
+                elapsed = int((time.monotonic() - t0) * 1000)
                 logger.error("LLM chat error: %s", e)
+                self._log_llm("chat", elapsed, False, req_json, error=str(e))
                 return None
 
     async def analyze_image(
@@ -265,8 +320,13 @@ class LLMClient:
         user_content: list[dict] = [
             {"type": "image_url", "image_url": {"url": data_url}},
         ]
-        if user_text:
-            user_content.append({"type": "text", "text": user_text})
+        user_content.append({"type": "text", "text": (user_text or "") + "\n/no_think"})
+
+        req_json = json.dumps([
+            {"role": "system", "content": system_prompt[:500]},
+            {"role": "user", "content": f"[image {len(image_bytes)} bytes] {user_text or ''}"},
+        ], default=str)
+        t0 = time.monotonic()
 
         async with self._semaphore:
             try:
@@ -277,7 +337,7 @@ class LLMClient:
                         {"role": "user", "content": user_content},
                     ],
                     temperature=0.8,
-                    max_tokens=500,
+                    max_tokens=2048,
                     stream=True,
                 )
 
@@ -292,9 +352,13 @@ class LLMClient:
                         chunks.append(delta.content)
 
                 content = "".join(chunks).strip()
+                elapsed = int((time.monotonic() - t0) * 1000)
+                self._log_llm("image", elapsed, bool(content), req_json, content or None)
                 return content if content else None
             except Exception as e:
+                elapsed = int((time.monotonic() - t0) * 1000)
                 logger.error("LLM image analysis error: %s", e)
+                self._log_llm("image", elapsed, False, req_json, error=str(e))
                 return None
 
     async def raw_analyze(
@@ -307,7 +371,13 @@ class LLMClient:
             user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
         if channel_context:
             user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
-        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
+        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n/no_think"
+
+        req_json = json.dumps([
+            {"role": "system", "content": SYSTEM_PROMPT[:500]},
+            {"role": "user", "content": user_content},
+        ], default=str)
+        t0 = time.monotonic()
 
         async with self._semaphore:
             try:
@@ -320,10 +390,12 @@ class LLMClient:
                     tools=[ANALYSIS_TOOL],
                     tool_choice={"type": "function", "function": {"name": "report_analysis"}},
                     temperature=0.1,
-                    max_tokens=1024,
+                    max_tokens=2048,
                 )
 
+                elapsed = int((time.monotonic() - t0) * 1000)
                 choice = response.choices[0]
+                usage = response.usage
                 parts = []
                 parsed = None
 
@@ -342,7 +414,12 @@ class LLMClient:
                     parsed = self._parse_content_fallback(choice.message.content)
 
                 raw = "\n".join(parts) or "(empty response)"
+                self._log_llm("raw_analyze", elapsed, parsed is not None, req_json, raw,
+                              input_tokens=usage.prompt_tokens if usage else None,
+                              output_tokens=usage.completion_tokens if usage else None)
                 return raw, parsed
 
             except Exception as e:
+                elapsed = int((time.monotonic() - t0) * 1000)
+                self._log_llm("raw_analyze", elapsed, False, req_json, error=str(e))
                 return f"Error: {e}", None