From b04d3da2bf80f83d87c5bef3c8c3851335be9d6e Mon Sep 17 00:00:00 2001 From: AJ Isaacs Date: Sun, 22 Feb 2026 22:55:19 -0500 Subject: [PATCH] Add LLM request/response logging to database Log every LLM call (analysis, chat, image, raw_analyze) to a new LlmLog table with request type, model, token counts, duration, success/failure, and truncated request/response payloads. Enables debugging prompt issues and tracking usage. Co-Authored-By: Claude Opus 4.6 --- bot.py | 10 ++--- utils/database.py | 66 ++++++++++++++++++++++++++++++ utils/llm_client.py | 99 ++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 159 insertions(+), 16 deletions(-) diff --git a/bot.py b/bot.py index 5650d96..de639f2 100644 --- a/bot.py +++ b/bot.py @@ -65,15 +65,18 @@ class BCSBot(commands.Bot): self.config = config + # Database (initialized async in setup_hook) + self.db = Database() + # LLM clients (OpenAI-compatible — works with llama.cpp, Ollama, or OpenAI) llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434") llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0") llm_api_key = os.getenv("LLM_API_KEY", "not-needed") - self.llm = LLMClient(llm_base_url, llm_model, llm_api_key) + self.llm = LLMClient(llm_base_url, llm_model, llm_api_key, db=self.db) # Heavy/escalation model for re-analysis, chat, and manual commands llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", llm_model) - self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key) + self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key, db=self.db) # Drama tracker sentiment = config.get("sentiment", {}) @@ -84,9 +87,6 @@ class BCSBot(commands.Bot): offense_reset_minutes=timeouts.get("offense_reset_minutes", 120), ) - # Database (initialized async in setup_hook) - self.db = Database() - async def setup_hook(self): # Initialize database and hydrate DramaTracker db_ok = await self.db.init() diff --git a/utils/database.py b/utils/database.py index 93e1f17..67ee0de 100644 --- a/utils/database.py +++ b/utils/database.py @@ -126,6 +126,23 @@ class Database: ALTER TABLE UserState ADD UserNotes NVARCHAR(MAX) NULL """) + cursor.execute(""" + IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = 'LlmLog') + CREATE TABLE LlmLog ( + Id BIGINT IDENTITY(1,1) PRIMARY KEY, + RequestType NVARCHAR(50) NOT NULL, + Model NVARCHAR(100) NOT NULL, + InputTokens INT NULL, + OutputTokens INT NULL, + DurationMs INT NOT NULL, + Success BIT NOT NULL, + Request NVARCHAR(MAX) NOT NULL, + Response NVARCHAR(MAX) NULL, + Error NVARCHAR(MAX) NULL, + CreatedAt DATETIME2 NOT NULL DEFAULT SYSUTCDATETIME() + ) + """) + cursor.close() def _parse_database_name(self) -> str: @@ -348,6 +365,55 @@ class Database: finally: conn.close() + # ------------------------------------------------------------------ + # LLM Log (fire-and-forget via asyncio.create_task) + # ------------------------------------------------------------------ + async def save_llm_log( + self, + request_type: str, + model: str, + duration_ms: int, + success: bool, + request: str, + response: str | None = None, + error: str | None = None, + input_tokens: int | None = None, + output_tokens: int | None = None, + ) -> None: + """Save an LLM request/response log entry.""" + if not self._available: + return + try: + await asyncio.to_thread( + self._save_llm_log_sync, + request_type, model, duration_ms, success, request, + response, error, input_tokens, output_tokens, + ) + except Exception: + logger.exception("Failed to save LLM log") + + def _save_llm_log_sync( + self, request_type, model, duration_ms, success, request, + response, error, input_tokens, output_tokens, + ): + conn = self._connect() + try: + cursor = conn.cursor() + cursor.execute( + """INSERT INTO LlmLog + (RequestType, Model, InputTokens, OutputTokens, DurationMs, + Success, Request, Response, Error) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""", + request_type, model, input_tokens, output_tokens, duration_ms, + 1 if success else 0, + request[:4000] if request else "", + response[:4000] if response else None, + error[:4000] if error else None, + ) + cursor.close() + finally: + conn.close() + async def close(self): """No persistent connection to close (connections are per-operation).""" pass diff --git a/utils/llm_client.py b/utils/llm_client.py index c54e9b9..e6b2c27 100644 --- a/utils/llm_client.py +++ b/utils/llm_client.py @@ -2,6 +2,7 @@ import asyncio import base64 import json import logging +import time from pathlib import Path from openai import AsyncOpenAI @@ -97,9 +98,10 @@ ANALYSIS_TOOL = { class LLMClient: - def __init__(self, base_url: str, model: str, api_key: str = "not-needed"): + def __init__(self, base_url: str, model: str, api_key: str = "not-needed", db=None): self.model = model self.host = base_url.rstrip("/") + self._db = db self._client = AsyncOpenAI( base_url=f"{self.host}/v1", api_key=api_key, @@ -107,6 +109,24 @@ class LLMClient: ) self._semaphore = asyncio.Semaphore(1) # serialize requests to avoid overloading + def _log_llm(self, request_type: str, duration_ms: int, success: bool, + request: str, response: str | None = None, error: str | None = None, + input_tokens: int | None = None, output_tokens: int | None = None): + """Fire-and-forget LLM log entry to the database.""" + if not self._db: + return + asyncio.create_task(self._db.save_llm_log( + request_type=request_type, + model=self.model, + duration_ms=duration_ms, + success=success, + request=request, + response=response, + error=error, + input_tokens=input_tokens, + output_tokens=output_tokens, + )) + async def close(self): await self._client.close() @@ -119,7 +139,13 @@ class LLMClient: user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n" if channel_context: user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n" - user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}" + user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n/no_think" + + req_json = json.dumps([ + {"role": "system", "content": SYSTEM_PROMPT[:500]}, + {"role": "user", "content": user_content}, + ], default=str) + t0 = time.monotonic() async with self._semaphore: try: @@ -132,26 +158,38 @@ class LLMClient: tools=[ANALYSIS_TOOL], tool_choice={"type": "function", "function": {"name": "report_analysis"}}, temperature=0.1, - max_tokens=1024, + max_tokens=2048, ) + elapsed = int((time.monotonic() - t0) * 1000) choice = response.choices[0] + usage = response.usage # Extract tool call arguments if choice.message.tool_calls: tool_call = choice.message.tool_calls[0] - args = json.loads(tool_call.function.arguments) + resp_text = tool_call.function.arguments + args = json.loads(resp_text) + self._log_llm("analysis", elapsed, True, req_json, resp_text, + input_tokens=usage.prompt_tokens if usage else None, + output_tokens=usage.completion_tokens if usage else None) return self._validate_result(args) # Fallback: try parsing the message content as JSON if choice.message.content: + self._log_llm("analysis", elapsed, True, req_json, choice.message.content, + input_tokens=usage.prompt_tokens if usage else None, + output_tokens=usage.completion_tokens if usage else None) return self._parse_content_fallback(choice.message.content) logger.warning("No tool call or content in LLM response.") + self._log_llm("analysis", elapsed, False, req_json, error="Empty response") return None except Exception as e: + elapsed = int((time.monotonic() - t0) * 1000) logger.error("LLM analysis error: %s", e) + self._log_llm("analysis", elapsed, False, req_json, error=str(e)) return None def _validate_result(self, result: dict) -> dict: @@ -219,16 +257,29 @@ class LLMClient: first content token arrives (useful for triggering the typing indicator only after the model starts generating). """ + # Append /no_think to the last user message to disable thinking on Qwen3 + patched = [] + for m in messages: + patched.append(m) + if patched and patched[-1].get("role") == "user": + patched[-1] = {**patched[-1], "content": patched[-1]["content"] + "\n/no_think"} + + req_json = json.dumps([ + {"role": "system", "content": system_prompt[:500]}, + *[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in patched], + ], default=str) + t0 = time.monotonic() + async with self._semaphore: try: stream = await self._client.chat.completions.create( model=self.model, messages=[ {"role": "system", "content": system_prompt}, - *messages, + *patched, ], temperature=0.8, - max_tokens=300, + max_tokens=2048, stream=True, ) @@ -243,9 +294,13 @@ class LLMClient: chunks.append(delta.content) content = "".join(chunks).strip() + elapsed = int((time.monotonic() - t0) * 1000) + self._log_llm("chat", elapsed, bool(content), req_json, content or None) return content if content else None except Exception as e: + elapsed = int((time.monotonic() - t0) * 1000) logger.error("LLM chat error: %s", e) + self._log_llm("chat", elapsed, False, req_json, error=str(e)) return None async def analyze_image( @@ -265,8 +320,13 @@ class LLMClient: user_content: list[dict] = [ {"type": "image_url", "image_url": {"url": data_url}}, ] - if user_text: - user_content.append({"type": "text", "text": user_text}) + user_content.append({"type": "text", "text": (user_text or "") + "\n/no_think"}) + + req_json = json.dumps([ + {"role": "system", "content": system_prompt[:500]}, + {"role": "user", "content": f"[image {len(image_bytes)} bytes] {user_text or ''}"}, + ], default=str) + t0 = time.monotonic() async with self._semaphore: try: @@ -277,7 +337,7 @@ class LLMClient: {"role": "user", "content": user_content}, ], temperature=0.8, - max_tokens=500, + max_tokens=2048, stream=True, ) @@ -292,9 +352,13 @@ class LLMClient: chunks.append(delta.content) content = "".join(chunks).strip() + elapsed = int((time.monotonic() - t0) * 1000) + self._log_llm("image", elapsed, bool(content), req_json, content or None) return content if content else None except Exception as e: + elapsed = int((time.monotonic() - t0) * 1000) logger.error("LLM image analysis error: %s", e) + self._log_llm("image", elapsed, False, req_json, error=str(e)) return None async def raw_analyze( @@ -307,7 +371,13 @@ class LLMClient: user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n" if channel_context: user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n" - user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}" + user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n/no_think" + + req_json = json.dumps([ + {"role": "system", "content": SYSTEM_PROMPT[:500]}, + {"role": "user", "content": user_content}, + ], default=str) + t0 = time.monotonic() async with self._semaphore: try: @@ -320,10 +390,12 @@ class LLMClient: tools=[ANALYSIS_TOOL], tool_choice={"type": "function", "function": {"name": "report_analysis"}}, temperature=0.1, - max_tokens=1024, + max_tokens=2048, ) + elapsed = int((time.monotonic() - t0) * 1000) choice = response.choices[0] + usage = response.usage parts = [] parsed = None @@ -342,7 +414,12 @@ class LLMClient: parsed = self._parse_content_fallback(choice.message.content) raw = "\n".join(parts) or "(empty response)" + self._log_llm("raw_analyze", elapsed, parsed is not None, req_json, raw, + input_tokens=usage.prompt_tokens if usage else None, + output_tokens=usage.completion_tokens if usage else None) return raw, parsed except Exception as e: + elapsed = int((time.monotonic() - t0) * 1000) + self._log_llm("raw_analyze", elapsed, False, req_json, error=str(e)) return f"Error: {e}", None