Add LLM request/response logging to database

Log every LLM call (analysis, chat, image, raw_analyze) to a new
LlmLog table with request type, model, token counts, duration,
success/failure, and truncated request/response payloads. Enables
debugging prompt issues and tracking usage.
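
For example, failed calls can be pulled straight from the new table. A
minimal sketch of such a debugging query (assuming a pyodbc-style
connection like the Database class uses; the helper name is illustrative
and not part of this commit):

    def recent_llm_failures(conn, limit: int = 20):
        """Most recent failed LLM calls, for prompt debugging."""
        cursor = conn.cursor()
        cursor.execute(
            """SELECT TOP (?) CreatedAt, RequestType, Model, DurationMs, Error
               FROM LlmLog
               WHERE Success = 0
               ORDER BY Id DESC""",
            limit,
        )
        rows = cursor.fetchall()
        cursor.close()
        return rows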

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 22:55:19 -05:00
parent fd798ce027
commit b04d3da2bf
3 changed files with 159 additions and 16 deletions
+5 -5
@@ -65,15 +65,18 @@ class BCSBot(commands.Bot):
         self.config = config
 
+        # Database (initialized async in setup_hook)
+        self.db = Database()
+
         # LLM clients (OpenAI-compatible — works with llama.cpp, Ollama, or OpenAI)
         llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434")
         llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0")
         llm_api_key = os.getenv("LLM_API_KEY", "not-needed")
-        self.llm = LLMClient(llm_base_url, llm_model, llm_api_key)
+        self.llm = LLMClient(llm_base_url, llm_model, llm_api_key, db=self.db)
 
         # Heavy/escalation model for re-analysis, chat, and manual commands
         llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", llm_model)
-        self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key)
+        self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key, db=self.db)
 
         # Drama tracker
         sentiment = config.get("sentiment", {})
@@ -84,9 +87,6 @@ class BCSBot(commands.Bot):
             offense_reset_minutes=timeouts.get("offense_reset_minutes", 120),
         )
 
-        # Database (initialized async in setup_hook)
-        self.db = Database()
-
     async def setup_hook(self):
         # Initialize database and hydrate DramaTracker
         db_ok = await self.db.init()
+66
@@ -126,6 +126,23 @@ class Database:
             ALTER TABLE UserState ADD UserNotes NVARCHAR(MAX) NULL
         """)
+
+        cursor.execute("""
+            IF NOT EXISTS (SELECT * FROM sys.tables WHERE name = 'LlmLog')
+            CREATE TABLE LlmLog (
+                Id BIGINT IDENTITY(1,1) PRIMARY KEY,
+                RequestType NVARCHAR(50) NOT NULL,
+                Model NVARCHAR(100) NOT NULL,
+                InputTokens INT NULL,
+                OutputTokens INT NULL,
+                DurationMs INT NOT NULL,
+                Success BIT NOT NULL,
+                Request NVARCHAR(MAX) NOT NULL,
+                Response NVARCHAR(MAX) NULL,
+                Error NVARCHAR(MAX) NULL,
+                CreatedAt DATETIME2 NOT NULL DEFAULT SYSUTCDATETIME()
+            )
+        """)
         cursor.close()
 
     def _parse_database_name(self) -> str:
@@ -348,6 +365,55 @@
         finally:
             conn.close()
 
+    # ------------------------------------------------------------------
+    # LLM Log (fire-and-forget via asyncio.create_task)
+    # ------------------------------------------------------------------
+    async def save_llm_log(
+        self,
+        request_type: str,
+        model: str,
+        duration_ms: int,
+        success: bool,
+        request: str,
+        response: str | None = None,
+        error: str | None = None,
+        input_tokens: int | None = None,
+        output_tokens: int | None = None,
+    ) -> None:
+        """Save an LLM request/response log entry."""
+        if not self._available:
+            return
+        try:
+            await asyncio.to_thread(
+                self._save_llm_log_sync,
+                request_type, model, duration_ms, success, request,
+                response, error, input_tokens, output_tokens,
+            )
+        except Exception:
+            logger.exception("Failed to save LLM log")
+
+    def _save_llm_log_sync(
+        self, request_type, model, duration_ms, success, request,
+        response, error, input_tokens, output_tokens,
+    ):
+        conn = self._connect()
+        try:
+            cursor = conn.cursor()
+            cursor.execute(
+                """INSERT INTO LlmLog
+                   (RequestType, Model, InputTokens, OutputTokens, DurationMs,
+                    Success, Request, Response, Error)
+                   VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
+                request_type, model, input_tokens, output_tokens, duration_ms,
+                1 if success else 0,
+                request[:4000] if request else "",
+                response[:4000] if response else None,
+                error[:4000] if error else None,
+            )
+            cursor.close()
+        finally:
+            conn.close()
+
     async def close(self):
         """No persistent connection to close (connections are per-operation)."""
         pass
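
Since token counts are stored per call, the "tracking usage" half of the
commit message reduces to an aggregate over this table. A hypothetical
companion helper in the same pyodbc style (not part of this commit):

    def usage_by_model(conn, days: int = 7):
        """Calls, token totals, and average latency per model over N days."""
        cursor = conn.cursor()
        cursor.execute(
            """SELECT Model,
                      COUNT(*) AS Calls,
                      SUM(InputTokens) AS TotalInputTokens,
                      SUM(OutputTokens) AS TotalOutputTokens,
                      AVG(DurationMs) AS AvgDurationMs
               FROM LlmLog
               WHERE CreatedAt >= DATEADD(day, ?, SYSUTCDATETIME())
               GROUP BY Model
               ORDER BY Calls DESC""",
            -days,
        )
        rows = cursor.fetchall()
        cursor.close()
        return rows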
+88 -11
@@ -2,6 +2,7 @@ import asyncio
 import base64
 import json
 import logging
+import time
 
 from pathlib import Path
 from openai import AsyncOpenAI
@@ -97,9 +98,10 @@ ANALYSIS_TOOL = {
 class LLMClient:
-    def __init__(self, base_url: str, model: str, api_key: str = "not-needed"):
+    def __init__(self, base_url: str, model: str, api_key: str = "not-needed", db=None):
         self.model = model
         self.host = base_url.rstrip("/")
+        self._db = db
         self._client = AsyncOpenAI(
             base_url=f"{self.host}/v1",
             api_key=api_key,
@@ -107,6 +109,24 @@ class LLMClient:
         )
         self._semaphore = asyncio.Semaphore(1)  # serialize requests to avoid overloading
 
+    def _log_llm(self, request_type: str, duration_ms: int, success: bool,
+                 request: str, response: str | None = None, error: str | None = None,
+                 input_tokens: int | None = None, output_tokens: int | None = None):
+        """Fire-and-forget LLM log entry to the database."""
+        if not self._db:
+            return
+        asyncio.create_task(self._db.save_llm_log(
+            request_type=request_type,
+            model=self.model,
+            duration_ms=duration_ms,
+            success=success,
+            request=request,
+            response=response,
+            error=error,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+        ))
+
     async def close(self):
         await self._client.close()
@@ -119,7 +139,13 @@
             user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
         if channel_context:
             user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
-        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
+        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n/no_think"
+
+        req_json = json.dumps([
+            {"role": "system", "content": SYSTEM_PROMPT[:500]},
+            {"role": "user", "content": user_content},
+        ], default=str)
+        t0 = time.monotonic()
 
         async with self._semaphore:
             try:
@@ -132,26 +158,38 @@
                     tools=[ANALYSIS_TOOL],
                     tool_choice={"type": "function", "function": {"name": "report_analysis"}},
                     temperature=0.1,
-                    max_tokens=1024,
+                    max_tokens=2048,
                 )
+                elapsed = int((time.monotonic() - t0) * 1000)
                 choice = response.choices[0]
+                usage = response.usage
 
                 # Extract tool call arguments
                 if choice.message.tool_calls:
                     tool_call = choice.message.tool_calls[0]
-                    args = json.loads(tool_call.function.arguments)
+                    resp_text = tool_call.function.arguments
+                    args = json.loads(resp_text)
+                    self._log_llm("analysis", elapsed, True, req_json, resp_text,
+                                  input_tokens=usage.prompt_tokens if usage else None,
+                                  output_tokens=usage.completion_tokens if usage else None)
                     return self._validate_result(args)
 
                 # Fallback: try parsing the message content as JSON
                 if choice.message.content:
+                    self._log_llm("analysis", elapsed, True, req_json, choice.message.content,
+                                  input_tokens=usage.prompt_tokens if usage else None,
+                                  output_tokens=usage.completion_tokens if usage else None)
                     return self._parse_content_fallback(choice.message.content)
 
                 logger.warning("No tool call or content in LLM response.")
+                self._log_llm("analysis", elapsed, False, req_json, error="Empty response")
                 return None
             except Exception as e:
+                elapsed = int((time.monotonic() - t0) * 1000)
                 logger.error("LLM analysis error: %s", e)
+                self._log_llm("analysis", elapsed, False, req_json, error=str(e))
                 return None
 
     def _validate_result(self, result: dict) -> dict:
@@ -219,16 +257,29 @@
         first content token arrives (useful for triggering the typing indicator
         only after the model starts generating).
         """
+        # Append /no_think to the last user message to disable thinking on Qwen3
+        patched = []
+        for m in messages:
+            patched.append(m)
+        if patched and patched[-1].get("role") == "user":
+            patched[-1] = {**patched[-1], "content": patched[-1]["content"] + "\n/no_think"}
+
+        req_json = json.dumps([
+            {"role": "system", "content": system_prompt[:500]},
+            *[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in patched],
+        ], default=str)
+        t0 = time.monotonic()
 
         async with self._semaphore:
             try:
                 stream = await self._client.chat.completions.create(
                     model=self.model,
                     messages=[
                         {"role": "system", "content": system_prompt},
-                        *messages,
+                        *patched,
                     ],
                     temperature=0.8,
-                    max_tokens=300,
+                    max_tokens=2048,
                     stream=True,
                 )
@@ -243,9 +294,13 @@
                         chunks.append(delta.content)
 
                 content = "".join(chunks).strip()
+                elapsed = int((time.monotonic() - t0) * 1000)
+                self._log_llm("chat", elapsed, bool(content), req_json, content or None)
                 return content if content else None
             except Exception as e:
+                elapsed = int((time.monotonic() - t0) * 1000)
                 logger.error("LLM chat error: %s", e)
+                self._log_llm("chat", elapsed, False, req_json, error=str(e))
                 return None
 
     async def analyze_image(
@@ -265,8 +320,13 @@
         user_content: list[dict] = [
             {"type": "image_url", "image_url": {"url": data_url}},
         ]
-        if user_text:
-            user_content.append({"type": "text", "text": user_text})
+        user_content.append({"type": "text", "text": (user_text or "") + "\n/no_think"})
+
+        req_json = json.dumps([
+            {"role": "system", "content": system_prompt[:500]},
+            {"role": "user", "content": f"[image {len(image_bytes)} bytes] {user_text or ''}"},
+        ], default=str)
+        t0 = time.monotonic()
 
         async with self._semaphore:
             try:
@@ -277,7 +337,7 @@
                         {"role": "user", "content": user_content},
                     ],
                     temperature=0.8,
-                    max_tokens=500,
+                    max_tokens=2048,
                     stream=True,
                 )
@@ -292,9 +352,13 @@
                         chunks.append(delta.content)
 
                 content = "".join(chunks).strip()
+                elapsed = int((time.monotonic() - t0) * 1000)
+                self._log_llm("image", elapsed, bool(content), req_json, content or None)
                 return content if content else None
             except Exception as e:
+                elapsed = int((time.monotonic() - t0) * 1000)
                 logger.error("LLM image analysis error: %s", e)
+                self._log_llm("image", elapsed, False, req_json, error=str(e))
                 return None
 
     async def raw_analyze(
@@ -307,7 +371,13 @@
             user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
         if channel_context:
             user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
-        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}"
+        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n/no_think"
+
+        req_json = json.dumps([
+            {"role": "system", "content": SYSTEM_PROMPT[:500]},
+            {"role": "user", "content": user_content},
+        ], default=str)
+        t0 = time.monotonic()
 
         async with self._semaphore:
             try:
@@ -320,10 +390,12 @@
                     tools=[ANALYSIS_TOOL],
                     tool_choice={"type": "function", "function": {"name": "report_analysis"}},
                     temperature=0.1,
-                    max_tokens=1024,
+                    max_tokens=2048,
                 )
+                elapsed = int((time.monotonic() - t0) * 1000)
                 choice = response.choices[0]
+                usage = response.usage
 
                 parts = []
                 parsed = None
@@ -342,7 +414,12 @@
                     parsed = self._parse_content_fallback(choice.message.content)
 
                 raw = "\n".join(parts) or "(empty response)"
+                self._log_llm("raw_analyze", elapsed, parsed is not None, req_json, raw,
+                              input_tokens=usage.prompt_tokens if usage else None,
+                              output_tokens=usage.completion_tokens if usage else None)
                 return raw, parsed
             except Exception as e:
+                elapsed = int((time.monotonic() - t0) * 1000)
+                self._log_llm("raw_analyze", elapsed, False, req_json, error=str(e))
                 return f"Error: {e}", None