Switch LLM backend from llama.cpp/Qwen to OpenAI
- Default models: gpt-4o-mini (triage), gpt-4o (escalation)
- Remove Qwen-specific /no_think hacks
- Reduce timeout from 600s to 120s, increase concurrency semaphore to 4
- Support empty LLM_BASE_URL to use OpenAI directly

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
DISCORD_BOT_TOKEN=your_token_here
|
DISCORD_BOT_TOKEN=your_token_here
|
||||||
LLM_BASE_URL=http://athena.lan:11434
|
LLM_BASE_URL=
|
||||||
LLM_MODEL=Qwen3-VL-32B-Thinking-Q8_0
|
LLM_MODEL=gpt-4o-mini
|
||||||
LLM_API_KEY=not-needed
|
LLM_ESCALATION_MODEL=gpt-4o
|
||||||
|
LLM_API_KEY=your_openai_api_key_here
|
||||||
MSSQL_SA_PASSWORD=YourStrong!Passw0rd
|
MSSQL_SA_PASSWORD=YourStrong!Passw0rd
|
||||||
DB_CONNECTION_STRING=DRIVER={ODBC Driver 18 for SQL Server};SERVER=localhost,1433;DATABASE=BreehaviorMonitor;UID=sa;PWD=YourStrong!Passw0rd;TrustServerCertificate=yes
|
DB_CONNECTION_STRING=DRIVER={ODBC Driver 18 for SQL Server};SERVER=localhost,1433;DATABASE=BreehaviorMonitor;UID=sa;PWD=YourStrong!Passw0rd;TrustServerCertificate=yes
|
||||||
|
|||||||
19
bot.py
19
bot.py
@@ -68,14 +68,14 @@ class BCSBot(commands.Bot):
|
|||||||
# Database (initialized async in setup_hook)
|
# Database (initialized async in setup_hook)
|
||||||
self.db = Database()
|
self.db = Database()
|
||||||
|
|
||||||
# LLM clients (OpenAI-compatible — works with llama.cpp, Ollama, or OpenAI)
|
# LLM clients (OpenAI — set LLM_BASE_URL to override for local models)
|
||||||
llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434")
|
llm_base_url = os.getenv("LLM_BASE_URL", "")
|
||||||
llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0")
|
llm_model = os.getenv("LLM_MODEL", "gpt-4o-mini")
|
||||||
llm_api_key = os.getenv("LLM_API_KEY", "not-needed")
|
llm_api_key = os.getenv("LLM_API_KEY", "")
|
||||||
self.llm = LLMClient(llm_base_url, llm_model, llm_api_key, db=self.db)
|
self.llm = LLMClient(llm_base_url, llm_model, llm_api_key, db=self.db)
|
||||||
|
|
||||||
# Heavy/escalation model for re-analysis, chat, and manual commands
|
# Heavy/escalation model for re-analysis, chat, and manual commands
|
||||||
llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", llm_model)
|
llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", "gpt-4o")
|
||||||
self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key, db=self.db)
|
self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key, db=self.db)
|
||||||
|
|
||||||
# Active mode (server-wide)
|
# Active mode (server-wide)
|
||||||
@@ -118,17 +118,16 @@ class BCSBot(commands.Bot):
|
|||||||
await self.tree.sync()
|
await self.tree.sync()
|
||||||
logger.info("Slash commands synced.")
|
logger.info("Slash commands synced.")
|
||||||
|
|
||||||
# Warm up the LLM so the model is loaded into VRAM before messages arrive
|
# Quick connectivity check
|
||||||
logger.info("Warming up LLM model...")
|
|
||||||
try:
|
try:
|
||||||
resp = await self.llm._client.chat.completions.create(
|
await self.llm._client.chat.completions.create(
|
||||||
model=self.llm.model,
|
model=self.llm.model,
|
||||||
messages=[{"role": "user", "content": "hi"}],
|
messages=[{"role": "user", "content": "hi"}],
|
||||||
max_tokens=1,
|
max_tokens=1,
|
||||||
)
|
)
|
||||||
logger.info("LLM warm-up complete.")
|
logger.info("LLM connectivity check passed.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("LLM warm-up failed — first messages may be slow: %s", e)
|
logger.warning("LLM connectivity check failed: %s", e)
|
||||||
|
|
||||||
async def on_message(self, message: discord.Message):
|
async def on_message(self, message: discord.Message):
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|||||||
@@ -132,12 +132,11 @@ class LLMClient:
|
|||||||
self.model = model
|
self.model = model
|
||||||
self.host = base_url.rstrip("/")
|
self.host = base_url.rstrip("/")
|
||||||
self._db = db
|
self._db = db
|
||||||
self._client = AsyncOpenAI(
|
client_kwargs = {"api_key": api_key, "timeout": 120.0}
|
||||||
base_url=f"{self.host}/v1",
|
if self.host:
|
||||||
api_key=api_key,
|
client_kwargs["base_url"] = f"{self.host}/v1"
|
||||||
timeout=600.0, # 10 min — first request loads model into VRAM
|
self._client = AsyncOpenAI(**client_kwargs)
|
||||||
)
|
self._semaphore = asyncio.Semaphore(4)
|
||||||
self._semaphore = asyncio.Semaphore(1) # serialize requests to avoid overloading
|
|
||||||
|
|
||||||
def _log_llm(self, request_type: str, duration_ms: int, success: bool,
|
def _log_llm(self, request_type: str, duration_ms: int, success: bool,
|
||||||
request: str, response: str | None = None, error: str | None = None,
|
request: str, response: str | None = None, error: str | None = None,
|
||||||
@@ -169,7 +168,7 @@ class LLMClient:
|
|||||||
user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
|
user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
|
||||||
if channel_context:
|
if channel_context:
|
||||||
user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
|
user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
|
||||||
user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n/no_think"
|
user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n"
|
||||||
|
|
||||||
req_json = json.dumps([
|
req_json = json.dumps([
|
||||||
{"role": "system", "content": SYSTEM_PROMPT[:500]},
|
{"role": "system", "content": SYSTEM_PROMPT[:500]},
|
||||||
@@ -300,16 +299,9 @@ class LLMClient:
|
|||||||
first content token arrives (useful for triggering the typing indicator
|
first content token arrives (useful for triggering the typing indicator
|
||||||
only after the model starts generating).
|
only after the model starts generating).
|
||||||
"""
|
"""
|
||||||
# Append /no_think to the last user message to disable thinking on Qwen3
|
|
||||||
patched = []
|
|
||||||
for m in messages:
|
|
||||||
patched.append(m)
|
|
||||||
if patched and patched[-1].get("role") == "user":
|
|
||||||
patched[-1] = {**patched[-1], "content": patched[-1]["content"] + "\n/no_think"}
|
|
||||||
|
|
||||||
req_json = json.dumps([
|
req_json = json.dumps([
|
||||||
{"role": "system", "content": system_prompt[:500]},
|
{"role": "system", "content": system_prompt[:500]},
|
||||||
*[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in patched],
|
*[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in messages],
|
||||||
], default=str)
|
], default=str)
|
||||||
t0 = time.monotonic()
|
t0 = time.monotonic()
|
||||||
|
|
||||||
@@ -319,7 +311,7 @@ class LLMClient:
|
|||||||
model=self.model,
|
model=self.model,
|
||||||
messages=[
|
messages=[
|
||||||
{"role": "system", "content": system_prompt},
|
{"role": "system", "content": system_prompt},
|
||||||
*patched,
|
*messages,
|
||||||
],
|
],
|
||||||
temperature=0.8,
|
temperature=0.8,
|
||||||
max_tokens=2048,
|
max_tokens=2048,
|
||||||
@@ -363,7 +355,8 @@ class LLMClient:
|
|||||||
user_content: list[dict] = [
|
user_content: list[dict] = [
|
||||||
{"type": "image_url", "image_url": {"url": data_url}},
|
{"type": "image_url", "image_url": {"url": data_url}},
|
||||||
]
|
]
|
||||||
user_content.append({"type": "text", "text": (user_text or "") + "\n/no_think"})
|
if user_text:
|
||||||
|
user_content.append({"type": "text", "text": user_text})
|
||||||
|
|
||||||
req_json = json.dumps([
|
req_json = json.dumps([
|
||||||
{"role": "system", "content": system_prompt[:500]},
|
{"role": "system", "content": system_prompt[:500]},
|
||||||
@@ -422,7 +415,7 @@ class LLMClient:
|
|||||||
user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
|
user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
|
||||||
if channel_context:
|
if channel_context:
|
||||||
user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
|
user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
|
||||||
user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n/no_think"
|
user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n"
|
||||||
|
|
||||||
req_json = json.dumps([
|
req_json = json.dumps([
|
||||||
{"role": "system", "content": SYSTEM_PROMPT[:500]},
|
{"role": "system", "content": SYSTEM_PROMPT[:500]},
|
||||||
|
|||||||
Reference in New Issue
Block a user