diff --git a/.env.example b/.env.example index 8f5efbe..22ea6c3 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,7 @@ DISCORD_BOT_TOKEN=your_token_here -LLM_BASE_URL=http://athena.lan:11434 -LLM_MODEL=Qwen3-VL-32B-Thinking-Q8_0 -LLM_API_KEY=not-needed +LLM_BASE_URL= +LLM_MODEL=gpt-4o-mini +LLM_ESCALATION_MODEL=gpt-4o +LLM_API_KEY=your_openai_api_key_here MSSQL_SA_PASSWORD=YourStrong!Passw0rd DB_CONNECTION_STRING=DRIVER={ODBC Driver 18 for SQL Server};SERVER=localhost,1433;DATABASE=BreehaviorMonitor;UID=sa;PWD=YourStrong!Passw0rd;TrustServerCertificate=yes diff --git a/bot.py b/bot.py index 47b7353..df65f0a 100644 --- a/bot.py +++ b/bot.py @@ -68,14 +68,14 @@ class BCSBot(commands.Bot): # Database (initialized async in setup_hook) self.db = Database() - # LLM clients (OpenAI-compatible — works with llama.cpp, Ollama, or OpenAI) - llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434") - llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0") - llm_api_key = os.getenv("LLM_API_KEY", "not-needed") + # LLM clients (OpenAI — set LLM_BASE_URL to override for local models) + llm_base_url = os.getenv("LLM_BASE_URL", "") + llm_model = os.getenv("LLM_MODEL", "gpt-4o-mini") + llm_api_key = os.getenv("LLM_API_KEY", "") self.llm = LLMClient(llm_base_url, llm_model, llm_api_key, db=self.db) # Heavy/escalation model for re-analysis, chat, and manual commands - llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", llm_model) + llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", "gpt-4o") self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key, db=self.db) # Active mode (server-wide) @@ -118,17 +118,16 @@ class BCSBot(commands.Bot): await self.tree.sync() logger.info("Slash commands synced.") - # Warm up the LLM so the model is loaded into VRAM before messages arrive - logger.info("Warming up LLM model...") + # Quick connectivity check try: - resp = await self.llm._client.chat.completions.create( + await self.llm._client.chat.completions.create( model=self.llm.model, messages=[{"role": "user", "content": "hi"}], max_tokens=1, ) - logger.info("LLM warm-up complete.") + logger.info("LLM connectivity check passed.") except Exception as e: - logger.warning("LLM warm-up failed — first messages may be slow: %s", e) + logger.warning("LLM connectivity check failed: %s", e) async def on_message(self, message: discord.Message): logger.info( diff --git a/utils/llm_client.py b/utils/llm_client.py index 1d1628f..dc5bbea 100644 --- a/utils/llm_client.py +++ b/utils/llm_client.py @@ -132,12 +132,11 @@ class LLMClient: self.model = model self.host = base_url.rstrip("/") self._db = db - self._client = AsyncOpenAI( - base_url=f"{self.host}/v1", - api_key=api_key, - timeout=600.0, # 10 min — first request loads model into VRAM - ) - self._semaphore = asyncio.Semaphore(1) # serialize requests to avoid overloading + client_kwargs = {"api_key": api_key, "timeout": 120.0} + if self.host: + client_kwargs["base_url"] = f"{self.host}/v1" + self._client = AsyncOpenAI(**client_kwargs) + self._semaphore = asyncio.Semaphore(4) def _log_llm(self, request_type: str, duration_ms: int, success: bool, request: str, response: str | None = None, error: str | None = None, @@ -169,7 +168,7 @@ class LLMClient: user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n" if channel_context: user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n" - user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n/no_think" + user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n" req_json = json.dumps([ {"role": "system", "content": SYSTEM_PROMPT[:500]}, @@ -300,16 +299,9 @@ class LLMClient: first content token arrives (useful for triggering the typing indicator only after the model starts generating). """ - # Append /no_think to the last user message to disable thinking on Qwen3 - patched = [] - for m in messages: - patched.append(m) - if patched and patched[-1].get("role") == "user": - patched[-1] = {**patched[-1], "content": patched[-1]["content"] + "\n/no_think"} - req_json = json.dumps([ {"role": "system", "content": system_prompt[:500]}, - *[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in patched], + *[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in messages], ], default=str) t0 = time.monotonic() @@ -319,7 +311,7 @@ class LLMClient: model=self.model, messages=[ {"role": "system", "content": system_prompt}, - *patched, + *messages, ], temperature=0.8, max_tokens=2048, @@ -363,7 +355,8 @@ class LLMClient: user_content: list[dict] = [ {"type": "image_url", "image_url": {"url": data_url}}, ] - user_content.append({"type": "text", "text": (user_text or "") + "\n/no_think"}) + if user_text: + user_content.append({"type": "text", "text": user_text}) req_json = json.dumps([ {"role": "system", "content": system_prompt[:500]}, @@ -422,7 +415,7 @@ class LLMClient: user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n" if channel_context: user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n" - user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n/no_think" + user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n" req_json = json.dumps([ {"role": "system", "content": SYSTEM_PROMPT[:500]},