diff --git a/.env.example b/.env.example
index 8f5efbe..22ea6c3 100644
--- a/.env.example
+++ b/.env.example
@@ -1,6 +1,7 @@
 DISCORD_BOT_TOKEN=your_token_here
-LLM_BASE_URL=http://athena.lan:11434
-LLM_MODEL=Qwen3-VL-32B-Thinking-Q8_0
-LLM_API_KEY=not-needed
+LLM_BASE_URL=
+LLM_MODEL=gpt-4o-mini
+LLM_ESCALATION_MODEL=gpt-4o
+LLM_API_KEY=your_openai_api_key_here
 MSSQL_SA_PASSWORD=YourStrong!Passw0rd
 DB_CONNECTION_STRING=DRIVER={ODBC Driver 18 for SQL Server};SERVER=localhost,1433;DATABASE=BreehaviorMonitor;UID=sa;PWD=YourStrong!Passw0rd;TrustServerCertificate=yes
diff --git a/bot.py b/bot.py
index 47b7353..df65f0a 100644
--- a/bot.py
+++ b/bot.py
@@ -68,14 +68,14 @@ class BCSBot(commands.Bot):
         # Database (initialized async in setup_hook)
         self.db = Database()
 
-        # LLM clients (OpenAI-compatible — works with llama.cpp, Ollama, or OpenAI)
-        llm_base_url = os.getenv("LLM_BASE_URL", "http://athena.lan:11434")
-        llm_model = os.getenv("LLM_MODEL", "Qwen3-VL-32B-Thinking-Q8_0")
-        llm_api_key = os.getenv("LLM_API_KEY", "not-needed")
+        # LLM clients (OpenAI by default — set LLM_BASE_URL to target a local OpenAI-compatible server)
+        llm_base_url = os.getenv("LLM_BASE_URL", "")
+        llm_model = os.getenv("LLM_MODEL", "gpt-4o-mini")
+        llm_api_key = os.getenv("LLM_API_KEY", "")
         self.llm = LLMClient(llm_base_url, llm_model, llm_api_key, db=self.db)
 
         # Heavy/escalation model for re-analysis, chat, and manual commands
-        llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", llm_model)
+        llm_heavy_model = os.getenv("LLM_ESCALATION_MODEL", "gpt-4o")
         self.llm_heavy = LLMClient(llm_base_url, llm_heavy_model, llm_api_key, db=self.db)
 
         # Active mode (server-wide)
@@ -118,17 +118,16 @@ class BCSBot(commands.Bot):
         await self.tree.sync()
         logger.info("Slash commands synced.")
 
-        # Warm up the LLM so the model is loaded into VRAM before messages arrive
-        logger.info("Warming up LLM model...")
+        # Quick connectivity check: send a 1-token request; a failure is only logged, not raised
         try:
-            resp = await self.llm._client.chat.completions.create(
+            await self.llm._client.chat.completions.create(
                 model=self.llm.model,
                 messages=[{"role": "user", "content": "hi"}],
                 max_tokens=1,
             )
-            logger.info("LLM warm-up complete.")
+            logger.info("LLM connectivity check passed.")
         except Exception as e:
-            logger.warning("LLM warm-up failed — first messages may be slow: %s", e)
+            logger.warning("LLM connectivity check failed: %s", e)
 
     async def on_message(self, message: discord.Message):
         logger.info(
diff --git a/utils/llm_client.py b/utils/llm_client.py
index 1d1628f..dc5bbea 100644
--- a/utils/llm_client.py
+++ b/utils/llm_client.py
@@ -132,12 +132,11 @@ class LLMClient:
         self.model = model
         self.host = base_url.rstrip("/")
         self._db = db
-        self._client = AsyncOpenAI(
-            base_url=f"{self.host}/v1",
-            api_key=api_key,
-            timeout=600.0,  # 10 min — first request loads model into VRAM
-        )
-        self._semaphore = asyncio.Semaphore(1)  # serialize requests to avoid overloading
+        client_kwargs = {"api_key": api_key, "timeout": 120.0}
+        if self.host:
+            client_kwargs["base_url"] = f"{self.host}/v1"
+        self._client = AsyncOpenAI(**client_kwargs)
+        self._semaphore = asyncio.Semaphore(4)
 
     def _log_llm(self, request_type: str, duration_ms: int, success: bool,
                  request: str, response: str | None = None, error: str | None = None,
@@ -169,7 +168,7 @@ class LLMClient:
             user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
         if channel_context:
             user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
-        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n/no_think"
+        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n"
 
         req_json = json.dumps([
             {"role": "system", "content": SYSTEM_PROMPT[:500]},
@@ -300,16 +299,9 @@ class LLMClient:
         first content token arrives (useful for triggering the typing indicator
         only after the model starts generating).
         """
-        # Append /no_think to the last user message to disable thinking on Qwen3
-        patched = []
-        for m in messages:
-            patched.append(m)
-        if patched and patched[-1].get("role") == "user":
-            patched[-1] = {**patched[-1], "content": patched[-1]["content"] + "\n/no_think"}
-
         req_json = json.dumps([
             {"role": "system", "content": system_prompt[:500]},
-            *[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in patched],
+            *[{"role": m["role"], "content": str(m.get("content", ""))[:200]} for m in messages],
         ], default=str)
         t0 = time.monotonic()
 
@@ -319,7 +311,7 @@ class LLMClient:
                     model=self.model,
                     messages=[
                         {"role": "system", "content": system_prompt},
-                        *patched,
+                        *messages,
                     ],
                     temperature=0.8,
                     max_tokens=2048,
@@ -363,7 +355,8 @@ class LLMClient:
         user_content: list[dict] = [
             {"type": "image_url", "image_url": {"url": data_url}},
         ]
-        user_content.append({"type": "text", "text": (user_text or "") + "\n/no_think"})
+        if user_text:
+            user_content.append({"type": "text", "text": user_text})
 
         req_json = json.dumps([
             {"role": "system", "content": system_prompt[:500]},
@@ -422,7 +415,7 @@ class LLMClient:
             user_content += f"=== NOTES ABOUT THIS USER (from prior analysis) ===\n{user_notes}\n\n"
         if channel_context:
             user_content += f"=== CHANNEL INFO ===\n{channel_context}\n\n"
-        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n/no_think"
+        user_content += f"=== TARGET MESSAGE (analyze THIS message only) ===\n{message}\n"
 
         req_json = json.dumps([
             {"role": "system", "content": SYSTEM_PROMPT[:500]},