From b41020014672efb59596f4ff434fd6c7628d6635 Mon Sep 17 00:00:00 2001
From: AJ Isaacs <ajisaacs27@gmail.com>
Date: Sat, 21 Feb 2026 14:17:59 -0500
Subject: [PATCH] Add max_tokens=1024 to LLM analysis calls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The analyze_message and raw_analyze methods set no max_tokens limit, so
thinking models (Qwen3-VL-32B-Thinking) could generate an unbounded
number of reasoning tokens before responding — taking 5+ minutes per
message. Cap both calls at max_tokens=1024.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 utils/llm_client.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utils/llm_client.py b/utils/llm_client.py
index ea9cf59..760c5eb 100644
--- a/utils/llm_client.py
+++ b/utils/llm_client.py
@@ -123,6 +123,7 @@ class LLMClient:
                     tools=[ANALYSIS_TOOL],
                     tool_choice={"type": "function", "function": {"name": "report_analysis"}},
                     temperature=0.1,
+                    max_tokens=1024,
                 )
 
                 choice = response.choices[0]
@@ -255,6 +256,7 @@ class LLMClient:
                     tools=[ANALYSIS_TOOL],
                     tool_choice={"type": "function", "function": {"name": "report_analysis"}},
                     temperature=0.1,
+                    max_tokens=1024,
                 )
 
                 choice = response.choices[0]