From b41020014672efb59596f4ff434fd6c7628d6635 Mon Sep 17 00:00:00 2001
From: AJ Isaacs
Date: Sat, 21 Feb 2026 14:17:59 -0500
Subject: [PATCH] Add max_tokens=1024 to LLM analysis calls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The analyze_message and raw_analyze methods had no max_tokens limit,
causing thinking models (Qwen3-VL-32B-Thinking) to generate unlimited
reasoning tokens before responding — taking 5+ minutes per message.

Co-Authored-By: Claude Opus 4.6
---
 utils/llm_client.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utils/llm_client.py b/utils/llm_client.py
index ea9cf59..760c5eb 100644
--- a/utils/llm_client.py
+++ b/utils/llm_client.py
@@ -123,6 +123,7 @@ class LLMClient:
                 tools=[ANALYSIS_TOOL],
                 tool_choice={"type": "function", "function": {"name": "report_analysis"}},
                 temperature=0.1,
+                max_tokens=1024,
             )

             choice = response.choices[0]
@@ -255,6 +256,7 @@ class LLMClient:
                 tools=[ANALYSIS_TOOL],
                 tool_choice={"type": "function", "function": {"name": "report_analysis"}},
                 temperature=0.1,
+                max_tokens=1024,
             )

             choice = response.choices[0]