Feature: Add tool-use support to AI vision clients
Add IAIToolAwareVisionClient interface and OpenAIToolUseHelper for function-calling via /v1/chat/completions. OpenAI and LlamaCpp clients now support multi-round tool calls, letting the AI query the database during receipt image analysis. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
using MoneyMap.Services.AITools;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Serialization;
|
||||
@@ -29,9 +30,226 @@ namespace MoneyMap.Services
|
||||
}
|
||||
|
||||
/// <summary>
/// Extended interface for vision clients that support tool use / function calling.
/// Implementations run a multi-round conversation: the model may request tool
/// calls, the supplied executor runs them, and results are fed back until the
/// model produces a final answer (or maxToolRounds is exhausted).
/// </summary>
public interface IAIToolAwareVisionClient : IAIVisionClient
{
    /// <summary>Whether the current model/endpoint supports tool calling.</summary>
    bool SupportsToolUse { get; }

    /// <summary>
    /// Analyze an image with tool calling enabled.
    /// </summary>
    /// <param name="base64Image">Base64-encoded image payload.</param>
    /// <param name="mediaType">MIME type of the image (e.g. "image/png").</param>
    /// <param name="prompt">Instruction prompt sent alongside the image.</param>
    /// <param name="model">Model identifier understood by the provider.</param>
    /// <param name="tools">Tool definitions the model may invoke.</param>
    /// <param name="toolExecutor">Callback that executes a single requested tool call.</param>
    /// <param name="maxToolRounds">Maximum tool-call rounds before giving up.</param>
    Task<VisionApiResult> AnalyzeImageWithToolsAsync(
        string base64Image,
        string mediaType,
        string prompt,
        string model,
        List<AIToolDefinition> tools,
        Func<AIToolCall, Task<AIToolResult>> toolExecutor,
        int maxToolRounds = 5);
}
|
||||
|
||||
/// <summary>
/// Shared helper for the OpenAI-compatible tool-use wire format.
/// Used by both OpenAIVisionClient and LlamaCppVisionClient since they share /v1/chat/completions.
/// </summary>
public static class OpenAIToolUseHelper
{
    /// <summary>
    /// Convert AIToolDefinitions to the OpenAI "tools" array format
    /// (type=function wrappers with a JSON-Schema-style parameters object).
    /// </summary>
    public static List<object> BuildToolsArray(List<AIToolDefinition> tools)
    {
        return tools.Select(t => (object)new
        {
            type = "function",
            function = new
            {
                name = t.Name,
                description = t.Description,
                parameters = new
                {
                    type = "object",
                    properties = t.Parameters.ToDictionary(
                        p => p.Name,
                        p => (object)new { type = p.Type, description = p.Description }
                    ),
                    required = t.Parameters.Where(p => p.Required).Select(p => p.Name).ToArray()
                }
            }
        }).ToList();
    }

    /// <summary>
    /// Execute the tool-use loop for OpenAI-compatible /v1/chat/completions endpoints.
    /// Sends the conversation, executes any requested tool calls via
    /// <paramref name="toolExecutor"/>, appends the results, and repeats until the
    /// model returns a final answer or the round budget is exhausted.
    /// </summary>
    /// <param name="httpClient">Client to POST with; headers are (re)applied each round.</param>
    /// <param name="apiUrl">Full chat-completions endpoint URL.</param>
    /// <param name="configureHeaders">Applies auth/headers to the client before each request.</param>
    /// <param name="model">Model identifier sent in the request body.</param>
    /// <param name="initialMessages">Starting conversation (copied; not mutated).</param>
    /// <param name="toolsArray">Pre-built tools array (see BuildToolsArray).</param>
    /// <param name="toolExecutor">Callback that runs one tool call and returns its result.</param>
    /// <param name="maxToolRounds">Rounds that may include tools; one extra tools-free round follows.</param>
    /// <param name="maxTokens">max_tokens for each request.</param>
    /// <param name="logger">Logger for errors and tool-call progress.</param>
    public static async Task<VisionApiResult> ExecuteWithToolsAsync(
        HttpClient httpClient,
        string apiUrl,
        Action<HttpClient> configureHeaders,
        string model,
        List<object> initialMessages,
        List<object> toolsArray,
        Func<AIToolCall, Task<AIToolResult>> toolExecutor,
        int maxToolRounds,
        int maxTokens,
        ILogger logger)
    {
        // Build mutable message list so tool turns can be appended without
        // touching the caller's list.
        var messages = new List<object>(initialMessages);

        // <= so there is one final round where tools are omitted, forcing an answer.
        for (int round = 0; round <= maxToolRounds; round++)
        {
            var requestBody = new Dictionary<string, object>
            {
                ["model"] = model,
                ["messages"] = messages,
                ["max_tokens"] = maxTokens,
                ["temperature"] = 0.1
            };

            // Only include tools if we haven't exhausted rounds; omitting them on the
            // last round prevents the model from requesting yet another call.
            if (round < maxToolRounds && toolsArray.Count > 0)
            {
                requestBody["tools"] = toolsArray;
                requestBody["tool_choice"] = "auto";
            }

            configureHeaders(httpClient);
            var json = JsonSerializer.Serialize(requestBody);
            var content = new StringContent(json, Encoding.UTF8, "application/json");

            var response = await httpClient.PostAsync(apiUrl, content);

            if (!response.IsSuccessStatusCode)
            {
                var errorContent = await response.Content.ReadAsStringAsync();
                logger.LogError("API error ({StatusCode}): {Error}", response.StatusCode, errorContent);
                return VisionApiResult.Failure($"API error ({response.StatusCode}): {errorContent}");
            }

            var responseJson = await response.Content.ReadAsStringAsync();
            var responseObj = JsonSerializer.Deserialize<JsonElement>(responseJson);

            // Robustness: some OpenAI-compatible servers return malformed or empty
            // payloads; fail with a clear message instead of throwing on GetProperty.
            if (!responseObj.TryGetProperty("choices", out var choices) ||
                choices.ValueKind != JsonValueKind.Array ||
                choices.GetArrayLength() == 0)
            {
                logger.LogError("API response contained no choices: {Json}", responseJson);
                return VisionApiResult.Failure("API response contained no choices.");
            }

            var choice = choices[0];
            var message = choice.GetProperty("message");
            // finish_reason is optional on some llama.cpp builds - treat missing as null.
            var finishReason = choice.TryGetProperty("finish_reason", out var fr)
                ? fr.GetString()
                : null;

            // Check for tool calls
            var hasToolCalls = message.TryGetProperty("tool_calls", out var toolCallsElement) &&
                               toolCallsElement.ValueKind == JsonValueKind.Array &&
                               toolCallsElement.GetArrayLength() > 0;

            if (hasToolCalls || finishReason == "tool_calls")
            {
                if (!hasToolCalls)
                {
                    // finish_reason says tool_calls but no tool_calls array - treat as final response
                    var fallbackContent = message.TryGetProperty("content", out var fc) ? fc.GetString() : null;
                    return VisionApiResult.Success(CleanJsonResponse(fallbackContent));
                }

                logger.LogInformation("Tool-use round {Round}: model requested {Count} tool calls",
                    round + 1, toolCallsElement.GetArrayLength());

                // Add the assistant message (with tool_calls) to conversation
                messages.Add(JsonSerializer.Deserialize<object>(message.GetRawText())!);

                // Execute each tool call and add results
                foreach (var tc in toolCallsElement.EnumerateArray())
                {
                    var toolCall = new AIToolCall
                    {
                        Id = tc.GetProperty("id").GetString() ?? "",
                        Name = tc.GetProperty("function").GetProperty("name").GetString() ?? "",
                        Arguments = ParseArguments(tc.GetProperty("function").GetProperty("arguments").GetString())
                    };

                    logger.LogInformation("Executing tool: {ToolName}", toolCall.Name);
                    var result = await toolExecutor(toolCall);

                    // Each result goes back as a role=tool message keyed by the call id.
                    messages.Add(new
                    {
                        role = "tool",
                        tool_call_id = toolCall.Id,
                        content = result.Content
                    });
                }

                continue; // Send another request with tool results
            }

            // No tool calls - extract final content
            var messageContent = message.TryGetProperty("content", out var contentElement)
                ? contentElement.GetString()
                : null;

            return VisionApiResult.Success(CleanJsonResponse(messageContent));
        }

        return VisionApiResult.Failure("Exceeded maximum tool-use rounds without getting a final response.");
    }

    /// <summary>
    /// Parse the model's JSON "arguments" string into a flat dictionary.
    /// Scalars are converted to strings (numbers via raw text); nested values
    /// keep their raw JSON. Malformed input yields an empty dictionary rather
    /// than failing the whole tool call.
    /// </summary>
    private static Dictionary<string, object?> ParseArguments(string? argsJson)
    {
        if (string.IsNullOrWhiteSpace(argsJson))
            return new();

        try
        {
            var element = JsonSerializer.Deserialize<JsonElement>(argsJson);
            var dict = new Dictionary<string, object?>();
            foreach (var prop in element.EnumerateObject())
            {
                dict[prop.Name] = prop.Value.ValueKind switch
                {
                    JsonValueKind.String => prop.Value.GetString(),
                    JsonValueKind.Number => prop.Value.GetRawText(),
                    JsonValueKind.True => "true",
                    JsonValueKind.False => "false",
                    JsonValueKind.Null => null,
                    _ => prop.Value.GetRawText()
                };
            }
            return dict;
        }
        catch
        {
            // Best-effort: a model emitting broken JSON should not crash the loop.
            return new();
        }
    }

    /// <summary>
    /// Normalize a model reply into (hopefully) bare JSON: strips markdown code
    /// fences and, if needed, extracts the outermost {...} span from wrapped text.
    /// </summary>
    public static string CleanJsonResponse(string? content)
    {
        var trimmed = content?.Trim() ?? "";

        // Strip markdown code fences
        if (trimmed.StartsWith("```json"))
        {
            trimmed = trimmed.Replace("```json", "").Replace("```", "").Trim();
        }
        else if (trimmed.StartsWith("```"))
        {
            trimmed = trimmed.Replace("```", "").Trim();
        }

        // If the response doesn't start with '{', try to extract the JSON object.
        // This handles HTML error pages, XML-wrapped responses, or other non-JSON wrapping.
        if (!trimmed.StartsWith("{"))
        {
            var firstBrace = trimmed.IndexOf('{');
            var lastBrace = trimmed.LastIndexOf('}');
            if (firstBrace >= 0 && lastBrace > firstBrace)
            {
                trimmed = trimmed[firstBrace..(lastBrace + 1)];
            }
        }

        return trimmed;
    }
}
|
||||
|
||||
/// <summary>
|
||||
/// OpenAI Vision API client with tool-use support.
|
||||
/// </summary>
|
||||
public class OpenAIVisionClient : IAIToolAwareVisionClient
|
||||
{
|
||||
private readonly HttpClient _httpClient;
|
||||
private readonly IConfiguration _configuration;
|
||||
@@ -44,12 +262,12 @@ namespace MoneyMap.Services
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>Always true: this client sends OpenAI-style tools on /v1/chat/completions.</summary>
public bool SupportsToolUse => true;
|
||||
|
||||
public async Task<VisionApiResult> AnalyzeImageAsync(string base64Image, string mediaType, string prompt, string model)
|
||||
{
|
||||
var apiKey = Environment.GetEnvironmentVariable("OPENAI_API_KEY")
|
||||
?? _configuration["OpenAI:ApiKey"];
|
||||
|
||||
if (string.IsNullOrWhiteSpace(apiKey))
|
||||
var apiKey = GetApiKey();
|
||||
if (apiKey == null)
|
||||
return VisionApiResult.Failure("OpenAI API key not configured. Set OPENAI_API_KEY environment variable or OpenAI:ApiKey in appsettings.json");
|
||||
|
||||
var requestBody = new
|
||||
@@ -101,7 +319,7 @@ namespace MoneyMap.Services
|
||||
.GetProperty("content")
|
||||
.GetString();
|
||||
|
||||
return VisionApiResult.Success(CleanJsonResponse(messageContent));
|
||||
return VisionApiResult.Success(OpenAIToolUseHelper.CleanJsonResponse(messageContent));
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
@@ -110,21 +328,66 @@ namespace MoneyMap.Services
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Analyze an image with OpenAI function calling enabled.
/// Builds the initial prompt+image message and delegates the multi-round
/// tool loop to <see cref="OpenAIToolUseHelper.ExecuteWithToolsAsync"/>.
/// </summary>
public async Task<VisionApiResult> AnalyzeImageWithToolsAsync(
    string base64Image, string mediaType, string prompt, string model,
    List<AIToolDefinition> tools, Func<AIToolCall, Task<AIToolResult>> toolExecutor,
    int maxToolRounds = 5)
{
    var apiKey = GetApiKey();
    if (apiKey == null)
        return VisionApiResult.Failure("OpenAI API key not configured.");

    // Single user message carrying the prompt text plus the inline data-URL image.
    var initialMessages = new List<object>
    {
        new
        {
            role = "user",
            content = new object[]
            {
                new { type = "text", text = prompt },
                new
                {
                    type = "image_url",
                    image_url = new { url = $"data:{mediaType};base64,{base64Image}" }
                }
            }
        }
    };

    try
    {
        return await OpenAIToolUseHelper.ExecuteWithToolsAsync(
            _httpClient,
            "https://api.openai.com/v1/chat/completions",
            client =>
            {
                // Headers are reset each round so stale auth never leaks between calls.
                client.DefaultRequestHeaders.Clear();
                client.DefaultRequestHeaders.Add("Authorization", $"Bearer {apiKey}");
            },
            model,
            initialMessages,
            OpenAIToolUseHelper.BuildToolsArray(tools),
            toolExecutor,
            maxToolRounds,
            maxTokens: 4096,
            _logger);
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "OpenAI tool-use call failed: {Message}", ex.Message);
        return VisionApiResult.Failure($"OpenAI API error: {ex.Message}");
    }
}
|
||||
|
||||
// Resolve the OpenAI API key: the environment variable wins, appsettings is the fallback.
private string? GetApiKey()
{
    var fromEnvironment = Environment.GetEnvironmentVariable("OPENAI_API_KEY");
    return fromEnvironment ?? _configuration["OpenAI:ApiKey"];
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Anthropic Claude Vision API client.
|
||||
/// Anthropic Claude Vision API client with tool-use support.
|
||||
/// </summary>
|
||||
public class ClaudeVisionClient : IAIVisionClient
|
||||
public class ClaudeVisionClient : IAIToolAwareVisionClient
|
||||
{
|
||||
private readonly HttpClient _httpClient;
|
||||
private readonly IConfiguration _configuration;
|
||||
@@ -137,12 +400,12 @@ namespace MoneyMap.Services
|
||||
_logger = logger;
|
||||
}
|
||||
|
||||
/// <summary>Always true: this client implements Anthropic-native tool use on /v1/messages.</summary>
public bool SupportsToolUse => true;
|
||||
|
||||
public async Task<VisionApiResult> AnalyzeImageAsync(string base64Image, string mediaType, string prompt, string model)
|
||||
{
|
||||
var apiKey = Environment.GetEnvironmentVariable("ANTHROPIC_API_KEY")
|
||||
?? _configuration["Anthropic:ApiKey"];
|
||||
|
||||
if (string.IsNullOrWhiteSpace(apiKey))
|
||||
var apiKey = GetApiKey();
|
||||
if (apiKey == null)
|
||||
return VisionApiResult.Failure("Anthropic API key not configured. Set ANTHROPIC_API_KEY environment variable or Anthropic:ApiKey in appsettings.json");
|
||||
|
||||
var requestBody = new
|
||||
@@ -174,10 +437,7 @@ namespace MoneyMap.Services
|
||||
|
||||
try
|
||||
{
|
||||
_httpClient.DefaultRequestHeaders.Clear();
|
||||
_httpClient.DefaultRequestHeaders.Add("x-api-key", apiKey);
|
||||
_httpClient.DefaultRequestHeaders.Add("anthropic-version", "2023-06-01");
|
||||
|
||||
ConfigureHeaders();
|
||||
var json = JsonSerializer.Serialize(requestBody);
|
||||
var content = new StringContent(json, Encoding.UTF8, "application/json");
|
||||
|
||||
@@ -198,7 +458,7 @@ namespace MoneyMap.Services
|
||||
.GetProperty("text")
|
||||
.GetString();
|
||||
|
||||
return VisionApiResult.Success(CleanJsonResponse(messageContent));
|
||||
return VisionApiResult.Success(OpenAIToolUseHelper.CleanJsonResponse(messageContent));
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
@@ -207,34 +467,225 @@ namespace MoneyMap.Services
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Analyze an image with Anthropic tool use enabled.
/// Runs the Messages API tool loop inline, because Anthropic's wire format
/// (input_schema, tool_use/tool_result content blocks) differs from the
/// OpenAI format handled by OpenAIToolUseHelper.
/// </summary>
public async Task<VisionApiResult> AnalyzeImageWithToolsAsync(
    string base64Image, string mediaType, string prompt, string model,
    List<AIToolDefinition> tools, Func<AIToolCall, Task<AIToolResult>> toolExecutor,
    int maxToolRounds = 5)
{
    var apiKey = GetApiKey();
    if (apiKey == null)
        return VisionApiResult.Failure("Anthropic API key not configured.");

    // Build Anthropic-format tools array (input_schema, not OpenAI's "parameters").
    var anthropicTools = tools.Select(t => new
    {
        name = t.Name,
        description = t.Description,
        input_schema = new
        {
            type = "object",
            properties = t.Parameters.ToDictionary(
                p => p.Name,
                p => (object)new { type = p.Type, description = p.Description }
            ),
            required = t.Parameters.Where(p => p.Required).Select(p => p.Name).ToArray()
        }
    }).ToList();

    // Initial message with image block first, then the prompt text.
    var messages = new List<object>
    {
        new
        {
            role = "user",
            content = new object[]
            {
                new
                {
                    type = "image",
                    source = new
                    {
                        type = "base64",
                        media_type = mediaType,
                        data = base64Image
                    }
                },
                new { type = "text", text = prompt }
            }
        }
    };

    try
    {
        // <= so the last iteration runs without tools, forcing a final answer.
        for (int round = 0; round <= maxToolRounds; round++)
        {
            var requestBody = new Dictionary<string, object>
            {
                ["model"] = model,
                ["max_tokens"] = 4096,
                ["messages"] = messages
            };

            if (round < maxToolRounds && anthropicTools.Count > 0)
                requestBody["tools"] = anthropicTools;

            ConfigureHeaders();
            var json = JsonSerializer.Serialize(requestBody);
            var content = new StringContent(json, Encoding.UTF8, "application/json");

            var response = await _httpClient.PostAsync("https://api.anthropic.com/v1/messages", content);

            if (!response.IsSuccessStatusCode)
            {
                var errorContent = await response.Content.ReadAsStringAsync();
                _logger.LogError("Anthropic API error ({StatusCode}): {Error}", response.StatusCode, errorContent);
                return VisionApiResult.Failure($"Anthropic API error ({response.StatusCode}): {errorContent}");
            }

            var responseJson = await response.Content.ReadAsStringAsync();
            var responseObj = JsonSerializer.Deserialize<JsonElement>(responseJson);

            var stopReason = responseObj.GetProperty("stop_reason").GetString();
            var contentBlocks = responseObj.GetProperty("content");

            // Check for tool_use blocks
            var toolUseBlocks = contentBlocks.EnumerateArray()
                .Where(b => b.GetProperty("type").GetString() == "tool_use")
                .ToList();

            if (stopReason == "tool_use" && toolUseBlocks.Count > 0)
            {
                _logger.LogInformation("Claude tool-use round {Round}: {Count} tool calls",
                    round + 1, toolUseBlocks.Count);

                // Add assistant response to messages (contains tool_use blocks);
                // the model needs its own turn echoed back to correlate results.
                var assistantContent = JsonSerializer.Deserialize<object>(contentBlocks.GetRawText())!;
                messages.Add(new { role = "assistant", content = assistantContent });

                // Execute every requested tool and build matching tool_result blocks.
                var toolResults = new List<object>();
                foreach (var block in toolUseBlocks)
                {
                    var toolCall = new AIToolCall
                    {
                        Id = block.GetProperty("id").GetString() ?? "",
                        Name = block.GetProperty("name").GetString() ?? "",
                        Arguments = ParseAnthropicInput(block.GetProperty("input"))
                    };

                    _logger.LogInformation("Executing tool: {ToolName}", toolCall.Name);
                    var result = await toolExecutor(toolCall);

                    toolResults.Add(new
                    {
                        type = "tool_result",
                        tool_use_id = toolCall.Id,
                        content = result.Content,
                        is_error = result.IsError
                    });
                }

                // Anthropic expects tool results delivered as a user turn.
                messages.Add(new { role = "user", content = toolResults });
                continue;
            }

            // Extract final text content (first "text" block, if any).
            var textBlock = contentBlocks.EnumerateArray()
                .FirstOrDefault(b => b.GetProperty("type").GetString() == "text");

            var text = textBlock.ValueKind != JsonValueKind.Undefined
                ? textBlock.GetProperty("text").GetString()
                : null;

            return VisionApiResult.Success(OpenAIToolUseHelper.CleanJsonResponse(text));
        }

        return VisionApiResult.Failure("Exceeded maximum tool-use rounds.");
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Claude tool-use call failed: {Message}", ex.Message);
        return VisionApiResult.Failure($"Anthropic API error: {ex.Message}");
    }
}
|
||||
|
||||
/// <summary>
/// Flatten a tool_use "input" JSON object into a string-keyed dictionary.
/// Scalars become strings (numbers keep their raw text), null stays null,
/// and nested arrays/objects keep their raw JSON. Non-object input yields
/// an empty dictionary.
/// </summary>
private static Dictionary<string, object?> ParseAnthropicInput(JsonElement input)
{
    if (input.ValueKind != JsonValueKind.Object)
        return new Dictionary<string, object?>();

    // Convert one JSON value to the loosely-typed form tool executors expect.
    static object? ToArgumentValue(JsonElement value) => value.ValueKind switch
    {
        JsonValueKind.String => value.GetString(),
        JsonValueKind.Number => value.GetRawText(),
        JsonValueKind.True => "true",
        JsonValueKind.False => "false",
        JsonValueKind.Null => null,
        _ => value.GetRawText()
    };

    var arguments = new Dictionary<string, object?>();
    foreach (var property in input.EnumerateObject())
        arguments[property.Name] = ToArgumentValue(property.Value);
    return arguments;
}
|
||||
|
||||
// Reset and apply the Anthropic auth headers on the shared HttpClient.
// The null-forgiving GetApiKey()! is safe here: every caller validates the
// key before issuing a request.
private void ConfigureHeaders()
{
    var key = GetApiKey()!;
    var headers = _httpClient.DefaultRequestHeaders;
    headers.Clear();
    headers.Add("x-api-key", key);
    headers.Add("anthropic-version", "2023-06-01");
}
|
||||
|
||||
// Resolve the Anthropic API key: the environment variable wins, appsettings is the fallback.
private string? GetApiKey()
{
    var fromEnvironment = Environment.GetEnvironmentVariable("ANTHROPIC_API_KEY");
    return fromEnvironment ?? _configuration["Anthropic:ApiKey"];
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// llama.cpp server client using OpenAI-compatible vision API for local LLM inference.
|
||||
/// llama.cpp server client using OpenAI-compatible vision API with tool-use support.
|
||||
/// </summary>
|
||||
public class LlamaCppVisionClient : IAIVisionClient
|
||||
public class LlamaCppVisionClient : IAIToolAwareVisionClient
|
||||
{
|
||||
private readonly HttpClient _httpClient;
|
||||
private readonly IConfiguration _configuration;
|
||||
private readonly ILogger<LlamaCppVisionClient> _logger;
|
||||
|
||||
// Model families whose Jinja chat templates support the OpenAI tool role format.
|
||||
private static readonly string[] _toolCapableModelPrefixes = new[]
|
||||
{
|
||||
"qwen3", "qwen2.5", "hermes", "functionary", "mistral"
|
||||
};
|
||||
|
||||
private string? _currentModel;
|
||||
|
||||
/// <summary>
/// Create the llama.cpp client. The HttpClient timeout is raised well above
/// the default because local models can be slow to respond.
/// </summary>
public LlamaCppVisionClient(HttpClient httpClient, IConfiguration configuration, ILogger<LlamaCppVisionClient> logger)
{
    _httpClient = httpClient;
    _httpClient.Timeout = TimeSpan.FromMinutes(5); // Local models can be slow
    _configuration = configuration;
    _logger = logger;
}
|
||||
|
||||
/// <summary>
/// Whether the current model supports OpenAI-style tool/function calling.
/// Only certain model families (Qwen3, Hermes, etc.) have chat templates that handle the tool role.
/// Returns false until <see cref="SetCurrentModel"/> has been called.
/// </summary>
public bool SupportsToolUse =>
    _currentModel != null &&
    _toolCapableModelPrefixes.Any(p =>
        _currentModel.StartsWith(p, StringComparison.OrdinalIgnoreCase));
|
||||
|
||||
/// <summary>
/// Set the model name so SupportsToolUse can be evaluated per-model.
/// Called by AIReceiptParser before the tool-use check.
/// </summary>
/// <param name="model">Model id, optionally prefixed with the "llamacpp:" scheme.</param>
public void SetCurrentModel(string model)
{
    // [9..] drops the 9-character "llamacpp:" prefix when present.
    _currentModel = model.StartsWith("llamacpp:") ? model[9..] : model;
}
|
||||
|
||||
/// <summary>
|
||||
/// Get available models from the llama.cpp server.
|
||||
/// </summary>
|
||||
@@ -256,7 +707,7 @@ namespace MoneyMap.Services
|
||||
var modelsResponse = JsonSerializer.Deserialize<LlamaCppModelsResponse>(json);
|
||||
|
||||
return modelsResponse?.Data?
|
||||
.Where(m => !m.Id.StartsWith("mmproj-")) // Filter out multimodal projectors
|
||||
.Where(m => !m.Id.StartsWith("mmproj-"))
|
||||
.Select(m => new LlamaCppModel
|
||||
{
|
||||
Id = m.Id,
|
||||
@@ -279,7 +730,7 @@ namespace MoneyMap.Services
|
||||
public async Task<VisionApiResult> SendTextPromptAsync(string prompt, string? model = null)
|
||||
{
|
||||
var baseUrl = _configuration["AI:ModelsEndpoint"] ?? "http://athena.lan:11434";
|
||||
var llamaModel = model ?? "GLM-4.6V-UD-Q4_K_XL-00001-of-00002";
|
||||
var llamaModel = model ?? _configuration["AI:ReceiptParsingModel"] ?? "Qwen3-8B-Q6_K";
|
||||
if (llamaModel.StartsWith("llamacpp:"))
|
||||
llamaModel = llamaModel[9..];
|
||||
|
||||
@@ -324,7 +775,7 @@ namespace MoneyMap.Services
|
||||
.GetString();
|
||||
|
||||
_logger.LogInformation("LlamaCpp: Text prompt completed successfully");
|
||||
return VisionApiResult.Success(CleanJsonResponse(messageContent));
|
||||
return VisionApiResult.Success(OpenAIToolUseHelper.CleanJsonResponse(messageContent));
|
||||
}
|
||||
catch (TaskCanceledException ex) when (ex.InnerException is TimeoutException)
|
||||
{
|
||||
@@ -341,8 +792,6 @@ namespace MoneyMap.Services
|
||||
public async Task<VisionApiResult> AnalyzeImageAsync(string base64Image, string mediaType, string prompt, string model)
|
||||
{
|
||||
var baseUrl = _configuration["AI:ModelsEndpoint"] ?? "http://athena.lan:11434";
|
||||
|
||||
// Strip "llamacpp:" prefix if present
|
||||
var llamaModel = model.StartsWith("llamacpp:") ? model[9..] : model;
|
||||
|
||||
_logger.LogInformation("LlamaCpp: Sending request to {BaseUrl} with model {Model}, image size: {Size} bytes",
|
||||
@@ -397,7 +846,7 @@ namespace MoneyMap.Services
|
||||
.GetString();
|
||||
|
||||
_logger.LogInformation("LlamaCpp: Successfully parsed response");
|
||||
return VisionApiResult.Success(CleanJsonResponse(messageContent));
|
||||
return VisionApiResult.Success(OpenAIToolUseHelper.CleanJsonResponse(messageContent));
|
||||
}
|
||||
catch (TaskCanceledException ex) when (ex.InnerException is TimeoutException)
|
||||
{
|
||||
@@ -411,19 +860,64 @@ namespace MoneyMap.Services
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Analyze an image with tool calling against the local llama.cpp server.
/// Uses the same OpenAI-compatible /v1/chat/completions loop as
/// OpenAIVisionClient, but with no auth headers and the configured base URL.
/// </summary>
public async Task<VisionApiResult> AnalyzeImageWithToolsAsync(
    string base64Image, string mediaType, string prompt, string model,
    List<AIToolDefinition> tools, Func<AIToolCall, Task<AIToolResult>> toolExecutor,
    int maxToolRounds = 5)
{
    var baseUrl = _configuration["AI:ModelsEndpoint"] ?? "http://athena.lan:11434";
    // Strip the "llamacpp:" scheme prefix if present.
    var llamaModel = model.StartsWith("llamacpp:") ? model[9..] : model;

    _logger.LogInformation("LlamaCpp: Starting tool-use request with model {Model}", llamaModel);

    // Single user message: image first (as a data URL), then the prompt text.
    var initialMessages = new List<object>
    {
        new
        {
            role = "user",
            content = new object[]
            {
                new
                {
                    type = "image_url",
                    image_url = new { url = $"data:{mediaType};base64,{base64Image}" }
                },
                new { type = "text", text = prompt }
            }
        }
    };

    try
    {
        return await OpenAIToolUseHelper.ExecuteWithToolsAsync(
            _httpClient,
            $"{baseUrl.TrimEnd('/')}/v1/chat/completions",
            _ => { }, // No auth headers needed for local llama.cpp
            llamaModel,
            initialMessages,
            OpenAIToolUseHelper.BuildToolsArray(tools),
            toolExecutor,
            maxToolRounds,
            maxTokens: 4096,
            _logger);
    }
    catch (TaskCanceledException ex) when (ex.InnerException is TimeoutException)
    {
        // HttpClient surfaces its timeout as TaskCanceled wrapping TimeoutException.
        _logger.LogError("llama.cpp tool-use request timed out");
        return VisionApiResult.Failure("llama.cpp request timed out.");
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "llama.cpp tool-use call failed: {Message}", ex.Message);
        return VisionApiResult.Failure($"llama.cpp API error: {ex.Message}");
    }
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Ollama Vision API client for local LLM inference.
|
||||
/// Does NOT support tool use (uses /api/generate endpoint).
|
||||
/// Falls back to enriched prompt with pre-fetched context.
|
||||
/// </summary>
|
||||
public class OllamaVisionClient : IAIVisionClient
|
||||
{
|
||||
@@ -434,7 +928,7 @@ namespace MoneyMap.Services
|
||||
public OllamaVisionClient(HttpClient httpClient, IConfiguration configuration, ILogger<OllamaVisionClient> logger)
|
||||
{
|
||||
_httpClient = httpClient;
|
||||
_httpClient.Timeout = TimeSpan.FromMinutes(5); // Local models can be slow
|
||||
_httpClient.Timeout = TimeSpan.FromMinutes(5);
|
||||
_configuration = configuration;
|
||||
_logger = logger;
|
||||
}
|
||||
@@ -442,8 +936,6 @@ namespace MoneyMap.Services
|
||||
public async Task<VisionApiResult> AnalyzeImageAsync(string base64Image, string mediaType, string prompt, string model)
|
||||
{
|
||||
var baseUrl = _configuration["AI:ModelsEndpoint"] ?? "http://athena.lan:11434";
|
||||
|
||||
// Strip "ollama:" prefix if present
|
||||
var ollamaModel = model.StartsWith("ollama:") ? model[7..] : model;
|
||||
|
||||
_logger.LogInformation("Ollama: Sending request to {BaseUrl} with model {Model}, image size: {Size} bytes",
|
||||
@@ -483,7 +975,7 @@ namespace MoneyMap.Services
|
||||
var messageContent = responseObj.GetProperty("response").GetString();
|
||||
|
||||
_logger.LogInformation("Ollama: Successfully parsed response");
|
||||
return VisionApiResult.Success(CleanJsonResponse(messageContent));
|
||||
return VisionApiResult.Success(OpenAIToolUseHelper.CleanJsonResponse(messageContent));
|
||||
}
|
||||
catch (TaskCanceledException ex) when (ex.InnerException is TimeoutException)
|
||||
{
|
||||
@@ -496,16 +988,6 @@ namespace MoneyMap.Services
|
||||
return VisionApiResult.Failure($"Ollama API error: {ex.Message}");
|
||||
}
|
||||
}
|
||||
|
||||
private static string CleanJsonResponse(string? content)
|
||||
{
|
||||
var trimmed = content?.Trim() ?? "";
|
||||
if (trimmed.StartsWith("```json"))
|
||||
{
|
||||
trimmed = trimmed.Replace("```json", "").Replace("```", "").Trim();
|
||||
}
|
||||
return trimmed;
|
||||
}
|
||||
}
|
||||
|
||||
// Models for llama.cpp /v1/models endpoint
|
||||
|
||||
Reference in New Issue
Block a user