Filter duplicate check by date range for better performance

Optimize duplicate detection to query only the existing transactions that fall within the date range of the uploaded CSV file (plus a 1-day buffer on each side). This avoids loading the entire transaction history into memory when checking for duplicates.

For example, uploading 2,800 transactions from January to March 2024 now loads only the existing transactions from that period rather than the full transaction history.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: AJ
Date:   2025-10-11 22:22:02 -04:00
parent e481c58464
commit 3153bd50aa
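
In short, the import now does a first pass over the CSV to find the earliest and latest transaction dates, and only then queries the database for potential duplicates inside that window. A condensed sketch of the bounded lookup, reusing the minDate/maxDate, _db, and TransactionKey names from the diff below (not a verbatim excerpt):

// Condensed sketch: bound the duplicate lookup to the CSV's own date range.
// minDate/maxDate were collected while parsing the CSV rows (see the diff below).
var startDate = minDate.Value.AddDays(-1);   // 1-day buffer below the earliest CSV date
var endDate = maxDate.Value.AddDays(1);      // 1-day buffer above the latest CSV date

var existingTransactions = await _db.Transactions
    .Where(t => t.Date >= startDate && t.Date <= endDate)   // only rows that can collide with this upload
    .Select(t => new TransactionKey(t.Date, t.Amount, t.Name, t.Memo, t.AccountId, t.CardId))
    .ToHashSetAsync();                                       // in-memory set for fast duplicate checks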


@@ -255,35 +255,64 @@ namespace MoneyMap.Pages
             var previewItems = new List<TransactionPreview>();
             var addedInThisBatch = new HashSet<TransactionKey>();
-            // Load all existing transactions into memory for fast duplicate checking
-            var existingTransactions = await _db.Transactions
-                .Select(t => new TransactionKey(t.Date, t.Amount, t.Name, t.Memo, t.AccountId, t.CardId))
-                .ToHashSetAsync();
+            // First pass: read CSV to get date range and all transactions
+            var csvTransactions = new List<(TransactionCsvRow Row, Transaction Transaction, TransactionKey Key)>();
+            DateTime? minDate = null;
+            DateTime? maxDate = null;
-            using var reader = new StreamReader(csvStream);
-            using var csv = new CsvReader(reader, new CsvConfiguration(CultureInfo.InvariantCulture)
+            using (var reader = new StreamReader(csvStream))
+            using (var csv = new CsvReader(reader, new CsvConfiguration(CultureInfo.InvariantCulture)
             {
                 HasHeaderRecord = true,
                 HeaderValidated = null,
                 MissingFieldFound = null
-            });
-            csv.Read();
-            csv.ReadHeader();
-            var hasCategory = csv.HeaderRecord?.Any(h => h.Equals("Category", StringComparison.OrdinalIgnoreCase)) ?? false;
-            csv.Context.RegisterClassMap(new TransactionCsvRowMap(hasCategory));
-            while (csv.Read())
+            }))
             {
-                var row = csv.GetRecord<TransactionCsvRow>();
+                csv.Read();
+                csv.ReadHeader();
+                var hasCategory = csv.HeaderRecord?.Any(h => h.Equals("Category", StringComparison.OrdinalIgnoreCase)) ?? false;
+                csv.Context.RegisterClassMap(new TransactionCsvRowMap(hasCategory));
-                var paymentResolution = await _cardResolver.ResolvePaymentAsync(row.Memo, context);
-                if (!paymentResolution.IsSuccess)
-                    return PreviewOperationResult.Failure(paymentResolution.ErrorMessage!);
+                while (csv.Read())
+                {
+                    var row = csv.GetRecord<TransactionCsvRow>();
-                var transaction = MapToTransaction(row, paymentResolution);
-                var key = new TransactionKey(transaction);
+                    var paymentResolution = await _cardResolver.ResolvePaymentAsync(row.Memo, context);
+                    if (!paymentResolution.IsSuccess)
+                        return PreviewOperationResult.Failure(paymentResolution.ErrorMessage!);
+                    var transaction = MapToTransaction(row, paymentResolution);
+                    var key = new TransactionKey(transaction);
+                    csvTransactions.Add((row, transaction, key));
+                    // Track date range
+                    if (minDate == null || transaction.Date < minDate) minDate = transaction.Date;
+                    if (maxDate == null || transaction.Date > maxDate) maxDate = transaction.Date;
+                }
+            }
+            // Load existing transactions within the date range for fast duplicate checking
+            HashSet<TransactionKey> existingTransactions;
+            if (minDate.HasValue && maxDate.HasValue)
+            {
+                // Add a buffer of 1 day on each side to catch any edge cases
+                var startDate = minDate.Value.AddDays(-1);
+                var endDate = maxDate.Value.AddDays(1);
+                existingTransactions = await _db.Transactions
+                    .Where(t => t.Date >= startDate && t.Date <= endDate)
+                    .Select(t => new TransactionKey(t.Date, t.Amount, t.Name, t.Memo, t.AccountId, t.CardId))
+                    .ToHashSetAsync();
+            }
+            else
+            {
+                existingTransactions = new HashSet<TransactionKey>();
+            }
+            // Second pass: check for duplicates and build preview
+            foreach (var (row, transaction, key) in csvTransactions)
+            {
                 // Fast in-memory duplicate checking
                 bool isDuplicate = addedInThisBatch.Contains(key) || existingTransactions.Contains(key);