diff --git a/EmailSearch/SpamDetection/AttachmentAnalyzer.cs b/EmailSearch/SpamDetection/AttachmentAnalyzer.cs new file mode 100644 index 0000000..f04532c --- /dev/null +++ b/EmailSearch/SpamDetection/AttachmentAnalyzer.cs @@ -0,0 +1,72 @@ +using NetOffice.OutlookApi; + +namespace EmailSearch.SpamDetection; + +internal static class AttachmentAnalyzer +{ + // Risk scores by extension type (0.0 = safe, 1.0 = very dangerous) + private static readonly Dictionary AttachmentRiskScores = new(StringComparer.OrdinalIgnoreCase) + { + // Critical risk - direct executables + { ".exe", 1.0 }, + { ".scr", 1.0 }, + { ".bat", 0.95 }, + { ".cmd", 0.95 }, + { ".com", 0.95 }, + { ".pif", 0.95 }, + { ".msi", 0.9 }, + { ".vbs", 0.9 }, + { ".js", 0.9 }, + { ".ps1", 0.9 }, + { ".wsf", 0.9 }, + + // High risk - macro-enabled documents + { ".docm", 0.8 }, + { ".xlsm", 0.8 }, + { ".pptm", 0.8 }, + { ".xlam", 0.8 }, + + // Medium-high risk - can contain executables + { ".iso", 0.7 }, + { ".img", 0.7 }, + { ".lnk", 0.75 }, + { ".hta", 0.7 }, + + // Medium risk - HTML can be phishing + { ".html", 0.6 }, + { ".htm", 0.6 }, + { ".svg", 0.5 }, + + // Low-medium risk - archives + { ".zip", 0.3 }, + { ".rar", 0.35 }, + { ".7z", 0.35 }, + { ".tar", 0.3 }, + { ".gz", 0.3 } + }; + + public static double GetAttachmentRiskScore(MailItem mail) + { + if (mail.Attachments == null || mail.Attachments.Count == 0) + return 0.0; + + double maxRisk = 0.0; + + foreach (var attachment in mail.Attachments) + { + if (attachment is Attachment att) + { + var name = att.FileName?.ToLowerInvariant() ?? 
""; + foreach (var kvp in AttachmentRiskScores) + { + if (name.EndsWith(kvp.Key)) + { + maxRisk = Math.Max(maxRisk, kvp.Value); + } + } + } + } + + return maxRisk; + } +} diff --git a/EmailSearch/SpamDetection/FeatureExtractors.cs b/EmailSearch/SpamDetection/FeatureExtractors.cs new file mode 100644 index 0000000..7d9a1ee --- /dev/null +++ b/EmailSearch/SpamDetection/FeatureExtractors.cs @@ -0,0 +1,30 @@ +using System.Text.RegularExpressions; + +namespace EmailSearch.SpamDetection; + +internal static class FeatureExtractors +{ + public static List ExtractUrls(string text) => + Regex.Matches(text ?? "", @"https?://[^\s'""<>()]+", RegexOptions.IgnoreCase) + .Select(m => m.Value) + .ToList(); + + public static string ExtractFirstEmail(string headerLine) + { + var m = Regex.Match( + headerLine ?? "", + @"[A-Z0-9._%+\-]+@[A-Z0-9.\-]+\.[A-Z]{2,}", + RegexOptions.IgnoreCase); + return m.Success ? m.Value : ""; + } + + public static string MatchHeader(string headers, string pattern) + { + if (string.IsNullOrEmpty(headers)) return string.Empty; + var match = Regex.Match( + headers, + pattern, + RegexOptions.IgnoreCase | RegexOptions.Multiline); + return match.Success ? match.Groups["val"].Value : string.Empty; + } +} diff --git a/EmailSearch/SpamDetection/SpamAnalysisResult.cs b/EmailSearch/SpamDetection/SpamAnalysisResult.cs new file mode 100644 index 0000000..70da180 --- /dev/null +++ b/EmailSearch/SpamDetection/SpamAnalysisResult.cs @@ -0,0 +1,25 @@ +namespace EmailSearch.SpamDetection; + +/// +/// Result of spam analysis containing scores and detected features. +/// +public sealed class SpamAnalysisResult +{ + public double RuleBasedScore { get; set; } + public double FinalScore { get; set; } + public bool PredictedSpam { get; set; } + public SpamFeatures? Features { get; set; } + public List RedFlags { get; set; } = new(); + + /// + /// Gets a human-readable spam likelihood category. 
+ /// + public string SpamLikelihood => FinalScore switch + { + >= 0.9 => "Very High", + >= 0.7 => "High", + >= 0.5 => "Medium", + >= 0.3 => "Low", + _ => "Very Low" + }; +} diff --git a/EmailSearch/SpamDetection/SpamDetector.cs b/EmailSearch/SpamDetection/SpamDetector.cs new file mode 100644 index 0000000..72fdf59 --- /dev/null +++ b/EmailSearch/SpamDetection/SpamDetector.cs @@ -0,0 +1,686 @@ +using System.Text.Json; +using System.Text.RegularExpressions; +using NetOffice.OutlookApi; +using NetOffice.OutlookApi.Enums; + +namespace EmailSearch.SpamDetection; + +public sealed class SpamDetector +{ + private readonly SpamDetectorConfig _config; + private readonly HashSet _blocklistEmails; + private readonly HashSet _blocklistDomains; + + public SpamDetector() : this(null) { } + + public SpamDetector(SpamDetectorConfig? config) + { + _config = config ?? LoadConfiguration() ?? SpamDetectorConfig.GetDefault(); + (_blocklistEmails, _blocklistDomains) = LoadBlocklist(); + } + + private static SpamDetectorConfig? 
LoadConfiguration() + { + try + { + var configPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "SpamDetectorConfig.json"); + if (File.Exists(configPath)) + { + var json = File.ReadAllText(configPath); + return JsonSerializer.Deserialize(json, new JsonSerializerOptions + { + PropertyNameCaseInsensitive = true + }); + } + } + catch { } + return null; + } + + private static (HashSet emails, HashSet domains) LoadBlocklist() + { + var emails = new HashSet(StringComparer.OrdinalIgnoreCase); + var domains = new HashSet(StringComparer.OrdinalIgnoreCase); + + try + { + var blocklistPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "BlockList.txt"); + if (!File.Exists(blocklistPath)) + return (emails, domains); + + var lines = File.ReadAllLines(blocklistPath, System.Text.Encoding.Unicode); + + foreach (var line in lines) + { + var entry = line.Trim(); + if (string.IsNullOrWhiteSpace(entry)) + continue; + + if (entry.StartsWith("@")) + domains.Add(entry.Substring(1).ToLowerInvariant()); + else if (entry.Contains("@")) + emails.Add(entry.ToLowerInvariant()); + } + } + catch { } + + return (emails, domains); + } + + public SpamAnalysisResult Analyze(MailItem mailItem) + { + var features = BuildFeatures(mailItem); + var score = CalculateScore(features); + var redFlags = GetRedFlags(features); + + return new SpamAnalysisResult + { + RuleBasedScore = score, + FinalScore = score, + PredictedSpam = score >= 0.7, + Features = features, + RedFlags = redFlags + }; + } + + private SpamFeatures BuildFeatures(MailItem m) + { + var f = new SpamFeatures(); + + // Sender/display + f.DisplayName = m.SenderName ?? ""; + f.FromAddress = GetSmtpAddress(m); + f.FromDomain = DomainOf(f.FromAddress); + + // Body/headers + var headers = GetInternetHeaders(m); + var bodyPreview = (m.Body ?? "").Trim(); + var html = m.HTMLBody ?? 
""; + + // Auth results + f.SpfFail = Contains(headers, "spf=fail"); + f.DkimFail = Contains(headers, "dkim=fail"); + f.DmarcFail = Contains(headers, "dmarc=fail"); + + // Reply-To mismatch + var replyTo = FeatureExtractors.MatchHeader(headers, @"(?im)^\s*Reply-To:\s*(?.+)$"); + var replyToAddr = FeatureExtractors.ExtractFirstEmail(replyTo); + f.ReplyToDomainMismatch = !string.IsNullOrEmpty(replyToAddr) && + !string.Equals(DomainOf(replyToAddr), f.FromDomain, StringComparison.OrdinalIgnoreCase); + + // Display name impersonation + f.DisplayImpersonation = LooksLikeVendorName(f.DisplayName) && !IsKnownVendorDomain(f.FromDomain); + + // Subject domain impersonation + f.SubjectDomainImpersonation = CheckSubjectDomainImpersonation(m.Subject ?? "", f.FromDomain); + + // URLs + var urls = FeatureExtractors.ExtractUrls(html.Length > 0 ? html : bodyPreview); + f.UrlCount = urls.Count; + f.HasUrl = f.UrlCount > 0; + f.HasIpLink = urls.Any(u => UrlAnalyzer.IsIpUrl(u)); + f.UsesShortener = urls.Any(u => UrlAnalyzer.IsShortener(u)); + + // Suspicious TLDs + f.SuspiciousTld = _config.BadTlds.Contains(TldOf(f.FromDomain), StringComparer.OrdinalIgnoreCase) || + urls.Any(url => HasSuspiciousTld(url)); + + // Free-mail domains + f.FreeMailboxDomain = _config.FreeDomains.Contains(f.FromDomain, StringComparer.OrdinalIgnoreCase); + + // Unknown domain + f.UnknownDomain = !string.IsNullOrEmpty(f.FromDomain) && + !_config.Domains.Vendors.ContainsKey(f.FromDomain) && + !_config.Domains.Trusted.Any(d => f.FromDomain.Equals(d, StringComparison.OrdinalIgnoreCase)); + + // Blocklist + f.IsBlocklisted = IsBlocklisted(f.FromAddress, f.FromDomain); + + // Tracking pixel + f.HasTrackingPixel = Regex.IsMatch(html, + @"]+(width=['""]?1['""]?[^>]*height=['""]?1['""]?|height=['""]?1['""]?[^>]*width=['""]?1['""]?)", + RegexOptions.IgnoreCase); + + // Attachments + f.HasAttachment = m.Attachments?.Count > 0; + f.AttachmentRiskScore = AttachmentAnalyzer.GetAttachmentRiskScore(m); + f.HasRiskyAttachment 
= f.AttachmentRiskScore >= 0.5; + + // Keyword bait + var lower = (m.Subject + " " + bodyPreview).ToLowerInvariant(); + f.KeywordBait = _config.BaitKeywords.Any(k => lower.Contains(k, StringComparison.OrdinalIgnoreCase)); + + // Placeholder text + f.HasPlaceholderText = HasPlaceholderText(m.Subject + " " + bodyPreview); + + // Generic sender + f.GenericSenderName = IsGenericSender(f.DisplayName, f.FromAddress); + + // Single link with minimal text + var isMinimal = IsMinimalContent(bodyPreview, html); + f.SingleLinkOnly = f.UrlCount == 1 && bodyPreview.Length < 2000 && isMinimal; + + // Unicode lookalikes + f.UnicodeLookalike = HasHomoglyphs(f.FromDomain); + + // Reputation + f.SenderReputation = _config.Domains.Vendors.TryGetValue(f.FromDomain, out var vendorInfo) + ? vendorInfo.Reputation + : 0; + + // List-Unsubscribe header + f.HasListUnsub = Contains(headers, "List-Unsubscribe:"); + + // Advanced patterns + f.CompanySubdomainSpoof = CheckCompanySubdomainSpoof(f.FromDomain); + f.FakeQuarantineReport = CheckFakeQuarantineReport(m.Subject ?? "", bodyPreview, f.FromDomain); + f.HasZeroWidthChars = HasZeroWidthCharacters(m.Subject + " " + f.DisplayName + " " + bodyPreview); + f.HasRandomRefId = HasRandomReferenceId(m.Subject ?? ""); + f.HasTimestampInSubject = HasTimestampInSubject(m.Subject ?? ""); + f.ColdEmailSolicitation = CheckColdEmailSolicitation(m.Subject ?? "", bodyPreview); + f.FakeVoicemailNotification = CheckFakeVoicemailNotification(m.Subject ?? "", bodyPreview, f.FromDomain); + f.FakeSystemNotification = CheckFakeSystemNotification(m.Subject ?? 
"", bodyPreview, f.FromDomain); + + return f; + } + + private double CalculateScore(SpamFeatures f) + { + var w = _config.SpamScoreWeights; + double s = 0; + + // Auth & identity + if (f.SpfFail) s += w.SpfFail; + if (f.DkimFail) s += w.DkimFail; + if (f.DmarcFail) s += w.DmarcFail; + if (f.ReplyToDomainMismatch) s += w.ReplyToDomainMismatch; + if (f.DisplayImpersonation) s += w.DisplayImpersonation; + if (f.UnicodeLookalike) s += w.UnicodeLookalike; + if (f.GenericSenderName) s += 0.18; + if (f.SubjectDomainImpersonation) s += 0.35; + if (f.IsBlocklisted) s += 0.95; + + // Content/links + if (f.HasUrl) s += w.HasUrl + Math.Min(0.10, f.UrlCount * w.UrlCountMultiplier); + if (f.HasIpLink) s += w.HasIpLink; + if (f.UsesShortener) s += w.UsesShortener; + if (f.SuspiciousTld) s += w.SuspiciousTld; + if (f.HasTrackingPixel) s += w.HasTrackingPixel; + + // Attachments & bait + if (f.HasAttachment) s += w.HasAttachment; + s += f.AttachmentRiskScore * w.HasRiskyAttachment; + if (f.KeywordBait) s += w.KeywordBait; + if (f.HasPlaceholderText) s += 0.30; + if (f.SingleLinkOnly) s += 0.25; + + // Unknown domain + if (f.UnknownDomain) + { + s += w.UnknownDomain; + if (f.KeywordBait || f.UsesShortener) + s += 0.15; + } + + // Freemail + if (f.FreeMailboxDomain && f.HasUrl) s += w.FreeMailboxWithUrl; + else if (f.FreeMailboxDomain) s += w.FreeMailboxOnly; + + // Reputation + s += Math.Clamp(-w.ReputationMultiplier * f.SenderReputation, -0.25, 0.25); + + // Legitimacy signals + if (f.HasListUnsub) s += w.HasListUnsubscribe; + + // Advanced patterns + if (f.CompanySubdomainSpoof) s += w.CompanySubdomainSpoof; + if (f.FakeQuarantineReport) s += w.FakeQuarantineReport; + if (f.HasZeroWidthChars) s += w.HasZeroWidthChars; + if (f.HasRandomRefId) s += w.HasRandomRefId; + if (f.HasTimestampInSubject) s += w.HasTimestampInSubject; + if (f.ColdEmailSolicitation) s += w.ColdEmailSolicitation; + if (f.FakeVoicemailNotification) s += w.FakeVoicemailNotification; + if 
(f.FakeSystemNotification) s += w.FakeSystemNotification; + + return Math.Max(0, Math.Min(1, s)); + } + + private List GetRedFlags(SpamFeatures f) + { + var flags = new List(); + + if (f.IsBlocklisted) flags.Add("Sender is blocklisted"); + if (f.SpfFail) flags.Add("SPF authentication failed"); + if (f.DkimFail) flags.Add("DKIM authentication failed"); + if (f.DmarcFail) flags.Add("DMARC authentication failed"); + if (f.ReplyToDomainMismatch) flags.Add("Reply-To domain doesn't match sender"); + if (f.DisplayImpersonation) flags.Add("Display name may impersonate known vendor"); + if (f.SubjectDomainImpersonation) flags.Add("Subject mentions known domain but sender differs"); + if (f.UnicodeLookalike) flags.Add("Domain contains suspicious Unicode characters"); + if (f.GenericSenderName) flags.Add("Generic/automated sender name"); + if (f.HasIpLink) flags.Add("Contains IP address-based URL"); + if (f.UsesShortener) flags.Add("Uses URL shortener service"); + if (f.SuspiciousTld) flags.Add("Suspicious top-level domain"); + if (f.HasRiskyAttachment) flags.Add($"Risky attachment type (risk: {f.AttachmentRiskScore:P0})"); + if (f.KeywordBait) flags.Add("Contains spam/phishing keywords"); + if (f.HasPlaceholderText) flags.Add("Contains placeholder/merge field text"); + if (f.SingleLinkOnly) flags.Add("Minimal content with single link"); + if (f.CompanySubdomainSpoof) flags.Add("Subdomain spoofing detected"); + if (f.FakeQuarantineReport) flags.Add("Fake quarantine/spam report"); + if (f.HasZeroWidthChars) flags.Add("Contains zero-width characters (filter evasion)"); + if (f.HasRandomRefId) flags.Add("Random reference ID in subject"); + if (f.HasTimestampInSubject) flags.Add("Automated timestamp in subject"); + if (f.ColdEmailSolicitation) flags.Add("Cold email solicitation"); + if (f.FakeVoicemailNotification) flags.Add("Fake voicemail notification"); + if (f.FakeSystemNotification) flags.Add("Fake system notification"); + if (f.FreeMailboxDomain && f.HasUrl) flags.Add("Free 
email with links (potential phishing)"); + + return flags; + } + + // ---- Helper Methods ---- + + private static string GetSmtpAddress(MailItem m) + { + try + { + if (m.Sender != null) + { + var addressEntry = m.Sender; + if (addressEntry.AddressEntryUserType == OlAddressEntryUserType.olSmtpAddressEntry) + { + return Safe(addressEntry.Address); + } + + if (addressEntry.AddressEntryUserType == OlAddressEntryUserType.olExchangeUserAddressEntry || + addressEntry.AddressEntryUserType == OlAddressEntryUserType.olExchangeRemoteUserAddressEntry) + { + try + { + var pa = addressEntry.PropertyAccessor; + var smtpAddress = pa.GetProperty("http://schemas.microsoft.com/mapi/proptag/0x39FE001E"); + if (smtpAddress is string s && !string.IsNullOrEmpty(s)) + { + return s.Trim(); + } + } + catch { } + + try + { + var exchangeUser = addressEntry.GetExchangeUser(); + if (exchangeUser != null && !string.IsNullOrEmpty(exchangeUser.PrimarySmtpAddress)) + { + return exchangeUser.PrimarySmtpAddress.Trim(); + } + } + catch { } + } + } + + var senderEmail = m.SenderEmailAddress ?? ""; + + if (senderEmail.StartsWith("/O=", StringComparison.OrdinalIgnoreCase)) + { + var headers = GetInternetHeaders(m); + var fromHeader = FeatureExtractors.MatchHeader(headers, @"(?im)^\s*From:\s*(?.+)$"); + var extractedEmail = FeatureExtractors.ExtractFirstEmail(fromHeader); + if (!string.IsNullOrEmpty(extractedEmail)) + { + return extractedEmail; + } + } + + return Safe(senderEmail); + } + catch + { + return Safe(m.SenderEmailAddress); + } + } + + private static string GetInternetHeaders(MailItem m) + { + try + { + var pa = m.PropertyAccessor; + var raw = pa.GetProperty("http://schemas.microsoft.com/mapi/proptag/0x007D001E"); + return raw is string s ? s : ""; + } + catch { return ""; } + } + + private static string Safe(string? s) => s?.Trim() ?? 
""; + private static bool Contains(string hay, string needle) => + hay?.IndexOf(needle, StringComparison.OrdinalIgnoreCase) >= 0; + + private static string DomainOf(string email) + { + var at = email.IndexOf('@'); + if (at < 0) return ""; + return email[(at + 1)..].Trim().ToLowerInvariant(); + } + + private static string TldOf(string domain) + { + var dot = domain.LastIndexOf('.'); + return dot >= 0 ? domain[(dot + 1)..] : domain; + } + + private bool LooksLikeVendorName(string name) + { + if (string.IsNullOrEmpty(name)) + return false; + + var patterns = _config.Domains.Vendors.Values + .SelectMany(v => v.DisplayNamePatterns) + .Where(p => !string.IsNullOrEmpty(p)) + .ToList(); + + if (patterns.Count == 0) + return false; + + var pattern = "(" + string.Join("|", patterns) + ")"; + return Regex.IsMatch(name, pattern, RegexOptions.IgnoreCase); + } + + private bool IsKnownVendorDomain(string domain) + { + if (string.IsNullOrEmpty(domain)) + return false; + + return _config.Domains.Vendors.ContainsKey(domain); + } + + private static bool HasHomoglyphs(string domain) => + domain.Any(ch => ch > 127); + + private static bool IsMinimalContent(string bodyText, string html) + { + try + { + if (string.IsNullOrWhiteSpace(html)) + return bodyText.Length < 200; + + // Strip HTML tags for word count + var plainText = Regex.Replace(html, "<[^>]+>", " "); + plainText = System.Net.WebUtility.HtmlDecode(plainText); + plainText = Regex.Replace(plainText, @"\s+", " ").Trim(); + + var words = plainText.Split(new[] { ' ', '\t', '\n', '\r' }, StringSplitOptions.RemoveEmptyEntries) + .Where(w => w.Length > 1) + .ToArray(); + + return words.Length < 50; + } + catch + { + return bodyText.Length < 200; + } + } + + private static bool IsGenericSender(string displayName, string fromAddress) + { + var genericPatterns = new[] + { + "noreply", "no-reply", "no.reply", "donotreply", "do-not-reply", + "notification", "notify", "alert", "system", "admin", "administrator", + "support", "helpdesk", 
"help.desk", "technical support", "tech support", + "voice message", "voicemail", "fax", "scanner", "document center", + "storage center", "help center", "incident", "ticket" + }; + + var lowerDisplay = displayName.ToLowerInvariant(); + var lowerAddress = fromAddress.ToLowerInvariant(); + + return genericPatterns.Any(p => + lowerDisplay.Contains(p, StringComparison.OrdinalIgnoreCase) || + lowerAddress.Contains(p, StringComparison.OrdinalIgnoreCase)); + } + + private bool HasSuspiciousTld(string url) + { + try + { + var host = new Uri(url).Host.ToLowerInvariant(); + var tld = TldOf(host); + return _config.BadTlds.Contains(tld, StringComparer.OrdinalIgnoreCase); + } + catch { return false; } + } + + private static bool HasPlaceholderText(string text) + { + if (string.IsNullOrWhiteSpace(text)) + return false; + + var placeholderKeywords = new[] + { + "email", "name", "user", "recipient", "customer", "client", + "address", "company", "account", "localpart", "domain" + }; + + var keywordPattern = string.Join("|", placeholderKeywords); + + var bracketPatterns = new[] + { + $@"\[.*?(?:{keywordPattern}).*?\]", + $@"\{{.*?(?:{keywordPattern}).*?\}}", + $@"\{{\{{.*?(?:{keywordPattern}).*?\}}\}}", + $@"<.*?(?:{keywordPattern}).*?>", + $@"\$\{{.*?(?:{keywordPattern}).*?\}}", + $@"%.*?(?:{keywordPattern}).*?%" + }; + + return bracketPatterns.Any(p => Regex.IsMatch(text, p, RegexOptions.IgnoreCase)); + } + + private bool IsBlocklisted(string fromAddress, string fromDomain) + { + if (_blocklistEmails.Contains(fromAddress.ToLowerInvariant())) + return true; + + if (_blocklistDomains.Contains(fromDomain.ToLowerInvariant())) + return true; + + return false; + } + + private bool CheckSubjectDomainImpersonation(string subject, string fromDomain) + { + if (string.IsNullOrWhiteSpace(subject)) + return false; + + var subjectLower = subject.ToLowerInvariant(); + + foreach (var vendorDomain in _config.Domains.Vendors.Keys) + { + if (fromDomain.Equals(vendorDomain, 
StringComparison.OrdinalIgnoreCase)) + continue; + + var pattern = $@"\b{Regex.Escape(vendorDomain)}\b"; + if (Regex.IsMatch(subjectLower, pattern, RegexOptions.IgnoreCase)) + return true; + } + + foreach (var trustedDomain in _config.Domains.Trusted) + { + if (fromDomain.Equals(trustedDomain, StringComparison.OrdinalIgnoreCase)) + continue; + + var pattern = $@"\b{Regex.Escape(trustedDomain)}\b"; + if (Regex.IsMatch(subjectLower, pattern, RegexOptions.IgnoreCase)) + return true; + } + + return false; + } + + private bool CheckCompanySubdomainSpoof(string fromDomain) + { + if (string.IsNullOrEmpty(fromDomain)) + return false; + + var parts = fromDomain.Split('.'); + if (parts.Length < 3) + return false; + + var subdomain = parts[0].ToLowerInvariant(); + + foreach (var vendor in _config.Domains.Vendors) + { + var vendorDomainParts = vendor.Key.Split('.'); + var vendorName = vendorDomainParts[0].ToLowerInvariant(); + + if (subdomain.Contains(vendorName, StringComparison.OrdinalIgnoreCase)) + { + if (!fromDomain.Equals(vendor.Key, StringComparison.OrdinalIgnoreCase)) + return true; + } + } + + var fakeServiceDomains = _config.FakeServiceDomains.Count > 0 + ? _config.FakeServiceDomains + : GetDefaultFakeServiceDomains(); + + var baseDomain = string.Join(".", parts.Skip(1)); + return fakeServiceDomains.Any(fsd => baseDomain.EndsWith(fsd, StringComparison.OrdinalIgnoreCase)); + } + + private static List GetDefaultFakeServiceDomains() => new() + { + "voiceservicing.net", "audios.net", "voicemail.net", "audioservices.net", + "mailservicing.net", "emailservicing.net", "securemail.net", "mailprotect.net" + }; + + private bool CheckFakeQuarantineReport(string subject, string body, string fromDomain) + { + var text = (subject + " " + body).ToLowerInvariant(); + + var quarantineKeywords = _config.QuarantineKeywords.Count > 0 + ? 
_config.QuarantineKeywords + : new List { "quarantine summary", "spam report", "quarantine folder", "email quarantine" }; + + var hasQuarantineKeyword = quarantineKeywords.Any(k => text.Contains(k, StringComparison.OrdinalIgnoreCase)); + if (!hasQuarantineKeyword) + return false; + + var legitimateQuarantineDomains = new[] + { + "microsoft.com", "office365.com", "mimecast.com", "proofpoint.com", + "barracuda.com", "sophos.com", "fortinet.com", "cisco.com" + }; + + return !legitimateQuarantineDomains.Any(d => fromDomain.EndsWith(d, StringComparison.OrdinalIgnoreCase)) && + !_config.Domains.Vendors.Keys.Any(v => fromDomain.Equals(v, StringComparison.OrdinalIgnoreCase)); + } + + private static bool HasZeroWidthCharacters(string text) + { + if (string.IsNullOrEmpty(text)) + return false; + + var zeroWidthChars = new[] + { + '\u200B', '\u200C', '\u200D', '\u200E', '\u200F', + '\u2060', '\uFEFF', '\u00AD', '\u034F', '\u061C', + '\u115F', '\u1160', '\u17B4', '\u17B5', '\u180E' + }; + + return text.Any(c => zeroWidthChars.Contains(c)); + } + + private static bool HasRandomReferenceId(string subject) + { + if (string.IsNullOrEmpty(subject)) + return false; + + var patterns = new[] + { + @"Ref[:#]?\s*[A-Za-z0-9]{20,}", + @"#\d{8}[-_]?[A-Za-z0-9]{8,}", + @"ID[:#]?\s*[A-Za-z0-9]{15,}", + @"[A-Za-z0-9]{25,}", + @"_[A-Za-z0-9]{20,}" + }; + + return patterns.Any(p => Regex.IsMatch(subject, p, RegexOptions.IgnoreCase)); + } + + private static bool HasTimestampInSubject(string subject) + { + if (string.IsNullOrEmpty(subject)) + return false; + + var patterns = new[] + { + @"<\d{2}:\d{2}:\d{2}\.\d{3}\s+\d{2}/\d{2}/\d{4}>", + @"\[\d{2}:\d{2}:\d{2}\]", + @"\(\d{2}:\d{2}:\d{2}\)", + @"\d{2}:\d{2}:\d{2}\.\d{3}" + }; + + return patterns.Any(p => Regex.IsMatch(subject, p)); + } + + private bool CheckColdEmailSolicitation(string subject, string body) + { + var text = (subject + " " + body).ToLowerInvariant(); + + var coldEmailKeywords = _config.ColdEmailKeywords.Count > 0 + ? 
_config.ColdEmailKeywords + : new List { "seo services", "website design", "reaching out", "hope this finds you" }; + + var matchCount = coldEmailKeywords.Count(k => text.Contains(k, StringComparison.OrdinalIgnoreCase)); + return matchCount >= 2; + } + + private bool CheckFakeVoicemailNotification(string subject, string body, string fromDomain) + { + var text = (subject + " " + body).ToLowerInvariant(); + + var voicemailKeywords = _config.VoicemailKeywords.Count > 0 + ? _config.VoicemailKeywords + : new List { "voicemail", "voice message", "missed call" }; + + var hasVoicemailKeyword = voicemailKeywords.Any(k => text.Contains(k, StringComparison.OrdinalIgnoreCase)); + if (!hasVoicemailKeyword) + return false; + + var legitimateVoicemailDomains = new[] + { + "ringcentral.com", "vonage.com", "grasshopper.com", "dialpad.com", + "8x8.com", "goto.com", "zoom.us", "microsoft.com", "office365.com" + }; + + var isFromLegitimate = legitimateVoicemailDomains.Any(d => fromDomain.EndsWith(d, StringComparison.OrdinalIgnoreCase)) || + _config.Domains.Vendors.Keys.Any(v => fromDomain.Equals(v, StringComparison.OrdinalIgnoreCase)); + + var isSubdomainSpoof = CheckCompanySubdomainSpoof(fromDomain); + + return !isFromLegitimate || isSubdomainSpoof; + } + + private bool CheckFakeSystemNotification(string subject, string body, string fromDomain) + { + var text = (subject + " " + body).ToLowerInvariant(); + + var systemNotificationKeywords = _config.SystemNotificationKeywords.Count > 0 + ? 
_config.SystemNotificationKeywords + : new List { "verify your email", "account suspended", "storage limit" }; + + var hasSystemKeyword = systemNotificationKeywords.Any(k => text.Contains(k, StringComparison.OrdinalIgnoreCase)); + if (!hasSystemKeyword) + return false; + + var legitimateSystemDomains = new[] + { + "microsoft.com", "office365.com", "google.com", "godaddy.com", + "intermedia.net", "hostpilot.com", "networksolutions.com", "namecheap.com", + "cloudflare.com", "amazon.com", "aws.amazon.com" + }; + + return !legitimateSystemDomains.Any(d => fromDomain.EndsWith(d, StringComparison.OrdinalIgnoreCase)) && + !_config.Domains.Vendors.Keys.Any(v => fromDomain.Equals(v, StringComparison.OrdinalIgnoreCase)); + } +} diff --git a/EmailSearch/SpamDetection/SpamDetectorConfig.cs b/EmailSearch/SpamDetection/SpamDetectorConfig.cs new file mode 100644 index 0000000..abdc6ce --- /dev/null +++ b/EmailSearch/SpamDetection/SpamDetectorConfig.cs @@ -0,0 +1,140 @@ +namespace EmailSearch.SpamDetection; + +public sealed class SpamDetectorConfig +{ + public List FreeDomains { get; set; } = new(); + public List BadTlds { get; set; } = new(); + public List BaitKeywords { get; set; } = new(); + public DomainConfiguration Domains { get; set; } = new(); + public SpamScoreWeights SpamScoreWeights { get; set; } = new(); + public List QuarantineKeywords { get; set; } = new(); + public List VoicemailKeywords { get; set; } = new(); + public List SystemNotificationKeywords { get; set; } = new(); + public List ColdEmailKeywords { get; set; } = new(); + public List FakeServiceDomains { get; set; } = new(); + + public static SpamDetectorConfig GetDefault() + { + return new SpamDetectorConfig + { + FreeDomains = new List + { + "gmail.com", "outlook.com", "hotmail.com", "yahoo.com", + "icloud.com", "aol.com", "proton.me", "protonmail.com", + "live.com", "msn.com", "ymail.com", "mail.com" + }, + BadTlds = new List + { + "icu", "top", "click", "xyz", "mom", "quest", "work", + "shop", "rest", 
"tokyo", "pics", "zip", "com.br", "net", + "buzz", "cam", "link", "loan", "online", "site", "website" + }, + BaitKeywords = new List + { + // Financial + "invoice", "overdue", "wire", "zelle", "gift card", "payroll", + "remit", "ach", "payment", "past due", "bank transfer", + // Urgency/Action + "review & sign", "sign now", "action required", "urgent", + "verify", "confirm your", "suspended", "expire", "limited time", + // Account/System + "storage limit", "storage quota", "account", "password", + "security alert", "unusual activity", "locked", + // Messages/Notifications + "voice message", "voicemail", "fax", "document", "shared with you", + // Domain/SEO spam + "domain for sale", "premium domain", "seo", "website design", + // Cold sales + "setup request", "follow up", "checking in", "quick question" + }, + Domains = new DomainConfiguration + { + Vendors = new Dictionary(), + Trusted = new List + { + "microsoft.com", "office365.com", "google.com", "amazon.com", + "apple.com", "github.com", "linkedin.com" + } + }, + SpamScoreWeights = new SpamScoreWeights(), + QuarantineKeywords = new List + { + "quarantine summary", "spam report", "quarantine folder", + "email quarantine", "quarantined email", "spam summary", + "junk summary", "blocked messages", "held messages" + }, + VoicemailKeywords = new List + { + "voicemail", "voice message", "voice mail", "audio message", + "new voicemail", "play voicemail", "missed call", "phone message" + }, + SystemNotificationKeywords = new List + { + "verify your email", "email verification", "verify now", + "confirm your email", "account suspended", "account locked", + "storage limit", "storage quota", "mailbox full", + "password expir", "credentials expir", "unusual activity", + "security alert", "suspicious activity", "action required" + }, + ColdEmailKeywords = new List + { + "seo services", "seo affordable", "search engine optimization", + "website ranking", "google ranking", "backlinks", "link building", + "website redesign", 
"web development", "web developer", + "website design", "graphic designer", "mobile app", "app development", + "reaching out", "hope this finds you", "i came across your", + "outsource", "offshore", "dedicated team", "cost-effective" + }, + FakeServiceDomains = new List + { + "voiceservicing.net", "audios.net", "voicemail.net", + "audioservices.net", "mailservicing.net", "emailservicing.net", + "securemail.net", "mailprotect.net", "docuservices.net" + } + }; + } +} + +public sealed class DomainConfiguration +{ + public Dictionary Vendors { get; set; } = new(); + public List Trusted { get; set; } = new(); +} + +public sealed class VendorDomainInfo +{ + public int Reputation { get; set; } + public List DisplayNamePatterns { get; set; } = new(); +} + +public sealed class SpamScoreWeights +{ + public double SpfFail { get; set; } = 0.28; + public double DkimFail { get; set; } = 0.25; + public double DmarcFail { get; set; } = 0.30; + public double ReplyToDomainMismatch { get; set; } = 0.20; + public double DisplayImpersonation { get; set; } = 0.22; + public double UnicodeLookalike { get; set; } = 0.20; + public double HasUrl { get; set; } = 0.06; + public double UrlCountMultiplier { get; set; } = 0.02; + public double HasIpLink { get; set; } = 0.18; + public double UsesShortener { get; set; } = 0.12; + public double SuspiciousTld { get; set; } = 0.10; + public double HasTrackingPixel { get; set; } = 0.06; + public double HasAttachment { get; set; } = 0.06; + public double HasRiskyAttachment { get; set; } = 0.22; + public double KeywordBait { get; set; } = 0.22; + public double FreeMailboxWithUrl { get; set; } = 0.18; + public double FreeMailboxOnly { get; set; } = 0.08; + public double HasListUnsubscribe { get; set; } = -0.04; + public double ReputationMultiplier { get; set; } = 0.05; + public double UnknownDomain { get; set; } = 0.15; + public double CompanySubdomainSpoof { get; set; } = 0.45; + public double FakeQuarantineReport { get; set; } = 0.40; + public double 
HasZeroWidthChars { get; set; } = 0.35; + public double HasRandomRefId { get; set; } = 0.18; + public double HasTimestampInSubject { get; set; } = 0.15; + public double ColdEmailSolicitation { get; set; } = 0.30; + public double FakeVoicemailNotification { get; set; } = 0.42; + public double FakeSystemNotification { get; set; } = 0.38; +} diff --git a/EmailSearch/SpamDetection/SpamFeatures.cs b/EmailSearch/SpamDetection/SpamFeatures.cs new file mode 100644 index 0000000..727d560 --- /dev/null +++ b/EmailSearch/SpamDetection/SpamFeatures.cs @@ -0,0 +1,57 @@ +namespace EmailSearch.SpamDetection; + +/// +/// Contains all extracted features from an email for spam analysis. +/// +public sealed class SpamFeatures +{ + // Identity + public string DisplayName { get; set; } = ""; + public string FromAddress { get; set; } = ""; + public string FromDomain { get; set; } = ""; + + // Auth & headers + public bool SpfFail { get; set; } + public bool DkimFail { get; set; } + public bool DmarcFail { get; set; } + public bool ReplyToDomainMismatch { get; set; } + public bool HasListUnsub { get; set; } + + // Impersonation / lookalikes + public bool DisplayImpersonation { get; set; } + public bool UnicodeLookalike { get; set; } + public bool GenericSenderName { get; set; } + public bool SubjectDomainImpersonation { get; set; } + + // Links + public bool HasUrl { get; set; } + public int UrlCount { get; set; } + public bool HasIpLink { get; set; } + public bool UsesShortener { get; set; } + public bool SuspiciousTld { get; set; } + + // Sender/domain traits + public bool FreeMailboxDomain { get; set; } + public bool UnknownDomain { get; set; } + public bool IsBlocklisted { get; set; } + public int SenderReputation { get; set; } + + // Content/attachments + public bool HasTrackingPixel { get; set; } + public bool HasAttachment { get; set; } + public bool HasRiskyAttachment { get; set; } + public double AttachmentRiskScore { get; set; } + public bool KeywordBait { get; set; } + public 
bool SingleLinkOnly { get; set; }
+    public bool HasPlaceholderText { get; set; }
+
+    // Advanced patterns
+    public bool CompanySubdomainSpoof { get; set; }
+    public bool FakeQuarantineReport { get; set; }
+    public bool HasZeroWidthChars { get; set; }
+    public bool HasRandomRefId { get; set; }
+    public bool HasTimestampInSubject { get; set; }
+    public bool ColdEmailSolicitation { get; set; }
+    public bool FakeVoicemailNotification { get; set; }
+    public bool FakeSystemNotification { get; set; }
+}
diff --git a/EmailSearch/SpamDetection/UrlAnalyzer.cs b/EmailSearch/SpamDetection/UrlAnalyzer.cs
new file mode 100644
index 0000000..bbcb3f7
--- /dev/null
+++ b/EmailSearch/SpamDetection/UrlAnalyzer.cs
@@ -0,0 +1,31 @@
+namespace EmailSearch.SpamDetection;
+
+/// <summary>
+/// Host-based checks on a single URL: IP-literal hosts and known link-shortener hosts.
+/// </summary>
+internal static class UrlAnalyzer
+{
+    // Hosts of known link-shortening services. The set compares with OrdinalIgnoreCase,
+    // so lookups need neither a lower-cased copy of the host nor a culture-sensitive comparison.
+    private static readonly HashSet<string> Shorteners = new(StringComparer.OrdinalIgnoreCase)
+    {
+        "bit.ly", "tinyurl.com", "t.co", "goo.gl", "is.gd", "buff.ly",
+        "ow.ly", "rb.gy", "rebrand.ly", "cutt.ly", "soo.gd", "tiny.cc",
+        "short.io", "bl.ink", "shorte.st", "clicky.me"
+    };
+
+    /// <summary>
+    /// Determines whether the host part of <paramref name="url"/> is an IP address literal.
+    /// </summary>
+    /// <param name="url">URL text to inspect; may be null, relative or malformed.</param>
+    /// <returns>
+    /// <c>true</c> when <paramref name="url"/> is an absolute URI whose host parses as an IP address;
+    /// <c>false</c> otherwise, including for input that is not a valid absolute URI.
+    /// </returns>
+    public static bool IsIpUrl(string url)
+    {
+        // TryCreate reports null, relative and malformed input as false instead of throwing,
+        // so no catch-all handler is needed for attacker-supplied text.
+        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri)) return false;
+
+        return System.Net.IPAddress.TryParse(uri.Host, out _);
+    }
+
+    /// <summary>
+    /// Determines whether <paramref name="url"/> points at a known link shortener or one of its subdomains.
+    /// </summary>
+    /// <param name="url">URL text to inspect; may be null, relative or malformed.</param>
+    /// <returns>
+    /// <c>true</c> when the host equals a shortener domain or ends with "." followed by one
+    /// (case-insensitive); <c>false</c> otherwise, including for input that is not a valid absolute URI.
+    /// </returns>
+    public static bool IsShortener(string url)
+    {
+        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri)) return false;
+
+        // Test the host itself, then each parent domain ("a.b.bit.ly" -> "b.bit.ly" -> "bit.ly" -> "ly").
+        // Splitting only at dots means subdomains of a shortener match, while a look-alike host
+        // such as "notbit.ly" does not.
+        var candidate = uri.Host;
+        while (true)
+        {
+            if (Shorteners.Contains(candidate)) return true;
+
+            var dot = candidate.IndexOf('.');
+            if (dot < 0) return false;
+            candidate = candidate.Substring(dot + 1);
+        }
+    }
+}
diff --git a/EmailSearch/SpamDetectorConfig.json b/EmailSearch/SpamDetectorConfig.json new file mode 100644 index 0000000..de8d8af --- /dev/null +++ b/EmailSearch/SpamDetectorConfig.json @@ -0,0 +1,94 @@ +{ + "freeDomains": [ + "gmail.com", "outlook.com", "hotmail.com", "yahoo.com", + "icloud.com", "aol.com", "proton.me", "protonmail.com", + "live.com", "msn.com", "ymail.com", "mail.com" + ], + "badTlds": [ + "icu", "top", "click", "xyz", "mom", "quest", "work", + "shop", "rest", "tokyo", "pics", "zip", "com.br", + "buzz", "cam", "link", "loan", "online", "site", "website" + ], + "baitKeywords": [ + "invoice", "overdue", "wire", "zelle", "gift card", "payroll", + "remit", "ach", "payment", "past due", "bank transfer", + "review & sign", "sign now", "action required", "urgent", + "verify", "confirm your", "suspended", "expire", "limited time", + "storage limit", "storage quota", "account", "password", + "security alert", "unusual activity", "locked", + "voice message", "voicemail", "fax", "document", "shared with you", + "domain for sale", "premium domain", "seo", "website design", + "setup request", "follow up", "checking in", "quick question" + ], + "domains": { + "vendors": { + "example-vendor.com": { + "reputation": 5, + "displayNamePatterns": ["example", "vendor"] + } + }, + "trusted": [ + "microsoft.com", "office365.com", "google.com", "amazon.com", + "apple.com", "github.com", "linkedin.com" + ] + }, + "quarantineKeywords": [ + "quarantine summary", "spam report", "quarantine folder", + "email quarantine", "quarantined email", "spam summary", + "junk summary", "blocked messages", "held messages" + ], + "voicemailKeywords": [ + "voicemail", "voice message", "voice mail", "audio message", + "new voicemail", "play voicemail", "missed call", "phone message" + ], + "systemNotificationKeywords": [ + "verify your email", "email verification", "verify now", + "confirm your email", "account suspended", "account locked", +
"storage limit", "storage quota", "mailbox full", + "password expir", "credentials expir", "unusual activity", + "security alert", "suspicious activity", "action required" + ], + "coldEmailKeywords": [ + "seo services", "seo affordable", "search engine optimization", + "website ranking", "google ranking", "backlinks", "link building", + "website redesign", "web development", "web developer", + "website design", "graphic designer", "mobile app", "app development", + "reaching out", "hope this finds you", "i came across your", + "outsource", "offshore", "dedicated team", "cost-effective" + ], + "fakeServiceDomains": [ + "voiceservicing.net", "audios.net", "voicemail.net", + "audioservices.net", "mailservicing.net", "emailservicing.net", + "securemail.net", "mailprotect.net", "docuservices.net" + ], + "spamScoreWeights": { + "spfFail": 0.28, + "dkimFail": 0.25, + "dmarcFail": 0.30, + "replyToDomainMismatch": 0.20, + "displayImpersonation": 0.22, + "unicodeLookalike": 0.20, + "hasUrl": 0.06, + "urlCountMultiplier": 0.02, + "hasIpLink": 0.18, + "usesShortener": 0.12, + "suspiciousTld": 0.10, + "hasTrackingPixel": 0.06, + "hasAttachment": 0.06, + "hasRiskyAttachment": 0.22, + "keywordBait": 0.22, + "freeMailboxWithUrl": 0.18, + "freeMailboxOnly": 0.08, + "hasListUnsubscribe": -0.04, + "reputationMultiplier": 0.05, + "unknownDomain": 0.15, + "companySubdomainSpoof": 0.45, + "fakeQuarantineReport": 0.40, + "hasZeroWidthChars": 0.35, + "hasRandomRefId": 0.18, + "hasTimestampInSubject": 0.15, + "coldEmailSolicitation": 0.30, + "fakeVoicemailNotification": 0.42, + "fakeSystemNotification": 0.38 + } +}