From dfd4075e0e0e3abc358118dfa0f1ff10d74e1712 Mon Sep 17 00:00:00 2001 From: Arjun <6592213+arkml@users.noreply.github.com> Date: Fri, 19 Jun 2026 14:59:45 +0530 Subject: [PATCH] filter out random contacts --- .../core/src/knowledge/contact_filters.ts | 114 +++++++++++++++++ .../core/src/knowledge/gmail_contacts.ts | 120 +----------------- .../core/src/knowledge/gmail_sent_contacts.ts | 4 + 3 files changed, 119 insertions(+), 119 deletions(-) create mode 100644 apps/x/packages/core/src/knowledge/contact_filters.ts diff --git a/apps/x/packages/core/src/knowledge/contact_filters.ts b/apps/x/packages/core/src/knowledge/contact_filters.ts new file mode 100644 index 00000000..c7c27c1f --- /dev/null +++ b/apps/x/packages/core/src/knowledge/contact_filters.ts @@ -0,0 +1,114 @@ +// Local-part aliases that are almost always automated/role addresses you don't +// compose a fresh message to. Matched as a whole segment of the local part +// (segments split on . _ - +). +const AUTOMATED_LOCAL_PARTS = new Set([ + 'noreply', 'no-reply', 'donotreply', 'do-not-reply', 'reply', + 'notifications', 'notification', 'notify', + 'alerts', 'alert', 'updates', 'update', + 'news', 'newsletter', 'newsletters', + 'info', 'information', 'hello', 'hi', 'hey', + 'welcome', 'onboarding', 'getstarted', + 'team', 'marketing', 'promo', 'promos', 'promotions', + 'offer', 'offers', 'deals', 'deal', + 'accounts', 'account', 'billing', 'invoices', 'statements', 'statement', + 'learn', 'learning', 'courses', + 'mailer-daemon', 'mailerdaemon', 'postmaster', 'bounce', 'bounces', + 'automated', 'auto', 'autoconfirm', + 'support-bot', 'noticeboard', 'system', + 'contact', 'connect', + 'sender', 'broadcast', 'digest', 'campaign', 'campaigns', + 'support', 'service', 'help', 'helpdesk', 'feedback', + 'mailer', 'mailers', 'members', 'membership', + 'careers', 'jobs', 'recruit', 'recruiting', + 'tickets', 'orders', 'order', 'receipts', 'receipt', + 'applications', 'apply', 'admissions', + 'health', 'security', 'auth', +]); + +// Subdomain labels that flag a bulk/marketing infrastructure domain. +const AUTOMATED_SUBDOMAIN_LABELS = new Set([ + 'mail', 'mailer', 'mailers', 'mailing', 'mailgun', 'sendgrid', 'mta', + 'email', 'em', 'e', 'm', + 'news', 'newsletter', 'newsletters', + 'marketing', 'mkt', 'promo', 'promos', 'offers', + 'event', 'events', 'ecomm', 'commerce', + 'notifications', 'notification', 'notify', 'alerts', 'alert', 'updates', + 'messaging', 'message', 'msg', + 'noreply', 'donotreply', + 'creators', 'partners', 'team', + 'info', 'welcome', 'hi', 'hello', + 'bounces', 'bounce', + 'reply', 'user', 'usr', 'auto', +]); + +// Specific bulk-mail provider domains (substring match on full domain). +const AUTOMATED_DOMAIN_KEYWORDS = [ + 'facebookmail', 'kajabimail', 'substack', 'mailgun', 'sendgrid', + 'mcsv.net', 'mailchimp', 'mailerlite', 'createsend', 'cmail', + 'amazonses', 'sparkpost', 'sendinblue', 'brevo', + 'luma-mail', 'lumamail', + 'umusic-online', 'icloud-mail', +]; + +function localSegments(local: string): string[] { + return local.toLowerCase().split(/[._\-+]/).filter(Boolean); +} + +export function isAutomatedAddress(email: string): boolean { + if (!email) return true; + const at = email.indexOf('@'); + if (at < 0) return true; + const local = email.slice(0, at).toLowerCase(); + const domain = email.slice(at + 1).toLowerCase(); + + // Plus-aliased reply bots: `reply+abc123@...` + if (/^reply\+/i.test(local)) return true; + + // Encoded VERP/list aliases, e.g. long-token-arjun=rowboat...@domain. + if (local.includes('=') && /^[a-z0-9]{16,}[-+].*=/.test(local)) return true; + + const segs = localSegments(local); + for (const s of segs) { + if (AUTOMATED_LOCAL_PARTS.has(s)) return true; + } + + if (/(no.?reply|do.?not.?reply|notifications?|news.?letter|mailer.?daemon|postmaster|automated|broadcast|statement)/i.test(local)) { + return true; + } + + if (local.length >= 20 && /^[a-z0-9=._\-+]+$/.test(local) && /[0-9]/.test(local)) { + const digits = (local.match(/[0-9]/g) || []).length; + const letters = (local.match(/[a-z]/g) || []).length; + if (digits / local.length >= 0.2 || (digits >= 3 && letters >= 12 && !local.includes('.'))) return true; + } + + const labels = domain.split('.'); + if (labels.length >= 3) { + const subs = labels.slice(0, -2); + for (const label of subs) { + if (AUTOMATED_SUBDOMAIN_LABELS.has(label)) return true; + } + } + + for (const kw of AUTOMATED_DOMAIN_KEYWORDS) { + if (domain.includes(kw)) return true; + } + + if (/(^|\.)(mailers?|mailer|mailgun|sendgrid|mailchimp|mailerlite|bounces?|marketing|promo|notifications?|newsletter)(\.|$)/i.test(domain)) { + return true; + } + + const sld = labels[labels.length - 1]; + if (['email', 'mail', 'marketing', 'promo', 'news', 'newsletter', 'click', 'link'].includes(sld)) { + return true; + } + + // Brand-identity addresses like `uber@uber.com`, `lenovo@lenovo.com` - + // local part equals the first label of the domain. Almost always a + // transactional/marketing sender. + if (labels.length >= 2 && local === labels[0]) { + return true; + } + + return false; +} diff --git a/apps/x/packages/core/src/knowledge/gmail_contacts.ts b/apps/x/packages/core/src/knowledge/gmail_contacts.ts index 30842d4b..ba3a0fcf 100644 --- a/apps/x/packages/core/src/knowledge/gmail_contacts.ts +++ b/apps/x/packages/core/src/knowledge/gmail_contacts.ts @@ -4,6 +4,7 @@ import path from 'path'; import { WorkDir } from '../config/config.js'; import type { GmailThreadSnapshot } from './sync_gmail.js'; import { getAccountEmail } from './sync_gmail.js'; +import { isAutomatedAddress } from './contact_filters.js'; const CACHE_DIR = path.join(WorkDir, 'inbox_lists'); const INDEX_TTL_MS = 5 * 60 * 1000; @@ -62,125 +63,6 @@ function parseAddressList(header: string): Array<{ name: string; email: string } return result; } -// Local-part aliases that are almost always automated/role addresses you don't -// compose a fresh message to. Matched as a whole segment of the local part -// (segments split on . _ - +). -const AUTOMATED_LOCAL_PARTS = new Set([ - 'noreply', 'no-reply', 'donotreply', 'do-not-reply', 'reply', - 'notifications', 'notification', 'notify', - 'alerts', 'alert', 'updates', 'update', - 'news', 'newsletter', 'newsletters', - 'info', 'information', 'hello', 'hi', 'hey', - 'welcome', 'onboarding', 'getstarted', - 'team', 'marketing', 'promo', 'promos', 'promotions', - 'offer', 'offers', 'deals', 'deal', - 'accounts', 'account', 'billing', 'invoices', 'statements', 'statement', - 'learn', 'learning', 'courses', - 'mailer-daemon', 'mailerdaemon', 'postmaster', 'bounce', 'bounces', - 'automated', 'auto', 'autoconfirm', - 'support-bot', 'noticeboard', 'system', - 'contact', 'connect', - 'sender', 'broadcast', 'digest', 'campaign', 'campaigns', - 'support', 'service', 'help', 'helpdesk', 'feedback', - 'mailer', 'mailers', 'members', 'membership', - 'careers', 'jobs', 'recruit', 'recruiting', - 'tickets', 'orders', 'order', 'receipts', 'receipt', - 'applications', 'apply', 'admissions', - 'health', 'security', 'auth', -]); - -// Subdomain labels that flag a bulk/marketing infrastructure domain. -const AUTOMATED_SUBDOMAIN_LABELS = new Set([ - 'mail', 'mailer', 'mailers', 'mailing', 'mailgun', 'sendgrid', 'mta', - 'email', 'em', 'e', 'm', - 'news', 'newsletter', 'newsletters', - 'marketing', 'mkt', 'promo', 'promos', 'offers', - 'event', 'events', 'ecomm', 'commerce', - 'notifications', 'notification', 'notify', 'alerts', 'alert', 'updates', - 'messaging', 'message', 'msg', - 'noreply', 'donotreply', - 'creators', 'partners', 'team', - 'info', 'welcome', 'hi', 'hello', - 'bounces', 'bounce', - 'reply', 'user', 'usr', 'auto', -]); - -// Specific bulk-mail provider domains (substring match on full domain). -const AUTOMATED_DOMAIN_KEYWORDS = [ - 'facebookmail', 'kajabimail', 'substack', 'mailgun', 'sendgrid', - 'mcsv.net', 'mailchimp', 'mailerlite', 'createsend', 'cmail', - 'amazonses', 'sparkpost', 'sendinblue', 'brevo', - 'luma-mail', 'lumamail', - 'umusic-online', 'icloud-mail', -]; - -function localSegments(local: string): string[] { - return local.toLowerCase().split(/[._\-+]/).filter(Boolean); -} - -function isAutomatedAddress(email: string): boolean { - if (!email) return true; - const at = email.indexOf('@'); - if (at < 0) return true; - const local = email.slice(0, at).toLowerCase(); - const domain = email.slice(at + 1).toLowerCase(); - - // Plus-aliased reply bots: `reply+abc123@…` - if (/^reply\+/i.test(local)) return true; - - // Whole-segment local-part matches. - const segs = localSegments(local); - for (const s of segs) { - if (AUTOMATED_LOCAL_PARTS.has(s)) return true; - } - // Some senders pack noise into the local part with no separators - // (e.g. `hdfcbanksmartstatement`). Catch the common ones. - if (/(no.?reply|do.?not.?reply|notifications?|news.?letter|mailer.?daemon|postmaster|automated|broadcast|statement)/i.test(local)) { - return true; - } - - // Random-looking machine local parts: long, mostly hex/base32-ish. - if (local.length >= 20 && /^[a-z0-9]+(-[a-z0-9]+)*$/.test(local) && /[0-9]/.test(local)) { - const digits = (local.match(/[0-9]/g) || []).length; - if (digits / local.length >= 0.25) return true; - } - - // Subdomain-label check (everything except the registrable last two labels). - const labels = domain.split('.'); - if (labels.length >= 3) { - const subs = labels.slice(0, -2); - for (const label of subs) { - if (AUTOMATED_SUBDOMAIN_LABELS.has(label)) return true; - } - } - - // Provider keyword anywhere in the domain. - for (const kw of AUTOMATED_DOMAIN_KEYWORDS) { - if (domain.includes(kw)) return true; - } - - // Domain itself contains tell-tale tokens. - if (/(^|\.)(mailers?|mailer|mailgun|sendgrid|mailchimp|mailerlite|bounces?|marketing|promo|notifications?|newsletter)(\.|$)/i.test(domain)) { - return true; - } - - // Marketing-style TLD / second-level domain (e.g. bookmyshow.email, - // foo.marketing, bar.news). These domains exist almost exclusively for bulk. - const sld = labels[labels.length - 1]; - if (['email', 'mail', 'marketing', 'promo', 'news', 'newsletter', 'click', 'link'].includes(sld)) { - return true; - } - - // Brand-identity addresses like `uber@uber.com`, `lenovo@lenovo.com` — - // local part equals the first label of the domain. Almost always a - // transactional/marketing sender. - if (labels.length >= 2 && local === labels[0]) { - return true; - } - - return false; -} - function ingestSnapshot(snapshot: GmailThreadSnapshot, selfEmail: string, map: Map): void { if (!snapshot?.messages) return; for (const msg of snapshot.messages) { diff --git a/apps/x/packages/core/src/knowledge/gmail_sent_contacts.ts b/apps/x/packages/core/src/knowledge/gmail_sent_contacts.ts index 15ccf65e..3738dc8b 100644 --- a/apps/x/packages/core/src/knowledge/gmail_sent_contacts.ts +++ b/apps/x/packages/core/src/knowledge/gmail_sent_contacts.ts @@ -6,6 +6,7 @@ import { OAuth2Client } from 'google-auth-library'; import { WorkDir } from '../config/config.js'; import { GoogleClientFactory } from './google-client-factory.js'; import { getUserEmail } from './classify_thread.js'; +import { isAutomatedAddress } from './contact_filters.js'; const STATE_FILE = path.join(WorkDir, 'contacts_sent.json'); const RECENCY_HALFLIFE_DAYS = 60; @@ -104,6 +105,7 @@ async function saveState(state: StoredState): Promise { function indexFromStored(state: StoredState): Map { const map = new Map(); for (const e of state.entries) { + if (isAutomatedAddress(e.email)) continue; map.set(e.email, { name: e.name, email: e.email, @@ -167,6 +169,7 @@ async function ingestMessage( ]; for (const { name, email } of recipients) { if (!email || email === selfEmail) continue; + if (isAutomatedAddress(email)) continue; let entry = map.get(email); if (!entry) { entry = { name, email, count: 0, lastSeenMs: 0, nameCounts: new Map() }; @@ -374,6 +377,7 @@ export async function searchSentContacts(query: string, opts: SearchOpts = {}): const matches: Array<{ entry: IndexEntry; tier: number; s: number }> = []; for (const entry of cachedIndex.values()) { if (excluded.has(entry.email)) continue; + if (isAutomatedAddress(entry.email)) continue; const tier = matchTier(q, entry); if (tier < 0) continue; matches.push({ entry, tier, s: score(entry, nowMs) });