mirror of
https://github.com/rowboatlabs/rowboat.git
synced 2026-06-24 20:28:16 +02:00
filter out random contacts
This commit is contained in:
parent
851c3be69f
commit
dfd4075e0e
3 changed files with 119 additions and 119 deletions
114
apps/x/packages/core/src/knowledge/contact_filters.ts
Normal file
114
apps/x/packages/core/src/knowledge/contact_filters.ts
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
// Local-part aliases that are almost always automated/role addresses you don't
|
||||
// compose a fresh message to. Matched as a whole segment of the local part
|
||||
// (segments split on . _ - +).
|
||||
const AUTOMATED_LOCAL_PARTS = new Set([
|
||||
'noreply', 'no-reply', 'donotreply', 'do-not-reply', 'reply',
|
||||
'notifications', 'notification', 'notify',
|
||||
'alerts', 'alert', 'updates', 'update',
|
||||
'news', 'newsletter', 'newsletters',
|
||||
'info', 'information', 'hello', 'hi', 'hey',
|
||||
'welcome', 'onboarding', 'getstarted',
|
||||
'team', 'marketing', 'promo', 'promos', 'promotions',
|
||||
'offer', 'offers', 'deals', 'deal',
|
||||
'accounts', 'account', 'billing', 'invoices', 'statements', 'statement',
|
||||
'learn', 'learning', 'courses',
|
||||
'mailer-daemon', 'mailerdaemon', 'postmaster', 'bounce', 'bounces',
|
||||
'automated', 'auto', 'autoconfirm',
|
||||
'support-bot', 'noticeboard', 'system',
|
||||
'contact', 'connect',
|
||||
'sender', 'broadcast', 'digest', 'campaign', 'campaigns',
|
||||
'support', 'service', 'help', 'helpdesk', 'feedback',
|
||||
'mailer', 'mailers', 'members', 'membership',
|
||||
'careers', 'jobs', 'recruit', 'recruiting',
|
||||
'tickets', 'orders', 'order', 'receipts', 'receipt',
|
||||
'applications', 'apply', 'admissions',
|
||||
'health', 'security', 'auth',
|
||||
]);
|
||||
|
||||
// Subdomain labels that flag a bulk/marketing infrastructure domain.
|
||||
const AUTOMATED_SUBDOMAIN_LABELS = new Set([
|
||||
'mail', 'mailer', 'mailers', 'mailing', 'mailgun', 'sendgrid', 'mta',
|
||||
'email', 'em', 'e', 'm',
|
||||
'news', 'newsletter', 'newsletters',
|
||||
'marketing', 'mkt', 'promo', 'promos', 'offers',
|
||||
'event', 'events', 'ecomm', 'commerce',
|
||||
'notifications', 'notification', 'notify', 'alerts', 'alert', 'updates',
|
||||
'messaging', 'message', 'msg',
|
||||
'noreply', 'donotreply',
|
||||
'creators', 'partners', 'team',
|
||||
'info', 'welcome', 'hi', 'hello',
|
||||
'bounces', 'bounce',
|
||||
'reply', 'user', 'usr', 'auto',
|
||||
]);
|
||||
|
||||
// Specific bulk-mail provider domains (substring match on full domain).
|
||||
const AUTOMATED_DOMAIN_KEYWORDS = [
|
||||
'facebookmail', 'kajabimail', 'substack', 'mailgun', 'sendgrid',
|
||||
'mcsv.net', 'mailchimp', 'mailerlite', 'createsend', 'cmail',
|
||||
'amazonses', 'sparkpost', 'sendinblue', 'brevo',
|
||||
'luma-mail', 'lumamail',
|
||||
'umusic-online', 'icloud-mail',
|
||||
];
|
||||
|
||||
function localSegments(local: string): string[] {
|
||||
return local.toLowerCase().split(/[._\-+]/).filter(Boolean);
|
||||
}
|
||||
|
||||
export function isAutomatedAddress(email: string): boolean {
|
||||
if (!email) return true;
|
||||
const at = email.indexOf('@');
|
||||
if (at < 0) return true;
|
||||
const local = email.slice(0, at).toLowerCase();
|
||||
const domain = email.slice(at + 1).toLowerCase();
|
||||
|
||||
// Plus-aliased reply bots: `reply+abc123@...`
|
||||
if (/^reply\+/i.test(local)) return true;
|
||||
|
||||
// Encoded VERP/list aliases, e.g. long-token-arjun=rowboat...@domain.
|
||||
if (local.includes('=') && /^[a-z0-9]{16,}[-+].*=/.test(local)) return true;
|
||||
|
||||
const segs = localSegments(local);
|
||||
for (const s of segs) {
|
||||
if (AUTOMATED_LOCAL_PARTS.has(s)) return true;
|
||||
}
|
||||
|
||||
if (/(no.?reply|do.?not.?reply|notifications?|news.?letter|mailer.?daemon|postmaster|automated|broadcast|statement)/i.test(local)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (local.length >= 20 && /^[a-z0-9=._\-+]+$/.test(local) && /[0-9]/.test(local)) {
|
||||
const digits = (local.match(/[0-9]/g) || []).length;
|
||||
const letters = (local.match(/[a-z]/g) || []).length;
|
||||
if (digits / local.length >= 0.2 || (digits >= 3 && letters >= 12 && !local.includes('.'))) return true;
|
||||
}
|
||||
|
||||
const labels = domain.split('.');
|
||||
if (labels.length >= 3) {
|
||||
const subs = labels.slice(0, -2);
|
||||
for (const label of subs) {
|
||||
if (AUTOMATED_SUBDOMAIN_LABELS.has(label)) return true;
|
||||
}
|
||||
}
|
||||
|
||||
for (const kw of AUTOMATED_DOMAIN_KEYWORDS) {
|
||||
if (domain.includes(kw)) return true;
|
||||
}
|
||||
|
||||
if (/(^|\.)(mailers?|mailer|mailgun|sendgrid|mailchimp|mailerlite|bounces?|marketing|promo|notifications?|newsletter)(\.|$)/i.test(domain)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const sld = labels[labels.length - 1];
|
||||
if (['email', 'mail', 'marketing', 'promo', 'news', 'newsletter', 'click', 'link'].includes(sld)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Brand-identity addresses like `uber@uber.com`, `lenovo@lenovo.com` -
|
||||
// local part equals the first label of the domain. Almost always a
|
||||
// transactional/marketing sender.
|
||||
if (labels.length >= 2 && local === labels[0]) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
|
@ -4,6 +4,7 @@ import path from 'path';
|
|||
import { WorkDir } from '../config/config.js';
|
||||
import type { GmailThreadSnapshot } from './sync_gmail.js';
|
||||
import { getAccountEmail } from './sync_gmail.js';
|
||||
import { isAutomatedAddress } from './contact_filters.js';
|
||||
|
||||
const CACHE_DIR = path.join(WorkDir, 'inbox_lists');
|
||||
const INDEX_TTL_MS = 5 * 60 * 1000;
|
||||
|
|
@ -62,125 +63,6 @@ function parseAddressList(header: string): Array<{ name: string; email: string }
|
|||
return result;
|
||||
}
|
||||
|
||||
// Local-part aliases that are almost always automated/role addresses you don't
|
||||
// compose a fresh message to. Matched as a whole segment of the local part
|
||||
// (segments split on . _ - +).
|
||||
const AUTOMATED_LOCAL_PARTS = new Set([
|
||||
'noreply', 'no-reply', 'donotreply', 'do-not-reply', 'reply',
|
||||
'notifications', 'notification', 'notify',
|
||||
'alerts', 'alert', 'updates', 'update',
|
||||
'news', 'newsletter', 'newsletters',
|
||||
'info', 'information', 'hello', 'hi', 'hey',
|
||||
'welcome', 'onboarding', 'getstarted',
|
||||
'team', 'marketing', 'promo', 'promos', 'promotions',
|
||||
'offer', 'offers', 'deals', 'deal',
|
||||
'accounts', 'account', 'billing', 'invoices', 'statements', 'statement',
|
||||
'learn', 'learning', 'courses',
|
||||
'mailer-daemon', 'mailerdaemon', 'postmaster', 'bounce', 'bounces',
|
||||
'automated', 'auto', 'autoconfirm',
|
||||
'support-bot', 'noticeboard', 'system',
|
||||
'contact', 'connect',
|
||||
'sender', 'broadcast', 'digest', 'campaign', 'campaigns',
|
||||
'support', 'service', 'help', 'helpdesk', 'feedback',
|
||||
'mailer', 'mailers', 'members', 'membership',
|
||||
'careers', 'jobs', 'recruit', 'recruiting',
|
||||
'tickets', 'orders', 'order', 'receipts', 'receipt',
|
||||
'applications', 'apply', 'admissions',
|
||||
'health', 'security', 'auth',
|
||||
]);
|
||||
|
||||
// Subdomain labels that flag a bulk/marketing infrastructure domain.
|
||||
const AUTOMATED_SUBDOMAIN_LABELS = new Set([
|
||||
'mail', 'mailer', 'mailers', 'mailing', 'mailgun', 'sendgrid', 'mta',
|
||||
'email', 'em', 'e', 'm',
|
||||
'news', 'newsletter', 'newsletters',
|
||||
'marketing', 'mkt', 'promo', 'promos', 'offers',
|
||||
'event', 'events', 'ecomm', 'commerce',
|
||||
'notifications', 'notification', 'notify', 'alerts', 'alert', 'updates',
|
||||
'messaging', 'message', 'msg',
|
||||
'noreply', 'donotreply',
|
||||
'creators', 'partners', 'team',
|
||||
'info', 'welcome', 'hi', 'hello',
|
||||
'bounces', 'bounce',
|
||||
'reply', 'user', 'usr', 'auto',
|
||||
]);
|
||||
|
||||
// Specific bulk-mail provider domains (substring match on full domain).
|
||||
const AUTOMATED_DOMAIN_KEYWORDS = [
|
||||
'facebookmail', 'kajabimail', 'substack', 'mailgun', 'sendgrid',
|
||||
'mcsv.net', 'mailchimp', 'mailerlite', 'createsend', 'cmail',
|
||||
'amazonses', 'sparkpost', 'sendinblue', 'brevo',
|
||||
'luma-mail', 'lumamail',
|
||||
'umusic-online', 'icloud-mail',
|
||||
];
|
||||
|
||||
function localSegments(local: string): string[] {
|
||||
return local.toLowerCase().split(/[._\-+]/).filter(Boolean);
|
||||
}
|
||||
|
||||
function isAutomatedAddress(email: string): boolean {
|
||||
if (!email) return true;
|
||||
const at = email.indexOf('@');
|
||||
if (at < 0) return true;
|
||||
const local = email.slice(0, at).toLowerCase();
|
||||
const domain = email.slice(at + 1).toLowerCase();
|
||||
|
||||
// Plus-aliased reply bots: `reply+abc123@…`
|
||||
if (/^reply\+/i.test(local)) return true;
|
||||
|
||||
// Whole-segment local-part matches.
|
||||
const segs = localSegments(local);
|
||||
for (const s of segs) {
|
||||
if (AUTOMATED_LOCAL_PARTS.has(s)) return true;
|
||||
}
|
||||
// Some senders pack noise into the local part with no separators
|
||||
// (e.g. `hdfcbanksmartstatement`). Catch the common ones.
|
||||
if (/(no.?reply|do.?not.?reply|notifications?|news.?letter|mailer.?daemon|postmaster|automated|broadcast|statement)/i.test(local)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Random-looking machine local parts: long, mostly hex/base32-ish.
|
||||
if (local.length >= 20 && /^[a-z0-9]+(-[a-z0-9]+)*$/.test(local) && /[0-9]/.test(local)) {
|
||||
const digits = (local.match(/[0-9]/g) || []).length;
|
||||
if (digits / local.length >= 0.25) return true;
|
||||
}
|
||||
|
||||
// Subdomain-label check (everything except the registrable last two labels).
|
||||
const labels = domain.split('.');
|
||||
if (labels.length >= 3) {
|
||||
const subs = labels.slice(0, -2);
|
||||
for (const label of subs) {
|
||||
if (AUTOMATED_SUBDOMAIN_LABELS.has(label)) return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Provider keyword anywhere in the domain.
|
||||
for (const kw of AUTOMATED_DOMAIN_KEYWORDS) {
|
||||
if (domain.includes(kw)) return true;
|
||||
}
|
||||
|
||||
// Domain itself contains tell-tale tokens.
|
||||
if (/(^|\.)(mailers?|mailer|mailgun|sendgrid|mailchimp|mailerlite|bounces?|marketing|promo|notifications?|newsletter)(\.|$)/i.test(domain)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Marketing-style TLD / second-level domain (e.g. bookmyshow.email,
|
||||
// foo.marketing, bar.news). These domains exist almost exclusively for bulk.
|
||||
const sld = labels[labels.length - 1];
|
||||
if (['email', 'mail', 'marketing', 'promo', 'news', 'newsletter', 'click', 'link'].includes(sld)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Brand-identity addresses like `uber@uber.com`, `lenovo@lenovo.com` —
|
||||
// local part equals the first label of the domain. Almost always a
|
||||
// transactional/marketing sender.
|
||||
if (labels.length >= 2 && local === labels[0]) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
function ingestSnapshot(snapshot: GmailThreadSnapshot, selfEmail: string, map: Map<string, IndexEntry>): void {
|
||||
if (!snapshot?.messages) return;
|
||||
for (const msg of snapshot.messages) {
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ import { OAuth2Client } from 'google-auth-library';
|
|||
import { WorkDir } from '../config/config.js';
|
||||
import { GoogleClientFactory } from './google-client-factory.js';
|
||||
import { getUserEmail } from './classify_thread.js';
|
||||
import { isAutomatedAddress } from './contact_filters.js';
|
||||
|
||||
const STATE_FILE = path.join(WorkDir, 'contacts_sent.json');
|
||||
const RECENCY_HALFLIFE_DAYS = 60;
|
||||
|
|
@ -104,6 +105,7 @@ async function saveState(state: StoredState): Promise<void> {
|
|||
function indexFromStored(state: StoredState): Map<string, IndexEntry> {
|
||||
const map = new Map<string, IndexEntry>();
|
||||
for (const e of state.entries) {
|
||||
if (isAutomatedAddress(e.email)) continue;
|
||||
map.set(e.email, {
|
||||
name: e.name,
|
||||
email: e.email,
|
||||
|
|
@ -167,6 +169,7 @@ async function ingestMessage(
|
|||
];
|
||||
for (const { name, email } of recipients) {
|
||||
if (!email || email === selfEmail) continue;
|
||||
if (isAutomatedAddress(email)) continue;
|
||||
let entry = map.get(email);
|
||||
if (!entry) {
|
||||
entry = { name, email, count: 0, lastSeenMs: 0, nameCounts: new Map() };
|
||||
|
|
@ -374,6 +377,7 @@ export async function searchSentContacts(query: string, opts: SearchOpts = {}):
|
|||
const matches: Array<{ entry: IndexEntry; tier: number; s: number }> = [];
|
||||
for (const entry of cachedIndex.values()) {
|
||||
if (excluded.has(entry.email)) continue;
|
||||
if (isAutomatedAddress(entry.email)) continue;
|
||||
const tier = matchTier(q, entry);
|
||||
if (tier < 0) continue;
|
||||
matches.push({ entry, tier, s: score(entry, nowMs) });
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue