/**
 * Collect the raw entries listed under every `filter:` key in a frontmatter block.
 *
 * Three YAML spellings are recognised:
 *   - block sequence:  `filter:` followed by `- tag` lines (any indentation, including none)
 *   - flow sequence:   `filter: ['a', "b"]` (may span several lines)
 *   - plain scalar:    `filter: tag`
 *
 * The key must be the first token on its line (leading indentation is allowed so a
 * nested `filter:` key is still found). Occurrences of the text "filter:" inside
 * another value, or keys that merely end in "filter", are ignored.
 *
 * @param frontmatterLines Lines between the opening and closing `---` delimiters.
 * @returns Raw, un-normalised entry strings (quotes and comments still attached).
 */
function collectFrontmatterFilterEntries(frontmatterLines: readonly string[]): string[] {
    const keyPattern = /^[ \t]*filter:[ \t]*(.*)$/;
    const itemPattern = /^[ \t]*-[ \t]+(.+)$/;
    const entries: string[] = [];

    for (let i = 0; i < frontmatterLines.length; i++) {
        const keyMatch = keyPattern.exec(frontmatterLines[i] ?? '');
        if (!keyMatch) continue;

        const rest = (keyMatch[1] ?? '').trim();

        if (rest.startsWith('[')) {
            // Flow sequence: keep appending lines until the closing bracket shows up.
            let flow = rest;
            for (let j = i + 1; !flow.includes(']') && j < frontmatterLines.length; j++) {
                flow += ` ${frontmatterLines[j] ?? ''}`;
            }
            const close = flow.indexOf(']');
            if (close !== -1) {
                entries.push(...flow.slice(1, close).split(','));
            }
        } else if (rest === '' || rest.startsWith('#')) {
            // Block sequence: consume consecutive "- item" lines, tolerating blank lines.
            for (let j = i + 1; j < frontmatterLines.length; j++) {
                const line = frontmatterLines[j] ?? '';
                if (line.trim() === '') continue;
                const item = itemPattern.exec(line);
                if (!item) break; // next key (or anything else) ends the list
                entries.push(item[1] ?? '');
            }
        } else {
            // Plain scalar on the same line as the key.
            entries.push(rest);
        }
    }

    return entries;
}

/**
 * Decide whether a labeled markdown file carries a noise tag in its YAML frontmatter.
 *
 * The frontmatter is the text between a leading `---` and the first later line that
 * consists solely of `---`. Every entry found under a `filter:` key is stripped of
 * quotes and trailing `# comments`, then compared against the tags whose definition
 * (from `getTagDefinitions()`) has `type === 'noise'`.
 *
 * @param content Full text of the markdown file.
 * @returns `true` if at least one `filter` entry is a noise tag; `false` when there is
 *          no frontmatter, no closing delimiter, no filter entries, or no noise match.
 */
function hasNoiseLabels(content: string): boolean {
    if (!content.startsWith('---')) return false;

    // The closing delimiter must sit on its own line; a "---" embedded in a value
    // (for example inside a subject line) must not terminate the frontmatter early.
    const closing = /\r?\n---[ \t]*(?=\r?\n|$)/.exec(content);
    if (!closing) return false;

    const frontmatterLines = content.slice(0, closing.index).split(/\r?\n/).slice(1);

    const tags = collectFrontmatterFilterEntries(frontmatterLines)
        .map(entry => entry.replace(/\s+#.*$/, '').replace(/['"]/g, '').trim())
        .filter(tag => tag.length > 0);

    // Nothing listed under `filter` — skip the tag-definition lookup entirely.
    if (tags.length === 0) return false;

    const noiseTags = new Set(
        getTagDefinitions()
            .filter(t => t.type === 'noise')
            .map(t => t.tag)
    );

    return tags.some(tag => noiseTags.has(tag));
}
return false; + } + return true; } catch { return false; } }); + saveState(state); } if (filesToProcess.length > 0) { diff --git a/apps/x/packages/core/src/knowledge/labeling_agent.ts b/apps/x/packages/core/src/knowledge/labeling_agent.ts index d28649b1..e9568676 100644 --- a/apps/x/packages/core/src/knowledge/labeling_agent.ts +++ b/apps/x/packages/core/src/knowledge/labeling_agent.ts @@ -57,7 +57,7 @@ ${renderTagSystemForEmails()} # Instructions 1. For each email file provided in the message, read its content carefully. -2. Classify the email using the taxonomy above. Think like a **YC startup founder** triaging their inbox — your time is your scarcest resource: +2. Classify the email using the taxonomy above. Think like a **busy YC startup founder** triaging their inbox. You receive a mix of real business conversations and unsolicited inbound (cold pitches, marketing, newsletters). Your job is to tell them apart accurately — catch the noise without mislabeling real relationships. - **Relationship**: Who is this from? An investor, customer, team member, vendor, candidate, etc.? - **Topic**: What is this about? Legal, finance, hiring, fundraising, security, infrastructure, etc.? - **Email Type**: Is this a warm intro or a followup on an existing conversation? @@ -97,23 +97,23 @@ Before finalizing labels, ask: **"Would a busy YC founder want a note about this - Is a spam digest or Google Groups moderation report - Is routine operational correspondence where the transaction is complete and no follow-up remains -# Cold Outreach Detection (Critical for Precision) +# Cold Outreach vs Real Relationships -Many emails disguise themselves as real relationships. Before assigning \`vendor\`, \`candidate\`, \`partner\`, or \`followup\`, apply these tests: +**First, check for engagement signals. 
If ANY are present, it is NOT cold outreach — classify normally:** +- The inbox owner replied in the thread +- There is a calendar invite or scheduled meeting between the sender and inbox owner +- The email references shared documents, Slack conversations, prior calls, or other collaboration +- The sender was introduced via a warm intro +- The sender is from a company the inbox owner is actively doing business with (customer, vendor under contract, investor) +- The thread has messages from both sides -**It's \`cold-outreach\` (noise), NOT a real relationship, if:** -- The sender is pitching their own product or service — design agencies, compliance firms, content/copy writers, dev shops, freelancers, trademark services, company closure/winding-down services, hiring platforms, etc. — even if they reference your company by name, your YC batch, or offer something "free" or "exclusive for YC founders." -- The thread consists entirely of the same sender following up on their own unanswered messages. A real followup requires prior two-way engagement. -- A student, job-seeker, freelancer, or founder cold-emails asking for your time, feedback, or offering free work/trials. These are NOT \`candidate\` — they are \`cold-outreach\`. -- Someone invites you to an event you didn't sign up for, especially if the email has marketing formatting (tracking links, unsubscribe footers, HTML banners). This is \`promotion\`, not \`event\`. +**Only if NONE of the above are present, check if it's cold outreach:** +- The sender is pitching a service or product TO the inbox owner (agencies, dev shops, freelancers, SaaS tools, etc.) 
→ \`cold-outreach\` +- A one-sided thread where the sender follows up on their own unanswered messages → \`cold-outreach\` +- A stranger cold-emailing about jobs, internships, or offering free work → \`cold-outreach\`, not \`candidate\` +- Someone referencing your YC batch or company name to seem personal, but with no prior engagement → \`cold-outreach\` -**It IS a real relationship (not noise) if:** -- You (the inbox owner) are a participant in the thread (you sent a reply, or someone on your team did). -- The sender is from a company you are already paying, or they are providing a service under contract (e.g., your law firm, your accountant, your cloud provider support). -- The sender was introduced to you by someone you know (warm intro present in the thread). -- The sender references a specific ongoing engagement with concrete details — e.g., they are your assigned compliance assessor for an audit you initiated, or they are following up after a call you participated in. This is NOT the same as a generic "I noticed your company uses X" pitch. - -**Key heuristic:** If every message in the thread is FROM the same external person and the inbox owner never replied, it's almost certainly cold outreach — regardless of how personalized it sounds. Label it \`cold-outreach\`. +**Remember:** A \`prospect\` is someone who wants to BUY from you. Someone pitching their services to you is \`cold-outreach\`, not \`prospect\` or \`partner\`. # Routine Operations & Finance (Often Missed as Noise) @@ -156,9 +156,13 @@ These are noise even from a vendor you recognize or a platform you use: If the sender is \`noreply-spamdigest\` (Google Groups spam moderation reports), label it \`filter: ['spam']\`. Google already flagged these as spam. Do not evaluate the held messages inside — the digest itself is noise. 
-# Filter array must only contain tags from the Noise category +# Filter and Relationship arrays — correct placement is critical -Do not put topic or relationship tags into the filter array. If an email is an event promotion, use \`promotion\` in filter — not \`event\`. +- The \`filter\` array must only contain tags from the **Noise** category. +- The \`relationship\` array must only contain tags from the **Relationship** category. +- **\`cold-outreach\` is a NOISE tag — it goes in \`filter\`, NEVER in \`relationship\`.** If an email is cold outreach, set \`filter: ['cold-outreach']\`. The relationship array should be empty \`[]\` for cold outreach emails. +- Do not put topic or relationship tags into the filter array. If an email is an event promotion, use \`promotion\` in filter — not \`event\`. +- If an email is cold outreach, do NOT also tag it as \`prospect\`, \`candidate\`, \`partner\`, or \`vendor\` in relationship. Cold outreach overrides — the relationship array should be \`[]\`. # Frontmatter Format diff --git a/apps/x/packages/core/src/knowledge/note_creation.ts b/apps/x/packages/core/src/knowledge/note_creation.ts index 478ced81..c9f2d91a 100644 --- a/apps/x/packages/core/src/knowledge/note_creation.ts +++ b/apps/x/packages/core/src/knowledge/note_creation.ts @@ -221,12 +221,14 @@ Emails containing calendar invites (\`.ics\` attachments or inline calendar data --- -# Step 1: Source Filtering (Label-Based) +# Step 1: Source Filtering ## For Meetings and Voice Memos -Always process — no filtering needed. +Always process — no filtering needed. Skip to Step 2. 
-## For Emails — Read YAML Frontmatter +## For Emails — TWO mandatory gates, BOTH must pass + +### Gate 1: Label Check Emails have YAML frontmatter with labels prepended by the labeling agent: @@ -249,15 +251,51 @@ labeled_at: "2026-02-28T12:00:00Z" ${renderNoteEffectRules()} -## Filter Decision Output - -If skipping: +**Gate 1 verdict — if ANY filter/noise label is present, you MUST output:** \`\`\` -SKIP -Reason: Labels indicate skip-only categories: {list the labels} +STOP — GATE 1 FAILED +Labels: {list the filter labels} +Action: Do not create any notes. Do not proceed to Gate 2. End here. \`\`\` +**After outputting this, STOP. Do not write any files. Do not create any notes. Your task is complete.** -If processing, continue to Step 2. +If no filter labels are present, proceed to Gate 2. + +--- + +### Gate 2: Engagement Test + +**This gate applies to ALL emails that passed Gate 1. It is a HARD STOP — not a suggestion.** + +The inbox owner is a busy startup founder who receives a high volume of unsolicited inbound. Most emails from strangers are noise. The labeling agent sometimes misclassifies cold outreach. + +**Read the email content and answer this single question: "Has the inbox owner ever engaged with this sender?"** + +Evidence of engagement (at least one MUST be true to proceed): +- The inbox owner sent a reply in the thread +- There is a prior meeting or introduction with this person +- The person is from a company the owner is already doing business with (paying customer, active vendor under contract, investor) + +**If NONE of the above are true, you MUST output:** +\`\`\` +STOP — GATE 2 FAILED +Reason: No evidence of two-way engagement — {brief explanation} +Action: Do not create any notes. End here. +\`\`\` +**After outputting this, STOP. Do not write any files. Do not create any notes. 
Your task is complete.** + +This means NO notes for: +- Cold outreach senders (even if labeled as prospect/candidate/partner) +- Newsletter/digest/promo senders +- Webinar speakers mentioned in marketing emails +- People from community digests (Bookface, YC digest) +- Anyone who followed up multiple times with no reply from the owner +- Students/freelancers/developers cold-emailing about jobs or offering work +- Event platform senders (Luma, Beehiiv, etc.) + +**When in doubt, STOP. A missed note can be created later. A junk note pollutes the knowledge base permanently.** + +If engagement evidence exists, proceed to Step 2. --- @@ -506,18 +544,21 @@ For entities not resolved to existing notes, determine if they warrant new notes ### Who Gets a Note +**The golden rule: only create notes for people the inbox owner has a real relationship with.** A "real relationship" means two-way engagement — the owner has met, replied to, or is actively doing business with this person. Passing through the label filter is necessary but NOT sufficient. 
+ **CREATE a note for people who are:** -- External (not @user.domain) -- Attendees in meetings -- Email correspondents (emails that reach this step already passed label-based filtering) -- Decision makers or contacts at customers, prospects, or partners -- Investors or potential investors -- Candidates you are interviewing -- Advisors or mentors -- Key collaborators -- Introducers who connect you to valuable contacts +- Attendees in meetings the owner participated in +- People the owner has replied to or engaged with in email +- People introduced via warm intros from known contacts +- Contacts at companies the owner is actively doing business with (customers, investors, vendors under contract, partners) +- Candidates the owner is actively interviewing (responded to, scheduled with) **DO NOT create notes for:** +- Anyone from a one-time inbound email the owner never responded to +- People mentioned in newsletters, digests, or community roundups +- Speakers listed in webinar or event promotion emails +- Senders from marketing/event platforms (Luma, Beehiiv, etc.) 
+- Cold outreach senders, even after multiple follow-ups with no reply - Large group meeting attendees you didn't interact with - Internal colleagues (@user.domain) - Assistants handling only logistics @@ -559,16 +600,19 @@ If role is not explicitly stated, infer from context: |-------------------|----------------------|------------------| | Customer (active deal) | Yes — key contacts | Yes | | Customer (support ticket) | No | Maybe update existing | -| Prospect | Yes — decision makers | Yes | +| Prospect (owner has engaged) | Yes — decision makers | Yes | +| Prospect (no engagement) | No — this is cold outreach | No | | Investor | Yes | Yes | -| Strategic partner | Yes — key contacts | Yes | +| Strategic partner (mutual engagement) | Yes — key contacts | Yes | | Vendor (strategic) | Yes — main contact only | Yes | | Vendor (transactional) | No | Optional | | Bank/Financial services | No | Yes (one note) | -| Candidate | Yes | No | +| Candidate (owner is interviewing) | Yes | No | +| Candidate (unsolicited) | No — this is cold outreach | No | | Service provider (one-time) | No | No | -| Personalized outreach | Yes | Yes | -| Generic cold outreach | No | No | +| Cold outreach / unsolicited inbound | No | No | +| Newsletter / digest / promo sender | No | No | +| Event/webinar invite sender | No | No | ### Handling Non-Note-Worthy People @@ -581,15 +625,23 @@ For people who don't warrant their own note, add to Organization note's Contacts ## Organizations +**Only create org notes for organizations the user directly does business with.** The test is: "Does the user have an active, ongoing relationship with this organization?" 
+ **CREATE a note if:** -- Someone from that org attended a meeting -- They're a customer, prospect, investor, or partner -- Someone from that org sent relevant personalized correspondence +- They're a customer, investor, or partner the owner is actively engaged with +- The owner is actively working with someone there (meetings, email exchanges, contracts) +- They are a vendor the owner has a contract with or is actively evaluating **DO NOT create for:** +- Organizations mentioned only as background context (a contact's previous employer, university, portfolio company, etc.) +- A candidate's current or former employer — the person note is enough +- Organizations from cold outreach emails +- Organizations mentioned in newsletters, digests, or promotional emails - Tool/service providers mentioned in passing - One-time transactional vendors -- Consumer service companies +- Consumer service companies (banks, airlines, etc.) +- Event/marketing platforms (Luma, Beehiiv, etc.) +- Organizations referenced only because a contact works there, unless the user is doing business with that org directly ## Projects @@ -601,8 +653,13 @@ For people who don't warrant their own note, add to Organization note's Contacts ## Topics **CREATE a note if:** -- Recurring theme discussed -- Will come up again across conversations +- A substantive business topic discussed across multiple conversations with real contacts (e.g., "SOC 2 Compliance", "Series A Fundraise") +- Has concrete facts, decisions, or action items attached to it + +**DO NOT create topic notes for:** +- Internal system concepts (email filtering, labeling, noise categories, tags) +- Abstract or meta categories (e.g., "cold outreach", "newsletters", "noise") +- Topics only mentioned in skipped/noise emails --- diff --git a/apps/x/packages/core/src/knowledge/tag_system.ts b/apps/x/packages/core/src/knowledge/tag_system.ts index d95ea816..4c9741d6 100644 --- a/apps/x/packages/core/src/knowledge/tag_system.ts +++ 
b/apps/x/packages/core/src/knowledge/tag_system.ts @@ -32,10 +32,10 @@ const DEFAULT_TAG_DEFINITIONS: TagDefinition[] = [ // ── Relationship — who is this from/about (all create) ──────────────── { tag: 'investor', type: 'relationship', applicability: 'both', noteEffect: 'create', description: 'Investors, VCs, or angels', example: 'Following up on our meeting — we\'d like to move forward with the Series A term sheet.' }, { tag: 'customer', type: 'relationship', applicability: 'both', noteEffect: 'create', description: 'Paying customers', example: 'We\'re seeing great results with Rowboat. Can we discuss expanding to more teams?' }, - { tag: 'prospect', type: 'relationship', applicability: 'both', noteEffect: 'create', description: 'Potential customers', example: 'Thanks for the demo yesterday. We\'re interested in starting a pilot.' }, + { tag: 'prospect', type: 'relationship', applicability: 'both', noteEffect: 'create', description: 'Potential customers — people who want to BUY from you. NOT someone pitching their services to you — that is cold-outreach.', example: 'Thanks for the demo yesterday. We\'re interested in starting a pilot.' }, { tag: 'partner', type: 'relationship', applicability: 'both', noteEffect: 'create', description: 'Business partners, corp dev, or strategic contacts', example: 'Let\'s discuss how we can promote the integration to both our user bases.' }, { tag: 'vendor', type: 'relationship', applicability: 'both', noteEffect: 'create', description: 'Service providers you already pay or have a contract with (legal, accounting, infra). NOT someone pitching their services to you — that is cold-outreach.', example: 'Here are the updated employment agreements you requested.' }, - { tag: 'candidate', type: 'relationship', applicability: 'both', noteEffect: 'create', description: 'Job applicants responding to a specific open role, or recruiters pitching candidates for your roles. 
NOT unsolicited students or strangers asking for your time — that is cold-outreach.', example: 'Thanks for reaching out. I\'d love to learn more about the engineering role.' }, + { tag: 'candidate', type: 'relationship', applicability: 'both', noteEffect: 'create', description: 'Job applicants responding to a specific open role, or recruiters pitching candidates for your roles. NOT unsolicited students, strangers, or anyone cold-emailing about jobs, internships, or offering free work — that is cold-outreach.', example: 'Thanks for reaching out. I\'d love to learn more about the engineering role.' }, { tag: 'team', type: 'relationship', applicability: 'both', noteEffect: 'create', description: 'Internal team members and co-founders', example: 'Here\'s the updated roadmap for Q2. Let\'s discuss in our sync.' }, { tag: 'advisor', type: 'relationship', applicability: 'both', noteEffect: 'create', description: 'Advisors, mentors, or board members', example: 'I\'ve reviewed the deck. Here are my thoughts on the GTM strategy.' }, { tag: 'personal', type: 'relationship', applicability: 'both', noteEffect: 'create', description: 'Family or friends', example: 'Are you coming to Thanksgiving this year? Let me know your travel dates.' },