fix(wiki): ignore empty embedding vectors

This commit is contained in:
Luca Martial 2026-05-11 23:31:07 -07:00
parent 7a86aa9ddc
commit 77dce6fdb3
3 changed files with 56 additions and 2 deletions

View file

@ -403,6 +403,50 @@ describe('canonical local ingest', () => {
}
});
// Regression test: after a local ingest run with the embeddings backend set to
// `none`, the stored knowledge page row must have a NULL `embedding_json`.
it('does not persist noop embedding vectors when local embeddings are disabled', async () => {
  // Overwrite the project config. Nested keys are indented two spaces per level
  // so that `backend: none` is a child of `embeddings` (itself under `ingest`);
  // with flattened indentation YAML would parse these keys as siblings.
  // NOTE(review): nesting is inferred from key order and the test title —
  // confirm against the ktx.yaml schema.
  const configLines = [
    'project: warehouse',
    'connections:',
    '  warehouse:',
    '    driver: postgres',
    'ingest:',
    '  adapters:',
    '    - fake',
    '  embeddings:',
    '    backend: none',
    '',
  ];
  await writeFile(join(project.projectDir, 'ktx.yaml'), configLines.join('\n'), 'utf-8');

  // Reload so the shared `project` fixture reflects the config written above.
  project = await loadKtxProject({ projectDir: project.projectDir });

  // Minimal source tree: a single JSON document for the fake adapter to ingest.
  const sourceDir = join(tempDir, 'source');
  await mkdir(join(sourceDir, 'orders'), { recursive: true });
  await writeFile(join(sourceDir, 'orders', 'orders.json'), '{"name":"orders"}\n', 'utf-8');

  const agentRunner = new WikiWritingAgentRunner();
  const result = await runLocalIngest({
    project,
    adapters: [new FakeSourceAdapter()],
    adapter: 'fake',
    connectionId: 'warehouse',
    sourceDir,
    jobId: 'wiki-local-no-embeddings-1',
    agentRunner,
  });
  expect(result.result.failedWorkUnits).toEqual([]);

  // Inspect the index database directly (read-only) to check what was persisted.
  const db = new Database(join(project.projectDir, '.ktx', 'db.sqlite'), { readonly: true });
  try {
    const rows = db
      .prepare('SELECT key, summary, embedding_json IS NOT NULL AS has_embedding FROM knowledge_pages ORDER BY key')
      .all();
    // `has_embedding: 0` means the embedding_json column is NULL for this page.
    expect(rows).toEqual([{ key: 'orders_context', summary: 'Orders source context', has_embedding: 0 }]);
  } finally {
    // Always release the SQLite handle, even when the assertion above throws.
    db.close();
  }
});
it('uses explicit action raw paths to avoid over-attributing work-unit provenance', async () => {
const sourceDir = join(tempDir, 'source');
await mkdir(join(sourceDir, 'orders'), { recursive: true });

View file

@ -82,6 +82,14 @@ describe('SqliteKnowledgeIndex', () => {
);
});
// A page synced with an empty embedding array must read back with a null
// embedding and must not show up as a semantic search candidate.
it('does not treat empty embeddings as indexed semantic vectors', () => {
  const knowledgeIndex = new SqliteKnowledgeIndex({ dbPath });
  const pageWithEmptyVector = page({ path: 'knowledge/global/revenue.md', key: 'revenue', embedding: [] });
  knowledgeIndex.sync([pageWithEmptyVector]);

  // The stored page exposes no embedding at all.
  const storedPage = knowledgeIndex.getExistingPages().get('knowledge/global/revenue.md');
  expect(storedPage?.embedding).toBeNull();

  // A semantic query yields no candidates for the page.
  const candidates = knowledgeIndex.searchSemanticCandidates({ queryEmbedding: [1, 0], limit: 10 });
  expect(candidates).toEqual([]);
});
it('returns semantic lane candidates from stored page embeddings', () => {
const index = new SqliteKnowledgeIndex({ dbPath });
index.sync([

View file

@ -75,7 +75,9 @@ function parseEmbedding(raw: string | null): number[] | null {
}
try {
const embedding = JSON.parse(raw) as unknown;
return Array.isArray(embedding) && embedding.every((value) => typeof value === 'number') ? embedding : null;
return Array.isArray(embedding) && embedding.length > 0 && embedding.every((value) => typeof value === 'number')
? embedding
: null;
} catch {
return null;
}
@ -170,7 +172,7 @@ export class SqliteKnowledgeIndex {
content: searchText,
tags: page.tags.join(' '),
searchText,
embeddingJson: page.embedding ? JSON.stringify(page.embedding) : null,
embeddingJson: page.embedding && page.embedding.length > 0 ? JSON.stringify(page.embedding) : null,
};
upsertPage.run(row);
deleteFts.run(row);