fix: keep lexical-only reindex incremental

This commit is contained in:
Andrey Avtomonov 2026-05-20 01:12:27 +02:00
parent a4ae2213b4
commit ba5d01046b
5 changed files with 123 additions and 10 deletions

View file

@ -83,6 +83,44 @@ describe('reindexLocalIndexes', () => {
expect(summary.embeddingsAvailable).toBe(true);
});
// Regression guard: re-running a lexical-only reindex over untouched files must report zero updates.
it('does not report unchanged lexical-only rows as updated on repeated runs', async () => {
  const project = await createProject(tempDir);

  // Seed one wiki page ...
  const wikiPagePath = join(project.projectDir, 'wiki/global/revenue.md');
  await writeFile(
    wikiPagePath,
    '---\nsummary: Revenue\nusage_mode: auto\n---\n\nPaid orders.\n',
    'utf-8',
  );

  // ... and one semantic-layer source, so two rows are scanned in total.
  const warehouseDir = join(project.projectDir, 'semantic-layer/warehouse');
  await mkdir(warehouseDir, { recursive: true });
  const ordersYamlPath = join(project.projectDir, 'semantic-layer/warehouse/orders.yaml');
  await writeFile(
    ordersYamlPath,
    'name: orders\ntable: public.orders\ngrain: [id]\ncolumns:\n - name: id\n type: number\njoins: []\nmeasures: []\n',
    'utf-8',
  );

  // Both runs use identical options: no forced rebuild and no embedding service.
  const runReindex = () => reindexLocalIndexes(project, { force: false, embeddingService: null });

  // Expected totals for a run; only the `updated` count differs between the two runs.
  const totalsWithUpdated = (updated: number) => ({
    scanned: 2,
    updated,
    deleted: 0,
    embeddingsRecomputed: 0,
    embeddingsFailed: 0,
  });

  // First pass indexes both files.
  const initialRun = await runReindex();
  expect(initialRun.totals).toMatchObject(totalsWithUpdated(2));

  // Second pass sees the same content and must not count anything as updated.
  const repeatedRun = await runReindex();
  expect(repeatedRun.totals).toMatchObject(totalsWithUpdated(0));

  const updatedPerScope = repeatedRun.scopes.map((entry) => [entry.label, entry.updated]);
  expect(updatedPerScope).toEqual([
    ['global', 0],
    ['warehouse', 0],
  ]);
});
it('force clears stale rows before rebuilding each discovered scope', async () => {
const project = await createProject(tempDir);
const wikiIndex = new SqliteKnowledgeIndex({ dbPath: join(project.projectDir, '.ktx/db.sqlite') });

View file

@ -258,4 +258,38 @@ describe('SlSearchService', () => {
expect.objectContaining({ sourceName: 'orders', embedding: null }),
]);
});
// With no embedding service, a stored row whose search text is unchanged must not be upserted again.
it('does not update unchanged lexical-only SL rows on repeated sync', async () => {
  const source: SemanticLayerSource = {
    name: 'orders',
    table: 'public.orders',
    grain: ['id'],
    columns: [{ name: 'id', type: 'number' }],
    joins: [],
    measures: [],
  };

  // Row already stored for `orders`, without an embedding. Its search text is presumably
  // what the service builds for `source` (the `updated: 0` expectation relies on that).
  const storedRows = new Map([
    ['orders', { searchText: 'orders. table: public.orders. id (number)', hasEmbedding: false }],
  ]);

  const repository = {
    upsertSources: vi.fn().mockResolvedValue(undefined),
    getExistingSearchTexts: vi.fn().mockResolvedValue(storedRows),
    deleteStale: vi.fn().mockResolvedValue(0),
    deleteByConnection: vi.fn().mockResolvedValue(0),
    deleteByConnectionAndName: vi.fn(),
    search: vi.fn(),
  };

  // `null` in the first constructor slot means lexical-only: no embedding service.
  const service = new SlSearchService(null, repository);

  const expectedStats = {
    scanned: 1,
    updated: 0,
    deleted: 0,
    embeddingsRecomputed: 0,
    embeddingsFailed: 0,
  };
  await expect(service.indexSources('warehouse', [source])).resolves.toEqual(expectedStats);

  // Upsert is still invoked, but with an empty batch; stale cleanup keeps `orders`.
  expect(repository.upsertSources).toHaveBeenCalledWith('warehouse', []);
  expect(repository.deleteStale).toHaveBeenCalledWith('warehouse', ['orders']);
});
});

View file

@ -109,10 +109,15 @@ export class SlSearchService {
const searchTexts = sources.map((s) => this.buildSearchText(s));
const embeddingService = this.embeddingService;
const changedIndices: number[] = [];
for (let i = 0; i < sources.length; i += 1) {
const previous = existing.get(sources[i]!.name);
if (!previous || previous.searchText !== searchTexts[i] || !previous.hasEmbedding) {
if (
!previous ||
previous.searchText !== searchTexts[i] ||
(embeddingService !== null && !previous.hasEmbedding)
) {
changedIndices.push(i);
}
}
@ -121,13 +126,13 @@ export class SlSearchService {
let embeddingsRecomputed = 0;
let embeddingsFailed = 0;
if (this.embeddingService && changedIndices.length > 0) {
if (embeddingService && changedIndices.length > 0) {
try {
const changedTexts = changedIndices.map((index) => searchTexts[index]!);
const allEmbeddings: number[][] = [];
for (let i = 0; i < changedTexts.length; i += this.embeddingService.maxBatchSize) {
const batch = changedTexts.slice(i, i + this.embeddingService.maxBatchSize);
allEmbeddings.push(...(await this.embeddingService.computeEmbeddingsBulk(batch)));
for (let i = 0; i < changedTexts.length; i += embeddingService.maxBatchSize) {
const batch = changedTexts.slice(i, i + embeddingService.maxBatchSize);
allEmbeddings.push(...(await embeddingService.computeEmbeddingsBulk(batch)));
}
changedEmbeddings = allEmbeddings;
embeddingsRecomputed = allEmbeddings.length;

View file

@ -98,6 +98,37 @@ describe('KnowledgeWikiService.syncIndex result stats', () => {
expect.objectContaining({ pageKey: 'revenue', embedding: null }),
);
});
// With no embedding service, a wiki page whose stored search text is unchanged must not be upserted.
it('does not update unchanged lexical-only wiki rows on repeated sync', async () => {
  const { pagesRepository, configService, gitService, logger } = makeService();

  // Second constructor argument is `null`: the service runs without embeddings.
  const service = new KnowledgeWikiService(
    configService as any,
    null,
    pagesRepository as any,
    gitService as any,
    logger as any,
  );

  // A single wiki page is discovered on disk ...
  const pagePath = 'wiki/global/revenue.md';
  const pageContent = '---\nsummary: Revenue\nusage_mode: auto\n---\n\nPaid orders.\n';
  configService.listFiles.mockResolvedValue({ files: [pagePath] });
  configService.readFile.mockResolvedValue({ content: pageContent });

  // ... and the index already holds a row for it, without an embedding.
  const storedRows = new Map([
    ['revenue', { searchText: 'revenue\nRevenue\nPaid orders.', hasEmbedding: false }],
  ]);
  pagesRepository.getExistingSearchTexts.mockResolvedValue(storedRows);
  pagesRepository.deleteStale.mockResolvedValue(0);

  const expectedStats = {
    scanned: 1,
    updated: 0,
    deleted: 0,
    embeddingsRecomputed: 0,
    embeddingsFailed: 0,
  };
  await expect(service.syncIndex('GLOBAL', null)).resolves.toEqual(expectedStats);

  // Nothing is rewritten; stale cleanup still receives the surviving page key.
  expect(pagesRepository.upsertPage).not.toHaveBeenCalled();
  expect(pagesRepository.deleteStale).toHaveBeenCalledWith('GLOBAL', null, ['revenue']);
});
});
describe('KnowledgeWikiService.forWorktree isolation', () => {

View file

@ -296,22 +296,27 @@ export class KnowledgeWikiService {
}
}
const embeddingService = this.embeddingService;
const changedPages = pages.filter((page) => {
const previous = existing.get(page.pageKey);
return !previous || previous.searchText !== page.searchText || !previous.hasEmbedding;
return (
!previous ||
previous.searchText !== page.searchText ||
(embeddingService !== null && !previous.hasEmbedding)
);
});
let embeddings: (number[] | null)[] = changedPages.map(() => null);
let embeddingsRecomputed = 0;
let embeddingsFailed = 0;
if (this.embeddingService && changedPages.length > 0) {
if (embeddingService && changedPages.length > 0) {
try {
const changedTexts = changedPages.map((page) => page.searchText);
const all: number[][] = [];
for (let i = 0; i < changedTexts.length; i += this.embeddingService.maxBatchSize) {
const batch = changedTexts.slice(i, i + this.embeddingService.maxBatchSize);
all.push(...(await this.embeddingService.computeEmbeddingsBulk(batch)));
for (let i = 0; i < changedTexts.length; i += embeddingService.maxBatchSize) {
const batch = changedTexts.slice(i, i + embeddingService.maxBatchSize);
all.push(...(await embeddingService.computeEmbeddingsBulk(batch)));
}
embeddings = all;
embeddingsRecomputed = all.length;