mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-12 19:55:17 +02:00
refactor(filesystem): unify pifs metadata state
This commit is contained in:
parent
fc0be1aeee
commit
7c021a7dd0
8 changed files with 479 additions and 324 deletions
|
|
@ -98,12 +98,23 @@ def test_stable_path_targets_work_without_session_refs(tmp_path):
|
|||
assert "Root document fixture text" in text
|
||||
|
||||
|
||||
def test_stat_shell_output_includes_generated_metadata(tmp_path):
|
||||
def test_stat_shell_output_includes_unified_metadata_status(tmp_path):
|
||||
from pageindex.filesystem import PIFSCommandExecutor, PageIndexFileSystem
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
||||
source = tmp_path / "source.txt"
|
||||
source.write_text("fixture text", encoding="utf-8")
|
||||
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
|
||||
|
||||
class SummaryGenerator:
|
||||
def generate(self, document, *, fields):
|
||||
return MetadataGenerationResult(
|
||||
values={field: "Generated summary for retrieval." for field in fields}
|
||||
)
|
||||
|
||||
filesystem = PageIndexFileSystem(
|
||||
workspace=tmp_path / "workspace",
|
||||
metadata_generator=SummaryGenerator(),
|
||||
)
|
||||
filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path="docs/source.txt",
|
||||
|
|
@ -112,8 +123,7 @@ def test_stat_shell_output_includes_generated_metadata(tmp_path):
|
|||
title="Generated metadata document",
|
||||
content=source.read_text(encoding="utf-8"),
|
||||
metadata={"department": "ops"},
|
||||
derived_metadata={"summary": "Generated summary for retrieval."},
|
||||
metadata_generation_policy={
|
||||
metadata_policy={
|
||||
"fields": {
|
||||
"summary": True,
|
||||
"doc_type": False,
|
||||
|
|
@ -128,9 +138,126 @@ def test_stat_shell_output_includes_generated_metadata(tmp_path):
|
|||
|
||||
assert "metadata:" in stat
|
||||
assert " department: ops" in stat
|
||||
assert "generated_metadata:" in stat
|
||||
assert " summary: Generated summary for retrieval." in stat
|
||||
assert "metadata_generation_status: generated" in stat
|
||||
assert "metadata_status: generated" in stat
|
||||
|
||||
|
||||
def test_register_rejects_pifs_owned_metadata_fields(tmp_path):
|
||||
from pageindex.filesystem import PageIndexFileSystem
|
||||
|
||||
source = tmp_path / "source.txt"
|
||||
source.write_text("fixture text", encoding="utf-8")
|
||||
filesystem = PageIndexFileSystem(workspace=tmp_path / "workspace")
|
||||
|
||||
with pytest.raises(ValueError, match="PIFS-owned generated field"):
|
||||
filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path="docs/source.txt",
|
||||
folder_path="/documents",
|
||||
external_id="doc_conflict",
|
||||
title="Conflict document",
|
||||
content=source.read_text(encoding="utf-8"),
|
||||
metadata={"summary": "caller summary"},
|
||||
)
|
||||
|
||||
|
||||
def test_batch_metadata_status_generates_into_unified_metadata(tmp_path):
|
||||
from pageindex.filesystem import PageIndexFileSystem
|
||||
from pageindex.filesystem.metadata_generation import MetadataGenerationResult
|
||||
|
||||
source = tmp_path / "source.txt"
|
||||
source.write_text("fixture text", encoding="utf-8")
|
||||
|
||||
class SummaryGenerator:
|
||||
def generate(self, document, *, fields):
|
||||
return MetadataGenerationResult(values={"summary": "Batch generated summary."})
|
||||
|
||||
filesystem = PageIndexFileSystem(
|
||||
workspace=tmp_path / "workspace",
|
||||
metadata_generator=SummaryGenerator(),
|
||||
)
|
||||
file_ref = filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path="docs/source.txt",
|
||||
folder_path="/documents",
|
||||
external_id="doc_batch",
|
||||
title="Batch document",
|
||||
content=source.read_text(encoding="utf-8"),
|
||||
metadata={"department": "ops"},
|
||||
metadata_policy={
|
||||
"batch": True,
|
||||
"fields": {
|
||||
"summary": True,
|
||||
"doc_type": False,
|
||||
"domain": False,
|
||||
"topic": False,
|
||||
},
|
||||
},
|
||||
)
|
||||
|
||||
before = filesystem.store.get_file(file_ref)
|
||||
assert "summary" not in before.metadata
|
||||
assert before.metadata_status["fields"]["summary"]["status"] == "pending_submit"
|
||||
|
||||
result = filesystem.batch_generate()
|
||||
after = filesystem.store.get_file(file_ref)
|
||||
|
||||
assert result["generated"] == 1
|
||||
assert after.metadata["summary"] == "Batch generated summary."
|
||||
assert after.metadata["department"] == "ops"
|
||||
assert after.metadata_status["fields"]["summary"]["status"] == "generated"
|
||||
|
||||
|
||||
def test_v4_metadata_columns_migrate_to_unified_metadata_status(tmp_path):
|
||||
from pageindex.filesystem import PageIndexFileSystem
|
||||
|
||||
source = tmp_path / "source.txt"
|
||||
source.write_text("fixture text", encoding="utf-8")
|
||||
workspace = tmp_path / "workspace"
|
||||
filesystem = PageIndexFileSystem(workspace=workspace)
|
||||
file_ref = filesystem.register_file(
|
||||
storage_uri=source.as_uri(),
|
||||
source_path="docs/source.txt",
|
||||
folder_path="/documents",
|
||||
external_id="doc_migrate",
|
||||
title="Migrated document",
|
||||
content=source.read_text(encoding="utf-8"),
|
||||
metadata={"department": "ops"},
|
||||
)
|
||||
|
||||
with filesystem.store.connect() as conn:
|
||||
conn.execute(
|
||||
"ALTER TABLE files ADD COLUMN derived_metadata_json TEXT NOT NULL DEFAULT '{}'"
|
||||
)
|
||||
conn.execute(
|
||||
"ALTER TABLE files ADD COLUMN metadata_generation_json TEXT NOT NULL DEFAULT '{}'"
|
||||
)
|
||||
conn.execute(
|
||||
"""
|
||||
UPDATE files
|
||||
SET metadata_json = ?,
|
||||
derived_metadata_json = ?,
|
||||
metadata_generation_json = ?,
|
||||
metadata_status_json = '{}'
|
||||
WHERE file_ref = ?
|
||||
""",
|
||||
(
|
||||
'{"department":"ops","summary":"raw summary"}',
|
||||
'{"summary":"generated summary"}',
|
||||
'{"status":"generated","fields":{"summary":{"requested":true,"status":"generated"}}}',
|
||||
file_ref,
|
||||
),
|
||||
)
|
||||
conn.execute("PRAGMA user_version = 4")
|
||||
|
||||
migrated = PageIndexFileSystem(workspace=workspace)
|
||||
entry = migrated.store.get_file(file_ref)
|
||||
|
||||
assert entry.metadata["department"] == "ops"
|
||||
assert entry.metadata["summary"] == "generated summary"
|
||||
assert entry.metadata_status["status"] == "generated"
|
||||
assert entry.metadata_status["fields"]["summary"]["owner"] == "pifs"
|
||||
assert entry.metadata_status["fields"]["summary"]["source"] == "llm"
|
||||
|
||||
|
||||
def test_find_maxdepth_zero_type_directory_returns_start_folder(tmp_path):
|
||||
|
|
|
|||
|
|
@ -51,3 +51,39 @@ def test_sqlite_vec_semantic_index_round_trip(tmp_path):
|
|||
filters={"source_type": "slack"},
|
||||
)
|
||||
assert [item.external_id for item in filtered] == ["doc_b"]
|
||||
|
||||
|
||||
def test_summary_projection_indexes_unified_metadata_summary(tmp_path):
|
||||
from pageindex.filesystem.projection_indexing import SummaryProjectionIndexer
|
||||
|
||||
class FakeEmbedder:
|
||||
def embed(self, texts):
|
||||
return [[1.0, 0.0, 0.0] for _ in texts]
|
||||
|
||||
indexer = SummaryProjectionIndexer(
|
||||
tmp_path / "projection",
|
||||
embedder=FakeEmbedder(),
|
||||
embedding_provider="test",
|
||||
embedding_model="fake",
|
||||
embedding_dimensions=3,
|
||||
)
|
||||
|
||||
result = indexer.upsert_summary(
|
||||
{
|
||||
"file_ref": "file_a",
|
||||
"external_id": "doc_a",
|
||||
"source_type": "documents",
|
||||
"source_path": "docs/a.pdf",
|
||||
"title": "A",
|
||||
"metadata": {
|
||||
"summary": "Unified metadata summary.",
|
||||
"department": "ops",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
assert result["status"] == "ready"
|
||||
hits = indexer.index.search([1.0, 0.0, 0.0], limit=1)
|
||||
assert hits[0].external_id == "doc_a"
|
||||
assert hits[0].metadata["summary"] == "Unified metadata summary."
|
||||
assert hits[0].metadata["department"] == "ops"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue