mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-30 20:39:43 +02:00
fix(pifs): keep add imports semantically atomic
This commit is contained in:
parent
7096ba1388
commit
1c5ed63ef8
5 changed files with 243 additions and 47 deletions
|
|
@ -203,6 +203,8 @@ class PageIndexFileSystem:
|
|||
)
|
||||
if self.store.file_basename_exists_in_folder(folder_path, filename):
|
||||
raise FileExistsError(f"File already exists at {virtual_path}")
|
||||
if not self.summary_projection_index:
|
||||
raise RuntimeError("pifs add requires the summary projection index")
|
||||
|
||||
self._ensure_add_completion_defaults()
|
||||
file_ref = make_file_ref(virtual_path.strip("/"))
|
||||
|
|
@ -210,6 +212,7 @@ class PageIndexFileSystem:
|
|||
final_dir = uploads_dir / file_ref
|
||||
final_path = final_dir / filename
|
||||
final_dir_created = False
|
||||
catalog_inserted = False
|
||||
records: list[dict[str, Any]] = []
|
||||
|
||||
uploads_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
|
@ -242,12 +245,22 @@ class PageIndexFileSystem:
|
|||
self._require_add_pageindex_ready(record)
|
||||
self._generate_register_metadata(record)
|
||||
self._require_add_metadata_ready(record)
|
||||
self._complete_summary_projection_index(record)
|
||||
self._require_add_summary_projection_ready(record)
|
||||
self._register_generation_policy_schema(records)
|
||||
self._sync_owned_raw_artifact(record)
|
||||
self.store.insert_files(records)
|
||||
catalog_inserted = True
|
||||
if self._complete_summary_projection_index(record):
|
||||
self.store.update_file_metadata_status(
|
||||
record["file_ref"],
|
||||
metadata=record["metadata"],
|
||||
metadata_status=record["metadata_status"],
|
||||
)
|
||||
self._require_add_summary_projection_ready(record)
|
||||
self._sync_owned_raw_artifact(record)
|
||||
self._ensure_add_semantic_retrieval_ready()
|
||||
except Exception:
|
||||
if catalog_inserted:
|
||||
self._cleanup_add_catalog_record(file_ref)
|
||||
self._cleanup_add_summary_projection(records)
|
||||
self._cleanup_failed_register_artifacts(records)
|
||||
if final_dir_created:
|
||||
shutil.rmtree(final_dir, ignore_errors=True)
|
||||
|
|
@ -355,6 +368,63 @@ class PageIndexFileSystem:
|
|||
embedding_timeout=self.summary_projection_embedding_timeout,
|
||||
)
|
||||
|
||||
def _ensure_add_semantic_retrieval_ready(self) -> None:
|
||||
indexer = self.summary_projection_indexer
|
||||
if indexer is None:
|
||||
raise RuntimeError("pifs add requires a summary projection indexer")
|
||||
from .hybrid_projection import HybridProjectionSearchBackend
|
||||
|
||||
index_dir = Path(getattr(indexer, "index_dir", self.summary_projection_index_dir))
|
||||
embedder = getattr(indexer, "embedder", None)
|
||||
if embedder is None:
|
||||
self.configure_hybrid_projection_retrieval(
|
||||
index_dir,
|
||||
embedding_provider=str(
|
||||
getattr(
|
||||
indexer,
|
||||
"embedding_provider",
|
||||
self.summary_projection_embedding_provider,
|
||||
)
|
||||
),
|
||||
embedding_model=str(
|
||||
getattr(indexer, "embedding_model", self.summary_projection_embedding_model)
|
||||
),
|
||||
embedding_dimensions=int(
|
||||
getattr(
|
||||
indexer,
|
||||
"embedding_dimensions",
|
||||
self.summary_projection_embedding_dimensions,
|
||||
)
|
||||
),
|
||||
embedding_timeout=self.summary_projection_embedding_timeout,
|
||||
)
|
||||
else:
|
||||
embedding_cache = getattr(indexer, "embedding_cache", None)
|
||||
self.semantic_retrieval_backend = HybridProjectionSearchBackend(
|
||||
index_dir,
|
||||
embedder=embedder,
|
||||
embedding_provider=str(
|
||||
getattr(
|
||||
indexer,
|
||||
"embedding_provider",
|
||||
self.summary_projection_embedding_provider,
|
||||
)
|
||||
),
|
||||
embedding_model=str(
|
||||
getattr(indexer, "embedding_model", self.summary_projection_embedding_model)
|
||||
),
|
||||
embedding_dimensions=int(
|
||||
getattr(
|
||||
indexer,
|
||||
"embedding_dimensions",
|
||||
self.summary_projection_embedding_dimensions,
|
||||
)
|
||||
),
|
||||
embedding_cache_path=getattr(embedding_cache, "db_path", None),
|
||||
)
|
||||
if "summary" not in self.semantic_retrieval_channels():
|
||||
raise RuntimeError("pifs add failed to configure summary semantic retrieval")
|
||||
|
||||
def configure_existing_projection_retrieval(self) -> bool:
|
||||
"""Attach semantic retrieval to already-built projection indexes.
|
||||
|
||||
|
|
@ -1615,6 +1685,32 @@ class PageIndexFileSystem:
|
|||
if record.get("_pifs_owned_raw_artifact") and record.get("raw_artifact_path"):
|
||||
self._unlink_artifact(record["raw_artifact_path"])
|
||||
|
||||
def _cleanup_add_catalog_record(self, file_ref: str) -> None:
|
||||
try:
|
||||
self.store.delete_file(file_ref)
|
||||
except Exception:
|
||||
return
|
||||
|
||||
def _cleanup_add_summary_projection(self, records: list[dict[str, Any]]) -> None:
|
||||
indexer = self.summary_projection_indexer
|
||||
if indexer is None:
|
||||
return
|
||||
delete_summary = getattr(indexer, "delete_summary", None)
|
||||
for record in records:
|
||||
file_ref = str(record.get("file_ref") or "")
|
||||
if not file_ref:
|
||||
continue
|
||||
try:
|
||||
if callable(delete_summary):
|
||||
delete_summary(file_ref)
|
||||
continue
|
||||
index = getattr(indexer, "index", None)
|
||||
delete_file_refs = getattr(index, "delete_file_refs", None)
|
||||
if callable(delete_file_refs):
|
||||
delete_file_refs([file_ref])
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
@staticmethod
|
||||
def _metadata_policy_is_batch(policy: dict[str, Any]) -> bool:
|
||||
return bool(policy.get("batch")) or policy.get("mode") == "batch"
|
||||
|
|
|
|||
|
|
@ -104,6 +104,9 @@ class SummaryProjectionIndexer:
|
|||
"embedding_dimensions": self.embedding_dimensions,
|
||||
}
|
||||
|
||||
def delete_summary(self, file_ref: str) -> int:
    """Delete the index entries for one file ref.

    Delegates to ``self.index.delete_file_refs`` with a single-element
    list and returns whatever count that call reports.
    """
    removed = self.index.delete_file_refs([file_ref])
    return removed
|
||||
|
||||
def _ensure_index(self) -> None:
|
||||
if not self.index.db_path.exists():
|
||||
self.index.reset(
|
||||
|
|
|
|||
|
|
@ -146,6 +146,35 @@ class SQLiteVecSemanticIndex:
|
|||
conn.commit()
|
||||
return inserted
|
||||
|
||||
def delete_file_refs(self, file_refs: list[str]) -> int:
    """Delete every indexed row belonging to the given file refs.

    Each ref is stringified and empty strings are dropped. Matching
    rowids are looked up in ``semantic_index_docs`` and then removed from
    both ``semantic_index_vec`` and ``semantic_index_docs``.

    Returns:
        The number of ``semantic_index_docs`` rows that matched (0 when
        no usable refs were given or nothing matched).
    """
    wanted: list[str] = []
    for candidate in file_refs:
        text = str(candidate)
        if text:
            wanted.append(text)
    if not wanted:
        return 0

    ref_marks = ", ".join(["?"] * len(wanted))
    with self.connect() as conn:
        matches = conn.execute(
            f"""
            SELECT rowid
            FROM semantic_index_docs
            WHERE file_ref IN ({ref_marks})
            """,
            wanted,
        ).fetchall()
        rowids = [int(match["rowid"]) for match in matches]
        if not rowids:
            return 0

        rowid_placeholders = ", ".join(["?"] * len(rowids))
        # Vector rows go first, then the doc rows they are keyed by.
        for table in ("semantic_index_vec", "semantic_index_docs"):
            conn.execute(
                f"DELETE FROM {table} WHERE rowid IN ({rowid_placeholders})",
                rowids,
            )
        conn.commit()
        return len(rowids)
|
||||
|
||||
def search(
|
||||
self,
|
||||
vector: list[float],
|
||||
|
|
|
|||
|
|
@ -1056,6 +1056,13 @@ class SQLiteFileSystemStore:
|
|||
(metadata_text_value, file_ref),
|
||||
)
|
||||
|
||||
def delete_file(self, target: str) -> None:
    """Delete a file's rows from ``file_fts``, ``metadata_values`` and ``files``.

    ``target`` is first resolved to a file ref through
    ``self._resolve_file_ref`` on the same connection; the three deletes
    then run in that fixed order.
    """
    statements = (
        "DELETE FROM file_fts WHERE file_ref = ?",
        "DELETE FROM metadata_values WHERE file_ref = ?",
        "DELETE FROM files WHERE file_ref = ?",
    )
    # NOTE(review): no explicit commit here; presumably the context
    # manager returned by connect() commits on exit -- confirm.
    with self.connect() as conn:
        file_ref = self._resolve_file_ref(conn, target)
        for statement in statements:
            conn.execute(statement, (file_ref,))
|
||||
|
||||
def resolve_file_ref(self, target: str) -> str:
    """Resolve ``target`` to a file ref.

    Opens a connection via ``self.connect()`` and returns the result of
    ``self._resolve_file_ref`` for that connection and target.
    """
    with self.connect() as conn:
        resolved = self._resolve_file_ref(conn, target)
    return resolved
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue