mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-04-25 08:06:22 +02:00
feat: add PageIndex SDK with local/cloud dual-mode support (#207)
This commit is contained in:
parent
f2dcffc0b7
commit
c7fe93bb56
45 changed files with 4225 additions and 274 deletions
164
pageindex/storage/sqlite.py
Normal file
164
pageindex/storage/sqlite.py
Normal file
|
|
@ -0,0 +1,164 @@
|
|||
import json
|
||||
import sqlite3
|
||||
import threading
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class SQLiteStorage:
|
||||
def __init__(self, db_path: str):
|
||||
self._db_path = Path(db_path).expanduser()
|
||||
self._db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
self._local = threading.local()
|
||||
self._connections: list[sqlite3.Connection] = []
|
||||
self._conn_lock = threading.Lock()
|
||||
self._init_schema()
|
||||
|
||||
def _get_conn(self) -> sqlite3.Connection:
|
||||
"""Return a thread-local SQLite connection."""
|
||||
if not hasattr(self._local, "conn"):
|
||||
conn = sqlite3.connect(str(self._db_path))
|
||||
conn.execute("PRAGMA journal_mode=WAL")
|
||||
conn.execute("PRAGMA foreign_keys=ON")
|
||||
self._local.conn = conn
|
||||
with self._conn_lock:
|
||||
self._connections.append(conn)
|
||||
return self._local.conn
|
||||
|
||||
def _init_schema(self):
|
||||
conn = self._get_conn()
|
||||
conn.execute("PRAGMA user_version = 1")
|
||||
conn.executescript("""
|
||||
CREATE TABLE IF NOT EXISTS collections (
|
||||
name TEXT PRIMARY KEY CHECK(length(name) <= 128 AND name GLOB '[a-zA-Z0-9_-]*'),
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
CREATE TABLE IF NOT EXISTS documents (
|
||||
doc_id TEXT PRIMARY KEY,
|
||||
collection_name TEXT NOT NULL REFERENCES collections(name) ON DELETE CASCADE,
|
||||
doc_name TEXT,
|
||||
doc_description TEXT,
|
||||
file_path TEXT,
|
||||
file_hash TEXT,
|
||||
doc_type TEXT NOT NULL,
|
||||
structure JSON,
|
||||
pages JSON,
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_docs_collection ON documents(collection_name);
|
||||
CREATE INDEX IF NOT EXISTS idx_docs_hash ON documents(collection_name, file_hash);
|
||||
""")
|
||||
conn.commit()
|
||||
|
||||
def create_collection(self, name: str) -> None:
|
||||
conn = self._get_conn()
|
||||
conn.execute("INSERT INTO collections (name) VALUES (?)", (name,))
|
||||
conn.commit()
|
||||
|
||||
def get_or_create_collection(self, name: str) -> None:
|
||||
conn = self._get_conn()
|
||||
conn.execute("INSERT OR IGNORE INTO collections (name) VALUES (?)", (name,))
|
||||
conn.commit()
|
||||
|
||||
def list_collections(self) -> list[str]:
|
||||
conn = self._get_conn()
|
||||
rows = conn.execute("SELECT name FROM collections ORDER BY name").fetchall()
|
||||
return [r[0] for r in rows]
|
||||
|
||||
def delete_collection(self, name: str) -> None:
|
||||
conn = self._get_conn()
|
||||
conn.execute("DELETE FROM collections WHERE name = ?", (name,))
|
||||
conn.commit()
|
||||
|
||||
def save_document(self, collection: str, doc_id: str, doc: dict) -> None:
|
||||
conn = self._get_conn()
|
||||
conn.execute(
|
||||
"""INSERT OR REPLACE INTO documents
|
||||
(doc_id, collection_name, doc_name, doc_description, file_path, file_hash, doc_type, structure, pages)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
||||
(doc_id, collection, doc.get("doc_name"), doc.get("doc_description"),
|
||||
doc.get("file_path"), doc.get("file_hash"), doc["doc_type"],
|
||||
json.dumps(doc.get("structure", [])),
|
||||
json.dumps(doc.get("pages")) if doc.get("pages") else None),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
def find_document_by_hash(self, collection: str, file_hash: str) -> str | None:
|
||||
conn = self._get_conn()
|
||||
row = conn.execute(
|
||||
"SELECT doc_id FROM documents WHERE collection_name = ? AND file_hash = ?",
|
||||
(collection, file_hash),
|
||||
).fetchone()
|
||||
return row[0] if row else None
|
||||
|
||||
def get_document(self, collection: str, doc_id: str) -> dict:
|
||||
conn = self._get_conn()
|
||||
row = conn.execute(
|
||||
"SELECT doc_id, doc_name, doc_description, file_path, doc_type FROM documents WHERE doc_id = ? AND collection_name = ?",
|
||||
(doc_id, collection),
|
||||
).fetchone()
|
||||
if not row:
|
||||
return {}
|
||||
return {"doc_id": row[0], "doc_name": row[1], "doc_description": row[2],
|
||||
"file_path": row[3], "doc_type": row[4]}
|
||||
|
||||
def get_document_structure(self, collection: str, doc_id: str) -> list:
|
||||
conn = self._get_conn()
|
||||
row = conn.execute(
|
||||
"SELECT structure FROM documents WHERE doc_id = ? AND collection_name = ?",
|
||||
(doc_id, collection),
|
||||
).fetchone()
|
||||
if not row:
|
||||
return []
|
||||
return json.loads(row[0])
|
||||
|
||||
def get_pages(self, collection: str, doc_id: str) -> list | None:
|
||||
"""Return cached page content, or None if not cached."""
|
||||
conn = self._get_conn()
|
||||
row = conn.execute(
|
||||
"SELECT pages FROM documents WHERE doc_id = ? AND collection_name = ?",
|
||||
(doc_id, collection),
|
||||
).fetchone()
|
||||
if not row or not row[0]:
|
||||
return None
|
||||
return json.loads(row[0])
|
||||
|
||||
def list_documents(self, collection: str) -> list[dict]:
|
||||
conn = self._get_conn()
|
||||
rows = conn.execute(
|
||||
"SELECT doc_id, doc_name, doc_type FROM documents WHERE collection_name = ? ORDER BY created_at",
|
||||
(collection,),
|
||||
).fetchall()
|
||||
return [{"doc_id": r[0], "doc_name": r[1], "doc_type": r[2]} for r in rows]
|
||||
|
||||
def delete_document(self, collection: str, doc_id: str) -> None:
|
||||
conn = self._get_conn()
|
||||
conn.execute(
|
||||
"DELETE FROM documents WHERE doc_id = ? AND collection_name = ?",
|
||||
(doc_id, collection),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.close()
|
||||
return False
|
||||
|
||||
def close(self) -> None:
|
||||
"""Close all tracked SQLite connections across all threads."""
|
||||
with self._conn_lock:
|
||||
for conn in self._connections:
|
||||
try:
|
||||
conn.close()
|
||||
except Exception:
|
||||
pass
|
||||
self._connections.clear()
|
||||
if hasattr(self._local, "conn"):
|
||||
del self._local.conn
|
||||
|
||||
def __del__(self):
|
||||
try:
|
||||
self.close()
|
||||
except Exception:
|
||||
pass
|
||||
Loading…
Add table
Add a link
Reference in a new issue