mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-02 19:55:18 +02:00
feat: optimize source_markdown migration by processing documents in batches to reduce memory usage and improve performance
This commit is contained in:
parent
e1087937e6
commit
ef53203e4b
1 changed file with 55 additions and 31 deletions
|
|
@ -6,7 +6,7 @@ Create Date: 2026-02-17
|
||||||
|
|
||||||
Adds source_markdown column and converts only documents that have
|
Adds source_markdown column and converts only documents that have
|
||||||
blocknote_document data. Uses a pure-Python BlockNote JSON → Markdown
|
blocknote_document data. Uses a pure-Python BlockNote JSON → Markdown
|
||||||
converter. No external dependencies (no Node.js, no Celery, no HTTP calls).
|
converter without external dependencies.
|
||||||
|
|
||||||
Documents without blocknote_document keep source_markdown = NULL and
|
Documents without blocknote_document keep source_markdown = NULL and
|
||||||
get populated lazily by the editor route when a user first opens them.
|
get populated lazily by the editor route when a user first opens them.
|
||||||
|
|
@ -50,22 +50,24 @@ def upgrade() -> None:
|
||||||
_populate_source_markdown(conn)
|
_populate_source_markdown(conn)
|
||||||
|
|
||||||
|
|
||||||
def _populate_source_markdown(conn) -> None:
|
def _populate_source_markdown(conn, batch_size: int = 500) -> None:
|
||||||
"""Populate source_markdown only for documents that have blocknote_document."""
|
"""Populate source_markdown only for documents that have blocknote_document.
|
||||||
|
|
||||||
|
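Processes documents in batches of ``batch_size`` rows so that only one batch is held in memory at a time.
NOTE(review): no commit is issued between batches in this function, so batching by itself does not shorten the surrounding transaction.
NOTE(review): rows migrated by a batch stop matching the ``source_markdown IS NULL`` filter, so advancing OFFSET afterwards can skip rows that were never processed; confirm, and consider keyset pagination (``id > :last_id``) instead.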
|
||||||
|
"""
|
||||||
from app.utils.blocknote_to_markdown import blocknote_to_markdown
|
from app.utils.blocknote_to_markdown import blocknote_to_markdown
|
||||||
|
|
||||||
# Only fetch documents that have blocknote_document content
|
# Get total count first
|
||||||
result = conn.execute(
|
count_result = conn.execute(
|
||||||
sa.text("""
|
sa.text("""
|
||||||
SELECT id, title, blocknote_document
|
SELECT count(*)
|
||||||
FROM documents
|
FROM documents
|
||||||
WHERE source_markdown IS NULL
|
WHERE source_markdown IS NULL
|
||||||
AND blocknote_document IS NOT NULL
|
AND blocknote_document IS NOT NULL
|
||||||
""")
|
""")
|
||||||
)
|
)
|
||||||
rows = result.fetchall()
|
total = count_result.scalar()
|
||||||
|
|
||||||
total = len(rows)
|
|
||||||
if total == 0:
|
if total == 0:
|
||||||
print("✓ No documents with blocknote_document need migration")
|
print("✓ No documents with blocknote_document need migration")
|
||||||
return
|
return
|
||||||
|
|
@ -74,35 +76,57 @@ def _populate_source_markdown(conn) -> None:
|
||||||
|
|
||||||
migrated = 0
|
migrated = 0
|
||||||
failed = 0
|
failed = 0
|
||||||
|
offset = 0
|
||||||
|
|
||||||
for row in rows:
|
while offset < total:
|
||||||
doc_id = row[0]
|
# Fetch one batch at a time
|
||||||
doc_title = row[1]
|
result = conn.execute(
|
||||||
blocknote_doc = row[2]
|
sa.text("""
|
||||||
|
SELECT id, title, blocknote_document
|
||||||
|
FROM documents
|
||||||
|
WHERE source_markdown IS NULL
|
||||||
|
AND blocknote_document IS NOT NULL
|
||||||
|
ORDER BY id
|
||||||
|
LIMIT :limit OFFSET :offset
|
||||||
|
"""),
|
||||||
|
{"limit": batch_size, "offset": offset},
|
||||||
|
)
|
||||||
|
rows = result.fetchall()
|
||||||
|
|
||||||
try:
|
if not rows:
|
||||||
if isinstance(blocknote_doc, str):
|
break
|
||||||
blocknote_doc = json.loads(blocknote_doc)
|
|
||||||
markdown = blocknote_to_markdown(blocknote_doc)
|
|
||||||
|
|
||||||
if markdown:
|
for row in rows:
|
||||||
conn.execute(
|
doc_id = row[0]
|
||||||
sa.text("""
|
doc_title = row[1]
|
||||||
UPDATE documents SET source_markdown = :md WHERE id = :doc_id
|
blocknote_doc = row[2]
|
||||||
"""),
|
|
||||||
{"md": markdown, "doc_id": doc_id},
|
try:
|
||||||
)
|
if isinstance(blocknote_doc, str):
|
||||||
migrated += 1
|
blocknote_doc = json.loads(blocknote_doc)
|
||||||
else:
|
markdown = blocknote_to_markdown(blocknote_doc)
|
||||||
|
|
||||||
|
if markdown:
|
||||||
|
conn.execute(
|
||||||
|
sa.text("""
|
||||||
|
UPDATE documents SET source_markdown = :md WHERE id = :doc_id
|
||||||
|
"""),
|
||||||
|
{"md": markdown, "doc_id": doc_id},
|
||||||
|
)
|
||||||
|
migrated += 1
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
f" Doc {doc_id} ({doc_title}): blocknote conversion produced empty result"
|
||||||
|
)
|
||||||
|
failed += 1
|
||||||
|
except Exception as e:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f" Doc {doc_id} ({doc_title}): blocknote conversion produced empty result"
|
f" Doc {doc_id} ({doc_title}): blocknote conversion failed ({e})"
|
||||||
)
|
)
|
||||||
failed += 1
|
failed += 1
|
||||||
except Exception as e:
|
|
||||||
logger.warning(
|
print(f" Batch complete: processed {min(offset + batch_size, total)}/{total}")
|
||||||
f" Doc {doc_id} ({doc_title}): blocknote conversion failed ({e})"
|
offset += batch_size
|
||||||
)
|
|
||||||
failed += 1
|
|
||||||
|
|
||||||
print(
|
print(
|
||||||
f"✓ source_markdown migration complete: {migrated} migrated, "
|
f"✓ source_markdown migration complete: {migrated} migrated, "
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue