Restructure examples directory and improve document storage (#189)

* Consolidate tests/ into examples/documents/

* Add line_count and reorder structure keys

* Lazy-load documents with _meta.json index

* Update demo script and add pre-shipped workspace

* Extract shared helpers for JSON reading and meta entry building
This commit is contained in:
Ray 2026-03-28 04:28:59 +08:00 committed by GitHub
parent 74e549a23a
commit 77722838e1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
26 changed files with 430 additions and 61 deletions

View file

@ -5,10 +5,15 @@ import asyncio
import concurrent.futures
from pathlib import Path
import PyPDF2
from .page_index import page_index
from .page_index_md import md_to_tree
from .retrieve import get_document, get_document_structure, get_page_content
from .utils import ConfigLoader
from .utils import ConfigLoader, remove_fields
META_INDEX = "_meta.json"
class PageIndexClient:
"""
@ -39,6 +44,9 @@ class PageIndexClient:
def index(self, file_path: str, mode: str = "auto") -> str:
"""Index a document. Returns a document_id."""
# Persist a canonical absolute path so workspace reloads do not
# reinterpret caller-relative paths against the workspace directory.
file_path = os.path.abspath(os.path.expanduser(file_path))
if not os.path.exists(file_path):
raise FileNotFoundError(f"File not found: {file_path}")
@ -58,13 +66,22 @@ class PageIndexClient:
if_add_node_id='yes',
if_add_doc_description='yes'
)
# Extract per-page text so queries don't need the original PDF
pages = []
with open(file_path, 'rb') as f:
pdf_reader = PyPDF2.PdfReader(f)
for i, page in enumerate(pdf_reader.pages, 1):
pages.append({'page': i, 'content': page.extract_text() or ''})
self.documents[doc_id] = {
'id': doc_id,
'path': file_path,
'type': 'pdf',
'structure': result['structure'],
'path': file_path,
'doc_name': result.get('doc_name', ''),
'doc_description': result.get('doc_description', '')
'doc_description': result.get('doc_description', ''),
'page_count': len(pages),
'structure': result['structure'],
'pages': pages,
}
elif mode == "md" or (mode == "auto" and is_md):
@ -87,11 +104,12 @@ class PageIndexClient:
result = asyncio.run(coro)
self.documents[doc_id] = {
'id': doc_id,
'path': file_path,
'type': 'md',
'structure': result['structure'],
'path': file_path,
'doc_name': result.get('doc_name', ''),
'doc_description': result.get('doc_description', '')
'doc_description': result.get('doc_description', ''),
'line_count': result.get('line_count', 0),
'structure': result['structure'],
}
else:
raise ValueError(f"Unsupported file format for: {file_path}")
@ -101,23 +119,93 @@ class PageIndexClient:
self._save_doc(doc_id)
return doc_id
@staticmethod
def _make_meta_entry(doc: dict) -> dict:
"""Build a lightweight meta entry from a document dict."""
entry = {
'type': doc.get('type', ''),
'doc_name': doc.get('doc_name', ''),
'doc_description': doc.get('doc_description', ''),
'path': doc.get('path', ''),
}
if doc.get('type') == 'pdf':
entry['page_count'] = doc.get('page_count')
elif doc.get('type') == 'md':
entry['line_count'] = doc.get('line_count')
return entry
@staticmethod
def _read_json(path) -> dict | None:
"""Read a JSON file, returning None on any error."""
try:
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
except (json.JSONDecodeError, OSError) as e:
print(f"Warning: corrupt {Path(path).name}: {e}")
return None
def _save_doc(self, doc_id: str):
doc = self.documents[doc_id].copy()
# Strip text from structure nodes — redundant with pages (PDF only)
if doc.get('structure') and doc.get('type') == 'pdf':
doc['structure'] = remove_fields(doc['structure'], fields=['text'])
path = self.workspace / f"{doc_id}.json"
with open(path, "w", encoding="utf-8") as f:
json.dump(self.documents[doc_id], f, ensure_ascii=False, indent=2)
json.dump(doc, f, ensure_ascii=False, indent=2)
self._save_meta(doc_id, self._make_meta_entry(doc))
# Drop heavy fields; will lazy-load on demand
self.documents[doc_id].pop('structure', None)
self.documents[doc_id].pop('pages', None)
def _rebuild_meta(self) -> dict:
    """Reconstruct the meta index by scanning per-document JSON files."""
    rebuilt = {}
    for doc_path in self.workspace.glob("*.json"):
        # The meta index itself lives alongside the documents; skip it.
        if doc_path.name == META_INDEX:
            continue
        data = self._read_json(doc_path)
        # Ignore unreadable files and payloads that are not JSON objects.
        if isinstance(data, dict) and data:
            rebuilt[doc_path.stem] = self._make_meta_entry(data)
    return rebuilt
def _read_meta(self) -> dict | None:
    """Load _meta.json; treat anything that is not a JSON object as absent."""
    data = self._read_json(self.workspace / META_INDEX)
    # None (missing/corrupt) and dicts pass straight through; any other
    # JSON value means the index is unusable.
    if data is None or isinstance(data, dict):
        return data
    print(f"Warning: {META_INDEX} is not a JSON object, ignoring")
    return None
def _save_meta(self, doc_id: str, entry: dict):
    """Insert or replace one document's entry in the _meta.json index."""
    # Fall back to a full rescan when the index is missing or empty.
    meta = self._read_meta()
    if not meta:
        meta = self._rebuild_meta()
    meta[doc_id] = entry
    with open(self.workspace / META_INDEX, "w", encoding="utf-8") as f:
        json.dump(meta, f, ensure_ascii=False, indent=2)
def _load_workspace(self):
loaded = 0
for path in self.workspace.glob("*.json"):
try:
with open(path, "r", encoding="utf-8") as f:
doc = json.load(f)
self.documents[path.stem] = doc
loaded += 1
except (json.JSONDecodeError, OSError) as e:
print(f"Warning: skipping corrupt workspace file {path.name}: {e}")
if loaded:
print(f"Loaded {loaded} document(s) from workspace.")
meta = self._read_meta()
if meta is None:
meta = self._rebuild_meta()
if meta:
print(f"Loaded {len(meta)} document(s) from workspace (legacy mode).")
for doc_id, entry in meta.items():
doc = dict(entry, id=doc_id)
if doc.get('path') and not os.path.isabs(doc['path']):
doc['path'] = str((self.workspace / doc['path']).resolve())
self.documents[doc_id] = doc
def _ensure_doc_loaded(self, doc_id: str):
"""Load full document JSON on demand (structure, pages, etc.)."""
doc = self.documents.get(doc_id)
if not doc or doc.get('structure') is not None:
return
full = self._read_json(self.workspace / f"{doc_id}.json")
if not full:
return
doc['structure'] = full.get('structure', [])
if full.get('pages'):
doc['pages'] = full['pages']
def get_document(self, doc_id: str) -> str:
"""Return document metadata JSON."""
@ -125,8 +213,12 @@ class PageIndexClient:
def get_document_structure(self, doc_id: str) -> str:
    """Return document tree structure JSON (without text fields)."""
    # Workspace-backed clients drop 'structure' from memory after saving;
    # re-hydrate it from disk before delegating to the module-level helper
    # (the imported get_document_structure, not this method).
    if self.workspace:
        self._ensure_doc_loaded(doc_id)
    return get_document_structure(self.documents, doc_id)
def get_page_content(self, doc_id: str, pages: str) -> str:
    """Return page content for the given pages string (e.g. '5-7', '3,8', '12')."""
    # Cached per-page text lives in the on-disk document JSON; load it so the
    # module-level helper can serve pages without reopening the original PDF.
    if self.workspace:
        self._ensure_doc_loaded(doc_id)
    return get_page_content(self.documents, doc_id, pages)

View file

@ -1095,11 +1095,13 @@ def page_index_main(doc, opt=None):
# Create a clean structure without unnecessary fields for description generation
clean_structure = create_clean_structure_for_description(structure)
doc_description = generate_doc_description(clean_structure, model=opt.model)
structure = format_structure(structure, order=['title', 'node_id', 'start_index', 'end_index', 'summary', 'text', 'nodes'])
return {
'doc_name': get_pdf_name(doc),
'doc_description': doc_description,
'structure': structure,
}
structure = format_structure(structure, order=['title', 'node_id', 'start_index', 'end_index', 'summary', 'text', 'nodes'])
return {
'doc_name': get_pdf_name(doc),
'structure': structure,

View file

@ -243,7 +243,8 @@ def clean_tree_for_output(tree_nodes):
async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes'):
with open(md_path, 'r', encoding='utf-8') as f:
markdown_content = f.read()
line_count = markdown_content.count('\n') + 1
print(f"Extracting nodes from markdown...")
node_list, markdown_lines = extract_nodes_from_markdown(markdown_content)
@ -265,14 +266,14 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
if if_add_node_summary == 'yes':
# Always include text for summary generation
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'])
print(f"Generating summaries for each node...")
tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model)
if if_add_node_text == 'no':
# Remove text after summary generation if not requested
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes'])
if if_add_doc_description == 'yes':
print(f"Generating document description...")
@ -282,17 +283,19 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
return {
'doc_name': os.path.splitext(os.path.basename(md_path))[0],
'doc_description': doc_description,
'line_count': line_count,
'structure': tree_structure,
}
else:
# No summaries needed, format based on text preference
if if_add_node_text == 'yes':
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'])
else:
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes'])
return {
'doc_name': os.path.splitext(os.path.basename(md_path))[0],
'line_count': line_count,
'structure': tree_structure,
}
@ -303,7 +306,7 @@ if __name__ == "__main__":
# MD_NAME = 'Detect-Order-Construct'
MD_NAME = 'cognitive-load'
MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'tests/markdowns/', f'{MD_NAME}.md')
MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'examples/documents/', f'{MD_NAME}.md')
MODEL="gpt-4.1"

View file

@ -25,25 +25,23 @@ def _parse_pages(pages: str) -> list[int]:
def _count_pages(doc_info: dict) -> int:
"""Return total page count for a document."""
if doc_info.get('type') == 'pdf':
return get_number_of_pages(doc_info['path'])
# For MD, find max line_num across all nodes
max_line = 0
def _traverse(nodes):
nonlocal max_line
for node in nodes:
ln = node.get('line_num', 0)
if ln and ln > max_line:
max_line = ln
if node.get('nodes'):
_traverse(node['nodes'])
_traverse(doc_info.get('structure', []))
return max_line
"""Return total page count for a PDF document."""
if doc_info.get('page_count'):
return doc_info['page_count']
if doc_info.get('pages'):
return len(doc_info['pages'])
return get_number_of_pages(doc_info['path'])
def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
"""Extract text for specific PDF pages (1-indexed), opening the PDF once."""
"""Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF."""
cached_pages = doc_info.get('pages')
if cached_pages:
page_map = {p['page']: p['content'] for p in cached_pages}
return [
{'page': p, 'content': page_map[p]}
for p in page_nums if p in page_map
]
path = doc_info['path']
with open(path, 'rb') as f:
pdf_reader = PyPDF2.PdfReader(f)
@ -95,7 +93,7 @@ def get_document(documents: dict, doc_id: str) -> str:
if doc_info.get('type') == 'pdf':
result['page_count'] = _count_pages(doc_info)
else:
result['line_count'] = _count_pages(doc_info)
result['line_count'] = doc_info.get('line_count', 0)
return json.dumps(result)