Restructure examples directory and improve document storage (#189)

* Consolidate tests/ into examples/documents/

* Add line_count and reorder structure keys

* Lazy-load documents with _meta.json index

* Update demo script and add pre-shipped workspace

* Extract shared helpers for JSON reading and meta entry building
This commit is contained in:
Ray 2026-03-28 04:28:59 +08:00 committed by GitHub
parent 74e549a23a
commit 77722838e1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
26 changed files with 430 additions and 61 deletions

10
.gitignore vendored
View file

@ -1,16 +1,6 @@
.ipynb_checkpoints
__pycache__
files
index
temp/*
chroma-collections.parquet
chroma-embeddings.parquet
.DS_Store
.env*
.venv/
notebook
SDK/*
log/*
logs/
parts/*
json_results/*

View file

@ -105,7 +105,7 @@ The PageIndex service is available as a ChatGPT-style [chat platform](https://ch
PageIndex can transform lengthy PDF documents into a semantic **tree structure**, similar to a _"table of contents"_ but optimized for use with Large Language Models (LLMs). It's ideal for: financial reports, regulatory filings, academic textbooks, legal or technical manuals, and any document that exceeds LLM context limits.
Below is an example PageIndex tree structure. Also see more example [documents](https://github.com/VectifyAI/PageIndex/tree/main/tests/pdfs) and generated [tree structures](https://github.com/VectifyAI/PageIndex/tree/main/tests/results).
Below is an example PageIndex tree structure. Also see more example [documents](https://github.com/VectifyAI/PageIndex/tree/main/examples/documents) and generated [tree structures](https://github.com/VectifyAI/PageIndex/tree/main/examples/documents/results).
```jsonc
...

Binary file not shown.

View file

@ -18,10 +18,10 @@ Steps:
"""
import os
import sys
import json
import asyncio
import concurrent.futures
import requests
from pathlib import Path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@ -32,9 +32,10 @@ from openai.types.responses import ResponseTextDeltaEvent, ResponseReasoningSumm
from pageindex import PageIndexClient
import pageindex.utils as utils
_EXAMPLES_DIR = os.path.dirname(os.path.abspath(__file__))
PDF_URL = "https://arxiv.org/pdf/2603.15031"
PDF_PATH = "tests/pdfs/attention-residuals.pdf"
WORKSPACE = "./pageindex_workspace"
PDF_PATH = os.path.join(_EXAMPLES_DIR, "documents", "attention-residuals.pdf")
WORKSPACE = os.path.join(_EXAMPLES_DIR, "workspace")
AGENT_SYSTEM_PROMPT = """
You are PageIndex, a document QA assistant.
@ -147,16 +148,16 @@ client = PageIndexClient(workspace=WORKSPACE)
print("=" * 60)
print("Step 1: Indexing PDF and inspecting tree structure")
print("=" * 60)
_id_cache = Path(WORKSPACE).expanduser() / "demo_doc_id.txt"
if _id_cache.exists() and (doc_id := _id_cache.read_text().strip()) in client.documents:
doc_id = next((did for did, doc in client.documents.items()
if doc.get('doc_name') == os.path.basename(PDF_PATH)), None)
if doc_id:
print(f"\nLoaded cached doc_id: {doc_id}")
else:
doc_id = client.index(PDF_PATH)
_id_cache.parent.mkdir(parents=True, exist_ok=True)
_id_cache.write_text(doc_id)
print(f"\nIndexed. doc_id: {doc_id}")
print("\nTree Structure (top-level sections):")
utils.print_tree(client.documents[doc_id]["structure"])
structure = json.loads(client.get_document_structure(doc_id))
utils.print_tree(structure)
# ── Step 2: Document Metadata ──────────────────────────────────────────────────
print("\n" + "=" * 60)

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,9 @@
{
"12345678-abcd-4321-abcd-123456789abc": {
"type": "pdf",
"doc_name": "attention-residuals.pdf",
"doc_description": "This document introduces \"Attention Residuals\" (AttnRes) and its scalable variant \"Block AttnRes,\" novel mechanisms for replacing fixed residual accumulation in neural networks with learned, input-dependent depth-wise attention, addressing limitations of standard residual connections while optimizing memory, computation, and scalability for large-scale training and inference.",
"page_count": 21,
"path": "../documents/attention-residuals.pdf"
}
}

View file

@ -5,10 +5,15 @@ import asyncio
import concurrent.futures
from pathlib import Path
import PyPDF2
from .page_index import page_index
from .page_index_md import md_to_tree
from .retrieve import get_document, get_document_structure, get_page_content
from .utils import ConfigLoader
from .utils import ConfigLoader, remove_fields
META_INDEX = "_meta.json"
class PageIndexClient:
"""
@ -39,6 +44,9 @@ class PageIndexClient:
def index(self, file_path: str, mode: str = "auto") -> str:
"""Index a document. Returns a document_id."""
# Persist a canonical absolute path so workspace reloads do not
# reinterpret caller-relative paths against the workspace directory.
file_path = os.path.abspath(os.path.expanduser(file_path))
if not os.path.exists(file_path):
raise FileNotFoundError(f"File not found: {file_path}")
@ -58,13 +66,22 @@ class PageIndexClient:
if_add_node_id='yes',
if_add_doc_description='yes'
)
# Extract per-page text so queries don't need the original PDF
pages = []
with open(file_path, 'rb') as f:
pdf_reader = PyPDF2.PdfReader(f)
for i, page in enumerate(pdf_reader.pages, 1):
pages.append({'page': i, 'content': page.extract_text() or ''})
self.documents[doc_id] = {
'id': doc_id,
'path': file_path,
'type': 'pdf',
'structure': result['structure'],
'path': file_path,
'doc_name': result.get('doc_name', ''),
'doc_description': result.get('doc_description', '')
'doc_description': result.get('doc_description', ''),
'page_count': len(pages),
'structure': result['structure'],
'pages': pages,
}
elif mode == "md" or (mode == "auto" and is_md):
@ -87,11 +104,12 @@ class PageIndexClient:
result = asyncio.run(coro)
self.documents[doc_id] = {
'id': doc_id,
'path': file_path,
'type': 'md',
'structure': result['structure'],
'path': file_path,
'doc_name': result.get('doc_name', ''),
'doc_description': result.get('doc_description', '')
'doc_description': result.get('doc_description', ''),
'line_count': result.get('line_count', 0),
'structure': result['structure'],
}
else:
raise ValueError(f"Unsupported file format for: {file_path}")
@ -101,23 +119,93 @@ class PageIndexClient:
self._save_doc(doc_id)
return doc_id
@staticmethod
def _make_meta_entry(doc: dict) -> dict:
"""Build a lightweight meta entry from a document dict."""
entry = {
'type': doc.get('type', ''),
'doc_name': doc.get('doc_name', ''),
'doc_description': doc.get('doc_description', ''),
'path': doc.get('path', ''),
}
if doc.get('type') == 'pdf':
entry['page_count'] = doc.get('page_count')
elif doc.get('type') == 'md':
entry['line_count'] = doc.get('line_count')
return entry
@staticmethod
def _read_json(path) -> dict | None:
"""Read a JSON file, returning None on any error."""
try:
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
except (json.JSONDecodeError, OSError) as e:
print(f"Warning: corrupt {Path(path).name}: {e}")
return None
def _save_doc(self, doc_id: str):
    """Persist one document to <workspace>/<doc_id>.json and update the meta index.

    After saving, heavy fields ('structure', 'pages') are dropped from the
    in-memory copy; they are reloaded on demand by _ensure_doc_loaded().
    """
    doc = self.documents[doc_id].copy()
    # Strip text from structure nodes — redundant with pages (PDF only)
    if doc.get('structure') and doc.get('type') == 'pdf':
        doc['structure'] = remove_fields(doc['structure'], fields=['text'])
    path = self.workspace / f"{doc_id}.json"
    with open(path, "w", encoding="utf-8") as f:
        # Write only the trimmed copy; dumping the raw in-memory dict as well
        # would append a second JSON document and corrupt the file.
        json.dump(doc, f, ensure_ascii=False, indent=2)
    self._save_meta(doc_id, self._make_meta_entry(doc))
    # Drop heavy fields; will lazy-load on demand
    self.documents[doc_id].pop('structure', None)
    self.documents[doc_id].pop('pages', None)
def _rebuild_meta(self) -> dict:
    """Scan individual doc JSON files and return a meta dict keyed by doc id."""
    # Skip the index file itself; every other *.json is a saved document.
    doc_paths = (p for p in self.workspace.glob("*.json") if p.name != META_INDEX)
    meta = {}
    for doc_path in doc_paths:
        loaded = self._read_json(doc_path)
        # Only well-formed, non-empty JSON objects contribute an entry.
        if isinstance(loaded, dict) and loaded:
            meta[doc_path.stem] = self._make_meta_entry(loaded)
    return meta
def _read_meta(self) -> dict | None:
    """Read and validate _meta.json, returning None on any corruption."""
    meta = self._read_json(self.workspace / META_INDEX)
    # _read_json already yields None for missing/malformed files; additionally
    # reject well-formed JSON whose top level is not an object.
    if meta is None or isinstance(meta, dict):
        return meta
    print(f"Warning: {META_INDEX} is not a JSON object, ignoring")
    return None
def _save_meta(self, doc_id: str, entry: dict):
    """Upsert one entry into _meta.json, rebuilding the index if it is unusable."""
    meta = self._read_meta()
    if not meta:
        # Missing, corrupt, or empty index — reconstruct it from the doc files.
        meta = self._rebuild_meta()
    meta[doc_id] = entry
    with open(self.workspace / META_INDEX, "w", encoding="utf-8") as fh:
        json.dump(meta, fh, ensure_ascii=False, indent=2)
def _load_workspace(self):
    """Populate self.documents from the _meta.json index (lazy — no heavy fields).

    Falls back to scanning the per-document JSON files when the index is
    missing or corrupt. Heavy fields ('structure', 'pages') are loaded on
    demand by _ensure_doc_loaded(), not here.
    """
    meta = self._read_meta()
    if meta is None:
        # No usable index — rebuild meta entries from the doc files.
        meta = self._rebuild_meta()
        # NOTE(review): "legacy mode" message assumed to fire only on rebuild —
        # confirm intended placement against the original indentation.
        if meta:
            print(f"Loaded {len(meta)} document(s) from workspace (legacy mode).")
    for doc_id, entry in meta.items():
        doc = dict(entry, id=doc_id)
        # Meta entries may store workspace-relative paths; make them absolute.
        if doc.get('path') and not os.path.isabs(doc['path']):
            doc['path'] = str((self.workspace / doc['path']).resolve())
        self.documents[doc_id] = doc
def _ensure_doc_loaded(self, doc_id: str):
    """Load full document JSON on demand (structure, pages, etc.)."""
    doc = self.documents.get(doc_id)
    # Nothing to do for unknown ids or docs whose heavy fields are present.
    if not doc or doc.get('structure') is not None:
        return
    full = self._read_json(self.workspace / f"{doc_id}.json")
    if full:
        doc['structure'] = full.get('structure', [])
        pages = full.get('pages')
        if pages:
            doc['pages'] = pages
def get_document(self, doc_id: str) -> str:
"""Return document metadata JSON."""
@ -125,8 +213,12 @@ class PageIndexClient:
def get_document_structure(self, doc_id: str) -> str:
    """Return the document's tree structure as JSON (text fields omitted)."""
    if self.workspace:
        # Structure may have been dropped after save; hydrate it from disk.
        self._ensure_doc_loaded(doc_id)
    return get_document_structure(self.documents, doc_id)
def get_page_content(self, doc_id: str, pages: str) -> str:
    """Return page content for the given pages string (e.g. '5-7', '3,8', '12')."""
    if self.workspace:
        # Cached page text may have been dropped after save; hydrate from disk.
        self._ensure_doc_loaded(doc_id)
    return get_page_content(self.documents, doc_id, pages)

View file

@ -1095,11 +1095,13 @@ def page_index_main(doc, opt=None):
# Create a clean structure without unnecessary fields for description generation
clean_structure = create_clean_structure_for_description(structure)
doc_description = generate_doc_description(clean_structure, model=opt.model)
structure = format_structure(structure, order=['title', 'node_id', 'start_index', 'end_index', 'summary', 'text', 'nodes'])
return {
'doc_name': get_pdf_name(doc),
'doc_description': doc_description,
'structure': structure,
}
structure = format_structure(structure, order=['title', 'node_id', 'start_index', 'end_index', 'summary', 'text', 'nodes'])
return {
'doc_name': get_pdf_name(doc),
'structure': structure,

View file

@ -243,6 +243,7 @@ def clean_tree_for_output(tree_nodes):
async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes'):
with open(md_path, 'r', encoding='utf-8') as f:
markdown_content = f.read()
line_count = markdown_content.count('\n') + 1
print(f"Extracting nodes from markdown...")
node_list, markdown_lines = extract_nodes_from_markdown(markdown_content)
@ -265,14 +266,14 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
if if_add_node_summary == 'yes':
# Always include text for summary generation
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'])
print(f"Generating summaries for each node...")
tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model)
if if_add_node_text == 'no':
# Remove text after summary generation if not requested
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes'])
if if_add_doc_description == 'yes':
print(f"Generating document description...")
@ -282,17 +283,19 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
return {
'doc_name': os.path.splitext(os.path.basename(md_path))[0],
'doc_description': doc_description,
'line_count': line_count,
'structure': tree_structure,
}
else:
# No summaries needed, format based on text preference
if if_add_node_text == 'yes':
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'])
else:
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes'])
return {
'doc_name': os.path.splitext(os.path.basename(md_path))[0],
'line_count': line_count,
'structure': tree_structure,
}
@ -303,7 +306,7 @@ if __name__ == "__main__":
# MD_NAME = 'Detect-Order-Construct'
MD_NAME = 'cognitive-load'
MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'tests/markdowns/', f'{MD_NAME}.md')
MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'examples/documents/', f'{MD_NAME}.md')
MODEL="gpt-4.1"

View file

@ -25,25 +25,23 @@ def _parse_pages(pages: str) -> list[int]:
def _count_pages(doc_info: dict) -> int:
"""Return total page count for a document."""
if doc_info.get('type') == 'pdf':
return get_number_of_pages(doc_info['path'])
# For MD, find max line_num across all nodes
max_line = 0
def _traverse(nodes):
nonlocal max_line
for node in nodes:
ln = node.get('line_num', 0)
if ln and ln > max_line:
max_line = ln
if node.get('nodes'):
_traverse(node['nodes'])
_traverse(doc_info.get('structure', []))
return max_line
"""Return total page count for a PDF document."""
if doc_info.get('page_count'):
return doc_info['page_count']
if doc_info.get('pages'):
return len(doc_info['pages'])
return get_number_of_pages(doc_info['path'])
def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
"""Extract text for specific PDF pages (1-indexed), opening the PDF once."""
"""Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF."""
cached_pages = doc_info.get('pages')
if cached_pages:
page_map = {p['page']: p['content'] for p in cached_pages}
return [
{'page': p, 'content': page_map[p]}
for p in page_nums if p in page_map
]
path = doc_info['path']
with open(path, 'rb') as f:
pdf_reader = PyPDF2.PdfReader(f)
@ -95,7 +93,7 @@ def get_document(documents: dict, doc_id: str) -> str:
if doc_info.get('type') == 'pdf':
result['page_count'] = _count_pages(doc_info)
else:
result['line_count'] = _count_pages(doc_info)
result['line_count'] = doc_info.get('line_count', 0)
return json.dumps(result)