mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-04-24 23:56:21 +02:00
Restructure examples directory and improve document storage (#189)
* Consolidate tests/ into examples/documents/ * Add line_count and reorder structure keys * Lazy-load documents with _meta.json index * Update demo script and add pre-shipped workspace * Extract shared helpers for JSON reading and meta entry building
This commit is contained in:
parent
74e549a23a
commit
77722838e1
26 changed files with 430 additions and 61 deletions
10
.gitignore
vendored
10
.gitignore
vendored
|
|
@ -1,16 +1,6 @@
|
|||
.ipynb_checkpoints
|
||||
__pycache__
|
||||
files
|
||||
index
|
||||
temp/*
|
||||
chroma-collections.parquet
|
||||
chroma-embeddings.parquet
|
||||
.DS_Store
|
||||
.env*
|
||||
.venv/
|
||||
notebook
|
||||
SDK/*
|
||||
log/*
|
||||
logs/
|
||||
parts/*
|
||||
json_results/*
|
||||
|
|
|
|||
|
|
@ -105,7 +105,7 @@ The PageIndex service is available as a ChatGPT-style [chat platform](https://ch
|
|||
|
||||
PageIndex can transform lengthy PDF documents into a semantic **tree structure**, similar to a _"table of contents"_ but optimized for use with Large Language Models (LLMs). It's ideal for: financial reports, regulatory filings, academic textbooks, legal or technical manuals, and any document that exceeds LLM context limits.
|
||||
|
||||
Below is an example PageIndex tree structure. Also see more example [documents](https://github.com/VectifyAI/PageIndex/tree/main/tests/pdfs) and generated [tree structures](https://github.com/VectifyAI/PageIndex/tree/main/tests/results).
|
||||
Below is an example PageIndex tree structure. Also see more example [documents](https://github.com/VectifyAI/PageIndex/tree/main/examples/documents) and generated [tree structures](https://github.com/VectifyAI/PageIndex/tree/main/examples/documents/results).
|
||||
|
||||
```jsonc
|
||||
...
|
||||
|
|
|
|||
BIN
examples/documents/attention-residuals.pdf
Normal file
BIN
examples/documents/attention-residuals.pdf
Normal file
Binary file not shown.
|
|
@ -18,10 +18,10 @@ Steps:
|
|||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import asyncio
|
||||
import concurrent.futures
|
||||
import requests
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
|
|
@ -32,9 +32,10 @@ from openai.types.responses import ResponseTextDeltaEvent, ResponseReasoningSumm
|
|||
from pageindex import PageIndexClient
|
||||
import pageindex.utils as utils
|
||||
|
||||
_EXAMPLES_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
PDF_URL = "https://arxiv.org/pdf/2603.15031"
|
||||
PDF_PATH = "tests/pdfs/attention-residuals.pdf"
|
||||
WORKSPACE = "./pageindex_workspace"
|
||||
PDF_PATH = os.path.join(_EXAMPLES_DIR, "documents", "attention-residuals.pdf")
|
||||
WORKSPACE = os.path.join(_EXAMPLES_DIR, "workspace")
|
||||
|
||||
AGENT_SYSTEM_PROMPT = """
|
||||
You are PageIndex, a document QA assistant.
|
||||
|
|
@ -147,16 +148,16 @@ client = PageIndexClient(workspace=WORKSPACE)
|
|||
print("=" * 60)
|
||||
print("Step 1: Indexing PDF and inspecting tree structure")
|
||||
print("=" * 60)
|
||||
_id_cache = Path(WORKSPACE).expanduser() / "demo_doc_id.txt"
|
||||
if _id_cache.exists() and (doc_id := _id_cache.read_text().strip()) in client.documents:
|
||||
doc_id = next((did for did, doc in client.documents.items()
|
||||
if doc.get('doc_name') == os.path.basename(PDF_PATH)), None)
|
||||
if doc_id:
|
||||
print(f"\nLoaded cached doc_id: {doc_id}")
|
||||
else:
|
||||
doc_id = client.index(PDF_PATH)
|
||||
_id_cache.parent.mkdir(parents=True, exist_ok=True)
|
||||
_id_cache.write_text(doc_id)
|
||||
print(f"\nIndexed. doc_id: {doc_id}")
|
||||
print("\nTree Structure (top-level sections):")
|
||||
utils.print_tree(client.documents[doc_id]["structure"])
|
||||
structure = json.loads(client.get_document_structure(doc_id))
|
||||
utils.print_tree(structure)
|
||||
|
||||
# ── Step 2: Document Metadata ──────────────────────────────────────────────────
|
||||
print("\n" + "=" * 60)
|
||||
|
|
|
|||
274
examples/workspace/12345678-abcd-4321-abcd-123456789abc.json
Normal file
274
examples/workspace/12345678-abcd-4321-abcd-123456789abc.json
Normal file
File diff suppressed because one or more lines are too long
9
examples/workspace/_meta.json
Normal file
9
examples/workspace/_meta.json
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
{
|
||||
"12345678-abcd-4321-abcd-123456789abc": {
|
||||
"type": "pdf",
|
||||
"doc_name": "attention-residuals.pdf",
|
||||
"doc_description": "This document introduces \"Attention Residuals\" (AttnRes) and its scalable variant \"Block AttnRes,\" novel mechanisms for replacing fixed residual accumulation in neural networks with learned, input-dependent depth-wise attention, addressing limitations of standard residual connections while optimizing memory, computation, and scalability for large-scale training and inference.",
|
||||
"page_count": 21,
|
||||
"path": "../documents/attention-residuals.pdf"
|
||||
}
|
||||
}
|
||||
|
|
@ -5,10 +5,15 @@ import asyncio
|
|||
import concurrent.futures
|
||||
from pathlib import Path
|
||||
|
||||
import PyPDF2
|
||||
|
||||
from .page_index import page_index
|
||||
from .page_index_md import md_to_tree
|
||||
from .retrieve import get_document, get_document_structure, get_page_content
|
||||
from .utils import ConfigLoader
|
||||
from .utils import ConfigLoader, remove_fields
|
||||
|
||||
META_INDEX = "_meta.json"
|
||||
|
||||
|
||||
class PageIndexClient:
|
||||
"""
|
||||
|
|
@ -39,6 +44,9 @@ class PageIndexClient:
|
|||
|
||||
def index(self, file_path: str, mode: str = "auto") -> str:
|
||||
"""Index a document. Returns a document_id."""
|
||||
# Persist a canonical absolute path so workspace reloads do not
|
||||
# reinterpret caller-relative paths against the workspace directory.
|
||||
file_path = os.path.abspath(os.path.expanduser(file_path))
|
||||
if not os.path.exists(file_path):
|
||||
raise FileNotFoundError(f"File not found: {file_path}")
|
||||
|
||||
|
|
@ -58,13 +66,22 @@ class PageIndexClient:
|
|||
if_add_node_id='yes',
|
||||
if_add_doc_description='yes'
|
||||
)
|
||||
# Extract per-page text so queries don't need the original PDF
|
||||
pages = []
|
||||
with open(file_path, 'rb') as f:
|
||||
pdf_reader = PyPDF2.PdfReader(f)
|
||||
for i, page in enumerate(pdf_reader.pages, 1):
|
||||
pages.append({'page': i, 'content': page.extract_text() or ''})
|
||||
|
||||
self.documents[doc_id] = {
|
||||
'id': doc_id,
|
||||
'path': file_path,
|
||||
'type': 'pdf',
|
||||
'structure': result['structure'],
|
||||
'path': file_path,
|
||||
'doc_name': result.get('doc_name', ''),
|
||||
'doc_description': result.get('doc_description', '')
|
||||
'doc_description': result.get('doc_description', ''),
|
||||
'page_count': len(pages),
|
||||
'structure': result['structure'],
|
||||
'pages': pages,
|
||||
}
|
||||
|
||||
elif mode == "md" or (mode == "auto" and is_md):
|
||||
|
|
@ -87,11 +104,12 @@ class PageIndexClient:
|
|||
result = asyncio.run(coro)
|
||||
self.documents[doc_id] = {
|
||||
'id': doc_id,
|
||||
'path': file_path,
|
||||
'type': 'md',
|
||||
'structure': result['structure'],
|
||||
'path': file_path,
|
||||
'doc_name': result.get('doc_name', ''),
|
||||
'doc_description': result.get('doc_description', '')
|
||||
'doc_description': result.get('doc_description', ''),
|
||||
'line_count': result.get('line_count', 0),
|
||||
'structure': result['structure'],
|
||||
}
|
||||
else:
|
||||
raise ValueError(f"Unsupported file format for: {file_path}")
|
||||
|
|
@ -101,23 +119,93 @@ class PageIndexClient:
|
|||
self._save_doc(doc_id)
|
||||
return doc_id
|
||||
|
||||
@staticmethod
|
||||
def _make_meta_entry(doc: dict) -> dict:
|
||||
"""Build a lightweight meta entry from a document dict."""
|
||||
entry = {
|
||||
'type': doc.get('type', ''),
|
||||
'doc_name': doc.get('doc_name', ''),
|
||||
'doc_description': doc.get('doc_description', ''),
|
||||
'path': doc.get('path', ''),
|
||||
}
|
||||
if doc.get('type') == 'pdf':
|
||||
entry['page_count'] = doc.get('page_count')
|
||||
elif doc.get('type') == 'md':
|
||||
entry['line_count'] = doc.get('line_count')
|
||||
return entry
|
||||
|
||||
@staticmethod
|
||||
def _read_json(path) -> dict | None:
|
||||
"""Read a JSON file, returning None on any error."""
|
||||
try:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
except (json.JSONDecodeError, OSError) as e:
|
||||
print(f"Warning: corrupt {Path(path).name}: {e}")
|
||||
return None
|
||||
|
||||
def _save_doc(self, doc_id: str):
|
||||
doc = self.documents[doc_id].copy()
|
||||
# Strip text from structure nodes — redundant with pages (PDF only)
|
||||
if doc.get('structure') and doc.get('type') == 'pdf':
|
||||
doc['structure'] = remove_fields(doc['structure'], fields=['text'])
|
||||
path = self.workspace / f"{doc_id}.json"
|
||||
with open(path, "w", encoding="utf-8") as f:
|
||||
json.dump(self.documents[doc_id], f, ensure_ascii=False, indent=2)
|
||||
json.dump(doc, f, ensure_ascii=False, indent=2)
|
||||
self._save_meta(doc_id, self._make_meta_entry(doc))
|
||||
# Drop heavy fields; will lazy-load on demand
|
||||
self.documents[doc_id].pop('structure', None)
|
||||
self.documents[doc_id].pop('pages', None)
|
||||
|
||||
def _rebuild_meta(self) -> dict:
    """Reconstruct the index by scanning the workspace's document files.

    Every ``*.json`` file in the workspace except the index file itself is
    read; each one that parses to a non-empty JSON object contributes an
    entry keyed by the file's stem. Unreadable or non-object files are
    skipped.
    """
    rebuilt = {}
    doc_files = (p for p in self.workspace.glob("*.json") if p.name != META_INDEX)
    for doc_file in doc_files:
        payload = self._read_json(doc_file)
        if not payload or not isinstance(payload, dict):
            continue
        rebuilt[doc_file.stem] = self._make_meta_entry(payload)
    return rebuilt
|
||||
|
||||
def _read_meta(self) -> dict | None:
    """Load the workspace index file, accepting it only if it is a JSON object.

    Returns the parsed dict, or None when the file could not be read or
    when its top-level value is not an object (a warning is printed in the
    latter case).
    """
    index_file = self.workspace / META_INDEX
    loaded = self._read_json(index_file)
    if loaded is None or isinstance(loaded, dict):
        return loaded
    print(f"Warning: {META_INDEX} is not a JSON object, ignoring")
    return None
|
||||
|
||||
def _save_meta(self, doc_id: str, entry: dict):
    """Insert or replace one document's entry in the index file on disk.

    Starts from the existing index; if that is unreadable or empty, the
    index is rebuilt from the individual document files first. The whole
    index is then rewritten.
    """
    index = self._read_meta()
    if not index:
        # Missing, invalid or empty index: recover from the doc files.
        index = self._rebuild_meta()
    index[doc_id] = entry
    target = self.workspace / META_INDEX
    with open(target, "w", encoding="utf-8") as out:
        json.dump(index, out, ensure_ascii=False, indent=2)
|
||||
|
||||
def _load_workspace(self):
|
||||
loaded = 0
|
||||
for path in self.workspace.glob("*.json"):
|
||||
try:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
doc = json.load(f)
|
||||
self.documents[path.stem] = doc
|
||||
loaded += 1
|
||||
except (json.JSONDecodeError, OSError) as e:
|
||||
print(f"Warning: skipping corrupt workspace file {path.name}: {e}")
|
||||
if loaded:
|
||||
print(f"Loaded {loaded} document(s) from workspace.")
|
||||
meta = self._read_meta()
|
||||
if meta is None:
|
||||
meta = self._rebuild_meta()
|
||||
if meta:
|
||||
print(f"Loaded {len(meta)} document(s) from workspace (legacy mode).")
|
||||
for doc_id, entry in meta.items():
|
||||
doc = dict(entry, id=doc_id)
|
||||
if doc.get('path') and not os.path.isabs(doc['path']):
|
||||
doc['path'] = str((self.workspace / doc['path']).resolve())
|
||||
self.documents[doc_id] = doc
|
||||
|
||||
def _ensure_doc_loaded(self, doc_id: str):
|
||||
"""Load full document JSON on demand (structure, pages, etc.)."""
|
||||
doc = self.documents.get(doc_id)
|
||||
if not doc or doc.get('structure') is not None:
|
||||
return
|
||||
full = self._read_json(self.workspace / f"{doc_id}.json")
|
||||
if not full:
|
||||
return
|
||||
doc['structure'] = full.get('structure', [])
|
||||
if full.get('pages'):
|
||||
doc['pages'] = full['pages']
|
||||
|
||||
def get_document(self, doc_id: str) -> str:
|
||||
"""Return document metadata JSON."""
|
||||
|
|
@ -125,8 +213,12 @@ class PageIndexClient:
|
|||
|
||||
def get_document_structure(self, doc_id: str) -> str:
|
||||
"""Return document tree structure JSON (without text fields)."""
|
||||
if self.workspace:
|
||||
self._ensure_doc_loaded(doc_id)
|
||||
return get_document_structure(self.documents, doc_id)
|
||||
|
||||
def get_page_content(self, doc_id: str, pages: str) -> str:
|
||||
"""Return page content for the given pages string (e.g. '5-7', '3,8', '12')."""
|
||||
if self.workspace:
|
||||
self._ensure_doc_loaded(doc_id)
|
||||
return get_page_content(self.documents, doc_id, pages)
|
||||
|
|
|
|||
|
|
@ -1095,11 +1095,13 @@ def page_index_main(doc, opt=None):
|
|||
# Create a clean structure without unnecessary fields for description generation
|
||||
clean_structure = create_clean_structure_for_description(structure)
|
||||
doc_description = generate_doc_description(clean_structure, model=opt.model)
|
||||
structure = format_structure(structure, order=['title', 'node_id', 'start_index', 'end_index', 'summary', 'text', 'nodes'])
|
||||
return {
|
||||
'doc_name': get_pdf_name(doc),
|
||||
'doc_description': doc_description,
|
||||
'structure': structure,
|
||||
}
|
||||
structure = format_structure(structure, order=['title', 'node_id', 'start_index', 'end_index', 'summary', 'text', 'nodes'])
|
||||
return {
|
||||
'doc_name': get_pdf_name(doc),
|
||||
'structure': structure,
|
||||
|
|
|
|||
|
|
@ -243,7 +243,8 @@ def clean_tree_for_output(tree_nodes):
|
|||
async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes'):
|
||||
with open(md_path, 'r', encoding='utf-8') as f:
|
||||
markdown_content = f.read()
|
||||
|
||||
line_count = markdown_content.count('\n') + 1
|
||||
|
||||
print(f"Extracting nodes from markdown...")
|
||||
node_list, markdown_lines = extract_nodes_from_markdown(markdown_content)
|
||||
|
||||
|
|
@ -265,14 +266,14 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
|
|||
|
||||
if if_add_node_summary == 'yes':
|
||||
# Always include text for summary generation
|
||||
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
|
||||
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'])
|
||||
|
||||
print(f"Generating summaries for each node...")
|
||||
tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model)
|
||||
|
||||
if if_add_node_text == 'no':
|
||||
# Remove text after summary generation if not requested
|
||||
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])
|
||||
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes'])
|
||||
|
||||
if if_add_doc_description == 'yes':
|
||||
print(f"Generating document description...")
|
||||
|
|
@ -282,17 +283,19 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
|
|||
return {
|
||||
'doc_name': os.path.splitext(os.path.basename(md_path))[0],
|
||||
'doc_description': doc_description,
|
||||
'line_count': line_count,
|
||||
'structure': tree_structure,
|
||||
}
|
||||
else:
|
||||
# No summaries needed, format based on text preference
|
||||
if if_add_node_text == 'yes':
|
||||
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
|
||||
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'])
|
||||
else:
|
||||
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])
|
||||
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes'])
|
||||
|
||||
return {
|
||||
'doc_name': os.path.splitext(os.path.basename(md_path))[0],
|
||||
'line_count': line_count,
|
||||
'structure': tree_structure,
|
||||
}
|
||||
|
||||
|
|
@ -303,7 +306,7 @@ if __name__ == "__main__":
|
|||
|
||||
# MD_NAME = 'Detect-Order-Construct'
|
||||
MD_NAME = 'cognitive-load'
|
||||
MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'tests/markdowns/', f'{MD_NAME}.md')
|
||||
MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'examples/documents/', f'{MD_NAME}.md')
|
||||
|
||||
|
||||
MODEL="gpt-4.1"
|
||||
|
|
|
|||
|
|
@ -25,25 +25,23 @@ def _parse_pages(pages: str) -> list[int]:
|
|||
|
||||
|
||||
def _count_pages(doc_info: dict) -> int:
|
||||
"""Return total page count for a document."""
|
||||
if doc_info.get('type') == 'pdf':
|
||||
return get_number_of_pages(doc_info['path'])
|
||||
# For MD, find max line_num across all nodes
|
||||
max_line = 0
|
||||
def _traverse(nodes):
|
||||
nonlocal max_line
|
||||
for node in nodes:
|
||||
ln = node.get('line_num', 0)
|
||||
if ln and ln > max_line:
|
||||
max_line = ln
|
||||
if node.get('nodes'):
|
||||
_traverse(node['nodes'])
|
||||
_traverse(doc_info.get('structure', []))
|
||||
return max_line
|
||||
"""Return total page count for a PDF document."""
|
||||
if doc_info.get('page_count'):
|
||||
return doc_info['page_count']
|
||||
if doc_info.get('pages'):
|
||||
return len(doc_info['pages'])
|
||||
return get_number_of_pages(doc_info['path'])
|
||||
|
||||
|
||||
def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
|
||||
"""Extract text for specific PDF pages (1-indexed), opening the PDF once."""
|
||||
"""Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF."""
|
||||
cached_pages = doc_info.get('pages')
|
||||
if cached_pages:
|
||||
page_map = {p['page']: p['content'] for p in cached_pages}
|
||||
return [
|
||||
{'page': p, 'content': page_map[p]}
|
||||
for p in page_nums if p in page_map
|
||||
]
|
||||
path = doc_info['path']
|
||||
with open(path, 'rb') as f:
|
||||
pdf_reader = PyPDF2.PdfReader(f)
|
||||
|
|
@ -95,7 +93,7 @@ def get_document(documents: dict, doc_id: str) -> str:
|
|||
if doc_info.get('type') == 'pdf':
|
||||
result['page_count'] = _count_pages(doc_info)
|
||||
else:
|
||||
result['line_count'] = _count_pages(doc_info)
|
||||
result['line_count'] = doc_info.get('line_count', 0)
|
||||
return json.dumps(result)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue