Restructure examples directory and improve document storage (#189)

* Consolidate tests/ into examples/documents/
* Add line_count and reorder structure keys
* Lazy-load documents with _meta.json index
* Update demo script and add pre-shipped workspace
* Extract shared helpers for JSON reading and meta entry building
2026-04-24 23:56:21 +02:00 · 2026-03-28 04:28:59 +08:00 · 2026-03-28 04:28:59 +08:00 · 77722838e1
commit 77722838e1
parent 74e549a23a
26 changed files with 430 additions and 61 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,16 +1,6 @@
 .ipynb_checkpoints
 __pycache__
-files
-index
-temp/*
-chroma-collections.parquet
-chroma-embeddings.parquet
 .DS_Store
 .env*
 .venv/
-notebook
-SDK/*
-log/*
 logs/
-parts/*
-json_results/*
--- a/README.md
+++ b/README.md
@ -105,7 +105,7 @@ The PageIndex service is available as a ChatGPT-style [chat platform](https://ch

 PageIndex can transform lengthy PDF documents into a semantic **tree structure**, similar to a _"table of contents"_ but optimized for use with Large Language Models (LLMs). It's ideal for: financial reports, regulatory filings, academic textbooks, legal or technical manuals, and any document that exceeds LLM context limits.

-Below is an example PageIndex tree structure. Also see more example [documents](https://github.com/VectifyAI/PageIndex/tree/main/tests/pdfs) and generated [tree structures](https://github.com/VectifyAI/PageIndex/tree/main/tests/results).
+Below is an example PageIndex tree structure. Also see more example [documents](https://github.com/VectifyAI/PageIndex/tree/main/examples/documents) and generated [tree structures](https://github.com/VectifyAI/PageIndex/tree/main/examples/documents/results).

 ```jsonc
 ...
--- a/examples/documents/2023-annual-report-truncated.pdf
+++ b/examples/documents/2023-annual-report-truncated.pdf
--- a/examples/documents/2023-annual-report.pdf
+++ b/examples/documents/2023-annual-report.pdf
--- a/examples/documents/PRML.pdf
+++ b/examples/documents/PRML.pdf
--- a/examples/documents/Regulation
+++ b/examples/documents/Regulation
--- a/examples/documents/Regulation
+++ b/examples/documents/Regulation
--- a/examples/documents/attention-residuals.pdf
+++ b/examples/documents/attention-residuals.pdf
--- a/examples/documents/earthmover.pdf
+++ b/examples/documents/earthmover.pdf
--- a/examples/documents/four-lectures.pdf
+++ b/examples/documents/four-lectures.pdf
--- a/examples/documents/q1-fy25-earnings.pdf
+++ b/examples/documents/q1-fy25-earnings.pdf
--- a/examples/documents/results/2023-annual-report-truncated_structure.json
+++ b/examples/documents/results/2023-annual-report-truncated_structure.json
--- a/examples/documents/results/2023-annual-report_structure.json
+++ b/examples/documents/results/2023-annual-report_structure.json
--- a/examples/documents/results/PRML_structure.json
+++ b/examples/documents/results/PRML_structure.json
--- a/examples/documents/results/Regulation
+++ b/examples/documents/results/Regulation
--- a/examples/documents/results/Regulation
+++ b/examples/documents/results/Regulation
--- a/examples/documents/results/earthmover_structure.json
+++ b/examples/documents/results/earthmover_structure.json
--- a/examples/documents/results/four-lectures_structure.json
+++ b/examples/documents/results/four-lectures_structure.json
--- a/examples/documents/results/q1-fy25-earnings_structure.json
+++ b/examples/documents/results/q1-fy25-earnings_structure.json
--- a/examples/openai_agents_demo.py
+++ b/examples/openai_agents_demo.py
@ -18,10 +18,10 @@ Steps:
 """
 import os
 import sys
+import json
 import asyncio
 import concurrent.futures
 import requests
-from pathlib import Path

 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

@ -32,9 +32,10 @@ from openai.types.responses import ResponseTextDeltaEvent, ResponseReasoningSumm
 from pageindex import PageIndexClient
 import pageindex.utils as utils

+_EXAMPLES_DIR = os.path.dirname(os.path.abspath(__file__))
 PDF_URL = "https://arxiv.org/pdf/2603.15031"
-PDF_PATH = "tests/pdfs/attention-residuals.pdf"
-WORKSPACE = "./pageindex_workspace"
+PDF_PATH = os.path.join(_EXAMPLES_DIR, "documents", "attention-residuals.pdf")
+WORKSPACE = os.path.join(_EXAMPLES_DIR, "workspace")

 AGENT_SYSTEM_PROMPT = """
 You are PageIndex, a document QA assistant.
@ -147,16 +148,16 @@ client = PageIndexClient(workspace=WORKSPACE)
 print("=" * 60)
 print("Step 1: Indexing PDF and inspecting tree structure")
 print("=" * 60)
-_id_cache = Path(WORKSPACE).expanduser() / "demo_doc_id.txt"
-if _id_cache.exists() and (doc_id := _id_cache.read_text().strip()) in client.documents:
+doc_id = next((did for did, doc in client.documents.items()
+                if doc.get('doc_name') == os.path.basename(PDF_PATH)), None)
+if doc_id:
    print(f"\nLoaded cached doc_id: {doc_id}")
 else:
    doc_id = client.index(PDF_PATH)
-    _id_cache.parent.mkdir(parents=True, exist_ok=True)
-    _id_cache.write_text(doc_id)
    print(f"\nIndexed. doc_id: {doc_id}")
 print("\nTree Structure (top-level sections):")
-utils.print_tree(client.documents[doc_id]["structure"])
+structure = json.loads(client.get_document_structure(doc_id))
+utils.print_tree(structure)

 # ── Step 2: Document Metadata ──────────────────────────────────────────────────
 print("\n" + "=" * 60)
--- a/examples/workspace/12345678-abcd-4321-abcd-123456789abc.json
+++ b/examples/workspace/12345678-abcd-4321-abcd-123456789abc.json
--- a/examples/workspace/_meta.json
+++ b/examples/workspace/_meta.json
@ -0,0 +1,9 @@
+{
+  "12345678-abcd-4321-abcd-123456789abc": {
+    "type": "pdf",
+    "doc_name": "attention-residuals.pdf",
+    "doc_description": "This document introduces \"Attention Residuals\" (AttnRes) and its scalable variant \"Block AttnRes,\" novel mechanisms for replacing fixed residual accumulation in neural networks with learned, input-dependent depth-wise attention, addressing limitations of standard residual connections while optimizing memory, computation, and scalability for large-scale training and inference.",
+    "page_count": 21,
+    "path": "../documents/attention-residuals.pdf"
+  }
+}
--- a/pageindex/client.py
+++ b/pageindex/client.py
@ -5,10 +5,15 @@ import asyncio
 import concurrent.futures
 from pathlib import Path

+import PyPDF2
+
 from .page_index import page_index
 from .page_index_md import md_to_tree
 from .retrieve import get_document, get_document_structure, get_page_content
-from .utils import ConfigLoader
+from .utils import ConfigLoader, remove_fields
+
+META_INDEX = "_meta.json"
+

 class PageIndexClient:
    """
@ -39,6 +44,9 @@ class PageIndexClient:

    def index(self, file_path: str, mode: str = "auto") -> str:
        """Index a document. Returns a document_id."""
+        # Persist a canonical absolute path so workspace reloads do not
+        # reinterpret caller-relative paths against the workspace directory.
+        file_path = os.path.abspath(os.path.expanduser(file_path))
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

@ -58,13 +66,22 @@ class PageIndexClient:
                if_add_node_id='yes',
                if_add_doc_description='yes'
            )
+            # Extract per-page text so queries don't need the original PDF
+            pages = []
+            with open(file_path, 'rb') as f:
+                pdf_reader = PyPDF2.PdfReader(f)
+                for i, page in enumerate(pdf_reader.pages, 1):
+                    pages.append({'page': i, 'content': page.extract_text() or ''})
+
            self.documents[doc_id] = {
                'id': doc_id,
-                'path': file_path,
                'type': 'pdf',
-                'structure': result['structure'],
+                'path': file_path,
                'doc_name': result.get('doc_name', ''),
-                'doc_description': result.get('doc_description', '')
+                'doc_description': result.get('doc_description', ''),
+                'page_count': len(pages),
+                'structure': result['structure'],
+                'pages': pages,
            }

        elif mode == "md" or (mode == "auto" and is_md):
@ -87,11 +104,12 @@ class PageIndexClient:
                result = asyncio.run(coro)
            self.documents[doc_id] = {
                'id': doc_id,
-                'path': file_path,
                'type': 'md',
-                'structure': result['structure'],
+                'path': file_path,
                'doc_name': result.get('doc_name', ''),
-                'doc_description': result.get('doc_description', '')
+                'doc_description': result.get('doc_description', ''),
+                'line_count': result.get('line_count', 0),
+                'structure': result['structure'],
            }
        else:
            raise ValueError(f"Unsupported file format for: {file_path}")
@ -101,23 +119,93 @@ class PageIndexClient:
            self._save_doc(doc_id)
        return doc_id

+    @staticmethod
+    def _make_meta_entry(doc: dict) -> dict:
+        """Build a lightweight meta entry from a document dict."""
+        entry = {
+            'type': doc.get('type', ''),
+            'doc_name': doc.get('doc_name', ''),
+            'doc_description': doc.get('doc_description', ''),
+            'path': doc.get('path', ''),
+        }
+        if doc.get('type') == 'pdf':
+            entry['page_count'] = doc.get('page_count')
+        elif doc.get('type') == 'md':
+            entry['line_count'] = doc.get('line_count')
+        return entry
+
+    @staticmethod
+    def _read_json(path) -> dict | None:
+        """Read a JSON file, returning None on any error."""
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                return json.load(f)
+        except (json.JSONDecodeError, OSError) as e:
+            print(f"Warning: corrupt {Path(path).name}: {e}")
+            return None
+
    def _save_doc(self, doc_id: str):
+        doc = self.documents[doc_id].copy()
+        # Strip text from structure nodes — redundant with pages (PDF only)
+        if doc.get('structure') and doc.get('type') == 'pdf':
+            doc['structure'] = remove_fields(doc['structure'], fields=['text'])
        path = self.workspace / f"{doc_id}.json"
        with open(path, "w", encoding="utf-8") as f:
-            json.dump(self.documents[doc_id], f, ensure_ascii=False, indent=2)
+            json.dump(doc, f, ensure_ascii=False, indent=2)
+        self._save_meta(doc_id, self._make_meta_entry(doc))
+        # Drop heavy fields; will lazy-load on demand
+        self.documents[doc_id].pop('structure', None)
+        self.documents[doc_id].pop('pages', None)
+
+    def _rebuild_meta(self) -> dict:
+        """Scan individual doc JSON files and return a meta dict."""
+        meta = {}
+        for path in self.workspace.glob("*.json"):
+            if path.name == META_INDEX:
+                continue
+            doc = self._read_json(path)
+            if doc and isinstance(doc, dict):
+                meta[path.stem] = self._make_meta_entry(doc)
+        return meta
+
+    def _read_meta(self) -> dict | None:
+        """Read and validate _meta.json, returning None on any corruption."""
+        meta = self._read_json(self.workspace / META_INDEX)
+        if meta is not None and not isinstance(meta, dict):
+            print(f"Warning: {META_INDEX} is not a JSON object, ignoring")
+            return None
+        return meta
+
+    def _save_meta(self, doc_id: str, entry: dict):
+        meta = self._read_meta() or self._rebuild_meta()
+        meta[doc_id] = entry
+        meta_path = self.workspace / META_INDEX
+        with open(meta_path, "w", encoding="utf-8") as f:
+            json.dump(meta, f, ensure_ascii=False, indent=2)

    def _load_workspace(self):
-        loaded = 0
-        for path in self.workspace.glob("*.json"):
-            try:
-                with open(path, "r", encoding="utf-8") as f:
-                    doc = json.load(f)
-                self.documents[path.stem] = doc
-                loaded += 1
-            except (json.JSONDecodeError, OSError) as e:
-                print(f"Warning: skipping corrupt workspace file {path.name}: {e}")
-        if loaded:
-            print(f"Loaded {loaded} document(s) from workspace.")
+        meta = self._read_meta()
+        if meta is None:
+            meta = self._rebuild_meta()
+            if meta:
+                print(f"Loaded {len(meta)} document(s) from workspace (legacy mode).")
+        for doc_id, entry in meta.items():
+            doc = dict(entry, id=doc_id)
+            if doc.get('path') and not os.path.isabs(doc['path']):
+                doc['path'] = str((self.workspace / doc['path']).resolve())
+            self.documents[doc_id] = doc
+
+    def _ensure_doc_loaded(self, doc_id: str):
+        """Load full document JSON on demand (structure, pages, etc.)."""
+        doc = self.documents.get(doc_id)
+        if not doc or doc.get('structure') is not None:
+            return
+        full = self._read_json(self.workspace / f"{doc_id}.json")
+        if not full:
+            return
+        doc['structure'] = full.get('structure', [])
+        if full.get('pages'):
+            doc['pages'] = full['pages']

    def get_document(self, doc_id: str) -> str:
        """Return document metadata JSON."""
@ -125,8 +213,12 @@ class PageIndexClient:

    def get_document_structure(self, doc_id: str) -> str:
        """Return document tree structure JSON (without text fields)."""
+        if self.workspace:
+            self._ensure_doc_loaded(doc_id)
        return get_document_structure(self.documents, doc_id)

    def get_page_content(self, doc_id: str, pages: str) -> str:
        """Return page content for the given pages string (e.g. '5-7', '3,8', '12')."""
+        if self.workspace:
+            self._ensure_doc_loaded(doc_id)
        return get_page_content(self.documents, doc_id, pages)
--- a/pageindex/page_index.py
+++ b/pageindex/page_index.py
@ -1095,11 +1095,13 @@ def page_index_main(doc, opt=None):
                # Create a clean structure without unnecessary fields for description generation
                clean_structure = create_clean_structure_for_description(structure)
                doc_description = generate_doc_description(clean_structure, model=opt.model)
+                structure = format_structure(structure, order=['title', 'node_id', 'start_index', 'end_index', 'summary', 'text', 'nodes'])
                return {
                    'doc_name': get_pdf_name(doc),
                    'doc_description': doc_description,
                    'structure': structure,
                }
+        structure = format_structure(structure, order=['title', 'node_id', 'start_index', 'end_index', 'summary', 'text', 'nodes'])
        return {
            'doc_name': get_pdf_name(doc),
            'structure': structure,
--- a/pageindex/page_index_md.py
+++ b/pageindex/page_index_md.py
@ -243,7 +243,8 @@ def clean_tree_for_output(tree_nodes):
 async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='no', if_add_node_text='no', if_add_node_id='yes'):
    with open(md_path, 'r', encoding='utf-8') as f:
        markdown_content = f.read()
-    
+    line_count = markdown_content.count('\n') + 1
+
    print(f"Extracting nodes from markdown...")
    node_list, markdown_lines = extract_nodes_from_markdown(markdown_content)

@ -265,14 +266,14 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
    
    if if_add_node_summary == 'yes':
        # Always include text for summary generation
-        tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
+        tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'])
        
        print(f"Generating summaries for each node...")
        tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model)
        
        if if_add_node_text == 'no':
            # Remove text after summary generation if not requested
-            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])
+            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes'])
        
        if if_add_doc_description == 'yes':
            print(f"Generating document description...")
@ -282,17 +283,19 @@ async def md_to_tree(md_path, if_thinning=False, min_token_threshold=None, if_ad
            return {
                'doc_name': os.path.splitext(os.path.basename(md_path))[0],
                'doc_description': doc_description,
+                'line_count': line_count,
                'structure': tree_structure,
            }
    else:
        # No summaries needed, format based on text preference
        if if_add_node_text == 'yes':
-            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
+            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'])
        else:
-            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])
+            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'nodes'])
    
    return {
        'doc_name': os.path.splitext(os.path.basename(md_path))[0],
+        'line_count': line_count,
        'structure': tree_structure,
    }

@ -303,7 +306,7 @@ if __name__ == "__main__":
    
    # MD_NAME = 'Detect-Order-Construct'
    MD_NAME = 'cognitive-load'
-    MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'tests/markdowns/', f'{MD_NAME}.md')
+    MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'examples/documents/', f'{MD_NAME}.md')


    MODEL="gpt-4.1"
--- a/pageindex/retrieve.py
+++ b/pageindex/retrieve.py
@ -25,25 +25,23 @@ def _parse_pages(pages: str) -> list[int]:


 def _count_pages(doc_info: dict) -> int:
-    """Return total page count for a document."""
-    if doc_info.get('type') == 'pdf':
-        return get_number_of_pages(doc_info['path'])
-    # For MD, find max line_num across all nodes
-    max_line = 0
-    def _traverse(nodes):
-        nonlocal max_line
-        for node in nodes:
-            ln = node.get('line_num', 0)
-            if ln and ln > max_line:
-                max_line = ln
-            if node.get('nodes'):
-                _traverse(node['nodes'])
-    _traverse(doc_info.get('structure', []))
-    return max_line
+    """Return total page count for a PDF document."""
+    if doc_info.get('page_count'):
+        return doc_info['page_count']
+    if doc_info.get('pages'):
+        return len(doc_info['pages'])
+    return get_number_of_pages(doc_info['path'])


 def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
-    """Extract text for specific PDF pages (1-indexed), opening the PDF once."""
+    """Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF."""
+    cached_pages = doc_info.get('pages')
+    if cached_pages:
+        page_map = {p['page']: p['content'] for p in cached_pages}
+        return [
+            {'page': p, 'content': page_map[p]}
+            for p in page_nums if p in page_map
+        ]
    path = doc_info['path']
    with open(path, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
@ -95,7 +93,7 @@ def get_document(documents: dict, doc_id: str) -> str:
    if doc_info.get('type') == 'pdf':
        result['page_count'] = _count_pages(doc_info)
    else:
-        result['line_count'] = _count_pages(doc_info)
+        result['line_count'] = doc_info.get('line_count', 0)
    return json.dumps(result)