mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-05-05 13:02:37 +02:00
122 lines
4 KiB
Python
122 lines
4 KiB
Python
# pageindex/index/pipeline.py
|
|
from __future__ import annotations
|
|
from ..parser.protocol import ContentNode, ParsedDocument
|
|
|
|
|
|
def detect_strategy(nodes: list[ContentNode]) -> str:
    """Pick the name of the indexing strategy for a list of content nodes.

    Returns ``"level_based"`` as soon as one node is found whose ``level``
    attribute is not ``None``. If no such node exists (including when
    ``nodes`` is empty) the result is ``"content_based"``.
    """
    for entry in nodes:
        # A single node carrying level information is enough to switch
        # to the level-based strategy.
        if entry.level is not None:
            return "level_based"
    return "content_based"
|
|
|
|
|
|
def build_tree_from_levels(nodes: list[ContentNode]) -> list[dict]:
    """Strategy 0: nest a flat node sequence into a tree using node levels.

    Adapted from pageindex/page_index_md.py:build_tree_from_nodes.

    Each input node becomes a dict with the keys ``"title"`` (empty string
    when the node's title is falsy), ``"text"`` (the node's ``content``),
    ``"line_num"`` (the node's ``index``) and ``"nodes"`` (its children).
    A node whose ``level`` is falsy (``None`` or ``0``) is treated as level 1.

    Returns the list of top-level dicts; deeper nodes are reachable through
    the ``"nodes"`` lists of their parents.
    """
    roots = []
    # Chain of currently "open" ancestors as (entry, level) pairs,
    # ordered from shallowest to deepest.
    ancestors = []

    for item in nodes:
        entry = {
            "title": item.title or "",
            "text": item.content,
            "line_num": item.index,
            "nodes": [],
        }
        depth = item.level or 1

        # Close every ancestor at the same or a deeper level: none of them
        # can be the parent of this node.
        while ancestors and ancestors[-1][1] >= depth:
            ancestors.pop()

        # With no open ancestor left the node is a root; otherwise it is a
        # child of the innermost remaining ancestor.
        siblings = ancestors[-1][0]["nodes"] if ancestors else roots
        siblings.append(entry)

        ancestors.append((entry, depth))

    return roots
|
|
|
|
|
|
def _run_async(coro):
|
|
"""Run an async coroutine, handling the case where an event loop is already running."""
|
|
import asyncio
|
|
import concurrent.futures
|
|
try:
|
|
asyncio.get_running_loop()
|
|
# Already inside an event loop -- run in a separate thread
|
|
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
|
|
return pool.submit(asyncio.run, coro).result()
|
|
except RuntimeError:
|
|
return asyncio.run(coro)
|
|
|
|
|
|
def build_index(parsed: ParsedDocument, model: str = None, opt=None) -> dict:
    """Main entry point: turn a ParsedDocument into a tree-structure dict.

    Picks a tree-building strategy from the parsed nodes, builds the tree,
    then applies the optional enhancement steps selected by ``opt``.

    Args:
        parsed: Parsed document; its ``nodes`` and ``doc_name`` are read.
        model: Model name used to create a default ``IndexConfig``. Only
            consulted when ``opt`` is ``None``.
        opt: Configuration object. When ``None`` an ``IndexConfig`` is
            created (with ``model`` if one was given). The flags
            ``if_add_node_id``, ``if_add_node_text``, ``if_add_node_summary``
            and ``if_add_doc_description`` and the ``model`` attribute are
            read from it.

    Returns:
        A dict with ``"doc_name"`` and ``"structure"``, plus
        ``"doc_description"`` when ``opt.if_add_doc_description`` is truthy.
    """
    from .utils import (
        write_node_id,
        add_node_text,
        remove_structure_text,
        generate_summaries_for_structure,
        generate_doc_description,
        create_clean_structure_for_description,
    )
    from ..config import IndexConfig

    if opt is None:
        if model:
            opt = IndexConfig(model=model)
        else:
            opt = IndexConfig()

    content_nodes = parsed.nodes
    level_based = detect_strategy(content_nodes) == "level_based"

    if level_based:
        # For level-based, text is already in the tree nodes.
        tree = build_tree_from_levels(content_nodes)
    else:
        # Strategies 1-3: convert the ContentNode list to page_list format
        # for the existing pipeline.
        pages = [(node.content, node.tokens) for node in content_nodes]
        tree = _run_async(_content_based_pipeline(pages, opt))

    # Unified enhancement
    if opt.if_add_node_id:
        write_node_id(tree)

    # Content-based trees need their text attached when it is either kept
    # in the output or required as input for the summaries.
    if not level_based and (opt.if_add_node_text or opt.if_add_node_summary):
        add_node_text(tree, pages)

    if opt.if_add_node_summary:
        _run_async(generate_summaries_for_structure(tree, model=opt.model))

    # Text is stripped again only for content-based trees when the caller
    # did not ask for it; level-based trees always keep their text.
    if not (opt.if_add_node_text or level_based):
        remove_structure_text(tree)

    output = {"doc_name": parsed.doc_name, "structure": tree}

    if opt.if_add_doc_description:
        output["doc_description"] = generate_doc_description(
            create_clean_structure_for_description(tree),
            model=opt.model,
        )

    return output
|
|
|
|
|
|
class _NullLogger:
|
|
"""Minimal logger that satisfies the tree_parser interface without writing files."""
|
|
def info(self, message, **kwargs): pass
|
|
def error(self, message, **kwargs): pass
|
|
def debug(self, message, **kwargs): pass
|
|
|
|
|
|
async def _content_based_pipeline(page_list, opt):
    """Strategies 1-3: delegate to ``tree_parser`` from the sibling ``page_index`` module.

    The page_list is already in the format expected by tree_parser:
        [(page_text, token_count), ...]

    ``tree_parser`` is awaited with ``doc=None`` and a ``_NullLogger`` as its
    logger, and whatever it returns is passed back unchanged.
    """
    from .page_index import tree_parser

    return await tree_parser(page_list, opt, doc=None, logger=_NullLogger())
|