Make PageIndexClient parser-agnostic, pdf_parser per index() call

This commit is contained in:
Ray 2026-05-11 16:46:13 +08:00
parent 108cb28518
commit 63e11ef152

View file

@ -31,7 +31,7 @@ class PageIndexClient:
For agent-based QA, see examples/agentic_vectorless_rag_demo.py.
"""
def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None,
workspace: str = None, pdf_parser: str = DEFAULT_PDF_PARSER):
workspace: str = None):
if api_key:
os.environ["OPENAI_API_KEY"] = api_key
elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"):
@ -45,15 +45,14 @@ class PageIndexClient:
opt = ConfigLoader().load(overrides or None)
self.model = opt.model
self.retrieve_model = _normalize_retrieve_model(opt.retrieve_model or self.model)
self.pdf_parser = pdf_parser
if self.workspace:
self.workspace.mkdir(parents=True, exist_ok=True)
self.documents = {}
if self.workspace:
self._load_workspace()
def index(self, file_path: str, mode: str = "auto") -> str:
"""Index a document. Returns a document_id."""
def index(self, file_path: str, mode: str = "auto", pdf_parser: str = DEFAULT_PDF_PARSER) -> str:
"""Index a document. Returns a document_id. pdf_parser only affects PDF mode."""
# Persist a canonical absolute path so workspace reloads do not
# reinterpret caller-relative paths against the workspace directory.
file_path = os.path.abspath(os.path.expanduser(file_path))
@ -75,10 +74,10 @@ class PageIndexClient:
if_add_node_text='yes',
if_add_node_id='yes',
if_add_doc_description='yes',
pdf_parser=self.pdf_parser,
pdf_parser=pdf_parser,
)
# Extract per-page text so queries don't need the original PDF
page_texts = read_pdf_pages(file_path, pdf_parser=self.pdf_parser)
page_texts = read_pdf_pages(file_path, pdf_parser=pdf_parser)
pages = [{'page': i, 'content': text} for i, text in enumerate(page_texts, 1)]
self.documents[doc_id] = {
@ -226,7 +225,12 @@ class PageIndexClient:
return get_document_structure(self.documents, doc_id)
def get_page_content(self, doc_id: str, pages: str) -> str:
"""Return page content for the given pages string (e.g. '5-7', '3,8', '12')."""
"""Return page content for the given pages string (e.g. '5-7', '3,8', '12').
Cache hit returns originally-indexed text. The rare cache-miss path
re-reads with the default parser; callers needing parser-consistent
fallback can use the low-level retrieve.get_page_content directly.
"""
if self.workspace:
self._ensure_doc_loaded(doc_id)
return get_page_content(self.documents, doc_id, pages, pdf_parser=self.pdf_parser)
return get_page_content(self.documents, doc_id, pages)