From 63e11ef152102101f22a6c9d8f6ff074004ea942 Mon Sep 17 00:00:00 2001 From: Ray Date: Mon, 11 May 2026 16:46:13 +0800 Subject: [PATCH] Make PageIndexClient parser-agnostic, pdf_parser per index() call --- pageindex/client.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/pageindex/client.py b/pageindex/client.py index 1d36409..f74c825 100644 --- a/pageindex/client.py +++ b/pageindex/client.py @@ -31,7 +31,7 @@ class PageIndexClient: For agent-based QA, see examples/agentic_vectorless_rag_demo.py. """ def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, - workspace: str = None, pdf_parser: str = DEFAULT_PDF_PARSER): + workspace: str = None): if api_key: os.environ["OPENAI_API_KEY"] = api_key elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"): @@ -45,15 +45,14 @@ class PageIndexClient: opt = ConfigLoader().load(overrides or None) self.model = opt.model self.retrieve_model = _normalize_retrieve_model(opt.retrieve_model or self.model) - self.pdf_parser = pdf_parser if self.workspace: self.workspace.mkdir(parents=True, exist_ok=True) self.documents = {} if self.workspace: self._load_workspace() - def index(self, file_path: str, mode: str = "auto") -> str: - """Index a document. Returns a document_id.""" + def index(self, file_path: str, mode: str = "auto", pdf_parser: str = DEFAULT_PDF_PARSER) -> str: + """Index a document. Returns a document_id. pdf_parser only affects PDF mode.""" # Persist a canonical absolute path so workspace reloads do not # reinterpret caller-relative paths against the workspace directory. file_path = os.path.abspath(os.path.expanduser(file_path)) @@ -75,10 +74,10 @@ class PageIndexClient: if_add_node_text='yes', if_add_node_id='yes', if_add_doc_description='yes', - pdf_parser=self.pdf_parser, + pdf_parser=pdf_parser, ) # Extract per-page text so queries don't need the original PDF - page_texts = read_pdf_pages(file_path, pdf_parser=self.pdf_parser) + page_texts = read_pdf_pages(file_path, pdf_parser=pdf_parser) pages = [{'page': i, 'content': text} for i, text in enumerate(page_texts, 1)] self.documents[doc_id] = { @@ -226,7 +225,12 @@ class PageIndexClient: return get_document_structure(self.documents, doc_id) def get_page_content(self, doc_id: str, pages: str) -> str: - """Return page content for the given pages string (e.g. '5-7', '3,8', '12').""" + """Return page content for the given pages string (e.g. '5-7', '3,8', '12'). + + Cache hit returns originally-indexed text. The rare cache-miss path + re-reads with the default parser; callers needing parser-consistent + fallback can use the low-level retrieve.get_page_content directly. + """ if self.workspace: self._ensure_doc_loaded(doc_id) - return get_page_content(self.documents, doc_id, pages, pdf_parser=self.pdf_parser) + return get_page_content(self.documents, doc_id, pages)