From 4dec4d66a98d249af7d324420ce568f8cecfb1be Mon Sep 17 00:00:00 2001 From: Ray Date: Mon, 11 May 2026 18:43:51 +0800 Subject: [PATCH] Replace pdf_parser plumbing with mutable DEFAULT_PDF_PARSER global --- pageindex/client.py | 16 +++++----------- pageindex/page_index.py | 11 +++++------ pageindex/retrieve.py | 12 ++++++------ pageindex/utils.py | 28 ++++++++++++++-------------- run_pageindex.py | 9 +++++++-- 5 files changed, 37 insertions(+), 39 deletions(-) diff --git a/pageindex/client.py b/pageindex/client.py index f74c825..1924ad1 100644 --- a/pageindex/client.py +++ b/pageindex/client.py @@ -8,7 +8,7 @@ from pathlib import Path from .page_index import page_index from .page_index_md import md_to_tree from .retrieve import get_document, get_document_structure, get_page_content -from .utils import ConfigLoader, DEFAULT_PDF_PARSER, read_pdf_pages, remove_fields +from .utils import ConfigLoader, read_pdf_pages, remove_fields META_INDEX = "_meta.json" @@ -51,8 +51,8 @@ class PageIndexClient: if self.workspace: self._load_workspace() - def index(self, file_path: str, mode: str = "auto", pdf_parser: str = DEFAULT_PDF_PARSER) -> str: - """Index a document. Returns a document_id. pdf_parser only affects PDF mode.""" + def index(self, file_path: str, mode: str = "auto") -> str: + """Index a document. Returns a document_id.""" # Persist a canonical absolute path so workspace reloads do not # reinterpret caller-relative paths against the workspace directory. file_path = os.path.abspath(os.path.expanduser(file_path)) @@ -74,10 +74,9 @@ class PageIndexClient: if_add_node_text='yes', if_add_node_id='yes', if_add_doc_description='yes', - pdf_parser=pdf_parser, ) # Extract per-page text so queries don't need the original PDF - page_texts = read_pdf_pages(file_path, pdf_parser=pdf_parser) + page_texts = read_pdf_pages(file_path) pages = [{'page': i, 'content': text} for i, text in enumerate(page_texts, 1)] self.documents[doc_id] = { @@ -225,12 +224,7 @@ class PageIndexClient: return get_document_structure(self.documents, doc_id) def get_page_content(self, doc_id: str, pages: str) -> str: - """Return page content for the given pages string (e.g. '5-7', '3,8', '12'). - - Cache hit returns originally-indexed text. The rare cache-miss path - re-reads with the default parser; callers needing parser-consistent - fallback can use the low-level retrieve.get_page_content directly. - """ + """Return page content for the given pages string (e.g. '5-7', '3,8', '12').""" if self.workspace: self._ensure_doc_loaded(doc_id) return get_page_content(self.documents, doc_id, pages) diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 201824c..735f1ed 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -1063,7 +1063,7 @@ async def tree_parser(page_list, opt, doc=None, logger=None): return toc_tree -def page_index_main(doc, opt=None, pdf_parser=DEFAULT_PDF_PARSER): +def page_index_main(doc, opt=None): logger = JsonLogger(doc) is_valid_pdf = ( @@ -1074,7 +1074,7 @@ def page_index_main(doc, opt=None, pdf_parser=DEFAULT_PDF_PARSER): raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.") print('Parsing PDF...') - page_list = get_page_tokens(doc, model=opt.model, pdf_parser=pdf_parser) + page_list = get_page_tokens(doc, model=opt.model) logger.info({'total_page_number': len(page_list)}) logger.info({'total_token': sum([page[1] for page in page_list])}) @@ -1111,15 +1111,14 @@ def page_index_main(doc, opt=None, pdf_parser=DEFAULT_PDF_PARSER): def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None, - if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None, - pdf_parser=DEFAULT_PDF_PARSER): + if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None): user_opt = { arg: value for arg, value in locals().items() - if arg not in ("doc", "pdf_parser") and value is not None + if arg != "doc" and value is not None } opt = ConfigLoader().load(user_opt) - return page_index_main(doc, opt, pdf_parser=pdf_parser) + return page_index_main(doc, opt) def validate_and_truncate_physical_indices(toc_with_page_number, page_list_length, start_index=1, logger=None): diff --git a/pageindex/retrieve.py b/pageindex/retrieve.py index dabd258..81c643e 100644 --- a/pageindex/retrieve.py +++ b/pageindex/retrieve.py @@ -1,9 +1,9 @@ import json try: - from .utils import DEFAULT_PDF_PARSER, get_number_of_pages, read_pdf_pages, remove_fields + from .utils import get_number_of_pages, read_pdf_pages, remove_fields except ImportError: - from utils import DEFAULT_PDF_PARSER, get_number_of_pages, read_pdf_pages, remove_fields + from utils import get_number_of_pages, read_pdf_pages, remove_fields # ── Helpers ────────────────────────────────────────────────────────────────── @@ -32,7 +32,7 @@ def _count_pages(doc_info: dict) -> int: return get_number_of_pages(doc_info['path']) -def _get_pdf_page_content(doc_info: dict, page_nums: list[int], pdf_parser: str = DEFAULT_PDF_PARSER) -> list[dict]: +def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]: """Extract text for specific PDF pages (1-indexed). Prefer cached pages, fallback to PDF.""" cached_pages = doc_info.get('pages') if cached_pages: @@ -41,7 +41,7 @@ def _get_pdf_page_content(doc_info: dict, page_nums: list[int], pdf_parser: str {'page': p, 'content': page_map[p]} for p in page_nums if p in page_map ] - all_pages = read_pdf_pages(doc_info['path'], pdf_parser=pdf_parser) + all_pages = read_pdf_pages(doc_info['path']) total = len(all_pages) valid_pages = [p for p in page_nums if 1 <= p <= total] return [ @@ -104,7 +104,7 @@ def get_document_structure(documents: dict, doc_id: str) -> str: return json.dumps(structure_no_text, ensure_ascii=False) -def get_page_content(documents: dict, doc_id: str, pages: str, pdf_parser: str = DEFAULT_PDF_PARSER) -> str: +def get_page_content(documents: dict, doc_id: str, pages: str) -> str: """ Retrieve page content for a document. @@ -125,7 +125,7 @@ def get_page_content(documents: dict, doc_id: str, pages: str, pdf_parser: str = try: if doc_info.get('type') == 'pdf': - content = _get_pdf_page_content(doc_info, page_nums, pdf_parser=pdf_parser) + content = _get_pdf_page_content(doc_info, page_nums) else: content = _get_md_page_content(doc_info, page_nums) except Exception as e: diff --git a/pageindex/utils.py b/pageindex/utils.py index e0ebb54..6563d26 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -385,26 +385,26 @@ def add_preface_if_needed(data): SUPPORTED_PDF_PARSERS = ("PyPDF2", "pypdfium2", "PyMuPDF") -DEFAULT_PDF_PARSER = SUPPORTED_PDF_PARSERS[0] + +# Module-level setting. Override by mutating this attribute or setting +# PAGEINDEX_PDF_PARSER in the environment before import. +DEFAULT_PDF_PARSER = os.getenv("PAGEINDEX_PDF_PARSER") or SUPPORTED_PDF_PARSERS[0] -def read_pdf_pages(doc, pdf_parser=DEFAULT_PDF_PARSER): - """Return a list of per-page text strings using the selected parser. +def read_pdf_pages(doc): + """Return a list of per-page text strings using the currently configured parser.""" + parser = DEFAULT_PDF_PARSER - `doc` may be a file path (str/Path) or a BytesIO. `pdf_parser` is one of - SUPPORTED_PDF_PARSERS. PyPDF2 is the default and only required dependency; - pypdfium2 is lazy-imported so users opt in by installing it separately. - """ - if pdf_parser == "PyPDF2": + if parser == "PyPDF2": reader = PyPDF2.PdfReader(doc) return [(p.extract_text() or "") for p in reader.pages] - if pdf_parser == "pypdfium2": + if parser == "pypdfium2": try: import pypdfium2 as pdfium except ImportError as e: raise ImportError( - "pdf_parser='pypdfium2' requires the optional dependency. " + "DEFAULT_PDF_PARSER='pypdfium2' requires the optional dependency. " "Install it with: pip install pypdfium2" ) from e source = doc.getvalue() if isinstance(doc, BytesIO) else str(doc) @@ -424,7 +424,7 @@ def read_pdf_pages(doc, pdf_parser=DEFAULT_PDF_PARSER): finally: pdf.close() - if pdf_parser == "PyMuPDF": + if parser == "PyMuPDF": if isinstance(doc, BytesIO): d = pymupdf.open(stream=doc, filetype="pdf") else: @@ -435,12 +435,12 @@ def read_pdf_pages(doc, pdf_parser=DEFAULT_PDF_PARSER): d.close() raise ValueError( - f"Unsupported pdf_parser={pdf_parser!r}. Choose from {SUPPORTED_PDF_PARSERS}." + f"Unsupported DEFAULT_PDF_PARSER={parser!r}. Choose from {SUPPORTED_PDF_PARSERS}." ) -def get_page_tokens(pdf_path, model=None, pdf_parser=DEFAULT_PDF_PARSER): - pages = read_pdf_pages(pdf_path, pdf_parser=pdf_parser) +def get_page_tokens(pdf_path, model=None): + pages = read_pdf_pages(pdf_path) return [(text, litellm.token_counter(model=model, text=text)) for text in pages] diff --git a/run_pageindex.py b/run_pageindex.py index 295f3ed..4103a49 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -2,8 +2,9 @@ import argparse import os import json from pageindex import * +import pageindex.utils as pageindex_utils from pageindex.page_index_md import md_to_tree -from pageindex.utils import ConfigLoader, DEFAULT_PDF_PARSER +from pageindex.utils import ConfigLoader if __name__ == "__main__": # Set up argument parser @@ -66,8 +67,12 @@ if __name__ == "__main__": } opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None}) + # CLI flag overrides the module-level default (and env var PAGEINDEX_PDF_PARSER). + if args.pdf_parser: + pageindex_utils.DEFAULT_PDF_PARSER = args.pdf_parser + # Process the PDF - toc_with_page_number = page_index_main(args.pdf_path, opt, pdf_parser=args.pdf_parser or DEFAULT_PDF_PARSER) + toc_with_page_number = page_index_main(args.pdf_path, opt) print('Parsing done, saving to file...') # Save results