From 1629ef4318c551aaea8af069425efe537ce7cbb5 Mon Sep 17 00:00:00 2001 From: Ray Date: Mon, 11 May 2026 16:20:45 +0800 Subject: [PATCH] Take pdf_parser out of ConfigLoader, use plain function arg --- pageindex/client.py | 6 ++---- pageindex/page_index.py | 14 +++++++------- pageindex/utils.py | 6 +----- run_pageindex.py | 3 +-- 4 files changed, 11 insertions(+), 18 deletions(-) diff --git a/pageindex/client.py b/pageindex/client.py index e04574c..cdd3d7c 100644 --- a/pageindex/client.py +++ b/pageindex/client.py @@ -31,7 +31,7 @@ class PageIndexClient: For agent-based QA, see examples/agentic_vectorless_rag_demo.py. """ def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, - workspace: str = None, pdf_parser: str = None): + workspace: str = None, pdf_parser: str = "PyPDF2"): if api_key: os.environ["OPENAI_API_KEY"] = api_key elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"): @@ -42,12 +42,10 @@ class PageIndexClient: overrides["model"] = model if retrieve_model: overrides["retrieve_model"] = retrieve_model - if pdf_parser: - overrides["pdf_parser"] = pdf_parser opt = ConfigLoader().load(overrides or None) self.model = opt.model self.retrieve_model = _normalize_retrieve_model(opt.retrieve_model or self.model) - self.pdf_parser = opt.pdf_parser + self.pdf_parser = pdf_parser if self.workspace: self.workspace.mkdir(parents=True, exist_ok=True) self.documents = {} diff --git a/pageindex/page_index.py b/pageindex/page_index.py index d80896f..ef9ac09 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -1063,18 +1063,18 @@ async def tree_parser(page_list, opt, doc=None, logger=None): return toc_tree -def page_index_main(doc, opt=None): +def page_index_main(doc, opt=None, pdf_parser="PyPDF2"): logger = JsonLogger(doc) - + is_valid_pdf = ( - (isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or + (isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or isinstance(doc, BytesIO) ) if not is_valid_pdf: raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.") print('Parsing PDF...') - page_list = get_page_tokens(doc, model=opt.model, pdf_parser=opt.pdf_parser) + page_list = get_page_tokens(doc, model=opt.model, pdf_parser=pdf_parser) logger.info({'total_page_number': len(page_list)}) logger.info({'total_token': sum([page[1] for page in page_list])}) @@ -1112,14 +1112,14 @@ def page_index_main(doc, opt=None): def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None, if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None, - pdf_parser=None): + pdf_parser="PyPDF2"): user_opt = { arg: value for arg, value in locals().items() - if arg != "doc" and value is not None + if arg not in ("doc", "pdf_parser") and value is not None } opt = ConfigLoader().load(user_opt) - return page_index_main(doc, opt) + return page_index_main(doc, opt, pdf_parser=pdf_parser) def validate_and_truncate_physical_indices(toc_with_page_number, page_list_length, start_index=1, logger=None): diff --git a/pageindex/utils.py b/pageindex/utils.py index a5adc54..5c73a62 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -685,14 +685,10 @@ def format_structure(structure, order=None): class ConfigLoader: - # Code-side defaults for non-tuning settings (kept out of config.yaml). - # yaml entries override these if present. - _CODE_DEFAULTS = {"pdf_parser": "PyPDF2"} - def __init__(self, default_path: str = None): if default_path is None: default_path = Path(__file__).parent / "config.yaml" - self._default_dict = {**self._CODE_DEFAULTS, **self._load_yaml(default_path)} + self._default_dict = self._load_yaml(default_path) @staticmethod def _load_yaml(path): diff --git a/run_pageindex.py b/run_pageindex.py index 76661d8..d4eaa51 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -63,12 +63,11 @@ if __name__ == "__main__": 'if_add_node_summary': args.if_add_node_summary, 'if_add_doc_description': args.if_add_doc_description, 'if_add_node_text': args.if_add_node_text, - 'pdf_parser': args.pdf_parser, } opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None}) # Process the PDF - toc_with_page_number = page_index_main(args.pdf_path, opt) + toc_with_page_number = page_index_main(args.pdf_path, opt, pdf_parser=args.pdf_parser or "PyPDF2") print('Parsing done, saving to file...') # Save results