diff --git a/pageindex/client.py b/pageindex/client.py index cdd3d7c..30a6bb0 100644 --- a/pageindex/client.py +++ b/pageindex/client.py @@ -8,7 +8,7 @@ from pathlib import Path from .page_index import page_index from .page_index_md import md_to_tree from .retrieve import get_document, get_document_structure, get_page_content -from .utils import ConfigLoader, read_pdf_pages, remove_fields +from .utils import ConfigLoader, DEFAULT_PDF_PARSER, read_pdf_pages, remove_fields META_INDEX = "_meta.json" @@ -31,7 +31,7 @@ class PageIndexClient: For agent-based QA, see examples/agentic_vectorless_rag_demo.py. """ def __init__(self, api_key: str = None, model: str = None, retrieve_model: str = None, - workspace: str = None, pdf_parser: str = "PyPDF2"): + workspace: str = None, pdf_parser: str = DEFAULT_PDF_PARSER): if api_key: os.environ["OPENAI_API_KEY"] = api_key elif not os.getenv("OPENAI_API_KEY") and os.getenv("CHATGPT_API_KEY"): diff --git a/pageindex/page_index.py b/pageindex/page_index.py index ef9ac09..201824c 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -1063,7 +1063,7 @@ async def tree_parser(page_list, opt, doc=None, logger=None): return toc_tree -def page_index_main(doc, opt=None, pdf_parser="PyPDF2"): +def page_index_main(doc, opt=None, pdf_parser=DEFAULT_PDF_PARSER): logger = JsonLogger(doc) is_valid_pdf = ( @@ -1112,7 +1112,7 @@ def page_index_main(doc, opt=None, pdf_parser="PyPDF2"): def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None, if_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, if_add_node_text=None, - pdf_parser="PyPDF2"): + pdf_parser=DEFAULT_PDF_PARSER): user_opt = { arg: value for arg, value in locals().items() diff --git a/pageindex/retrieve.py b/pageindex/retrieve.py index 9a10681..52bc2eb 100644 --- a/pageindex/retrieve.py +++ b/pageindex/retrieve.py @@ -1,9 +1,9 @@ import json try: - from .utils import get_number_of_pages, read_pdf_pages, remove_fields + from .utils import DEFAULT_PDF_PARSER, get_number_of_pages, read_pdf_pages, remove_fields except ImportError: - from utils import get_number_of_pages, read_pdf_pages, remove_fields + from utils import DEFAULT_PDF_PARSER, get_number_of_pages, read_pdf_pages, remove_fields # ── Helpers ────────────────────────────────────────────────────────────────── @@ -41,7 +41,7 @@ def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]: {'page': p, 'content': page_map[p]} for p in page_nums if p in page_map ] - parser = doc_info.get('pdf_parser') or 'PyPDF2' + parser = doc_info.get('pdf_parser') or DEFAULT_PDF_PARSER all_pages = read_pdf_pages(doc_info['path'], pdf_parser=parser) total = len(all_pages) valid_pages = [p for p in page_nums if 1 <= p <= total] diff --git a/pageindex/utils.py b/pageindex/utils.py index 5c73a62..e0ebb54 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -385,9 +385,10 @@ def add_preface_if_needed(data): SUPPORTED_PDF_PARSERS = ("PyPDF2", "pypdfium2", "PyMuPDF") +DEFAULT_PDF_PARSER = SUPPORTED_PDF_PARSERS[0] -def read_pdf_pages(doc, pdf_parser="PyPDF2"): +def read_pdf_pages(doc, pdf_parser=DEFAULT_PDF_PARSER): """Return a list of per-page text strings using the selected parser. `doc` may be a file path (str/Path) or a BytesIO. `pdf_parser` is one of @@ -438,7 +439,7 @@ def read_pdf_pages(doc, pdf_parser="PyPDF2"): ) -def get_page_tokens(pdf_path, model=None, pdf_parser="PyPDF2"): +def get_page_tokens(pdf_path, model=None, pdf_parser=DEFAULT_PDF_PARSER): pages = read_pdf_pages(pdf_path, pdf_parser=pdf_parser) return [(text, litellm.token_counter(model=model, text=text)) for text in pages] diff --git a/run_pageindex.py b/run_pageindex.py index d4eaa51..295f3ed 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -3,7 +3,7 @@ import os import json from pageindex import * from pageindex.page_index_md import md_to_tree -from pageindex.utils import ConfigLoader +from pageindex.utils import ConfigLoader, DEFAULT_PDF_PARSER if __name__ == "__main__": # Set up argument parser @@ -67,7 +67,7 @@ if __name__ == "__main__": opt = ConfigLoader().load({k: v for k, v in user_opt.items() if v is not None}) # Process the PDF - toc_with_page_number = page_index_main(args.pdf_path, opt, pdf_parser=args.pdf_parser or "PyPDF2") + toc_with_page_number = page_index_main(args.pdf_path, opt, pdf_parser=args.pdf_parser or DEFAULT_PDF_PARSER) print('Parsing done, saving to file...') # Save results