diff --git a/pageindex/config.yaml b/pageindex/config.yaml index e7c13e5..591fe93 100644 --- a/pageindex/config.yaml +++ b/pageindex/config.yaml @@ -7,5 +7,4 @@ max_token_num_each_node: 20000 if_add_node_id: "yes" if_add_node_summary: "yes" if_add_doc_description: "no" -if_add_node_text: "no" -pdf_parser: "PyPDF2" # text extractor: "PyPDF2" (default, no extra install), "pypdfium2" (pip install pypdfium2), or "PyMuPDF" \ No newline at end of file +if_add_node_text: "no" \ No newline at end of file diff --git a/pageindex/utils.py b/pageindex/utils.py index 5c73a62..a5adc54 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -685,10 +685,14 @@ def format_structure(structure, order=None): class ConfigLoader: + # Code-side defaults for non-tuning settings (kept out of config.yaml). + # yaml entries override these if present. + _CODE_DEFAULTS = {"pdf_parser": "PyPDF2"} + def __init__(self, default_path: str = None): if default_path is None: default_path = Path(__file__).parent / "config.yaml" - self._default_dict = self._load_yaml(default_path) + self._default_dict = {**self._CODE_DEFAULTS, **self._load_yaml(default_path)} @staticmethod def _load_yaml(path):