Merge pull request #3 from rejojer/feat/fix

Feat/fix
2026-05-16 18:25:14 +02:00 · 2025-04-06 19:32:13 +08:00 · 2025-04-06 19:32:13 +08:00 · 6d06d0786e
commit 6d06d0786e
parent 911233b2ad 95dbc87158
4 changed files with 72 additions and 17 deletions
--- a/init.py
+++ b/init.py
@ -0,0 +1 @@
+from .page_index import *
--- a/config.yaml
+++ b/config.yaml
@ -0,0 +1,7 @@
+model: gpt-4o-2024-11-20
+toc_check_page_num: 20
+max_page_num_each_node: 10
+max_token_num_each_node: 20000
+if_add_node_id: "yes"          # quoted: bare yes/no are loaded as booleans by PyYAML
+if_add_node_summary: "no"
+if_add_doc_description: "yes"  # page_index.py compares this value against the string 'yes'
--- a/page_index.py
+++ b/page_index.py
@ -2,13 +2,10 @@ import os
 import json
 import copy
 import math
-import sys
 import random
-sys.path.append('../..')
 import re
 from utils import *
 import os
-from types import SimpleNamespace as config
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import argparse

@ -1033,16 +1030,26 @@ def page_index_main(doc, opt=None):
        if opt.if_add_doc_description == 'yes':
            doc_description = generate_doc_description(structure, model=opt.model)
            return {
-                'doc_name': os.path.basename(doc),
+                'doc_name': get_pdf_name(doc),
                'doc_description': doc_description,
                'structure': structure,
            }
    return {
-        'doc_name': os.path.basename(doc),
+        'doc_name': get_pdf_name(doc),
        'structure': structure,
    }


+def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None,
+               f_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None):
+    
+    user_opt = {
+        arg: value for arg, value in locals().items()
+        if arg != "doc" and value is not None
+    }
+    opt = ConfigLoader().load(user_opt)
+    return page_index_main(doc, opt)
+

 if __name__ == "__main__":
    # Set up argument parser
--- a/utils.py
+++ b/utils.py
@ -13,6 +13,9 @@ from io import BytesIO
 from dotenv import load_dotenv
 load_dotenv()
 import logging
+import yaml
+from pathlib import Path
+from types import SimpleNamespace as config

 CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")

@ -284,24 +287,27 @@ def get_last_start_page_from_text(text):
    return start_page


-
-
 def sanitize_filename(filename, replacement='-'):
    """Return ``filename`` with the characters Linux forbids in file names replaced.

    Both '/' and the null character are swapped for ``replacement``. A Python
    string can contain '\\0', and such a name is rejected later when the file
    is opened, so it is handled here together with '/'.
    """
    cleaned = filename.replace('/', replacement)
    return cleaned.replace('\0', replacement)

+def get_pdf_name(pdf_path):
+    # Extract PDF name
+    if isinstance(pdf_path, str):
+        pdf_name = os.path.basename(pdf_path)
+    elif isinstance(pdf_path, BytesIO):
+        pdf_reader = PyPDF2.PdfReader(pdf_path)
+        meta = pdf_reader.metadata
+        pdf_name = meta.title if meta.title else 'Untitled'
+        pdf_name = sanitize_filename(pdf_name)
+    return pdf_name
+
+
 class JsonLogger:
    def __init__(self, file_path):
-        # Extract PDF name without extension for logger name and filename
-        # pdf_name = os.path.splitext(os.path.basename(file_path))[0]
-        if isinstance(file_path, str):
-            pdf_name = os.path.splitext(os.path.basename(file_path))[0]
-        elif isinstance(file_path, BytesIO):
-            pdf_reader = PyPDF2.PdfReader(file_path)
-            meta = pdf_reader.metadata
-            pdf_name = meta.title if meta.title else 'Untitled'
-            pdf_name = sanitize_filename(pdf_name)
+        # Extract PDF name for logger name
+        pdf_name = get_pdf_name(file_path)
            
        current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.filename = f"{pdf_name}_{current_time}.json"
@ -583,4 +589,38 @@ def generate_doc_description(structure, model=None):
    Directly return the description, do not include any other text.
    """
    response = ChatGPT_API(model, prompt)
-    return response
+    return response
+
+
+class ConfigLoader:
+    def __init__(self, default_path: str = None):
+        if default_path is None:
+            default_path = Path(__file__).parent / "config.yaml"
+        self._default_dict = self._load_yaml(default_path)
+
+    @staticmethod
+    def _load_yaml(path):
+        with open(path, "r", encoding="utf-8") as f:
+            return yaml.safe_load(f) or {}
+
+    def _validate_keys(self, user_dict):
+        unknown_keys = set(user_dict) - set(self._default_dict)
+        if unknown_keys:
+            raise ValueError(f"Unknown config keys: {unknown_keys}")
+
+    def load(self, user_opt=None) -> config:
+        """
+        Load the configuration, merging user options with default values.
+        """
+        if user_opt is None:
+            user_dict = {}
+        elif isinstance(user_opt, config):
+            user_dict = vars(user_opt)
+        elif isinstance(user_opt, dict):
+            user_dict = user_opt
+        else:
+            raise TypeError("user_opt must be dict, config(SimpleNamespace) or None")
+
+        self._validate_keys(user_dict)
+        merged = {**self._default_dict, **user_dict}
+        return config(**merged)