fix config and logger

2026-07-03 20:41:02 +02:00 · 2025-04-06 14:49:12 +08:00 · 2025-04-06 14:49:12 +08:00 · e2cf8bb271
commit e2cf8bb271
parent 9daa4101d1
3 changed files with 53 additions and 18 deletions
--- a/init.py
+++ b/init.py
@ -0,0 +1 @@
 from .page_index import *
--- a/page_index.py
+++ b/page_index.py
@ -2,13 +2,10 @@ import os
 import json
 import copy
 import math
 import sys
 import random
 sys.path.append('../..')
 import re
-from utils import *
+from .utils import *
 import os
 from types import SimpleNamespace as config
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import argparse
@ -1015,6 +1012,8 @@ def tree_parser(page_list, opt, logger=None):
 def page_index_main(doc, opt=None):
    opt = merge_config(opt, get_default_opt())
    logger = JsonLogger(doc)
    is_valid_pdf = (
@ -1039,12 +1038,12 @@ def page_index_main(doc, opt=None):
        if opt.if_add_doc_description == 'yes':
            doc_description = generate_doc_description(structure, model=opt.model)
            return {
-                'doc_name': os.path.basename(doc),
+                'doc_name': get_pdf_name(doc),
                'doc_description': doc_description,
                'structure': structure,
            }
    return {
-        'doc_name': os.path.basename(doc),
+        'doc_name': get_pdf_name(doc),
        'structure': structure,
    }
--- a/utils.py
+++ b/utils.py
@ -13,6 +13,7 @@ from io import BytesIO
 from dotenv import load_dotenv
 load_dotenv()
 import logging
 from types import SimpleNamespace as config
 CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
@ -284,24 +285,27 @@ def get_last_start_page_from_text(text):
    return start_page
 def sanitize_filename(filename, replacement='-'):
    # In Linux, only '/' and '\0' (null) are invalid in filenames.
    # Null can't be represented in strings, so we only handle '/'.
    return filename.replace('/', replacement)
 def get_pdf_name(pdf_path):
    # Extract PDF name
    if isinstance(pdf_path, str):
        pdf_name = os.path.basename(pdf_path)
    elif isinstance(pdf_path, BytesIO):
        pdf_reader = PyPDF2.PdfReader(pdf_path)
        meta = pdf_reader.metadata
        pdf_name = meta.title if meta.title else 'Untitled'
        pdf_name = sanitize_filename(pdf_name)
    return pdf_name
 class JsonLogger:
    def __init__(self, file_path):
-        # Extract PDF name without extension for logger name and filename
+        # Extract PDF name for logger name
-        # pdf_name = os.path.splitext(os.path.basename(file_path))[0]
+        pdf_name = get_pdf_name(file_path)
        if isinstance(file_path, str):
            pdf_name = os.path.splitext(os.path.basename(file_path))[0]
        elif isinstance(file_path, BytesIO):
            pdf_reader = PyPDF2.PdfReader(file_path)
            meta = pdf_reader.metadata
            pdf_name = meta.title if meta.title else 'Untitled'
            pdf_name = sanitize_filename(pdf_name)
        current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.filename = f"{pdf_name}_{current_time}.json"
@ -582,4 +586,35 @@ def generate_doc_description(structure, model=None):
    Directly return the description, do not include any other text.
    """
    response = ChatGPT_API(model, prompt)
-    return response
+    return response
 def get_default_opt():
    return {
        'model': 'gpt-4o-2024-11-20',
        'toc_check_page_num': 20,
        'max_page_num_each_node': 10,
        'max_token_num_each_node': 20000,
        'if_add_node_id': 'yes',
        'if_add_node_summary': 'no',
        'if_add_doc_description': 'yes'
    }
 def validate_config_keys(user_opt_dict, default_keys):
    unknown_keys = set(user_opt_dict) - set(default_keys)
    if unknown_keys:
        raise ValueError(f"Unknown config keys: {unknown_keys}")
 def merge_config(user_opt, default_opt):
    if isinstance(user_opt, config):
        user_opt = vars(user_opt)
    elif user_opt is None:
        user_opt = {}
    elif not isinstance(user_opt, dict):
        raise TypeError("opt must be dict, SimpleNamespace or None")
    validate_config_keys(user_opt, default_opt)
    merged = {**default_opt, **user_opt}
    return config(**merged)