From 4b4ae4d51d5a310069aac7b403cc3b944c9903c2 Mon Sep 17 00:00:00 2001 From: Ray Date: Thu, 28 Aug 2025 13:07:15 +0800 Subject: [PATCH] fix model --- README.md | 2 +- pageindex/config.yaml | 2 +- pageindex/page_index.py | 4 ++-- pageindex/utils.py | 2 +- run_pageindex.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index a32d643..2b85725 100644 --- a/README.md +++ b/README.md @@ -123,7 +123,7 @@ python3 run_pageindex.py --pdf_path /path/to/your/document.pdf You can customize the processing with additional optional arguments: ``` ---model OpenAI model to use (default: gpt-4.1) +--model OpenAI model to use (default: gpt-4o-2024-11-20) --toc-check-pages Pages to check for table of contents (default: 20) --max-pages-per-node Max pages per node (default: 10) --max-tokens-per-node Max tokens per node (default: 20000) diff --git a/pageindex/config.yaml b/pageindex/config.yaml index 7927090..2ec1618 100644 --- a/pageindex/config.yaml +++ b/pageindex/config.yaml @@ -1,4 +1,4 @@ -model: "gpt-4.1" +model: "gpt-4o-2024-11-20" toc_check_page_num: 20 max_page_num_each_node: 10 max_token_num_each_node: 20000 diff --git a/pageindex/page_index.py b/pageindex/page_index.py index edbcc18..882fb5d 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -496,7 +496,7 @@ def remove_first_physical_index_section(text): return text ### add verify completeness -def generate_toc_continue(toc_content, part, model="gpt-4.1"): +def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"): print('start generate_toc_continue') prompt = """ You are an expert in extracting hierarchical tree structure. @@ -729,7 +729,7 @@ def check_toc(page_list, opt=None): ################### fix incorrect toc ######################################################### -def single_toc_item_index_fixer(section_title, content, model="gpt-4.1"): +def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20"): tob_extractor_prompt = """ You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document. diff --git a/pageindex/utils.py b/pageindex/utils.py index d879296..dc7acd8 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -410,7 +410,7 @@ def add_preface_if_needed(data): -def get_page_tokens(pdf_path, model="gpt-4.1", pdf_parser="PyPDF2"): +def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"): enc = tiktoken.encoding_for_model(model) if pdf_parser == "PyPDF2": pdf_reader = PyPDF2.PdfReader(pdf_path) diff --git a/run_pageindex.py b/run_pageindex.py index 2522b23..318a483 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -10,7 +10,7 @@ if __name__ == "__main__": parser.add_argument('--pdf_path', type=str, help='Path to the PDF file') parser.add_argument('--md_path', type=str, help='Path to the Markdown file') - parser.add_argument('--model', type=str, default='gpt-4.1', help='Model to use') + parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use') parser.add_argument('--toc-check-pages', type=int, default=20, help='Number of pages to check for table of contents (PDF only)')