From 4b4ae4d51d5a310069aac7b403cc3b944c9903c2 Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Thu, 28 Aug 2025 13:07:15 +0800
Subject: [PATCH] Change default model from gpt-4.1 to gpt-4o-2024-11-20

---
 README.md               | 2 +-
 pageindex/config.yaml   | 2 +-
 pageindex/page_index.py | 4 ++--
 pageindex/utils.py      | 2 +-
 run_pageindex.py        | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index a32d643..2b85725 100644
--- a/README.md
+++ b/README.md
@@ -123,7 +123,7 @@ python3 run_pageindex.py --pdf_path /path/to/your/document.pdf
 You can customize the processing with additional optional arguments:
 
 ```
---model                 OpenAI model to use (default: gpt-4.1)
+--model                 OpenAI model to use (default: gpt-4o-2024-11-20)
 --toc-check-pages       Pages to check for table of contents (default: 20)
 --max-pages-per-node    Max pages per node (default: 10)
 --max-tokens-per-node   Max tokens per node (default: 20000)
diff --git a/pageindex/config.yaml b/pageindex/config.yaml
index 7927090..2ec1618 100644
--- a/pageindex/config.yaml
+++ b/pageindex/config.yaml
@@ -1,4 +1,4 @@
-model: "gpt-4.1"
+model: "gpt-4o-2024-11-20"
 toc_check_page_num: 20
 max_page_num_each_node: 10
 max_token_num_each_node: 20000
diff --git a/pageindex/page_index.py b/pageindex/page_index.py
index edbcc18..882fb5d 100644
--- a/pageindex/page_index.py
+++ b/pageindex/page_index.py
@@ -496,7 +496,7 @@ def remove_first_physical_index_section(text):
     return text
 
 ### add verify completeness
-def generate_toc_continue(toc_content, part, model="gpt-4.1"):
+def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"):
     print('start generate_toc_continue')
     prompt = """
     You are an expert in extracting hierarchical tree structure.
@@ -729,7 +729,7 @@ def check_toc(page_list, opt=None):
 
 
 ################### fix incorrect toc #########################################################
-def single_toc_item_index_fixer(section_title, content, model="gpt-4.1"):
+def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20"):
     tob_extractor_prompt = """
     You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document.
 
diff --git a/pageindex/utils.py b/pageindex/utils.py
index d879296..dc7acd8 100644
--- a/pageindex/utils.py
+++ b/pageindex/utils.py
@@ -410,7 +410,7 @@ def add_preface_if_needed(data):
 
 
 
-def get_page_tokens(pdf_path, model="gpt-4.1", pdf_parser="PyPDF2"):
+def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
     enc = tiktoken.encoding_for_model(model)
     if pdf_parser == "PyPDF2":
         pdf_reader = PyPDF2.PdfReader(pdf_path)
diff --git a/run_pageindex.py b/run_pageindex.py
index 2522b23..318a483 100644
--- a/run_pageindex.py
+++ b/run_pageindex.py
@@ -10,7 +10,7 @@ if __name__ == "__main__":
     parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
     parser.add_argument('--md_path', type=str, help='Path to the Markdown file')
 
-    parser.add_argument('--model', type=str, default='gpt-4.1', help='Model to use')
+    parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')
 
     parser.add_argument('--toc-check-pages', type=int, default=20, 
                       help='Number of pages to check for table of contents (PDF only)')