From 6d1b505541b4e2af604b91784bab47f26529484f Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Thu, 28 Aug 2025 12:45:39 +0800
Subject: [PATCH] fix params: default model gpt-4.1, separate --pdf_path/--md_path, align markdown options with PDF

---
 README.md                  |  6 +--
 pageindex/config.yaml      |  4 +-
 pageindex/page_index.py    |  8 ++--
 pageindex/page_index_md.py | 43 +++++++++++++++++---
 pageindex/utils.py         | 25 +++++++++++-
 run_pageindex.py           | 82 +++++++++++++++++++++++++-------------
 6 files changed, 126 insertions(+), 42 deletions(-)

diff --git a/README.md b/README.md
index 54b840b..a32d643 100644
--- a/README.md
+++ b/README.md
@@ -103,7 +103,7 @@ You can follow these steps to generate a PageIndex tree from a PDF document.
 ### 1. Install dependencies
 
 ```bash
-pip3 install -r requirements.txt
+pip3 install --upgrade -r requirements.txt
 ```
 
 ### 2. Set your OpenAI API key
@@ -123,13 +123,13 @@ python3 run_pageindex.py --pdf_path /path/to/your/document.pdf
 You can customize the processing with additional optional arguments:
 
 ```
---model                 OpenAI model to use (default: gpt-4o-2024-11-20)
+--model                 OpenAI model to use (default: gpt-4.1)
 --toc-check-pages       Pages to check for table of contents (default: 20)
 --max-pages-per-node    Max pages per node (default: 10)
 --max-tokens-per-node   Max tokens per node (default: 20000)
 --if-add-node-id        Add node ID (yes/no, default: yes)
 --if-add-node-summary   Add node summary (yes/no, default: no)
---if-add-doc-description Add doc description (yes/no, default: yes)
+--if-add-doc-description Add doc description (yes/no, default: no)
 ```
 
 
diff --git a/pageindex/config.yaml b/pageindex/config.yaml
index 382a3e9..7927090 100644
--- a/pageindex/config.yaml
+++ b/pageindex/config.yaml
@@ -1,8 +1,8 @@
-model: "gpt-4o-2024-11-20"
+model: "gpt-4.1"
 toc_check_page_num: 20
 max_page_num_each_node: 10
 max_token_num_each_node: 20000
 if_add_node_id: "yes"
 if_add_node_summary: "no"
-if_add_doc_description: "yes"
+if_add_doc_description: "no"
 if_add_node_text: "no"
\ No newline at end of file
diff --git a/pageindex/page_index.py b/pageindex/page_index.py
index 0f225f2..edbcc18 100644
--- a/pageindex/page_index.py
+++ b/pageindex/page_index.py
@@ -496,7 +496,7 @@ def remove_first_physical_index_section(text):
     return text
 
 ### add verify completeness
-def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"):
+def generate_toc_continue(toc_content, part, model="gpt-4.1"):
     print('start generate_toc_continue')
     prompt = """
     You are an expert in extracting hierarchical tree structure.
@@ -729,7 +729,7 @@ def check_toc(page_list, opt=None):
 
 
 ################### fix incorrect toc #########################################################
-def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20"):
+def single_toc_item_index_fixer(section_title, content, model="gpt-4.1"):
     tob_extractor_prompt = """
     You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document.
 
@@ -1084,7 +1084,9 @@ def page_index_main(doc, opt=None):
             if opt.if_add_node_text == 'no':
                 remove_structure_text(structure)
             if opt.if_add_doc_description == 'yes':
-                doc_description = generate_doc_description(structure, model=opt.model)
+                # Create a clean structure without unnecessary fields for description generation
+                clean_structure = create_clean_structure_for_description(structure)
+                doc_description = generate_doc_description(clean_structure, model=opt.model)
                 return {
                     'doc_name': get_pdf_name(doc),
                     'doc_description': doc_description,
diff --git a/pageindex/page_index_md.py b/pageindex/page_index_md.py
index d353d11..cfcec71 100644
--- a/pageindex/page_index_md.py
+++ b/pageindex/page_index_md.py
@@ -1,6 +1,7 @@
 import asyncio
 import json
 import re
+import os
 try:
     from .utils import *
 except:
@@ -239,7 +240,7 @@ def clean_tree_for_output(tree_nodes):
     return cleaned_nodes
 
 
-async def md_to_tree(md_path, if_thinning=True, min_token_threshold=None, if_summary=True, summary_token_threshold=None, model=None):
+async def md_to_tree(md_path, if_thinning=True, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='yes', if_add_node_text='no', if_add_node_id='yes'):
     with open(md_path, 'r', encoding='utf-8') as f:
         markdown_content = f.read()
     
@@ -257,14 +258,44 @@ async def md_to_tree(md_path, if_thinning=True, min_token_threshold=None, if_sum
     print(f"Building tree from nodes...")
     tree_structure = build_tree_from_nodes(nodes_with_content)
 
-    if if_summary:
-        print(f"Generating summaries for each node...")
-        tree_structure = await generate_summaries_for_structure_md(tree_structure,summary_token_threshold=summary_token_threshold, model=model)
+    # Add node IDs if requested (matching PDF behavior)
+    if if_add_node_id == 'yes':
+        write_node_id(tree_structure)
 
     print(f"Formatting tree structure...")
-    tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
     
-    return tree_structure
+    if if_add_node_summary == 'yes':
+        # Always include text for summary generation
+        tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
+        
+        print(f"Generating summaries for each node...")
+        tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model)
+        
+        if if_add_node_text == 'no':
+            # Remove text after summary generation if not requested
+            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])
+        
+        if if_add_doc_description == 'yes':
+            print(f"Generating document description...")
+            # Create a clean structure without unnecessary fields for description generation
+            clean_structure = create_clean_structure_for_description(tree_structure)
+            doc_description = generate_doc_description(clean_structure, model=model)
+            return {
+                'doc_name': os.path.splitext(os.path.basename(md_path))[0],
+                'doc_description': doc_description,
+                'structure': tree_structure,
+            }
+    else:
+        # No summaries needed, format based on text preference
+        if if_add_node_text == 'yes':
+            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
+        else:
+            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])
+    
+    return {
+        'doc_name': os.path.splitext(os.path.basename(md_path))[0],
+        'structure': tree_structure,
+    }
 
 
 if __name__ == "__main__":
diff --git a/pageindex/utils.py b/pageindex/utils.py
index db29ce8..d879296 100644
--- a/pageindex/utils.py
+++ b/pageindex/utils.py
@@ -410,7 +410,7 @@ def add_preface_if_needed(data):
 
 
 
-def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
+def get_page_tokens(pdf_path, model="gpt-4.1", pdf_parser="PyPDF2"):
     enc = tiktoken.encoding_for_model(model)
     if pdf_parser == "PyPDF2":
         pdf_reader = PyPDF2.PdfReader(pdf_path)
@@ -623,6 +623,29 @@ async def generate_summaries_for_structure(structure, model=None):
     return structure
 
 
+def create_clean_structure_for_description(structure):
+    """
+    Return a pruned copy of a document structure (dict or list) for description generation.
+    Dicts keep only 'title', 'node_id', 'summary', 'prefix_summary' and non-empty 'nodes'; other values are returned as-is.
+    """
+    if isinstance(structure, dict):
+        clean_node = {}
+        # Whitelist of keys copied over; any other key (e.g. 'text') is dropped
+        for key in ['title', 'node_id', 'summary', 'prefix_summary']:
+            if key in structure:
+                clean_node[key] = structure[key]
+        
+        # Recurse into children; a missing or empty 'nodes' value is omitted from the copy
+        if 'nodes' in structure and structure['nodes']:
+            clean_node['nodes'] = create_clean_structure_for_description(structure['nodes'])
+        
+        return clean_node
+    elif isinstance(structure, list):
+        return [create_clean_structure_for_description(item) for item in structure]
+    else:
+        return structure
+
+
 def generate_doc_description(structure, model=None):
     prompt = f"""Your are an expert in generating descriptions for a document.
     You are given a structure of a document. Your task is to generate a one-sentence description for the document, which makes it easy to distinguish the document from other documents.
diff --git a/run_pageindex.py b/run_pageindex.py
index 00d606d..2522b23 100644
--- a/run_pageindex.py
+++ b/run_pageindex.py
@@ -7,24 +7,27 @@ from pageindex.page_index_md import md_to_tree
 if __name__ == "__main__":
     # Set up argument parser
     parser = argparse.ArgumentParser(description='Process PDF or Markdown document and generate structure')
-    parser.add_argument('--file_path', type=str, help='Path to the PDF or Markdown file')
-    parser.add_argument('--file_type', type=str, choices=['pdf', 'markdown', 'md'], default='pdf',
-                      help='Type of file to process (pdf, markdown, or md)')
-    parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')
+    parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
+    parser.add_argument('--md_path', type=str, help='Path to the Markdown file')
+
+    parser.add_argument('--model', type=str, default='gpt-4.1', help='Model to use')
+
     parser.add_argument('--toc-check-pages', type=int, default=20, 
                       help='Number of pages to check for table of contents (PDF only)')
     parser.add_argument('--max-pages-per-node', type=int, default=10,
                       help='Maximum number of pages per node (PDF only)')
     parser.add_argument('--max-tokens-per-node', type=int, default=20000,
                       help='Maximum number of tokens per node (PDF only)')
+
     parser.add_argument('--if-add-node-id', type=str, default='yes',
                       help='Whether to add node id to the node')
     parser.add_argument('--if-add-node-summary', type=str, default='no',
                       help='Whether to add summary to the node')
-    parser.add_argument('--if-add-doc-description', type=str, default='yes',
+    parser.add_argument('--if-add-doc-description', type=str, default='no',
                       help='Whether to add doc description to the doc')
     parser.add_argument('--if-add-node-text', type=str, default='no',
                       help='Whether to add text to the node')
+                      
     # Markdown specific arguments
     parser.add_argument('--if-thinning', type=str, default='no',
                       help='Whether to apply tree thinning for markdown (markdown only)')
@@ -34,14 +37,19 @@ if __name__ == "__main__":
                       help='Token threshold for generating summaries (markdown only)')
     args = parser.parse_args()
     
-    # Determine file type from extension if not specified
-    if args.file_type == 'pdf' and args.file_path:
-        if args.file_path.lower().endswith(('.md', '.markdown')):
-            args.file_type = 'markdown'
-        elif not args.file_path.lower().endswith('.pdf'):
-            raise ValueError("File must be a PDF or Markdown file")
+    # Validate that exactly one of --pdf_path / --md_path is specified
+    if not args.pdf_path and not args.md_path:
+        raise ValueError("Either --pdf_path or --md_path must be specified")
+    if args.pdf_path and args.md_path:
+        raise ValueError("Only one of --pdf_path or --md_path can be specified")
     
-    if args.file_type == 'pdf':
+    if args.pdf_path:
+        # Validate PDF file
+        if not args.pdf_path.lower().endswith('.pdf'):
+            raise ValueError("PDF file must have .pdf extension")
+        if not os.path.isfile(args.pdf_path):
+            raise ValueError(f"PDF file not found: {args.pdf_path}")
+            
         # Process PDF file
         # Configure options
         opt = config(
@@ -56,42 +64,62 @@ if __name__ == "__main__":
         )
 
         # Process the PDF
-        toc_with_page_number = page_index_main(args.file_path, opt)
+        toc_with_page_number = page_index_main(args.pdf_path, opt)
         print('Parsing done, saving to file...')
         
         # Save results
-        pdf_name = os.path.splitext(os.path.basename(args.file_path))[0]    
+        pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]    
         os.makedirs('./results', exist_ok=True)
         
         with open(f'./results/{pdf_name}_structure.json', 'w', encoding='utf-8') as f:
             json.dump(toc_with_page_number, f, indent=2)
             
-    elif args.file_type in ['markdown', 'md']:
+    elif args.md_path:
+        # Validate Markdown file
+        if not args.md_path.lower().endswith(('.md', '.markdown')):
+            raise ValueError("Markdown file must have .md or .markdown extension")
+        if not os.path.isfile(args.md_path):
+            raise ValueError(f"Markdown file not found: {args.md_path}")
+            
         # Process markdown file
         print('Processing markdown file...')
         
-        # Configure markdown options
-        if_thinning = args.if_thinning.lower() == 'yes'
-        if_summary = args.if_add_node_summary.lower() == 'yes'
-        
         # Process the markdown
         import asyncio
+        
+        # Use ConfigLoader to get consistent defaults (matching PDF behavior)
+        from pageindex.utils import ConfigLoader
+        config_loader = ConfigLoader()
+        
+        # Create options dict with user args
+        user_opt = {
+            'model': args.model,
+            'if_add_node_summary': args.if_add_node_summary,
+            'if_add_doc_description': args.if_add_doc_description,
+            'if_add_node_text': args.if_add_node_text,
+            'if_add_node_id': args.if_add_node_id
+        }
+        
+        # Load config with defaults from config.yaml
+        opt = config_loader.load(user_opt)
+        
         toc_with_page_number = asyncio.run(md_to_tree(
-            md_path=args.file_path,
-            if_thinning=if_thinning,
+            md_path=args.md_path,
+            if_thinning=args.if_thinning.lower() == 'yes',
             min_token_threshold=args.thinning_threshold,
-            if_summary=if_summary,
+            if_add_node_summary=opt.if_add_node_summary,
             summary_token_threshold=args.summary_token_threshold,
-            model=args.model
+            model=opt.model,
+            if_add_doc_description=opt.if_add_doc_description,
+            if_add_node_text=opt.if_add_node_text,
+            if_add_node_id=opt.if_add_node_id
         ))
         
         print('Parsing done, saving to file...')
         
         # Save results
-        md_name = os.path.splitext(os.path.basename(args.file_path))[0]    
+        md_name = os.path.splitext(os.path.basename(args.md_path))[0]    
         os.makedirs('./results', exist_ok=True)
         
         with open(f'./results/{md_name}_structure.json', 'w', encoding='utf-8') as f:
-            json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)
-    else:
-        raise ValueError(f"Unsupported file type: {args.file_type}. Supported types are 'pdf', 'markdown', or 'md'")
\ No newline at end of file
+            json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)
\ No newline at end of file