fix params

2026-04-26 00:26:21 +02:00 · 2025-08-28 12:45:39 +08:00 · 2025-08-28 12:45:39 +08:00 · 6d1b505541
commit 6d1b505541
parent 480f7583f7
6 changed files with 126 additions and 42 deletions
--- a/README.md
+++ b/README.md
@@ -103,7 +103,7 @@ You can follow these steps to generate a PageIndex tree from a PDF document.
 ### 1. Install dependencies
 ```bash
-pip3 install -r requirements.txt
+pip3 install --upgrade -r requirements.txt
 ```
 ### 2. Set your OpenAI API key
@@ -123,13 +123,13 @@ python3 run_pageindex.py --pdf_path /path/to/your/document.pdf
 You can customize the processing with additional optional arguments:
 ```
--model                 OpenAI model to use (default: gpt-4o-2024-11-20)
+--model                 OpenAI model to use (default: gpt-4.1)
 --toc-check-pages       Pages to check for table of contents (default: 20)
 --max-pages-per-node    Max pages per node (default: 10)
 --max-tokens-per-node   Max tokens per node (default: 20000)
 --if-add-node-id        Add node ID (yes/no, default: yes)
 --if-add-node-summary   Add node summary (yes/no, default: no)
--if-add-doc-description Add doc description (yes/no, default: yes)
+--if-add-doc-description Add doc description (yes/no, default: no)
 ```
--- a/pageindex/config.yaml
+++ b/pageindex/config.yaml
@@ -1,8 +1,8 @@
-model: "gpt-4o-2024-11-20"
+model: "gpt-4.1"
 toc_check_page_num: 20
 max_page_num_each_node: 10
 max_token_num_each_node: 20000
 if_add_node_id: "yes"
 if_add_node_summary: "no"
-if_add_doc_description: "yes"
+if_add_doc_description: "no"
 if_add_node_text: "no"
--- a/pageindex/page_index.py
+++ b/pageindex/page_index.py
@@ -496,7 +496,7 @@ def remove_first_physical_index_section(text):
    return text
 ### add verify completeness
-def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"):
+def generate_toc_continue(toc_content, part, model="gpt-4.1"):
    print('start generate_toc_continue')
    prompt = """
    You are an expert in extracting hierarchical tree structure.
@@ -729,7 +729,7 @@ def check_toc(page_list, opt=None):
 ################### fix incorrect toc #########################################################
-def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20"):
+def single_toc_item_index_fixer(section_title, content, model="gpt-4.1"):
    tob_extractor_prompt = """
    You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document.
@@ -1084,7 +1084,9 @@ def page_index_main(doc, opt=None):
            if opt.if_add_node_text == 'no':
                remove_structure_text(structure)
            if opt.if_add_doc_description == 'yes':
-                doc_description = generate_doc_description(structure, model=opt.model)
+                # Create a clean structure without unnecessary fields for description generation
                clean_structure = create_clean_structure_for_description(structure)
                doc_description = generate_doc_description(clean_structure, model=opt.model)
                return {
                    'doc_name': get_pdf_name(doc),
                    'doc_description': doc_description,
--- a/pageindex/page_index_md.py
+++ b/pageindex/page_index_md.py
@@ -1,6 +1,7 @@
 import asyncio
 import json
 import re
 import os
 try:
    from .utils import *
 except:
@@ -239,7 +240,7 @@ def clean_tree_for_output(tree_nodes):
    return cleaned_nodes
-async def md_to_tree(md_path, if_thinning=True, min_token_threshold=None, if_summary=True, summary_token_threshold=None, model=None):
+async def md_to_tree(md_path, if_thinning=True, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='yes', if_add_node_text='no', if_add_node_id='yes'):
    with open(md_path, 'r', encoding='utf-8') as f:
        markdown_content = f.read()
@@ -257,14 +258,44 @@ async def md_to_tree(md_path, if_thinning=True, min_token_threshold=None, if_sum
    print(f"Building tree from nodes...")
    tree_structure = build_tree_from_nodes(nodes_with_content)
-    if if_summary:
+    # Add node IDs if requested (matching PDF behavior)
-        print(f"Generating summaries for each node...")
+    if if_add_node_id == 'yes':
-        tree_structure = await generate_summaries_for_structure_md(tree_structure,summary_token_threshold=summary_token_threshold, model=model)
+        write_node_id(tree_structure)
    print(f"Formatting tree structure...")
    if if_add_node_summary == 'yes':
        # Always include text for summary generation
        tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
-    return tree_structure
+        print(f"Generating summaries for each node...")
        tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model)
        if if_add_node_text == 'no':
            # Remove text after summary generation if not requested
            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])
        if if_add_doc_description == 'yes':
            print(f"Generating document description...")
            # Create a clean structure without unnecessary fields for description generation
            clean_structure = create_clean_structure_for_description(tree_structure)
            doc_description = generate_doc_description(clean_structure, model=model)
            return {
                'doc_name': os.path.splitext(os.path.basename(md_path))[0],
                'doc_description': doc_description,
                'structure': tree_structure,
            }
    else:
        # No summaries needed, format based on text preference
        if if_add_node_text == 'yes':
            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
        else:
            tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes'])
    return {
        'doc_name': os.path.splitext(os.path.basename(md_path))[0],
        'structure': tree_structure,
    }
 if __name__ == "__main__":
--- a/pageindex/utils.py
+++ b/pageindex/utils.py
@@ -410,7 +410,7 @@ def add_preface_if_needed(data):
-def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
+def get_page_tokens(pdf_path, model="gpt-4.1", pdf_parser="PyPDF2"):
    enc = tiktoken.encoding_for_model(model)
    if pdf_parser == "PyPDF2":
        pdf_reader = PyPDF2.PdfReader(pdf_path)
@@ -623,6 +623,29 @@ async def generate_summaries_for_structure(structure, model=None):
    return structure
 def create_clean_structure_for_description(structure):
    """
    Return a reduced copy of ``structure`` for document description generation.

    Dicts keep only 'title', 'node_id', 'summary' and 'prefix_summary' (each
    only when present), plus a recursively cleaned 'nodes' entry when that
    entry is non-empty. Lists are cleaned element by element. Any other
    value is returned unchanged.
    """
    # Lists: clean every element, preserving order
    if isinstance(structure, list):
        return [create_clean_structure_for_description(child) for child in structure]
    # Anything that is neither a list nor a dict passes through untouched
    if not isinstance(structure, dict):
        return structure
    # Dict node: copy only the whitelisted fields, in this fixed order
    kept_fields = ('title', 'node_id', 'summary', 'prefix_summary')
    cleaned = {field: structure[field] for field in kept_fields if field in structure}
    # Descend into children only when 'nodes' exists and is non-empty
    children = structure.get('nodes')
    if children:
        cleaned['nodes'] = create_clean_structure_for_description(children)
    return cleaned
 def generate_doc_description(structure, model=None):
    prompt = f"""Your are an expert in generating descriptions for a document.
    You are given a structure of a document. Your task is to generate a one-sentence description for the document, which makes it easy to distinguish the document from other documents.
--- a/run_pageindex.py
+++ b/run_pageindex.py
@@ -7,24 +7,27 @@ from pageindex.page_index_md import md_to_tree
 if __name__ == "__main__":
    # Set up argument parser
    parser = argparse.ArgumentParser(description='Process PDF or Markdown document and generate structure')
-    parser.add_argument('--file_path', type=str, help='Path to the PDF or Markdown file')
+    parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
-    parser.add_argument('--file_type', type=str, choices=['pdf', 'markdown', 'md'], default='pdf',
+    parser.add_argument('--md_path', type=str, help='Path to the Markdown file')
-                      help='Type of file to process (pdf, markdown, or md)')
+
-    parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')
+    parser.add_argument('--model', type=str, default='gpt-4.1', help='Model to use')
    parser.add_argument('--toc-check-pages', type=int, default=20, 
                      help='Number of pages to check for table of contents (PDF only)')
    parser.add_argument('--max-pages-per-node', type=int, default=10,
                      help='Maximum number of pages per node (PDF only)')
    parser.add_argument('--max-tokens-per-node', type=int, default=20000,
                      help='Maximum number of tokens per node (PDF only)')
    parser.add_argument('--if-add-node-id', type=str, default='yes',
                      help='Whether to add node id to the node')
    parser.add_argument('--if-add-node-summary', type=str, default='no',
                      help='Whether to add summary to the node')
-    parser.add_argument('--if-add-doc-description', type=str, default='yes',
+    parser.add_argument('--if-add-doc-description', type=str, default='no',
                      help='Whether to add doc description to the doc')
    parser.add_argument('--if-add-node-text', type=str, default='no',
                      help='Whether to add text to the node')
    # Markdown specific arguments
    parser.add_argument('--if-thinning', type=str, default='no',
                      help='Whether to apply tree thinning for markdown (markdown only)')
@@ -34,14 +37,19 @@ if __name__ == "__main__":
                      help='Token threshold for generating summaries (markdown only)')
    args = parser.parse_args()
-    # Determine file type from extension if not specified
+    # Validate that exactly one file type is specified
-    if args.file_type == 'pdf' and args.file_path:
+    if not args.pdf_path and not args.md_path:
-        if args.file_path.lower().endswith(('.md', '.markdown')):
+        raise ValueError("Either --pdf_path or --md_path must be specified")
-            args.file_type = 'markdown'
+    if args.pdf_path and args.md_path:
-        elif not args.file_path.lower().endswith('.pdf'):
+        raise ValueError("Only one of --pdf_path or --md_path can be specified")
-            raise ValueError("File must be a PDF or Markdown file")
+    
    if args.pdf_path:
        # Validate PDF file
        if not args.pdf_path.lower().endswith('.pdf'):
            raise ValueError("PDF file must have .pdf extension")
        if not os.path.isfile(args.pdf_path):
            raise ValueError(f"PDF file not found: {args.pdf_path}")
    if args.file_type == 'pdf':
        # Process PDF file
        # Configure options
        opt = config(
@@ -56,42 +64,62 @@ if __name__ == "__main__":
        )
        # Process the PDF
-        toc_with_page_number = page_index_main(args.file_path, opt)
+        toc_with_page_number = page_index_main(args.pdf_path, opt)
        print('Parsing done, saving to file...')
        # Save results
-        pdf_name = os.path.splitext(os.path.basename(args.file_path))[0]    
+        pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]    
        os.makedirs('./results', exist_ok=True)
        with open(f'./results/{pdf_name}_structure.json', 'w', encoding='utf-8') as f:
            json.dump(toc_with_page_number, f, indent=2)
-    elif args.file_type in ['markdown', 'md']:
+    elif args.md_path:
        # Validate Markdown file
        if not args.md_path.lower().endswith(('.md', '.markdown')):
            raise ValueError("Markdown file must have .md or .markdown extension")
        if not os.path.isfile(args.md_path):
            raise ValueError(f"Markdown file not found: {args.md_path}")
        # Process markdown file
        print('Processing markdown file...')
        # Configure markdown options
        if_thinning = args.if_thinning.lower() == 'yes'
        if_summary = args.if_add_node_summary.lower() == 'yes'
        # Process the markdown
        import asyncio
        # Use ConfigLoader to get consistent defaults (matching PDF behavior)
        from pageindex.utils import ConfigLoader
        config_loader = ConfigLoader()
        # Create options dict with user args
        user_opt = {
            'model': args.model,
            'if_add_node_summary': args.if_add_node_summary,
            'if_add_doc_description': args.if_add_doc_description,
            'if_add_node_text': args.if_add_node_text,
            'if_add_node_id': args.if_add_node_id
        }
        # Load config with defaults from config.yaml
        opt = config_loader.load(user_opt)
        toc_with_page_number = asyncio.run(md_to_tree(
-            md_path=args.file_path,
+            md_path=args.md_path,
-            if_thinning=if_thinning,
+            if_thinning=args.if_thinning.lower() == 'yes',
            min_token_threshold=args.thinning_threshold,
-            if_summary=if_summary,
+            if_add_node_summary=opt.if_add_node_summary,
            summary_token_threshold=args.summary_token_threshold,
-            model=args.model
+            model=opt.model,
            if_add_doc_description=opt.if_add_doc_description,
            if_add_node_text=opt.if_add_node_text,
            if_add_node_id=opt.if_add_node_id
        ))
        print('Parsing done, saving to file...')
        # Save results
-        md_name = os.path.splitext(os.path.basename(args.file_path))[0]    
+        md_name = os.path.splitext(os.path.basename(args.md_path))[0]    
        os.makedirs('./results', exist_ok=True)
        with open(f'./results/{md_name}_structure.json', 'w', encoding='utf-8') as f:
            json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)
    else:
        raise ValueError(f"Unsupported file type: {args.file_type}. Supported types are 'pdf', 'markdown', or 'md'")