From 6d1b505541b4e2af604b91784bab47f26529484f Mon Sep 17 00:00:00 2001 From: Ray Date: Thu, 28 Aug 2025 12:45:39 +0800 Subject: [PATCH] fix params --- README.md | 6 +-- pageindex/config.yaml | 4 +- pageindex/page_index.py | 8 ++-- pageindex/page_index_md.py | 43 +++++++++++++++++--- pageindex/utils.py | 25 +++++++++++- run_pageindex.py | 82 +++++++++++++++++++++++++------------- 6 files changed, 126 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 54b840b..a32d643 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ You can follow these steps to generate a PageIndex tree from a PDF document. ### 1. Install dependencies ```bash -pip3 install -r requirements.txt +pip3 install --upgrade -r requirements.txt ``` ### 2. Set your OpenAI API key @@ -123,13 +123,13 @@ python3 run_pageindex.py --pdf_path /path/to/your/document.pdf You can customize the processing with additional optional arguments: ``` ---model OpenAI model to use (default: gpt-4o-2024-11-20) +--model OpenAI model to use (default: gpt-4.1) --toc-check-pages Pages to check for table of contents (default: 20) --max-pages-per-node Max pages per node (default: 10) --max-tokens-per-node Max tokens per node (default: 20000) --if-add-node-id Add node ID (yes/no, default: yes) --if-add-node-summary Add node summary (yes/no, default: no) ---if-add-doc-description Add doc description (yes/no, default: yes) +--if-add-doc-description Add doc description (yes/no, default: no) ``` diff --git a/pageindex/config.yaml b/pageindex/config.yaml index 382a3e9..7927090 100644 --- a/pageindex/config.yaml +++ b/pageindex/config.yaml @@ -1,8 +1,8 @@ -model: "gpt-4o-2024-11-20" +model: "gpt-4.1" toc_check_page_num: 20 max_page_num_each_node: 10 max_token_num_each_node: 20000 if_add_node_id: "yes" if_add_node_summary: "no" -if_add_doc_description: "yes" +if_add_doc_description: "no" if_add_node_text: "no" \ No newline at end of file diff --git a/pageindex/page_index.py b/pageindex/page_index.py index 0f225f2..edbcc18 100644 --- a/pageindex/page_index.py +++ b/pageindex/page_index.py @@ -496,7 +496,7 @@ def remove_first_physical_index_section(text): return text ### add verify completeness -def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"): +def generate_toc_continue(toc_content, part, model="gpt-4.1"): print('start generate_toc_continue') prompt = """ You are an expert in extracting hierarchical tree structure. @@ -729,7 +729,7 @@ def check_toc(page_list, opt=None): ################### fix incorrect toc ######################################################### -def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20"): +def single_toc_item_index_fixer(section_title, content, model="gpt-4.1"): tob_extractor_prompt = """ You are given a section title and several pages of a document, your job is to find the physical index of the start page of the section in the partial document. @@ -1084,7 +1084,9 @@ def page_index_main(doc, opt=None): if opt.if_add_node_text == 'no': remove_structure_text(structure) if opt.if_add_doc_description == 'yes': - doc_description = generate_doc_description(structure, model=opt.model) + # Create a clean structure without unnecessary fields for description generation + clean_structure = create_clean_structure_for_description(structure) + doc_description = generate_doc_description(clean_structure, model=opt.model) return { 'doc_name': get_pdf_name(doc), 'doc_description': doc_description, diff --git a/pageindex/page_index_md.py b/pageindex/page_index_md.py index d353d11..cfcec71 100644 --- a/pageindex/page_index_md.py +++ b/pageindex/page_index_md.py @@ -1,6 +1,7 @@ import asyncio import json import re +import os try: from .utils import * except: @@ -239,7 +240,7 @@ def clean_tree_for_output(tree_nodes): return cleaned_nodes -async def md_to_tree(md_path, if_thinning=True, min_token_threshold=None, if_summary=True, summary_token_threshold=None, model=None): +async def md_to_tree(md_path, if_thinning=True, min_token_threshold=None, if_add_node_summary='no', summary_token_threshold=None, model=None, if_add_doc_description='yes', if_add_node_text='no', if_add_node_id='yes'): with open(md_path, 'r', encoding='utf-8') as f: markdown_content = f.read() @@ -257,14 +258,44 @@ async def md_to_tree(md_path, if_thinning=True, min_token_threshold=None, if_sum print(f"Building tree from nodes...") tree_structure = build_tree_from_nodes(nodes_with_content) - if if_summary: - print(f"Generating summaries for each node...") - tree_structure = await generate_summaries_for_structure_md(tree_structure,summary_token_threshold=summary_token_threshold, model=model) + # Add node IDs if requested (matching PDF behavior) + if if_add_node_id == 'yes': + write_node_id(tree_structure) print(f"Formatting tree structure...") - tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes']) - return tree_structure + if if_add_node_summary == 'yes': + # Always include text for summary generation + tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes']) + + print(f"Generating summaries for each node...") + tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model) + + if if_add_node_text == 'no': + # Remove text after summary generation if not requested + tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes']) + + if if_add_doc_description == 'yes': + print(f"Generating document description...") + # Create a clean structure without unnecessary fields for description generation + clean_structure = create_clean_structure_for_description(tree_structure) + doc_description = generate_doc_description(clean_structure, model=model) + return { + 'doc_name': os.path.splitext(os.path.basename(md_path))[0], + 'doc_description': doc_description, + 'structure': tree_structure, + } + else: + # No summaries needed, format based on text preference + if if_add_node_text == 'yes': + tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes']) + else: + tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'line_num', 'nodes']) + + return { + 'doc_name': os.path.splitext(os.path.basename(md_path))[0], + 'structure': tree_structure, + } if __name__ == "__main__": diff --git a/pageindex/utils.py b/pageindex/utils.py index db29ce8..d879296 100644 --- a/pageindex/utils.py +++ b/pageindex/utils.py @@ -410,7 +410,7 @@ def add_preface_if_needed(data): -def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"): +def get_page_tokens(pdf_path, model="gpt-4.1", pdf_parser="PyPDF2"): enc = tiktoken.encoding_for_model(model) if pdf_parser == "PyPDF2": pdf_reader = PyPDF2.PdfReader(pdf_path) @@ -623,6 +623,29 @@ async def generate_summaries_for_structure(structure, model=None): return structure +def create_clean_structure_for_description(structure): + """ + Create a clean structure for document description generation, + excluding unnecessary fields like 'text'. + """ + if isinstance(structure, dict): + clean_node = {} + # Only include essential fields for description + for key in ['title', 'node_id', 'summary', 'prefix_summary']: + if key in structure: + clean_node[key] = structure[key] + + # Recursively process child nodes + if 'nodes' in structure and structure['nodes']: + clean_node['nodes'] = create_clean_structure_for_description(structure['nodes']) + + return clean_node + elif isinstance(structure, list): + return [create_clean_structure_for_description(item) for item in structure] + else: + return structure + + def generate_doc_description(structure, model=None): prompt = f"""Your are an expert in generating descriptions for a document. You are given a structure of a document. Your task is to generate a one-sentence description for the document, which makes it easy to distinguish the document from other documents. diff --git a/run_pageindex.py b/run_pageindex.py index 00d606d..2522b23 100644 --- a/run_pageindex.py +++ b/run_pageindex.py @@ -7,24 +7,27 @@ from pageindex.page_index_md import md_to_tree if __name__ == "__main__": # Set up argument parser parser = argparse.ArgumentParser(description='Process PDF or Markdown document and generate structure') - parser.add_argument('--file_path', type=str, help='Path to the PDF or Markdown file') - parser.add_argument('--file_type', type=str, choices=['pdf', 'markdown', 'md'], default='pdf', - help='Type of file to process (pdf, markdown, or md)') - parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use') + parser.add_argument('--pdf_path', type=str, help='Path to the PDF file') + parser.add_argument('--md_path', type=str, help='Path to the Markdown file') + + parser.add_argument('--model', type=str, default='gpt-4.1', help='Model to use') + parser.add_argument('--toc-check-pages', type=int, default=20, help='Number of pages to check for table of contents (PDF only)') parser.add_argument('--max-pages-per-node', type=int, default=10, help='Maximum number of pages per node (PDF only)') parser.add_argument('--max-tokens-per-node', type=int, default=20000, help='Maximum number of tokens per node (PDF only)') + parser.add_argument('--if-add-node-id', type=str, default='yes', help='Whether to add node id to the node') parser.add_argument('--if-add-node-summary', type=str, default='no', help='Whether to add summary to the node') - parser.add_argument('--if-add-doc-description', type=str, default='yes', + parser.add_argument('--if-add-doc-description', type=str, default='no', help='Whether to add doc description to the doc') parser.add_argument('--if-add-node-text', type=str, default='no', help='Whether to add text to the node') + # Markdown specific arguments parser.add_argument('--if-thinning', type=str, default='no', help='Whether to apply tree thinning for markdown (markdown only)') @@ -34,14 +37,19 @@ if __name__ == "__main__": help='Token threshold for generating summaries (markdown only)') args = parser.parse_args() - # Determine file type from extension if not specified - if args.file_type == 'pdf' and args.file_path: - if args.file_path.lower().endswith(('.md', '.markdown')): - args.file_type = 'markdown' - elif not args.file_path.lower().endswith('.pdf'): - raise ValueError("File must be a PDF or Markdown file") + # Validate that exactly one file type is specified + if not args.pdf_path and not args.md_path: + raise ValueError("Either --pdf_path or --md_path must be specified") + if args.pdf_path and args.md_path: + raise ValueError("Only one of --pdf_path or --md_path can be specified") - if args.file_type == 'pdf': + if args.pdf_path: + # Validate PDF file + if not args.pdf_path.lower().endswith('.pdf'): + raise ValueError("PDF file must have .pdf extension") + if not os.path.isfile(args.pdf_path): + raise ValueError(f"PDF file not found: {args.pdf_path}") + # Process PDF file # Configure options opt = config( @@ -56,42 +64,62 @@ if __name__ == "__main__": ) # Process the PDF - toc_with_page_number = page_index_main(args.file_path, opt) + toc_with_page_number = page_index_main(args.pdf_path, opt) print('Parsing done, saving to file...') # Save results - pdf_name = os.path.splitext(os.path.basename(args.file_path))[0] + pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0] os.makedirs('./results', exist_ok=True) with open(f'./results/{pdf_name}_structure.json', 'w', encoding='utf-8') as f: json.dump(toc_with_page_number, f, indent=2) - elif args.file_type in ['markdown', 'md']: + elif args.md_path: + # Validate Markdown file + if not args.md_path.lower().endswith(('.md', '.markdown')): + raise ValueError("Markdown file must have .md or .markdown extension") + if not os.path.isfile(args.md_path): + raise ValueError(f"Markdown file not found: {args.md_path}") + # Process markdown file print('Processing markdown file...') - # Configure markdown options - if_thinning = args.if_thinning.lower() == 'yes' - if_summary = args.if_add_node_summary.lower() == 'yes' - # Process the markdown import asyncio + + # Use ConfigLoader to get consistent defaults (matching PDF behavior) + from pageindex.utils import ConfigLoader + config_loader = ConfigLoader() + + # Create options dict with user args + user_opt = { + 'model': args.model, + 'if_add_node_summary': args.if_add_node_summary, + 'if_add_doc_description': args.if_add_doc_description, + 'if_add_node_text': args.if_add_node_text, + 'if_add_node_id': args.if_add_node_id + } + + # Load config with defaults from config.yaml + opt = config_loader.load(user_opt) + toc_with_page_number = asyncio.run(md_to_tree( - md_path=args.file_path, - if_thinning=if_thinning, + md_path=args.md_path, + if_thinning=args.if_thinning.lower() == 'yes', min_token_threshold=args.thinning_threshold, - if_summary=if_summary, + if_add_node_summary=opt.if_add_node_summary, summary_token_threshold=args.summary_token_threshold, - model=args.model + model=opt.model, + if_add_doc_description=opt.if_add_doc_description, + if_add_node_text=opt.if_add_node_text, + if_add_node_id=opt.if_add_node_id )) print('Parsing done, saving to file...') # Save results - md_name = os.path.splitext(os.path.basename(args.file_path))[0] + md_name = os.path.splitext(os.path.basename(args.md_path))[0] os.makedirs('./results', exist_ok=True) with open(f'./results/{md_name}_structure.json', 'w', encoding='utf-8') as f: - json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False) - else: - raise ValueError(f"Unsupported file type: {args.file_type}. Supported types are 'pdf', 'markdown', or 'md'") \ No newline at end of file + json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False) \ No newline at end of file