Markdown support for PageIndex (reviewed revision of the original patch).

Line breaks and indentation were reconstructed from a whitespace-collapsed
copy of the diff; use `git apply --ignore-whitespace` if a context line does
not match. The `index` lines are omitted for the three files whose post-image
differs from the original patch.

diff --git a/pageindex/__init__.py b/pageindex/__init__.py
index f8a75d5..4606eb3 100644
--- a/pageindex/__init__.py
+++ b/pageindex/__init__.py
@@ -1 +1,2 @@
-from .page_index import *
\ No newline at end of file
+from .page_index import *
+from .page_index_md import md_to_tree
\ No newline at end of file
diff --git a/pageindex/page_index_md.py b/pageindex/page_index_md.py
new file mode 100644
--- /dev/null
+++ b/pageindex/page_index_md.py
@@ -0,0 +1,335 @@
+import asyncio
+import json
+import re
+from .utils import *
+
+
+# Used when a caller passes summary_token_threshold=None.
+DEFAULT_SUMMARY_TOKEN_THRESHOLD = 200
+
+_HEADER_PATTERN = re.compile(r'^(#{1,6})\s+(.+)$')
+# A fenced code block starts and ends with a run of 3+ backticks or tildes.
+_FENCE_PATTERN = re.compile(r'^(`{3,}|~{3,})')
+
+
+async def get_node_summary(node, summary_token_threshold=200, model=None):
+    """Return the node text itself when it is short, else a generated summary.
+
+    Text below ``summary_token_threshold`` tokens is returned unchanged;
+    anything longer is handed to ``generate_node_summary``. A threshold of
+    ``None`` falls back to ``DEFAULT_SUMMARY_TOKEN_THRESHOLD``.
+    """
+    if summary_token_threshold is None:
+        summary_token_threshold = DEFAULT_SUMMARY_TOKEN_THRESHOLD
+    node_text = node.get('text')
+    num_tokens = count_tokens(node_text, model=model)
+    if num_tokens < summary_token_threshold:
+        return node_text
+    return await generate_node_summary(node, model=model)
+
+
+async def generate_summaries_for_structure_md(structure, summary_token_threshold, model=None):
+    """Summarize all nodes of ``structure`` concurrently and return it.
+
+    Every node dict returned by ``structure_to_list`` is updated: nodes
+    without children get ``'summary'``, nodes with children get
+    ``'prefix_summary'``.
+    """
+    nodes = structure_to_list(structure)
+    tasks = [get_node_summary(node, summary_token_threshold=summary_token_threshold, model=model) for node in nodes]
+    summaries = await asyncio.gather(*tasks)
+
+    for node, summary in zip(nodes, summaries):
+        key = 'prefix_summary' if node.get('nodes') else 'summary'
+        node[key] = summary
+    return structure
+
+
+def extract_nodes_from_markdown(markdown_content):
+    """Locate the ATX headers (``#`` to ``######``) of a markdown document.
+
+    Lines inside fenced code blocks are ignored, so a ``# comment`` in a code
+    sample is not mistaken for a header.
+
+    Returns:
+        ``(node_list, lines)``: one ``{'node_title', 'line_num'}`` dict per
+        header (``line_num`` is 1-based) and the document split into lines.
+    """
+    node_list = []
+    lines = markdown_content.split('\n')
+    open_fence = None
+
+    for line_num, line in enumerate(lines, 1):
+        line = line.strip()
+        if not line:
+            continue
+
+        fence = _FENCE_PATTERN.match(line)
+        if open_fence is not None:
+            # Only a bare fence of the same character, at least as long as
+            # the opening one, closes the block.
+            if fence and line == fence.group(1) and line.startswith(open_fence):
+                open_fence = None
+            continue
+        if fence:
+            marker = fence.group(1)
+            # Backticks after a backtick fence mean inline code, not a fence.
+            if not (marker[0] == '`' and '`' in line[len(marker):]):
+                open_fence = marker
+                continue
+
+        match = _HEADER_PATTERN.match(line)
+        if match:
+            node_list.append({'node_title': match.group(2).strip(), 'line_num': line_num})
+
+    return node_list, lines
+
+
+def extract_node_text_content(node_list, markdown_lines):
+    """Attach ``level`` and ``text`` to the headers found in a document.
+
+    ``level`` is the number of leading ``#`` characters of the header line.
+    ``text`` runs from the header line up to, but not including, the next
+    header (or to the end of the document for the last header).
+    """
+    all_nodes = []
+    for node in node_list:
+        # Headers were matched on stripped lines, so strip here too; an
+        # indented header would otherwise fail to match and crash on .group().
+        header_line = markdown_lines[node['line_num'] - 1].strip()
+        all_nodes.append({
+            'title': node['node_title'],
+            'line_num': node['line_num'],
+            'level': len(re.match(r'^(#{1,6})', header_line).group(1)),
+        })
+
+    for i, node in enumerate(all_nodes):
+        start_line = node['line_num'] - 1
+        if i + 1 < len(all_nodes):
+            end_line = all_nodes[i + 1]['line_num'] - 1
+        else:
+            end_line = len(markdown_lines)
+
+        node['text'] = '\n'.join(markdown_lines[start_line:end_line]).strip()
+    return all_nodes
+
+
+def _find_descendant_indices(parent_index, node_list):
+    """Return the indices of all direct and indirect children of a node.
+
+    ``node_list`` is in document order, so the descendants are the run of
+    entries right after ``parent_index`` with a deeper level than the parent.
+    """
+    parent_level = node_list[parent_index]['level']
+    descendants = []
+    for i in range(parent_index + 1, len(node_list)):
+        # A node at the same or a higher level ends the parent's section.
+        if node_list[i]['level'] <= parent_level:
+            break
+        descendants.append(i)
+    return descendants
+
+
+def update_node_list_with_text_token_count(node_list, model=None):
+    """Return copies of the nodes with ``text_token_count`` added.
+
+    The count covers a node's own text plus the text of all its descendants.
+    The node dicts are copied, so the input list is left unmodified.
+    """
+    result_list = [dict(node) for node in node_list]
+
+    for i, current_node in enumerate(result_list):
+        texts = [current_node.get('text', '')]
+        for child_index in _find_descendant_indices(i, result_list):
+            child_text = result_list[child_index].get('text', '')
+            if child_text:
+                texts.append(child_text)
+        current_node['text_token_count'] = count_tokens('\n'.join(texts), model=model)
+
+    return result_list
+
+
+def tree_thinning_for_index(node_list, min_node_token=None, model=None):
+    """Collapse the sub-sections of small nodes into the node itself.
+
+    A node whose ``text_token_count`` is below ``min_node_token`` absorbs the
+    text of all its descendants, which are then dropped from the list. The
+    node dicts are copied, so the input list is left unmodified. With
+    ``min_node_token=None`` nothing is thinned.
+    """
+    result_list = [dict(node) for node in node_list]
+    if min_node_token is None:
+        return result_list
+
+    nodes_to_remove = set()
+
+    # Walk backwards so that deeper nodes are merged before their ancestors.
+    for i in range(len(result_list) - 1, -1, -1):
+        current_node = result_list[i]
+        if current_node.get('text_token_count', 0) >= min_node_token:
+            continue
+
+        children_texts = []
+        for child_index in _find_descendant_indices(i, result_list):
+            if child_index in nodes_to_remove:
+                continue
+            child_text = result_list[child_index].get('text', '')
+            if child_text.strip():
+                children_texts.append(child_text)
+            nodes_to_remove.add(child_index)
+
+        if children_texts:
+            merged_text = current_node.get('text', '')
+            for child_text in children_texts:
+                if merged_text and not merged_text.endswith('\n'):
+                    merged_text += '\n\n'
+                merged_text += child_text
+
+            current_node['text'] = merged_text
+            current_node['text_token_count'] = count_tokens(merged_text, model=model)
+
+    return [node for index, node in enumerate(result_list) if index not in nodes_to_remove]
+
+
+def build_tree_from_nodes(node_list):
+    """Nest a flat, document-ordered node list into a tree by header level.
+
+    Each node becomes a child of the closest preceding node with a smaller
+    level; nodes without such a predecessor are roots. ``node_id`` is a
+    zero-padded counter in document order, starting at ``'0001'``.
+    """
+    if not node_list:
+        return []
+
+    stack = []  # (tree_node, level) pairs of the currently open sections
+    root_nodes = []
+
+    for node_counter, node in enumerate(node_list, 1):
+        current_level = node['level']
+
+        tree_node = {
+            'title': node['title'],
+            'node_id': str(node_counter).zfill(4),
+            'text': node['text'],
+            'line_num': node['line_num'],
+            'nodes': []
+        }
+
+        # Close every open section that cannot contain this node.
+        while stack and stack[-1][1] >= current_level:
+            stack.pop()
+
+        if stack:
+            stack[-1][0]['nodes'].append(tree_node)
+        else:
+            root_nodes.append(tree_node)
+
+        stack.append((tree_node, current_level))
+
+    return root_nodes
+
+
+def clean_tree_for_output(tree_nodes):
+    """Return a copy of the tree that omits empty ``'nodes'`` lists."""
+    cleaned_nodes = []
+
+    for node in tree_nodes:
+        cleaned_node = {
+            'title': node['title'],
+            'node_id': node['node_id'],
+            'text': node['text'],
+            'line_num': node['line_num']
+        }
+
+        if node['nodes']:
+            cleaned_node['nodes'] = clean_tree_for_output(node['nodes'])
+
+        cleaned_nodes.append(cleaned_node)
+
+    return cleaned_nodes
+
+
+async def md_to_tree(md_path, if_thinning=True, min_token_threshold=None, if_summary=True, summary_token_threshold=None, model=None):
+    """Build a header-based tree structure from a markdown file.
+
+    Args:
+        md_path: Path of the markdown file (read as UTF-8).
+        if_thinning: Collapse the sub-sections of sections that are smaller
+            than ``min_token_threshold`` tokens. Skipped without a threshold.
+        min_token_threshold: Token threshold used for thinning.
+        if_summary: Add ``summary`` / ``prefix_summary`` to every node.
+        summary_token_threshold: Texts with fewer tokens serve as their own
+            summary; ``None`` selects ``DEFAULT_SUMMARY_TOKEN_THRESHOLD``.
+        model: Model name forwarded to token counting and summarization.
+
+    Returns:
+        The list of root nodes after ``format_structure``.
+    """
+    with open(md_path, 'r', encoding='utf-8') as f:
+        markdown_content = f.read()
+
+    print("Extracting nodes from markdown...")
+    node_list, markdown_lines = extract_nodes_from_markdown(markdown_content)
+
+    print("Extracting text content from nodes...")
+    nodes_with_content = extract_node_text_content(node_list, markdown_lines)
+
+    # Without a threshold there is nothing to compare token counts against.
+    if if_thinning and min_token_threshold is not None:
+        nodes_with_content = update_node_list_with_text_token_count(nodes_with_content, model=model)
+        print("Thinning nodes...")
+        nodes_with_content = tree_thinning_for_index(nodes_with_content, min_token_threshold, model=model)
+
+    print("Building tree from nodes...")
+    tree_structure = build_tree_from_nodes(nodes_with_content)
+
+    if if_summary:
+        print("Generating summaries for each node...")
+        tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model)
+
+    print("Formatting tree structure...")
+    tree_structure = format_structure(tree_structure, order=['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
+
+    return tree_structure
+
+
+if __name__ == "__main__":
+    # Run as ``python -m pageindex.page_index_md``; the relative import of
+    # .utils at the top does not work when this file is executed directly.
+    import os
+
+    MD_NAME = 'Detect-Order-Construct'
+    # MD_NAME = 'Welcome'
+    MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'docs', f'{MD_NAME}.md')
+
+    MODEL = "gpt-4.1"
+    IF_THINNING = False
+    THINNING_THRESHOLD = 5000
+    SUMMARY_TOKEN_THRESHOLD = 200
+    IF_SUMMARY = True
+
+    tree_structure = asyncio.run(md_to_tree(
+        md_path=MD_PATH,
+        if_thinning=IF_THINNING,
+        min_token_threshold=THINNING_THRESHOLD,
+        if_summary=IF_SUMMARY,
+        summary_token_threshold=SUMMARY_TOKEN_THRESHOLD,
+        model=MODEL))
+
+    print('\n' + '='*60)
+    print('TREE STRUCTURE')
+    print('='*60)
+    print_json(tree_structure)
+
+    print('\n' + '='*60)
+    print('TABLE OF CONTENTS')
+    print('='*60)
+    print_toc(tree_structure)
+
+    output_path = os.path.join(os.path.dirname(__file__), '..', 'results', f'{MD_NAME}_structure.json')
+    os.makedirs(os.path.dirname(output_path), exist_ok=True)
+
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(tree_structure, f, indent=2, ensure_ascii=False)
+
+    print(f"\nTree structure saved to: {output_path}")
diff --git a/pageindex/utils.py b/pageindex/utils.py
--- a/pageindex/utils.py
+++ b/pageindex/utils.py
@@ -19,8 +19,17 @@ from types import SimpleNamespace as config
 
 CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
 
-
-def count_tokens(text, model):
-    enc = tiktoken.encoding_for_model(model)
+def count_tokens(text, model=None):
+    """Return the number of tokens in ``text`` (0 for empty or ``None`` text).
+
+    Without a model name the ``o200k_base`` encoding is used, because
+    ``tiktoken.encoding_for_model`` cannot resolve ``None``.
+    """
+    if not text:
+        return 0
+    if model is None:
+        enc = tiktoken.get_encoding("o200k_base")
+    else:
+        enc = tiktoken.encoding_for_model(model)
     tokens = enc.encode(text)
     return len(tokens)
@@ -489,6 +498,37 @@ def clean_structure_post(data):
             clean_structure_post(section)
     return data
 
+def remove_fields(data, fields=('text',)):
+    """Return a copy of nested dicts/lists without the keys named in ``fields``."""
+    if isinstance(data, dict):
+        return {k: remove_fields(v, fields)
+            for k, v in data.items() if k not in fields}
+    elif isinstance(data, list):
+        return [remove_fields(item, fields) for item in data]
+    return data
+
+def print_toc(tree, indent=0):
+    """Print the node titles of ``tree``, indenting each nesting level one step further."""
+    for node in tree:
+        print(' ' * indent + node['title'])
+        if node.get('nodes'):
+            print_toc(node['nodes'], indent + 1)
+
+def print_json(data, max_len=40, indent=2):
+    """Pretty-print ``data`` as JSON with strings longer than ``max_len`` cut and suffixed '...'."""
+    def simplify_data(obj):
+        if isinstance(obj, dict):
+            return {k: simplify_data(v) for k, v in obj.items()}
+        elif isinstance(obj, list):
+            return [simplify_data(item) for item in obj]
+        elif isinstance(obj, str) and len(obj) > max_len:
+            return obj[:max_len] + '...'
+        else:
+            return obj
+
+    simplified = simplify_data(data)
+    print(json.dumps(simplified, indent=indent, ensure_ascii=False))
+
 
 def remove_structure_text(data):
     if isinstance(data, dict):
@@ -606,6 +646,36 @@ def generate_doc_description(structure, model=None):
     return response
 
 
+def reorder_dict(data, key_order):
+    """Return ``data`` restricted to the keys in ``key_order``, in that order.
+
+    Keys missing from ``key_order`` are dropped; an empty ``key_order``
+    returns ``data`` unchanged.
+    """
+    if not key_order:
+        return data
+    return {key: data[key] for key in key_order if key in data}
+
+
+def format_structure(structure, order=None):
+    """Recursively reorder node keys by ``order`` and drop empty ``'nodes'``.
+
+    Keys not listed in ``order`` are removed. Dict inputs are modified in
+    place before being reordered; without ``order`` nothing is changed.
+    """
+    if not order:
+        return structure
+    if isinstance(structure, dict):
+        if 'nodes' in structure:
+            structure['nodes'] = format_structure(structure['nodes'], order)
+        if not structure.get('nodes'):
+            structure.pop('nodes', None)
+        structure = reorder_dict(structure, order)
+    elif isinstance(structure, list):
+        structure = [format_structure(item, order) for item in structure]
+    return structure
+
+
 class ConfigLoader:
     def __init__(self, default_path: str = None):
         if default_path is None:
diff --git a/requirements.txt b/requirements.txt
index ad43fe1..463db58 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
-openai==1.70.0
-pymupdf==1.25.5
+openai==1.101.0
+pymupdf==1.26.4
 PyPDF2==3.0.1
 python-dotenv==1.1.0
-tiktoken==0.7.0
+tiktoken==0.11.0
 pyyaml==6.0.2
diff --git a/run_pageindex.py b/run_pageindex.py
--- a/run_pageindex.py
+++ b/run_pageindex.py
@@ -1,17 +1,25 @@
 import argparse
+import asyncio
+import os
+import json
 from pageindex import *
+from pageindex.page_index_md import md_to_tree
 
 if __name__ == "__main__":
     # Set up argument parser
-    parser = argparse.ArgumentParser(description='Process PDF document and generate structure')
-    parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
+    parser = argparse.ArgumentParser(description='Process PDF or Markdown document and generate structure')
+    # --pdf_path stays available as an alias so existing invocations keep working
+    parser.add_argument('--file_path', '--pdf_path', dest='file_path', type=str, required=True,
+                      help='Path to the PDF or Markdown file')
+    parser.add_argument('--file_type', type=str, choices=['pdf', 'markdown', 'md'], default=None,
+                      help='Type of file to process (pdf, markdown, or md); inferred from the extension if omitted')
     parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')
     parser.add_argument('--toc-check-pages', type=int, default=20,
-                      help='Number of pages to check for table of contents')
+                      help='Number of pages to check for table of contents (PDF only)')
     parser.add_argument('--max-pages-per-node', type=int, default=10,
-                      help='Maximum number of pages per node')
+                      help='Maximum number of pages per node (PDF only)')
     parser.add_argument('--max-tokens-per-node', type=int, default=20000,
-                      help='Maximum number of tokens per node')
+                      help='Maximum number of tokens per node (PDF only)')
     parser.add_argument('--if-add-node-id', type=str, default='yes',
                       help='Whether to add node id to the node')
     parser.add_argument('--if-add-node-summary', type=str, default='no',
@@ -20,27 +28,72 @@ if __name__ == "__main__":
                       help='Whether to add doc description to the doc')
     parser.add_argument('--if-add-node-text', type=str, default='no',
                       help='Whether to add text to the node')
+    # Markdown specific arguments
+    parser.add_argument('--if-thinning', type=str, default='yes',
+                      help='Whether to apply tree thinning for markdown (markdown only)')
+    parser.add_argument('--thinning-threshold', type=int, default=5000,
+                      help='Minimum token threshold for thinning (markdown only)')
+    parser.add_argument('--summary-token-threshold', type=int, default=200,
+                      help='Token threshold for generating summaries (markdown only)')
     args = parser.parse_args()
-    
-    # Configure options
-    opt = config(
-        model=args.model,
-        toc_check_page_num=args.toc_check_pages,
-        max_page_num_each_node=args.max_pages_per_node,
-        max_token_num_each_node=args.max_tokens_per_node,
-        if_add_node_id=args.if_add_node_id,
-        if_add_node_summary=args.if_add_node_summary,
-        if_add_doc_description=args.if_add_doc_description,
-        if_add_node_text=args.if_add_node_text
-    )
+
+    # Determine file type from extension if not specified
+    if args.file_type is None:
+        if args.file_path.lower().endswith(('.md', '.markdown')):
+            args.file_type = 'markdown'
+        elif args.file_path.lower().endswith('.pdf'):
+            args.file_type = 'pdf'
+        else:
+            parser.error("File must be a PDF or Markdown file")
+
+    if args.file_type == 'pdf':
+        # Process PDF file
+        # Configure options
+        opt = config(
+            model=args.model,
+            toc_check_page_num=args.toc_check_pages,
+            max_page_num_each_node=args.max_pages_per_node,
+            max_token_num_each_node=args.max_tokens_per_node,
+            if_add_node_id=args.if_add_node_id,
+            if_add_node_summary=args.if_add_node_summary,
+            if_add_doc_description=args.if_add_doc_description,
+            if_add_node_text=args.if_add_node_text
+        )
 
-    # Process the PDF
-    toc_with_page_number = page_index_main(args.pdf_path, opt)
-    print('Parsing done, saving to file...')
-    
-    # Save results
-    pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]
-    os.makedirs('./results', exist_ok=True)
-    
-    with open(f'./results/{pdf_name}_structure.json', 'w', encoding='utf-8') as f:
-        json.dump(toc_with_page_number, f, indent=2)
\ No newline at end of file
+        # Process the PDF
+        toc_with_page_number = page_index_main(args.file_path, opt)
+        print('Parsing done, saving to file...')
+
+        # Save results
+        pdf_name = os.path.splitext(os.path.basename(args.file_path))[0]
+        os.makedirs('./results', exist_ok=True)
+
+        with open(f'./results/{pdf_name}_structure.json', 'w', encoding='utf-8') as f:
+            json.dump(toc_with_page_number, f, indent=2)
+
+    else:
+        # Process markdown file ('markdown' or 'md'; argparse choices allow nothing else)
+        print('Processing markdown file...')
+
+        # Configure markdown options
+        if_thinning = args.if_thinning.lower() == 'yes'
+        if_summary = args.if_add_node_summary.lower() == 'yes'
+
+        # Process the markdown
+        toc_with_page_number = asyncio.run(md_to_tree(
+            md_path=args.file_path,
+            if_thinning=if_thinning,
+            min_token_threshold=args.thinning_threshold,
+            if_summary=if_summary,
+            summary_token_threshold=args.summary_token_threshold,
+            model=args.model
+        ))
+
+        print('Parsing done, saving to file...')
+
+        # Save results
+        md_name = os.path.splitext(os.path.basename(args.file_path))[0]
+        os.makedirs('./results', exist_ok=True)
+
+        with open(f'./results/{md_name}_structure.json', 'w', encoding='utf-8') as f:
+            json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)