PageIndex/pageindex/page_index_md.py

import asyncio
import json
import re
from utils import *


async def get_node_summary(node, summary_token_threshold=200, model=None):
    node_text = node.get('text')
    num_tokens = count_tokens(node_text, model=model)
    if num_tokens < summary_token_threshold:
        return node_text
    else:
        return await generate_node_summary(node, model=model)


async def generate_summaries_for_structure_md(structure, summary_token_threshold, model=None):
    nodes = structure_to_list(structure)
    tasks = [get_node_summary(node, summary_token_threshold=summary_token_threshold, model=model) for node in nodes]
    summaries = await asyncio.gather(*tasks)

    for node, summary in zip(nodes, summaries):
        if not node.get('nodes'):
            node['summary'] = summary
        else:
            node['prefix_summary'] = summary
    return structure


def extract_nodes_from_markdown(markdown_content):
    header_pattern = r'^(#{1,6})\s+(.+)$'
    node_list = []

    lines = markdown_content.split('\n')
    for line_num, line in enumerate(lines, 1):
        line = line.strip()
        if not line:
            continue

        match = re.match(header_pattern, line)
        if match:
            title = match.group(2).strip()
            node_list.append({'node_title': title, 'line_num': line_num})

    return node_list,lines


def extract_node_text_content(node_list, markdown_lines):
    all_nodes = []
    for node in node_list:
        processed_node = {
            'title': node['node_title'],
            'line_num': node['line_num'],
            'level': len(re.match(r'^(#{1,6})', markdown_lines[node['line_num'] - 1]).group(1))
        }
        all_nodes.append(processed_node)

    for i, node in enumerate(all_nodes):
        start_line = node['line_num'] - 1
        if i + 1 < len(all_nodes):
            end_line = all_nodes[i + 1]['line_num'] - 1
        else:
            end_line = len(markdown_lines)

        node['text'] = '\n'.join(markdown_lines[start_line:end_line]).strip()
    return all_nodes

def update_node_list_with_text_token_count(node_list, model=None):

    def find_all_children(parent_index, parent_level, node_list):
        """Find all direct and indirect children of a parent node"""
        children_indices = []

        # Look for children after the parent
        for i in range(parent_index + 1, len(node_list)):
            current_level = node_list[i]['level']

            # If we hit a node at same or higher level than parent, stop
            if current_level <= parent_level:
                break

            # This is a descendant
            children_indices.append(i)

        return children_indices

    # Make a copy to avoid modifying the original
    result_list = node_list.copy()

    # Process nodes from end to beginning to ensure children are processed before parents
    for i in range(len(result_list) - 1, -1, -1):
        current_node = result_list[i]
        current_level = current_node['level']

        # Get all children of this node
        children_indices = find_all_children(i, current_level, result_list)

        # Start with the node's own text
        node_text = current_node.get('text', '')
        total_text = node_text

        # Add all children's text
        for child_index in children_indices:
            child_text = result_list[child_index].get('text', '')
            if child_text:
                total_text += '\n' + child_text

        # Calculate token count for combined text
        result_list[i]['text_token_count'] = count_tokens(total_text, model=model)

    return result_list


def tree_thinning_for_index(node_list, min_node_token=None, model=None):
    def find_all_children(parent_index, parent_level, node_list):
        children_indices = []

        for i in range(parent_index + 1, len(node_list)):
            current_level = node_list[i]['level']

            if current_level <= parent_level:
                break

            children_indices.append(i)

        return children_indices

    result_list = node_list.copy()
    nodes_to_remove = set()

    for i in range(len(result_list) - 1, -1, -1):
        if i in nodes_to_remove:
            continue

        current_node = result_list[i]
        current_level = current_node['level']

        total_tokens = current_node.get('text_token_count', 0)

        if total_tokens < min_node_token:
            children_indices = find_all_children(i, current_level, result_list)

            children_texts = []
            for child_index in sorted(children_indices):
                if child_index not in nodes_to_remove:
                    child_text = result_list[child_index].get('text', '')
                    if child_text.strip():
                        children_texts.append(child_text)
                    nodes_to_remove.add(child_index)

            if children_texts:
                parent_text = current_node.get('text', '')
                merged_text = parent_text
                for child_text in children_texts:
                    if merged_text and not merged_text.endswith('\n'):
                        merged_text += '\n\n'
                    merged_text += child_text

                result_list[i]['text'] = merged_text

                result_list[i]['text_token_count'] = count_tokens(merged_text, model=model)

    for index in sorted(nodes_to_remove, reverse=True):
        result_list.pop(index)

    return result_list


def build_tree_from_nodes(node_list):
    if not node_list:
        return []

    stack = []
    root_nodes = []
    node_counter = 1

    for node in node_list:
        current_level = node['level']

        tree_node = {
            'title': node['title'],
            'node_id': str(node_counter).zfill(4),
            'text': node['text'],
            'line_num': node['line_num'],
            'nodes': []
        }
        node_counter += 1

        while stack and stack[-1][1] >= current_level:
            stack.pop()

        if not stack:
            root_nodes.append(tree_node)
        else:
            parent_node, parent_level = stack[-1]
            parent_node['nodes'].append(tree_node)

        stack.append((tree_node, current_level))

    return root_nodes


def clean_tree_for_output(tree_nodes):
    cleaned_nodes = []

    for node in tree_nodes:
        cleaned_node = {
            'title': node['title'],
            'node_id': node['node_id'],
            'text': node['text'],
            'line_num': node['line_num']
        }

        if node['nodes']:
            cleaned_node['nodes'] = clean_tree_for_output(node['nodes'])

        cleaned_nodes.append(cleaned_node)

    return cleaned_nodes


async def md_to_tree(md_path, if_thinning=True, min_token_threshold=None, if_summary=True, summary_token_threshold=None, model=None):
    with open(md_path, 'r', encoding='utf-8') as f:
        markdown_content = f.read()

    print(f"Extracting nodes from markdown...")
    node_list, markdown_lines = extract_nodes_from_markdown(markdown_content)

    print(f"Extracting text content from nodes...")
    nodes_with_content = extract_node_text_content(node_list, markdown_lines)

    if if_thinning:
        nodes_with_content = update_node_list_with_text_token_count(nodes_with_content, model=model)
        print(f"Thinning nodes...")
        nodes_with_content = tree_thinning_for_index(nodes_with_content, min_token_threshold, model=model)

    print(f"Building tree from nodes...")
    tree_structure = build_tree_from_nodes(nodes_with_content)

    if if_summary:
        print(f"Generating summaries for each node...")
        tree_structure = await generate_summaries_for_structure_md(tree_structure,summary_token_threshold=summary_token_threshold, model=model)

    print(f"Formatting tree structure...")
    tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])

    return tree_structure


if __name__ == "__main__":
    import os
    import json

    MD_NAME = 'Detect-Order-Construct'
    # MD_NAME = 'Welcome'
    MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'docs', f'{MD_NAME}.md')


    MODEL="gpt-4.1"
    IF_THINNING=False
    THINNING_THRESHOLD=5000
    SUMMARY_TOKEN_THRESHOLD=200
    IF_SUMMARY=True

    tree_structure = asyncio.run(md_to_tree(
        md_path=MD_PATH,
        if_thinning=IF_THINNING,
        min_token_threshold=THINNING_THRESHOLD,
        if_summary=IF_SUMMARY,
        summary_token_threshold=SUMMARY_TOKEN_THRESHOLD,
        model=MODEL))

    tree_structure=remove_fields(tree_structure, fields=['text'])

    print('\n' + '='*60)
    print('TREE STRUCTURE')
    print('='*60)
    print_json(tree_structure)

    print('\n' + '='*60)
    print('TABLE OF CONTENTS')
    print('='*60)
    print_toc(tree_structure)

    output_path = os.path.join(os.path.dirname(__file__), '..', 'results', f'{MD_NAME}_structure.json')
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(tree_structure, f, indent=2, ensure_ascii=False)

    print(f"\nTree structure saved to: {output_path}")