add markdown_to_tree

This commit is contained in:
zmtomorrow 2025-08-26 12:17:05 +01:00
parent c22778f85d
commit 78cce56b33
3 changed files with 337 additions and 46 deletions

212
pageindex/node_list.json Normal file

File diff suppressed because one or more lines are too long

View file

@ -1,34 +1,21 @@
import asyncio import asyncio
import json import json
import re import re
import tiktoken
from utils import * from utils import *
def count_tokens(text, model='gpt-4o'):
    """Return the number of tokens in *text* under *model*'s tiktoken encoding.

    Empty or falsy input counts as zero tokens without touching the encoder.
    """
    if not text:
        return 0
    encoder = tiktoken.encoding_for_model(model)
    return len(encoder.encode(text))
async def get_node_summary(node, summary_token_threshold=200, model=None): async def get_node_summary(node, summary_token_threshold=200, model=None):
"""
This function gets the summary of a node.
If the node's text is less than summary_token_threshold, return the node's text.
Otherwise, return the node's summary generated by LLM.
"""
node_text = node.get('text') node_text = node.get('text')
num_tokens = count_tokens(node_text) num_tokens = count_tokens(node_text, model=model)
if num_tokens < summary_token_threshold: if num_tokens < summary_token_threshold:
return node_text return node_text
else: else:
return await generate_node_summary(node, model=model) return await generate_node_summary(node, model=model)
async def generate_summaries_for_structure_md(structure, model="gpt-4.1"): async def generate_summaries_for_structure_md(structure, summary_token_threshold, model=None):
nodes = structure_to_list(structure) nodes = structure_to_list(structure)
tasks = [get_node_summary(node, model=model) for node in nodes] tasks = [get_node_summary(node, summary_token_threshold=summary_token_threshold, model=model) for node in nodes]
summaries = await asyncio.gather(*tasks) summaries = await asyncio.gather(*tasks)
for node, summary in zip(nodes, summaries): for node, summary in zip(nodes, summaries):
@ -74,13 +61,56 @@ def extract_node_text_content(node_list, markdown_lines):
else: else:
end_line = len(markdown_lines) end_line = len(markdown_lines)
node['text'] = '\n'.join(markdown_lines[start_line:end_line]).strip() node['text'] = '\n'.join(markdown_lines[start_line:end_line]).strip()
node['text_token_count'] = count_tokens(node['text'])
return all_nodes return all_nodes
def update_node_list_with_text_token_count(node_list, model=None):
    """Annotate every node with 'text_token_count' covering the node's own text
    plus the text of all of its descendants.

    A node's descendants are the contiguous run of following nodes whose
    'level' is strictly greater than the node's own level (heading-outline
    semantics).

    NOTE(review): ``node_list.copy()`` is a shallow copy, so the node dicts in
    the input list are mutated in place — callers appear to rely on this.

    :param node_list: flat list of node dicts with 'level' and 'text' keys
    :param model: model name forwarded to count_tokens for encoder selection
    :return: the (shallow-copied) list with 'text_token_count' set on each node
    """
    def find_all_children(parent_index, parent_level, node_list):
        """Find all direct and indirect children of a parent node"""
        children_indices = []
        # Descendants follow the parent until a node at the same or a
        # shallower level closes the subtree.
        for i in range(parent_index + 1, len(node_list)):
            if node_list[i]['level'] <= parent_level:
                break
            children_indices.append(i)
        return children_indices

    # Shallow copy: the list object is new, the node dicts are shared.
    result_list = node_list.copy()

    # Walk from the end so every child is visited before its parent.
    for i in range(len(result_list) - 1, -1, -1):
        current_node = result_list[i]
        children_indices = find_all_children(i, current_node['level'], result_list)

        # Collect the node's own text plus all non-empty descendant texts,
        # then join once (avoids quadratic += string concatenation).
        parts = [current_node.get('text', '')]
        for child_index in children_indices:
            child_text = result_list[child_index].get('text', '')
            if child_text:
                parts.append(child_text)
        total_text = '\n'.join(parts)

        result_list[i]['text_token_count'] = count_tokens(total_text, model=model)

    return result_list
def tree_thinning_for_index(node_list, min_node_token=None, model=None):
def find_all_children(parent_index, parent_level, node_list): def find_all_children(parent_index, parent_level, node_list):
children_indices = [] children_indices = []
@ -127,7 +157,7 @@ def tree_thinning_for_index(node_list, min_node_token=None):
result_list[i]['text'] = merged_text result_list[i]['text'] = merged_text
result_list[i]['text_token_count'] = count_tokens(merged_text, "gpt-4o") result_list[i]['text_token_count'] = count_tokens(merged_text, model=model)
for index in sorted(nodes_to_remove, reverse=True): for index in sorted(nodes_to_remove, reverse=True):
result_list.pop(index) result_list.pop(index)
@ -188,25 +218,31 @@ def clean_tree_for_output(tree_nodes):
return cleaned_nodes return cleaned_nodes
async def md_to_tree(md_path, if_thinning=True, min_token_threshold=None, if_summary=True): async def md_to_tree(md_path, if_thinning=True, min_token_threshold=None, if_summary=True, summary_token_threshold=None, model=None):
with open(md_path, 'r', encoding='utf-8') as f: with open(md_path, 'r', encoding='utf-8') as f:
markdown_content = f.read() markdown_content = f.read()
print(f"Extracting nodes from markdown...")
node_list, markdown_lines = extract_nodes_from_markdown(markdown_content) node_list, markdown_lines = extract_nodes_from_markdown(markdown_content)
print(f"Extracting text content from nodes...")
nodes_with_content = extract_node_text_content(node_list, markdown_lines) nodes_with_content = extract_node_text_content(node_list, markdown_lines)
if if_thinning: if if_thinning:
thinned_nodes = tree_thinning_for_index(nodes_with_content, min_token_threshold) nodes_with_content = update_node_list_with_text_token_count(nodes_with_content, model=model)
else: print(f"Thinning nodes...")
thinned_nodes = nodes_with_content nodes_with_content = tree_thinning_for_index(nodes_with_content, min_token_threshold, model=model)
tree_structure = build_tree_from_nodes(thinned_nodes) print(f"Building tree from nodes...")
tree_structure = build_tree_from_nodes(nodes_with_content)
if if_summary: if if_summary:
tree_structure = await generate_summaries_for_structure_md(tree_structure) print(f"Generating summaries for each node...")
tree_structure = await generate_summaries_for_structure_md(tree_structure,summary_token_threshold=summary_token_threshold, model=model)
print(f"Formatting tree structure...")
tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes']) tree_structure = format_structure(tree_structure, order = ['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])
return tree_structure return tree_structure
@ -214,27 +250,41 @@ if __name__ == "__main__":
import os import os
import json import json
# Path to the Welcome.md file MD_NAME = 'Detect-Order-Construct'
md_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'Welcome.md') # MD_NAME = 'Welcome'
MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'docs', f'{MD_NAME}.md')
tree_structure = asyncio.run(md_to_tree(md_path, if_thinning=True, min_token_threshold=100, if_summary=True))
def print_tree(nodes, indent=0): MODEL="gpt-4.1"
for node in nodes: IF_THINNING=False
prefix = " " * indent THINNING_THRESHOLD=5000
has_children = 'nodes' in node and node['nodes'] SUMMARY_TOKEN_THRESHOLD=200
children_info = f" ({len(node['nodes'])} children)" if has_children else "" IF_SUMMARY=True
print(f"{prefix}- {node['title']} [ID: {node['node_id']}]{children_info}")
if has_children: tree_structure = asyncio.run(md_to_tree(
print_tree(node['nodes'], indent + 1) md_path=MD_PATH,
if_thinning=IF_THINNING,
min_token_threshold=THINNING_THRESHOLD,
if_summary=IF_SUMMARY,
summary_token_threshold=SUMMARY_TOKEN_THRESHOLD,
model=MODEL))
tree_structure=remove_fields(tree_structure, fields=['text'])
print("\n🌳 Tree Structure:") print('\n' + '='*60)
print_tree(tree_structure) print('TREE STRUCTURE')
print('='*60)
output_path = os.path.join(os.path.dirname(__file__), '..', 'results', 'Welcome_structure.json') print_json(tree_structure)
print('\n' + '='*60)
print('TABLE OF CONTENTS')
print('='*60)
print_toc(tree_structure)
output_path = os.path.join(os.path.dirname(__file__), '..', 'results', f'{MD_NAME}_structure.json')
os.makedirs(os.path.dirname(output_path), exist_ok=True) os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f: with open(output_path, 'w', encoding='utf-8') as f:
json.dump(tree_structure, f, indent=2, ensure_ascii=False) json.dump(tree_structure, f, indent=2, ensure_ascii=False)
print(f"\n💾 Tree structure saved to: {output_path}") print(f"\nTree structure saved to: {output_path}")

View file

@ -19,8 +19,9 @@ from types import SimpleNamespace as config
CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY") CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
def count_tokens(text, model=None):
def count_tokens(text, model): if not text:
return 0
enc = tiktoken.encoding_for_model(model) enc = tiktoken.encoding_for_model(model)
tokens = enc.encode(text) tokens = enc.encode(text)
return len(tokens) return len(tokens)
@ -489,6 +490,34 @@ def clean_structure_post(data):
clean_structure_post(section) clean_structure_post(section)
return data return data
def remove_fields(data, fields=('text',)):
    """Recursively return a copy of *data* with the given dict keys removed.

    Dicts and lists are rebuilt (the input is not mutated); any other value
    is returned unchanged.

    :param data: arbitrarily nested structure of dicts/lists/scalars
    :param fields: keys to drop from every dict (default drops 'text');
        the default is a tuple rather than a list to avoid the shared
        mutable-default-argument pitfall
    :return: a new structure without the listed keys
    """
    if isinstance(data, dict):
        return {k: remove_fields(v, fields)
                for k, v in data.items() if k not in fields}
    if isinstance(data, list):
        return [remove_fields(item, fields) for item in data]
    return data
def print_toc(tree, indent=0):
    """Print an indented table of contents: one title per line, children
    indented one extra space per nesting level."""
    for entry in tree:
        print(f"{' ' * indent}{entry['title']}")
        children = entry.get('nodes')
        if children:
            print_toc(children, indent + 1)
def print_json(data, max_len=40, indent=2):
    """Pretty-print *data* as JSON, truncating every string longer than
    *max_len* characters to a '...'-suffixed preview.

    :param data: arbitrarily nested structure of dicts/lists/scalars
    :param max_len: maximum string length kept before truncation
    :param indent: indentation width passed to json.dumps
    """
    def _shorten(value):
        # Recurse through containers; clip only over-long strings.
        if isinstance(value, dict):
            return {key: _shorten(val) for key, val in value.items()}
        if isinstance(value, list):
            return [_shorten(item) for item in value]
        if isinstance(value, str) and len(value) > max_len:
            return value[:max_len] + '...'
        return value

    print(json.dumps(_shorten(data), indent=indent, ensure_ascii=False))
def remove_structure_text(data): def remove_structure_text(data):
if isinstance(data, dict): if isinstance(data, dict):