mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-04-24 23:56:21 +02:00
add summary
This commit is contained in:
parent
04bbdae647
commit
802f149dd1
1 changed file with 11 additions and 6 deletions
|
|
@@ -2,8 +2,9 @@ import asyncio
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import tiktoken
|
import tiktoken
|
||||||
|
from utils import generate_summaries_for_structure
|
||||||
|
|
||||||
def count_tokens(text, model):
|
def count_tokens(text, model='gpt-4o'):
|
||||||
enc = tiktoken.encoding_for_model(model)
|
enc = tiktoken.encoding_for_model(model)
|
||||||
tokens = enc.encode(text)
|
tokens = enc.encode(text)
|
||||||
return len(tokens)
|
return len(tokens)
|
||||||
|
|
@@ -27,7 +28,7 @@ def extract_nodes_from_markdown(markdown_content):
|
||||||
return node_list,lines
|
return node_list,lines
|
||||||
|
|
||||||
|
|
||||||
def extract_node_text_content(node_list, markdown_lines, model="gpt-4o"):
|
def extract_node_text_content(node_list, markdown_lines):
|
||||||
all_nodes = []
|
all_nodes = []
|
||||||
for node in node_list:
|
for node in node_list:
|
||||||
processed_node = {
|
processed_node = {
|
||||||
|
|
@@ -45,7 +46,7 @@ def extract_node_text_content(node_list, markdown_lines, model="gpt-4o"):
|
||||||
end_line = len(markdown_lines)
|
end_line = len(markdown_lines)
|
||||||
|
|
||||||
node['text'] = '\n'.join(markdown_lines[start_line:end_line]).strip()
|
node['text'] = '\n'.join(markdown_lines[start_line:end_line]).strip()
|
||||||
node['text_token_count'] = count_tokens(node['text'], model)
|
node['text_token_count'] = count_tokens(node['text'])
|
||||||
|
|
||||||
return all_nodes
|
return all_nodes
|
||||||
|
|
||||||
|
|
@@ -157,7 +158,8 @@ def clean_tree_for_output(tree_nodes):
|
||||||
|
|
||||||
return cleaned_nodes
|
return cleaned_nodes
|
||||||
|
|
||||||
def md_to_tree(md_path, if_thinning=True, min_token_threshold=None):
|
|
||||||
|
async def md_to_tree(md_path, if_thinning=True, min_token_threshold=None, if_summary=True, model="gpt-4.1"):
|
||||||
with open(md_path, 'r', encoding='utf-8') as f:
|
with open(md_path, 'r', encoding='utf-8') as f:
|
||||||
markdown_content = f.read()
|
markdown_content = f.read()
|
||||||
|
|
||||||
|
|
@@ -170,6 +172,10 @@ def md_to_tree(md_path, if_thinning=True, min_token_threshold=None):
|
||||||
thinned_nodes = nodes_with_content
|
thinned_nodes = nodes_with_content
|
||||||
|
|
||||||
tree_structure = build_tree_from_nodes(thinned_nodes)
|
tree_structure = build_tree_from_nodes(thinned_nodes)
|
||||||
|
|
||||||
|
if if_summary:
|
||||||
|
tree_structure = await generate_summaries_for_structure(tree_structure, model=model)
|
||||||
|
|
||||||
return tree_structure
|
return tree_structure
|
||||||
|
|
||||||
|
|
||||||
|
|
@@ -179,9 +185,8 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
# Path to the Welcome.md file
|
# Path to the Welcome.md file
|
||||||
md_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'Welcome.md')
|
md_path = os.path.join(os.path.dirname(__file__), '..', 'docs', 'Welcome.md')
|
||||||
|
|
||||||
|
|
||||||
tree_structure = md_to_tree(md_path, if_thinning=True, min_token_threshold=100)
|
tree_structure = asyncio.run(md_to_tree(md_path, if_thinning=True, min_token_threshold=100, if_summary=True))
|
||||||
|
|
||||||
def print_tree(nodes, indent=0):
|
def print_tree(nodes, indent=0):
|
||||||
for node in nodes:
|
for node in nodes:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue