From c22778f85d2de8ba2617ca3c0665867096686435 Mon Sep 17 00:00:00 2001
From: Ray
Date: Tue, 26 Aug 2025 16:30:12 +0800
Subject: [PATCH] fix node summary

---
 pageindex/page_index_md.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/pageindex/page_index_md.py b/pageindex/page_index_md.py
index 598a64c..002717c 100644
--- a/pageindex/page_index_md.py
+++ b/pageindex/page_index_md.py
@@ -5,14 +5,30 @@ import tiktoken
 from utils import *
 
 def count_tokens(text, model='gpt-4o'):
+    if not text:
+        return 0
     enc = tiktoken.encoding_for_model(model)
     tokens = enc.encode(text)
     return len(tokens)
 
+async def get_node_summary(node, summary_token_threshold=200, model=None):
+    """
+    This function gets the summary of a node.
+    If the node's text contains fewer than summary_token_threshold tokens, return the node's text as-is.
+    Otherwise, return a summary of the node generated by the LLM.
+    """
+    node_text = node.get('text')
+    num_tokens = count_tokens(node_text)
+    if num_tokens < summary_token_threshold:
+        return node_text
+    else:
+        return await generate_node_summary(node, model=model)
+
+
 async def generate_summaries_for_structure_md(structure, model="gpt-4.1"):
     nodes = structure_to_list(structure)
-    tasks = [generate_node_summary(node, model=model) for node in nodes]
+    tasks = [get_node_summary(node, model=model) for node in nodes]
     summaries = await asyncio.gather(*tasks)
     for node, summary in zip(nodes, summaries):