mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-04-24 23:56:21 +02:00
fix node summary
This commit is contained in:
parent
19faaad74f
commit
c22778f85d
1 changed files with 17 additions and 1 deletions
|
|
@ -5,14 +5,30 @@ import tiktoken
|
|||
from utils import *
|
||||
|
||||
def count_tokens(text, model='gpt-4o'):
    """Return the number of tokens in *text* under the given model's encoding.

    Falsy input (``None`` or an empty string) counts as zero tokens, so
    callers may pass unvalidated node text straight through.
    """
    if not text:
        return 0
    encoder = tiktoken.encoding_for_model(model)
    return len(encoder.encode(text))
|
||||
|
||||
|
||||
async def get_node_summary(node, summary_token_threshold=200, model=None):
    """Return a summary string for *node*.

    When the node's text is shorter than ``summary_token_threshold``
    tokens it is returned verbatim; otherwise an LLM-generated summary
    is produced via ``generate_node_summary``.
    """
    text = node.get('text')
    # Short texts need no LLM call — hand them back as-is.
    if count_tokens(text) < summary_token_threshold:
        return text
    return await generate_node_summary(node, model=model)
|
||||
|
||||
|
||||
async def generate_summaries_for_structure_md(structure, model="gpt-4.1"):
|
||||
nodes = structure_to_list(structure)
|
||||
tasks = [generate_node_summary(node, model=model) for node in nodes]
|
||||
tasks = [get_node_summary(node, model=model) for node in nodes]
|
||||
summaries = await asyncio.gather(*tasks)
|
||||
|
||||
for node, summary in zip(nodes, summaries):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue