From c22778f85d2de8ba2617ca3c0665867096686435 Mon Sep 17 00:00:00 2001
From: Ray
Date: Tue, 26 Aug 2025 16:30:12 +0800
Subject: [PATCH] fix node summary

---
 pageindex/page_index_md.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/pageindex/page_index_md.py b/pageindex/page_index_md.py
index 598a64c..002717c 100644
--- a/pageindex/page_index_md.py
+++ b/pageindex/page_index_md.py
@@ -5,14 +5,30 @@ import tiktoken
 from utils import *
 
 def count_tokens(text, model='gpt-4o'):
+    if not text:
+        return 0
     enc = tiktoken.encoding_for_model(model)
     tokens = enc.encode(text)
     return len(tokens)
 
+async def get_node_summary(node, summary_token_threshold=200, model=None):
+    """
+    This function gets the summary of a node.
+    If the node's text contains fewer than summary_token_threshold tokens, return the node's text as-is.
+    Otherwise, return a summary of the node generated by the LLM.
+    """
+    node_text = node.get('text')
+    num_tokens = count_tokens(node_text)
+    if num_tokens < summary_token_threshold:
+        return node_text
+    else:
+        return await generate_node_summary(node, model=model)
+
+
 async def generate_summaries_for_structure_md(structure, model="gpt-4.1"):
     nodes = structure_to_list(structure)
-    tasks = [generate_node_summary(node, model=model) for node in nodes]
+    tasks = [get_node_summary(node, model=model) for node in nodes]
     summaries = await asyncio.gather(*tasks)
     for node, summary in zip(nodes, summaries):