Merge pull request #33 from VectifyAI/feat/markdown-tree

Feat/markdown tree
2026-05-16 18:25:14 +02:00 · 2025-08-27 15:39:06 +01:00 · 2025-08-27 15:39:06 +01:00 · 480f7583f7
commit 480f7583f7
parent d320011c45 3c770d833f
2 changed files with 35 additions and 13 deletions
--- a/pageindex/page_index_md.py
+++ b/pageindex/page_index_md.py
@@ -1,8 +1,10 @@
 import asyncio
 import json
 import re
-from .utils import *
+try:
-
+    from .utils import *
+except:
+    from utils import *
 async def get_node_summary(node, summary_token_threshold=200, model=None):
    node_text = node.get('text')
@@ -28,29 +30,48 @@ async def generate_summaries_for_structure_md(structure, summary_token_threshold
 def extract_nodes_from_markdown(markdown_content):
    header_pattern = r'^(#{1,6})\s+(.+)$'
    code_block_pattern = r'^```'
    node_list = []
    lines = markdown_content.split('\n')
    in_code_block = False
    for line_num, line in enumerate(lines, 1):
-        line = line.strip()
+        stripped_line = line.strip()
-        if not line:
+        
        # Check for code block delimiters (triple backticks)
        if re.match(code_block_pattern, stripped_line):
            in_code_block = not in_code_block
            continue
-        match = re.match(header_pattern, line)
+        # Skip empty lines
-        if match:
+        if not stripped_line:
-            title = match.group(2).strip()
+            continue
            node_list.append({'node_title': title, 'line_num': line_num})
-    return node_list,lines
+        # Only look for headers when not inside a code block
        if not in_code_block:
            match = re.match(header_pattern, stripped_line)
            if match:
                title = match.group(2).strip()
                node_list.append({'node_title': title, 'line_num': line_num})
    return node_list, lines
 def extract_node_text_content(node_list, markdown_lines):    
    all_nodes = []
    for node in node_list:
        line_content = markdown_lines[node['line_num'] - 1]
        header_match = re.match(r'^(#{1,6})', line_content)
        if header_match is None:
            print(f"Warning: Line {node['line_num']} does not contain a valid header: '{line_content}'")
            continue
        processed_node = {
            'title': node['node_title'],
            'line_num': node['line_num'],
-            'level': len(re.match(r'^(#{1,6})', markdown_lines[node['line_num'] - 1]).group(1))
+            'level': len(header_match.group(1))
        }
        all_nodes.append(processed_node)
@@ -250,7 +271,8 @@ if __name__ == "__main__":
    import os
    import json
-    MD_NAME = 'Detect-Order-Construct'
+    # MD_NAME = 'Detect-Order-Construct'
+    MD_NAME = 'mcp'
    # MD_NAME = 'Welcome'
    MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'docs', f'{MD_NAME}.md')
--- a/run_pageindex.py
+++ b/run_pageindex.py
@@ -26,7 +26,7 @@ if __name__ == "__main__":
    parser.add_argument('--if-add-node-text', type=str, default='no',
                      help='Whether to add text to the node')
    # Markdown specific arguments
-    parser.add_argument('--if-thinning', type=str, default='yes',
+    parser.add_argument('--if-thinning', type=str, default='no',
                      help='Whether to apply tree thinning for markdown (markdown only)')
    parser.add_argument('--thinning-threshold', type=int, default=5000,
                      help='Minimum token threshold for thinning (markdown only)')