filter code

2026-07-03 20:41:02 +02:00 · 2025-08-27 15:12:45 +01:00 · 2025-08-27 15:12:45 +01:00 · edd1be353c
commit edd1be353c
parent 82ad5e2651
1 changed file with 30 additions and 10 deletions
--- a/pageindex/page_index_md.py
+++ b/pageindex/page_index_md.py
@@ -28,29 +28,48 @@ async def generate_summaries_for_structure_md(structure, summary_token_threshold
 def extract_nodes_from_markdown(markdown_content):
    header_pattern = r'^(#{1,6})\s+(.+)$'
    code_block_pattern = r'^```'
    node_list = []
    lines = markdown_content.split('\n')
    in_code_block = False
    for line_num, line in enumerate(lines, 1):
-        line = line.strip()
+        stripped_line = line.strip()
-        if not line:
+        
        # Check for code block delimiters (triple backticks)
        if re.match(code_block_pattern, stripped_line):
            in_code_block = not in_code_block
            continue
-        match = re.match(header_pattern, line)
+        # Skip empty lines
-        if match:
+        if not stripped_line:
-            title = match.group(2).strip()
+            continue
            node_list.append({'node_title': title, 'line_num': line_num})
-    return node_list,lines
+        # Only look for headers when not inside a code block
        if not in_code_block:
            match = re.match(header_pattern, stripped_line)
            if match:
                title = match.group(2).strip()
                node_list.append({'node_title': title, 'line_num': line_num})
    return node_list, lines
 def extract_node_text_content(node_list, markdown_lines):    
    all_nodes = []
    for node in node_list:
        line_content = markdown_lines[node['line_num'] - 1]
        header_match = re.match(r'^(#{1,6})', line_content)
        if header_match is None:
            print(f"Warning: Line {node['line_num']} does not contain a valid header: '{line_content}'")
            continue
        processed_node = {
            'title': node['node_title'],
            'line_num': node['line_num'],
-            'level': len(re.match(r'^(#{1,6})', markdown_lines[node['line_num'] - 1]).group(1))
+            'level': len(header_match.group(1))
        }
        all_nodes.append(processed_node)
@@ -250,7 +269,8 @@ if __name__ == "__main__":
    import os
    import json
-    MD_NAME = 'Detect-Order-Construct'
+    # MD_NAME = 'Detect-Order-Construct'
    MD_NAME = 'mcp'
    # MD_NAME = 'Welcome'
    MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'docs', f'{MD_NAME}.md')