mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-05-16 18:25:14 +02:00
Merge pull request #33 from VectifyAI/feat/markdown-tree
Feat/markdown tree
This commit is contained in:
commit
480f7583f7
2 changed files with 35 additions and 13 deletions
|
|
@@ -1,8 +1,10 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
from .utils import *
|
try:
|
||||||
|
from .utils import *
|
||||||
|
except:
|
||||||
|
from utils import *
|
||||||
|
|
||||||
async def get_node_summary(node, summary_token_threshold=200, model=None):
|
async def get_node_summary(node, summary_token_threshold=200, model=None):
|
||||||
node_text = node.get('text')
|
node_text = node.get('text')
|
||||||
|
|
@@ -28,29 +30,48 @@ async def generate_summaries_for_structure_md(structure, summary_token_threshold
|
||||||
|
|
||||||
def extract_nodes_from_markdown(markdown_content):
|
def extract_nodes_from_markdown(markdown_content):
|
||||||
header_pattern = r'^(#{1,6})\s+(.+)$'
|
header_pattern = r'^(#{1,6})\s+(.+)$'
|
||||||
|
code_block_pattern = r'^```'
|
||||||
node_list = []
|
node_list = []
|
||||||
|
|
||||||
lines = markdown_content.split('\n')
|
lines = markdown_content.split('\n')
|
||||||
|
in_code_block = False
|
||||||
|
|
||||||
for line_num, line in enumerate(lines, 1):
|
for line_num, line in enumerate(lines, 1):
|
||||||
line = line.strip()
|
stripped_line = line.strip()
|
||||||
if not line:
|
|
||||||
|
# Check for code block delimiters (triple backticks)
|
||||||
|
if re.match(code_block_pattern, stripped_line):
|
||||||
|
in_code_block = not in_code_block
|
||||||
continue
|
continue
|
||||||
|
|
||||||
match = re.match(header_pattern, line)
|
# Skip empty lines
|
||||||
if match:
|
if not stripped_line:
|
||||||
title = match.group(2).strip()
|
continue
|
||||||
node_list.append({'node_title': title, 'line_num': line_num})
|
|
||||||
|
|
||||||
return node_list,lines
|
# Only look for headers when not inside a code block
|
||||||
|
if not in_code_block:
|
||||||
|
match = re.match(header_pattern, stripped_line)
|
||||||
|
if match:
|
||||||
|
title = match.group(2).strip()
|
||||||
|
node_list.append({'node_title': title, 'line_num': line_num})
|
||||||
|
|
||||||
|
return node_list, lines
|
||||||
|
|
||||||
|
|
||||||
def extract_node_text_content(node_list, markdown_lines):
|
def extract_node_text_content(node_list, markdown_lines):
|
||||||
all_nodes = []
|
all_nodes = []
|
||||||
for node in node_list:
|
for node in node_list:
|
||||||
|
line_content = markdown_lines[node['line_num'] - 1]
|
||||||
|
header_match = re.match(r'^(#{1,6})', line_content)
|
||||||
|
|
||||||
|
if header_match is None:
|
||||||
|
print(f"Warning: Line {node['line_num']} does not contain a valid header: '{line_content}'")
|
||||||
|
continue
|
||||||
|
|
||||||
processed_node = {
|
processed_node = {
|
||||||
'title': node['node_title'],
|
'title': node['node_title'],
|
||||||
'line_num': node['line_num'],
|
'line_num': node['line_num'],
|
||||||
'level': len(re.match(r'^(#{1,6})', markdown_lines[node['line_num'] - 1]).group(1))
|
'level': len(header_match.group(1))
|
||||||
}
|
}
|
||||||
all_nodes.append(processed_node)
|
all_nodes.append(processed_node)
|
||||||
|
|
||||||
|
|
@@ -250,7 +271,8 @@ if __name__ == "__main__":
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
|
|
||||||
MD_NAME = 'Detect-Order-Construct'
|
# MD_NAME = 'Detect-Order-Construct'
|
||||||
|
MD_NAME = 'mcp'
|
||||||
# MD_NAME = 'Welcome'
|
# MD_NAME = 'Welcome'
|
||||||
MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'docs', f'{MD_NAME}.md')
|
MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'docs', f'{MD_NAME}.md')
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -26,7 +26,7 @@ if __name__ == "__main__":
|
||||||
parser.add_argument('--if-add-node-text', type=str, default='no',
|
parser.add_argument('--if-add-node-text', type=str, default='no',
|
||||||
help='Whether to add text to the node')
|
help='Whether to add text to the node')
|
||||||
# Markdown specific arguments
|
# Markdown specific arguments
|
||||||
parser.add_argument('--if-thinning', type=str, default='yes',
|
parser.add_argument('--if-thinning', type=str, default='no',
|
||||||
help='Whether to apply tree thinning for markdown (markdown only)')
|
help='Whether to apply tree thinning for markdown (markdown only)')
|
||||||
parser.add_argument('--thinning-threshold', type=int, default=5000,
|
parser.add_argument('--thinning-threshold', type=int, default=5000,
|
||||||
help='Minimum token threshold for thinning (markdown only)')
|
help='Minimum token threshold for thinning (markdown only)')
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue