mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-06-09 19:45:15 +02:00
Recognize whole-line bold as level-1 heading in markdown parser
extract_nodes_from_markdown now matches `**Title**` lines as level-1
headings (alongside ATX `#` headings) and attaches the heading level
on the producer side. extract_node_text_content reads the level from
the node instead of re-running a `^#{1,6}` regex on the source line,
which was silently dropping bold-heading nodes from OCR / MinerU output.
Bold maps to level 1 even when mixed with `#` / `##` / `###` — bold-as-
heading is a courtesy heuristic for non-ATX markdown sources, and
CommonMark has no concept of bold heading depth.
This commit is contained in:
parent
a51d97f63c
commit
6fd237986e
1 changed file with 10 additions and 9 deletions
|
|
@@ -31,6 +31,7 @@ async def generate_summaries_for_structure_md(structure, summary_token_threshold
|
|||
|
||||
def extract_nodes_from_markdown(markdown_content):
|
||||
header_pattern = r'^(#{1,6})\s+(.+)$'
|
||||
bold_heading_pattern = r'^\*\*(.+?)\*\*\s*$'
|
||||
code_block_pattern = r'^```'
|
||||
node_list = []
|
||||
|
||||
|
|
@@ -54,7 +55,14 @@ def extract_nodes_from_markdown(markdown_content):
|
|||
match = re.match(header_pattern, stripped_line)
|
||||
if match:
|
||||
title = match.group(2).strip()
|
||||
node_list.append({'node_title': title, 'line_num': line_num})
|
||||
level = len(match.group(1))
|
||||
node_list.append({'node_title': title, 'line_num': line_num, 'level': level})
|
||||
continue
|
||||
|
||||
bold_match = re.match(bold_heading_pattern, stripped_line)
|
||||
if bold_match:
|
||||
title = bold_match.group(1).strip()
|
||||
node_list.append({'node_title': title, 'line_num': line_num, 'level': 1})
|
||||
|
||||
return node_list, lines
|
||||
|
||||
|
|
@@ -62,17 +70,10 @@ def extract_nodes_from_markdown(markdown_content):
|
|||
def extract_node_text_content(node_list, markdown_lines):
|
||||
all_nodes = []
|
||||
for node in node_list:
|
||||
line_content = markdown_lines[node['line_num'] - 1]
|
||||
header_match = re.match(r'^(#{1,6})', line_content)
|
||||
|
||||
if header_match is None:
|
||||
print(f"Warning: Line {node['line_num']} does not contain a valid header: '{line_content}'")
|
||||
continue
|
||||
|
||||
processed_node = {
|
||||
'title': node['node_title'],
|
||||
'line_num': node['line_num'],
|
||||
'level': len(header_match.group(1))
|
||||
'level': node['level']
|
||||
}
|
||||
all_nodes.append(processed_node)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue