Recognize whole-line bold as level-1 heading in markdown parser

extract_nodes_from_markdown now matches `**Title**` lines as level-1
headings (alongside ATX `#` headings) and attaches the heading level
on the producer side. extract_node_text_content now reads the level from
the node instead of re-running a `^#{1,6}` regex on the source line. That
regex check rejected lines without a leading `#`, so bold-heading nodes
from OCR / MinerU output were dropped with only a printed warning.

A whole-line bold heading always maps to level 1, even in documents that
also use `#` / `##` / `###` headings: bold-as-heading is a courtesy
heuristic for non-ATX markdown sources, and CommonMark has no concept of
bold heading depth.
This commit is contained in:
BukeLy 2026-04-28 15:23:34 +08:00
parent a51d97f63c
commit 6fd237986e

View file

@ -31,6 +31,7 @@ async def generate_summaries_for_structure_md(structure, summary_token_threshold
def extract_nodes_from_markdown(markdown_content):
header_pattern = r'^(#{1,6})\s+(.+)$'
bold_heading_pattern = r'^\*\*(.+?)\*\*\s*$'
code_block_pattern = r'^```'
node_list = []
@ -54,7 +55,14 @@ def extract_nodes_from_markdown(markdown_content):
match = re.match(header_pattern, stripped_line)
if match:
title = match.group(2).strip()
node_list.append({'node_title': title, 'line_num': line_num})
level = len(match.group(1))
node_list.append({'node_title': title, 'line_num': line_num, 'level': level})
continue
bold_match = re.match(bold_heading_pattern, stripped_line)
if bold_match:
title = bold_match.group(1).strip()
node_list.append({'node_title': title, 'line_num': line_num, 'level': 1})
return node_list, lines
@ -62,17 +70,10 @@ def extract_nodes_from_markdown(markdown_content):
def extract_node_text_content(node_list, markdown_lines):
all_nodes = []
for node in node_list:
line_content = markdown_lines[node['line_num'] - 1]
header_match = re.match(r'^(#{1,6})', line_content)
if header_match is None:
print(f"Warning: Line {node['line_num']} does not contain a valid header: '{line_content}'")
continue
processed_node = {
'title': node['node_title'],
'line_num': node['line_num'],
'level': len(header_match.group(1))
'level': node['level']
}
all_nodes.append(processed_node)