mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-04-24 23:56:21 +02:00
Merge pull request #32 from VectifyAI/feat/markdown-tree
Feat/markdown tree
This commit is contained in:
commit
d320011c45
5 changed files with 421 additions and 32 deletions
|
|
@ -1 +1,2 @@
|
||||||
from .page_index import *
|
from .page_index import *
|
||||||
|
from .page_index_md import md_to_tree
|
||||||
288
pageindex/page_index_md.py
Normal file
288
pageindex/page_index_md.py
Normal file
|
|
@ -0,0 +1,288 @@
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from .utils import *
|
||||||
|
|
||||||
|
|
||||||
|
async def get_node_summary(node, summary_token_threshold=200, model=None):
    """Return a summary for a tree node.

    A node whose text is shorter than ``summary_token_threshold`` tokens is
    short enough to act as its own summary and is returned verbatim; longer
    nodes are summarised by the LLM via ``generate_node_summary``.
    """
    text = node.get('text')
    if count_tokens(text, model=model) < summary_token_threshold:
        return text
    return await generate_node_summary(node, model=model)
|
||||||
|
|
||||||
|
|
||||||
|
async def generate_summaries_for_structure_md(structure, summary_token_threshold, model=None):
    """Concurrently generate a summary for every node in the tree.

    Leaf nodes (no 'nodes' list) receive the summary under 'summary';
    internal nodes receive it under 'prefix_summary'.  The structure is
    modified in place and also returned.
    """
    flat_nodes = structure_to_list(structure)

    # One summarisation task per node, all awaited together.
    summaries = await asyncio.gather(*(
        get_node_summary(n, summary_token_threshold=summary_token_threshold, model=model)
        for n in flat_nodes
    ))

    for flat_node, summary in zip(flat_nodes, summaries):
        key = 'prefix_summary' if flat_node.get('nodes') else 'summary'
        flat_node[key] = summary

    return structure
|
||||||
|
|
||||||
|
|
||||||
|
def extract_nodes_from_markdown(markdown_content):
    """Scan markdown text for ATX headers (`#` .. `######`).

    Returns a tuple ``(node_list, lines)`` where ``node_list`` holds one
    ``{'node_title': ..., 'line_num': ...}`` dict per header (1-based line
    numbers) and ``lines`` is the raw content split on newlines.
    """
    header_re = re.compile(r'^(#{1,6})\s+(.+)$')
    lines = markdown_content.split('\n')

    node_list = []
    for line_num, raw in enumerate(lines, 1):
        stripped = raw.strip()
        if not stripped:
            continue
        header = header_re.match(stripped)
        if header is None:
            continue
        node_list.append({
            'node_title': header.group(2).strip(),
            'line_num': line_num,
        })

    return node_list, lines
|
||||||
|
|
||||||
|
|
||||||
|
def extract_node_text_content(node_list, markdown_lines):
    """Attach heading level and body text to each header node.

    Args:
        node_list: dicts with 'node_title' and 1-based 'line_num', as
            produced by ``extract_nodes_from_markdown``.
        markdown_lines: the raw document lines.

    Returns:
        A new list of dicts with 'title', 'line_num', 'level' (number of
        leading '#') and 'text' (the header line plus everything up to the
        next header, stripped).
    """
    all_nodes = []
    for node in node_list:
        raw_line = markdown_lines[node['line_num'] - 1]
        # Headers were detected on *stripped* lines, so tolerate leading
        # whitespace here too.  The previous anchor r'^(#{1,6})' returned
        # None for an indented header and crashed with AttributeError.
        level_match = re.match(r'^\s*(#{1,6})', raw_line)
        all_nodes.append({
            'title': node['node_title'],
            'line_num': node['line_num'],
            'level': len(level_match.group(1)) if level_match else 1,
        })

    for i, node in enumerate(all_nodes):
        start_line = node['line_num'] - 1
        if i + 1 < len(all_nodes):
            # Text runs until the next header line (exclusive).
            end_line = all_nodes[i + 1]['line_num'] - 1
        else:
            end_line = len(markdown_lines)

        node['text'] = '\n'.join(markdown_lines[start_line:end_line]).strip()

    return all_nodes
|
||||||
|
|
||||||
|
def update_node_list_with_text_token_count(node_list, model=None):
    """Annotate each node with the token count of its subtree's text.

    A node's subtree text is its own 'text' plus the 'text' of every
    descendant (the contiguous run of following nodes with a deeper
    'level').  Returns a new list of new dicts carrying an added
    'text_token_count' key; the input is left untouched.
    """

    def find_all_children(parent_index, parent_level, node_list):
        """Find all direct and indirect children of a parent node."""
        children_indices = []
        for i in range(parent_index + 1, len(node_list)):
            # A node at the same or shallower level ends the subtree.
            if node_list[i]['level'] <= parent_level:
                break
            children_indices.append(i)
        return children_indices

    # Copy each dict as well: a bare list.copy() is shallow, so writing
    # 'text_token_count' below would still mutate the caller's nodes.
    result_list = [dict(node) for node in node_list]

    # Process from the end so children are handled before their parents.
    for i in range(len(result_list) - 1, -1, -1):
        current_node = result_list[i]
        children_indices = find_all_children(i, current_node['level'], result_list)

        # Combine the node's own text with every descendant's text.
        total_text = current_node.get('text', '')
        for child_index in children_indices:
            child_text = result_list[child_index].get('text', '')
            if child_text:
                total_text += '\n' + child_text

        result_list[i]['text_token_count'] = count_tokens(total_text, model=model)

    return result_list
|
||||||
|
|
||||||
|
|
||||||
|
def tree_thinning_for_index(node_list, min_node_token=None, model=None):
    """Merge under-sized nodes with their descendants.

    A node whose 'text_token_count' is below ``min_node_token`` absorbs
    the text of all its (still-present) descendants, which are then
    removed from the flat list.  With ``min_node_token=None`` thinning is
    disabled and a (dict-level) copy of the input is returned unchanged.
    Returns a new list of new dicts; the input is not modified.
    """

    def find_all_children(parent_index, parent_level, node_list):
        # Descendants are the contiguous run of deeper-level nodes
        # immediately following the parent.
        children_indices = []
        for i in range(parent_index + 1, len(node_list)):
            if node_list[i]['level'] <= parent_level:
                break
            children_indices.append(i)
        return children_indices

    # Copy each dict: a bare list.copy() is shallow and the merge below
    # rewrites 'text', which would corrupt the caller's nodes.
    result_list = [dict(node) for node in node_list]

    # No threshold -> nothing to thin (avoids an 'int < None' TypeError
    # when the default min_node_token=None is left in place).
    if min_node_token is None:
        return result_list

    nodes_to_remove = set()

    # End-to-start so deeper parents merge before shallower ancestors.
    for i in range(len(result_list) - 1, -1, -1):
        if i in nodes_to_remove:
            continue

        current_node = result_list[i]

        if current_node.get('text_token_count', 0) < min_node_token:
            children_indices = find_all_children(i, current_node['level'], result_list)

            # Collect remaining descendants' text and mark them removed.
            children_texts = []
            for child_index in sorted(children_indices):
                if child_index not in nodes_to_remove:
                    child_text = result_list[child_index].get('text', '')
                    if child_text.strip():
                        children_texts.append(child_text)
                    nodes_to_remove.add(child_index)

            if children_texts:
                merged_text = current_node.get('text', '')
                for child_text in children_texts:
                    if merged_text and not merged_text.endswith('\n'):
                        merged_text += '\n\n'
                    merged_text += child_text

                result_list[i]['text'] = merged_text
                result_list[i]['text_token_count'] = count_tokens(merged_text, model=model)

    # Delete highest indices first so earlier indices stay valid.
    for index in sorted(nodes_to_remove, reverse=True):
        result_list.pop(index)

    return result_list
|
||||||
|
|
||||||
|
|
||||||
|
def build_tree_from_nodes(node_list):
    """Convert a flat, document-ordered node list into a nested tree.

    Heading levels drive nesting: each node becomes a child of the nearest
    preceding node with a strictly shallower level; nodes with no such
    ancestor become roots.  Node ids are assigned in document order as
    zero-padded 4-digit strings.
    """
    if not node_list:
        return []

    roots = []
    open_path = []  # (tree_node, level) pairs along the current branch

    for counter, flat_node in enumerate(node_list, start=1):
        level = flat_node['level']
        entry = {
            'title': flat_node['title'],
            'node_id': f'{counter:04d}',
            'text': flat_node['text'],
            'line_num': flat_node['line_num'],
            'nodes': [],
        }

        # Unwind to the nearest strictly-shallower ancestor.
        while open_path and open_path[-1][1] >= level:
            open_path.pop()

        if open_path:
            open_path[-1][0]['nodes'].append(entry)
        else:
            roots.append(entry)

        open_path.append((entry, level))

    return roots
|
||||||
|
|
||||||
|
|
||||||
|
def clean_tree_for_output(tree_nodes):
    """Recursively copy the tree, dropping empty 'nodes' lists.

    Leaf nodes lose the 'nodes' key entirely; internal nodes keep a
    cleaned child list.  The input tree is not modified.
    """
    result = []
    for node in tree_nodes:
        copied = {key: node[key] for key in ('title', 'node_id', 'text', 'line_num')}
        if node['nodes']:
            copied['nodes'] = clean_tree_for_output(node['nodes'])
        result.append(copied)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
async def md_to_tree(md_path, if_thinning=True, min_token_threshold=None, if_summary=True, summary_token_threshold=None, model=None):
    """Build a PageIndex tree structure from a markdown file.

    Pipeline: read file -> extract headers -> attach text/levels ->
    optional thinning of small nodes -> nest into a tree -> optional
    per-node summaries -> reorder keys for output.

    Args:
        md_path: path to the markdown file (read as UTF-8).
        if_thinning: merge nodes below ``min_token_threshold`` tokens.
        min_token_threshold: token floor used when thinning.
        if_summary: generate LLM summaries for each node.
        summary_token_threshold: below this, a node's text is its summary.
        model: model name forwarded to tokenisation/summarisation helpers.
    """
    with open(md_path, 'r', encoding='utf-8') as f:
        markdown_content = f.read()

    # Note: f-prefixes removed from placeholder-free literals (ruff F541).
    print("Extracting nodes from markdown...")
    node_list, markdown_lines = extract_nodes_from_markdown(markdown_content)

    print("Extracting text content from nodes...")
    nodes_with_content = extract_node_text_content(node_list, markdown_lines)

    if if_thinning:
        # Token counts are only needed to decide what to thin.
        nodes_with_content = update_node_list_with_text_token_count(nodes_with_content, model=model)
        print("Thinning nodes...")
        nodes_with_content = tree_thinning_for_index(nodes_with_content, min_token_threshold, model=model)

    print("Building tree from nodes...")
    tree_structure = build_tree_from_nodes(nodes_with_content)

    if if_summary:
        print("Generating summaries for each node...")
        tree_structure = await generate_summaries_for_structure_md(tree_structure, summary_token_threshold=summary_token_threshold, model=model)

    print("Formatting tree structure...")
    tree_structure = format_structure(tree_structure, order=['title', 'node_id', 'summary', 'prefix_summary', 'text', 'line_num', 'nodes'])

    return tree_structure
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke-test entry point: parse a sample markdown doc from
    # ../docs and dump the resulting tree to ../results.
    import os
    import json

    # Which markdown file under ../docs to process (without extension).
    MD_NAME = 'Detect-Order-Construct'
    # MD_NAME = 'Welcome'
    MD_PATH = os.path.join(os.path.dirname(__file__), '..', 'docs', f'{MD_NAME}.md')

    # Pipeline knobs (see md_to_tree for their meaning).
    MODEL="gpt-4.1"
    IF_THINNING=False
    THINNING_THRESHOLD=5000
    SUMMARY_TOKEN_THRESHOLD=200
    IF_SUMMARY=True

    # md_to_tree is async; drive it to completion here.
    tree_structure = asyncio.run(md_to_tree(
        md_path=MD_PATH,
        if_thinning=IF_THINNING,
        min_token_threshold=THINNING_THRESHOLD,
        if_summary=IF_SUMMARY,
        summary_token_threshold=SUMMARY_TOKEN_THRESHOLD,
        model=MODEL))

    # Human-readable dump of the (string-truncated) tree.
    print('\n' + '='*60)
    print('TREE STRUCTURE')
    print('='*60)
    print_json(tree_structure)

    # Indented title outline of the same tree.
    print('\n' + '='*60)
    print('TABLE OF CONTENTS')
    print('='*60)
    print_toc(tree_structure)

    # Persist the full structure next to the repo's other results.
    output_path = os.path.join(os.path.dirname(__file__), '..', 'results', f'{MD_NAME}_structure.json')
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(tree_structure, f, indent=2, ensure_ascii=False)

    print(f"\nTree structure saved to: {output_path}")
|
||||||
|
|
@ -19,8 +19,9 @@ from types import SimpleNamespace as config
|
||||||
|
|
||||||
CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
|
CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
|
||||||
|
|
||||||
|
def count_tokens(text, model=None):
    """Return the tiktoken token count of *text* (0 for empty/None).

    The new ``model=None`` default would make
    ``tiktoken.encoding_for_model(None)`` raise, yet most new call sites
    default model to None — so fall back to the common 'cl100k_base'
    encoding when *model* is None or unknown to tiktoken.
    """
    if not text:
        return 0
    try:
        enc = tiktoken.encoding_for_model(model) if model else tiktoken.get_encoding("cl100k_base")
    except KeyError:
        # Model name unrecognised by tiktoken — use the default encoding.
        enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)
    return len(tokens)
|
||||||
|
|
@ -489,6 +490,34 @@ def clean_structure_post(data):
|
||||||
clean_structure_post(section)
|
clean_structure_post(section)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
def remove_fields(data, fields=('text',)):
    """Recursively drop the given keys from nested dict/list structures.

    Args:
        data: any value; dicts and lists are walked recursively, other
            values are returned unchanged.
        fields: keys to remove (default drops 'text').  An immutable tuple
            default replaces the original mutable list default
            (mutable-default-argument anti-pattern); membership tests are
            unaffected, so callers passing lists still work.
    """
    if isinstance(data, dict):
        return {k: remove_fields(v, fields)
                for k, v in data.items() if k not in fields}
    elif isinstance(data, list):
        return [remove_fields(item, fields) for item in data]
    return data
|
||||||
|
|
||||||
|
def print_toc(tree, indent=0):
    """Print the tree's titles as an indented table of contents."""
    for entry in tree:
        print(' ' * indent + entry['title'])
        children = entry.get('nodes')
        if children:
            print_toc(children, indent + 1)
|
||||||
|
|
||||||
|
def print_json(data, max_len=40, indent=2):
    """Pretty-print *data* as JSON, truncating strings longer than
    *max_len* characters (an ellipsis marks the cut)."""

    def _shorten(value):
        # Walk dicts and lists; clip only over-long strings.
        if isinstance(value, dict):
            return {key: _shorten(item) for key, item in value.items()}
        if isinstance(value, list):
            return [_shorten(item) for item in value]
        if isinstance(value, str) and len(value) > max_len:
            return value[:max_len] + '...'
        return value

    print(json.dumps(_shorten(data), indent=indent, ensure_ascii=False))
|
||||||
|
|
||||||
|
|
||||||
def remove_structure_text(data):
|
def remove_structure_text(data):
|
||||||
if isinstance(data, dict):
|
if isinstance(data, dict):
|
||||||
|
|
@ -606,6 +635,26 @@ def generate_doc_description(structure, model=None):
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
def reorder_dict(data, key_order):
    """Rebuild *data* with its keys in *key_order*.

    Keys absent from *key_order* are dropped; keys in *key_order* but not
    in *data* are skipped.  A falsy *key_order* returns data unchanged.
    """
    if not key_order:
        return data
    ordered = {}
    for key in key_order:
        if key in data:
            ordered[key] = data[key]
    return ordered
|
||||||
|
|
||||||
|
|
||||||
|
def format_structure(structure, order=None):
    """Recursively reorder node dict keys per *order*, pruning empty
    'nodes' lists along the way.  With no *order* the structure is
    returned as-is.  Dicts are modified in place before reordering.
    """
    if not order:
        return structure

    if isinstance(structure, list):
        return [format_structure(item, order) for item in structure]

    if isinstance(structure, dict):
        if 'nodes' in structure:
            children = format_structure(structure['nodes'], order)
            if children:
                structure['nodes'] = children
            else:
                # Drop the key entirely rather than keep an empty list.
                structure.pop('nodes', None)
        return reorder_dict(structure, order)

    return structure
|
||||||
|
|
||||||
|
|
||||||
class ConfigLoader:
|
class ConfigLoader:
|
||||||
def __init__(self, default_path: str = None):
|
def __init__(self, default_path: str = None):
|
||||||
if default_path is None:
|
if default_path is None:
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
openai==1.70.0
|
openai==1.101.0
|
||||||
pymupdf==1.25.5
|
pymupdf==1.26.4
|
||||||
PyPDF2==3.0.1
|
PyPDF2==3.0.1
|
||||||
python-dotenv==1.1.0
|
python-dotenv==1.1.0
|
||||||
tiktoken==0.7.0
|
tiktoken==0.11.0
|
||||||
pyyaml==6.0.2
|
pyyaml==6.0.2
|
||||||
|
|
|
||||||
|
|
@ -1,17 +1,22 @@
|
||||||
import argparse
|
import argparse
|
||||||
|
import os
|
||||||
|
import json
|
||||||
from pageindex import *
|
from pageindex import *
|
||||||
|
from pageindex.page_index_md import md_to_tree
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Set up argument parser
|
# Set up argument parser
|
||||||
parser = argparse.ArgumentParser(description='Process PDF document and generate structure')
|
parser = argparse.ArgumentParser(description='Process PDF or Markdown document and generate structure')
|
||||||
parser.add_argument('--pdf_path', type=str, help='Path to the PDF file')
|
parser.add_argument('--file_path', type=str, help='Path to the PDF or Markdown file')
|
||||||
|
parser.add_argument('--file_type', type=str, choices=['pdf', 'markdown', 'md'], default='pdf',
|
||||||
|
help='Type of file to process (pdf, markdown, or md)')
|
||||||
parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')
|
parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use')
|
||||||
parser.add_argument('--toc-check-pages', type=int, default=20,
|
parser.add_argument('--toc-check-pages', type=int, default=20,
|
||||||
help='Number of pages to check for table of contents')
|
help='Number of pages to check for table of contents (PDF only)')
|
||||||
parser.add_argument('--max-pages-per-node', type=int, default=10,
|
parser.add_argument('--max-pages-per-node', type=int, default=10,
|
||||||
help='Maximum number of pages per node')
|
help='Maximum number of pages per node (PDF only)')
|
||||||
parser.add_argument('--max-tokens-per-node', type=int, default=20000,
|
parser.add_argument('--max-tokens-per-node', type=int, default=20000,
|
||||||
help='Maximum number of tokens per node')
|
help='Maximum number of tokens per node (PDF only)')
|
||||||
parser.add_argument('--if-add-node-id', type=str, default='yes',
|
parser.add_argument('--if-add-node-id', type=str, default='yes',
|
||||||
help='Whether to add node id to the node')
|
help='Whether to add node id to the node')
|
||||||
parser.add_argument('--if-add-node-summary', type=str, default='no',
|
parser.add_argument('--if-add-node-summary', type=str, default='no',
|
||||||
|
|
@ -20,27 +25,73 @@ if __name__ == "__main__":
|
||||||
help='Whether to add doc description to the doc')
|
help='Whether to add doc description to the doc')
|
||||||
parser.add_argument('--if-add-node-text', type=str, default='no',
|
parser.add_argument('--if-add-node-text', type=str, default='no',
|
||||||
help='Whether to add text to the node')
|
help='Whether to add text to the node')
|
||||||
|
# Markdown specific arguments
|
||||||
|
parser.add_argument('--if-thinning', type=str, default='yes',
|
||||||
|
help='Whether to apply tree thinning for markdown (markdown only)')
|
||||||
|
parser.add_argument('--thinning-threshold', type=int, default=5000,
|
||||||
|
help='Minimum token threshold for thinning (markdown only)')
|
||||||
|
parser.add_argument('--summary-token-threshold', type=int, default=200,
|
||||||
|
help='Token threshold for generating summaries (markdown only)')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Determine file type from extension if not specified
|
||||||
|
if args.file_type == 'pdf' and args.file_path:
|
||||||
|
if args.file_path.lower().endswith(('.md', '.markdown')):
|
||||||
|
args.file_type = 'markdown'
|
||||||
|
elif not args.file_path.lower().endswith('.pdf'):
|
||||||
|
raise ValueError("File must be a PDF or Markdown file")
|
||||||
|
|
||||||
|
if args.file_type == 'pdf':
|
||||||
|
# Process PDF file
|
||||||
# Configure options
|
# Configure options
|
||||||
opt = config(
|
opt = config(
|
||||||
model=args.model,
|
model=args.model,
|
||||||
toc_check_page_num=args.toc_check_pages,
|
toc_check_page_num=args.toc_check_pages,
|
||||||
max_page_num_each_node=args.max_pages_per_node,
|
max_page_num_each_node=args.max_pages_per_node,
|
||||||
max_token_num_each_node=args.max_tokens_per_node,
|
max_token_num_each_node=args.max_tokens_per_node,
|
||||||
if_add_node_id=args.if_add_node_id,
|
if_add_node_id=args.if_add_node_id,
|
||||||
if_add_node_summary=args.if_add_node_summary,
|
if_add_node_summary=args.if_add_node_summary,
|
||||||
if_add_doc_description=args.if_add_doc_description,
|
if_add_doc_description=args.if_add_doc_description,
|
||||||
if_add_node_text=args.if_add_node_text
|
if_add_node_text=args.if_add_node_text
|
||||||
)
|
)
|
||||||
|
|
||||||
# Process the PDF
|
# Process the PDF
|
||||||
toc_with_page_number = page_index_main(args.pdf_path, opt)
|
toc_with_page_number = page_index_main(args.file_path, opt)
|
||||||
print('Parsing done, saving to file...')
|
print('Parsing done, saving to file...')
|
||||||
|
|
||||||
# Save results
|
# Save results
|
||||||
pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0]
|
pdf_name = os.path.splitext(os.path.basename(args.file_path))[0]
|
||||||
os.makedirs('./results', exist_ok=True)
|
os.makedirs('./results', exist_ok=True)
|
||||||
|
|
||||||
with open(f'./results/{pdf_name}_structure.json', 'w', encoding='utf-8') as f:
|
with open(f'./results/{pdf_name}_structure.json', 'w', encoding='utf-8') as f:
|
||||||
json.dump(toc_with_page_number, f, indent=2)
|
json.dump(toc_with_page_number, f, indent=2)
|
||||||
|
|
||||||
|
elif args.file_type in ['markdown', 'md']:
|
||||||
|
# Process markdown file
|
||||||
|
print('Processing markdown file...')
|
||||||
|
|
||||||
|
# Configure markdown options
|
||||||
|
if_thinning = args.if_thinning.lower() == 'yes'
|
||||||
|
if_summary = args.if_add_node_summary.lower() == 'yes'
|
||||||
|
|
||||||
|
# Process the markdown
|
||||||
|
import asyncio
|
||||||
|
toc_with_page_number = asyncio.run(md_to_tree(
|
||||||
|
md_path=args.file_path,
|
||||||
|
if_thinning=if_thinning,
|
||||||
|
min_token_threshold=args.thinning_threshold,
|
||||||
|
if_summary=if_summary,
|
||||||
|
summary_token_threshold=args.summary_token_threshold,
|
||||||
|
model=args.model
|
||||||
|
))
|
||||||
|
|
||||||
|
print('Parsing done, saving to file...')
|
||||||
|
|
||||||
|
# Save results
|
||||||
|
md_name = os.path.splitext(os.path.basename(args.file_path))[0]
|
||||||
|
os.makedirs('./results', exist_ok=True)
|
||||||
|
|
||||||
|
with open(f'./results/{md_name}_structure.json', 'w', encoding='utf-8') as f:
|
||||||
|
json.dump(toc_with_page_number, f, indent=2, ensure_ascii=False)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unsupported file type: {args.file_type}. Supported types are 'pdf', 'markdown', or 'md'")
|
||||||
Loading…
Add table
Add a link
Reference in a new issue