mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-05-16 18:25:14 +02:00
commit
6d06d0786e
4 changed files with 72 additions and 17 deletions
|
|
@ -0,0 +1 @@
|
|||
from .page_index import *
|
||||
7
config.yaml
Normal file
7
config.yaml
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
model: gpt-4o-2024-11-20
|
||||
toc_check_page_num: 20
|
||||
max_page_num_each_node: 10
|
||||
max_token_num_each_node: 20000
|
||||
if_add_node_id: yes
|
||||
if_add_node_summary: no
|
||||
if_add_doc_description: yes
|
||||
|
|
@ -2,13 +2,10 @@ import os
|
|||
import json
|
||||
import copy
|
||||
import math
|
||||
import sys
|
||||
import random
|
||||
sys.path.append('../..')
|
||||
import re
|
||||
from utils import *
|
||||
import os
|
||||
from types import SimpleNamespace as config
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
import argparse
|
||||
|
||||
|
|
@ -1033,16 +1030,26 @@ def page_index_main(doc, opt=None):
|
|||
if opt.if_add_doc_description == 'yes':
|
||||
doc_description = generate_doc_description(structure, model=opt.model)
|
||||
return {
|
||||
'doc_name': os.path.basename(doc),
|
||||
'doc_name': get_pdf_name(doc),
|
||||
'doc_description': doc_description,
|
||||
'structure': structure,
|
||||
}
|
||||
return {
|
||||
'doc_name': os.path.basename(doc),
|
||||
'doc_name': get_pdf_name(doc),
|
||||
'structure': structure,
|
||||
}
|
||||
|
||||
|
||||
def page_index(doc, model=None, toc_check_page_num=None, max_page_num_each_node=None, max_token_num_each_node=None,
               f_add_node_id=None, if_add_node_summary=None, if_add_doc_description=None, *, if_add_node_id=None):
    """Build the page index for ``doc`` using keyword overrides on top of the defaults.

    Every option left as ``None`` falls back to the value supplied by
    ``ConfigLoader`` (its default configuration); only explicitly provided
    options are forwarded as overrides.

    Args:
        doc: The document handed straight through to ``page_index_main``.
        model, toc_check_page_num, max_page_num_each_node,
        max_token_num_each_node, if_add_node_summary, if_add_doc_description:
            Optional overrides, forwarded under their own names.
        f_add_node_id: Deprecated misspelling of ``if_add_node_id``. It is kept
            in its original positional slot so existing callers keep working.
        if_add_node_id: Correctly spelled override (keyword-only). Takes
            precedence over ``f_add_node_id`` when both are given.

    Returns:
        Whatever ``page_index_main(doc, opt)`` returns.
    """
    # The original forwarded the misspelled name "f_add_node_id" via locals();
    # the default config key is "if_add_node_id", so the override could never be
    # accepted under the wrong spelling. Map the legacy name to the real key.
    if if_add_node_id is None:
        if_add_node_id = f_add_node_id

    # Spell the overrides out instead of scraping locals(), so helper variables
    # added to this function can never leak into the config by accident.
    overrides = {
        "model": model,
        "toc_check_page_num": toc_check_page_num,
        "max_page_num_each_node": max_page_num_each_node,
        "max_token_num_each_node": max_token_num_each_node,
        "if_add_node_id": if_add_node_id,
        "if_add_node_summary": if_add_node_summary,
        "if_add_doc_description": if_add_doc_description,
    }
    user_opt = {name: value for name, value in overrides.items() if value is not None}

    opt = ConfigLoader().load(user_opt)
    return page_index_main(doc, opt)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Set up argument parser
|
||||
|
|
|
|||
64
utils.py
64
utils.py
|
|
@ -13,6 +13,9 @@ from io import BytesIO
|
|||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
import logging
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace as config
|
||||
|
||||
CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
|
||||
|
||||
|
|
@ -284,24 +287,27 @@ def get_last_start_page_from_text(text):
|
|||
return start_page
|
||||
|
||||
|
||||
|
||||
|
||||
def sanitize_filename(filename, replacement='-'):
    """Return ``filename`` with characters that are invalid in Linux file names replaced.

    On Linux only ``'/'`` and the NUL character are forbidden in a file name.
    A Python ``str`` *can* contain NUL (``'\\0'``), and passing such a name to
    ``open()`` raises ``ValueError``, so both characters are substituted.

    Args:
        filename: The candidate file name.
        replacement: Text substituted for each invalid character.

    Returns:
        The sanitized file name.
    """
    return filename.replace('/', replacement).replace('\0', replacement)
|
||||
|
||||
def get_pdf_name(pdf_path):
    """Derive a display/file name for a PDF given as a path or an in-memory stream.

    Args:
        pdf_path: Either a filesystem path (``str`` or ``os.PathLike``) or a
            ``BytesIO`` holding the PDF bytes.

    Returns:
        For a path, its base name (extension included). For a ``BytesIO``, the
        title from the PDF metadata passed through ``sanitize_filename``, or
        ``'Untitled'`` when the document has no metadata or no title.

    Raises:
        TypeError: If ``pdf_path`` is neither a path nor a ``BytesIO``.
    """
    if isinstance(pdf_path, (str, os.PathLike)):
        return os.path.basename(os.fspath(pdf_path))

    if isinstance(pdf_path, BytesIO):
        meta = PyPDF2.PdfReader(pdf_path).metadata
        # `metadata` is None when the PDF carries no document-information
        # dictionary, so guard before touching `.title`.
        title = meta.title if meta is not None else None
        return sanitize_filename(title if title else 'Untitled')

    # Previously this fell through and failed with an UnboundLocalError.
    raise TypeError(
        f"pdf_path must be a str, os.PathLike or BytesIO, got {type(pdf_path).__name__}"
    )
|
||||
|
||||
|
||||
class JsonLogger:
|
||||
def __init__(self, file_path):
|
||||
# Extract PDF name without extension for logger name and filename
|
||||
# pdf_name = os.path.splitext(os.path.basename(file_path))[0]
|
||||
if isinstance(file_path, str):
|
||||
pdf_name = os.path.splitext(os.path.basename(file_path))[0]
|
||||
elif isinstance(file_path, BytesIO):
|
||||
pdf_reader = PyPDF2.PdfReader(file_path)
|
||||
meta = pdf_reader.metadata
|
||||
pdf_name = meta.title if meta.title else 'Untitled'
|
||||
pdf_name = sanitize_filename(pdf_name)
|
||||
# Extract PDF name for logger name
|
||||
pdf_name = get_pdf_name(file_path)
|
||||
|
||||
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
self.filename = f"{pdf_name}_{current_time}.json"
|
||||
|
|
@ -583,4 +589,38 @@ def generate_doc_description(structure, model=None):
|
|||
Directly return the description, do not include any other text.
|
||||
"""
|
||||
response = ChatGPT_API(model, prompt)
|
||||
return response
|
||||
return response
|
||||
|
||||
|
||||
class ConfigLoader:
    """Merge user-supplied options over the defaults stored in a YAML file.

    The defaults are read once at construction time. ``load`` validates that
    every user key also exists in the defaults, merges the two, and returns the
    result as a ``SimpleNamespace`` (imported in this module as ``config``).
    """

    def __init__(self, default_path: "str | Path | None" = None):
        """Read the default options.

        Args:
            default_path: Path of the YAML file holding the defaults. When
                omitted, ``config.yaml`` next to this module is used.
        """
        if default_path is None:
            default_path = Path(__file__).parent / "config.yaml"
        self._default_dict = self._load_yaml(default_path)

    @staticmethod
    def _load_yaml(path):
        """Parse ``path`` as YAML and return its top-level mapping ({} if empty)."""
        with open(path, "r", encoding="utf-8") as f:
            loaded = yaml.safe_load(f) or {}
        if not isinstance(loaded, dict):
            raise TypeError(
                f"Config file {path} must contain a mapping at the top level, "
                f"got {type(loaded).__name__}"
            )
        return loaded

    @staticmethod
    def _normalize_flag(value):
        """Turn booleans into the 'yes'/'no' strings used for option flags.

        PyYAML follows YAML 1.1, so an unquoted ``yes``/``no`` in the config
        file is parsed as ``True``/``False``. The pipeline compares flags
        against the string ``'yes'``, which a boolean never equals, so booleans
        are mapped back to their string form. All other values pass through.
        """
        if isinstance(value, bool):
            return "yes" if value else "no"
        return value

    def _validate_keys(self, user_dict):
        """Raise ``ValueError`` if ``user_dict`` has keys absent from the defaults."""
        unknown_keys = set(user_dict) - set(self._default_dict)
        if unknown_keys:
            raise ValueError(f"Unknown config keys: {unknown_keys}")

    def load(self, user_opt=None) -> config:
        """
        Load the configuration, merging user options with default values.

        Args:
            user_opt: ``None``, a ``dict`` of overrides, or a ``SimpleNamespace``
                whose attributes are the overrides.

        Returns:
            A ``SimpleNamespace`` holding the defaults updated with the user
            options; boolean values are normalized to ``'yes'``/``'no'``.

        Raises:
            TypeError: If ``user_opt`` is of an unsupported type.
            ValueError: If ``user_opt`` contains keys not present in the defaults.
        """
        if user_opt is None:
            user_dict = {}
        elif isinstance(user_opt, config):
            user_dict = vars(user_opt)
        elif isinstance(user_opt, dict):
            user_dict = user_opt
        else:
            raise TypeError("user_opt must be dict, config(SimpleNamespace) or None")

        self._validate_keys(user_dict)
        # User values win over defaults; a new dict is built so neither the
        # defaults nor the caller's mapping is mutated.
        merged = {
            key: self._normalize_flag(value)
            for key, value in {**self._default_dict, **user_dict}.items()
        }
        return config(**merged)
|
||||
Loading…
Add table
Add a link
Reference in a new issue