add node id, node summary and doc summary

2026-05-16 18:25:14 +02:00 · 2025-04-03 13:35:38 +08:00 · 2025-04-03 13:35:38 +08:00 · db746f35e0
commit db746f35e0
parent 1de05d8675
9 changed files with 3663 additions and 2810 deletions
--- a/utils.py
+++ b/utils.py
@ -10,15 +10,19 @@ import copy
 import asyncio
 import pymupdf
 from io import BytesIO
+from dotenv import load_dotenv
+load_dotenv()
 import logging

+CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
+

 def count_tokens(text, model):
    enc = tiktoken.encoding_for_model(model)
    tokens = enc.encode(text)
    return len(tokens)

-def ChatGPT_API_with_finish_reason(model, prompt, api_key, chat_history=None):
+def ChatGPT_API_with_finish_reason(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
    max_retries = 10
    client = openai.OpenAI(api_key=api_key)
    for i in range(max_retries):
@ -50,7 +54,7 @@ def ChatGPT_API_with_finish_reason(model, prompt, api_key, chat_history=None):



-def ChatGPT_API(model, prompt, api_key, chat_history=None):
+def ChatGPT_API(model, prompt, api_key=CHATGPT_API_KEY, chat_history=None):
    max_retries = 10
    client = openai.OpenAI(api_key=api_key)
    for i in range(max_retries):
@ -78,7 +82,7 @@ def ChatGPT_API(model, prompt, api_key, chat_history=None):
                return "Error"
            

-async def ChatGPT_API_async(model, prompt, api_key):
+async def ChatGPT_API_async(model, prompt, api_key=CHATGPT_API_KEY):
    max_retries = 10
    client = openai.AsyncOpenAI(api_key=api_key)
    for i in range(max_retries):
@ -151,7 +155,7 @@ def write_node_id(data, node_id=0):
        data['node_id'] = str(node_id).zfill(4)
        node_id += 1
        for key in list(data.keys()):
-            if 'child_nodes' in key:
+            if 'nodes' in key:
                node_id = write_node_id(data[key], node_id)
    elif isinstance(data, list):
        for index in range(len(data)):
@ -161,10 +165,10 @@ def write_node_id(data, node_id=0):
 def get_nodes(structure):
    if isinstance(structure, dict):
        structure_node = copy.deepcopy(structure)
-        structure_node.pop('child_nodes', None)
+        structure_node.pop('nodes', None)
        nodes = [structure_node]
        for key in list(structure.keys()):
-            if 'child_nodes' in key:
+            if 'nodes' in key:
                nodes.extend(get_nodes(structure[key]))
        return nodes
    elif isinstance(structure, list):
@ -177,8 +181,8 @@ def structure_to_list(structure):
    if isinstance(structure, dict):
        nodes = []
        nodes.append(structure)
-        if 'child_nodes' in structure:
-            nodes.extend(structure_to_list(structure['child_nodes']))
+        if 'nodes' in structure:
+            nodes.extend(structure_to_list(structure['nodes']))
        return nodes
    elif isinstance(structure, list):
        nodes = []
@ -189,14 +193,14 @@ def structure_to_list(structure):
    
 def get_leaf_nodes(structure):
    if isinstance(structure, dict):
-        if not structure['child_nodes']:
+        if not structure['nodes']:
            structure_node = copy.deepcopy(structure)
-            structure_node.pop('child_nodes', None)
+            structure_node.pop('nodes', None)
            return [structure_node]
        else:
            leaf_nodes = []
            for key in list(structure.keys()):
-                if 'child_nodes' in key:
+                if 'nodes' in key:
                    leaf_nodes.extend(get_leaf_nodes(structure[key]))
            return leaf_nodes
    elif isinstance(structure, list):
@ -212,7 +216,7 @@ def is_leaf_node(data, node_id):
            if data.get('node_id') == node_id:
                return data
            for key in data.keys():
-                if 'child_nodes' in key:
+                if 'nodes' in key:
                    result = find_node(data[key], node_id)
                    if result:
                        return result
@ -227,7 +231,7 @@ def is_leaf_node(data, node_id):
    node = find_node(data, node_id)

    # Check if the node is a leaf node
-    if node and not node.get('child_nodes'):
+    if node and not node.get('nodes'):
        return True
    return False

@ -353,7 +357,7 @@ def list_to_tree(data):
            'title': item.get('title'),
            'start_index': item.get('start_index'),
            'end_index': item.get('end_index'),
-            'child_nodes': []
+            'nodes': []
        }
        
        nodes[structure] = node
@ -364,7 +368,7 @@ def list_to_tree(data):
        if parent_structure:
            # Add as child to parent if parent exists
            if parent_structure in nodes:
-                nodes[parent_structure]['child_nodes'].append(node)
+                nodes[parent_structure]['nodes'].append(node)
            else:
                root_nodes.append(node)
        else:
@ -373,10 +377,10 @@ def list_to_tree(data):
    
    # Helper function to clean empty children arrays
    def clean_node(node):
-        if not node['child_nodes']:
-            del node['child_nodes']
+        if not node['nodes']:
+            del node['nodes']
        else:
-            for child in node['child_nodes']:
+            for child in node['nodes']:
                clean_node(child)
        return node
    
@ -424,7 +428,7 @@ def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
 def get_text_of_pdf_pages(pdf_pages, start_page, end_page):
    text = ""
    for page_num in range(start_page-1, end_page):
-        text += pdf_pages[page_num]
+        text += pdf_pages[page_num][0]
    return text

 def get_number_of_pages(pdf_path):
@ -460,8 +464,8 @@ def clean_structure_post(data):
        data.pop('page_number', None)
        data.pop('start_index', None)
        data.pop('end_index', None)
-        if 'child_nodes' in data:
-            clean_structure_post(data['child_nodes'])
+        if 'nodes' in data:
+            clean_structure_post(data['nodes'])
    elif isinstance(data, list):
        for section in data:
            clean_structure_post(section)
@ -471,8 +475,8 @@ def clean_structure_post(data):
 def remove_structure_text(data):
    if isinstance(data, dict):
        data.pop('text', None)
-        if 'child_nodes' in data:
-            remove_structure_text(data['child_nodes'])
+        if 'nodes' in data:
+            remove_structure_text(data['nodes'])
    elif isinstance(data, list):
        for item in data:
            remove_structure_text(item)
@ -522,3 +526,60 @@ def convert_page_to_int(data):
                # Keep original value if conversion fails
                pass
    return data
+
+def write_node_id(data, node_id=0):
+    """Stamp every dict in a nested structure with a sequential ``node_id``.
+
+    A dict receives the current counter as a zero-padded, 4-character string
+    before any of its children are numbered; every value stored under a key
+    containing ``'nodes'`` is then descended into. Lists are walked in order.
+    Any other value is left untouched.
+
+    Returns the next unused counter value.
+    """
+    # NOTE(review): an earlier hunk of this diff shows ``write_node_id``
+    # already defined in this module; this later definition rebinds the name.
+    if isinstance(data, list):
+        for element in data:
+            node_id = write_node_id(element, node_id)
+        return node_id
+
+    if isinstance(data, dict):
+        data['node_id'] = str(node_id).zfill(4)
+        node_id += 1
+        # Iterate over a snapshot of the keys, as the original does.
+        for key in tuple(data):
+            if 'nodes' in key:
+                node_id = write_node_id(data[key], node_id)
+    return node_id
+
+
+def add_node_text(node, pdf_pages):
+    """Attach a ``'text'`` entry to every dict in a nested node structure.
+
+    For a dict, the text is whatever ``get_text_of_pdf_pages`` returns for the
+    dict's ``'start_index'`` and ``'end_index'`` values; its ``'nodes'`` entry,
+    when present, is then processed the same way. Lists are walked element by
+    element. Any other value is ignored. The dicts are modified in place and
+    nothing is returned.
+    """
+    if isinstance(node, list):
+        for child in node:
+            add_node_text(child, pdf_pages)
+        return
+
+    if not isinstance(node, dict):
+        return
+
+    first_page = node.get('start_index')
+    last_page = node.get('end_index')
+    node['text'] = get_text_of_pdf_pages(pdf_pages, first_page, last_page)
+    if 'nodes' in node:
+        add_node_text(node['nodes'], pdf_pages)
+
+async def generate_node_summary(node, model=None):
+    """Ask the chat model to describe the main points of one node's text.
+
+    ``node['text']`` is embedded in the prompt, which is sent through
+    ``ChatGPT_API_async`` with the given ``model``. The awaited response is
+    returned unchanged.
+    """
+    partial_text = node['text']
+    prompt = f"""You are given a part of a document, your task is to generate a description of the partial document about what are main points covered in the partial document.
+
+    Partial Document Text: {partial_text}
+    
+    Directly return the description, do not include any other text.
+    """
+    return await ChatGPT_API_async(model, prompt)
+
+
+async def generate_summaries_for_structure(structure, model=None):
+    """Summarise every node of ``structure`` and store the results on the nodes.
+
+    The structure is flattened with ``structure_to_list``, one
+    ``generate_node_summary`` coroutine is created per node, and all of them
+    are awaited together via ``asyncio.gather``. Each node then receives its
+    result under the ``'summary'`` key. Returns ``structure``.
+    """
+    flat_nodes = structure_to_list(structure)
+    # gather() yields results in the order the awaitables were passed in,
+    # so results[i] belongs to flat_nodes[i].
+    results = await asyncio.gather(
+        *(generate_node_summary(entry, model=model) for entry in flat_nodes)
+    )
+    for position, summary_text in enumerate(results):
+        flat_nodes[position]['summary'] = summary_text
+    return structure
+
+
+def generate_doc_description(structure, model=None):
+    """Ask the chat model for a one-sentence description of a document.
+
+    ``structure`` is interpolated into the prompt as-is (its ``str()`` form),
+    and the prompt is sent synchronously through ``ChatGPT_API`` with the
+    given ``model``. The response is returned unchanged.
+    """
+    # Prompt wording fix: the opening sentence previously read "Your are".
+    prompt = f"""You are an expert in generating descriptions for a document.
+    You are given a structure of a document. Your task is to generate a one-sentence description for the document, which makes it easy to distinguish the document from other documents.
+        
+    Document Structure: {structure}
+    
+    Directly return the description, do not include any other text.
+    """
+    response = ChatGPT_API(model, prompt)
+    return response