commit 6f43b477d386796b39323de98125f8998b3f73db
Author: mingtian
Date:   Tue Apr 1 18:54:08 2025 +0800

    first commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..47d38ba
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,15 @@
+.ipynb_checkpoints
+__pycache__
+files
+index
+temp/*
+chroma-collections.parquet
+chroma-embeddings.parquet
+.DS_Store
+.env*
+notebook
+SDK/*
+log/*
+logs/
+parts/*
+json_results/*
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..c9081e4
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Vectify AI
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a041e03
--- /dev/null
+++ b/README.md
@@ -0,0 +1,136 @@
+# PageIndex
+
+### **Document Index System for Reasoning-Based RAG**
+
+Traditional vector-based retrieval relies heavily on semantic similarity. But when working with professional documents that require domain expertise and multi-step reasoning, similarity search often falls short.
+
+**Reasoning-Based RAG** offers a better alternative: enabling LLMs to *think* and *reason* their way to the most relevant document sections. Inspired by **AlphaGo**, we leverage **tree search** to perform structured document retrieval.
+
+**PageIndex** is an indexing system that builds search trees from long documents, making them ready for reasoning-based RAG.
+
+Built by [Vectify AI](https://vectify.ai/pageindex)
+
+---
+
+## 🔍 What is PageIndex?
+
+**PageIndex** transforms lengthy PDF documents into a semantic **tree structure**, similar to a "table of contents" but optimized for use with Large Language Models (LLMs).
+It’s ideal for financial reports, regulatory filings, academic textbooks, legal or technical manuals, and any other document that exceeds LLM context limits.
+
+### ✅ Key Features
+
+- **Scales to Massive Documents**
+  Designed to handle hundreds or even thousands of pages with ease.
+
+- **Hierarchical Tree Structure**
+  Enables LLMs to traverse documents logically—like an intelligent, LLM-optimized table of contents.
+
+- **Precise Page Referencing**
+  Every node contains its own summary and start/end physical page indices, allowing pinpoint retrieval.
+
+- **Chunk-Free Segmentation**
+  No arbitrary chunking. Nodes follow the natural structure of the document.
+
+---
+
+## 📦 PageIndex Format
+
+Here is an example output.
+See more [example documents](https://github.com/VectifyAI/PageIndex/tree/main/docs) and [generated trees](https://github.com/VectifyAI/PageIndex/tree/main/results).
+
+```json
+{
+  "title": "Financial Stability",
+  "node_id": "0006",
+  "start_index": 21,
+  "end_index": 22,
+  "summary": "The Federal Reserve ...",
+  "child_nodes": [
+    {
+      "title": "Monitoring Financial Vulnerabilities",
+      "node_id": "0007",
+      "start_index": 22,
+      "end_index": 28,
+      "summary": "The Federal Reserve's monitoring ..."
+    },
+    {
+      "title": "Domestic and International Cooperation and Coordination",
+      "node_id": "0008",
+      "start_index": 28,
+      "end_index": 31,
+      "summary": "In 2023, the Federal Reserve collaborated ..."
+    }
+  ]
+}
+```
+
+Note: the node_id and summary generation functions will be added soon.
+
+## 🧠 Reasoning-Based RAG with PageIndex
+
+Use PageIndex to build **reasoning-based retrieval systems** without relying on semantic similarity. Great for domain-specific tasks where nuance matters.
+
+### 🛠️ Example Prompt
+
+```python
+prompt = f"""
+You are given a question and a tree structure of a document.
+You need to find all nodes that are likely to contain the answer.
+
+Question: {question}
+
+Document tree structure: {structure}
+
+Reply in the following JSON format:
+{{
+    "thinking": <your reasoning>,
+    "node_list": [node_id1, node_id2, ...]
+}}
+"""
+```
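+For illustration, here is a minimal sketch of how this prompt can drive retrieval. It is not part of this repository: the helper name `find_relevant_nodes` is hypothetical, and it assumes an OpenAI-style client using the `CHATGPT_API_KEY` described in the Usage section below.
+
+```python
+import os
+import json
+from openai import OpenAI
+
+client = OpenAI(api_key=os.getenv("CHATGPT_API_KEY"))
+
+def find_relevant_nodes(question, structure, model="gpt-4o-2024-11-20"):
+    # Build the tree-search prompt shown above.
+    prompt = f"""
+You are given a question and a tree structure of a document.
+You need to find all nodes that are likely to contain the answer.
+
+Question: {question}
+
+Document tree structure: {json.dumps(structure)}
+
+Reply in the following JSON format:
+{{
+    "thinking": <your reasoning>,
+    "node_list": [node_id1, node_id2, ...]
+}}
+Directly return the final JSON structure. Do not output anything else."""
+    response = client.chat.completions.create(
+        model=model,
+        messages=[{"role": "user", "content": prompt}],
+    )
+    # Expects a bare JSON object; production code should guard against fenced output.
+    return json.loads(response.choices[0].message.content)["node_list"]
+```
+
+The returned node IDs can then be mapped back to `start_index`/`end_index` page ranges through the tree to fetch the exact sections for answer generation.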
+## 🚀 Usage
+
+Follow these steps to generate a PageIndex tree from a PDF document.
+
+### 1. Install dependencies
+
+```bash
+pip3 install -r requirements.txt
+```
+
+### 2. Set your OpenAI API key
+
+Create a `.env` file in the root directory and add your API key:
+
+```bash
+CHATGPT_API_KEY=your_openai_key_here
+```
+
+### 3. Run PageIndex on your PDF
+
+```bash
+python3 page_index.py --pdf_path /path/to/your/document.pdf
+```
+
+The results will be saved in the `./results/` directory.
+
+## 🛤 Roadmap
+
+- [ ] Add node summary and document selection
+- [ ] Technical report on PageIndex design
+- [ ] Efficient tree search algorithms for large documents
+- [ ] Integration with vector-based semantic retrieval
+
+## 📈 Case Study: Mafin 2.5
+
+[Mafin 2.5](https://vectify.ai/blog/Mafin2.5) is a state-of-the-art reasoning-based RAG model designed specifically for financial document analysis. Built on top of **PageIndex**, it achieved an impressive **98.7% accuracy** on the [FinanceBench](https://github.com/VectifyAI/Mafin2.5-FinanceBench) benchmark—significantly outperforming traditional vector-based RAG systems.
+
+PageIndex’s hierarchical indexing enabled precise navigation and extraction of relevant content from complex financial reports, such as SEC filings and earnings disclosures.
+
+👉 See full [benchmark results](https://github.com/VectifyAI/Mafin2.5-FinanceBench) for detailed comparisons and performance metrics.
+
+## 📬 Contact Us
+
+Need customized support for your documents or reasoning-based RAG system?
+
+👉 [Contact us here](https://ii2abc2jejf.typeform.com/to/meB40zV0)
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/docs/2023-annual-report.pdf b/docs/2023-annual-report.pdf
new file mode 100644
index 0000000..3dcbd14
Binary files /dev/null and b/docs/2023-annual-report.pdf differ
diff --git a/docs/PRML.pdf b/docs/PRML.pdf
new file mode 100644
index 0000000..af7d777
Binary files /dev/null and b/docs/PRML.pdf differ
diff --git a/docs/Regulation Best Interest_Interpretive release.pdf b/docs/Regulation Best Interest_Interpretive release.pdf
new file mode 100644
index 0000000..ce8e039
Binary files /dev/null and b/docs/Regulation Best Interest_Interpretive release.pdf differ
diff --git a/docs/Regulation Best Interest_proposed rule.pdf b/docs/Regulation Best Interest_proposed rule.pdf
new file mode 100644
index 0000000..4d764b7
Binary files /dev/null and b/docs/Regulation Best Interest_proposed rule.pdf differ
diff --git a/docs/q1-fy25-earnings.pdf b/docs/q1-fy25-earnings.pdf
new file mode 100644
index 0000000..7d4f0d9
Binary files /dev/null and b/docs/q1-fy25-earnings.pdf differ
diff --git a/page_index.py b/page_index.py
new file mode 100644
index 0000000..d010c59
--- /dev/null
+++ b/page_index.py
@@ -0,0 +1,1073 @@
+import os
+import json
+import copy
+import math
+import sys
+import random
+sys.path.append('../..')
+import re
+from utils import *
+from types import SimpleNamespace as config
+from dotenv import load_dotenv
+load_dotenv()
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import argparse
+
+CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")
+
+################### check title in page #########################################################
+def check_title_appearance(item, page_list, start_index=1, model=None):
+    title = item['title']
+    if 'physical_index' not in item or item['physical_index'] is None:
+        return {'list_index': item.get('list_index'), 'answer': 'no', 'title': title, 'page_number': None}
+
+    page_number = item['physical_index']
+    page_text = page_list[page_number-start_index][0]
+
+    prompt = f"""
+    Your job is to check if the given section appears or starts in the given page_text.
+
+    Note: ignore any space inconsistency in the page_text.
+
+    The given section title is {title}.
+    The given page_text is {page_text}.
+
+    Reply format:
+    {{
+        "thinking": <your thinking>,
+        "answer": "yes or no" (yes if the section appears or starts in the page_text, no otherwise)
+    }}
+    Directly return the final JSON structure. Do not output anything else."""
+
+    response = ChatGPT_API(model=model, prompt=prompt, api_key=CHATGPT_API_KEY)
+    response = extract_json(response)
+    if 'answer' in response:
+        answer = response['answer']
+    else:
+        answer = 'no'
+    return {'list_index': item['list_index'], 'answer': answer, 'title': title, 'page_number': page_number}
+
+
+def check_title_appearance_in_start(title, page_text, model=None, logger=None):
+    prompt = f"""
+    You will be given the current section title and the current page_text.
+    Your job is to check if the current section starts at the beginning of the given page_text.
+    If there is other content before the current section title, then the current section does not start at the beginning of the given page_text.
+    If the current section title is the first content in the given page_text, then the current section starts at the beginning of the given page_text.
+
+    Note: do fuzzy matching, ignore any space inconsistency in the page_text.
+
+    The given section title is {title}.
+    The given page_text is {page_text}.
+
+    Reply format:
+    {{
+        "thinking": <your thinking>,
+        "start_begin": "yes or no" (yes if the section starts at the beginning of the page_text, no otherwise)
+    }}
+    Directly return the final JSON structure. Do not output anything else."""
+
+    response = ChatGPT_API(model=model, prompt=prompt, api_key=CHATGPT_API_KEY)
+    response = extract_json(response)
+    if logger:
+        logger.info(f"Response: {response}")
+    if 'start_begin' in response:
+        return response['start_begin']
+    else:
+        return 'no'
+
+
+def check_title_appearance_in_start_parallel(structure, page_list, model=None, logger=None):
+    if logger:
+        logger.info("Checking title appearance in start parallel")
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        future_to_item = {
+            executor.submit(check_title_appearance_in_start, item['title'], page_list[item['physical_index']-1][0], model=model, logger=logger): item
+            for item in structure
+        }
+
+        # Process completed futures and attach results to items
+        for future in as_completed(future_to_item):
+            item = future_to_item[future]
+            try:
+                result = future.result()
+                item['appear_start'] = result
+            except Exception as e:
+                if logger:
+                    logger.error(f"Error processing item {item['title']}: {str(e)}")
+                item['appear_start'] = 'no'
+
+    return structure
+
+
+def toc_detector_single_page(content, model=None):
+    prompt = f"""
+    Your job is to detect if there is a table of contents provided in the given text.
+
+    Given text: {content}
+
+    Return the following JSON format:
+    {{
+        "thinking": <your thinking>,
+        "toc_detected": "<yes or no>"
+    }}
+
+    Directly return the final JSON structure. Do not output anything else.
+    Please note: abstract, summary, notation list, figure list, table list, etc. are not tables of contents."""
+
+    response = ChatGPT_API(model=model, prompt=prompt, api_key=CHATGPT_API_KEY)
+    # print('response', response)
+    json_content = extract_json(response)
+    return json_content['toc_detected']
+
+
+def check_if_toc_extraction_is_complete(content, toc, model=None):
+    prompt = f"""
+    You are given a partial document and a table of contents.
+    Your job is to check if the table of contents is complete, i.e., whether it contains all the main sections in the partial document.
+
+    Reply format:
+    {{
+        "thinking": <your thinking>,
+        "completed": "yes" or "no"
+    }}
+    Directly return the final JSON structure. Do not output anything else."""
+
+    prompt = prompt + '\n Document:\n' + content + '\n Table of contents:\n' + toc
+    response = ChatGPT_API(model=model, prompt=prompt, api_key=CHATGPT_API_KEY)
+    json_content = extract_json(response)
+    return json_content['completed']
+
+
+def check_if_toc_transformation_is_complete(content, toc, model=None):
+    prompt = f"""
+    You are given a raw table of contents and a cleaned table of contents.
+    Your job is to check if the cleaned table of contents is complete.
+
+    Reply format:
+    {{
+        "thinking": <your thinking>,
+        "completed": "yes" or "no"
+    }}
+    Directly return the final JSON structure. Do not output anything else."""
+
+    prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc
+    response = ChatGPT_API(model=model, prompt=prompt, api_key=CHATGPT_API_KEY)
+    json_content = extract_json(response)
+    return json_content['completed']
+
+def extract_toc_content(content, model=None):
+    prompt = f"""
+    Your job is to extract the full table of contents from the given text, replace ... with :
+
+    Given text: {content}
+
+    Directly return the full table of contents content. Do not output anything else."""
+
+    response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, api_key=CHATGPT_API_KEY)
+
+    if_complete = check_if_toc_transformation_is_complete(content, response, model)
+    if if_complete == "yes" and finish_reason == "finished":
+        return response
+
+    chat_history = [
+        {"role": "user", "content": prompt},
+        {"role": "assistant", "content": response},
+    ]
+    prompt = """Please continue the generation of the table of contents. Directly output the remaining part of the structure."""
+    new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, api_key=CHATGPT_API_KEY, chat_history=chat_history)
+    response = response + new_response
+    if_complete = check_if_toc_transformation_is_complete(content, response, model)
+
+    retry_count = 0
+    while not (if_complete == "yes" and finish_reason == "finished"):
+        chat_history = [
+            {"role": "user", "content": prompt},
+            {"role": "assistant", "content": response},
+        ]
+        prompt = """Please continue the generation of the table of contents. Directly output the remaining part of the structure."""
+        new_response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, api_key=CHATGPT_API_KEY, chat_history=chat_history)
+        response = response + new_response
+        if_complete = check_if_toc_transformation_is_complete(content, response, model)
+
+        # Maximum retry limit to prevent infinite loops
+        retry_count += 1
+        if retry_count > 5:
+            raise Exception('Failed to complete table of contents after maximum retries')
+
+    return response
+
+def detect_page_index(toc_content, model=None):
+    print('start detect_page_index')
+    prompt = f"""
+    You will be given a table of contents.
+
+    Your job is to detect if there are page numbers/indices given within the table of contents.
+
+    Given text: {toc_content}
+
+    Reply format:
+    {{
+        "page_index_given_in_toc": "<yes or no>"
+    }}
+    Directly return the final JSON structure. Do not output anything else."""
+
+    response = ChatGPT_API(model=model, prompt=prompt, api_key=CHATGPT_API_KEY)
+    json_content = extract_json(response)
+    return json_content['page_index_given_in_toc']
+
+def toc_extractor(page_list, toc_page_list, model):
+    def transform_dots_to_colon(text):
+        text = re.sub(r'\.{5,}', ': ', text)
+        # Handle dots separated by spaces
+        text = re.sub(r'(?:\. ){5,}\.?', ': ', text)
+        return text
+
+    toc_content = ""
+    for page_index in toc_page_list:
+        toc_content += page_list[page_index][0]
+    toc_content = transform_dots_to_colon(toc_content)
+    has_page_index = detect_page_index(toc_content, model=model)
+
+    return {
+        "toc_content": toc_content,
+        "page_index_given_in_toc": has_page_index
+    }
+
+
+def toc_index_extractor(toc, content, model=None):
+    print('start toc_index_extractor')
+    toc_extractor_prompt = """
+    You are given a table of contents in a JSON format and several pages of a document. Your job is to add the physical_index to the table of contents in the JSON format.
+
+    The provided pages contain tags like <physical_index_X> and <physical_index_X> to indicate the physical location of page X.
+
+    The structure variable is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.
+
+    The response should be in the following JSON format:
+    [
+        {
+            "structure": <structure index, "x.x.x" or None> (string),
+            "title": <title of the section>,
+            "physical_index": "<physical_index_X>" (keep the format)
+        },
+        ...
+    ]
+
+    Only add the physical_index to the sections that are in the provided pages.
+    If the section is not in the provided pages, do not add the physical_index to it.
+    Directly return the final JSON structure. Do not output anything else."""
+
+    prompt = toc_extractor_prompt + '\nTable of contents:\n' + str(toc) + '\nDocument pages:\n' + content
+    response = ChatGPT_API(model=model, prompt=prompt, api_key=CHATGPT_API_KEY)
+    json_content = extract_json(response)
+    return json_content
+
+
+def toc_transformer(toc_content, model=None):
+    print('start toc_transformer')
+    init_prompt = """
+    You are given a table of contents. Your job is to transform the whole table of contents into a JSON format that includes table_of_contents.
+
+    structure is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.
+
+    The response should be in the following JSON format:
+    {
+        "table_of_contents": [
+            {
+                "structure": <structure index, "x.x.x" or None> (string),
+                "title": <title of the section>,
+                "page": <page number or None>
+            },
+            ...
+        ]
+    }
+    You should transform the full table of contents in one go.
+    Directly return the final JSON structure, do not output anything else."""
+
+    prompt = init_prompt + '\nGiven table of contents:\n' + toc_content
+    last_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, api_key=CHATGPT_API_KEY)
+    if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)
+    if if_complete == "yes" and finish_reason == "finished":
+        last_complete = extract_json(last_complete)
+        cleaned_response = convert_page_to_int(last_complete['table_of_contents'])
+        return cleaned_response
+
+    last_complete = get_json_content(last_complete)
+    while not (if_complete == "yes" and finish_reason == "finished"):
+        position = last_complete.rfind('}')
+        if position != -1:
+            last_complete = last_complete[:position+2]
+        prompt = f"""
+        Your task is to continue the table of contents JSON structure; directly output the remaining part of the JSON structure.
+
+        The raw table of contents is:
+        {toc_content}
+
+        The incomplete transformed table of contents JSON structure is:
+        {last_complete}
+
+        Please continue the JSON structure; directly output the remaining part of the JSON structure."""
+
+        new_complete, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, api_key=CHATGPT_API_KEY)
+
+        if new_complete.startswith('```json'):
+            new_complete = get_json_content(new_complete)
+        last_complete = last_complete + new_complete
+
+        if_complete = check_if_toc_transformation_is_complete(toc_content, last_complete, model)
+
+    last_complete = json.loads(last_complete)
+
+    cleaned_response = convert_page_to_int(last_complete['table_of_contents'])
+    return cleaned_response
+
+
+def find_toc_pages(start_page_index, page_list, opt, logger=None):
+    print('start find_toc_pages')
+    last_page_is_yes = False
+    toc_page_list = []
+    i = start_page_index
+
+    while i < len(page_list):
+        # Only check beyond max_pages if we're still finding TOC pages
+        if i >= opt.toc_check_page_num and not last_page_is_yes:
+            break
+        detected_result = toc_detector_single_page(page_list[i][0], model=opt.model)
+        if detected_result == 'yes':
+            if logger:
+                logger.info(f'Page {i} has toc')
+            toc_page_list.append(i)
+            last_page_is_yes = True
+        elif detected_result == 'no' and last_page_is_yes:
+            if logger:
+                logger.info(f'Found the last page with toc: {i-1}')
+            break
+        i += 1
+
+    if not toc_page_list and logger:
+        logger.info('No toc found')
+
+    return toc_page_list
+
+def remove_page_number(data):
+    if isinstance(data, dict):
+        data.pop('page_number', None)
+        for key in list(data.keys()):
+            if 'child_nodes' in key:
+                remove_page_number(data[key])
+    elif isinstance(data, list):
+        for item in data:
+            remove_page_number(item)
+    return data
+
+def extract_matching_page_pairs(toc_page, toc_physical_index, start_page_index):
+    pairs = []
+    for phy_item in toc_physical_index:
+        for page_item in toc_page:
+            if phy_item.get('title') == page_item.get('title'):
+                physical_index = phy_item.get('physical_index')
+                if physical_index is not None and int(physical_index) >= start_page_index:
+                    pairs.append({
+                        'title': phy_item.get('title'),
+                        'page': page_item.get('page'),
+                        'physical_index': physical_index
+                    })
+    return pairs
+
+
+def calculate_page_offset(pairs):
+    differences = []
+    for pair in pairs:
+        try:
+            physical_index = pair['physical_index']
+            page_number = pair['page']
+            difference = physical_index - page_number
+            differences.append(difference)
+        except (KeyError, TypeError):
+            continue
+
+    if not differences:
+        return None
+
+    difference_counts = {}
+    for diff in differences:
+        difference_counts[diff] = difference_counts.get(diff, 0) + 1
+
+    # Use the most common offset between printed page numbers and physical pages
+    most_common = max(difference_counts.items(), key=lambda x: x[1])[0]
+
+    return most_common
+
+def add_page_offset_to_toc_json(data, offset):
+    for i in range(len(data)):
+        if data[i].get('page') is not None and isinstance(data[i]['page'], int):
+            data[i]['physical_index'] = data[i]['page'] + offset
+            del data[i]['page']
+
+    return data
+
+
+def page_list_to_group_text(page_contents, token_lengths, max_tokens=20000, overlap_page=1):
+    num_tokens = sum(token_lengths)
+
+    if num_tokens <= max_tokens:
+        # merge all pages into one text
+        page_text = "".join(page_contents)
+        return [page_text]
+
+    subsets = []
+    current_subset = []
+    current_token_count = 0
+
+    expected_parts_num = math.ceil(num_tokens / max_tokens)
+    average_tokens_per_part = math.ceil(((num_tokens / expected_parts_num) + max_tokens) / 2)
+
+    for i, (page_content, page_tokens) in enumerate(zip(page_contents, token_lengths)):
+        if current_token_count + page_tokens > average_tokens_per_part:
+            subsets.append(''.join(current_subset))
+            # Start new subset from overlap if specified
+            overlap_start = max(i - overlap_page, 0)
+            current_subset = page_contents[overlap_start:i]
+            current_token_count = sum(token_lengths[overlap_start:i])
+
+        # Add current page to the subset
+        current_subset.append(page_content)
+        current_token_count += page_tokens
+
+    # Add the last subset if it contains any pages
+    if current_subset:
+        subsets.append(''.join(current_subset))
+
+    print('divide page_list to groups', len(subsets))
+    return subsets
+
+def add_page_number_to_toc(part, structure, model=None):
+    fill_prompt_seq = """
+    You are given a JSON structure of a document and a partial part of the document. Your task is to check if the title that is described in the structure starts in the given partial document.
+
+    The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of page X.
+
+    If the full target section starts in the given partial document, fill in the given JSON structure with "start": "yes" and "start_index": "<physical_index_X>".
+
+    If the full target section does not start in the given partial document, fill in "start": "no", "start_index": None.
+
+    The response should be in the following format:
+    [
+        {
+            "structure": <structure index, "x.x.x" or None> (string),
+            "title": <title of the section>,
+            "start": "<yes or no>",
+            "physical_index": "<physical_index_X> (keep the format)" or None
+        },
+        ...
+    ]
+    The given structure contains the result of the previous part; you need to fill in the result of the current part. Do not change the previous result.
+    Directly return the final JSON structure. Do not output anything else."""
+
+    prompt = fill_prompt_seq + f"\n\nCurrent Partial Document:\n{part}\n\nGiven Structure\n{json.dumps(structure, indent=2)}\n"
+    current_json_raw = ChatGPT_API(model=model, prompt=prompt, api_key=CHATGPT_API_KEY)
+    json_result = extract_json(current_json_raw)
+
+    for item in json_result:
+        if 'start' in item:
+            del item['start']
+    return json_result
+
+
+def remove_first_physical_index_section(text):
+    """
+    Removes the first section between <physical_index_X> and <physical_index_X> tags,
+    and returns the remaining text.
+    """
+    pattern = r'<physical_index_\d+>.*?<physical_index_\d+>'
+    match = re.search(pattern, text, re.DOTALL)
+    if match:
+        # Remove the first matched section
+        return text.replace(match.group(0), '', 1)
+    return text
+
+### add verify completeness
+def generate_toc_continue(toc_content, part, model="gpt-4o-2024-11-20"):
+    print('start generate_toc_continue')
+    prompt = """
+    You are an expert in extracting hierarchical tree structures.
+    You are given the tree structure of the previous part and the text of the current part.
+    Your task is to continue the tree structure from the previous part to include the current part.
+
+    The structure variable is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.
+
+    For the title, you need to extract the original title from the text; only fix space inconsistencies.
+
+    The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the start and end of page X.
+
+    The response should be in the following format:
+    [
+        {
+            "structure": <structure index, "x.x.x" or None> (string),
+            "title": <title of the section, keep the original title>,
+            "physical_index": "<physical_index_X> (keep the format)" or None
+        },
+        ...
+    ]
+
+    Directly return the additional part of the final JSON structure. Do not output anything else."""
+
+    prompt = prompt + '\nGiven text:\n' + part + '\nPrevious tree structure:\n' + json.dumps(toc_content, indent=2)
+    response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, api_key=CHATGPT_API_KEY)
+    if finish_reason == 'finished':
+        return extract_json(response)
+    else:
+        raise Exception(f'finish reason: {finish_reason}')
+
+### add verify completeness
+def generate_toc_init(part, model=None):
+    print('start generate_toc_init')
+    prompt = """
+    You are an expert in extracting hierarchical tree structures; your task is to generate the tree structure of the document.
+
+    The structure variable is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.
+
+    For the title, you need to extract the original title from the text; only fix space inconsistencies.
+
+    The provided text contains tags like <physical_index_X> and <physical_index_X> to indicate the start and end of page X.
+
+    The response should be in the following format:
+    [
+        {
+            "structure": <structure index, "x.x.x" or None> (string),
+            "title": <title of the section, keep the original title>,
+            "physical_index": "<physical_index_X> (keep the format)" or None
+        },
+        ...
+    ]
+
+    Directly return the final JSON structure.
Do not output anything else.""" + + prompt = prompt + '\nGiven text\n:' + part + response, finish_reason = ChatGPT_API_with_finish_reason(model=model, prompt=prompt, api_key=CHATGPT_API_KEY) + + if finish_reason == 'finished': + return extract_json(response) + else: + raise Exception(f'finish reason: {finish_reason}') + +def process_no_toc(page_list, start_index=1, model=None, logger=None): + page_contents=[] + token_lengths=[] + for page_index in range(start_index, start_index+len(page_list)): + page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n" + page_contents.append(page_text) + token_lengths.append(count_tokens(page_text, model)) + group_texts = page_list_to_group_text(page_contents, token_lengths) + logger.info(f'len(group_texts): {len(group_texts)}') + + toc_with_page_number= generate_toc_init(group_texts[0], model) + for group_text in group_texts[1:]: + toc_with_page_number_additional = generate_toc_continue(toc_with_page_number, group_text, model) + toc_with_page_number.extend(toc_with_page_number_additional) + logger.info(f'generate_toc: {toc_with_page_number}') + + toc_with_page_number = convert_physical_index_to_int(toc_with_page_number) + logger.info(f'convert_physical_index_to_int: {toc_with_page_number}') + + return toc_with_page_number + +def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, model=None, logger=None): + page_contents=[] + token_lengths=[] + toc_content = toc_transformer(toc_content, model) + logger.info(f'toc_transformer: {toc_content}') + for page_index in range(len(page_list)): + page_text = f"<physical_index_{page_index+1}>\n{page_list[page_index][0]}\n<physical_index_{page_index+1}>\n\n" + page_contents.append(page_text) + token_lengths.append(count_tokens(page_text, model)) + + group_texts = page_list_to_group_text(page_contents, token_lengths) + logger.info(f'len(group_texts): {len(group_texts)}') + + toc_with_page_number=copy.deepcopy(toc_content) + for group_text in group_texts: + toc_with_page_number = add_page_number_to_toc(group_text, toc_with_page_number, model) + logger.info(f'add_page_number_to_toc: {toc_with_page_number}') + + toc_with_page_number = convert_physical_index_to_int(toc_with_page_number) + logger.info(f'convert_physical_index_to_int: {toc_with_page_number}') + + return toc_with_page_number + + + +def process_toc_with_page_numbers(toc_content, toc_page_list, page_list, model=None, logger=None): + toc_with_page_number = toc_transformer(toc_content, model) + logger.info(f'toc_with_page_number: {toc_with_page_number}') + + toc_no_page_number = remove_page_number(copy.deepcopy(toc_with_page_number)) + + start_page_index = toc_page_list[-1] + 1 + main_content = "" + for page_index in range(start_page_index, start_page_index + 20): + main_content += f"<physical_index_{page_index+1}>\n{page_list[page_index][0]}\n<physical_index_{page_index+1}>\n\n" + + toc_with_physical_index = toc_index_extractor(toc_no_page_number, main_content, model) + logger.info(f'toc_with_physical_index: {toc_with_physical_index}') + + toc_with_physical_index = convert_physical_index_to_int(toc_with_physical_index) + logger.info(f'toc_with_physical_index: {toc_with_physical_index}') + + matching_pairs = extract_matching_page_pairs(toc_with_page_number, toc_with_physical_index, start_page_index) + logger.info(f'matching_pairs: {matching_pairs}') + + offset = calculate_page_offset(matching_pairs) + logger.info(f'offset: {offset}') + + toc_with_page_number = 
add_page_offset_to_toc_json(toc_with_page_number, offset)
+    logger.info(f'toc_with_page_number: {toc_with_page_number}')
+
+    toc_with_page_number = process_none_page_numbers(toc_with_page_number, page_list, model)
+    logger.info(f'toc_with_page_number: {toc_with_page_number}')
+
+    return toc_with_page_number
+
+
+# Fill in physical_index for TOC items whose page number is missing
+def process_none_page_numbers(toc_items, page_list, model=None):
+    for i, item in enumerate(toc_items):
+        if "physical_index" not in item:
+            # logger.info(f"fix item: {item}")
+            # Find previous physical_index
+            prev_index = 0  # Default if no previous item exists
+            for j in range(i - 1, -1, -1):
+                if toc_items[j].get('physical_index') is not None:
+                    prev_index = toc_items[j]['physical_index'] - 1
+                    break
+
+            # Find next physical_index
+            next_index = -1  # Default if no next item exists
+            for j in range(i + 1, len(toc_items)):
+                if toc_items[j].get('physical_index') is not None:
+                    next_index = toc_items[j]['physical_index']
+                    break
+
+            page_contents = []
+            for page_index in range(prev_index, next_index):
+                page_text = f"<physical_index_{page_index+1}>\n{page_list[page_index][0]}\n<physical_index_{page_index+1}>\n\n"
+                page_contents.append(page_text)
+
+            item_copy = copy.deepcopy(item)
+            del item_copy['page']
+            result = add_page_number_to_toc(''.join(page_contents), item_copy, model)
+            if isinstance(result[0]['physical_index'], str) and result[0]['physical_index'].startswith('<physical_index'):
+                item['physical_index'] = int(result[0]['physical_index'].split('_')[-1].rstrip('>').strip())
+                del item['page']
+
+    return toc_items
+
+
+def check_toc(page_list, opt=None):
+    toc_page_list = find_toc_pages(start_page_index=0, page_list=page_list, opt=opt)
+    if len(toc_page_list) == 0:
+        print('no toc found')
+        return {'toc_content': None, 'toc_page_list': [], 'page_index_given_in_toc': 'no'}
+    else:
+        print('toc found')
+        toc_json = toc_extractor(page_list, toc_page_list, opt.model)
+
+        if toc_json['page_index_given_in_toc'] == 'yes':
+            print('index found')
+            return {'toc_content': toc_json['toc_content'], 'toc_page_list': toc_page_list, 'page_index_given_in_toc': 'yes'}
+        else:
+            current_start_index = toc_page_list[-1] + 1
+
+            while (toc_json['page_index_given_in_toc'] == 'no' and
+                   current_start_index < len(page_list) and
+                   current_start_index < opt.toc_check_page_num):
+
+                additional_toc_pages = find_toc_pages(
+                    start_page_index=current_start_index,
+                    page_list=page_list,
+                    opt=opt
+                )
+
+                if len(additional_toc_pages) == 0:
+                    break
+
+                additional_toc_json = toc_extractor(page_list, additional_toc_pages, opt.model)
+                if additional_toc_json['page_index_given_in_toc'] == 'yes':
+                    print('index found')
+                    return {'toc_content': additional_toc_json['toc_content'], 'toc_page_list': additional_toc_pages, 'page_index_given_in_toc': 'yes'}
+                else:
+                    current_start_index = additional_toc_pages[-1] + 1
+            print('index not found')
+            return {'toc_content': toc_json['toc_content'], 'toc_page_list': toc_page_list, 'page_index_given_in_toc': 'no'}
+
+
+################### fix incorrect toc #########################################################
+def single_toc_item_index_fixer(section_title, content, model="gpt-4o-2024-11-20"):
+    toc_item_fixer_prompt = """
+    You are given a section title and several pages of a document. Your job is to
+    find the physical index of the start page of the section in the partial document.
+
+    The provided pages contain tags like <physical_index_X> and <physical_index_X> to indicate the physical location of page X.
+ + Reply in a JSON format: + { + "thinking": <explain which page, started and closed by <physical_index_X>, contains the start of this section>, + "physical_index": "<physical_index_X>" (keep the format) + } + Directly return the final JSON structure. Do not output anything else.""" + + prompt = tob_extractor_prompt + '\nSection Title:\n' + str(section_title) + '\nDocument pages:\n' + content + response = ChatGPT_API(model=model, prompt=prompt, api_key=CHATGPT_API_KEY) + json_content = extract_json(response) + return convert_physical_index_to_int(json_content['physical_index']) + + + +def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, start_index=1, model=None, logger=None): + print(f'start fix_incorrect_toc with {len(incorrect_results)} incorrect results') + incorrect_indices = {result['list_index'] for result in incorrect_results} + + end_index = len(page_list) + start_index - 1 + + incorrect_results_and_range_logs = [] + # Helper function to process and check a single incorrect item + def process_and_check_item(incorrect_item): + list_index = incorrect_item['list_index'] + # Find the previous correct item + prev_correct = None + for i in range(list_index-1, -1, -1): + if i not in incorrect_indices: + prev_correct = toc_with_page_number[i]['physical_index'] + break + # If no previous correct item found, use start_index + if prev_correct is None: + prev_correct = start_index - 1 + + # Find the next correct item + next_correct = None + for i in range(list_index+1, len(toc_with_page_number)): + if i not in incorrect_indices: + next_correct = toc_with_page_number[i]['physical_index'] + break + # If no next correct item found, use end_index + if next_correct is None: + next_correct = end_index + + incorrect_results_and_range_logs.append({ + 'list_index': list_index, + 'title': incorrect_item['title'], + 'prev_correct': prev_correct, + 'next_correct': next_correct + }) + + page_contents=[] + for page_index in range(prev_correct, next_correct+1): + page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index-start_index+1}>\n\n" + page_contents.append(page_text) + content_range = ''.join(page_contents) + + physical_index = single_toc_item_index_fixer(incorrect_item['title'], content_range, model) + + # Convert to int for checking + physical_index_int = convert_physical_index_to_int(physical_index) + + # Check if the result is correct + check_item = incorrect_item.copy() + check_item['physical_index'] = physical_index_int + check_result = check_title_appearance(check_item, page_list, start_index, model) + + return { + 'list_index': list_index, + 'title': incorrect_item['title'], + 'physical_index': physical_index_int, + 'is_valid': check_result['answer'] == 'yes' + } + + + results = [] + with ThreadPoolExecutor() as executor: + future_to_item = {executor.submit(process_and_check_item, item): item for item in incorrect_results} + for future in as_completed(future_to_item): + item = future_to_item[future] + + try: + result = future.result() + results.append(result) + except Exception as exc: + print(f"Processing item {item} generated an exception: {exc}") + + # Update the toc_with_page_number with the fixed indices and check for any invalid results + invalid_results = [] + for result in results: + if result['is_valid']: + toc_with_page_number[result['list_index']]['physical_index'] = result['physical_index'] + else: + invalid_results.append({ + 'list_index': result['list_index'], + 'title': result['title'], + 
'physical_index': result['physical_index'], + }) + + logger.info(f'incorrect_results_and_range_logs: {incorrect_results_and_range_logs}') + logger.info(f'invalid_results: {invalid_results}') + + return toc_with_page_number, invalid_results + + + +def fix_incorrect_toc_with_retries(toc_with_page_number, page_list, incorrect_results, start_index=1, max_attempts=3, model=None, logger=None): + print('start fix_incorrect_toc') + fix_attempt = 0 + current_toc = toc_with_page_number + current_incorrect = incorrect_results + + while current_incorrect: + print(f"Fixing {len(current_incorrect)} incorrect results") + + current_toc, current_incorrect = fix_incorrect_toc(current_toc, page_list, current_incorrect, start_index, model, logger) + + logger.info({'current_toc': current_toc}) + + fix_attempt += 1 + if fix_attempt >= max_attempts: + logger.info("Maximum fix attempts reached") + break + + return current_toc, current_incorrect + + + + +################### verify toc ######################################################### +def verify_toc(page_list, list_result, start_index=1, N=None, model=None): + print('start verify_toc') + # Find the last non-None physical_index + last_physical_index = None + for item in reversed(list_result): + if item.get('physical_index') is not None: + last_physical_index = item['physical_index'] + break + + # Early return if we don't have valid physical indices + if last_physical_index is None or last_physical_index < len(page_list)/2: + return 0, [] + + # Determine which items to check + if N is None: + print('check all items') + sample_indices = range(0, len(list_result)) + else: + N = min(N, len(list_result)) + print(f'check {N} items') + sample_indices = random.sample(range(0, len(list_result)), N) + + # Prepare items with their list indices + indexed_sample_list = [] + for idx in sample_indices: + item = list_result[idx] + item_with_index = item.copy() + item_with_index['list_index'] = idx # Add the original index in list_result + indexed_sample_list.append(item_with_index) + + # Run checks in parallel + results = [] + with ThreadPoolExecutor(max_workers=10) as executor: + future_to_item = { + executor.submit(check_title_appearance, item, page_list, start_index, model): item + for item in indexed_sample_list + } + + for future in as_completed(future_to_item): + results.append(future.result()) + + # Process results + correct_count = 0 + incorrect_results = [] + for result in results: + if result['answer'] == 'yes': + correct_count += 1 + else: + incorrect_results.append(result) + + # Calculate accuracy + checked_count = len(results) + accuracy = correct_count / checked_count if checked_count > 0 else 0 + print(f"accuracy: {accuracy*100:.2f}%") + return accuracy, incorrect_results + + + + + +################### main process ######################################################### +def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=None, start_index=1, opt=None, logger=None): + print(mode) + print(f'start_index: {start_index}') + + if mode == 'process_toc_with_page_numbers': + toc_with_page_number = process_toc_with_page_numbers(toc_content, toc_page_list, page_list, model=opt.model, logger=logger) + elif mode == 'process_toc_no_page_numbers': + toc_with_page_number = process_toc_no_page_numbers(toc_content, toc_page_list, page_list, model=opt.model, logger=logger) + else: + toc_with_page_number = process_no_toc(page_list, start_index=start_index, model=opt.model, logger=logger) + + toc_with_page_number = [item for item in toc_with_page_number 
if item.get('physical_index') is not None] + accuracy, incorrect_results = verify_toc(page_list, toc_with_page_number, start_index=start_index, model=opt.model) + + logger.info({ + 'mode': 'process_toc_with_page_numbers', + 'accuracy': accuracy, + 'incorrect_results': incorrect_results + }) + if accuracy == 1.0 and len(incorrect_results) == 0: + return toc_with_page_number + if accuracy > 0.6 and len(incorrect_results) > 0: + toc_with_page_number, incorrect_results = fix_incorrect_toc_with_retries(toc_with_page_number, page_list, incorrect_results,start_index=start_index, max_attempts=3, model=opt.model, logger=logger) + return toc_with_page_number + else: + if mode == 'process_toc_with_page_numbers': + return meta_processor(page_list, mode='process_toc_no_page_numbers', toc_content=toc_content, toc_page_list=toc_page_list, start_index=start_index, opt=opt, logger=logger) + elif mode == 'process_toc_no_page_numbers': + return meta_processor(page_list, mode='process_no_toc', start_index=start_index, opt=opt, logger=logger) + else: + raise Exception('Processing failed') + + +def process_large_node_recursively(node, page_list, opt=None, logger=None): + node_page_list = page_list[node['start_index']-1:node['end_index']-1] + token_num = sum([page[1] for page in node_page_list]) + + if node['end_index'] - node['start_index'] > opt.max_page_num_each_node and token_num >= opt.max_token_num_each_node: + print('large node:', node['title'], 'start_index:', node['start_index'], 'end_index:', node['end_index'], 'token_num:', token_num) + + node_toc_tree = meta_processor(node_page_list, mode='process_no_toc', start_index=node['start_index'], opt=opt, logger=logger) + node_toc_tree = check_title_appearance_in_start_parallel(node_toc_tree, page_list, model=opt.model, logger=logger) + + if node['title'].strip() == node_toc_tree[0]['title'].strip(): + node['child_nodes'] = post_processing(node_toc_tree[1:], node['end_index']) + node['end_index'] = node_toc_tree[1]['start_index'] + else: + node['child_nodes'] = post_processing(node_toc_tree, node['end_index']) + node['end_index'] = node_toc_tree[0]['start_index'] + + if 'child_nodes' in node and node['child_nodes']: + for child_node in node['child_nodes']: + process_large_node_recursively(child_node, page_list, opt, logger=logger) + + return node + +def tree_parser(page_list, opt, logger=None): + check_toc_result = check_toc(page_list, opt) + logger.info(check_toc_result) + + if check_toc_result['toc_content'] is None: + toc_with_page_number = meta_processor( + page_list, + mode='process_no_toc', + start_index=1, + opt=opt, + logger=logger) + else: + if check_toc_result['page_index_given_in_toc'] == 'yes': + toc_with_page_number = meta_processor( + page_list, + mode='process_toc_with_page_numbers', + start_index=1, + toc_content=check_toc_result['toc_content'], + toc_page_list=check_toc_result['toc_page_list'], + opt=opt, + logger=logger) + else: + toc_with_page_number = meta_processor( + page_list, + mode='process_toc_no_page_numbers', + start_index=1, + toc_content=check_toc_result['toc_content'], + toc_page_list=check_toc_result['toc_page_list'], + opt=opt, + logger=logger) + + toc_with_page_number = add_preface_if_needed(toc_with_page_number) + toc_with_page_number = check_title_appearance_in_start_parallel(toc_with_page_number, page_list, model=opt.model, logger=logger) + toc_tree = post_processing(toc_with_page_number, len(page_list)) + for node in toc_tree: + process_large_node_recursively(node, page_list, opt, logger=logger) + + return toc_tree + + 
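+# Pipeline overview (summary of the functions above):
+#   1. check_toc scans the opening pages for a table of contents.
+#   2. meta_processor builds a TOC with physical page indices, falling back from
+#      'process_toc_with_page_numbers' to 'process_toc_no_page_numbers' to
+#      'process_no_toc' when verification accuracy is too low.
+#   3. verify_toc spot-checks section titles against their pages, and
+#      fix_incorrect_toc_with_retries repairs any bad indices.
+#   4. post_processing and process_large_node_recursively turn the flat TOC
+#      into a tree and split oversized nodes.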
+def page_index_main(doc, opt=None): + logger = JsonLogger(doc) + + is_valid_pdf = ( + (isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or + isinstance(doc, BytesIO) + ) + if not is_valid_pdf: + raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.") + + print('Parsing PDF...') + page_list = get_page_tokens(doc) + logger.info({'total_page_number': len(page_list)}) + logger.info({'total_token': sum([page[1] for page in page_list])}) + + structure = tree_parser(page_list, opt, logger=logger) + return structure + + + +if __name__ == "__main__": + # Set up argument parser + parser = argparse.ArgumentParser(description='Process PDF document and generate structure') + parser.add_argument('--pdf_path', type=str, help='Path to the PDF file') + parser.add_argument('--model', type=str, default='gpt-4o-2024-11-20', help='Model to use') + parser.add_argument('--toc-check-pages', type=int, default=20, + help='Number of pages to check for table of contents') + parser.add_argument('--max-pages-per-node', type=int, default=10, + help='Maximum number of pages per node') + parser.add_argument('--max-tokens-per-node', type=int, default=20000, + help='Maximum number of tokens per node') + + args = parser.parse_args() + + # Configure options + opt = config( + model=args.model, + toc_check_page_num=args.toc_check_pages, + max_page_num_each_node=args.max_pages_per_node, + max_token_num_each_node=args.max_tokens_per_node, + ) + + # Process the PDF + toc_with_page_number = page_index_main(args.pdf_path, opt) + print('Parsing done, saving to file...') + + # Save results + pdf_name = os.path.splitext(os.path.basename(args.pdf_path))[0] + os.makedirs('./results', exist_ok=True) + + with open(f'./results/{pdf_name}_structure.json', 'w', encoding='utf-8') as f: + json.dump(toc_with_page_number, f, indent=2) + + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..0a8869e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +openai==1.70.0 +pymupdf==1.25.5 +PyPDF2==3.0.1 +python-dotenv==1.1.0 +tiktoken==0.7.0 diff --git a/results/2023-annual-report_structure.json b/results/2023-annual-report_structure.json new file mode 100644 index 0000000..208cef2 --- /dev/null +++ b/results/2023-annual-report_structure.json @@ -0,0 +1,460 @@ +[ + { + "title": "Preface", + "start_index": 1, + "end_index": 4 + }, + { + "title": "About the Federal Reserve", + "start_index": 5, + "end_index": 7 + }, + { + "title": "Overview", + "start_index": 7, + "end_index": 8 + }, + { + "title": "Monetary Policy and Economic Developments", + "start_index": 9, + "end_index": 9, + "child_nodes": [ + { + "title": "March 2024 Summary", + "start_index": 9, + "end_index": 14 + }, + { + "title": "June 2023 Summary", + "start_index": 15, + "end_index": 20 + } + ] + }, + { + "title": "Financial Stability", + "start_index": 21, + "end_index": 21, + "child_nodes": [ + { + "title": "Monitoring Financial Vulnerabilities", + "start_index": 22, + "end_index": 28 + }, + { + "title": "Domestic and International Cooperation and Coordination", + "start_index": 28, + "end_index": 31 + } + ] + }, + { + "title": "Supervision and Regulation", + "start_index": 31, + "end_index": 31, + "child_nodes": [ + { + "title": "Supervised and Regulated Institutions", + "start_index": 32, + "end_index": 35 + }, + { + "title": "Supervisory Developments", + "start_index": 35, + "end_index": 54 + }, + { + "title": "Regulatory Developments", + "start_index": 55, + 
"end_index": 59 + } + ] + }, + { + "title": "Payment System and Reserve Bank Oversight", + "start_index": 59, + "end_index": 59, + "child_nodes": [ + { + "title": "Payment Services to Depository and Other Institutions", + "start_index": 60, + "end_index": 65 + }, + { + "title": "Currency and Coin", + "start_index": 66, + "end_index": 68 + }, + { + "title": "Fiscal Agency and Government Depository Services", + "start_index": 69, + "end_index": 72 + }, + { + "title": "Evolutions and Improvements to the System", + "start_index": 72, + "end_index": 75 + }, + { + "title": "Oversight of Federal Reserve Banks", + "start_index": 75, + "end_index": 81 + }, + { + "title": "Pro Forma Financial Statements for Federal Reserve Priced Services", + "start_index": 82, + "end_index": 88 + } + ] + }, + { + "title": "Consumer and Community Affairs", + "start_index": 89, + "end_index": 89, + "child_nodes": [ + { + "title": "Consumer Compliance Supervision", + "start_index": 89, + "end_index": 101 + }, + { + "title": "Consumer Laws and Regulations", + "start_index": 101, + "end_index": 102 + }, + { + "title": "Consumer Research and Analysis of Emerging Issues and Policy", + "start_index": 102, + "end_index": 105 + }, + { + "title": "Community Development", + "start_index": 105, + "end_index": 106 + } + ] + }, + { + "title": "Appendixes", + "start_index": 107, + "end_index": 108 + }, + { + "title": "Federal Reserve System Organization", + "start_index": 109, + "end_index": 109, + "child_nodes": [ + { + "title": "Board of Governors", + "start_index": 109, + "end_index": 116 + }, + { + "title": "Federal Open Market Committee", + "start_index": 117, + "end_index": 118 + }, + { + "title": "Board of Governors Advisory Councils", + "start_index": 119, + "end_index": 122 + }, + { + "title": "Federal Reserve Banks and Branches", + "start_index": 123, + "end_index": 146 + } + ] + }, + { + "title": "Minutes of Federal Open Market Committee Meetings", + "start_index": 147, + "end_index": 147, + "child_nodes": [ + { + "title": "Meeting Minutes", + "start_index": 147, + "end_index": 149 + } + ] + }, + { + "title": "Federal Reserve System Audits", + "start_index": 149, + "end_index": 149, + "child_nodes": [ + { + "title": "Office of Inspector General Activities", + "start_index": 149, + "end_index": 151 + }, + { + "title": "Government Accountability Office Reviews", + "start_index": 151, + "end_index": 153 + } + ] + }, + { + "title": "Federal Reserve System Budgets", + "start_index": 153, + "end_index": 153, + "child_nodes": [ + { + "title": "System Budgets Overview", + "start_index": 153, + "end_index": 157 + }, + { + "title": "Board of Governors Budgets", + "start_index": 157, + "end_index": 163 + }, + { + "title": "Federal Reserve Banks Budgets", + "start_index": 163, + "end_index": 169 + }, + { + "title": "Currency Budget", + "start_index": 169, + "end_index": 174 + } + ] + }, + { + "title": "Record of Policy Actions of the Board of Governors", + "start_index": 175, + "end_index": 175, + "child_nodes": [ + { + "title": "Rules and Regulations", + "start_index": 175, + "end_index": 176 + }, + { + "title": "Policy Statements and Other Actions", + "start_index": 177, + "end_index": 181 + }, + { + "title": "Discount Rates for Depository Institutions in 2023", + "start_index": 181, + "end_index": 183 + }, + { + "title": "The Board of Governors and the Government Performance and Results Act", + "start_index": 184, + "end_index": 184 + } + ] + }, + { + "title": "Litigation", + "start_index": 185, + "end_index": 185, + 
"child_nodes": [ + { + "title": "Pending", + "start_index": 185, + "end_index": 186 + }, + { + "title": "Resolved", + "start_index": 186, + "end_index": 186 + } + ] + }, + { + "title": "Statistical Tables", + "start_index": 187, + "end_index": 187, + "child_nodes": [ + { + "title": "Federal Reserve open market transactions, 2023", + "start_index": 187, + "end_index": 187, + "child_nodes": [ + { + "title": "Federal Reserve open market transactions, 2023\u2014continued", + "start_index": 187, + "end_index": 188 + } + ] + }, + { + "title": "Federal Reserve Bank holdings of U.S. Treasury and federal agency securities, December 31, 2021\u201323", + "start_index": 189, + "end_index": 188, + "child_nodes": [ + { + "title": "Federal Reserve Bank holdings of U.S. Treasury and federal agency securities, December 31, 2021\u201323\u2014continued", + "start_index": 189, + "end_index": 190 + } + ] + }, + { + "title": "Reserve requirements of depository institutions, December 31, 2023", + "start_index": 191, + "end_index": 191 + }, + { + "title": "Banking offices and banks affiliated with bank holding companies in the United States, December 31, 2022 and 2023", + "start_index": 192, + "end_index": 192 + }, + { + "title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1984\u20132023 and month-end 2023", + "start_index": 193, + "end_index": 194, + "child_nodes": [ + { + "title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1984\u20132023 and month-end 2023\u2014continued", + "start_index": 194, + "end_index": 194 + }, + { + "title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1984\u20132023 and month-end 2023\u2014continued", + "start_index": 195, + "end_index": 196 + }, + { + "title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1984\u20132023 and month-end 2023\u2014continued", + "start_index": 196, + "end_index": 196 + } + ] + }, + { + "title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1918\u20131983", + "start_index": 197, + "end_index": 198, + "child_nodes": [ + { + "title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1918\u20131983\u2014continued", + "start_index": 199, + "end_index": 198 + }, + { + "title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1918\u20131983\u2014continued", + "start_index": 199, + "end_index": 198 + }, + { + "title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1918\u20131983\u2014continued", + "start_index": 199, + "end_index": 200 + } + ] + }, + { + "title": "Principal assets and liabilities of insured commercial banks, by class of bank, June 30, 2023 and 2022", + "start_index": 201, + "end_index": 201 + }, + { + "title": "Initial margin requirements under Regulations T, U, and X", + "start_index": 202, + "end_index": 203 + }, + { + "title": "Statement of condition of the Federal Reserve Banks, by Bank, December 31, 2023 and 2022", + "start_index": 203, + "end_index": 206, + "child_nodes": [ + { + "title": "Statement of condition of the Federal Reserve Banks, by Bank, December 31, 2023 and 2022\u2014continued", + "start_index": 206, + "end_index": 206 + }, + { + "title": "Statement of condition of the Federal Reserve Banks, by Bank, December 31, 2023 and 2022\u2014continued", 
+ "start_index": 206, + "end_index": 206 + }, + { + "title": "Statement of condition of the Federal Reserve Banks, by Bank, December 31, 2023 and 2022\u2014continued", + "start_index": 206, + "end_index": 206 + }, + { + "title": "Statement of condition of the Federal Reserve Banks, by Bank, December 31, 2023 and 2022\u2014continued", + "start_index": 206, + "end_index": 206 + }, + { + "title": "Statement of condition of the Federal Reserve Banks, by Bank, December 31, 2023 and 2022\u2014continued", + "start_index": 206, + "end_index": 209 + } + ] + }, + { + "title": "Statement of condition of the Federal Reserve Banks, December 31, 2023 and 2022", + "start_index": 209, + "end_index": 210 + }, + { + "title": "Income and expenses of the Federal Reserve Banks, by Bank, 2023", + "start_index": 210, + "end_index": 211, + "child_nodes": [ + { + "title": "Income and expenses of the Federal Reserve Banks, by Bank, 2023\u2014continued", + "start_index": 211, + "end_index": 212 + }, + { + "title": "Income and expenses of the Federal Reserve Banks, by Bank, 2023\u2014continued", + "start_index": 212, + "end_index": 212 + }, + { + "title": "Income and expenses of the Federal Reserve Banks, by Bank, 2023\u2014continued", + "start_index": 212, + "end_index": 214 + } + ] + }, + { + "title": "Income and expenses of the Federal Reserve Banks, 1914\u20132023", + "start_index": 214, + "end_index": 214, + "child_nodes": [ + { + "title": "Income and expenses of the Federal Reserve Banks, 1914\u20132023\u2014continued", + "start_index": 214, + "end_index": 214 + }, + { + "title": "Income and expenses of the Federal Reserve Banks, 1914\u20132023\u2014continued", + "start_index": 214, + "end_index": 217 + }, + { + "title": "Income and expenses of the Federal Reserve Banks, 1914\u20132023\u2014continued", + "start_index": 217, + "end_index": 217 + } + ] + }, + { + "title": "Operations in principal departments of the Federal Reserve Banks, 2020\u201323", + "start_index": 218, + "end_index": 218 + }, + { + "title": "Number and annual salaries of officers and employees of the Federal Reserve Banks, December 31, 2023", + "start_index": 219, + "end_index": 220 + }, + { + "title": "Acquisition costs and net book value of the premises of the Federal Reserve Banks and Branches, December 31, 2023", + "start_index": 220, + "end_index": 222 + } + ] + } +] \ No newline at end of file diff --git a/results/PRML_structure.json b/results/PRML_structure.json new file mode 100644 index 0000000..a19fea3 --- /dev/null +++ b/results/PRML_structure.json @@ -0,0 +1,1558 @@ +[ + { + "title": "Preface", + "start_index": 1, + "end_index": 6 + }, + { + "title": "Preface", + "start_index": 7, + "end_index": 10 + }, + { + "title": "Mathematical notation", + "start_index": 11, + "end_index": 13 + }, + { + "title": "Contents", + "start_index": 13, + "end_index": 20 + }, + { + "title": "Introduction", + "start_index": 21, + "end_index": 24, + "child_nodes": [ + { + "title": "Example: Polynomial Curve Fitting", + "start_index": 24, + "end_index": 32 + }, + { + "title": "Probability Theory", + "start_index": 32, + "end_index": 37, + "child_nodes": [ + { + "title": "Probability densities", + "start_index": 37, + "end_index": 39 + }, + { + "title": "Expectations and covariances", + "start_index": 39, + "end_index": 41 + }, + { + "title": "Bayesian probabilities", + "start_index": 41, + "end_index": 44 + }, + { + "title": "The Gaussian distribution", + "start_index": 44, + "end_index": 48 + }, + { + "title": "Curve fitting re-visited", + 
"start_index": 48, + "end_index": 50 + }, + { + "title": "Bayesian curve fitting", + "start_index": 50, + "end_index": 52 + } + ] + }, + { + "title": "Model Selection", + "start_index": 52, + "end_index": 53 + }, + { + "title": "The Curse of Dimensionality", + "start_index": 53, + "end_index": 58 + }, + { + "title": "Decision Theory", + "start_index": 58, + "end_index": 59, + "child_nodes": [ + { + "title": "Minimizing the misclassification rate", + "start_index": 59, + "end_index": 61 + }, + { + "title": "Minimizing the expected loss", + "start_index": 61, + "end_index": 62 + }, + { + "title": "The reject option", + "start_index": 62, + "end_index": 62 + }, + { + "title": "Inference and decision", + "start_index": 62, + "end_index": 66 + }, + { + "title": "Loss functions for regression", + "start_index": 66, + "end_index": 68 + } + ] + }, + { + "title": "Information Theory", + "start_index": 68, + "end_index": 75, + "child_nodes": [ + { + "title": "Relative entropy and mutual information", + "start_index": 75, + "end_index": 78 + } + ] + } + ] + }, + { + "title": "Exercises", + "start_index": 78, + "end_index": 87 + }, + { + "title": "Probability Distributions", + "start_index": 87, + "end_index": 88, + "child_nodes": [ + { + "title": "Binary Variables", + "start_index": 88, + "end_index": 91, + "child_nodes": [ + { + "title": "The beta distribution", + "start_index": 91, + "end_index": 94 + } + ] + }, + { + "title": "Multinomial Variables", + "start_index": 94, + "end_index": 96, + "child_nodes": [ + { + "title": "The Dirichlet distribution", + "start_index": 96, + "end_index": 98 + } + ] + }, + { + "title": "The Gaussian Distribution", + "start_index": 98, + "end_index": 105, + "child_nodes": [ + { + "title": "Conditional Gaussian distributions", + "start_index": 105, + "end_index": 108 + }, + { + "title": "Marginal Gaussian distributions", + "start_index": 108, + "end_index": 110 + }, + { + "title": "Bayes\u2019 theorem for Gaussian variables", + "start_index": 110, + "end_index": 113 + }, + { + "title": "Maximum likelihood for the Gaussian", + "start_index": 113, + "end_index": 114 + }, + { + "title": "Sequential estimation", + "start_index": 114, + "end_index": 117 + }, + { + "title": "Bayesian inference for the Gaussian", + "start_index": 117, + "end_index": 122 + }, + { + "title": "Student\u2019s t-distribution", + "start_index": 122, + "end_index": 125 + }, + { + "title": "Periodic variables", + "start_index": 125, + "end_index": 130 + }, + { + "title": "Mixtures of Gaussians", + "start_index": 130, + "end_index": 133 + } + ] + }, + { + "title": "The Exponential Family", + "start_index": 133, + "end_index": 136, + "child_nodes": [ + { + "title": "Maximum likelihood and sufficient statistics", + "start_index": 136, + "end_index": 137 + }, + { + "title": "Conjugate priors", + "start_index": 137, + "end_index": 137 + }, + { + "title": "Noninformative priors", + "start_index": 137, + "end_index": 140 + } + ] + }, + { + "title": "Nonparametric Methods", + "start_index": 140, + "end_index": 142, + "child_nodes": [ + { + "title": "Kernel density estimators", + "start_index": 142, + "end_index": 144 + }, + { + "title": "Nearest-neighbour methods", + "start_index": 144, + "end_index": 147 + } + ] + } + ] + }, + { + "title": "Exercises", + "start_index": 147, + "end_index": 156 + }, + { + "title": "Linear Models for Regression", + "start_index": 157, + "end_index": 158, + "child_nodes": [ + { + "title": "Linear Basis Function Models", + "start_index": 158, + "end_index": 160, + 
"child_nodes": [ + { + "title": "Maximum likelihood and least squares", + "start_index": 160, + "end_index": 163 + }, + { + "title": "Geometry of least squares", + "start_index": 163, + "end_index": 163 + }, + { + "title": "Sequential learning", + "start_index": 163, + "end_index": 164 + }, + { + "title": "Regularized least squares", + "start_index": 164, + "end_index": 166 + }, + { + "title": "Multiple outputs", + "start_index": 166, + "end_index": 167 + } + ] + }, + { + "title": "The Bias-Variance Decomposition", + "start_index": 167, + "end_index": 172 + }, + { + "title": "Bayesian Linear Regression", + "start_index": 172, + "end_index": 172, + "child_nodes": [ + { + "title": "Parameter distribution", + "start_index": 172, + "end_index": 176 + }, + { + "title": "Predictive distribution", + "start_index": 176, + "end_index": 179 + }, + { + "title": "Equivalent kernel", + "start_index": 179, + "end_index": 181 + } + ] + }, + { + "title": "Bayesian Model Comparison", + "start_index": 181, + "end_index": 185 + }, + { + "title": "The Evidence Approximation", + "start_index": 185, + "end_index": 186, + "child_nodes": [ + { + "title": "Evaluation of the evidence function", + "start_index": 186, + "end_index": 188 + }, + { + "title": "Maximizing the evidence function", + "start_index": 188, + "end_index": 190 + }, + { + "title": "Effective number of parameters", + "start_index": 190, + "end_index": 192 + } + ] + }, + { + "title": "Limitations of Fixed Basis Functions", + "start_index": 192, + "end_index": 193 + } + ] + }, + { + "title": "Exercises", + "start_index": 193, + "end_index": 198 + }, + { + "title": "Linear Models for Classification", + "start_index": 199, + "end_index": 201, + "child_nodes": [ + { + "title": "Discriminant Functions", + "start_index": 201, + "end_index": 201, + "child_nodes": [ + { + "title": "Two classes", + "start_index": 201, + "end_index": 202 + }, + { + "title": "Multiple classes", + "start_index": 202, + "end_index": 204 + }, + { + "title": "Least squares for classification", + "start_index": 204, + "end_index": 206 + }, + { + "title": "Fisher\u2019s linear discriminant", + "start_index": 206, + "end_index": 209 + }, + { + "title": "Relation to least squares", + "start_index": 209, + "end_index": 211 + }, + { + "title": "Fisher\u2019s discriminant for multiple classes", + "start_index": 211, + "end_index": 212 + }, + { + "title": "The perceptron algorithm", + "start_index": 212, + "end_index": 216 + } + ] + }, + { + "title": "Probabilistic Generative Models", + "start_index": 216, + "end_index": 218, + "child_nodes": [ + { + "title": "Continuous inputs", + "start_index": 218, + "end_index": 220 + }, + { + "title": "Maximum likelihood solution", + "start_index": 220, + "end_index": 222 + }, + { + "title": "Discrete features", + "start_index": 222, + "end_index": 222 + }, + { + "title": "Exponential family", + "start_index": 222, + "end_index": 223 + } + ] + }, + { + "title": "Probabilistic Discriminative Models", + "start_index": 223, + "end_index": 224, + "child_nodes": [ + { + "title": "Fixed basis functions", + "start_index": 224, + "end_index": 225 + }, + { + "title": "Logistic regression", + "start_index": 225, + "end_index": 227 + }, + { + "title": "Iterative reweighted least squares", + "start_index": 227, + "end_index": 229 + }, + { + "title": "Multiclass logistic regression", + "start_index": 229, + "end_index": 230 + }, + { + "title": "Probit regression", + "start_index": 230, + "end_index": 232 + }, + { + "title": "Canonical link functions", + 
"start_index": 232, + "end_index": 232 + } + ] + }, + { + "title": "The Laplace Approximation", + "start_index": 233, + "end_index": 236, + "child_nodes": [ + { + "title": "Model comparison and BIC", + "start_index": 236, + "end_index": 237 + } + ] + }, + { + "title": "Bayesian Logistic Regression", + "start_index": 237, + "end_index": 237, + "child_nodes": [ + { + "title": "Laplace approximation", + "start_index": 237, + "end_index": 238 + }, + { + "title": "Predictive distribution", + "start_index": 238, + "end_index": 240 + } + ] + } + ] + }, + { + "title": "Exercises", + "start_index": 240, + "end_index": 245 + }, + { + "title": "Neural Networks", + "start_index": 245, + "end_index": 247, + "child_nodes": [ + { + "title": "Feed-forward Network Functions", + "start_index": 247, + "end_index": 251, + "child_nodes": [ + { + "title": "Weight-space symmetries", + "start_index": 251, + "end_index": 252 + } + ] + }, + { + "title": "Network Training", + "start_index": 252, + "end_index": 256, + "child_nodes": [ + { + "title": "Parameter optimization", + "start_index": 256, + "end_index": 257 + }, + { + "title": "Local quadratic approximation", + "start_index": 257, + "end_index": 259 + }, + { + "title": "Use of gradient information", + "start_index": 259, + "end_index": 260 + }, + { + "title": "Gradient descent optimization", + "start_index": 260, + "end_index": 261 + } + ] + }, + { + "title": "Error Backpropagation", + "start_index": 261, + "end_index": 262, + "child_nodes": [ + { + "title": "Evaluation of error-function derivatives", + "start_index": 262, + "end_index": 265 + }, + { + "title": "A simple example", + "start_index": 265, + "end_index": 266 + }, + { + "title": "Efficiency of backpropagation", + "start_index": 266, + "end_index": 267 + }, + { + "title": "The Jacobian matrix", + "start_index": 267, + "end_index": 269 + } + ] + }, + { + "title": "The Hessian Matrix", + "start_index": 269, + "end_index": 270, + "child_nodes": [ + { + "title": "Diagonal approximation", + "start_index": 270, + "end_index": 271 + }, + { + "title": "Outer product approximation", + "start_index": 271, + "end_index": 272 + }, + { + "title": "Inverse Hessian", + "start_index": 272, + "end_index": 272 + }, + { + "title": "Finite differences", + "start_index": 272, + "end_index": 273 + }, + { + "title": "Exact evaluation of the Hessian", + "start_index": 273, + "end_index": 274 + }, + { + "title": "Fast multiplication by the Hessian", + "start_index": 274, + "end_index": 276 + } + ] + }, + { + "title": "Regularization in Neural Networks", + "start_index": 276, + "end_index": 277, + "child_nodes": [ + { + "title": "Consistent Gaussian priors", + "start_index": 277, + "end_index": 279 + }, + { + "title": "Early stopping", + "start_index": 279, + "end_index": 281 + }, + { + "title": "Invariances", + "start_index": 281, + "end_index": 283 + }, + { + "title": "Tangent propagation", + "start_index": 283, + "end_index": 285 + }, + { + "title": "Training with transformed data", + "start_index": 285, + "end_index": 287 + }, + { + "title": "Convolutional networks", + "start_index": 287, + "end_index": 289 + }, + { + "title": "Soft weight sharing", + "start_index": 289, + "end_index": 292 + } + ] + }, + { + "title": "Mixture Density Networks", + "start_index": 292, + "end_index": 297 + }, + { + "title": "Bayesian Neural Networks", + "start_index": 297, + "end_index": 298, + "child_nodes": [ + { + "title": "Posterior parameter distribution", + "start_index": 298, + "end_index": 300 + }, + { + "title": "Hyperparameter 
optimization", + "start_index": 300, + "end_index": 301 + }, + { + "title": "Bayesian neural networks for classification", + "start_index": 301, + "end_index": 304 + } + ] + } + ] + }, + { + "title": "Exercises", + "start_index": 304, + "end_index": 311 + }, + { + "title": "Kernel Methods", + "start_index": 311, + "end_index": 313, + "child_nodes": [ + { + "title": "Dual Representations", + "start_index": 313, + "end_index": 314 + }, + { + "title": "Constructing Kernels", + "start_index": 314, + "end_index": 319 + }, + { + "title": "Radial Basis Function Networks", + "start_index": 319, + "end_index": 321, + "child_nodes": [ + { + "title": "Nadaraya-Watson model", + "start_index": 321, + "end_index": 323 + } + ] + }, + { + "title": "Gaussian Processes", + "start_index": 323, + "end_index": 324, + "child_nodes": [ + { + "title": "Linear regression revisited", + "start_index": 324, + "end_index": 326 + }, + { + "title": "Gaussian processes for regression", + "start_index": 326, + "end_index": 331 + }, + { + "title": "Learning the hyperparameters", + "start_index": 331, + "end_index": 332 + }, + { + "title": "Automatic relevance determination", + "start_index": 332, + "end_index": 333 + }, + { + "title": "Gaussian processes for classification", + "start_index": 333, + "end_index": 335 + }, + { + "title": "Laplace approximation", + "start_index": 335, + "end_index": 339 + }, + { + "title": "Connection to neural networks", + "start_index": 339, + "end_index": 340 + } + ] + } + ] + }, + { + "title": "Exercises", + "start_index": 340, + "end_index": 344 + }, + { + "title": "Sparse Kernel Machines", + "start_index": 345, + "end_index": 346, + "child_nodes": [ + { + "title": "Maximum Margin Classifiers", + "start_index": 346, + "end_index": 351, + "child_nodes": [ + { + "title": "Overlapping class distributions", + "start_index": 351, + "end_index": 356 + }, + { + "title": "Relation to logistic regression", + "start_index": 356, + "end_index": 358 + }, + { + "title": "Multiclass SVMs", + "start_index": 358, + "end_index": 359 + }, + { + "title": "SVMs for regression", + "start_index": 359, + "end_index": 364 + }, + { + "title": "Computational learning theory", + "start_index": 364, + "end_index": 365 + } + ] + }, + { + "title": "Relevance Vector Machines", + "start_index": 365, + "end_index": 365, + "child_nodes": [ + { + "title": "RVM for regression", + "start_index": 365, + "end_index": 369 + }, + { + "title": "Analysis of sparsity", + "start_index": 369, + "end_index": 373 + }, + { + "title": "RVM for classification", + "start_index": 373, + "end_index": 377 + } + ] + } + ] + }, + { + "title": "Exercises", + "start_index": 377, + "end_index": 379 + }, + { + "title": "Graphical Models", + "start_index": 379, + "end_index": 380, + "child_nodes": [ + { + "title": "Bayesian Networks", + "start_index": 380, + "end_index": 382, + "child_nodes": [ + { + "title": "Example: Polynomial regression", + "start_index": 382, + "end_index": 385 + }, + { + "title": "Generative models", + "start_index": 385, + "end_index": 386 + }, + { + "title": "Discrete variables", + "start_index": 386, + "end_index": 390 + }, + { + "title": "Linear-Gaussian models", + "start_index": 390, + "end_index": 392 + } + ] + }, + { + "title": "Conditional Independence", + "start_index": 392, + "end_index": 393, + "child_nodes": [ + { + "title": "Three example graphs", + "start_index": 393, + "end_index": 398 + }, + { + "title": "D-separation", + "start_index": 398, + "end_index": 403 + } + ] + }, + { + "title": "Markov Random Fields", 
+ "start_index": 403, + "end_index": 403, + "child_nodes": [ + { + "title": "Conditional independence properties", + "start_index": 403, + "end_index": 404 + }, + { + "title": "Factorization properties", + "start_index": 404, + "end_index": 407 + }, + { + "title": "Illustration: Image de-noising", + "start_index": 407, + "end_index": 410 + }, + { + "title": "Relation to directed graphs", + "start_index": 410, + "end_index": 413 + } + ] + }, + { + "title": "Inference in Graphical Models", + "start_index": 413, + "end_index": 414, + "child_nodes": [ + { + "title": "Inference on a chain", + "start_index": 414, + "end_index": 418 + }, + { + "title": "Trees", + "start_index": 418, + "end_index": 419 + }, + { + "title": "Factor graphs", + "start_index": 419, + "end_index": 422 + }, + { + "title": "The sum-product algorithm", + "start_index": 422, + "end_index": 431 + }, + { + "title": "The max-sum algorithm", + "start_index": 431, + "end_index": 436 + }, + { + "title": "Exact inference in general graphs", + "start_index": 436, + "end_index": 437 + }, + { + "title": "Loopy belief propagation", + "start_index": 437, + "end_index": 438 + }, + { + "title": "Learning the graph structure", + "start_index": 438, + "end_index": 438 + } + ] + } + ] + }, + { + "title": "Exercises", + "start_index": 438, + "end_index": 442 + }, + { + "title": "Mixture Models and EM", + "start_index": 443, + "end_index": 444, + "child_nodes": [ + { + "title": "K-means Clustering", + "start_index": 444, + "end_index": 448, + "child_nodes": [ + { + "title": "Image segmentation and compression", + "start_index": 448, + "end_index": 450 + } + ] + }, + { + "title": "Mixtures of Gaussians", + "start_index": 450, + "end_index": 452, + "child_nodes": [ + { + "title": "Maximum likelihood", + "start_index": 452, + "end_index": 455 + }, + { + "title": "EM for Gaussian mixtures", + "start_index": 455, + "end_index": 459 + } + ] + }, + { + "title": "An Alternative View of EM", + "start_index": 459, + "end_index": 461, + "child_nodes": [ + { + "title": "Gaussian mixtures revisited", + "start_index": 461, + "end_index": 463 + }, + { + "title": "Relation to K-means", + "start_index": 463, + "end_index": 464 + }, + { + "title": "Mixtures of Bernoulli distributions", + "start_index": 464, + "end_index": 468 + }, + { + "title": "EM for Bayesian linear regression", + "start_index": 468, + "end_index": 470 + } + ] + }, + { + "title": "The EM Algorithm in General", + "start_index": 470, + "end_index": 475 + } + ] + }, + { + "title": "Exercises", + "start_index": 475, + "end_index": 480 + }, + { + "title": "Approximate Inference", + "start_index": 481, + "end_index": 482, + "child_nodes": [ + { + "title": "Variational Inference", + "start_index": 482, + "end_index": 484, + "child_nodes": [ + { + "title": "Factorized distributions", + "start_index": 484, + "end_index": 486 + }, + { + "title": "Properties of factorized approximations", + "start_index": 486, + "end_index": 490 + }, + { + "title": "Example: The univariate Gaussian", + "start_index": 490, + "end_index": 493 + }, + { + "title": "Model comparison", + "start_index": 493, + "end_index": 494 + } + ] + }, + { + "title": "Illustration: Variational Mixture of Gaussians", + "start_index": 494, + "end_index": 495, + "child_nodes": [ + { + "title": "Variational distribution", + "start_index": 495, + "end_index": 501 + }, + { + "title": "Variational lower bound", + "start_index": 501, + "end_index": 502 + }, + { + "title": "Predictive density", + "start_index": 502, + "end_index": 503 + }, + { + 
"title": "Determining the number of components", + "start_index": 503, + "end_index": 505 + }, + { + "title": "Induced factorizations", + "start_index": 505, + "end_index": 506 + } + ] + }, + { + "title": "Variational Linear Regression", + "start_index": 506, + "end_index": 506, + "child_nodes": [ + { + "title": "Variational distribution", + "start_index": 506, + "end_index": 508 + }, + { + "title": "Predictive distribution", + "start_index": 508, + "end_index": 509 + }, + { + "title": "Lower bound", + "start_index": 509, + "end_index": 510 + } + ] + }, + { + "title": "Exponential Family Distributions", + "start_index": 510, + "end_index": 511, + "child_nodes": [ + { + "title": "Variational message passing", + "start_index": 511, + "end_index": 512 + } + ] + }, + { + "title": "Local Variational Methods", + "start_index": 513, + "end_index": 518 + }, + { + "title": "Variational Logistic Regression", + "start_index": 518, + "end_index": 518, + "child_nodes": [ + { + "title": "Variational posterior distribution", + "start_index": 518, + "end_index": 520 + }, + { + "title": "Optimizing the variational parameters", + "start_index": 520, + "end_index": 522 + }, + { + "title": "Inference of hyperparameters", + "start_index": 522, + "end_index": 525 + } + ] + }, + { + "title": "Expectation Propagation", + "start_index": 525, + "end_index": 531, + "child_nodes": [ + { + "title": "Example: The clutter problem", + "start_index": 531, + "end_index": 533 + }, + { + "title": "Expectation propagation on graphs", + "start_index": 533, + "end_index": 537 + } + ] + } + ] + }, + { + "title": "Exercises", + "start_index": 537, + "end_index": 542 + }, + { + "title": "Sampling Methods", + "start_index": 543, + "end_index": 546, + "child_nodes": [ + { + "title": "Basic Sampling Algorithms", + "start_index": 546, + "end_index": 546, + "child_nodes": [ + { + "title": "Standard distributions", + "start_index": 546, + "end_index": 548 + }, + { + "title": "Rejection sampling", + "start_index": 548, + "end_index": 550 + }, + { + "title": "Adaptive rejection sampling", + "start_index": 550, + "end_index": 552 + }, + { + "title": "Importance sampling", + "start_index": 552, + "end_index": 554 + }, + { + "title": "Sampling-importance-resampling", + "start_index": 554, + "end_index": 556 + }, + { + "title": "Sampling and the EM algorithm", + "start_index": 556, + "end_index": 556 + } + ] + }, + { + "title": "Markov Chain Monte Carlo", + "start_index": 557, + "end_index": 559, + "child_nodes": [ + { + "title": "Markov chains", + "start_index": 559, + "end_index": 561 + }, + { + "title": "The Metropolis-Hastings algorithm", + "start_index": 561, + "end_index": 562 + } + ] + }, + { + "title": "Gibbs Sampling", + "start_index": 562, + "end_index": 566 + }, + { + "title": "Slice Sampling", + "start_index": 566, + "end_index": 568 + }, + { + "title": "The Hybrid Monte Carlo Algorithm", + "start_index": 568, + "end_index": 568, + "child_nodes": [ + { + "title": "Dynamical systems", + "start_index": 568, + "end_index": 572 + }, + { + "title": "Hybrid Monte Carlo", + "start_index": 572, + "end_index": 574 + } + ] + }, + { + "title": "Estimating the Partition Function", + "start_index": 574, + "end_index": 576 + } + ] + }, + { + "title": "Exercises", + "start_index": 576, + "end_index": 579 + }, + { + "title": "Continuous Latent Variables", + "start_index": 579, + "end_index": 581, + "child_nodes": [ + { + "title": "Principal Component Analysis", + "start_index": 581, + "end_index": 581, + "child_nodes": [ + { + "title": "Maximum 
variance formulation", + "start_index": 581, + "end_index": 583 + }, + { + "title": "Minimum-error formulation", + "start_index": 583, + "end_index": 585 + }, + { + "title": "Applications of PCA", + "start_index": 585, + "end_index": 589 + }, + { + "title": "PCA for high-dimensional data", + "start_index": 589, + "end_index": 590 + } + ] + }, + { + "title": "Probabilistic PCA", + "start_index": 590, + "end_index": 594, + "child_nodes": [ + { + "title": "Maximum likelihood PCA", + "start_index": 594, + "end_index": 597 + }, + { + "title": "EM algorithm for PCA", + "start_index": 597, + "end_index": 600 + }, + { + "title": "Bayesian PCA", + "start_index": 600, + "end_index": 603 + }, + { + "title": "Factor analysis", + "start_index": 603, + "end_index": 606 + } + ] + }, + { + "title": "Kernel PCA", + "start_index": 606, + "end_index": 610 + }, + { + "title": "Nonlinear Latent Variable Models", + "start_index": 611, + "end_index": 611, + "child_nodes": [ + { + "title": "Independent component analysis", + "start_index": 611, + "end_index": 612 + }, + { + "title": "Autoassociative neural networks", + "start_index": 612, + "end_index": 615 + }, + { + "title": "Modelling nonlinear manifolds", + "start_index": 615, + "end_index": 619 + } + ] + } + ] + }, + { + "title": "Exercises", + "start_index": 619, + "end_index": 624 + }, + { + "title": "Sequential Data", + "start_index": 625, + "end_index": 627, + "child_nodes": [ + { + "title": "Markov Models", + "start_index": 627, + "end_index": 630 + }, + { + "title": "Hidden Markov Models", + "start_index": 630, + "end_index": 635, + "child_nodes": [ + { + "title": "Maximum likelihood for the HMM", + "start_index": 635, + "end_index": 638 + }, + { + "title": "The forward-backward algorithm", + "start_index": 638, + "end_index": 645 + }, + { + "title": "The sum-product algorithm for the HMM", + "start_index": 645, + "end_index": 647 + }, + { + "title": "Scaling factors", + "start_index": 647, + "end_index": 649 + }, + { + "title": "The Viterbi algorithm", + "start_index": 649, + "end_index": 651 + }, + { + "title": "Extensions of the hidden Markov model", + "start_index": 651, + "end_index": 655 + } + ] + }, + { + "title": "Linear Dynamical Systems", + "start_index": 655, + "end_index": 658, + "child_nodes": [ + { + "title": "Inference in LDS", + "start_index": 658, + "end_index": 662 + }, + { + "title": "Learning in LDS", + "start_index": 662, + "end_index": 664 + }, + { + "title": "Extensions of LDS", + "start_index": 664, + "end_index": 665 + }, + { + "title": "Particle filters", + "start_index": 665, + "end_index": 666 + } + ] + } + ] + }, + { + "title": "Exercises", + "start_index": 666, + "end_index": 672 + }, + { + "title": "Combining Models", + "start_index": 673, + "end_index": 674, + "child_nodes": [ + { + "title": "Bayesian Model Averaging", + "start_index": 674, + "end_index": 675 + }, + { + "title": "Committees", + "start_index": 675, + "end_index": 677 + }, + { + "title": "Boosting", + "start_index": 677, + "end_index": 679, + "child_nodes": [ + { + "title": "Minimizing exponential error", + "start_index": 679, + "end_index": 681 + }, + { + "title": "Error functions for boosting", + "start_index": 681, + "end_index": 683 + } + ] + }, + { + "title": "Tree-based Models", + "start_index": 683, + "end_index": 686 + }, + { + "title": "Conditional Mixture Models", + "start_index": 686, + "end_index": 687, + "child_nodes": [ + { + "title": "Mixtures of linear regression models", + "start_index": 687, + "end_index": 690 + }, + { + "title": 
"Mixtures of logistic models", + "start_index": 690, + "end_index": 692 + }, + { + "title": "Mixtures of experts", + "start_index": 692, + "end_index": 694 + } + ] + } + ] + }, + { + "title": "Exercises", + "start_index": 694, + "end_index": 696 + }, + { + "title": "Appendix A Data Sets", + "start_index": 697, + "end_index": 704 + }, + { + "title": "Appendix B Probability Distributions", + "start_index": 705, + "end_index": 714 + }, + { + "title": "Appendix C Properties of Matrices", + "start_index": 715, + "end_index": 722 + }, + { + "title": "Appendix D Calculus of Variations", + "start_index": 723, + "end_index": 726 + }, + { + "title": "Appendix E Lagrange Multipliers", + "start_index": 727, + "end_index": 730 + }, + { + "title": "References", + "start_index": 731, + "end_index": 749 + }, + { + "title": "Index", + "start_index": 749, + "end_index": 758 + } +] \ No newline at end of file diff --git a/results/Regulation Best Interest_Interpretive release_structure.json b/results/Regulation Best Interest_Interpretive release_structure.json new file mode 100644 index 0000000..3d80f3b --- /dev/null +++ b/results/Regulation Best Interest_Interpretive release_structure.json @@ -0,0 +1,51 @@ +[ + { + "title": "Preface", + "start_index": 1, + "end_index": 2 + }, + { + "title": "Introduction", + "start_index": 2, + "end_index": 6 + }, + { + "title": "Interpretation and Application", + "start_index": 6, + "end_index": 8, + "child_nodes": [ + { + "title": "Historical Context and Legislative History", + "start_index": 8, + "end_index": 10 + }, + { + "title": "Scope of the Solely Incidental Prong of the Broker-Dealer Exclusion", + "start_index": 10, + "end_index": 14 + }, + { + "title": "Guidance on Applying the Interpretation of the Solely Incidental Prong", + "start_index": 14, + "end_index": 22 + } + ] + }, + { + "title": "Economic Considerations", + "start_index": 22, + "end_index": 22, + "child_nodes": [ + { + "title": "Background", + "start_index": 22, + "end_index": 23 + }, + { + "title": "Potential Economic Effects", + "start_index": 23, + "end_index": 28 + } + ] + } +] \ No newline at end of file diff --git a/results/Regulation Best Interest_proposed rule_structure.json b/results/Regulation Best Interest_proposed rule_structure.json new file mode 100644 index 0000000..947eae7 --- /dev/null +++ b/results/Regulation Best Interest_proposed rule_structure.json @@ -0,0 +1,466 @@ +[ + { + "title": "Preface", + "start_index": 1, + "end_index": 6 + }, + { + "title": "INTRODUCTION", + "start_index": 6, + "end_index": 12, + "child_nodes": [ + { + "title": "Background", + "start_index": 12, + "end_index": 22, + "child_nodes": [ + { + "title": "Evaluation of Standards of Conduct Applicable to Investment Advice", + "start_index": 22, + "end_index": 26 + }, + { + "title": "DOL Rulemaking", + "start_index": 26, + "end_index": 32 + }, + { + "title": "Statement by Chairman Clayton", + "start_index": 32, + "end_index": 36 + } + ] + }, + { + "title": "General Objectives of Proposed Approach", + "start_index": 36, + "end_index": 44 + } + ] + }, + { + "title": "DISCUSSION OF REGULATION BEST INTEREST", + "start_index": 44, + "end_index": 44, + "child_nodes": [ + { + "title": "Overview of Regulation Best Interest", + "start_index": 44, + "end_index": 50 + }, + { + "title": "Best Interest, Generally", + "start_index": 50, + "end_index": 58, + "child_nodes": [ + { + "title": "Consistency with Other Approaches", + "start_index": 58, + "end_index": 66 + }, + { + "title": "Request for Comment on the Best Interest 
Obligation", + "start_index": 66, + "end_index": 71 + } + ] + }, + { + "title": "Key Terms and Scope of Best Interest Obligation", + "start_index": 71, + "end_index": 71, + "child_nodes": [ + { + "title": "Natural Person who is an Associated Person", + "start_index": 71, + "end_index": 72 + }, + { + "title": "When Making a Recommendation, At Time Recommendation is Made", + "start_index": 72, + "end_index": 82 + }, + { + "title": "Any Securities Transaction or Investment Strategy", + "start_index": 82, + "end_index": 83 + }, + { + "title": "Retail Customer", + "start_index": 83, + "end_index": 90 + }, + { + "title": "Request for Comment on Key Terms and Scope of Best Interest Obligation", + "start_index": 90, + "end_index": 96 + } + ] + }, + { + "title": "Components of Regulation Best Interest", + "start_index": 96, + "end_index": 97, + "child_nodes": [ + { + "title": "Disclosure Obligation", + "start_index": 97, + "end_index": 133 + }, + { + "title": "Care Obligation", + "start_index": 133, + "end_index": 166 + }, + { + "title": "Conflict of Interest Obligations", + "start_index": 166, + "end_index": 196 + } + ] + }, + { + "title": "Recordkeeping and Retention", + "start_index": 196, + "end_index": 199 + }, + { + "title": "Whether the Exercise of Investment Discretion Should be Viewed as Solely Incidental to the Business of a Broker or Dealer", + "start_index": 199, + "end_index": 209 + } + ] + }, + { + "title": "REQUEST FOR COMMENT", + "start_index": 209, + "end_index": 210, + "child_nodes": [ + { + "title": "Generally", + "start_index": 210, + "end_index": 212 + }, + { + "title": "Interactions with Other Standards of Conduct", + "start_index": 212, + "end_index": 214 + } + ] + }, + { + "title": "ECONOMIC ANALYSIS", + "start_index": 214, + "end_index": 214, + "child_nodes": [ + { + "title": "Introduction, Primary Goals of Proposed Regulations and Broad Economic Considerations", + "start_index": 214, + "end_index": 214, + "child_nodes": [ + { + "title": "Introduction and Primary Goals of Proposed Regulation", + "start_index": 214, + "end_index": 215 + }, + { + "title": "Broad Economic Considerations", + "start_index": 215, + "end_index": 225 + } + ] + }, + { + "title": "Economic Baseline", + "start_index": 225, + "end_index": 225, + "child_nodes": [ + { + "title": "Market for Advice Services", + "start_index": 225, + "end_index": 246 + }, + { + "title": "Regulatory Baseline", + "start_index": 246, + "end_index": 255 + } + ] + }, + { + "title": "Benefits, Costs, and Effects on Efficiency, Competition, and Capital Formation", + "start_index": 255, + "end_index": 258, + "child_nodes": [ + { + "title": "Benefits", + "start_index": 258, + "end_index": 272 + }, + { + "title": "Costs", + "start_index": 272, + "end_index": 275, + "child_nodes": [ + { + "title": "Standard of Conduct Defined as Best Interest", + "start_index": 275, + "end_index": 275, + "child_nodes": [ + { + "title": "Operational Costs", + "start_index": 275, + "end_index": 277 + }, + { + "title": "Programmatic Costs", + "start_index": 278, + "end_index": 280 + } + ] + }, + { + "title": "Disclosure Obligation", + "start_index": 280, + "end_index": 286 + }, + { + "title": "Obligation to Exercise Reasonable Diligence, Care, Skill, and Prudence in Making a Recommendation", + "start_index": 286, + "end_index": 290 + }, + { + "title": "Obligation to Establish, Maintain, and Enforce Written Policies and Procedures Reasonably Designed to Identify and at a Minimum Disclose, or Eliminate, All Material Conflicts of Interest Associated with a 
Recommendation", + "start_index": 290, + "end_index": 295, + "child_nodes": [ + { + "title": "Eliminate Material Conflicts of Interest Associated with a Recommendation", + "start_index": 295, + "end_index": 297 + }, + { + "title": "At a Minimum Disclose Material Conflicts of Interest Associated with a Recommendation", + "start_index": 297, + "end_index": 299 + } + ] + }, + { + "title": "Obligation to Establish, Maintain, and Enforce Written Policies and Procedures Reasonably Designed to Identify and Disclose and Mitigate, or Eliminate, Material Conflicts of Interest Arising from Financial Incentives Associated with a Recommendation", + "start_index": 299, + "end_index": 300, + "child_nodes": [ + { + "title": "Eliminate Material Conflicts Arising from Financial Incentives Associated with a Recommendation", + "start_index": 300, + "end_index": 304 + }, + { + "title": "Disclose and Mitigate Material Conflicts of Interest Arising from Financial Incentives Associated with a Recommendation", + "start_index": 304, + "end_index": 316 + } + ] + } + ] + } + ] + }, + { + "title": "Effects on Efficiency, Competition, and Capital Formation", + "start_index": 316, + "end_index": 324 + }, + { + "title": "Reasonable Alternatives", + "start_index": 324, + "end_index": 325, + "child_nodes": [ + { + "title": "Disclosure-Only Alternative", + "start_index": 325, + "end_index": 327 + }, + { + "title": "Principles-Based Standard of Conduct Obligation", + "start_index": 327, + "end_index": 328 + }, + { + "title": "A Fiduciary Standard for Broker-Dealers", + "start_index": 328, + "end_index": 332 + }, + { + "title": "Enhanced Standards Akin to Conditions of the BIC Exemption", + "start_index": 332, + "end_index": 335 + } + ] + }, + { + "title": "Request for Comment", + "start_index": 335, + "end_index": 338 + } + ] + }, + { + "title": "PAPERWORK REDUCTION ACT ANALYSIS", + "start_index": 338, + "end_index": 340, + "child_nodes": [ + { + "title": "Respondents Subject to Proposed Regulation Best Interest and Proposed Amendments to Rule 17a-3(a)(25), Rule 17a-4(e)(5)", + "start_index": 340, + "end_index": 340, + "child_nodes": [ + { + "title": "Broker-Dealers", + "start_index": 340, + "end_index": 340 + }, + { + "title": "Natural Persons Who Are Associated Persons of Broker-Dealers", + "start_index": 340, + "end_index": 341 + } + ] + }, + { + "title": "Summary of Collections of Information", + "start_index": 341, + "end_index": 342, + "child_nodes": [ + { + "title": "Conflict of Interest Obligations", + "start_index": 342, + "end_index": 353 + }, + { + "title": "Disclosure Obligation", + "start_index": 353, + "end_index": 370 + }, + { + "title": "Care Obligation", + "start_index": 370, + "end_index": 370 + }, + { + "title": "Record-Making and Recordkeeping Obligations", + "start_index": 370, + "end_index": 375 + } + ] + }, + { + "title": "Collection of Information is Mandatory", + "start_index": 375, + "end_index": 375 + }, + { + "title": "Confidentiality", + "start_index": 375, + "end_index": 376 + }, + { + "title": "Request for Comment", + "start_index": 376, + "end_index": 377 + } + ] + }, + { + "title": "SMALL BUSINESS REGULATORY ENFORCEMENT FAIRNESS ACT", + "start_index": 377, + "end_index": 378 + }, + { + "title": "INITIAL REGULATORY FLEXIBILITY ACT ANALYSIS", + "start_index": 378, + "end_index": 379, + "child_nodes": [ + { + "title": "Reasons for and Objectives of the Proposed Action", + "start_index": 379, + "end_index": 381 + }, + { + "title": "Legal Basis", + "start_index": 381, + "end_index": 381 + }, + { + 
"title": "Small Entities Subject to the Proposed Rule", + "start_index": 381, + "end_index": 382 + }, + { + "title": "Projected Compliance Requirements of the Proposed Rule for Small Entities", + "start_index": 382, + "end_index": 383, + "child_nodes": [ + { + "title": "Conflict of Interest Obligations", + "start_index": 383, + "end_index": 386 + }, + { + "title": "Disclosure Obligations", + "start_index": 387, + "end_index": 394 + }, + { + "title": "Obligation to Exercise Reasonable Diligence, Care, Skill and Prudence", + "start_index": 394, + "end_index": 394 + }, + { + "title": "Record-Making and Recordkeeping Obligations", + "start_index": 394, + "end_index": 397 + } + ] + }, + { + "title": "Duplicative, Overlapping, or Conflicting Federal Rules", + "start_index": 397, + "end_index": 398 + }, + { + "title": "Significant Alternatives", + "start_index": 398, + "end_index": 401, + "child_nodes": [ + { + "title": "Disclosure-Only Alternative", + "start_index": 401, + "end_index": 401 + }, + { + "title": "Principles-Based Alternative", + "start_index": 401, + "end_index": 402 + }, + { + "title": "Enhanced Standards Akin to BIC Exemption", + "start_index": 402, + "end_index": 403 + } + ] + }, + { + "title": "General Request for Comment", + "start_index": 403, + "end_index": 403 + } + ] + }, + { + "title": "STATUTORY AUTHORITY AND TEXT OF PROPOSED RULE", + "start_index": 403, + "end_index": 408 + } +] \ No newline at end of file diff --git a/results/q1-fy25-earnings_structure.json b/results/q1-fy25-earnings_structure.json new file mode 100644 index 0000000..9d969f5 --- /dev/null +++ b/results/q1-fy25-earnings_structure.json @@ -0,0 +1,220 @@ +[ + { + "title": "THE WALT DISNEY COMPANY REPORTS FIRST QUARTER EARNINGS FOR FISCAL 2025", + "start_index": 1, + "end_index": 1, + "child_nodes": [ + { + "title": "Financial Results for the Quarter", + "start_index": 1, + "end_index": 1, + "child_nodes": [ + { + "title": "Key Points", + "start_index": 1, + "end_index": 1 + } + ] + }, + { + "title": "Guidance and Outlook", + "start_index": 2, + "end_index": 2, + "child_nodes": [ + { + "title": "Star India deconsolidated in Q1", + "start_index": 2, + "end_index": 2 + }, + { + "title": "Q2 Fiscal 2025", + "start_index": 2, + "end_index": 2 + }, + { + "title": "Fiscal Year 2025", + "start_index": 2, + "end_index": 2 + } + ] + }, + { + "title": "Message From Our CEO", + "start_index": 2, + "end_index": 2 + }, + { + "title": "SUMMARIZED FINANCIAL RESULTS", + "start_index": 3, + "end_index": 3, + "child_nodes": [ + { + "title": "SUMMARIZED SEGMENT FINANCIAL RESULTS", + "start_index": 3, + "end_index": 3 + } + ] + }, + { + "title": "DISCUSSION OF FIRST QUARTER SEGMENT RESULTS", + "start_index": 4, + "end_index": 4, + "child_nodes": [ + { + "title": "Star India", + "start_index": 4, + "end_index": 4 + }, + { + "title": "Entertainment", + "start_index": 4, + "end_index": 4, + "child_nodes": [ + { + "title": "Linear Networks", + "start_index": 5, + "end_index": 5 + }, + { + "title": "Direct-to-Consumer", + "start_index": 5, + "end_index": 7 + }, + { + "title": "Content Sales/Licensing and Other", + "start_index": 7, + "end_index": 7 + } + ] + }, + { + "title": "Sports", + "start_index": 7, + "end_index": 7, + "child_nodes": [ + { + "title": "Domestic ESPN", + "start_index": 8, + "end_index": 8 + }, + { + "title": "International ESPN", + "start_index": 8, + "end_index": 8 + }, + { + "title": "Star India", + "start_index": 8, + "end_index": 8 + } + ] + }, + { + "title": "Experiences", + "start_index": 9, + 
"end_index": 9, + "child_nodes": [ + { + "title": "Domestic Parks and Experiences", + "start_index": 9, + "end_index": 9 + }, + { + "title": "International Parks and Experiences", + "start_index": 9, + "end_index": 9 + } + ] + } + ] + }, + { + "title": "OTHER FINANCIAL INFORMATION", + "start_index": 9, + "end_index": 9, + "child_nodes": [ + { + "title": "Corporate and Unallocated Shared Expenses", + "start_index": 9, + "end_index": 9 + }, + { + "title": "Restructuring and Impairment Charges", + "start_index": 9, + "end_index": 9 + }, + { + "title": "Interest Expense, net", + "start_index": 10, + "end_index": 10 + }, + { + "title": "Equity in the Income of Investees", + "start_index": 10, + "end_index": 10 + }, + { + "title": "Income Taxes", + "start_index": 10, + "end_index": 10 + }, + { + "title": "Noncontrolling Interests", + "start_index": 11, + "end_index": 11 + }, + { + "title": "Cash from Operations", + "start_index": 11, + "end_index": 11 + }, + { + "title": "Capital Expenditures", + "start_index": 12, + "end_index": 12 + }, + { + "title": "Depreciation Expense", + "start_index": 12, + "end_index": 12 + } + ] + }, + { + "title": "THE WALT DISNEY COMPANY CONDENSED CONSOLIDATED STATEMENTS OF INCOME", + "start_index": 13, + "end_index": 13 + }, + { + "title": "THE WALT DISNEY COMPANY CONDENSED CONSOLIDATED BALANCE SHEETS", + "start_index": 14, + "end_index": 14 + }, + { + "title": "THE WALT DISNEY COMPANY CONDENSED CONSOLIDATED STATEMENTS OF CASH FLOWS", + "start_index": 15, + "end_index": 15 + }, + { + "title": "DTC PRODUCT DESCRIPTIONS AND KEY DEFINITIONS", + "start_index": 16, + "end_index": 16 + }, + { + "title": "NON-GAAP FINANCIAL MEASURES", + "start_index": 17, + "end_index": 20 + }, + { + "title": "FORWARD-LOOKING STATEMENTS", + "start_index": 21, + "end_index": 21 + }, + { + "title": "PREPARED EARNINGS REMARKS AND CONFERENCE CALL INFORMATION", + "start_index": 22, + "end_index": 22 + } + ] + } +] \ No newline at end of file diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..3464544 --- /dev/null +++ b/utils.py @@ -0,0 +1,524 @@ +import tiktoken +import openai +import logging +import os +from datetime import datetime +import time +import json +import PyPDF2 +import copy +import asyncio +import pymupdf +from io import BytesIO +import logging + + +def count_tokens(text, model): + enc = tiktoken.encoding_for_model(model) + tokens = enc.encode(text) + return len(tokens) + +def ChatGPT_API_with_finish_reason(model, prompt, api_key, chat_history=None): + max_retries = 10 + client = openai.OpenAI(api_key=api_key) + for i in range(max_retries): + try: + if chat_history: + messages = chat_history + messages.append({"role": "user", "content": prompt}) + else: + messages = [{"role": "user", "content": prompt}] + + response = client.chat.completions.create( + model=model, + messages=messages, + temperature=0, + ) + if response.choices[0].finish_reason == "length": + return response.choices[0].message.content, "max_output_reached" + else: + return response.choices[0].message.content, "finished" + + except Exception as e: + print('************* Retrying *************') + logging.error(f"Error: {e}") + if i < max_retries - 1: + time.sleep(1) # Wait for 1秒 before retrying + else: + logging.error('Max retries reached for prompt: ' + prompt) + return "Error" + + + +def ChatGPT_API(model, prompt, api_key, chat_history=None): + max_retries = 10 + client = openai.OpenAI(api_key=api_key) + for i in range(max_retries): + try: + if chat_history: + messages = chat_history + 
+def ChatGPT_API(model, prompt, api_key, chat_history=None):
+    max_retries = 10
+    client = openai.OpenAI(api_key=api_key)
+    for i in range(max_retries):
+        try:
+            if chat_history:
+                messages = list(chat_history)  # copy so the caller's history is not mutated
+                messages.append({"role": "user", "content": prompt})
+            else:
+                messages = [{"role": "user", "content": prompt}]
+
+            response = client.chat.completions.create(
+                model=model,
+                messages=messages,
+                temperature=0,
+            )
+
+            return response.choices[0].message.content
+        except Exception as e:
+            print('************* Retrying *************')
+            logging.error(f"Error: {e}")
+            if i < max_retries - 1:
+                time.sleep(1)  # Wait for 1 second before retrying
+            else:
+                logging.error('Max retries reached for prompt: ' + prompt)
+                return "Error"
+
+
+async def ChatGPT_API_async(model, prompt, api_key):
+    max_retries = 10
+    client = openai.AsyncOpenAI(api_key=api_key)
+    for i in range(max_retries):
+        try:
+            messages = [{"role": "user", "content": prompt}]
+            response = await client.chat.completions.create(
+                model=model,
+                messages=messages,
+                temperature=0,
+            )
+            return response.choices[0].message.content
+        except Exception as e:
+            print('************* Retrying *************')
+            logging.error(f"Error: {e}")
+            if i < max_retries - 1:
+                await asyncio.sleep(1)  # Wait for 1 second before retrying
+            else:
+                logging.error('Max retries reached for prompt: ' + prompt)
+                return "Error"
+
+def get_json_content(response):
+    start_idx = response.find("```json")
+    if start_idx != -1:
+        start_idx += 7
+        response = response[start_idx:]
+
+    end_idx = response.rfind("```")
+    if end_idx != -1:
+        response = response[:end_idx]
+
+    json_content = response.strip()
+    return json_content
+
+
+def extract_json(content):
+    try:
+        # First, try to extract JSON enclosed within ```json and ```
+        start_idx = content.find("```json")
+        if start_idx != -1:
+            start_idx += 7  # Adjust index to start after the delimiter
+            end_idx = content.rfind("```")
+            json_content = content[start_idx:end_idx].strip()
+        else:
+            # If no delimiters, assume entire content could be JSON
+            json_content = content.strip()
+
+        # Clean up common issues that might cause parsing errors
+        json_content = json_content.replace('None', 'null')  # Replace Python None with JSON null
+        json_content = json_content.replace('\n', ' ').replace('\r', ' ')  # Remove newlines
+        json_content = ' '.join(json_content.split())  # Normalize whitespace
+
+        # Attempt to parse and return the JSON object
+        return json.loads(json_content)
+    except json.JSONDecodeError as e:
+        logging.error(f"Failed to extract JSON: {e}")
+        # Try to clean up the content further if initial parsing fails
+        try:
+            # Remove any trailing commas before closing brackets/braces
+            json_content = json_content.replace(',]', ']').replace(',}', '}')
+            return json.loads(json_content)
+        except Exception:
+            logging.error("Failed to parse JSON even after cleanup")
+            return {}
+    except Exception as e:
+        logging.error(f"Unexpected error while extracting JSON: {e}")
+        return {}
+
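+# Example (sketch): extract_json recovers the object from a fenced LLM reply, e.g.
+#   extract_json('```json\n{"thinking": "...", "node_list": ["0001"]}\n```')
+#   -> {'thinking': '...', 'node_list': ['0001']}
+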
+def write_node_id(data, node_id=0):
+    if isinstance(data, dict):
+        data['node_id'] = str(node_id).zfill(4)
+        node_id += 1
+        for key in list(data.keys()):
+            if 'child_nodes' in key:
+                node_id = write_node_id(data[key], node_id)
+    elif isinstance(data, list):
+        for index in range(len(data)):
+            node_id = write_node_id(data[index], node_id)
+    return node_id
+
+def get_nodes(structure):
+    if isinstance(structure, dict):
+        structure_node = copy.deepcopy(structure)
+        structure_node.pop('child_nodes', None)
+        nodes = [structure_node]
+        for key in list(structure.keys()):
+            if 'child_nodes' in key:
+                nodes.extend(get_nodes(structure[key]))
+        return nodes
+    elif isinstance(structure, list):
+        nodes = []
+        for item in structure:
+            nodes.extend(get_nodes(item))
+        return nodes
+
+def structure_to_list(structure):
+    if isinstance(structure, dict):
+        nodes = []
+        nodes.append(structure)
+        if 'child_nodes' in structure:
+            nodes.extend(structure_to_list(structure['child_nodes']))
+        return nodes
+    elif isinstance(structure, list):
+        nodes = []
+        for item in structure:
+            nodes.extend(structure_to_list(item))
+        return nodes
+
+
+def get_leaf_nodes(structure):
+    if isinstance(structure, dict):
+        if not structure.get('child_nodes'):  # .get avoids a KeyError on nodes without the key
+            structure_node = copy.deepcopy(structure)
+            structure_node.pop('child_nodes', None)
+            return [structure_node]
+        else:
+            leaf_nodes = []
+            for key in list(structure.keys()):
+                if 'child_nodes' in key:
+                    leaf_nodes.extend(get_leaf_nodes(structure[key]))
+            return leaf_nodes
+    elif isinstance(structure, list):
+        leaf_nodes = []
+        for item in structure:
+            leaf_nodes.extend(get_leaf_nodes(item))
+        return leaf_nodes
+
+def is_leaf_node(data, node_id):
+    # Helper function to find the node by its node_id
+    def find_node(data, node_id):
+        if isinstance(data, dict):
+            if data.get('node_id') == node_id:
+                return data
+            for key in data.keys():
+                if 'child_nodes' in key:
+                    result = find_node(data[key], node_id)
+                    if result:
+                        return result
+        elif isinstance(data, list):
+            for item in data:
+                result = find_node(item, node_id)
+                if result:
+                    return result
+        return None
+
+    # Find the node with the given node_id
+    node = find_node(data, node_id)
+
+    # Check if the node is a leaf node
+    if node and not node.get('child_nodes'):
+        return True
+    return False
+
+def get_last_node(structure):
+    return structure[-1]
+
+
+def extract_text_from_pdf(pdf_path):
+    pdf_reader = PyPDF2.PdfReader(pdf_path)
+    # Returns the concatenated text of all pages, not a per-page list
+    text = ""
+    for page_num in range(len(pdf_reader.pages)):
+        page = pdf_reader.pages[page_num]
+        text += page.extract_text()
+    return text
+
+def get_pdf_title(pdf_path):
+    pdf_reader = PyPDF2.PdfReader(pdf_path)
+    meta = pdf_reader.metadata
+    title = meta.title
+    return title
+
+def get_text_of_pages(pdf_path, start_page, end_page, tag=True):
+    pdf_reader = PyPDF2.PdfReader(pdf_path)
+    text = ""
+    for page_num in range(start_page-1, end_page):
+        page = pdf_reader.pages[page_num]
+        page_text = page.extract_text()
+        if tag:
+            text += f"<start_index_{page_num+1}>\n{page_text}\n<end_index_{page_num+1}>\n"
+        else:
+            text += page_text
+    return text
+
+def get_first_start_page_from_text(text):
+    start_page = -1
+    start_page_match = re.search(r'<start_index_(\d+)>', text)
+    if start_page_match:
+        start_page = int(start_page_match.group(1))
+    return start_page
+
+def get_last_start_page_from_text(text):
+    start_page = -1
+    # Find all matches of start_index tags
+    start_page_matches = re.finditer(r'<start_index_(\d+)>', text)
+    # Convert iterator to list and get the last match if any exist
+    matches_list = list(start_page_matches)
+    if matches_list:
+        start_page = int(matches_list[-1].group(1))
+    return start_page
+
+
+
+
+def sanitize_filename(filename, replacement='-'):
+    # In Linux, only '/' and '\0' (null) are invalid in filenames.
+    # Null can't be represented in strings, so we only handle '/'.
+    return filename.replace('/', replacement)
+
+class JsonLogger:
+    def __init__(self, file_path):
+        # Derive the log filename from the PDF path, or from the PDF title for in-memory streams
+        if isinstance(file_path, str):
+            pdf_name = os.path.splitext(os.path.basename(file_path))[0]
+        elif isinstance(file_path, BytesIO):
+            pdf_reader = PyPDF2.PdfReader(file_path)
+            meta = pdf_reader.metadata
+            pdf_name = meta.title if meta.title else 'Untitled'
+            pdf_name = sanitize_filename(pdf_name)
+
+        current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
+        self.filename = f"{pdf_name}_{current_time}.json"
+        os.makedirs("./logs", exist_ok=True)
+        # Initialize empty list to store all messages
+        self.log_data = []
+
+    def log(self, level, message, **kwargs):
+        # Add the new message to the log data
+        if isinstance(message, dict):
+            self.log_data.append(message)
+        else:
+            self.log_data.append({'message': message})
+
+        # Write entire log data to file
+        with open(self._filepath(), "w") as f:
+            json.dump(self.log_data, f, indent=2)
+
+    def info(self, message, **kwargs):
+        self.log("INFO", message, **kwargs)
+
+    def error(self, message, **kwargs):
+        self.log("ERROR", message, **kwargs)
+
+    def debug(self, message, **kwargs):
+        self.log("DEBUG", message, **kwargs)
+
+    def exception(self, message, **kwargs):
+        kwargs["exception"] = True
+        self.log("ERROR", message, **kwargs)
+
+    def _filepath(self):
+        return os.path.join("logs", self.filename)
+
+
+
+
+def list_to_tree(data):
+    def get_parent_structure(structure):
+        """Helper function to get the parent structure code"""
+        if not structure:
+            return None
+        parts = str(structure).split('.')
+        return '.'.join(parts[:-1]) if len(parts) > 1 else None
+
+    # First pass: Create nodes and track parent-child relationships
+    nodes = {}
+    root_nodes = []
+
+    for item in data:
+        structure = item.get('structure')
+        node = {
+            'title': item.get('title'),
+            'start_index': item.get('start_index'),
+            'end_index': item.get('end_index'),
+            'child_nodes': []
+        }
+
+        nodes[structure] = node
+
+        # Find parent
+        parent_structure = get_parent_structure(structure)
+
+        if parent_structure:
+            # Add as child to parent if parent exists
+            if parent_structure in nodes:
+                nodes[parent_structure]['child_nodes'].append(node)
+            else:
+                root_nodes.append(node)
+        else:
+            # No parent, this is a root node
+            root_nodes.append(node)
+
+    # Helper function to clean empty children arrays
+    def clean_node(node):
+        if not node['child_nodes']:
+            del node['child_nodes']
+        else:
+            for child in node['child_nodes']:
+                clean_node(child)
+        return node
+
+    # Clean and return the tree
+    return [clean_node(node) for node in root_nodes]
+
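+
+# Example (sketch): list_to_tree nests a flat TOC list by its dotted structure codes, e.g.
+#   list_to_tree([{'structure': '1', 'title': 'A', ...}, {'structure': '1.1', 'title': 'B', ...}])
+#   -> [{'title': 'A', ..., 'child_nodes': [{'title': 'B', ...}]}]
+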
+def add_preface_if_needed(data):
+    if not isinstance(data, list) or not data:
+        return data
+
+    if data[0]['physical_index'] is not None and data[0]['physical_index'] > 1:
+        preface_node = {
+            "structure": "0",
+            "title": "Preface",
+            "physical_index": 1,
+        }
+        data.insert(0, preface_node)
+    return data
+
+
+
+def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
+    if pdf_parser == "PyPDF2":
+        pdf_reader = PyPDF2.PdfReader(pdf_path)
+        page_texts = [page.extract_text() for page in pdf_reader.pages]
+    elif pdf_parser == "PyMuPDF":
+        # pymupdf Documents are iterated directly and their pages expose get_text(),
+        # so they cannot be read through PyPDF2's reader.pages interface
+        with pymupdf.open(pdf_path) as doc:
+            page_texts = [page.get_text() for page in doc]
+    else:
+        raise ValueError(f"Unsupported PDF parser: {pdf_parser}")
+
+    enc = tiktoken.encoding_for_model(model)
+
+    page_list = []
+    for page_text in page_texts:
+        token_length = len(enc.encode(page_text))
+        page_list.append((page_text, token_length))
+
+    return page_list
+
+
+def get_text_of_pdf_pages(pdf_pages, start_page, end_page):
+    text = ""
+    for page_num in range(start_page-1, end_page):
+        text += pdf_pages[page_num]
+    return text
+
+def get_number_of_pages(pdf_path):
+    pdf_reader = PyPDF2.PdfReader(pdf_path)
+    num = len(pdf_reader.pages)
+    return num
+
+
+
+def post_processing(structure, end_physical_index):
+    # First convert physical_index to start_index/end_index in the flat list
+    for i, item in enumerate(structure):
+        item['start_index'] = item.get('physical_index')
+        if i < len(structure) - 1:
+            if structure[i + 1].get('appear_start') == 'yes':
+                item['end_index'] = structure[i + 1]['physical_index'] - 1
+            else:
+                item['end_index'] = structure[i + 1]['physical_index']
+        else:
+            item['end_index'] = end_physical_index
+    tree = list_to_tree(structure)
+    if len(tree) != 0:
+        return tree
+    else:
+        # Fall back to the flat list; remove the intermediate keys
+        for node in structure:
+            node.pop('appear_start', None)
+            node.pop('physical_index', None)
+        return structure
+
+def clean_structure_post(data):
+    if isinstance(data, dict):
+        data.pop('page_number', None)
+        data.pop('start_index', None)
+        data.pop('end_index', None)
+        if 'child_nodes' in data:
+            clean_structure_post(data['child_nodes'])
+    elif isinstance(data, list):
+        for section in data:
+            clean_structure_post(section)
+    return data
+
+
+def remove_structure_text(data):
+    if isinstance(data, dict):
+        data.pop('text', None)
+        if 'child_nodes' in data:
+            remove_structure_text(data['child_nodes'])
+    elif isinstance(data, list):
+        for item in data:
+            remove_structure_text(item)
+    return data
+
+
+def check_token_limit(structure, limit=110000):
+    nodes = structure_to_list(structure)
+    for node in nodes:
+        num_tokens = count_tokens(node['text'], model='gpt-4o')
+        if num_tokens > limit:
+            print(f"Node ID: {node['node_id']} has {num_tokens} tokens")
+            print("Start Index:", node['start_index'])
+            print("End Index:", node['end_index'])
+            print("Title:", node['title'])
+            # print(node['text'])
+            print("\n")
+
+
+def convert_physical_index_to_int(data):
+    if isinstance(data, list):
+        for i in range(len(data)):
+            if isinstance(data[i]['physical_index'], str):
+                if data[i]['physical_index'].startswith('<physical_index_'):
+                    data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].rstrip('>').strip())
+                elif data[i]['physical_index'].startswith('physical_index_'):
+                    data[i]['physical_index'] = int(data[i]['physical_index'].split('_')[-1].strip())
+    elif isinstance(data, str):
+        if data.startswith('<physical_index_'):
+            data = int(data.split('_')[-1].rstrip('>').strip())
+        elif data.startswith('physical_index_'):
+            data = int(data.split('_')[-1].strip())
+        # check that the string was actually converted to an int
+        if isinstance(data, int):
+            return data
+        else:
+            return None
+    return data
+
+
+def convert_page_to_int(data):
+    for item in data:
+        if 'page' in item and isinstance(item['page'], str):
+            try:
+                item['page'] = int(item['page'])
+            except ValueError:
+                # Keep original value if conversion fails
+                pass
+    return data
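+
+
+if __name__ == "__main__":
+    # Minimal usage sketch (illustrative only, not part of the pipeline): load a
+    # generated tree such as ./results/PRML_structure.json, assign node ids, and
+    # flatten it with the helpers above for quick inspection.
+    with open("./results/PRML_structure.json") as f:
+        tree = json.load(f)
+    write_node_id(tree)  # adds zero-padded 'node_id' fields in place
+    for node in get_nodes(tree):  # depth-first flat copies without child_nodes
+        print(node["node_id"], node["title"], node["start_index"], node["end_index"])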