fix physical index

This commit is contained in:
zmtomorrow 2025-04-18 17:01:02 +08:00
parent dbd22178a5
commit 5aef9b4a49
2 changed files with 72 additions and 126 deletions

View file

@ -580,13 +580,13 @@ def process_no_toc(page_list, start_index=1, model=None, logger=None):
return toc_with_page_number
def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, model=None, logger=None):
def process_toc_no_page_numbers(toc_content, toc_page_list, page_list, start_index=1, model=None, logger=None):
page_contents=[]
token_lengths=[]
toc_content = toc_transformer(toc_content, model)
logger.info(f'toc_transformer: {toc_content}')
for page_index in range(len(page_list)):
page_text = f"<physical_index_{page_index+1}>\n{page_list[page_index][0]}\n<physical_index_{page_index+1}>\n\n"
for page_index in range(start_index, start_index+len(page_list)):
page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n"
page_contents.append(page_text)
token_lengths.append(count_tokens(page_text, model))
@ -639,27 +639,27 @@ def process_toc_with_page_numbers(toc_content, toc_page_list, page_list, toc_che
##check if needed to process none page numbers
def process_none_page_numbers(toc_items, page_list, model=None):
def process_none_page_numbers(toc_items, page_list, start_index=1, model=None):
for i, item in enumerate(toc_items):
if "physical_index" not in item:
# logger.info(f"fix item: {item}")
# Find previous physical_index
prev_index = 0 # Default if no previous item exists
prev_physical_index = 0 # Default if no previous item exists
for j in range(i - 1, -1, -1):
if toc_items[j].get('physical_index') is not None:
prev_index = toc_items[j]['physical_index']-1
prev_physical_index = toc_items[j]['physical_index']
break
# Find next physical_index
next_index = -1 # Default if no next item exists
next_physical_index = -1 # Default if no next item exists
for j in range(i + 1, len(toc_items)):
if toc_items[j].get('physical_index') is not None:
next_index = toc_items[j]['physical_index']
next_physical_index = toc_items[j]['physical_index']
break
page_contents = []
for page_index in range(prev_index, next_index):
page_text = f"<physical_index_{page_index+1}>\n{page_list[page_index][0]}\n<physical_index_{page_index+1}>\n\n"
for page_index in range(prev_physical_index, next_physical_index+1):
page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n"
page_contents.append(page_text)
item_copy = copy.deepcopy(item)
@ -777,7 +777,7 @@ def fix_incorrect_toc(toc_with_page_number, page_list, incorrect_results, start_
page_contents=[]
for page_index in range(prev_correct, next_correct+1):
page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index-start_index+1}>\n\n"
page_text = f"<physical_index_{page_index}>\n{page_list[page_index-start_index][0]}\n<physical_index_{page_index}>\n\n"
page_contents.append(page_text)
content_range = ''.join(page_contents)

View file

@ -164,7 +164,7 @@
{
"title": "Appendixes",
"start_index": 107,
"end_index": 108,
"end_index": 109,
"node_id": "0025"
},
{
@ -207,7 +207,7 @@
{
"title": "Meeting Minutes",
"start_index": 147,
"end_index": 148,
"end_index": 149,
"node_id": "0032"
}
],
@ -325,13 +325,31 @@
{
"title": "Federal Reserve open market transactions, 2023",
"start_index": 187,
"end_index": 188,
"end_index": 187,
"nodes": [
{
"title": "Federal Reserve open market transactions, 2023\u2014continued",
"start_index": 188,
"title": "Type of security and transaction",
"start_index": 187,
"end_index": 188,
"node_id": "0051"
},
{
"title": "Federal agency obligations",
"start_index": 188,
"end_index": 188,
"node_id": "0052"
},
{
"title": "Mortgage-backed securities",
"start_index": 188,
"end_index": 188,
"node_id": "0053"
},
{
"title": "Temporary transactions",
"start_index": 188,
"end_index": 188,
"node_id": "0054"
}
],
"node_id": "0050"
@ -339,162 +357,90 @@
{
"title": "Federal Reserve Bank holdings of U.S. Treasury and federal agency securities, December 31, 2021\u201323",
"start_index": 189,
"end_index": 190,
"end_index": 189,
"nodes": [
{
"title": "Federal Reserve Bank holdings of U.S. Treasury and federal agency securities, December 31, 2021\u201323\u2014continued",
"title": "By remaining maturity",
"start_index": 189,
"end_index": 189,
"node_id": "0056"
},
{
"title": "By type",
"start_index": 189,
"end_index": 190,
"node_id": "0057"
},
{
"title": "By issuer",
"start_index": 190,
"end_index": 190,
"node_id": "0053"
"node_id": "0058"
}
],
"node_id": "0052"
"node_id": "0055"
},
{
"title": "Reserve requirements of depository institutions, December 31, 2023",
"start_index": 191,
"end_index": 191,
"node_id": "0054"
"node_id": "0059"
},
{
"title": "Banking offices and banks affiliated with bank holding companies in the United States, December 31, 2022 and 2023",
"start_index": 192,
"end_index": 192,
"node_id": "0055"
"node_id": "0060"
},
{
"title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1984\u20132023 and month-end 2023",
"start_index": 193,
"end_index": 194,
"nodes": [
{
"title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1984\u20132023 and month-end 2023\u2014continued",
"start_index": 194,
"end_index": 194,
"node_id": "0057"
},
{
"title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1984\u20132023 and month-end 2023\u2014continued",
"start_index": 195,
"end_index": 196,
"node_id": "0058"
},
{
"title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1984\u20132023 and month-end 2023\u2014continued",
"start_index": 196,
"end_index": 196,
"node_id": "0059"
}
],
"node_id": "0056"
"end_index": 196,
"node_id": "0061"
},
{
"title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1918\u20131983",
"start_index": 197,
"end_index": 198,
"nodes": [
{
"title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1918\u20131983\u2014continued",
"start_index": 198,
"end_index": 198,
"node_id": "0061"
},
{
"title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1918\u20131983\u2014continued",
"start_index": 199,
"end_index": 200,
"node_id": "0062"
},
{
"title": "Reserves of depository institutions, Federal Reserve Bank credit, and related items, year-end 1918\u20131983\u2014continued",
"start_index": 200,
"end_index": 200,
"node_id": "0063"
}
],
"node_id": "0060"
"end_index": 200,
"node_id": "0062"
},
{
"title": "Principal assets and liabilities of insured commercial banks, by class of bank, June 30, 2023 and 2022",
"start_index": 201,
"end_index": 201,
"node_id": "0064"
"node_id": "0063"
},
{
"title": "Initial margin requirements under Regulations T, U, and X",
"start_index": 202,
"end_index": 203,
"node_id": "0065"
"node_id": "0064"
},
{
"title": "Statement of condition of the Federal Reserve Banks, by Bank, December 31, 2023 and 2022",
"start_index": 203,
"end_index": 204,
"nodes": [
{
"title": "Statement of condition of the Federal Reserve Banks, by Bank, December 31, 2023 and 2022\u2014continued",
"start_index": 204,
"end_index": 206,
"node_id": "0067"
},
{
"title": "Statement of condition of the Federal Reserve Banks, by Bank, December 31, 2023 and 2022\u2014continued",
"start_index": 206,
"end_index": 206,
"node_id": "0068"
},
{
"title": "Statement of condition of the Federal Reserve Banks, by Bank, December 31, 2023 and 2022\u2014continued",
"start_index": 206,
"end_index": 207,
"node_id": "0069"
},
{
"title": "Statement of condition of the Federal Reserve Banks, by Bank, December 31, 2023 and 2022\u2014continued",
"start_index": 207,
"end_index": 208,
"node_id": "0070"
},
{
"title": "Statement of condition of the Federal Reserve Banks, by Bank, December 31, 2023 and 2022\u2014continued",
"start_index": 208,
"end_index": 209,
"node_id": "0071"
}
],
"node_id": "0066"
"end_index": 209,
"node_id": "0065"
},
{
"title": "Statement of condition of the Federal Reserve Banks, December 31, 2023 and 2022",
"start_index": 209,
"end_index": 210,
"node_id": "0072"
"node_id": "0066"
},
{
"title": "Income and expenses of the Federal Reserve Banks, by Bank, 2023",
"start_index": 210,
"end_index": 211,
"end_index": 212,
"nodes": [
{
"title": "Income and expenses of the Federal Reserve Banks, by Bank, 2023\u2014continued",
"start_index": 211,
"end_index": 212,
"node_id": "0074"
},
{
"title": "Income and expenses of the Federal Reserve Banks, by Bank, 2023\u2014continued",
"start_index": 212,
"end_index": 213,
"node_id": "0075"
},
{
"title": "Income and expenses of the Federal Reserve Banks, by Bank, 2023\u2014continued",
"start_index": 213,
"end_index": 214,
"node_id": "0076"
"node_id": "0068"
}
],
"node_id": "0073"
"node_id": "0067"
},
{
"title": "Income and expenses of the Federal Reserve Banks, 1914\u20132023",
@ -505,40 +451,40 @@
"title": "Income and expenses of the Federal Reserve Banks, 1914\u20132023\u2014continued",
"start_index": 215,
"end_index": 216,
"node_id": "0078"
"node_id": "0070"
},
{
"title": "Income and expenses of the Federal Reserve Banks, 1914\u20132023\u2014continued",
"start_index": 216,
"end_index": 217,
"node_id": "0079"
"node_id": "0071"
},
{
"title": "Income and expenses of the Federal Reserve Banks, 1914\u20132023\u2014continued",
"start_index": 217,
"end_index": 217,
"node_id": "0080"
"node_id": "0072"
}
],
"node_id": "0077"
"node_id": "0069"
},
{
"title": "Operations in principal departments of the Federal Reserve Banks, 2020\u201323",
"start_index": 218,
"end_index": 218,
"node_id": "0081"
"node_id": "0073"
},
{
"title": "Number and annual salaries of officers and employees of the Federal Reserve Banks, December 31, 2023",
"start_index": 219,
"end_index": 219,
"node_id": "0082"
"end_index": 220,
"node_id": "0074"
},
{
"title": "Acquisition costs and net book value of the premises of the Federal Reserve Banks and Branches, December 31, 2023",
"start_index": 220,
"end_index": 222,
"node_id": "0083"
"node_id": "0075"
}
],
"node_id": "0049"