mirror of
https://github.com/VectifyAI/PageIndex.git
synced 2026-04-24 23:56:21 +02:00
fix tree cleaning
This commit is contained in:
parent
c017d5ea1e
commit
985d8f064f
1 changed files with 31 additions and 33 deletions
|
|
@ -148,7 +148,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 63,
|
||||
"execution_count": 40,
|
||||
"metadata": {
|
||||
"id": "hmj3POkDcG1N"
|
||||
},
|
||||
|
|
@ -163,18 +163,17 @@
|
|||
" )\n",
|
||||
" return response.choices[0].message.content.strip()\n",
|
||||
"\n",
|
||||
"def remove_fields(data, fields=['text'], max_len=40):\n",
|
||||
"def remove_fields(data, fields=['text'], max_len=None):\n",
|
||||
" if isinstance(data, dict):\n",
|
||||
" return {k: remove_fields(v, fields)\n",
|
||||
" for k, v in data.items() if k not in fields}\n",
|
||||
" return {k: remove_fields(v, fields, max_len) for k, v in data.items() if k not in fields}\n",
|
||||
" elif isinstance(data, list):\n",
|
||||
" return [remove_fields(item, fields) for item in data]\n",
|
||||
" return [remove_fields(item, fields, max_len) for item in data]\n",
|
||||
" elif isinstance(data, str):\n",
|
||||
" return (data[:max_len] + '...') if len(data) > max_len else data\n",
|
||||
" return data[:max_len] + '...' if max_len is not None and len(data) > max_len else data\n",
|
||||
" return data\n",
|
||||
"\n",
|
||||
"def print_tree(tree, exclude_fields=['text', 'page_index']):\n",
|
||||
" cleaned_tree = remove_fields(tree.copy(), exclude_fields)\n",
|
||||
" cleaned_tree = remove_fields(tree.copy(), exclude_fields, max_len=40)\n",
|
||||
" pprint(cleaned_tree, sort_dicts=False, width=100)\n",
|
||||
"\n",
|
||||
"def show(text, width=100):\n",
|
||||
|
|
@ -212,7 +211,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
|
|
@ -226,7 +225,7 @@
|
|||
"output_type": "stream",
|
||||
"text": [
|
||||
"Downloaded https://arxiv.org/pdf/2501.12948.pdf\n",
|
||||
"Document Submitted: pi-cmek7luf400960ao3o0o8us4d\n"
|
||||
"Document Submitted: pi-cmeseq08w00vt0bo3u6tr244g\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
@ -258,7 +257,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 41,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
|
|
@ -302,7 +301,7 @@
|
|||
" 'prefix_summary': '### 2.2. DeepSeek-R1-Zero: Reinforcement...',\n",
|
||||
" 'nodes': [{'title': '2.2.1. Reinforcement Learning Algorithm',\n",
|
||||
" 'node_id': '0009',\n",
|
||||
" 'summary': 'This partial document describes the Grou...'},\n",
|
||||
" 'summary': 'The partial document describes the Group...'},\n",
|
||||
" {'title': '2.2.2. Reward Modeling',\n",
|
||||
" 'node_id': '0010',\n",
|
||||
" 'summary': 'This partial document discusses the rewa...'},\n",
|
||||
|
|
@ -335,7 +334,7 @@
|
|||
" 'summary': 'This partial document presents the concl...'},\n",
|
||||
" {'title': 'References',\n",
|
||||
" 'node_id': '0020',\n",
|
||||
" 'summary': 'The partial document consists of a compr...'},\n",
|
||||
" 'summary': 'This partial document consists of the re...'},\n",
|
||||
" {'title': 'Appendix', 'node_id': '0021', 'summary': '## Appendix\\n'},\n",
|
||||
" {'title': 'A. Contributions and Acknowledgments',\n",
|
||||
" 'node_id': '0022',\n",
|
||||
|
|
@ -370,7 +369,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"execution_count": 25,
|
||||
"metadata": {
|
||||
"id": "LLHNJAtTcG1O"
|
||||
},
|
||||
|
|
@ -410,7 +409,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 57,
|
||||
"execution_count": 26,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
|
|
@ -425,13 +424,12 @@
|
|||
"output_type": "stream",
|
||||
"text": [
|
||||
"Reasoning Process:\n",
|
||||
"The question asks for the conclusions in the document. Typically, conclusions are found in sections\n",
|
||||
"explicitly titled 'Conclusion' or in sections summarizing the findings and implications of the work.\n",
|
||||
"In this document tree, node 0019 ('5. Conclusion, Limitations, and Future Work') is the most\n",
|
||||
"directly relevant, as it is dedicated to the conclusion and related topics. Additionally, the\n",
|
||||
"'Abstract' (node 0001) may contain a high-level summary that sometimes includes concluding remarks,\n",
|
||||
"but it is less likely to contain the full conclusions. Other sections like 'Discussion' (node 0018)\n",
|
||||
"may discuss implications but are not explicitly conclusions. Therefore, the primary node is 0019.\n",
|
||||
"The question asks for the conclusions in the document. The most direct and relevant node is '5.\n",
|
||||
"Conclusion, Limitations, and Future Work' (node_id: 0019), as it is specifically dedicated to the\n",
|
||||
"conclusion and related topics. Other nodes, such as the Abstract (0001), Introduction (0003), and\n",
|
||||
"Discussion (0018), may contain summary statements or high-level findings, but the explicit\n",
|
||||
"conclusions are most likely found in node 0019. Therefore, node 0019 is the primary node likely to\n",
|
||||
"contain the answer.\n",
|
||||
"\n",
|
||||
"Retrieved Nodes:\n",
|
||||
"Node ID: 0019\t Page: 16\t Title: 5. Conclusion, Limitations, and Future Work\n"
|
||||
|
|
@ -469,7 +467,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 58,
|
||||
"execution_count": 27,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
|
|
@ -521,7 +519,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 59,
|
||||
"execution_count": 28,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
|
|
@ -537,18 +535,18 @@
|
|||
"text": [
|
||||
"Generated Answer:\n",
|
||||
"\n",
|
||||
"The conclusions in this document are:\n",
|
||||
"**Conclusions in this document:**\n",
|
||||
"\n",
|
||||
"- DeepSeek-R1-Zero, a pure reinforcement learning (RL) approach without cold-start data, achieves\n",
|
||||
"- DeepSeek-R1-Zero, a pure reinforcement learning (RL) model without cold-start data, achieves\n",
|
||||
"strong performance across various tasks.\n",
|
||||
"- DeepSeek-R1, which combines cold-start data with iterative RL fine-tuning, is more powerful and\n",
|
||||
"achieves performance comparable to OpenAI-o1-1217 on a range of tasks.\n",
|
||||
"- Distilling DeepSeek-R1’s reasoning capabilities into smaller dense models is promising; for\n",
|
||||
"example, DeepSeek-R1-Distill-Qwen-1.5B outperforms GPT-4o and Claude-3.5-Sonnet on math benchmarks,\n",
|
||||
"and other dense models also show significant improvements over similar instruction-tuned models.\n",
|
||||
"\n",
|
||||
"These results demonstrate the effectiveness of the RL-based approach and the potential for\n",
|
||||
"distilling reasoning abilities into smaller models.\n"
|
||||
"- DeepSeek-R1, which combines cold-start data with iterative RL fine-tuning, is even more powerful\n",
|
||||
"and achieves performance comparable to OpenAI-o1-1217 on a range of tasks.\n",
|
||||
"- The reasoning capabilities of DeepSeek-R1 can be successfully distilled into smaller dense models,\n",
|
||||
"with DeepSeek-R1-Distill-Qwen-1.5B outperforming GPT-4o and Claude-3.5-Sonnet on math benchmarks.\n",
|
||||
"- Other small dense models fine-tuned with DeepSeek-R1 data also significantly outperform other\n",
|
||||
"instruction-tuned models based on the same checkpoints.\n",
|
||||
"- Overall, the approaches described demonstrate promising results in enhancing model reasoning\n",
|
||||
"abilities through RL and distillation.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue