From 985d8f064fe8dfad1dc1727cd1719c0ae01c653b Mon Sep 17 00:00:00 2001
From: Ray <mailtangyu@gmail.com>
Date: Tue, 26 Aug 2025 18:49:45 +0800
Subject: [PATCH] fix tree cleaning

---
 cookbook/pageindex_RAG_simple.ipynb | 64 ++++++++++++++---------------
 1 file changed, 31 insertions(+), 33 deletions(-)

diff --git a/cookbook/pageindex_RAG_simple.ipynb b/cookbook/pageindex_RAG_simple.ipynb
index dcb1971..663638e 100644
--- a/cookbook/pageindex_RAG_simple.ipynb
+++ b/cookbook/pageindex_RAG_simple.ipynb
@@ -148,7 +148,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 63,
+      "execution_count": 40,
       "metadata": {
         "id": "hmj3POkDcG1N"
       },
@@ -163,18 +163,17 @@
         "    )\n",
         "    return response.choices[0].message.content.strip()\n",
         "\n",
-        "def remove_fields(data, fields=['text'], max_len=40):\n",
+        "def remove_fields(data, fields=['text'], max_len=None):\n",
         "    if isinstance(data, dict):\n",
-        "        return {k: remove_fields(v, fields)\n",
-        "            for k, v in data.items() if k not in fields}\n",
+        "        return {k: remove_fields(v, fields, max_len) for k, v in data.items() if k not in fields}\n",
         "    elif isinstance(data, list):\n",
-        "        return [remove_fields(item, fields) for item in data]\n",
+        "        return [remove_fields(item, fields, max_len) for item in data]\n",
         "    elif isinstance(data, str):\n",
-        "        return (data[:max_len] + '...') if len(data) > max_len else data\n",
+        "        return data[:max_len] + '...' if max_len is not None and len(data) > max_len else data\n",
         "    return data\n",
         "\n",
         "def print_tree(tree, exclude_fields=['text', 'page_index']):\n",
-        "    cleaned_tree = remove_fields(tree.copy(), exclude_fields)\n",
+        "    cleaned_tree = remove_fields(tree.copy(), exclude_fields, max_len=40)\n",
         "    pprint(cleaned_tree, sort_dicts=False, width=100)\n",
         "\n",
         "def show(text, width=100):\n",
@@ -212,7 +211,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 6,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/"
@@ -226,7 +225,7 @@
           "output_type": "stream",
           "text": [
             "Downloaded https://arxiv.org/pdf/2501.12948.pdf\n",
-            "Document Submitted: pi-cmek7luf400960ao3o0o8us4d\n"
+            "Document Submitted: pi-cmeseq08w00vt0bo3u6tr244g\n"
           ]
         }
       ],
@@ -258,7 +257,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
+      "execution_count": 41,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
@@ -302,7 +301,7 @@
             "                        'prefix_summary': '### 2.2. DeepSeek-R1-Zero: Reinforcement...',\n",
             "                        'nodes': [{'title': '2.2.1. Reinforcement Learning Algorithm',\n",
             "                                   'node_id': '0009',\n",
-            "                                   'summary': 'This partial document describes the Grou...'},\n",
+            "                                   'summary': 'The partial document describes the Group...'},\n",
             "                                  {'title': '2.2.2. Reward Modeling',\n",
             "                                   'node_id': '0010',\n",
             "                                   'summary': 'This partial document discusses the rewa...'},\n",
@@ -335,7 +334,7 @@
             "             'summary': 'This partial document presents the concl...'},\n",
             "            {'title': 'References',\n",
             "             'node_id': '0020',\n",
-            "             'summary': 'The partial document consists of a compr...'},\n",
+            "             'summary': 'This partial document consists of the re...'},\n",
             "            {'title': 'Appendix', 'node_id': '0021', 'summary': '## Appendix\\n'},\n",
             "            {'title': 'A. Contributions and Acknowledgments',\n",
             "             'node_id': '0022',\n",
@@ -370,7 +369,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 21,
+      "execution_count": 25,
       "metadata": {
         "id": "LLHNJAtTcG1O"
       },
@@ -410,7 +409,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 57,
+      "execution_count": 26,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
@@ -425,13 +424,12 @@
           "output_type": "stream",
           "text": [
             "Reasoning Process:\n",
-            "The question asks for the conclusions in the document. Typically, conclusions are found in sections\n",
-            "explicitly titled 'Conclusion' or in sections summarizing the findings and implications of the work.\n",
-            "In this document tree, node 0019 ('5. Conclusion, Limitations, and Future Work') is the most\n",
-            "directly relevant, as it is dedicated to the conclusion and related topics. Additionally, the\n",
-            "'Abstract' (node 0001) may contain a high-level summary that sometimes includes concluding remarks,\n",
-            "but it is less likely to contain the full conclusions. Other sections like 'Discussion' (node 0018)\n",
-            "may discuss implications but are not explicitly conclusions. Therefore, the primary node is 0019.\n",
+            "The question asks for the conclusions in the document. The most direct and relevant node is '5.\n",
+            "Conclusion, Limitations, and Future Work' (node_id: 0019), as it is specifically dedicated to the\n",
+            "conclusion and related topics. Other nodes, such as the Abstract (0001), Introduction (0003), and\n",
+            "Discussion (0018), may contain summary statements or high-level findings, but the explicit\n",
+            "conclusions are most likely found in node 0019. Therefore, node 0019 is the primary node likely to\n",
+            "contain the answer.\n",
             "\n",
             "Retrieved Nodes:\n",
             "Node ID: 0019\t Page: 16\t Title: 5. Conclusion, Limitations, and Future Work\n"
@@ -469,7 +467,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 58,
+      "execution_count": 27,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
@@ -521,7 +519,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 59,
+      "execution_count": 28,
       "metadata": {
         "colab": {
           "base_uri": "https://localhost:8080/",
@@ -537,18 +535,18 @@
           "text": [
             "Generated Answer:\n",
             "\n",
-            "The conclusions in this document are:\n",
+            "**Conclusions in this document:**\n",
             "\n",
-            "- DeepSeek-R1-Zero, a pure reinforcement learning (RL) approach without cold-start data, achieves\n",
+            "- DeepSeek-R1-Zero, a pure reinforcement learning (RL) model without cold-start data, achieves\n",
             "strong performance across various tasks.\n",
-            "- DeepSeek-R1, which combines cold-start data with iterative RL fine-tuning, is more powerful and\n",
-            "achieves performance comparable to OpenAI-o1-1217 on a range of tasks.\n",
-            "- Distilling DeepSeek-R1’s reasoning capabilities into smaller dense models is promising; for\n",
-            "example, DeepSeek-R1-Distill-Qwen-1.5B outperforms GPT-4o and Claude-3.5-Sonnet on math benchmarks,\n",
-            "and other dense models also show significant improvements over similar instruction-tuned models.\n",
-            "\n",
-            "These results demonstrate the effectiveness of the RL-based approach and the potential for\n",
-            "distilling reasoning abilities into smaller models.\n"
+            "- DeepSeek-R1, which combines cold-start data with iterative RL fine-tuning, is even more powerful\n",
+            "and achieves performance comparable to OpenAI-o1-1217 on a range of tasks.\n",
+            "- The reasoning capabilities of DeepSeek-R1 can be successfully distilled into smaller dense models,\n",
+            "with DeepSeek-R1-Distill-Qwen-1.5B outperforming GPT-4o and Claude-3.5-Sonnet on math benchmarks.\n",
+            "- Other small dense models fine-tuned with DeepSeek-R1 data also significantly outperform other\n",
+            "instruction-tuned models based on the same checkpoints.\n",
+            "- Overall, the approaches described demonstrate promising results in enhancing model reasoning\n",
+            "abilities through RL and distillation.\n"
           ]
         }
       ],