feat: add cross-encoder reranking to Document-RAG with two-limit control (#878) (#1011)

Wire the FlashRank reranker subsystem from #1005 into Document-RAG: after vector retrieval, over-fetch a wider candidate pool, rerank with the cross-encoder, and keep the top doc_limit chunks for synthesis.

Per maintainer review, the fetch and select sizes are two caller-controlled limits rather than one internal heuristic:
- doc_limit: chunks selected into the synthesis prompt (unchanged meaning).
- fetch_limit: candidate pool pulled from the vector store before reranking. 0 = derive (OVERFETCH_FACTOR x doc_limit); values below doc_limit are raised to it. Lets the caller control how hard the reranker has to work.

Details:
- schema: DocumentRagQuery.fetch_limit (additive, backward compatible).
- document_rag.py / rag.py: fetch_limit resolved in the processor (mirrors doc_limit); the core applies the heuristic default and derives synthesis provenance from the chunk-selection focus when reranking ran.
- provenance: tg:ChunkSelection focus stage (mirrors tg:EdgeSelection).
- request translator + client SDKs + CLI: fetch-limit / --fetch-limit, threaded exactly like doc_limit and the GraphRAG limits.
- tests: no-op identity, over-fetch/narrow, explicit fetch_limit, heuristic default, floor-at-doc_limit, provenance lineage, cross-repo topic wiring.

Reranking is skipped byte-identically when no reranker role is wired. Requires the companion trustgraph-templates change wiring the reranker topics into the document-rag flow (mirrors #279 for GraphRAG).
2026-07-03 15:01:00 +02:00 · 2026-07-02 02:50:13 -06:00 · 2026-07-02 02:50:13 -06:00 · 6c9a545a06
commit 6c9a545a06
parent f18d48dc39
18 changed files with 853 additions and 26 deletions
--- a/trustgraph-cli/trustgraph/cli/invoke_document_rag.py
+++ b/trustgraph-cli/trustgraph/cli/invoke_document_rag.py
@@ -21,10 +21,12 @@ default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
 default_workspace = os.getenv("TRUSTGRAPH_WORKSPACE", "default")
 default_collection = 'default'
 default_doc_limit = 10
+default_fetch_limit = 0


 def question_explainable(
-    url, flow_id, question_text, collection, doc_limit, token=None, debug=False,
+    url, flow_id, question_text, collection, doc_limit, fetch_limit=0,
+    token=None, debug=False,
    workspace="default",
 ):
    """Execute document RAG with explainability - shows provenance events inline."""
@@ -39,6 +41,7 @@ def question_explainable(
            query=question_text,
                        collection=collection,
            doc_limit=doc_limit,
+            fetch_limit=fetch_limit,
        ):
            if isinstance(item, RAGChunk):
                # Print response content
@@ -97,7 +100,7 @@ def question_explainable(


 def question(
-    url, flow_id, question_text, collection, doc_limit,
+    url, flow_id, question_text, collection, doc_limit, fetch_limit=0,
    streaming=True, token=None, explainable=False, debug=False,
    show_usage=False, workspace="default",
 ):
@@ -109,6 +112,7 @@ def question(
            question_text=question_text,
                        collection=collection,
            doc_limit=doc_limit,
+            fetch_limit=fetch_limit,
            token=token,
            debug=debug,
            workspace=workspace,
@@ -128,6 +132,7 @@ def question(
                query=question_text,
                                collection=collection,
                doc_limit=doc_limit,
+                fetch_limit=fetch_limit,
                streaming=True
            )

@@ -155,6 +160,7 @@ def question(
            query=question_text,
                        collection=collection,
            doc_limit=doc_limit,
+            fetch_limit=fetch_limit,
        )
        print(result.text)

@@ -214,7 +220,15 @@ def main():
        '-d', '--doc-limit',
        type=int,
        default=default_doc_limit,
-        help=f'Document limit (default: {default_doc_limit})'
+        help=f'Documents selected into the prompt (default: {default_doc_limit})'
+    )
+
+    parser.add_argument(
+        '--fetch-limit',
+        type=int,
+        default=default_fetch_limit,
+        help='Candidate documents fetched from the vector store before '
+             'reranking (default: derive from doc-limit)'
    )

    parser.add_argument(
@@ -251,6 +265,7 @@ def main():
            question_text=args.question,
            collection=args.collection,
            doc_limit=args.doc_limit,
+            fetch_limit=args.fetch_limit,
            streaming=not args.no_streaming,
            token=args.token,
            explainable=args.explainable,