Merge 2.0 to master (#651)

This commit is contained in:
cybermaggedon 2026-02-28 11:03:14 +00:00 committed by GitHub
parent 3666ece2c5
commit b9d7bf9a8b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
212 changed files with 13940 additions and 6180 deletions

View file

@ -10,7 +10,7 @@ description = "TrustGraph provides a means to run a pipeline of flexible AI proc
readme = "README.md"
requires-python = ">=3.8"
dependencies = [
"trustgraph-base>=1.8,<1.9",
"trustgraph-base>=2.0,<2.1",
"requests",
"pulsar-client",
"aiohttp",
@ -43,9 +43,13 @@ tg-invoke-agent = "trustgraph.cli.invoke_agent:main"
tg-invoke-document-rag = "trustgraph.cli.invoke_document_rag:main"
tg-invoke-graph-rag = "trustgraph.cli.invoke_graph_rag:main"
tg-invoke-llm = "trustgraph.cli.invoke_llm:main"
tg-invoke-embeddings = "trustgraph.cli.invoke_embeddings:main"
tg-invoke-graph-embeddings = "trustgraph.cli.invoke_graph_embeddings:main"
tg-invoke-document-embeddings = "trustgraph.cli.invoke_document_embeddings:main"
tg-invoke-mcp-tool = "trustgraph.cli.invoke_mcp_tool:main"
tg-invoke-nlp-query = "trustgraph.cli.invoke_nlp_query:main"
tg-invoke-objects-query = "trustgraph.cli.invoke_objects_query:main"
tg-invoke-rows-query = "trustgraph.cli.invoke_rows_query:main"
tg-invoke-row-embeddings = "trustgraph.cli.invoke_row_embeddings:main"
tg-invoke-prompt = "trustgraph.cli.invoke_prompt:main"
tg-invoke-structured-query = "trustgraph.cli.invoke_structured_query:main"
tg-load-doc-embeds = "trustgraph.cli.load_doc_embeds:main"

View file

@ -0,0 +1,121 @@
"""
Queries document chunks by text similarity using vector embeddings.
Returns a list of matching document chunks, truncated to the specified length.
"""
import argparse
import os
from trustgraph.api import Api
# Endpoint and optional auth token for the TrustGraph API, overridable
# via the TRUSTGRAPH_URL / TRUSTGRAPH_TOKEN environment variables.
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
def truncate_chunk(chunk, max_length):
    """Return *chunk* unchanged if it fits within max_length characters,
    otherwise cut it at max_length and append an ellipsis."""
    if len(chunk) > max_length:
        return chunk[:max_length] + "..."
    return chunk
def query(url, flow_id, query_text, user, collection, limit, max_chunk_length, token=None):
    """Run a document-embeddings similarity query over the given flow and
    print each matching chunk, numbered and truncated to max_chunk_length."""
    client = Api(url=url, token=token)
    sock = client.socket()
    flow = sock.flow(flow_id)
    try:
        response = flow.document_embeddings_query(
            text=query_text,
            user=user,
            collection=collection,
            limit=limit,
        )
        for number, chunk in enumerate(response.get("chunks", []), start=1):
            print(f"{number}. {truncate_chunk(chunk, max_chunk_length)}")
    finally:
        # Always release the socket, even when the query raises.
        sock.close()
def _build_parser():
    """Assemble the argument parser for tg-invoke-document-embeddings."""
    parser = argparse.ArgumentParser(
        prog='tg-invoke-document-embeddings',
        description=__doc__,
    )
    parser.add_argument(
        '-u', '--url',
        default=default_url,
        help=f'API URL (default: {default_url})',
    )
    parser.add_argument(
        '-t', '--token',
        default=default_token,
        help='Authentication token (default: $TRUSTGRAPH_TOKEN)',
    )
    parser.add_argument(
        '-f', '--flow-id',
        default="default",
        help='Flow ID (default: default)'
    )
    parser.add_argument(
        '-U', '--user',
        default="trustgraph",
        help='User/keyspace (default: trustgraph)',
    )
    parser.add_argument(
        '-c', '--collection',
        default="default",
        help='Collection (default: default)',
    )
    parser.add_argument(
        '-l', '--limit',
        type=int,
        default=10,
        help='Maximum number of results (default: 10)',
    )
    parser.add_argument(
        '--max-chunk-length',
        type=int,
        default=200,
        help='Truncate chunks to N characters (default: 200)',
    )
    parser.add_argument(
        'query',
        nargs=1,
        help='Query text to search for similar document chunks',
    )
    return parser

def main():
    """Parse the command line and run the document-embeddings query."""
    args = _build_parser().parse_args()
    try:
        query(
            url=args.url,
            flow_id=args.flow_id,
            query_text=args.query[0],
            user=args.user,
            collection=args.collection,
            limit=args.limit,
            max_chunk_length=args.max_chunk_length,
            token=args.token,
        )
    except Exception as e:
        # NOTE(review): errors are printed without a traceback and the
        # process still exits 0 — confirm that is intended for scripting.
        print("Exception:", e, flush=True)

if __name__ == "__main__":
    main()

View file

@ -0,0 +1,77 @@
"""
Invokes the embeddings service to convert text to a vector embedding.
Returns the embedding vector as a list of floats.
"""
import argparse
import os
from trustgraph.api import Api
# Endpoint and optional auth token for the TrustGraph API, overridable
# via the TRUSTGRAPH_URL / TRUSTGRAPH_TOKEN environment variables.
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
def query(url, flow_id, text, token=None):
    """Convert *text* to an embedding vector via the flow's embeddings
    service and print the resulting vectors list."""
    client = Api(url=url, token=token)
    sock = client.socket()
    flow = sock.flow(flow_id)
    try:
        response = flow.embeddings(text=text)
        print(response.get("vectors", []))
    finally:
        # Always release the socket, even when the call raises.
        sock.close()
def _build_parser():
    """Assemble the argument parser for tg-invoke-embeddings."""
    parser = argparse.ArgumentParser(
        prog='tg-invoke-embeddings',
        description=__doc__,
    )
    parser.add_argument(
        '-u', '--url',
        default=default_url,
        help=f'API URL (default: {default_url})',
    )
    parser.add_argument(
        '-t', '--token',
        default=default_token,
        help='Authentication token (default: $TRUSTGRAPH_TOKEN)',
    )
    parser.add_argument(
        '-f', '--flow-id',
        default="default",
        help='Flow ID (default: default)'
    )
    parser.add_argument(
        'text',
        nargs=1,
        help='Text to convert to embedding vector',
    )
    return parser

def main():
    """Parse the command line and invoke the embeddings service."""
    args = _build_parser().parse_args()
    try:
        query(
            url=args.url,
            flow_id=args.flow_id,
            text=args.text[0],
            token=args.token,
        )
    except Exception as e:
        # NOTE(review): errors are printed without a traceback and the
        # process still exits 0 — confirm that is intended for scripting.
        print("Exception:", e, flush=True)

if __name__ == "__main__":
    main()

View file

@ -0,0 +1,106 @@
"""
Queries graph entities by text similarity using vector embeddings.
Returns a list of matching graph entities.
"""
import argparse
import os
from trustgraph.api import Api
# Endpoint and optional auth token for the TrustGraph API, overridable
# via the TRUSTGRAPH_URL / TRUSTGRAPH_TOKEN environment variables.
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
def query(url, flow_id, query_text, user, collection, limit, token=None):
    """Run a graph-embeddings similarity query over the given flow and
    print each matching entity, one per line."""
    client = Api(url=url, token=token)
    sock = client.socket()
    flow = sock.flow(flow_id)
    try:
        response = flow.graph_embeddings_query(
            text=query_text,
            user=user,
            collection=collection,
            limit=limit,
        )
        for entity in response.get("entities", []):
            print(entity)
    finally:
        # Always release the socket, even when the query raises.
        sock.close()
def _build_parser():
    """Assemble the argument parser for tg-invoke-graph-embeddings."""
    parser = argparse.ArgumentParser(
        prog='tg-invoke-graph-embeddings',
        description=__doc__,
    )
    parser.add_argument(
        '-u', '--url',
        default=default_url,
        help=f'API URL (default: {default_url})',
    )
    parser.add_argument(
        '-t', '--token',
        default=default_token,
        help='Authentication token (default: $TRUSTGRAPH_TOKEN)',
    )
    parser.add_argument(
        '-f', '--flow-id',
        default="default",
        help='Flow ID (default: default)'
    )
    parser.add_argument(
        '-U', '--user',
        default="trustgraph",
        help='User/keyspace (default: trustgraph)',
    )
    parser.add_argument(
        '-c', '--collection',
        default="default",
        help='Collection (default: default)',
    )
    parser.add_argument(
        '-l', '--limit',
        type=int,
        default=10,
        help='Maximum number of results (default: 10)',
    )
    parser.add_argument(
        'query',
        nargs=1,
        help='Query text to search for similar graph entities',
    )
    return parser

def main():
    """Parse the command line and run the graph-embeddings query."""
    args = _build_parser().parse_args()
    try:
        query(
            url=args.url,
            flow_id=args.flow_id,
            query_text=args.query[0],
            user=args.user,
            collection=args.collection,
            limit=args.limit,
            token=args.token,
        )
    except Exception as e:
        # NOTE(review): errors are printed without a traceback and the
        # process still exits 0 — confirm that is intended for scripting.
        print("Exception:", e, flush=True)

if __name__ == "__main__":
    main()

View file

@ -0,0 +1,126 @@
"""
Queries row data by text similarity using vector embeddings on indexed fields.
Returns matching rows with their index values and similarity scores.
"""
import argparse
import os
from trustgraph.api import Api
# Endpoint and optional auth token for the TrustGraph API, overridable
# via the TRUSTGRAPH_URL / TRUSTGRAPH_TOKEN environment variables.
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
def query(url, flow_id, query_text, schema_name, user, collection, index_name, limit, token=None):
    """Run a row-embeddings similarity search over the given flow and
    print the index name, index values, text, and score of each match."""
    client = Api(url=url, token=token)
    sock = client.socket()
    flow = sock.flow(flow_id)
    try:
        response = flow.row_embeddings_query(
            text=query_text,
            schema_name=schema_name,
            user=user,
            collection=collection,
            index_name=index_name,
            limit=limit,
        )
        for hit in response.get("matches", []):
            print(f"Index: {hit['index_name']}")
            print(f" Values: {hit['index_value']}")
            print(f" Text: {hit['text']}")
            print(f" Score: {hit['score']:.4f}")
            print()
    finally:
        # Always release the socket, even when the query raises.
        sock.close()
def _build_parser():
    """Assemble the argument parser for tg-invoke-row-embeddings."""
    parser = argparse.ArgumentParser(
        prog='tg-invoke-row-embeddings',
        description=__doc__,
    )
    parser.add_argument(
        '-u', '--url',
        default=default_url,
        help=f'API URL (default: {default_url})',
    )
    parser.add_argument(
        '-t', '--token',
        default=default_token,
        help='Authentication token (default: $TRUSTGRAPH_TOKEN)',
    )
    parser.add_argument(
        '-f', '--flow-id',
        default="default",
        help='Flow ID (default: default)'
    )
    parser.add_argument(
        '-U', '--user',
        default="trustgraph",
        help='User/keyspace (default: trustgraph)',
    )
    parser.add_argument(
        '-c', '--collection',
        default="default",
        help='Collection (default: default)',
    )
    parser.add_argument(
        '-s', '--schema-name',
        required=True,
        help='Schema name to search within (required)',
    )
    parser.add_argument(
        '-i', '--index-name',
        default=None,
        help='Index name to filter search (optional)',
    )
    parser.add_argument(
        '-l', '--limit',
        type=int,
        default=10,
        help='Maximum number of results (default: 10)',
    )
    parser.add_argument(
        'query',
        nargs=1,
        help='Query text to search for similar row index values',
    )
    return parser

def main():
    """Parse the command line and run the row-embeddings search."""
    args = _build_parser().parse_args()
    try:
        query(
            url=args.url,
            flow_id=args.flow_id,
            query_text=args.query[0],
            schema_name=args.schema_name,
            user=args.user,
            collection=args.collection,
            index_name=args.index_name,
            limit=args.limit,
            token=args.token,
        )
    except Exception as e:
        # NOTE(review): errors are printed without a traceback and the
        # process still exits 0 — confirm that is intended for scripting.
        print("Exception:", e, flush=True)

if __name__ == "__main__":
    main()

View file

@ -1,5 +1,5 @@
"""
Uses the ObjectsQuery service to execute GraphQL queries against structured data
Uses the RowsQuery service to execute GraphQL queries against structured data
"""
import argparse
@ -81,7 +81,7 @@ def format_table_data(rows, table_name, output_format):
else:
return json.dumps({table_name: rows}, indent=2)
def objects_query(
def rows_query(
url, flow_id, query, user, collection, variables, operation_name, output_format='table'
):
@ -96,7 +96,7 @@ def objects_query(
print(f"Error parsing variables JSON: {e}", file=sys.stderr)
sys.exit(1)
resp = api.objects_query(
resp = api.rows_query(
query=query,
user=user,
collection=collection,
@ -126,7 +126,7 @@ def objects_query(
def main():
parser = argparse.ArgumentParser(
prog='tg-invoke-objects-query',
prog='tg-invoke-rows-query',
description=__doc__,
)
@ -181,7 +181,7 @@ def main():
try:
objects_query(
rows_query(
url=args.url,
flow_id=args.flow_id,
query=args.query,

View file

@ -87,13 +87,20 @@ class KnowledgeLoader:
# Load triples from all files
print("Loading triples...")
total_triples = 0
for file in self.files:
print(f" Processing {file}...")
triples = self.load_triples_from_file(file)
count = 0
def counting_triples():
nonlocal count
for triple in self.load_triples_from_file(file):
count += 1
yield triple
bulk.import_triples(
flow=self.flow,
triples=triples,
triples=counting_triples(),
metadata={
"id": self.document_id,
"metadata": [],
@ -101,25 +108,33 @@ class KnowledgeLoader:
"collection": self.collection
}
)
print(f" Loaded {count} triples")
total_triples += count
print("Triples loaded.")
print(f"Triples loaded. Total: {total_triples}")
# Load entity contexts from all files
print("Loading entity contexts...")
total_contexts = 0
for file in self.files:
print(f" Processing {file}...")
count = 0
# Convert tuples to the format expected by import_entity_contexts
# Entity must be in Term format: {"t": "i", "i": uri} for IRI
def entity_context_generator():
nonlocal count
for entity, context in self.load_entity_contexts_from_file(file):
count += 1
# Entities from RDF are URIs, use IRI term format
yield {
"entity": {"v": entity, "e": True},
"entity": {"t": "i", "i": entity},
"context": context
}
bulk.import_entity_contexts(
flow=self.flow,
entities=entity_context_generator(),
contexts=entity_context_generator(),
metadata={
"id": self.document_id,
"metadata": [],
@ -127,8 +142,10 @@ class KnowledgeLoader:
"collection": self.collection
}
)
print(f" Loaded {count} entity contexts")
total_contexts += count
print("Entity contexts loaded.")
print(f"Entity contexts loaded. Total: {total_contexts}")
except Exception as e:
print(f"Error: {e}", flush=True)

View file

@ -573,19 +573,19 @@ def _process_data_pipeline(input_file, descriptor_file, user, collection, sample
return output_records, descriptor
def _send_to_trustgraph(objects, api_url, flow, batch_size=1000, token=None):
def _send_to_trustgraph(rows, api_url, flow, batch_size=1000, token=None):
"""Send ExtractedObject records to TrustGraph using Python API"""
from trustgraph.api import Api
try:
total_records = len(objects)
total_records = len(rows)
logger.info(f"Importing {total_records} records to TrustGraph...")
# Use Python API bulk import
api = Api(api_url, token=token)
bulk = api.bulk()
bulk.import_objects(flow=flow, objects=iter(objects))
bulk.import_rows(flow=flow, rows=iter(rows))
logger.info(f"Successfully imported {total_records} records to TrustGraph")

View file

@ -2,8 +2,9 @@
Configures and registers tools in the TrustGraph system.
This script allows you to define agent tools with various types including:
- knowledge-query: Query knowledge bases
- knowledge-query: Query knowledge bases
- structured-query: Query structured data using natural language
- row-embeddings-query: Semantic search on structured data indexes
- text-completion: Text generation
- mcp-tool: Reference to MCP (Model Context Protocol) tools
- prompt: Prompt template execution
@ -64,6 +65,9 @@ def set_tool(
mcp_tool : str,
collection : str,
template : str,
schema_name : str,
index_name : str,
limit : int,
arguments : List[Argument],
group : List[str],
state : str,
@ -89,6 +93,12 @@ def set_tool(
if template: object["template"] = template
if schema_name: object["schema-name"] = schema_name
if index_name: object["index-name"] = index_name
if limit: object["limit"] = limit
if arguments:
object["arguments"] = [
{
@ -120,30 +130,37 @@ def main():
description=__doc__,
epilog=textwrap.dedent('''
Valid tool types:
knowledge-query - Query knowledge bases (fixed args)
structured-query - Query structured data using natural language (fixed args)
text-completion - Text completion/generation (fixed args)
mcp-tool - Model Control Protocol tool (configurable args)
prompt - Prompt template query (configurable args)
Note: Tools marked "(fixed args)" have predefined arguments and don't need
knowledge-query - Query knowledge bases (fixed args)
structured-query - Query structured data using natural language (fixed args)
row-embeddings-query - Semantic search on structured data indexes (fixed args)
text-completion - Text completion/generation (fixed args)
mcp-tool - Model Control Protocol tool (configurable args)
prompt - Prompt template query (configurable args)
Note: Tools marked "(fixed args)" have predefined arguments and don't need
--argument specified. Tools marked "(configurable args)" require --argument.
Valid argument types:
string - String/text parameter
string - String/text parameter
number - Numeric parameter
Examples:
%(prog)s --id weather_tool --name get_weather \\
--type knowledge-query \\
--description "Get weather information for a location" \\
--collection weather_data
%(prog)s --id data_query_tool --name query_data \\
--type structured-query \\
--description "Query structured data using natural language" \\
--collection sales_data
%(prog)s --id customer_search --name find_customer \\
--type row-embeddings-query \\
--description "Find customers by name using semantic search" \\
--schema-name customers --collection sales \\
--index-name full_name --limit 20
%(prog)s --id calc_tool --name calculate --type mcp-tool \\
--description "Perform mathematical calculations" \\
--mcp-tool calculator \\
@ -181,7 +198,7 @@ def main():
parser.add_argument(
'--type',
help=f'Tool type, one of: knowledge-query, structured-query, text-completion, mcp-tool, prompt',
help=f'Tool type, one of: knowledge-query, structured-query, row-embeddings-query, text-completion, mcp-tool, prompt',
)
parser.add_argument(
@ -191,7 +208,23 @@ def main():
parser.add_argument(
'--collection',
help=f'For knowledge-query and structured-query types: collection to query',
help=f'For knowledge-query, structured-query, and row-embeddings-query types: collection to query',
)
parser.add_argument(
'--schema-name',
help=f'For row-embeddings-query type: schema name to search within (required)',
)
parser.add_argument(
'--index-name',
help=f'For row-embeddings-query type: specific index to filter search (optional)',
)
parser.add_argument(
'--limit',
type=int,
help=f'For row-embeddings-query type: maximum results to return (default: 10)',
)
parser.add_argument(
@ -227,7 +260,8 @@ def main():
try:
valid_types = [
"knowledge-query", "structured-query", "text-completion", "mcp-tool", "prompt"
"knowledge-query", "structured-query", "row-embeddings-query",
"text-completion", "mcp-tool", "prompt"
]
if args.id is None:
@ -261,6 +295,9 @@ def main():
mcp_tool=mcp_tool,
collection=args.collection,
template=args.template,
schema_name=args.schema_name,
index_name=args.index_name,
limit=args.limit,
arguments=arguments,
group=args.group,
state=args.state,

View file

@ -4,8 +4,9 @@ Displays the current agent tool configurations
Shows all configured tools including their types:
- knowledge-query: Tools that query knowledge bases
- structured-query: Tools that query structured data using natural language
- row-embeddings-query: Tools for semantic search on structured data indexes
- text-completion: Tools for text generation
- mcp-tool: References to MCP (Model Context Protocol) tools
- mcp-tool: References to MCP (Model Context Protocol) tools
- prompt: Tools that execute prompt templates
"""
@ -41,11 +42,19 @@ def show_config(url, token=None):
if tp == "mcp-tool":
table.append(("mcp-tool", data["mcp-tool"]))
if tp == "knowledge-query" or tp == "structured-query":
if tp in ("knowledge-query", "structured-query", "row-embeddings-query"):
if "collection" in data:
table.append(("collection", data["collection"]))
if tp == "row-embeddings-query":
if "schema-name" in data:
table.append(("schema-name", data["schema-name"]))
if "index-name" in data:
table.append(("index-name", data["index-name"]))
if "limit" in data:
table.append(("limit", data["limit"]))
if tp == "prompt":
table.append(("template", data["template"]))
for n, arg in enumerate(data["arguments"]):