Feature/more cli diags (#624)

* Add CLI tools tg-invoke-graph-embeddings, tg-invoke-document-embeddings,
and tg-invoke-embeddings, useful for diagnostics.

* Fix tg-load-knowledge
This commit is contained in:
cybermaggedon 2026-02-04 14:10:30 +00:00 committed by GitHub
parent 23cc4dfdd1
commit 6bf08c3ace
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 559 additions and 24 deletions

View file

@ -0,0 +1,121 @@
"""
Queries document chunks by text similarity using vector embeddings.
Returns a list of matching document chunks, truncated to the specified length.
"""
import argparse
import os
from trustgraph.api import Api
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
def truncate_chunk(chunk, max_length):
    """Return *chunk* unchanged when it fits in max_length characters;
    otherwise cut it at max_length and append an ellipsis."""
    return chunk if len(chunk) <= max_length else chunk[:max_length] + "..."
def query(url, flow_id, query_text, user, collection, limit, max_chunk_length, token=None):
    """Run a document-embeddings similarity query against a flow and print
    the matching chunks as a numbered, length-truncated list."""
    client = Api(url=url, token=token)
    sock = client.socket()
    flow = sock.flow(flow_id)
    try:
        # Ask the flow's document-embeddings service for similar chunks
        response = flow.document_embeddings_query(
            text=query_text,
            user=user,
            collection=collection,
            limit=limit
        )
        for index, chunk in enumerate(response.get("chunks", []), start=1):
            print(f"{index}. {truncate_chunk(chunk, max_chunk_length)}")
    finally:
        # Always release the websocket, even when the query fails
        sock.close()
def main():
    """Parse command-line arguments and run the document-embeddings query.

    Exits with status 1 (instead of silently exiting 0) when the query
    fails, so shell scripts can detect the error.
    """
    parser = argparse.ArgumentParser(
        prog='tg-invoke-document-embeddings',
        description=__doc__,
    )
    parser.add_argument(
        '-u', '--url',
        default=default_url,
        help=f'API URL (default: {default_url})',
    )
    parser.add_argument(
        '-t', '--token',
        default=default_token,
        help='Authentication token (default: $TRUSTGRAPH_TOKEN)',
    )
    parser.add_argument(
        '-f', '--flow-id',
        default="default",
        # Plain string: the original used an f-string with no placeholder
        help='Flow ID (default: default)',
    )
    parser.add_argument(
        '-U', '--user',
        default="trustgraph",
        help='User/keyspace (default: trustgraph)',
    )
    parser.add_argument(
        '-c', '--collection',
        default="default",
        help='Collection (default: default)',
    )
    parser.add_argument(
        '-l', '--limit',
        type=int,
        default=10,
        help='Maximum number of results (default: 10)',
    )
    parser.add_argument(
        '--max-chunk-length',
        type=int,
        default=200,
        help='Truncate chunks to N characters (default: 200)',
    )
    parser.add_argument(
        'query',
        nargs=1,
        help='Query text to search for similar document chunks',
    )
    args = parser.parse_args()
    try:
        query(
            url=args.url,
            flow_id=args.flow_id,
            query_text=args.query[0],
            user=args.user,
            collection=args.collection,
            limit=args.limit,
            max_chunk_length=args.max_chunk_length,
            token=args.token,
        )
    except Exception as e:
        # Report the failure and exit non-zero so callers can detect it;
        # previously the script printed the error but still exited 0.
        print("Exception:", e, flush=True)
        raise SystemExit(1)

if __name__ == "__main__":
    main()

View file

@ -0,0 +1,77 @@
"""
Invokes the embeddings service to convert text to a vector embedding.
Returns the embedding vector as a list of floats.
"""
import argparse
import os
from trustgraph.api import Api
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
def query(url, flow_id, text, token=None):
    """Request an embedding vector for *text* from the flow's embeddings
    service and print the resulting list of vectors."""
    client = Api(url=url, token=token)
    sock = client.socket()
    flow = sock.flow(flow_id)
    try:
        # The service response carries the embedding under "vectors"
        response = flow.embeddings(text=text)
        print(response.get("vectors", []))
    finally:
        # Always release the websocket, even when the call fails
        sock.close()
def main():
    """Parse command-line arguments and invoke the embeddings service.

    Exits with status 1 (instead of silently exiting 0) when the call
    fails, so shell scripts can detect the error.
    """
    parser = argparse.ArgumentParser(
        prog='tg-invoke-embeddings',
        description=__doc__,
    )
    parser.add_argument(
        '-u', '--url',
        default=default_url,
        help=f'API URL (default: {default_url})',
    )
    parser.add_argument(
        '-t', '--token',
        default=default_token,
        help='Authentication token (default: $TRUSTGRAPH_TOKEN)',
    )
    parser.add_argument(
        '-f', '--flow-id',
        default="default",
        # Plain string: the original used an f-string with no placeholder
        help='Flow ID (default: default)',
    )
    parser.add_argument(
        'text',
        nargs=1,
        help='Text to convert to embedding vector',
    )
    args = parser.parse_args()
    try:
        query(
            url=args.url,
            flow_id=args.flow_id,
            text=args.text[0],
            token=args.token,
        )
    except Exception as e:
        # Report the failure and exit non-zero so callers can detect it;
        # previously the script printed the error but still exited 0.
        print("Exception:", e, flush=True)
        raise SystemExit(1)

if __name__ == "__main__":
    main()

View file

@ -0,0 +1,106 @@
"""
Queries graph entities by text similarity using vector embeddings.
Returns a list of matching graph entities.
"""
import argparse
import os
from trustgraph.api import Api
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
def query(url, flow_id, query_text, user, collection, limit, token=None):
    """Run a graph-embeddings similarity query against a flow and print
    each matching entity on its own line."""
    client = Api(url=url, token=token)
    sock = client.socket()
    flow = sock.flow(flow_id)
    try:
        # Ask the flow's graph-embeddings service for similar entities
        response = flow.graph_embeddings_query(
            text=query_text,
            user=user,
            collection=collection,
            limit=limit
        )
        for entity in response.get("entities", []):
            print(entity)
    finally:
        # Always release the websocket, even when the query fails
        sock.close()
def main():
    """Parse command-line arguments and run the graph-embeddings query.

    Exits with status 1 (instead of silently exiting 0) when the query
    fails, so shell scripts can detect the error.
    """
    parser = argparse.ArgumentParser(
        prog='tg-invoke-graph-embeddings',
        description=__doc__,
    )
    parser.add_argument(
        '-u', '--url',
        default=default_url,
        help=f'API URL (default: {default_url})',
    )
    parser.add_argument(
        '-t', '--token',
        default=default_token,
        help='Authentication token (default: $TRUSTGRAPH_TOKEN)',
    )
    parser.add_argument(
        '-f', '--flow-id',
        default="default",
        # Plain string: the original used an f-string with no placeholder
        help='Flow ID (default: default)',
    )
    parser.add_argument(
        '-U', '--user',
        default="trustgraph",
        help='User/keyspace (default: trustgraph)',
    )
    parser.add_argument(
        '-c', '--collection',
        default="default",
        help='Collection (default: default)',
    )
    parser.add_argument(
        '-l', '--limit',
        type=int,
        default=10,
        help='Maximum number of results (default: 10)',
    )
    parser.add_argument(
        'query',
        nargs=1,
        help='Query text to search for similar graph entities',
    )
    args = parser.parse_args()
    try:
        query(
            url=args.url,
            flow_id=args.flow_id,
            query_text=args.query[0],
            user=args.user,
            collection=args.collection,
            limit=args.limit,
            token=args.token,
        )
    except Exception as e:
        # Report the failure and exit non-zero so callers can detect it;
        # previously the script printed the error but still exited 0.
        print("Exception:", e, flush=True)
        raise SystemExit(1)

if __name__ == "__main__":
    main()

View file

@ -87,13 +87,20 @@ class KnowledgeLoader:
# Load triples from all files
print("Loading triples...")
total_triples = 0
for file in self.files:
print(f" Processing {file}...")
triples = self.load_triples_from_file(file)
count = 0
def counting_triples():
nonlocal count
for triple in self.load_triples_from_file(file):
count += 1
yield triple
bulk.import_triples(
flow=self.flow,
triples=triples,
triples=counting_triples(),
metadata={
"id": self.document_id,
"metadata": [],
@ -101,25 +108,33 @@ class KnowledgeLoader:
"collection": self.collection
}
)
print(f" Loaded {count} triples")
total_triples += count
print("Triples loaded.")
print(f"Triples loaded. Total: {total_triples}")
# Load entity contexts from all files
print("Loading entity contexts...")
total_contexts = 0
for file in self.files:
print(f" Processing {file}...")
count = 0
# Convert tuples to the format expected by import_entity_contexts
# Entity must be in Term format: {"t": "i", "i": uri} for IRI
def entity_context_generator():
nonlocal count
for entity, context in self.load_entity_contexts_from_file(file):
count += 1
# Entities from RDF are URIs, use IRI term format
yield {
"entity": {"v": entity, "e": True},
"entity": {"t": "i", "i": entity},
"context": context
}
bulk.import_entity_contexts(
flow=self.flow,
entities=entity_context_generator(),
contexts=entity_context_generator(),
metadata={
"id": self.document_id,
"metadata": [],
@ -127,8 +142,10 @@ class KnowledgeLoader:
"collection": self.collection
}
)
print(f" Loaded {count} entity contexts")
total_contexts += count
print("Entity contexts loaded.")
print(f"Entity contexts loaded. Total: {total_contexts}")
except Exception as e:
print(f"Error: {e}", flush=True)