release/v2.4 -> master (#844)

2026-04-25 08:26:21 +02:00 · 2026-04-22 15:19:57 +01:00 · 2026-04-22 15:19:57 +01:00 · 89cabee1b4
commit 89cabee1b4
parent a24df8e990
386 changed files with 7202 additions and 5741 deletions
--- a/trustgraph-cli/trustgraph/cli/load_structured_data.py
+++ b/trustgraph-cli/trustgraph/cli/load_structured_data.py
@ -23,6 +23,7 @@ logger = logging.getLogger(__name__)

 default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
 default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
+default_workspace = os.getenv("TRUSTGRAPH_WORKSPACE", "default")


 def load_structured_data(
@ -39,11 +40,11 @@ def load_structured_data(
    sample_chars: int = 500,
    schema_name: str = None,
    flow: str = 'default',
-    user: str = 'trustgraph',
    collection: str = 'default',
    dry_run: bool = False,
    verbose: bool = False,
-    token: str = None
+    token: str = None,
+    workspace: str = "default",
 ):
    """
    Load structured data using a descriptor configuration.
@ -62,7 +63,6 @@ def load_structured_data(
        sample_chars: Maximum characters to read for sampling
        schema_name: Target schema name for generation
        flow: TrustGraph flow name to use for prompts
-        user: User name for metadata (default: trustgraph)
        collection: Collection name for metadata (default: default)
        dry_run: If True, validate but don't import data
        verbose: Enable verbose logging
@ -78,7 +78,7 @@ def load_structured_data(
        logger.info("Step 1: Analyzing data to discover best matching schema...")
        
        # Step 1: Auto-discover schema (reuse discover_schema logic)
-        discovered_schema = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger)
+        discovered_schema = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger, workspace=workspace)
        if not discovered_schema:
            logger.error("Failed to discover suitable schema automatically")
            print("❌ Could not automatically determine the best schema for your data.")
@ -90,7 +90,7 @@ def load_structured_data(
        
        # Step 2: Auto-generate descriptor
        logger.info("Step 2: Generating descriptor configuration...")
-        auto_descriptor = _auto_generate_descriptor(api_url, input_file, discovered_schema, sample_chars, flow, logger)
+        auto_descriptor = _auto_generate_descriptor(api_url, input_file, discovered_schema, sample_chars, flow, logger, workspace=workspace)
        if not auto_descriptor:
            logger.error("Failed to generate descriptor automatically")
            print("❌ Could not automatically generate descriptor configuration.")
@ -110,7 +110,7 @@ def load_structured_data(
        
        try:
            # Use shared pipeline for preview (small sample)
-            preview_objects, _ = _process_data_pipeline(input_file, temp_descriptor.name, user, collection, sample_size=5)
+            preview_objects, _ = _process_data_pipeline(input_file, temp_descriptor.name, collection, sample_size=5)
            
            # Show preview
            print("📊 Data Preview (first few records):")
@ -131,13 +131,13 @@ def load_structured_data(
                print("🚀 Importing data to TrustGraph...")
                
                # Use shared pipeline for full processing (no sample limit)
-                output_objects, descriptor = _process_data_pipeline(input_file, temp_descriptor.name, user, collection)
+                output_objects, descriptor = _process_data_pipeline(input_file, temp_descriptor.name, collection)
                
                # Get batch size from descriptor
                batch_size = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)

                # Send to TrustGraph using shared function
-                imported_count = _send_to_trustgraph(output_objects, api_url, flow, batch_size, token=token)
+                imported_count = _send_to_trustgraph(output_objects, api_url, flow, batch_size, token=token, workspace=workspace)
                
                # Summary
                format_info = descriptor.get('format', {})
@ -172,7 +172,7 @@ def load_structured_data(
        logger.info(f"Sample chars: {sample_chars} characters")
        
        # Use the helper function to discover schema (get raw response for display)
-        response = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger, return_raw_response=True)
+        response = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger, return_raw_response=True, workspace=workspace)
        
        if response:
            # Debug: print response type and content 
@ -203,7 +203,7 @@ def load_structured_data(
        # If no schema specified, discover it first
        if not schema_name:
            logger.info("No schema specified, auto-discovering...")
-            schema_name = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger)
+            schema_name = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger, workspace=workspace)
            if not schema_name:
                print("Error: Could not determine schema automatically.")
                print("Please specify a schema using --schema-name or run --discover-schema first.")
@ -213,7 +213,7 @@ def load_structured_data(
            logger.info(f"Target schema: {schema_name}")
        
        # Generate descriptor using helper function
-        descriptor = _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, flow, logger)
+        descriptor = _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, flow, logger, workspace=workspace)
        
        if descriptor:
            # Output the generated descriptor
@ -242,7 +242,7 @@ def load_structured_data(
        logger.info(f"Parsing {input_file} with descriptor {descriptor_file}...")
        
        # Use shared pipeline
-        output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, user, collection, sample_size)
+        output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, collection, sample_size)
        
        # Output results
        if output_file:
@ -286,7 +286,7 @@ def load_structured_data(
        logger.info(f"Loading {input_file} to TrustGraph using descriptor {descriptor_file}...")
        
        # Use shared pipeline (no sample_size limit for full load)
-        output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, user, collection)
+        output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, collection)
        
        # Get batch size from descriptor or use default
        batch_size = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)
@ -527,18 +527,17 @@ def _apply_transformations(records, mappings):
    return processed_records


-def _format_extracted_objects(processed_records, descriptor, user, collection):
+def _format_extracted_objects(processed_records, descriptor, collection):
    """Convert to TrustGraph ExtractedObject format"""
    output_records = []
    schema_name = descriptor.get('output', {}).get('schema_name', 'default')
    confidence = descriptor.get('output', {}).get('options', {}).get('confidence', 0.9)
-    
+
    for record in processed_records:
        output_record = {
            "metadata": {
                "id": f"parsed-{len(output_records)+1}",
                "metadata": [],  # Empty metadata triples
-                "user": user,
                "collection": collection
            },
            "schema_name": schema_name,
@ -551,7 +550,7 @@ def _format_extracted_objects(processed_records, descriptor, user, collection):
    return output_records


-def _process_data_pipeline(input_file, descriptor_file, user, collection, sample_size=None):
+def _process_data_pipeline(input_file, descriptor_file, collection, sample_size=None):
    """Shared pipeline: load descriptor → read → parse → transform → format"""
    # Load descriptor configuration
    descriptor = _load_descriptor(descriptor_file)
@ -568,12 +567,12 @@ def _process_data_pipeline(input_file, descriptor_file, user, collection, sample
    processed_records = _apply_transformations(parsed_records, mappings)
    
    # Format output for TrustGraph ExtractedObject structure
-    output_records = _format_extracted_objects(processed_records, descriptor, user, collection)
+    output_records = _format_extracted_objects(processed_records, descriptor, collection)
    
    return output_records, descriptor


-def _send_to_trustgraph(rows, api_url, flow, batch_size=1000, token=None):
+def _send_to_trustgraph(rows, api_url, flow, batch_size=1000, token=None, workspace="default"):
    """Send ExtractedObject records to TrustGraph using Python API"""
    from trustgraph.api import Api

@ -582,7 +581,7 @@ def _send_to_trustgraph(rows, api_url, flow, batch_size=1000, token=None):
        logger.info(f"Importing {total_records} records to TrustGraph...")

        # Use Python API bulk import
-        api = Api(api_url, token=token)
+        api = Api(api_url, token=token, workspace=workspace)
        bulk = api.bulk()

        bulk.import_rows(flow=flow, rows=iter(rows))
@ -604,7 +603,7 @@ def _send_to_trustgraph(rows, api_url, flow, batch_size=1000, token=None):


 # Helper functions for auto mode
-def _auto_discover_schema(api_url, input_file, sample_chars, flow, logger, return_raw_response=False):
+def _auto_discover_schema(api_url, input_file, sample_chars, flow, logger, return_raw_response=False, workspace="default"):
    """Auto-discover the best matching schema for the input data
    
    Args:
@ -627,7 +626,7 @@ def _auto_discover_schema(api_url, input_file, sample_chars, flow, logger, retur
        # Import API modules
        from trustgraph.api import Api
        from trustgraph.api.types import ConfigKey
-        api = Api(api_url)
+        api = Api(api_url, workspace=workspace)
        config_api = api.config()
        
        # Get available schemas
@ -708,7 +707,7 @@ def _auto_discover_schema(api_url, input_file, sample_chars, flow, logger, retur
        return None


-def _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, flow, logger):
+def _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, flow, logger, workspace="default"):
    """Auto-generate descriptor configuration for the discovered schema"""
    try:
        # Read sample data
@ -718,7 +717,7 @@ def _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, fl
        # Import API modules
        from trustgraph.api import Api
        from trustgraph.api.types import ConfigKey
-        api = Api(api_url)
+        api = Api(api_url, workspace=workspace)
        config_api = api.config()
        
        # Get schema definition
@ -885,12 +884,6 @@ For more information on the descriptor format, see:
        help='TrustGraph flow name to use for prompts and import (default: default)'
    )
    
-    parser.add_argument(
-        '--user',
-        default='trustgraph',
-        help='User name for metadata (default: trustgraph)'
-    )
-    
    parser.add_argument(
        '--collection',
        default='default',
@ -997,6 +990,12 @@ For more information on the descriptor format, see:
        help='Authentication token (default: $TRUSTGRAPH_TOKEN)',
    )

+    parser.add_argument(
+        '-w', '--workspace',
+        default=default_workspace,
+        help=f'Workspace (default: {default_workspace})',
+    )
+
    args = parser.parse_args()
    
    # Input validation
@ -1046,11 +1045,11 @@ For more information on the descriptor format, see:
            sample_chars=args.sample_chars,
            schema_name=args.schema_name,
            flow=args.flow,
-            user=args.user,
            collection=args.collection,
            dry_run=args.dry_run,
            verbose=args.verbose,
-            token=args.token
+            token=args.token,
+            workspace=args.workspace,
        )
    except FileNotFoundError as e:
        print(f"Error: File not found - {e}", file=sys.stderr)