Fix structured-load auto mode (#501)

* Fix incorrect API usage

* Refactor to remove duplicate code

* Fix schema extraction in --auto

* Fix broken load functionality
cybermaggedon 2025-09-06 12:28:28 +01:00 committed by GitHub
parent 5537fac731
commit 415208e3e7

@@ -2,7 +2,7 @@
 Load structured data into TrustGraph using a descriptor configuration.

 This utility can:
-1. Analyze data samples to suggest appropriate schemas
+1. Analyze data samples to discover appropriate schemas
 2. Generate descriptor configurations from data samples
 3. Parse and transform data using descriptor configurations
 4. Import processed data into TrustGraph
@@ -28,15 +28,18 @@ def load_structured_data(
     api_url: str,
     input_file: str,
     descriptor_file: str = None,
-    suggest_schema: bool = False,
+    discover_schema: bool = False,
     generate_descriptor: bool = False,
     parse_only: bool = False,
+    load: bool = False,
     auto: bool = False,
     output_file: str = None,
     sample_size: int = 100,
     sample_chars: int = 500,
     schema_name: str = None,
     flow: str = 'default',
+    user: str = 'trustgraph',
+    collection: str = 'default',
     dry_run: bool = False,
     verbose: bool = False
 ):
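For orientation, a minimal sketch of driving the reworked entry point programmatically in the new --load mode; the endpoint, file names, and metadata values below are illustrative assumptions, not part of the commit:

# Hypothetical call exercising the new load mode; user/collection
# flow into each ExtractedObject's metadata block.
load_structured_data(
    api_url="http://localhost:8088/",    # assumed local API endpoint
    input_file="customers.csv",          # illustrative input
    descriptor_file="descriptor.json",   # illustrative descriptor
    load=True,
    flow="default",
    user="trustgraph",
    collection="default",
)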
@@ -47,14 +50,18 @@ def load_structured_data(
         api_url: TrustGraph API URL
         input_file: Path to input data file
         descriptor_file: Path to JSON descriptor configuration
-        suggest_schema: Analyze data and suggest matching schemas
+        discover_schema: Analyze data and discover matching schemas
         generate_descriptor: Generate descriptor from data sample
         parse_only: Parse data but don't import to TrustGraph
-        auto: Run full automatic pipeline (suggest schema + generate descriptor + import)
+        load: Load data to TrustGraph using existing descriptor
+        auto: Run full automatic pipeline (discover schema + generate descriptor + import)
         output_file: Path to write output (descriptor/parsed data)
         sample_size: Number of records to sample for analysis
         sample_chars: Maximum characters to read for sampling
         schema_name: Target schema name for generation
+        flow: TrustGraph flow name to use for prompts
+        user: User name for metadata (default: trustgraph)
+        collection: Collection name for metadata (default: default)
         dry_run: If True, validate but don't import data
         verbose: Enable verbose logging
     """
@@ -68,12 +75,12 @@ def load_structured_data(
         logger.info(f"🚀 Starting automatic pipeline for {input_file}...")
         logger.info("Step 1: Analyzing data to discover best matching schema...")

-        # Step 1: Auto-discover schema (reuse suggest_schema logic)
-        discovered_schema = _auto_discover_schema(api_url, input_file, sample_chars, logger)
+        # Step 1: Auto-discover schema (reuse discover_schema logic)
+        discovered_schema = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger)
         if not discovered_schema:
             logger.error("Failed to discover suitable schema automatically")
             print("❌ Could not automatically determine the best schema for your data.")
-            print("💡 Try running with --suggest-schema first to see available options.")
+            print("💡 Try running with --discover-schema first to see available options.")
             return None

         logger.info(f"✅ Discovered schema: {discovered_schema}")
@@ -81,7 +88,7 @@ def load_structured_data(
         # Step 2: Auto-generate descriptor
         logger.info("Step 2: Generating descriptor configuration...")
-        auto_descriptor = _auto_generate_descriptor(api_url, input_file, discovered_schema, sample_chars, logger)
+        auto_descriptor = _auto_generate_descriptor(api_url, input_file, discovered_schema, sample_chars, flow, logger)
         if not auto_descriptor:
             logger.error("Failed to generate descriptor automatically")
             print("❌ Could not automatically generate descriptor configuration.")
@@ -90,216 +97,128 @@ def load_structured_data(
         logger.info("✅ Generated descriptor configuration")
         print("📝 Generated descriptor configuration")

-        # Step 3: Parse and preview data
+        # Step 3: Parse and preview data using shared pipeline
         logger.info("Step 3: Parsing and validating data...")
-        preview_records = _auto_parse_preview(input_file, auto_descriptor, min(sample_size, 5), logger)
-        if preview_records is None:
-            logger.error("Failed to parse data with generated descriptor")
-            print("❌ Could not parse data with generated descriptor.")
-            return None
+
+        # Create temporary descriptor file for validation
+        import tempfile
+        temp_descriptor = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
+        json.dump(auto_descriptor, temp_descriptor, indent=2)
+        temp_descriptor.close()
+
+        try:
+            # Use shared pipeline for preview (small sample)
+            preview_objects, _ = _process_data_pipeline(input_file, temp_descriptor.name, user, collection, sample_size=5)

             # Show preview
             print("📊 Data Preview (first few records):")
             print("=" * 50)
-            for i, record in enumerate(preview_records[:3], 1):
-                print(f"Record {i}: {record}")
+            for i, obj in enumerate(preview_objects[:3], 1):
+                values = obj.get('values', {})
+                print(f"Record {i}: {values}")
             print("=" * 50)

             # Step 4: Import (unless dry_run)
             if dry_run:
                 logger.info("✅ Dry run complete - data is ready for import")
                 print("✅ Dry run successful! Data is ready for import.")
-                print(f"💡 Run without --dry-run to import {len(preview_records)} records to TrustGraph.")
+                print(f"💡 Run without --dry-run to import data to TrustGraph.")
                 return None
             else:
                 logger.info("Step 4: Importing data to TrustGraph...")
                 print("🚀 Importing data to TrustGraph...")

-                # Recursively call ourselves with the auto-generated descriptor
-                # This reuses all the existing import logic
-                import tempfile
-                import os
-
-                # Save auto-generated descriptor to temp file
-                temp_descriptor = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
-                json.dump(auto_descriptor, temp_descriptor, indent=2)
-                temp_descriptor.close()
-
-                try:
-                    # Call the full pipeline mode with our auto-generated descriptor
-                    result = load_structured_data(
-                        api_url=api_url,
-                        input_file=input_file,
-                        descriptor_file=temp_descriptor.name,
-                        flow=flow,
-                        dry_run=False,  # We already handled dry_run above
-                        verbose=verbose
-                    )
-
-                    logger.info("Auto-import pipeline completed successfully")
-                    return result
-                except Exception as e:
-                    logger.error(f"Auto-import failed: {e}")
-                    print(f"❌ Auto-import failed: {e}")
-                    return None
-                finally:
-                    # Clean up temp descriptor file
-                    try:
-                        os.unlink(temp_descriptor.name)
-                    except:
-                        pass
+                # Use shared pipeline for full processing (no sample limit)
+                output_objects, descriptor = _process_data_pipeline(input_file, temp_descriptor.name, user, collection)
+
+                # Get batch size from descriptor
+                batch_size = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)
+
+                # Send to TrustGraph using shared function
+                imported_count = _send_to_trustgraph(output_objects, api_url, flow, batch_size)
+
+                # Summary
+                format_info = descriptor.get('format', {})
+                format_type = format_info.get('type', 'csv').lower()
+                schema_name = descriptor.get('output', {}).get('schema_name', 'default')
+
+                print(f"\n🎉 Auto-Import Complete!")
+                print(f"- Input format: {format_type}")
+                print(f"- Target schema: {schema_name}")
+                print(f"- Records imported: {imported_count}")
+                print(f"- Flow used: {flow}")
+                print("✅ Auto-import completed successfully!")
+
+                logger.info("Auto-import pipeline completed successfully")
+                return imported_count
+        finally:
+            # Clean up temp descriptor file
+            try:
+                import os
+                os.unlink(temp_descriptor.name)
+            except:
+                pass

-    elif suggest_schema:
-        logger.info(f"Analyzing {input_file} to suggest schemas...")
+    elif discover_schema:
+        logger.info(f"Analyzing {input_file} to discover schemas...")
         logger.info(f"Sample size: {sample_size} records")
         logger.info(f"Sample chars: {sample_chars} characters")

-        # Read sample data from input file
-        try:
-            with open(input_file, 'r', encoding='utf-8') as f:
-                # Read up to sample_chars characters
-                sample_data = f.read(sample_chars)
-                if len(sample_data) < sample_chars:
-                    logger.info(f"Read entire file ({len(sample_data)} characters)")
-                else:
-                    logger.info(f"Read sample ({sample_chars} characters)")
-        except Exception as e:
-            logger.error(f"Failed to read input file: {e}")
-            raise
-
-        # Fetch available schemas from Config API
-        try:
-            from trustgraph.api import Api
-            from trustgraph.api.types import ConfigKey
-
-            api = Api(api_url)
-            config_api = api.config()
-
-            # Get list of available schema keys
-            logger.info("Fetching available schemas from Config API...")
-            schema_keys = config_api.list("schema")
-            logger.info(f"Found {len(schema_keys)} schemas: {schema_keys}")
-
-            if not schema_keys:
-                logger.warning("No schemas found in configuration")
-                print("No schemas available in TrustGraph configuration")
-                return
-
-            # Fetch each schema definition
-            schemas = []
-            config_keys = [ConfigKey(type="schema", key=key) for key in schema_keys]
-            schema_values = config_api.get(config_keys)
-
-            for value in schema_values:
-                try:
-                    # Schema values are JSON strings, parse them
-                    schema_def = json.loads(value.value) if isinstance(value.value, str) else value.value
-                    schemas.append(schema_def)
-                    logger.debug(f"Loaded schema: {value.key}")
-                except json.JSONDecodeError as e:
-                    logger.warning(f"Failed to parse schema {value.key}: {e}")
-                    continue
-
-            logger.info(f"Successfully loaded {len(schemas)} schema definitions")
-
-            # Use TrustGraph prompt service for schema suggestion
-            flow_api = api.flow().id("default")
-
-            # Call schema-selection prompt with actual schemas and data sample
-            logger.info("Calling TrustGraph schema-selection prompt...")
-            response = flow_api.prompt(
-                id="schema-selection",
-                variables={
-                    "schemas": schemas,  # Array of actual schema definitions (note: plural 'schemas')
-                    "data": sample_data
-                }
-            )
-
-            print("Schema Suggestion Results:")
-            print("=" * 50)
-            print(response)
-            print("=" * 50)
-        except ImportError as e:
-            logger.error(f"Failed to import TrustGraph API: {e}")
-            raise
-        except Exception as e:
-            logger.error(f"Failed to call TrustGraph prompt service: {e}")
-            raise
+        # Use the helper function to discover schema (get raw response for display)
+        response = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger, return_raw_response=True)
+
+        if response:
+            # Debug: print response type and content
+            logger.debug(f"Response type: {type(response)}, content: {response}")
+            if isinstance(response, list) and len(response) == 1:
+                # Just print the schema name for clean output
+                print(f"Best matching schema: {response[0]}")
+            elif isinstance(response, list):
+                # Multiple schemas - show the list
+                print("Multiple schemas found:")
+                for schema in response:
+                    print(f"  - {schema}")
+            else:
+                # Show full response for debugging
+                print("Schema Discovery Results:")
+                print("=" * 50)
+                print(response)
+        else:
+            print("Could not determine the best matching schema for your data.")
+            print("Available schemas can be viewed using: tg-config-list schema")

     elif generate_descriptor:
         logger.info(f"Generating descriptor from {input_file}...")
         logger.info(f"Sample size: {sample_size} records")
         logger.info(f"Sample chars: {sample_chars} characters")

-        if schema_name:
+        # If no schema specified, discover it first
+        if not schema_name:
+            logger.info("No schema specified, auto-discovering...")
+            schema_name = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger)
+            if not schema_name:
+                print("Error: Could not determine schema automatically.")
+                print("Please specify a schema using --schema-name or run --discover-schema first.")
+                return
+            logger.info(f"Auto-selected schema: {schema_name}")
+        else:
             logger.info(f"Target schema: {schema_name}")

-        # Read sample data from input file
-        try:
-            with open(input_file, 'r', encoding='utf-8') as f:
-                # Read up to sample_chars characters
-                sample_data = f.read(sample_chars)
-                if len(sample_data) < sample_chars:
-                    logger.info(f"Read entire file ({len(sample_data)} characters)")
-                else:
-                    logger.info(f"Read sample ({sample_chars} characters)")
-        except Exception as e:
-            logger.error(f"Failed to read input file: {e}")
-            raise
-
-        # Fetch available schemas from Config API (same as suggest-schema mode)
-        try:
-            from trustgraph.api import Api
-            from trustgraph.api.types import ConfigKey
-
-            api = Api(api_url)
-            config_api = api.config()
-
-            # Get list of available schema keys
-            logger.info("Fetching available schemas from Config API...")
-            schema_keys = config_api.list("schema")
-            logger.info(f"Found {len(schema_keys)} schemas: {schema_keys}")
-
-            if not schema_keys:
-                logger.warning("No schemas found in configuration")
-                print("No schemas available in TrustGraph configuration")
-                return
-
-            # Fetch each schema definition
-            schemas = []
-            config_keys = [ConfigKey(type="schema", key=key) for key in schema_keys]
-            schema_values = config_api.get(config_keys)
-
-            for value in schema_values:
-                try:
-                    # Schema values are JSON strings, parse them
-                    schema_def = json.loads(value.value) if isinstance(value.value, str) else value.value
-                    schemas.append(schema_def)
-                    logger.debug(f"Loaded schema: {value.key}")
-                except json.JSONDecodeError as e:
-                    logger.warning(f"Failed to parse schema {value.key}: {e}")
-                    continue
-
-            logger.info(f"Successfully loaded {len(schemas)} schema definitions")
-
-            # Use TrustGraph prompt service for descriptor generation
-            flow_api = api.flow().id("default")
-
-            # Call diagnose-structured-data prompt with schemas and data sample
-            logger.info("Calling TrustGraph diagnose-structured-data prompt...")
-            response = flow_api.prompt(
-                id="diagnose-structured-data",
-                variables={
-                    "schemas": schemas,  # Array of actual schema definitions
-                    "sample": sample_data  # Note: using 'sample' instead of 'data'
-                }
-            )
-
+        # Generate descriptor using helper function
+        descriptor = _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, flow, logger)
+
+        if descriptor:
             # Output the generated descriptor
             if output_file:
                 try:
                     with open(output_file, 'w', encoding='utf-8') as f:
-                        if isinstance(response, str):
-                            f.write(response)
-                        else:
-                            f.write(json.dumps(response, indent=2))
+                        f.write(json.dumps(descriptor, indent=2))
                     print(f"Generated descriptor saved to: {output_file}")
                     logger.info(f"Descriptor saved to {output_file}")
                 except Exception as e:
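The auto path above hands the in-memory descriptor to the file-based pipeline via a temporary file. A self-contained sketch of that pattern, with process standing in for _process_data_pipeline:

import json
import os
import tempfile

def run_with_temp_descriptor(descriptor, process):
    # Persist the generated descriptor so file-oriented code can reuse it.
    tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
    json.dump(descriptor, tmp, indent=2)
    tmp.close()
    try:
        return process(tmp.name)
    finally:
        # Remove the temp file even if processing raised.
        try:
            os.unlink(tmp.name)
        except OSError:
            pass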
@@ -308,52 +227,120 @@ def load_structured_data(
             else:
                 print("Generated Descriptor:")
                 print("=" * 50)
-                if isinstance(response, str):
-                    print(response)
-                else:
-                    print(json.dumps(response, indent=2))
-        except ImportError as e:
-            logger.error(f"Failed to import TrustGraph API: {e}")
-            raise
-        except Exception as e:
-            logger.error(f"Failed to call TrustGraph prompt service: {e}")
-            raise
+                print(json.dumps(descriptor, indent=2))
+                print("=" * 50)
+                print("Use this descriptor with --parse-only to validate or without modes to import.")
+        else:
+            print("Error: Failed to generate descriptor.")
+            print("Check the logs for details or try --discover-schema to verify schema availability.")

     elif parse_only:
         if not descriptor_file:
             raise ValueError("--descriptor is required when using --parse-only")

         logger.info(f"Parsing {input_file} with descriptor {descriptor_file}...")

-        # Load descriptor configuration
+        # Use shared pipeline
+        output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, user, collection, sample_size)
+
+        # Output results
+        if output_file:
+            try:
+                with open(output_file, 'w', encoding='utf-8') as f:
+                    json.dump(output_records, f, indent=2)
+                print(f"Parsed data saved to: {output_file}")
+                logger.info(f"Parsed {len(output_records)} records saved to {output_file}")
+            except Exception as e:
+                logger.error(f"Failed to save parsed data to {output_file}: {e}")
+                print(f"Error saving parsed data: {e}")
+        else:
+            print("Parsed Data Preview:")
+            print("=" * 50)
+            # Show first few records for preview
+            preview_count = min(3, len(output_records))
+            for i in range(preview_count):
+                print(f"Record {i+1}:")
+                print(json.dumps(output_records[i], indent=2))
+                print()
+            if len(output_records) > preview_count:
+                print(f"... and {len(output_records) - preview_count} more records")
+            print(f"Total records processed: {len(output_records)}")
+
+        # Get summary info from descriptor
+        format_info = descriptor.get('format', {})
+        format_type = format_info.get('type', 'csv').lower()
+        schema_name = descriptor.get('output', {}).get('schema_name', 'default')
+        mappings = descriptor.get('mappings', [])
+
+        print(f"\nParsing Summary:")
+        print(f"- Input format: {format_type}")
+        print(f"- Records processed: {len(output_records)}")
+        print(f"- Target schema: {schema_name}")
+        print(f"- Field mappings: {len(mappings)}")
+
+    elif load:
+        if not descriptor_file:
+            raise ValueError("--descriptor is required when using --load")
+
+        logger.info(f"Loading {input_file} to TrustGraph using descriptor {descriptor_file}...")
+
+        # Use shared pipeline (no sample_size limit for full load)
+        output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, user, collection)
+
+        # Get batch size from descriptor or use default
+        batch_size = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)
+
+        # Send to TrustGraph
+        print(f"🚀 Importing {len(output_records)} records to TrustGraph...")
+        imported_count = _send_to_trustgraph(output_records, api_url, flow, batch_size)
+
+        # Get summary info from descriptor
+        format_info = descriptor.get('format', {})
+        format_type = format_info.get('type', 'csv').lower()
+        schema_name = descriptor.get('output', {}).get('schema_name', 'default')
+
+        print(f"\n🎉 Load Complete!")
+        print(f"- Input format: {format_type}")
+        print(f"- Target schema: {schema_name}")
+        print(f"- Records imported: {imported_count}")
+        print(f"- Flow used: {flow}")
+
+
+# Shared core functions
+
+def _load_descriptor(descriptor_file):
+    """Load and validate descriptor configuration"""
     try:
         with open(descriptor_file, 'r', encoding='utf-8') as f:
             descriptor = json.load(f)
         logger.info(f"Loaded descriptor configuration from {descriptor_file}")
+        return descriptor
     except Exception as e:
         logger.error(f"Failed to load descriptor file: {e}")
         raise

-    # Read input data based on format in descriptor
-    try:
-        format_info = descriptor.get('format', {})
-        format_type = format_info.get('type', 'csv').lower()
-        encoding = format_info.get('encoding', 'utf-8')
-
-        logger.info(f"Input format: {format_type}, encoding: {encoding}")
+
+def _read_input_data(input_file, format_info):
+    """Read raw data based on format type"""
+    try:
+        encoding = format_info.get('encoding', 'utf-8')
         with open(input_file, 'r', encoding=encoding) as f:
             raw_data = f.read()
         logger.info(f"Read {len(raw_data)} characters from input file")
+        return raw_data
     except Exception as e:
         logger.error(f"Failed to read input file: {e}")
         raise

-    # Parse data based on format type
+
+def _parse_data_by_format(raw_data, format_info, sample_size=None):
+    """Parse raw data into records based on format (CSV/JSON/XML)"""
+    format_type = format_info.get('type', 'csv').lower()
     parsed_records = []
+    logger.info(f"Input format: {format_type}")

     if format_type == 'csv':
         import csv
         from io import StringIO
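Taken together, the extracted helpers form a linear pipeline ending in a WebSocket import. A compressed walk-through under assumed inputs (file names and endpoint invented; signatures as shown in the diff):

# _process_data_pipeline returns (ExtractedObject list, parsed descriptor).
objects, descriptor = _process_data_pipeline(
    input_file="customers.csv",         # illustrative CSV input
    descriptor_file="descriptor.json",  # illustrative descriptor
    user="trustgraph",
    collection="default",
    sample_size=10,                     # None processes the whole file
)
batch = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)
count = _send_to_trustgraph(objects, "http://localhost:8088/", "default", batch)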
@@ -373,8 +360,8 @@ def load_structured_data(
         reader = csv.DictReader(StringIO(raw_data), fieldnames=fieldnames, delimiter=delimiter)

         for row_num, row in enumerate(reader, start=1):
-            # Respect sample_size limit
-            if row_num > sample_size:
+            # Respect sample_size limit if provided
+            if sample_size and row_num > sample_size:
                 logger.info(f"Reached sample size limit of {sample_size} records")
                 break
             parsed_records.append(row)
@@ -387,7 +374,7 @@ def load_structured_data(
         try:
             data = json.loads(raw_data)
             if isinstance(data, list):
-                parsed_records = data[:sample_size]  # Respect sample_size
+                parsed_records = data[:sample_size] if sample_size else data
             elif isinstance(data, dict):
                 # Handle single object or extract array from root path
                 root_path = format_info.get('options', {}).get('root_path')
@@ -398,7 +385,7 @@ def load_structured_data(
                     data = data.get(key, data)

                 if isinstance(data, list):
-                    parsed_records = data[:sample_size]
+                    parsed_records = data[:sample_size] if sample_size else data
                 else:
                     parsed_records = [data]
@@ -443,7 +430,7 @@ def load_structured_data(
         # Convert XML elements to dictionaries
         record_count = 0
         for element in records:
-            if record_count >= sample_size:
+            if sample_size and record_count >= sample_size:
                 logger.info(f"Reached sample size limit of {sample_size} records")
                 break
@@ -487,12 +474,14 @@ def load_structured_data(
         raise ValueError(f"Unsupported format type: {format_type}")

     logger.info(f"Successfully parsed {len(parsed_records)} records")
+    return parsed_records

-    # Apply basic transformations and validation (simplified version)
-    mappings = descriptor.get('mappings', [])
+
+def _apply_transformations(records, mappings):
+    """Apply descriptor mappings and transformations"""
     processed_records = []

-    for record_num, record in enumerate(parsed_records, start=1):
+    for record_num, record in enumerate(records, start=1):
         processed_record = {}

         for mapping in mappings:
@@ -533,7 +522,11 @@ def load_structured_data(
         processed_records.append(processed_record)

-    # Format output for TrustGraph ExtractedObject structure
+    return processed_records
+
+
+def _format_extracted_objects(processed_records, descriptor, user, collection):
+    """Convert to TrustGraph ExtractedObject format"""
     output_records = []
     schema_name = descriptor.get('output', {}).get('schema_name', 'default')
     confidence = descriptor.get('output', {}).get('options', {}).get('confidence', 0.9)
@@ -543,8 +536,8 @@ def load_structured_data(
             "metadata": {
                 "id": f"parsed-{len(output_records)+1}",
                 "metadata": [],  # Empty metadata triples
-                "user": "trustgraph",
-                "collection": "default"
+                "user": user,
+                "collection": collection
             },
             "schema_name": schema_name,
             "values": record,
@@ -553,52 +546,130 @@ def load_structured_data(
         }
         output_records.append(output_record)

-    # Output results
-    if output_file:
-        try:
-            with open(output_file, 'w', encoding='utf-8') as f:
-                json.dump(output_records, f, indent=2)
-            print(f"Parsed data saved to: {output_file}")
-            logger.info(f"Parsed {len(output_records)} records saved to {output_file}")
-        except Exception as e:
-            logger.error(f"Failed to save parsed data to {output_file}: {e}")
-            print(f"Error saving parsed data: {e}")
-    else:
-        print("Parsed Data Preview:")
-        print("=" * 50)
-        # Show first few records for preview
-        preview_count = min(3, len(output_records))
-        for i in range(preview_count):
-            print(f"Record {i+1}:")
-            print(json.dumps(output_records[i], indent=2))
-            print()
-        if len(output_records) > preview_count:
-            print(f"... and {len(output_records) - preview_count} more records")
-        print(f"Total records processed: {len(output_records)}")
-
-        print(f"\nParsing Summary:")
-        print(f"- Input format: {format_type}")
-        print(f"- Records processed: {len(output_records)}")
-        print(f"- Target schema: {schema_name}")
-        print(f"- Field mappings: {len(mappings)}")
+    return output_records
+
+
+def _process_data_pipeline(input_file, descriptor_file, user, collection, sample_size=None):
+    """Shared pipeline: load descriptor → read → parse → transform → format"""
+    # Load descriptor configuration
+    descriptor = _load_descriptor(descriptor_file)
+
+    # Read input data based on format in descriptor
+    format_info = descriptor.get('format', {})
+    raw_data = _read_input_data(input_file, format_info)
+
+    # Parse data based on format type
+    parsed_records = _parse_data_by_format(raw_data, format_info, sample_size)
+
+    # Apply transformations and validation
+    mappings = descriptor.get('mappings', [])
+    processed_records = _apply_transformations(parsed_records, mappings)
+
+    # Format output for TrustGraph ExtractedObject structure
+    output_records = _format_extracted_objects(processed_records, descriptor, user, collection)
+
+    return output_records, descriptor
+
+
+def _send_to_trustgraph(objects, api_url, flow, batch_size=1000):
+    """Send ExtractedObject records to TrustGraph using WebSocket"""
+    import json
+    import asyncio
+    from websockets.asyncio.client import connect
+
+    try:
+        # Construct objects import URL similar to load_knowledge pattern
+        if not api_url.endswith("/"):
+            api_url += "/"
+
+        # Convert HTTP URL to WebSocket URL if needed
+        ws_url = api_url.replace("http://", "ws://").replace("https://", "wss://")
+        objects_url = ws_url + f"api/v1/flow/{flow}/import/objects"
+
+        logger.info(f"Connecting to objects import endpoint: {objects_url}")
+
+        async def import_objects():
+            async with connect(objects_url) as ws:
+                imported_count = 0
+                for record in objects:
+                    try:
+                        # Send individual ExtractedObject records
+                        await ws.send(json.dumps(record))
+                        imported_count += 1
+
+                        if imported_count % 100 == 0:
+                            logger.info(f"Imported {imported_count}/{len(objects)} records...")
+                            print(f"✅ Imported {imported_count}/{len(objects)} records...")
+                    except Exception as e:
+                        logger.error(f"Failed to send record {imported_count + 1}: {e}")
+                        print(f"❌ Failed to send record {imported_count + 1}: {e}")
+
+                logger.info(f"Successfully imported {imported_count} records to TrustGraph")
+                return imported_count
+
+        # Run the async import
+        imported_count = asyncio.run(import_objects())
+
+        # Summary
+        total_records = len(objects)
+        failed_count = total_records - imported_count
+
+        print(f"\n📊 Import Summary:")
+        print(f"- Total records: {total_records}")
+        print(f"- Successfully imported: {imported_count}")
+        print(f"- Failed: {failed_count}")
+
+        if failed_count > 0:
+            print(f"⚠️ {failed_count} records failed to import. Check logs for details.")
+        else:
+            print("✅ All records imported successfully!")
+
+        return imported_count
+
+    except ImportError as e:
+        logger.error(f"Failed to import required modules: {e}")
+        print(f"Error: Required modules not available - {e}")
+        raise
+    except Exception as e:
+        logger.error(f"Failed to import data to TrustGraph: {e}")
+        print(f"Import failed: {e}")
+        raise


 # Helper functions for auto mode

-def _auto_discover_schema(api_url, input_file, sample_chars, logger):
-    """Auto-discover the best matching schema for the input data"""
+def _auto_discover_schema(api_url, input_file, sample_chars, flow, logger, return_raw_response=False):
+    """Auto-discover the best matching schema for the input data
+
+    Args:
+        api_url: TrustGraph API URL
+        input_file: Path to input data file
+        sample_chars: Number of characters to sample from file
+        flow: TrustGraph flow name to use for prompts
+        logger: Logger instance
+        return_raw_response: If True, return raw prompt response; if False, parse to extract schema name
+
+    Returns:
+        Schema name (str) if return_raw_response=False, or full response if True
+    """
     try:
         # Read sample data
         with open(input_file, 'r', encoding='utf-8') as f:
             sample_data = f.read(sample_chars)
+        logger.info(f"Read {len(sample_data)} characters for analysis")

         # Import API modules
         from trustgraph.api import Api
+        from trustgraph.api.types import ConfigKey

         api = Api(api_url)
         config_api = api.config()

         # Get available schemas
+        logger.info("Fetching available schemas from Config API...")
         schema_keys = config_api.list("schema")
+        logger.info(f"Found {len(schema_keys)} schemas: {schema_keys}")
+
         if not schema_keys:
             logger.error("No schemas available in TrustGraph configuration")
             return None
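A stripped-down sketch of the WebSocket import loop that _send_to_trustgraph wraps, using the endpoint path shown in the diff; progress reporting and per-record error handling are elided:

import asyncio
import json
from websockets.asyncio.client import connect

async def send_objects(objects, ws_base_url, flow):
    # One JSON message per ExtractedObject, as in _send_to_trustgraph.
    url = ws_base_url.rstrip("/") + f"/api/v1/flow/{flow}/import/objects"
    sent = 0
    async with connect(url) as ws:
        for record in objects:
            await ws.send(json.dumps(record))
            sent += 1
    return sent

# e.g. asyncio.run(send_objects(objs, "ws://localhost:8088", "default"))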
@@ -607,8 +678,12 @@ def _auto_discover_schema(api_url, input_file, sample_chars, logger):
         schemas = {}
         for key in schema_keys:
             try:
-                schema_def = config_api.get("schema", key)
-                schemas[key] = schema_def
+                config_key = ConfigKey(type="schema", key=key)
+                schema_values = config_api.get([config_key])
+                if schema_values:
+                    schema_def = json.loads(schema_values[0].value) if isinstance(schema_values[0].value, str) else schema_values[0].value
+                    schemas[key] = schema_def
+                    logger.debug(f"Loaded schema: {key}")
             except Exception as e:
                 logger.warning(f"Could not load schema {key}: {e}")
@@ -616,33 +691,32 @@ def _auto_discover_schema(api_url, input_file, sample_chars, logger):
             logger.error("No valid schemas could be loaded")
             return None

+        logger.info(f"Successfully loaded {len(schemas)} schema definitions")
+
         # Use prompt service for schema selection
-        flow_api = api.flow().id("default")
-        prompt_client = flow_api.prompt()
-
-        prompt = f"""Analyze this data sample and determine the best matching schema:
-
-DATA SAMPLE:
-{sample_data[:1000]}
-
-AVAILABLE SCHEMAS:
-{json.dumps(schemas, indent=2)}
-
-Return ONLY the schema name (key) that best matches this data. Consider:
-1. Field names and types in the data
-2. Data structure and format
-3. Domain and use case alignment
-
-Schema name:"""
-
-        response = prompt_client.schema_selection(
-            schemas=schemas,
-            sample=sample_data[:1000]
-        )
+        flow_api = api.flow().id(flow)
+
+        # Call schema-selection prompt with actual schemas and data sample
+        logger.info("Calling TrustGraph schema-selection prompt...")
+        response = flow_api.prompt(
+            id="schema-selection",
+            variables={
+                "schemas": list(schemas.values()),  # Array of actual schema definitions
+                "question": sample_data  # Truncate sample data
+            }
+        )
+
+        # Return raw response if requested (for discover_schema mode)
+        if return_raw_response:
+            return response

         # Extract schema name from response
         if isinstance(response, dict) and 'schema' in response:
             return response['schema']
+        elif isinstance(response, list) and len(response) > 0:
+            # If response is a list, use the first element
+            logger.info(f"Extracted schema '{response[0]}' from list response")
+            return response[0]
         elif isinstance(response, str):
             # Try to extract schema name from text response
             response_lower = response.lower().strip()
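The prompt service may answer with a dict, a list of candidates, or free text; a simplified mirror of the normalization _auto_discover_schema now performs:

def extract_schema_name(response):
    # Condensed version of the branching above; the real code also logs.
    if isinstance(response, dict) and 'schema' in response:
        return response['schema']
    if isinstance(response, list) and response:
        return response[0]  # first candidate wins
    if isinstance(response, str):
        return response.strip() or None
    return None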
@@ -669,7 +743,7 @@ Schema name:"""
     return None


-def _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, logger):
+def _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, flow, logger):
    """Auto-generate descriptor configuration for the discovered schema"""
     try:
         # Read sample data
@@ -678,20 +752,28 @@ def _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, logger):
         # Import API modules
         from trustgraph.api import Api
+        from trustgraph.api.types import ConfigKey

         api = Api(api_url)
         config_api = api.config()

         # Get schema definition
-        schema_def = config_api.get("schema", schema_name)
+        config_key = ConfigKey(type="schema", key=schema_name)
+        schema_values = config_api.get([config_key])
+        if not schema_values:
+            logger.error(f"Schema '{schema_name}' not found")
+            return None
+        schema_def = json.loads(schema_values[0].value) if isinstance(schema_values[0].value, str) else schema_values[0].value

         # Use prompt service for descriptor generation
-        flow_api = api.flow().id("default")
-        prompt_client = flow_api.prompt()
-
-        response = prompt_client.diagnose_structured_data(
-            sample=sample_data,
-            schema_name=schema_name,
-            schema=schema_def
-        )
+        flow_api = api.flow().id(flow)
+
+        # Call diagnose-structured-data prompt with schema and data sample
+        response = flow_api.prompt(
+            id="diagnose-structured-data",
+            variables={
+                "schemas": [schema_def],  # Array with single schema definition
+                "sample": sample_data  # Data sample for analysis
+            }
+        )

         if isinstance(response, str):
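Both helpers now fetch schema definitions the same way. A sketch of the corrected Config API usage, assuming config_api is the client obtained from Api(api_url).config():

import json
from trustgraph.api.types import ConfigKey

def fetch_schema(config_api, name):
    # config_api.get() takes a list of ConfigKey and returns matching values.
    values = config_api.get([ConfigKey(type="schema", key=name)])
    if not values:
        return None
    raw = values[0].value
    # Stored schema values are JSON strings; parse when necessary.
    return json.loads(raw) if isinstance(raw, str) else raw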
@@ -784,9 +866,9 @@ def main():
         formatter_class=argparse.RawDescriptionHelpFormatter,
         epilog="""
 Examples:
-  # Step 1: Analyze data and suggest matching schemas
-  %(prog)s --input customers.csv --suggest-schema
-  %(prog)s --input products.xml --suggest-schema --sample-chars 1000
+  # Step 1: Analyze data and discover matching schemas
+  %(prog)s --input customers.csv --discover-schema
+  %(prog)s --input products.xml --discover-schema --sample-chars 1000

   # Step 2: Generate descriptor configuration from data sample
   %(prog)s --input customers.csv --generate-descriptor --schema-name customer --output descriptor.json
@@ -813,7 +895,7 @@ Examples:
 Use Cases:
   --auto               : 🚀 FULLY AUTOMATIC: Discover schema + generate descriptor + import data
                          (zero manual configuration required!)
-  --suggest-schema     : Diagnose which TrustGraph schemas might match your data
+  --discover-schema    : Diagnose which TrustGraph schemas might match your data
                          (uses --sample-chars to limit data sent for analysis)
   --generate-descriptor: Create/review the structured data language configuration
                          (uses --sample-chars to limit data sent for analysis)
@@ -835,7 +917,19 @@ For more information on the descriptor format, see:
     parser.add_argument(
         '-f', '--flow',
         default='default',
-        help='TrustGraph flow name to use for import (default: default)'
+        help='TrustGraph flow name to use for prompts and import (default: default)'
+    )
+
+    parser.add_argument(
+        '--user',
+        default='trustgraph',
+        help='User name for metadata (default: trustgraph)'
+    )
+
+    parser.add_argument(
+        '--collection',
+        default='default',
+        help='Collection name for metadata (default: default)'
     )

     parser.add_argument(
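Illustrative invocations of the new flags, written in the epilog's own %(prog)s convention; the file names, user, and collection are invented:

  # Import with explicit metadata attribution
  %(prog)s --input customers.csv --load --descriptor descriptor.json --user alice --collection crm

  # Fully automatic pipeline on a non-default flow
  %(prog)s --input customers.csv --auto --flow my-flow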
@@ -852,9 +946,9 @@ For more information on the descriptor format, see:
     # Operation modes (mutually exclusive)
     mode_group = parser.add_mutually_exclusive_group()
     mode_group.add_argument(
-        '--suggest-schema',
+        '--discover-schema',
         action='store_true',
-        help='Analyze data sample and suggest matching TrustGraph schemas'
+        help='Analyze data sample and discover matching TrustGraph schemas'
     )
     mode_group.add_argument(
         '--generate-descriptor',
@@ -866,6 +960,11 @@ For more information on the descriptor format, see:
         action='store_true',
         help='Parse data using descriptor but don\'t import to TrustGraph'
     )
+    mode_group.add_argument(
+        '--load',
+        action='store_true',
+        help='Load data to TrustGraph using existing descriptor'
+    )
     mode_group.add_argument(
         '--auto',
         action='store_true',
@@ -888,7 +987,7 @@ For more information on the descriptor format, see:
         '--sample-chars',
         type=int,
         default=500,
-        help='Maximum characters to read for sampling (suggest-schema/generate-descriptor modes only, default: 500)'
+        help='Maximum characters to read for sampling (discover-schema/generate-descriptor modes only, default: 500)'
     )

     parser.add_argument(
@@ -939,21 +1038,26 @@ For more information on the descriptor format, see:
         print("Error: --descriptor is required when using --parse-only", file=sys.stderr)
         sys.exit(1)

+    if args.load and not args.descriptor:
+        print("Error: --descriptor is required when using --load", file=sys.stderr)
+        sys.exit(1)
+
     # Warn about irrelevant parameters
     if args.parse_only and args.sample_chars != 500:  # 500 is the default
         print("Warning: --sample-chars is ignored in --parse-only mode (entire file is processed)", file=sys.stderr)

-    if (args.suggest_schema or args.generate_descriptor) and args.sample_size != 100:  # 100 is default
+    if (args.discover_schema or args.generate_descriptor) and args.sample_size != 100:  # 100 is default
         print("Warning: --sample-size is ignored in analysis modes, use --sample-chars instead", file=sys.stderr)

     # Require explicit mode selection - no implicit behavior
-    if not any([args.suggest_schema, args.generate_descriptor, args.parse_only, args.auto]):
+    if not any([args.discover_schema, args.generate_descriptor, args.parse_only, args.load, args.auto]):
         print("Error: Must specify an operation mode", file=sys.stderr)
         print("Available modes:", file=sys.stderr)
         print("  --auto                 : Discover schema + generate descriptor + import", file=sys.stderr)
-        print("  --suggest-schema       : Analyze data and suggest schemas", file=sys.stderr)
+        print("  --discover-schema      : Analyze data and discover schemas", file=sys.stderr)
         print("  --generate-descriptor  : Generate descriptor from data", file=sys.stderr)
         print("  --parse-only           : Parse data without importing", file=sys.stderr)
+        print("  --load                 : Import data using existing descriptor", file=sys.stderr)
         sys.exit(1)

     try:
@@ -961,15 +1065,18 @@ For more information on the descriptor format, see:
             api_url=args.api_url,
             input_file=args.input,
             descriptor_file=args.descriptor,
-            suggest_schema=args.suggest_schema,
+            discover_schema=args.discover_schema,
             generate_descriptor=args.generate_descriptor,
             parse_only=args.parse_only,
+            load=args.load,
             auto=args.auto,
             output_file=args.output,
             sample_size=args.sample_size,
             sample_chars=args.sample_chars,
             schema_name=args.schema_name,
             flow=args.flow,
+            user=args.user,
+            collection=args.collection,
             dry_run=args.dry_run,
             verbose=args.verbose
         )