Mirror of https://github.com/trustgraph-ai/trustgraph.git (synced 2026-05-05 05:12:36 +02:00)
Fix structured-load auto mode (#501)

* Fix API incorrect usage
* Refactor to remove duplicate code
* Fix schema extraction in --auto
* Fix broken load functionality
parent 5537fac731
commit 415208e3e7

1 changed file with 584 additions and 477 deletions
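The "Fix API incorrect usage" item covers two patterns visible in the hunks below: schema definitions are now fetched through ConfigKey batch lookups instead of the removed two-argument config_api.get("schema", key) form, and the returned values are parsed from JSON strings before use. A minimal sketch of the corrected fetch pattern, assembled from calls that appear in this diff (the endpoint URL is a placeholder, not part of the commit):

    import json

    from trustgraph.api import Api
    from trustgraph.api.types import ConfigKey

    api = Api("http://localhost:8088/")  # placeholder endpoint for illustration
    config_api = api.config()

    # List schema keys, then fetch each definition as a batch of one;
    # values arrive as JSON strings and must be parsed before use.
    schemas = {}
    for key in config_api.list("schema"):
        values = config_api.get([ConfigKey(type="schema", key=key)])
        if values:
            raw = values[0].value
            schemas[key] = json.loads(raw) if isinstance(raw, str) else raw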
@@ -2,7 +2,7 @@
 Load structured data into TrustGraph using a descriptor configuration.
 
 This utility can:
-1. Analyze data samples to suggest appropriate schemas
+1. Analyze data samples to discover appropriate schemas
 2. Generate descriptor configurations from data samples
 3. Parse and transform data using descriptor configurations
 4. Import processed data into TrustGraph
@@ -28,15 +28,18 @@ def load_structured_data(
     api_url: str,
     input_file: str,
     descriptor_file: str = None,
-    suggest_schema: bool = False,
+    discover_schema: bool = False,
     generate_descriptor: bool = False,
     parse_only: bool = False,
+    load: bool = False,
     auto: bool = False,
     output_file: str = None,
     sample_size: int = 100,
     sample_chars: int = 500,
     schema_name: str = None,
     flow: str = 'default',
+    user: str = 'trustgraph',
+    collection: str = 'default',
     dry_run: bool = False,
     verbose: bool = False
 ):
@@ -47,14 +50,18 @@ def load_structured_data(
         api_url: TrustGraph API URL
         input_file: Path to input data file
         descriptor_file: Path to JSON descriptor configuration
-        suggest_schema: Analyze data and suggest matching schemas
+        discover_schema: Analyze data and discover matching schemas
         generate_descriptor: Generate descriptor from data sample
         parse_only: Parse data but don't import to TrustGraph
-        auto: Run full automatic pipeline (suggest schema + generate descriptor + import)
+        load: Load data to TrustGraph using existing descriptor
+        auto: Run full automatic pipeline (discover schema + generate descriptor + import)
         output_file: Path to write output (descriptor/parsed data)
         sample_size: Number of records to sample for analysis
         sample_chars: Maximum characters to read for sampling
         schema_name: Target schema name for generation
+        flow: TrustGraph flow name to use for prompts
+        user: User name for metadata (default: trustgraph)
+        collection: Collection name for metadata (default: default)
         dry_run: If True, validate but don't import data
         verbose: Enable verbose logging
     """
@@ -68,12 +75,12 @@ def load_structured_data(
         logger.info(f"🚀 Starting automatic pipeline for {input_file}...")
         logger.info("Step 1: Analyzing data to discover best matching schema...")
 
-        # Step 1: Auto-discover schema (reuse suggest_schema logic)
-        discovered_schema = _auto_discover_schema(api_url, input_file, sample_chars, logger)
+        # Step 1: Auto-discover schema (reuse discover_schema logic)
+        discovered_schema = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger)
         if not discovered_schema:
             logger.error("Failed to discover suitable schema automatically")
             print("❌ Could not automatically determine the best schema for your data.")
-            print("💡 Try running with --suggest-schema first to see available options.")
+            print("💡 Try running with --discover-schema first to see available options.")
             return None
 
         logger.info(f"✅ Discovered schema: {discovered_schema}")
@@ -81,7 +88,7 @@ def load_structured_data(
 
         # Step 2: Auto-generate descriptor
         logger.info("Step 2: Generating descriptor configuration...")
-        auto_descriptor = _auto_generate_descriptor(api_url, input_file, discovered_schema, sample_chars, logger)
+        auto_descriptor = _auto_generate_descriptor(api_url, input_file, discovered_schema, sample_chars, flow, logger)
         if not auto_descriptor:
             logger.error("Failed to generate descriptor automatically")
             print("❌ Could not automatically generate descriptor configuration.")
@@ -90,216 +97,128 @@ def load_structured_data(
         logger.info("✅ Generated descriptor configuration")
         print("📝 Generated descriptor configuration")
 
-        # Step 3: Parse and preview data
+        # Step 3: Parse and preview data using shared pipeline
         logger.info("Step 3: Parsing and validating data...")
-        preview_records = _auto_parse_preview(input_file, auto_descriptor, min(sample_size, 5), logger)
-        if preview_records is None:
-            logger.error("Failed to parse data with generated descriptor")
-            print("❌ Could not parse data with generated descriptor.")
-            return None
+        # Create temporary descriptor file for validation
+        import tempfile
+        temp_descriptor = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
+        json.dump(auto_descriptor, temp_descriptor, indent=2)
+        temp_descriptor.close()
 
+        try:
+            # Use shared pipeline for preview (small sample)
+            preview_objects, _ = _process_data_pipeline(input_file, temp_descriptor.name, user, collection, sample_size=5)
 
             # Show preview
             print("📊 Data Preview (first few records):")
             print("=" * 50)
-        for i, record in enumerate(preview_records[:3], 1):
-            print(f"Record {i}: {record}")
+            for i, obj in enumerate(preview_objects[:3], 1):
+                values = obj.get('values', {})
+                print(f"Record {i}: {values}")
             print("=" * 50)
 
             # Step 4: Import (unless dry_run)
             if dry_run:
                 logger.info("✅ Dry run complete - data is ready for import")
                 print("✅ Dry run successful! Data is ready for import.")
-                print(f"💡 Run without --dry-run to import {len(preview_records)} records to TrustGraph.")
+                print(f"💡 Run without --dry-run to import data to TrustGraph.")
                 return None
             else:
                 logger.info("Step 4: Importing data to TrustGraph...")
                 print("🚀 Importing data to TrustGraph...")
 
-                # Recursively call ourselves with the auto-generated descriptor
-                # This reuses all the existing import logic
-                import tempfile
-                import os
+                # Use shared pipeline for full processing (no sample limit)
+                output_objects, descriptor = _process_data_pipeline(input_file, temp_descriptor.name, user, collection)
 
-                # Save auto-generated descriptor to temp file
-                temp_descriptor = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
-                json.dump(auto_descriptor, temp_descriptor, indent=2)
-                temp_descriptor.close()
+                # Get batch size from descriptor
+                batch_size = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)
 
-                try:
-                    # Call the full pipeline mode with our auto-generated descriptor
-                    result = load_structured_data(
-                        api_url=api_url,
-                        input_file=input_file,
-                        descriptor_file=temp_descriptor.name,
-                        flow=flow,
-                        dry_run=False,  # We already handled dry_run above
-                        verbose=verbose
-                    )
+                # Send to TrustGraph using shared function
+                imported_count = _send_to_trustgraph(output_objects, api_url, flow, batch_size)
+
+                # Summary
+                format_info = descriptor.get('format', {})
+                format_type = format_info.get('type', 'csv').lower()
+                schema_name = descriptor.get('output', {}).get('schema_name', 'default')
+
+                print(f"\n🎉 Auto-Import Complete!")
+                print(f"- Input format: {format_type}")
+                print(f"- Target schema: {schema_name}")
+                print(f"- Records imported: {imported_count}")
+                print(f"- Flow used: {flow}")
 
-                print("✅ Auto-import completed successfully!")
                 logger.info("Auto-import pipeline completed successfully")
-                return result
+                return imported_count
 
+        except Exception as e:
+            logger.error(f"Auto-import failed: {e}")
+            print(f"❌ Auto-import failed: {e}")
+            return None
 
         finally:
             # Clean up temp descriptor file
             try:
+                import os
                 os.unlink(temp_descriptor.name)
             except:
                 pass
 
-    elif suggest_schema:
-        logger.info(f"Analyzing {input_file} to suggest schemas...")
+    elif discover_schema:
+        logger.info(f"Analyzing {input_file} to discover schemas...")
         logger.info(f"Sample size: {sample_size} records")
         logger.info(f"Sample chars: {sample_chars} characters")
 
-        # Read sample data from input file
-        try:
-            with open(input_file, 'r', encoding='utf-8') as f:
-                # Read up to sample_chars characters
-                sample_data = f.read(sample_chars)
-                if len(sample_data) < sample_chars:
-                    logger.info(f"Read entire file ({len(sample_data)} characters)")
+        # Use the helper function to discover schema (get raw response for display)
+        response = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger, return_raw_response=True)
+
+        if response:
+            # Debug: print response type and content
+            logger.debug(f"Response type: {type(response)}, content: {response}")
+            if isinstance(response, list) and len(response) == 1:
+                # Just print the schema name for clean output
+                print(f"Best matching schema: {response[0]}")
+            elif isinstance(response, list):
+                # Multiple schemas - show the list
+                print("Multiple schemas found:")
+                for schema in response:
+                    print(f" - {schema}")
             else:
-                    logger.info(f"Read sample ({sample_chars} characters)")
-        except Exception as e:
-            logger.error(f"Failed to read input file: {e}")
-            raise
-
-        # Fetch available schemas from Config API
-        try:
-            from trustgraph.api import Api
-            from trustgraph.api.types import ConfigKey
-
-            api = Api(api_url)
-            config_api = api.config()
-
-            # Get list of available schema keys
-            logger.info("Fetching available schemas from Config API...")
-            schema_keys = config_api.list("schema")
-            logger.info(f"Found {len(schema_keys)} schemas: {schema_keys}")
-
-            if not schema_keys:
-                logger.warning("No schemas found in configuration")
-                print("No schemas available in TrustGraph configuration")
-                return
-
-            # Fetch each schema definition
-            schemas = []
-            config_keys = [ConfigKey(type="schema", key=key) for key in schema_keys]
-            schema_values = config_api.get(config_keys)
-
-            for value in schema_values:
-                try:
-                    # Schema values are JSON strings, parse them
-                    schema_def = json.loads(value.value) if isinstance(value.value, str) else value.value
-                    schemas.append(schema_def)
-                    logger.debug(f"Loaded schema: {value.key}")
-                except json.JSONDecodeError as e:
-                    logger.warning(f"Failed to parse schema {value.key}: {e}")
-                    continue
-
-            logger.info(f"Successfully loaded {len(schemas)} schema definitions")
-
-            # Use TrustGraph prompt service for schema suggestion
-            flow_api = api.flow().id("default")
-
-            # Call schema-selection prompt with actual schemas and data sample
-            logger.info("Calling TrustGraph schema-selection prompt...")
-            response = flow_api.prompt(
-                id="schema-selection",
-                variables={
-                    "schemas": schemas,  # Array of actual schema definitions (note: plural 'schemas')
-                    "data": sample_data
-                }
-            )
-
-            print("Schema Suggestion Results:")
+                # Show full response for debugging
+                print("Schema Discovery Results:")
                 print("=" * 50)
                 print(response)
-        except ImportError as e:
-            logger.error(f"Failed to import TrustGraph API: {e}")
-            raise
-        except Exception as e:
-            logger.error(f"Failed to call TrustGraph prompt service: {e}")
-            raise
+                print("=" * 50)
+        else:
+            print("Could not determine the best matching schema for your data.")
+            print("Available schemas can be viewed using: tg-config-list schema")
 
     elif generate_descriptor:
         logger.info(f"Generating descriptor from {input_file}...")
         logger.info(f"Sample size: {sample_size} records")
         logger.info(f"Sample chars: {sample_chars} characters")
-        if schema_name:
+
+        # If no schema specified, discover it first
+        if not schema_name:
+            logger.info("No schema specified, auto-discovering...")
+            schema_name = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger)
+            if not schema_name:
+                print("Error: Could not determine schema automatically.")
+                print("Please specify a schema using --schema-name or run --discover-schema first.")
+                return
+            logger.info(f"Auto-selected schema: {schema_name}")
+        else:
             logger.info(f"Target schema: {schema_name}")
 
-        # Read sample data from input file
-        try:
-            with open(input_file, 'r', encoding='utf-8') as f:
-                # Read up to sample_chars characters
-                sample_data = f.read(sample_chars)
-                if len(sample_data) < sample_chars:
-                    logger.info(f"Read entire file ({len(sample_data)} characters)")
-                else:
-                    logger.info(f"Read sample ({sample_chars} characters)")
-        except Exception as e:
-            logger.error(f"Failed to read input file: {e}")
-            raise
-
-        # Fetch available schemas from Config API (same as suggest-schema mode)
-        try:
-            from trustgraph.api import Api
-            from trustgraph.api.types import ConfigKey
-
-            api = Api(api_url)
-            config_api = api.config()
-
-            # Get list of available schema keys
-            logger.info("Fetching available schemas from Config API...")
-            schema_keys = config_api.list("schema")
-            logger.info(f"Found {len(schema_keys)} schemas: {schema_keys}")
-
-            if not schema_keys:
-                logger.warning("No schemas found in configuration")
-                print("No schemas available in TrustGraph configuration")
-                return
-
-            # Fetch each schema definition
-            schemas = []
-            config_keys = [ConfigKey(type="schema", key=key) for key in schema_keys]
-            schema_values = config_api.get(config_keys)
-
-            for value in schema_values:
-                try:
-                    # Schema values are JSON strings, parse them
-                    schema_def = json.loads(value.value) if isinstance(value.value, str) else value.value
-                    schemas.append(schema_def)
-                    logger.debug(f"Loaded schema: {value.key}")
-                except json.JSONDecodeError as e:
-                    logger.warning(f"Failed to parse schema {value.key}: {e}")
-                    continue
-
-            logger.info(f"Successfully loaded {len(schemas)} schema definitions")
-
-            # Use TrustGraph prompt service for descriptor generation
-            flow_api = api.flow().id("default")
-
-            # Call diagnose-structured-data prompt with schemas and data sample
-            logger.info("Calling TrustGraph diagnose-structured-data prompt...")
-            response = flow_api.prompt(
-                id="diagnose-structured-data",
-                variables={
-                    "schemas": schemas,  # Array of actual schema definitions
-                    "sample": sample_data  # Note: using 'sample' instead of 'data'
-                }
-            )
-
+        # Generate descriptor using helper function
+        descriptor = _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, flow, logger)
+
+        if descriptor:
             # Output the generated descriptor
             if output_file:
                 try:
                     with open(output_file, 'w', encoding='utf-8') as f:
-                        if isinstance(response, str):
-                            f.write(response)
-                        else:
-                            f.write(json.dumps(response, indent=2))
+                        f.write(json.dumps(descriptor, indent=2))
                     print(f"Generated descriptor saved to: {output_file}")
                     logger.info(f"Descriptor saved to {output_file}")
                 except Exception as e:
@@ -308,52 +227,120 @@ def load_structured_data(
             else:
                 print("Generated Descriptor:")
                 print("=" * 50)
-                if isinstance(response, str):
-                    print(response)
+                print(json.dumps(descriptor, indent=2))
+                print("=" * 50)
+                print("Use this descriptor with --parse-only to validate or without modes to import.")
         else:
-            print(json.dumps(response, indent=2))
-        except ImportError as e:
-            logger.error(f"Failed to import TrustGraph API: {e}")
-            raise
-        except Exception as e:
-            logger.error(f"Failed to call TrustGraph prompt service: {e}")
-            raise
+            print("Error: Failed to generate descriptor.")
+            print("Check the logs for details or try --discover-schema to verify schema availability.")
 
     elif parse_only:
         if not descriptor_file:
             raise ValueError("--descriptor is required when using --parse-only")
         logger.info(f"Parsing {input_file} with descriptor {descriptor_file}...")
 
-        # Load descriptor configuration
+        # Use shared pipeline
+        output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, user, collection, sample_size)
+
+        # Output results
+        if output_file:
+            try:
+                with open(output_file, 'w', encoding='utf-8') as f:
+                    json.dump(output_records, f, indent=2)
+                print(f"Parsed data saved to: {output_file}")
+                logger.info(f"Parsed {len(output_records)} records saved to {output_file}")
+            except Exception as e:
+                logger.error(f"Failed to save parsed data to {output_file}: {e}")
+                print(f"Error saving parsed data: {e}")
+        else:
+            print("Parsed Data Preview:")
+            print("=" * 50)
+            # Show first few records for preview
+            preview_count = min(3, len(output_records))
+            for i in range(preview_count):
+                print(f"Record {i+1}:")
+                print(json.dumps(output_records[i], indent=2))
+                print()
+
+            if len(output_records) > preview_count:
+                print(f"... and {len(output_records) - preview_count} more records")
+            print(f"Total records processed: {len(output_records)}")
+
+        # Get summary info from descriptor
+        format_info = descriptor.get('format', {})
+        format_type = format_info.get('type', 'csv').lower()
+        schema_name = descriptor.get('output', {}).get('schema_name', 'default')
+        mappings = descriptor.get('mappings', [])
+
+        print(f"\nParsing Summary:")
+        print(f"- Input format: {format_type}")
+        print(f"- Records processed: {len(output_records)}")
+        print(f"- Target schema: {schema_name}")
+        print(f"- Field mappings: {len(mappings)}")
+
+    elif load:
+        if not descriptor_file:
+            raise ValueError("--descriptor is required when using --load")
+        logger.info(f"Loading {input_file} to TrustGraph using descriptor {descriptor_file}...")
+
+        # Use shared pipeline (no sample_size limit for full load)
+        output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, user, collection)
+
+        # Get batch size from descriptor or use default
+        batch_size = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)
+
+        # Send to TrustGraph
+        print(f"🚀 Importing {len(output_records)} records to TrustGraph...")
+        imported_count = _send_to_trustgraph(output_records, api_url, flow, batch_size)
+
+        # Get summary info from descriptor
+        format_info = descriptor.get('format', {})
+        format_type = format_info.get('type', 'csv').lower()
+        schema_name = descriptor.get('output', {}).get('schema_name', 'default')
+
+        print(f"\n🎉 Load Complete!")
+        print(f"- Input format: {format_type}")
+        print(f"- Target schema: {schema_name}")
+        print(f"- Records imported: {imported_count}")
+        print(f"- Flow used: {flow}")
+
+
+# Shared core functions
+def _load_descriptor(descriptor_file):
+    """Load and validate descriptor configuration"""
     try:
         with open(descriptor_file, 'r', encoding='utf-8') as f:
             descriptor = json.load(f)
         logger.info(f"Loaded descriptor configuration from {descriptor_file}")
+        return descriptor
     except Exception as e:
         logger.error(f"Failed to load descriptor file: {e}")
         raise
 
-    # Read input data based on format in descriptor
-    try:
-        format_info = descriptor.get('format', {})
-        format_type = format_info.get('type', 'csv').lower()
-        encoding = format_info.get('encoding', 'utf-8')
-
-        logger.info(f"Input format: {format_type}, encoding: {encoding}")
+
+def _read_input_data(input_file, format_info):
+    """Read raw data based on format type"""
+    try:
+        encoding = format_info.get('encoding', 'utf-8')
+
         with open(input_file, 'r', encoding=encoding) as f:
             raw_data = f.read()
 
         logger.info(f"Read {len(raw_data)} characters from input file")
+        return raw_data
 
     except Exception as e:
         logger.error(f"Failed to read input file: {e}")
         raise
 
-    # Parse data based on format type
+
+def _parse_data_by_format(raw_data, format_info, sample_size=None):
+    """Parse raw data into records based on format (CSV/JSON/XML)"""
+    format_type = format_info.get('type', 'csv').lower()
     parsed_records = []
 
+    logger.info(f"Input format: {format_type}")
+
     if format_type == 'csv':
         import csv
         from io import StringIO
@@ -373,8 +360,8 @@ def load_structured_data(
         reader = csv.DictReader(StringIO(raw_data), fieldnames=fieldnames, delimiter=delimiter)
 
         for row_num, row in enumerate(reader, start=1):
-            # Respect sample_size limit
-            if row_num > sample_size:
+            # Respect sample_size limit if provided
+            if sample_size and row_num > sample_size:
                 logger.info(f"Reached sample size limit of {sample_size} records")
                 break
             parsed_records.append(row)
@@ -387,7 +374,7 @@ def load_structured_data(
         try:
             data = json.loads(raw_data)
             if isinstance(data, list):
-                parsed_records = data[:sample_size]  # Respect sample_size
+                parsed_records = data[:sample_size] if sample_size else data
             elif isinstance(data, dict):
                 # Handle single object or extract array from root path
                 root_path = format_info.get('options', {}).get('root_path')
@@ -398,7 +385,7 @@ def load_structured_data(
                         data = data.get(key, data)
 
                 if isinstance(data, list):
-                    parsed_records = data[:sample_size]
+                    parsed_records = data[:sample_size] if sample_size else data
                 else:
                     parsed_records = [data]
 
@@ -443,7 +430,7 @@ def load_structured_data(
         # Convert XML elements to dictionaries
         record_count = 0
         for element in records:
-            if record_count >= sample_size:
+            if sample_size and record_count >= sample_size:
                 logger.info(f"Reached sample size limit of {sample_size} records")
                 break
 
@@ -487,12 +474,14 @@ def load_structured_data(
         raise ValueError(f"Unsupported format type: {format_type}")
 
     logger.info(f"Successfully parsed {len(parsed_records)} records")
+    return parsed_records
 
-    # Apply basic transformations and validation (simplified version)
-    mappings = descriptor.get('mappings', [])
+
+def _apply_transformations(records, mappings):
+    """Apply descriptor mappings and transformations"""
     processed_records = []
 
-    for record_num, record in enumerate(parsed_records, start=1):
+    for record_num, record in enumerate(records, start=1):
         processed_record = {}
 
         for mapping in mappings:
@@ -533,7 +522,11 @@ def load_structured_data(
 
         processed_records.append(processed_record)
 
-    # Format output for TrustGraph ExtractedObject structure
+    return processed_records
+
+
+def _format_extracted_objects(processed_records, descriptor, user, collection):
+    """Convert to TrustGraph ExtractedObject format"""
     output_records = []
     schema_name = descriptor.get('output', {}).get('schema_name', 'default')
     confidence = descriptor.get('output', {}).get('options', {}).get('confidence', 0.9)
@@ -543,8 +536,8 @@ def load_structured_data(
             "metadata": {
                 "id": f"parsed-{len(output_records)+1}",
                 "metadata": [],  # Empty metadata triples
-                "user": "trustgraph",
-                "collection": "default"
+                "user": user,
+                "collection": collection
             },
             "schema_name": schema_name,
             "values": record,
@@ -553,52 +546,130 @@ def load_structured_data(
         }
         output_records.append(output_record)
 
-    # Output results
-    if output_file:
-        try:
-            with open(output_file, 'w', encoding='utf-8') as f:
-                json.dump(output_records, f, indent=2)
-            print(f"Parsed data saved to: {output_file}")
-            logger.info(f"Parsed {len(output_records)} records saved to {output_file}")
-        except Exception as e:
-            logger.error(f"Failed to save parsed data to {output_file}: {e}")
-            print(f"Error saving parsed data: {e}")
-    else:
-        print("Parsed Data Preview:")
-        print("=" * 50)
-        # Show first few records for preview
-        preview_count = min(3, len(output_records))
-        for i in range(preview_count):
-            print(f"Record {i+1}:")
-            print(json.dumps(output_records[i], indent=2))
-            print()
-
-        if len(output_records) > preview_count:
-            print(f"... and {len(output_records) - preview_count} more records")
-        print(f"Total records processed: {len(output_records)}")
-
-        print(f"\nParsing Summary:")
-        print(f"- Input format: {format_type}")
-        print(f"- Records processed: {len(output_records)}")
-        print(f"- Target schema: {schema_name}")
-        print(f"- Field mappings: {len(mappings)}")
+    return output_records
+
+
+def _process_data_pipeline(input_file, descriptor_file, user, collection, sample_size=None):
+    """Shared pipeline: load descriptor → read → parse → transform → format"""
+    # Load descriptor configuration
+    descriptor = _load_descriptor(descriptor_file)
+
+    # Read input data based on format in descriptor
+    format_info = descriptor.get('format', {})
+    raw_data = _read_input_data(input_file, format_info)
+
+    # Parse data based on format type
+    parsed_records = _parse_data_by_format(raw_data, format_info, sample_size)
+
+    # Apply transformations and validation
+    mappings = descriptor.get('mappings', [])
+    processed_records = _apply_transformations(parsed_records, mappings)
+
+    # Format output for TrustGraph ExtractedObject structure
+    output_records = _format_extracted_objects(processed_records, descriptor, user, collection)
+
+    return output_records, descriptor
+
+
+def _send_to_trustgraph(objects, api_url, flow, batch_size=1000):
+    """Send ExtractedObject records to TrustGraph using WebSocket"""
+    import json
+    import asyncio
+    from websockets.asyncio.client import connect
+
+    try:
+        # Construct objects import URL similar to load_knowledge pattern
+        if not api_url.endswith("/"):
+            api_url += "/"
+
+        # Convert HTTP URL to WebSocket URL if needed
+        ws_url = api_url.replace("http://", "ws://").replace("https://", "wss://")
+        objects_url = ws_url + f"api/v1/flow/{flow}/import/objects"
+
+        logger.info(f"Connecting to objects import endpoint: {objects_url}")
+
+        async def import_objects():
+            async with connect(objects_url) as ws:
+                imported_count = 0
+
+                for record in objects:
+                    try:
+                        # Send individual ExtractedObject records
+                        await ws.send(json.dumps(record))
+                        imported_count += 1
+
+                        if imported_count % 100 == 0:
+                            logger.info(f"Imported {imported_count}/{len(objects)} records...")
+                            print(f"✅ Imported {imported_count}/{len(objects)} records...")
+
+                    except Exception as e:
+                        logger.error(f"Failed to send record {imported_count + 1}: {e}")
+                        print(f"❌ Failed to send record {imported_count + 1}: {e}")
+
+                logger.info(f"Successfully imported {imported_count} records to TrustGraph")
+                return imported_count
+
+        # Run the async import
+        imported_count = asyncio.run(import_objects())
+
+        # Summary
+        total_records = len(objects)
+        failed_count = total_records - imported_count
+
+        print(f"\n📊 Import Summary:")
+        print(f"- Total records: {total_records}")
+        print(f"- Successfully imported: {imported_count}")
+        print(f"- Failed: {failed_count}")
+
+        if failed_count > 0:
+            print(f"⚠️ {failed_count} records failed to import. Check logs for details.")
+        else:
+            print("✅ All records imported successfully!")
+
+        return imported_count
+
+    except ImportError as e:
+        logger.error(f"Failed to import required modules: {e}")
+        print(f"Error: Required modules not available - {e}")
+        raise
+    except Exception as e:
+        logger.error(f"Failed to import data to TrustGraph: {e}")
+        print(f"Import failed: {e}")
+        raise
 
 
 # Helper functions for auto mode
-def _auto_discover_schema(api_url, input_file, sample_chars, logger):
-    """Auto-discover the best matching schema for the input data"""
+def _auto_discover_schema(api_url, input_file, sample_chars, flow, logger, return_raw_response=False):
+    """Auto-discover the best matching schema for the input data
+
+    Args:
+        api_url: TrustGraph API URL
+        input_file: Path to input data file
+        sample_chars: Number of characters to sample from file
+        flow: TrustGraph flow name to use for prompts
+        logger: Logger instance
+        return_raw_response: If True, return raw prompt response; if False, parse to extract schema name
+
+    Returns:
+        Schema name (str) if return_raw_response=False, or full response if True
+    """
     try:
         # Read sample data
         with open(input_file, 'r', encoding='utf-8') as f:
             sample_data = f.read(sample_chars)
+        logger.info(f"Read {len(sample_data)} characters for analysis")
 
         # Import API modules
         from trustgraph.api import Api
+        from trustgraph.api.types import ConfigKey
         api = Api(api_url)
         config_api = api.config()
 
         # Get available schemas
+        logger.info("Fetching available schemas from Config API...")
        schema_keys = config_api.list("schema")
+        logger.info(f"Found {len(schema_keys)} schemas: {schema_keys}")
 
         if not schema_keys:
             logger.error("No schemas available in TrustGraph configuration")
             return None
@@ -607,8 +678,12 @@ def _auto_discover_schema(api_url, input_file, sample_chars, logger):
         schemas = {}
         for key in schema_keys:
             try:
-                schema_def = config_api.get("schema", key)
-                schemas[key] = schema_def
+                config_key = ConfigKey(type="schema", key=key)
+                schema_values = config_api.get([config_key])
+                if schema_values:
+                    schema_def = json.loads(schema_values[0].value) if isinstance(schema_values[0].value, str) else schema_values[0].value
+                    schemas[key] = schema_def
+                    logger.debug(f"Loaded schema: {key}")
             except Exception as e:
                 logger.warning(f"Could not load schema {key}: {e}")
@@ -616,33 +691,32 @@ def _auto_discover_schema(api_url, input_file, sample_chars, logger):
             logger.error("No valid schemas could be loaded")
             return None
 
+        logger.info(f"Successfully loaded {len(schemas)} schema definitions")
+
         # Use prompt service for schema selection
-        flow_api = api.flow().id("default")
-        prompt_client = flow_api.prompt()
-
-        prompt = f"""Analyze this data sample and determine the best matching schema:
-
-DATA SAMPLE:
-{sample_data[:1000]}
-
-AVAILABLE SCHEMAS:
-{json.dumps(schemas, indent=2)}
-
-Return ONLY the schema name (key) that best matches this data. Consider:
-1. Field names and types in the data
-2. Data structure and format
-3. Domain and use case alignment
-
-Schema name:"""
-
-        response = prompt_client.schema_selection(
-            schemas=schemas,
-            sample=sample_data[:1000]
+        flow_api = api.flow().id(flow)
+
+        # Call schema-selection prompt with actual schemas and data sample
+        logger.info("Calling TrustGraph schema-selection prompt...")
+        response = flow_api.prompt(
+            id="schema-selection",
+            variables={
+                "schemas": list(schemas.values()),  # Array of actual schema definitions
+                "question": sample_data  # Truncate sample data
+            }
         )
 
+        # Return raw response if requested (for discover_schema mode)
+        if return_raw_response:
+            return response
+
         # Extract schema name from response
         if isinstance(response, dict) and 'schema' in response:
             return response['schema']
+        elif isinstance(response, list) and len(response) > 0:
+            # If response is a list, use the first element
+            logger.info(f"Extracted schema '{response[0]}' from list response")
+            return response[0]
         elif isinstance(response, str):
             # Try to extract schema name from text response
             response_lower = response.lower().strip()
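The hunk above (and the matching one in _auto_generate_descriptor below) also drops the removed prompt_client.schema_selection(...) helper in favour of the generic flow-scoped prompt call. A condensed sketch of the pattern the diff settles on; the URL, flow id, and variable values are placeholders, not part of the commit:

    from trustgraph.api import Api

    api = Api("http://localhost:8088/")  # placeholder endpoint
    flow_api = api.flow().id("default")  # flow id now comes from --flow

    # Prompts are invoked by id with a variables dict, rather than through
    # per-prompt helper methods on a prompt client.
    response = flow_api.prompt(
        id="schema-selection",
        variables={
            "schemas": [],                 # schema definitions fetched earlier
            "question": "id,name\n1,Ada",  # sampled input data (placeholder)
        },
    )
    print(response)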
@@ -669,7 +743,7 @@ Schema name:"""
         return None
 
 
-def _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, logger):
+def _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, flow, logger):
     """Auto-generate descriptor configuration for the discovered schema"""
     try:
         # Read sample data
@@ -678,20 +752,28 @@ def _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, logger):
 
         # Import API modules
         from trustgraph.api import Api
+        from trustgraph.api.types import ConfigKey
         api = Api(api_url)
         config_api = api.config()
 
         # Get schema definition
-        schema_def = config_api.get("schema", schema_name)
+        config_key = ConfigKey(type="schema", key=schema_name)
+        schema_values = config_api.get([config_key])
+        if not schema_values:
+            logger.error(f"Schema '{schema_name}' not found")
+            return None
+        schema_def = json.loads(schema_values[0].value) if isinstance(schema_values[0].value, str) else schema_values[0].value
 
         # Use prompt service for descriptor generation
-        flow_api = api.flow().id("default")
-        prompt_client = flow_api.prompt()
-
-        response = prompt_client.diagnose_structured_data(
-            sample=sample_data,
-            schema_name=schema_name,
-            schema=schema_def
+        flow_api = api.flow().id(flow)
+
+        # Call diagnose-structured-data prompt with schema and data sample
+        response = flow_api.prompt(
+            id="diagnose-structured-data",
+            variables={
+                "schemas": [schema_def],  # Array with single schema definition
+                "sample": sample_data  # Data sample for analysis
+            }
         )
 
         if isinstance(response, str):
@@ -784,9 +866,9 @@ def main():
         formatter_class=argparse.RawDescriptionHelpFormatter,
         epilog="""
 Examples:
-  # Step 1: Analyze data and suggest matching schemas
-  %(prog)s --input customers.csv --suggest-schema
-  %(prog)s --input products.xml --suggest-schema --sample-chars 1000
+  # Step 1: Analyze data and discover matching schemas
+  %(prog)s --input customers.csv --discover-schema
+  %(prog)s --input products.xml --discover-schema --sample-chars 1000
 
   # Step 2: Generate descriptor configuration from data sample
   %(prog)s --input customers.csv --generate-descriptor --schema-name customer --output descriptor.json
@@ -813,7 +895,7 @@ Examples:
 Use Cases:
   --auto               : 🚀 FULLY AUTOMATIC: Discover schema + generate descriptor + import data
                          (zero manual configuration required!)
-  --suggest-schema     : Diagnose which TrustGraph schemas might match your data
+  --discover-schema    : Diagnose which TrustGraph schemas might match your data
                          (uses --sample-chars to limit data sent for analysis)
   --generate-descriptor: Create/review the structured data language configuration
                          (uses --sample-chars to limit data sent for analysis)
@@ -835,7 +917,19 @@ For more information on the descriptor format, see:
     parser.add_argument(
         '-f', '--flow',
         default='default',
-        help='TrustGraph flow name to use for import (default: default)'
+        help='TrustGraph flow name to use for prompts and import (default: default)'
+    )
+
+    parser.add_argument(
+        '--user',
+        default='trustgraph',
+        help='User name for metadata (default: trustgraph)'
+    )
+
+    parser.add_argument(
+        '--collection',
+        default='default',
+        help='Collection name for metadata (default: default)'
     )
 
     parser.add_argument(
@@ -852,9 +946,9 @@ For more information on the descriptor format, see:
     # Operation modes (mutually exclusive)
     mode_group = parser.add_mutually_exclusive_group()
     mode_group.add_argument(
-        '--suggest-schema',
+        '--discover-schema',
         action='store_true',
-        help='Analyze data sample and suggest matching TrustGraph schemas'
+        help='Analyze data sample and discover matching TrustGraph schemas'
     )
     mode_group.add_argument(
         '--generate-descriptor',
@@ -866,6 +960,11 @@ For more information on the descriptor format, see:
         action='store_true',
         help='Parse data using descriptor but don\'t import to TrustGraph'
     )
+    mode_group.add_argument(
+        '--load',
+        action='store_true',
+        help='Load data to TrustGraph using existing descriptor'
+    )
     mode_group.add_argument(
         '--auto',
         action='store_true',
@@ -888,7 +987,7 @@ For more information on the descriptor format, see:
         '--sample-chars',
         type=int,
         default=500,
-        help='Maximum characters to read for sampling (suggest-schema/generate-descriptor modes only, default: 500)'
+        help='Maximum characters to read for sampling (discover-schema/generate-descriptor modes only, default: 500)'
     )
 
     parser.add_argument(
@@ -939,21 +1038,26 @@ For more information on the descriptor format, see:
         print("Error: --descriptor is required when using --parse-only", file=sys.stderr)
         sys.exit(1)
 
+    if args.load and not args.descriptor:
+        print("Error: --descriptor is required when using --load", file=sys.stderr)
+        sys.exit(1)
+
     # Warn about irrelevant parameters
     if args.parse_only and args.sample_chars != 500:  # 500 is the default
         print("Warning: --sample-chars is ignored in --parse-only mode (entire file is processed)", file=sys.stderr)
 
-    if (args.suggest_schema or args.generate_descriptor) and args.sample_size != 100:  # 100 is default
+    if (args.discover_schema or args.generate_descriptor) and args.sample_size != 100:  # 100 is default
         print("Warning: --sample-size is ignored in analysis modes, use --sample-chars instead", file=sys.stderr)
 
     # Require explicit mode selection - no implicit behavior
-    if not any([args.suggest_schema, args.generate_descriptor, args.parse_only, args.auto]):
+    if not any([args.discover_schema, args.generate_descriptor, args.parse_only, args.load, args.auto]):
         print("Error: Must specify an operation mode", file=sys.stderr)
         print("Available modes:", file=sys.stderr)
         print("  --auto                : Discover schema + generate descriptor + import", file=sys.stderr)
-        print("  --suggest-schema      : Analyze data and suggest schemas", file=sys.stderr)
+        print("  --discover-schema     : Analyze data and discover schemas", file=sys.stderr)
         print("  --generate-descriptor : Generate descriptor from data", file=sys.stderr)
         print("  --parse-only          : Parse data without importing", file=sys.stderr)
+        print("  --load                : Import data using existing descriptor", file=sys.stderr)
         sys.exit(1)
 
     try:
@@ -961,15 +1065,18 @@ For more information on the descriptor format, see:
             api_url=args.api_url,
             input_file=args.input,
             descriptor_file=args.descriptor,
-            suggest_schema=args.suggest_schema,
+            discover_schema=args.discover_schema,
             generate_descriptor=args.generate_descriptor,
             parse_only=args.parse_only,
+            load=args.load,
             auto=args.auto,
             output_file=args.output,
             sample_size=args.sample_size,
             sample_chars=args.sample_chars,
             schema_name=args.schema_name,
             flow=args.flow,
+            user=args.user,
+            collection=args.collection,
             dry_run=args.dry_run,
             verbose=args.verbose
         )
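Taken together, the refactor routes --parse-only, --load, and --auto through a single helper chain instead of three copies of the read/parse/transform logic. A condensed view of the shared pipeline added by this commit (bodies elided; names exactly as in the diff):

    def _process_data_pipeline(input_file, descriptor_file, user, collection,
                               sample_size=None):
        """Shared pipeline: load descriptor -> read -> parse -> transform -> format."""
        descriptor = _load_descriptor(descriptor_file)
        format_info = descriptor.get('format', {})
        raw_data = _read_input_data(input_file, format_info)
        parsed = _parse_data_by_format(raw_data, format_info, sample_size)
        processed = _apply_transformations(parsed, descriptor.get('mappings', []))
        output = _format_extracted_objects(processed, descriptor, user, collection)
        return output, descriptor

--parse-only caps the pipeline with sample_size, while --load and --auto run it uncapped and hand the resulting ExtractedObject records to _send_to_trustgraph, which is what restores the previously broken load path.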