Fix structured-load auto mode (#501)

* Fix incorrect API usage

* Refactor to remove duplicate code

* Fix schema extraction in --auto

* Fix broken load functionality
cybermaggedon 2025-09-06 12:28:28 +01:00 committed by GitHub
parent 5537fac731
commit 415208e3e7

@@ -2,7 +2,7 @@
 Load structured data into TrustGraph using a descriptor configuration.

 This utility can:
-1. Analyze data samples to suggest appropriate schemas
+1. Analyze data samples to discover appropriate schemas
 2. Generate descriptor configurations from data samples
 3. Parse and transform data using descriptor configurations
 4. Import processed data into TrustGraph
@@ -28,15 +28,18 @@ def load_structured_data(
     api_url: str,
     input_file: str,
     descriptor_file: str = None,
-    suggest_schema: bool = False,
+    discover_schema: bool = False,
     generate_descriptor: bool = False,
     parse_only: bool = False,
+    load: bool = False,
     auto: bool = False,
     output_file: str = None,
     sample_size: int = 100,
     sample_chars: int = 500,
     schema_name: str = None,
     flow: str = 'default',
+    user: str = 'trustgraph',
+    collection: str = 'default',
     dry_run: bool = False,
     verbose: bool = False
 ):
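For orientation, a minimal sketch of driving the reworked entry point programmatically in the new --load mode; the endpoint, file names, and metadata values below are illustrative assumptions, not part of the commit:

# Hypothetical call exercising the new load mode; user/collection
# flow into each ExtractedObject's metadata block.
load_structured_data(
    api_url="http://localhost:8088/",    # assumed local API endpoint
    input_file="customers.csv",          # illustrative input
    descriptor_file="descriptor.json",   # illustrative descriptor
    load=True,
    flow="default",
    user="trustgraph",
    collection="default",
)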
@@ -47,14 +50,18 @@ def load_structured_data(
         api_url: TrustGraph API URL
         input_file: Path to input data file
         descriptor_file: Path to JSON descriptor configuration
-        suggest_schema: Analyze data and suggest matching schemas
+        discover_schema: Analyze data and discover matching schemas
         generate_descriptor: Generate descriptor from data sample
         parse_only: Parse data but don't import to TrustGraph
-        auto: Run full automatic pipeline (suggest schema + generate descriptor + import)
+        load: Load data to TrustGraph using existing descriptor
+        auto: Run full automatic pipeline (discover schema + generate descriptor + import)
         output_file: Path to write output (descriptor/parsed data)
         sample_size: Number of records to sample for analysis
         sample_chars: Maximum characters to read for sampling
         schema_name: Target schema name for generation
+        flow: TrustGraph flow name to use for prompts
+        user: User name for metadata (default: trustgraph)
+        collection: Collection name for metadata (default: default)
         dry_run: If True, validate but don't import data
         verbose: Enable verbose logging
     """
@@ -68,12 +75,12 @@ def load_structured_data(
         logger.info(f"🚀 Starting automatic pipeline for {input_file}...")
         logger.info("Step 1: Analyzing data to discover best matching schema...")

-        # Step 1: Auto-discover schema (reuse suggest_schema logic)
-        discovered_schema = _auto_discover_schema(api_url, input_file, sample_chars, logger)
+        # Step 1: Auto-discover schema (reuse discover_schema logic)
+        discovered_schema = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger)
         if not discovered_schema:
             logger.error("Failed to discover suitable schema automatically")
             print("❌ Could not automatically determine the best schema for your data.")
-            print("💡 Try running with --suggest-schema first to see available options.")
+            print("💡 Try running with --discover-schema first to see available options.")
             return None

         logger.info(f"✅ Discovered schema: {discovered_schema}")
@@ -81,7 +88,7 @@ def load_structured_data(
         # Step 2: Auto-generate descriptor
         logger.info("Step 2: Generating descriptor configuration...")
-        auto_descriptor = _auto_generate_descriptor(api_url, input_file, discovered_schema, sample_chars, logger)
+        auto_descriptor = _auto_generate_descriptor(api_url, input_file, discovered_schema, sample_chars, flow, logger)
         if not auto_descriptor:
             logger.error("Failed to generate descriptor automatically")
             print("❌ Could not automatically generate descriptor configuration.")
@@ -90,216 +97,128 @@ def load_structured_data(
         logger.info("✅ Generated descriptor configuration")
         print("📝 Generated descriptor configuration")

-        # Step 3: Parse and preview data
+        # Step 3: Parse and preview data using shared pipeline
         logger.info("Step 3: Parsing and validating data...")
-        preview_records = _auto_parse_preview(input_file, auto_descriptor, min(sample_size, 5), logger)
-        if preview_records is None:
-            logger.error("Failed to parse data with generated descriptor")
-            print("❌ Could not parse data with generated descriptor.")
-            return None
+
+        # Create temporary descriptor file for validation
+        import tempfile
+        temp_descriptor = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
+        json.dump(auto_descriptor, temp_descriptor, indent=2)
+        temp_descriptor.close()
+
+        try:
+            # Use shared pipeline for preview (small sample)
+            preview_objects, _ = _process_data_pipeline(input_file, temp_descriptor.name, user, collection, sample_size=5)

             # Show preview
             print("📊 Data Preview (first few records):")
             print("=" * 50)
-            for i, record in enumerate(preview_records[:3], 1):
-                print(f"Record {i}: {record}")
+            for i, obj in enumerate(preview_objects[:3], 1):
+                values = obj.get('values', {})
+                print(f"Record {i}: {values}")
             print("=" * 50)

             # Step 4: Import (unless dry_run)
             if dry_run:
                 logger.info("✅ Dry run complete - data is ready for import")
                 print("✅ Dry run successful! Data is ready for import.")
-                print(f"💡 Run without --dry-run to import {len(preview_records)} records to TrustGraph.")
+                print(f"💡 Run without --dry-run to import data to TrustGraph.")
                 return None
             else:
                 logger.info("Step 4: Importing data to TrustGraph...")
                 print("🚀 Importing data to TrustGraph...")

-                # Recursively call ourselves with the auto-generated descriptor
-                # This reuses all the existing import logic
-                import tempfile
-                import os
-
-                # Save auto-generated descriptor to temp file
-                temp_descriptor = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
-                json.dump(auto_descriptor, temp_descriptor, indent=2)
-                temp_descriptor.close()
-
-                try:
-                    # Call the full pipeline mode with our auto-generated descriptor
-                    result = load_structured_data(
-                        api_url=api_url,
-                        input_file=input_file,
-                        descriptor_file=temp_descriptor.name,
-                        flow=flow,
-                        dry_run=False,  # We already handled dry_run above
-                        verbose=verbose
-                    )
-
-                    logger.info("Auto-import pipeline completed successfully")
-                    return result
-                except Exception as e:
-                    logger.error(f"Auto-import failed: {e}")
-                    print(f"❌ Auto-import failed: {e}")
-                    return None
-                finally:
-                    # Clean up temp descriptor file
-                    try:
-                        os.unlink(temp_descriptor.name)
-                    except:
-                        pass
+                # Use shared pipeline for full processing (no sample limit)
+                output_objects, descriptor = _process_data_pipeline(input_file, temp_descriptor.name, user, collection)
+
+                # Get batch size from descriptor
+                batch_size = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)
+
+                # Send to TrustGraph using shared function
+                imported_count = _send_to_trustgraph(output_objects, api_url, flow, batch_size)
+
+                # Summary
+                format_info = descriptor.get('format', {})
+                format_type = format_info.get('type', 'csv').lower()
+                schema_name = descriptor.get('output', {}).get('schema_name', 'default')
+
+                print(f"\n🎉 Auto-Import Complete!")
+                print(f"- Input format: {format_type}")
+                print(f"- Target schema: {schema_name}")
+                print(f"- Records imported: {imported_count}")
+                print(f"- Flow used: {flow}")
+                print("✅ Auto-import completed successfully!")
+
+                logger.info("Auto-import pipeline completed successfully")
+                return imported_count
+        finally:
+            # Clean up temp descriptor file
+            try:
+                import os
+                os.unlink(temp_descriptor.name)
+            except:
+                pass

-    elif suggest_schema:
-        logger.info(f"Analyzing {input_file} to suggest schemas...")
+    elif discover_schema:
+        logger.info(f"Analyzing {input_file} to discover schemas...")
         logger.info(f"Sample size: {sample_size} records")
         logger.info(f"Sample chars: {sample_chars} characters")

-        # Read sample data from input file
-        try:
-            with open(input_file, 'r', encoding='utf-8') as f:
-                # Read up to sample_chars characters
-                sample_data = f.read(sample_chars)
-                if len(sample_data) < sample_chars:
-                    logger.info(f"Read entire file ({len(sample_data)} characters)")
-                else:
-                    logger.info(f"Read sample ({sample_chars} characters)")
-        except Exception as e:
-            logger.error(f"Failed to read input file: {e}")
-            raise
-
-        # Fetch available schemas from Config API
-        try:
-            from trustgraph.api import Api
-            from trustgraph.api.types import ConfigKey
-
-            api = Api(api_url)
-            config_api = api.config()
-
-            # Get list of available schema keys
-            logger.info("Fetching available schemas from Config API...")
-            schema_keys = config_api.list("schema")
-            logger.info(f"Found {len(schema_keys)} schemas: {schema_keys}")
-
-            if not schema_keys:
-                logger.warning("No schemas found in configuration")
-                print("No schemas available in TrustGraph configuration")
-                return
-
-            # Fetch each schema definition
-            schemas = []
-            config_keys = [ConfigKey(type="schema", key=key) for key in schema_keys]
-            schema_values = config_api.get(config_keys)
-
-            for value in schema_values:
-                try:
-                    # Schema values are JSON strings, parse them
-                    schema_def = json.loads(value.value) if isinstance(value.value, str) else value.value
-                    schemas.append(schema_def)
-                    logger.debug(f"Loaded schema: {value.key}")
-                except json.JSONDecodeError as e:
-                    logger.warning(f"Failed to parse schema {value.key}: {e}")
-                    continue
-
-            logger.info(f"Successfully loaded {len(schemas)} schema definitions")
-
-            # Use TrustGraph prompt service for schema suggestion
-            flow_api = api.flow().id("default")
-
-            # Call schema-selection prompt with actual schemas and data sample
-            logger.info("Calling TrustGraph schema-selection prompt...")
-            response = flow_api.prompt(
-                id="schema-selection",
-                variables={
-                    "schemas": schemas,  # Array of actual schema definitions (note: plural 'schemas')
-                    "data": sample_data
-                }
-            )
-
-            print("Schema Suggestion Results:")
-            print("=" * 50)
-            print(response)
-            print("=" * 50)
-        except ImportError as e:
-            logger.error(f"Failed to import TrustGraph API: {e}")
-            raise
-        except Exception as e:
-            logger.error(f"Failed to call TrustGraph prompt service: {e}")
-            raise
+        # Use the helper function to discover schema (get raw response for display)
+        response = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger, return_raw_response=True)
+
+        if response:
+            # Debug: print response type and content
+            logger.debug(f"Response type: {type(response)}, content: {response}")
+            if isinstance(response, list) and len(response) == 1:
+                # Just print the schema name for clean output
+                print(f"Best matching schema: {response[0]}")
+            elif isinstance(response, list):
+                # Multiple schemas - show the list
+                print("Multiple schemas found:")
+                for schema in response:
+                    print(f"  - {schema}")
+            else:
+                # Show full response for debugging
+                print("Schema Discovery Results:")
+                print("=" * 50)
+                print(response)
+        else:
+            print("Could not determine the best matching schema for your data.")
+            print("Available schemas can be viewed using: tg-config-list schema")

     elif generate_descriptor:
         logger.info(f"Generating descriptor from {input_file}...")
         logger.info(f"Sample size: {sample_size} records")
         logger.info(f"Sample chars: {sample_chars} characters")

-        if schema_name:
+        # If no schema specified, discover it first
+        if not schema_name:
+            logger.info("No schema specified, auto-discovering...")
+            schema_name = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger)
+            if not schema_name:
+                print("Error: Could not determine schema automatically.")
+                print("Please specify a schema using --schema-name or run --discover-schema first.")
+                return
+            logger.info(f"Auto-selected schema: {schema_name}")
+        else:
             logger.info(f"Target schema: {schema_name}")

-        # Read sample data from input file
-        try:
-            with open(input_file, 'r', encoding='utf-8') as f:
-                # Read up to sample_chars characters
-                sample_data = f.read(sample_chars)
-                if len(sample_data) < sample_chars:
-                    logger.info(f"Read entire file ({len(sample_data)} characters)")
-                else:
-                    logger.info(f"Read sample ({sample_chars} characters)")
-        except Exception as e:
-            logger.error(f"Failed to read input file: {e}")
-            raise
-
-        # Fetch available schemas from Config API (same as suggest-schema mode)
-        try:
-            from trustgraph.api import Api
-            from trustgraph.api.types import ConfigKey
-
-            api = Api(api_url)
-            config_api = api.config()
-
-            # Get list of available schema keys
-            logger.info("Fetching available schemas from Config API...")
-            schema_keys = config_api.list("schema")
-            logger.info(f"Found {len(schema_keys)} schemas: {schema_keys}")
-
-            if not schema_keys:
-                logger.warning("No schemas found in configuration")
-                print("No schemas available in TrustGraph configuration")
-                return
-
-            # Fetch each schema definition
-            schemas = []
-            config_keys = [ConfigKey(type="schema", key=key) for key in schema_keys]
-            schema_values = config_api.get(config_keys)
-
-            for value in schema_values:
-                try:
-                    # Schema values are JSON strings, parse them
-                    schema_def = json.loads(value.value) if isinstance(value.value, str) else value.value
-                    schemas.append(schema_def)
-                    logger.debug(f"Loaded schema: {value.key}")
-                except json.JSONDecodeError as e:
-                    logger.warning(f"Failed to parse schema {value.key}: {e}")
-                    continue
-
-            logger.info(f"Successfully loaded {len(schemas)} schema definitions")
-
-            # Use TrustGraph prompt service for descriptor generation
-            flow_api = api.flow().id("default")
-
-            # Call diagnose-structured-data prompt with schemas and data sample
-            logger.info("Calling TrustGraph diagnose-structured-data prompt...")
-            response = flow_api.prompt(
-                id="diagnose-structured-data",
-                variables={
-                    "schemas": schemas,  # Array of actual schema definitions
-                    "sample": sample_data  # Note: using 'sample' instead of 'data'
-                }
-            )
-
+        # Generate descriptor using helper function
+        descriptor = _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, flow, logger)
+
+        if descriptor:
             # Output the generated descriptor
             if output_file:
                 try:
                     with open(output_file, 'w', encoding='utf-8') as f:
-                        if isinstance(response, str):
-                            f.write(response)
-                        else:
-                            f.write(json.dumps(response, indent=2))
+                        f.write(json.dumps(descriptor, indent=2))
                     print(f"Generated descriptor saved to: {output_file}")
                     logger.info(f"Descriptor saved to {output_file}")
                 except Exception as e:
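The auto path above hands the in-memory descriptor to the file-based pipeline via a temporary file. A self-contained sketch of that pattern, with process standing in for _process_data_pipeline:

import json
import os
import tempfile

def run_with_temp_descriptor(descriptor, process):
    # Persist the generated descriptor so file-oriented code can reuse it.
    tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
    json.dump(descriptor, tmp, indent=2)
    tmp.close()
    try:
        return process(tmp.name)
    finally:
        # Remove the temp file even if processing raised.
        try:
            os.unlink(tmp.name)
        except OSError:
            pass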
@@ -308,52 +227,120 @@ def load_structured_data(
             else:
                 print("Generated Descriptor:")
                 print("=" * 50)
-                if isinstance(response, str):
-                    print(response)
-                else:
-                    print(json.dumps(response, indent=2))
-        except ImportError as e:
-            logger.error(f"Failed to import TrustGraph API: {e}")
-            raise
-        except Exception as e:
-            logger.error(f"Failed to call TrustGraph prompt service: {e}")
-            raise
+                print(json.dumps(descriptor, indent=2))
+                print("=" * 50)
+                print("Use this descriptor with --parse-only to validate or without modes to import.")
+        else:
+            print("Error: Failed to generate descriptor.")
+            print("Check the logs for details or try --discover-schema to verify schema availability.")

     elif parse_only:
         if not descriptor_file:
             raise ValueError("--descriptor is required when using --parse-only")

         logger.info(f"Parsing {input_file} with descriptor {descriptor_file}...")

-        # Load descriptor configuration
+        # Use shared pipeline
+        output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, user, collection, sample_size)
+
+        # Output results
+        if output_file:
+            try:
+                with open(output_file, 'w', encoding='utf-8') as f:
+                    json.dump(output_records, f, indent=2)
+                print(f"Parsed data saved to: {output_file}")
+                logger.info(f"Parsed {len(output_records)} records saved to {output_file}")
+            except Exception as e:
+                logger.error(f"Failed to save parsed data to {output_file}: {e}")
+                print(f"Error saving parsed data: {e}")
+        else:
+            print("Parsed Data Preview:")
+            print("=" * 50)
+            # Show first few records for preview
+            preview_count = min(3, len(output_records))
+            for i in range(preview_count):
+                print(f"Record {i+1}:")
+                print(json.dumps(output_records[i], indent=2))
+                print()
+            if len(output_records) > preview_count:
+                print(f"... and {len(output_records) - preview_count} more records")
+            print(f"Total records processed: {len(output_records)}")
+
+        # Get summary info from descriptor
+        format_info = descriptor.get('format', {})
+        format_type = format_info.get('type', 'csv').lower()
+        schema_name = descriptor.get('output', {}).get('schema_name', 'default')
+        mappings = descriptor.get('mappings', [])
+
+        print(f"\nParsing Summary:")
+        print(f"- Input format: {format_type}")
+        print(f"- Records processed: {len(output_records)}")
+        print(f"- Target schema: {schema_name}")
+        print(f"- Field mappings: {len(mappings)}")
+
+    elif load:
+        if not descriptor_file:
+            raise ValueError("--descriptor is required when using --load")
+
+        logger.info(f"Loading {input_file} to TrustGraph using descriptor {descriptor_file}...")
+
+        # Use shared pipeline (no sample_size limit for full load)
+        output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, user, collection)
+
+        # Get batch size from descriptor or use default
+        batch_size = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)
+
+        # Send to TrustGraph
+        print(f"🚀 Importing {len(output_records)} records to TrustGraph...")
+        imported_count = _send_to_trustgraph(output_records, api_url, flow, batch_size)
+
+        # Get summary info from descriptor
+        format_info = descriptor.get('format', {})
+        format_type = format_info.get('type', 'csv').lower()
+        schema_name = descriptor.get('output', {}).get('schema_name', 'default')
+
+        print(f"\n🎉 Load Complete!")
+        print(f"- Input format: {format_type}")
+        print(f"- Target schema: {schema_name}")
+        print(f"- Records imported: {imported_count}")
+        print(f"- Flow used: {flow}")
+
+
+# Shared core functions
+
+def _load_descriptor(descriptor_file):
+    """Load and validate descriptor configuration"""
     try:
         with open(descriptor_file, 'r', encoding='utf-8') as f:
             descriptor = json.load(f)
         logger.info(f"Loaded descriptor configuration from {descriptor_file}")
+        return descriptor
     except Exception as e:
         logger.error(f"Failed to load descriptor file: {e}")
         raise

-    # Read input data based on format in descriptor
-    try:
-        format_info = descriptor.get('format', {})
-        format_type = format_info.get('type', 'csv').lower()
-        encoding = format_info.get('encoding', 'utf-8')
-
-        logger.info(f"Input format: {format_type}, encoding: {encoding}")
+
+def _read_input_data(input_file, format_info):
+    """Read raw data based on format type"""
+    try:
+        encoding = format_info.get('encoding', 'utf-8')
         with open(input_file, 'r', encoding=encoding) as f:
             raw_data = f.read()
         logger.info(f"Read {len(raw_data)} characters from input file")
+        return raw_data
     except Exception as e:
         logger.error(f"Failed to read input file: {e}")
         raise

-    # Parse data based on format type
+
+def _parse_data_by_format(raw_data, format_info, sample_size=None):
+    """Parse raw data into records based on format (CSV/JSON/XML)"""
+    format_type = format_info.get('type', 'csv').lower()
     parsed_records = []
+    logger.info(f"Input format: {format_type}")

     if format_type == 'csv':
         import csv
         from io import StringIO
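Taken together, the extracted helpers form a linear pipeline ending in a WebSocket import. A compressed walk-through under assumed inputs (file names and endpoint invented; signatures as shown in the diff):

# _process_data_pipeline returns (ExtractedObject list, parsed descriptor).
objects, descriptor = _process_data_pipeline(
    input_file="customers.csv",         # illustrative CSV input
    descriptor_file="descriptor.json",  # illustrative descriptor
    user="trustgraph",
    collection="default",
    sample_size=10,                     # None processes the whole file
)
batch = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)
count = _send_to_trustgraph(objects, "http://localhost:8088/", "default", batch)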
@@ -373,8 +360,8 @@ def load_structured_data(
         reader = csv.DictReader(StringIO(raw_data), fieldnames=fieldnames, delimiter=delimiter)

         for row_num, row in enumerate(reader, start=1):
-            # Respect sample_size limit
-            if row_num > sample_size:
+            # Respect sample_size limit if provided
+            if sample_size and row_num > sample_size:
                 logger.info(f"Reached sample size limit of {sample_size} records")
                 break
             parsed_records.append(row)
@@ -387,7 +374,7 @@ def load_structured_data(
         try:
             data = json.loads(raw_data)
             if isinstance(data, list):
-                parsed_records = data[:sample_size]  # Respect sample_size
+                parsed_records = data[:sample_size] if sample_size else data
             elif isinstance(data, dict):
                 # Handle single object or extract array from root path
                 root_path = format_info.get('options', {}).get('root_path')
@@ -398,7 +385,7 @@ def load_structured_data(
                     data = data.get(key, data)

                 if isinstance(data, list):
-                    parsed_records = data[:sample_size]
+                    parsed_records = data[:sample_size] if sample_size else data
                 else:
                     parsed_records = [data]
@@ -443,7 +430,7 @@ def load_structured_data(
         # Convert XML elements to dictionaries
         record_count = 0
         for element in records:
-            if record_count >= sample_size:
+            if sample_size and record_count >= sample_size:
                 logger.info(f"Reached sample size limit of {sample_size} records")
                 break
@@ -487,12 +474,14 @@ def load_structured_data(
         raise ValueError(f"Unsupported format type: {format_type}")

     logger.info(f"Successfully parsed {len(parsed_records)} records")
+    return parsed_records

-    # Apply basic transformations and validation (simplified version)
-    mappings = descriptor.get('mappings', [])
+
+def _apply_transformations(records, mappings):
+    """Apply descriptor mappings and transformations"""
     processed_records = []

-    for record_num, record in enumerate(parsed_records, start=1):
+    for record_num, record in enumerate(records, start=1):
         processed_record = {}

         for mapping in mappings:
@@ -533,7 +522,11 @@ def load_structured_data(
         processed_records.append(processed_record)

-    # Format output for TrustGraph ExtractedObject structure
+    return processed_records
+
+
+def _format_extracted_objects(processed_records, descriptor, user, collection):
+    """Convert to TrustGraph ExtractedObject format"""
     output_records = []
     schema_name = descriptor.get('output', {}).get('schema_name', 'default')
     confidence = descriptor.get('output', {}).get('options', {}).get('confidence', 0.9)
@@ -543,8 +536,8 @@ def load_structured_data(
             "metadata": {
                 "id": f"parsed-{len(output_records)+1}",
                 "metadata": [],  # Empty metadata triples
-                "user": "trustgraph",
-                "collection": "default"
+                "user": user,
+                "collection": collection
             },
             "schema_name": schema_name,
             "values": record,
@@ -553,52 +546,130 @@ def load_structured_data(
         }
         output_records.append(output_record)

-    # Output results
-    if output_file:
-        try:
-            with open(output_file, 'w', encoding='utf-8') as f:
-                json.dump(output_records, f, indent=2)
-            print(f"Parsed data saved to: {output_file}")
-            logger.info(f"Parsed {len(output_records)} records saved to {output_file}")
-        except Exception as e:
-            logger.error(f"Failed to save parsed data to {output_file}: {e}")
-            print(f"Error saving parsed data: {e}")
-    else:
-        print("Parsed Data Preview:")
-        print("=" * 50)
-        # Show first few records for preview
-        preview_count = min(3, len(output_records))
-        for i in range(preview_count):
-            print(f"Record {i+1}:")
-            print(json.dumps(output_records[i], indent=2))
-            print()
-        if len(output_records) > preview_count:
-            print(f"... and {len(output_records) - preview_count} more records")
-        print(f"Total records processed: {len(output_records)}")
-
-        print(f"\nParsing Summary:")
-        print(f"- Input format: {format_type}")
-        print(f"- Records processed: {len(output_records)}")
-        print(f"- Target schema: {schema_name}")
-        print(f"- Field mappings: {len(mappings)}")
+    return output_records
+
+
+def _process_data_pipeline(input_file, descriptor_file, user, collection, sample_size=None):
+    """Shared pipeline: load descriptor → read → parse → transform → format"""
+    # Load descriptor configuration
+    descriptor = _load_descriptor(descriptor_file)
+
+    # Read input data based on format in descriptor
+    format_info = descriptor.get('format', {})
+    raw_data = _read_input_data(input_file, format_info)
+
+    # Parse data based on format type
+    parsed_records = _parse_data_by_format(raw_data, format_info, sample_size)
+
+    # Apply transformations and validation
+    mappings = descriptor.get('mappings', [])
+    processed_records = _apply_transformations(parsed_records, mappings)
+
+    # Format output for TrustGraph ExtractedObject structure
+    output_records = _format_extracted_objects(processed_records, descriptor, user, collection)
+
+    return output_records, descriptor
+
+
+def _send_to_trustgraph(objects, api_url, flow, batch_size=1000):
+    """Send ExtractedObject records to TrustGraph using WebSocket"""
+    import json
+    import asyncio
+    from websockets.asyncio.client import connect
+
+    try:
+        # Construct objects import URL similar to load_knowledge pattern
+        if not api_url.endswith("/"):
+            api_url += "/"
+
+        # Convert HTTP URL to WebSocket URL if needed
+        ws_url = api_url.replace("http://", "ws://").replace("https://", "wss://")
+        objects_url = ws_url + f"api/v1/flow/{flow}/import/objects"
+
+        logger.info(f"Connecting to objects import endpoint: {objects_url}")
+
+        async def import_objects():
+            async with connect(objects_url) as ws:
+                imported_count = 0
+                for record in objects:
+                    try:
+                        # Send individual ExtractedObject records
+                        await ws.send(json.dumps(record))
+                        imported_count += 1
+
+                        if imported_count % 100 == 0:
+                            logger.info(f"Imported {imported_count}/{len(objects)} records...")
+                            print(f"✅ Imported {imported_count}/{len(objects)} records...")
+                    except Exception as e:
+                        logger.error(f"Failed to send record {imported_count + 1}: {e}")
+                        print(f"❌ Failed to send record {imported_count + 1}: {e}")
+
+                logger.info(f"Successfully imported {imported_count} records to TrustGraph")
+                return imported_count
+
+        # Run the async import
+        imported_count = asyncio.run(import_objects())
+
+        # Summary
+        total_records = len(objects)
+        failed_count = total_records - imported_count
+
+        print(f"\n📊 Import Summary:")
+        print(f"- Total records: {total_records}")
+        print(f"- Successfully imported: {imported_count}")
+        print(f"- Failed: {failed_count}")
+
+        if failed_count > 0:
+            print(f"⚠️ {failed_count} records failed to import. Check logs for details.")
+        else:
+            print("✅ All records imported successfully!")
+
+        return imported_count
+
+    except ImportError as e:
+        logger.error(f"Failed to import required modules: {e}")
+        print(f"Error: Required modules not available - {e}")
+        raise
+    except Exception as e:
+        logger.error(f"Failed to import data to TrustGraph: {e}")
+        print(f"Import failed: {e}")
+        raise


 # Helper functions for auto mode

-def _auto_discover_schema(api_url, input_file, sample_chars, logger):
-    """Auto-discover the best matching schema for the input data"""
+def _auto_discover_schema(api_url, input_file, sample_chars, flow, logger, return_raw_response=False):
+    """Auto-discover the best matching schema for the input data
+
+    Args:
+        api_url: TrustGraph API URL
+        input_file: Path to input data file
+        sample_chars: Number of characters to sample from file
+        flow: TrustGraph flow name to use for prompts
+        logger: Logger instance
+        return_raw_response: If True, return raw prompt response; if False, parse to extract schema name
+
+    Returns:
+        Schema name (str) if return_raw_response=False, or full response if True
+    """
     try:
         # Read sample data
         with open(input_file, 'r', encoding='utf-8') as f:
             sample_data = f.read(sample_chars)
+        logger.info(f"Read {len(sample_data)} characters for analysis")

         # Import API modules
         from trustgraph.api import Api
+        from trustgraph.api.types import ConfigKey

         api = Api(api_url)
         config_api = api.config()

         # Get available schemas
+        logger.info("Fetching available schemas from Config API...")
         schema_keys = config_api.list("schema")
+        logger.info(f"Found {len(schema_keys)} schemas: {schema_keys}")
+
         if not schema_keys:
             logger.error("No schemas available in TrustGraph configuration")
             return None
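A stripped-down sketch of the WebSocket import loop that _send_to_trustgraph wraps, using the endpoint path shown in the diff; progress reporting and per-record error handling are elided:

import asyncio
import json
from websockets.asyncio.client import connect

async def send_objects(objects, ws_base_url, flow):
    # One JSON message per ExtractedObject, as in _send_to_trustgraph.
    url = ws_base_url.rstrip("/") + f"/api/v1/flow/{flow}/import/objects"
    sent = 0
    async with connect(url) as ws:
        for record in objects:
            await ws.send(json.dumps(record))
            sent += 1
    return sent

# e.g. asyncio.run(send_objects(objs, "ws://localhost:8088", "default"))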
@@ -607,8 +678,12 @@ def _auto_discover_schema(api_url, input_file, sample_chars, logger):
         schemas = {}
         for key in schema_keys:
             try:
-                schema_def = config_api.get("schema", key)
-                schemas[key] = schema_def
+                config_key = ConfigKey(type="schema", key=key)
+                schema_values = config_api.get([config_key])
+                if schema_values:
+                    schema_def = json.loads(schema_values[0].value) if isinstance(schema_values[0].value, str) else schema_values[0].value
+                    schemas[key] = schema_def
+                    logger.debug(f"Loaded schema: {key}")
             except Exception as e:
                 logger.warning(f"Could not load schema {key}: {e}")
@@ -616,33 +691,32 @@ def _auto_discover_schema(api_url, input_file, sample_chars, logger):
             logger.error("No valid schemas could be loaded")
             return None

+        logger.info(f"Successfully loaded {len(schemas)} schema definitions")
+
         # Use prompt service for schema selection
-        flow_api = api.flow().id("default")
-        prompt_client = flow_api.prompt()
-
-        prompt = f"""Analyze this data sample and determine the best matching schema:
-
-DATA SAMPLE:
-{sample_data[:1000]}
-
-AVAILABLE SCHEMAS:
-{json.dumps(schemas, indent=2)}
-
-Return ONLY the schema name (key) that best matches this data. Consider:
-1. Field names and types in the data
-2. Data structure and format
-3. Domain and use case alignment
-
-Schema name:"""
-
-        response = prompt_client.schema_selection(
-            schemas=schemas,
-            sample=sample_data[:1000]
-        )
+        flow_api = api.flow().id(flow)
+
+        # Call schema-selection prompt with actual schemas and data sample
+        logger.info("Calling TrustGraph schema-selection prompt...")
+        response = flow_api.prompt(
+            id="schema-selection",
+            variables={
+                "schemas": list(schemas.values()),  # Array of actual schema definitions
+                "question": sample_data  # Truncate sample data
+            }
+        )
+
+        # Return raw response if requested (for discover_schema mode)
+        if return_raw_response:
+            return response

         # Extract schema name from response
         if isinstance(response, dict) and 'schema' in response:
             return response['schema']
+        elif isinstance(response, list) and len(response) > 0:
+            # If response is a list, use the first element
+            logger.info(f"Extracted schema '{response[0]}' from list response")
+            return response[0]
         elif isinstance(response, str):
             # Try to extract schema name from text response
             response_lower = response.lower().strip()
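The prompt service may answer with a dict, a list of candidates, or free text; a simplified mirror of the normalization _auto_discover_schema now performs:

def extract_schema_name(response):
    # Condensed version of the branching above; the real code also logs.
    if isinstance(response, dict) and 'schema' in response:
        return response['schema']
    if isinstance(response, list) and response:
        return response[0]  # first candidate wins
    if isinstance(response, str):
        return response.strip() or None
    return None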
@@ -669,7 +743,7 @@ Schema name:"""
     return None


-def _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, logger):
+def _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, flow, logger):
    """Auto-generate descriptor configuration for the discovered schema"""
     try:
         # Read sample data
@@ -678,20 +752,28 @@ def _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, logger):
         # Import API modules
         from trustgraph.api import Api
+        from trustgraph.api.types import ConfigKey

         api = Api(api_url)
         config_api = api.config()

         # Get schema definition
-        schema_def = config_api.get("schema", schema_name)
+        config_key = ConfigKey(type="schema", key=schema_name)
+        schema_values = config_api.get([config_key])
+        if not schema_values:
+            logger.error(f"Schema '{schema_name}' not found")
+            return None
+        schema_def = json.loads(schema_values[0].value) if isinstance(schema_values[0].value, str) else schema_values[0].value

         # Use prompt service for descriptor generation
-        flow_api = api.flow().id("default")
-        prompt_client = flow_api.prompt()
-
-        response = prompt_client.diagnose_structured_data(
-            sample=sample_data,
-            schema_name=schema_name,
-            schema=schema_def
-        )
+        flow_api = api.flow().id(flow)
+
+        # Call diagnose-structured-data prompt with schema and data sample
+        response = flow_api.prompt(
+            id="diagnose-structured-data",
+            variables={
+                "schemas": [schema_def],  # Array with single schema definition
+                "sample": sample_data  # Data sample for analysis
+            }
+        )

         if isinstance(response, str):
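Both helpers now fetch schema definitions the same way. A sketch of the corrected Config API usage, assuming config_api is the client obtained from Api(api_url).config():

import json
from trustgraph.api.types import ConfigKey

def fetch_schema(config_api, name):
    # config_api.get() takes a list of ConfigKey and returns matching values.
    values = config_api.get([ConfigKey(type="schema", key=name)])
    if not values:
        return None
    raw = values[0].value
    # Stored schema values are JSON strings; parse when necessary.
    return json.loads(raw) if isinstance(raw, str) else raw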
@@ -784,9 +866,9 @@ def main():
         formatter_class=argparse.RawDescriptionHelpFormatter,
         epilog="""
 Examples:
-  # Step 1: Analyze data and suggest matching schemas
-  %(prog)s --input customers.csv --suggest-schema
-  %(prog)s --input products.xml --suggest-schema --sample-chars 1000
+  # Step 1: Analyze data and discover matching schemas
+  %(prog)s --input customers.csv --discover-schema
+  %(prog)s --input products.xml --discover-schema --sample-chars 1000

   # Step 2: Generate descriptor configuration from data sample
   %(prog)s --input customers.csv --generate-descriptor --schema-name customer --output descriptor.json
@@ -813,7 +895,7 @@ Examples:
 Use Cases:
   --auto               : 🚀 FULLY AUTOMATIC: Discover schema + generate descriptor + import data
                          (zero manual configuration required!)
-  --suggest-schema     : Diagnose which TrustGraph schemas might match your data
+  --discover-schema    : Diagnose which TrustGraph schemas might match your data
                          (uses --sample-chars to limit data sent for analysis)
   --generate-descriptor: Create/review the structured data language configuration
                          (uses --sample-chars to limit data sent for analysis)
@@ -835,7 +917,19 @@ For more information on the descriptor format, see:
     parser.add_argument(
         '-f', '--flow',
         default='default',
-        help='TrustGraph flow name to use for import (default: default)'
+        help='TrustGraph flow name to use for prompts and import (default: default)'
+    )
+
+    parser.add_argument(
+        '--user',
+        default='trustgraph',
+        help='User name for metadata (default: trustgraph)'
+    )
+
+    parser.add_argument(
+        '--collection',
+        default='default',
+        help='Collection name for metadata (default: default)'
     )

     parser.add_argument(
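Illustrative invocations of the new flags, written in the epilog's own %(prog)s convention; the file names, user, and collection are invented:

  # Import with explicit metadata attribution
  %(prog)s --input customers.csv --load --descriptor descriptor.json --user alice --collection crm

  # Fully automatic pipeline on a non-default flow
  %(prog)s --input customers.csv --auto --flow my-flow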
@@ -852,9 +946,9 @@ For more information on the descriptor format, see:
     # Operation modes (mutually exclusive)
     mode_group = parser.add_mutually_exclusive_group()
     mode_group.add_argument(
-        '--suggest-schema',
+        '--discover-schema',
         action='store_true',
-        help='Analyze data sample and suggest matching TrustGraph schemas'
+        help='Analyze data sample and discover matching TrustGraph schemas'
     )
     mode_group.add_argument(
         '--generate-descriptor',
@@ -866,6 +960,11 @@ For more information on the descriptor format, see:
         action='store_true',
         help='Parse data using descriptor but don\'t import to TrustGraph'
     )
+    mode_group.add_argument(
+        '--load',
+        action='store_true',
+        help='Load data to TrustGraph using existing descriptor'
+    )
     mode_group.add_argument(
         '--auto',
         action='store_true',
@@ -888,7 +987,7 @@ For more information on the descriptor format, see:
         '--sample-chars',
         type=int,
         default=500,
-        help='Maximum characters to read for sampling (suggest-schema/generate-descriptor modes only, default: 500)'
+        help='Maximum characters to read for sampling (discover-schema/generate-descriptor modes only, default: 500)'
     )

     parser.add_argument(
@@ -939,21 +1038,26 @@ For more information on the descriptor format, see:
         print("Error: --descriptor is required when using --parse-only", file=sys.stderr)
         sys.exit(1)

+    if args.load and not args.descriptor:
+        print("Error: --descriptor is required when using --load", file=sys.stderr)
+        sys.exit(1)
+
     # Warn about irrelevant parameters
     if args.parse_only and args.sample_chars != 500:  # 500 is the default
         print("Warning: --sample-chars is ignored in --parse-only mode (entire file is processed)", file=sys.stderr)

-    if (args.suggest_schema or args.generate_descriptor) and args.sample_size != 100:  # 100 is default
+    if (args.discover_schema or args.generate_descriptor) and args.sample_size != 100:  # 100 is default
         print("Warning: --sample-size is ignored in analysis modes, use --sample-chars instead", file=sys.stderr)

     # Require explicit mode selection - no implicit behavior
-    if not any([args.suggest_schema, args.generate_descriptor, args.parse_only, args.auto]):
+    if not any([args.discover_schema, args.generate_descriptor, args.parse_only, args.load, args.auto]):
         print("Error: Must specify an operation mode", file=sys.stderr)
         print("Available modes:", file=sys.stderr)
         print("  --auto                 : Discover schema + generate descriptor + import", file=sys.stderr)
-        print("  --suggest-schema       : Analyze data and suggest schemas", file=sys.stderr)
+        print("  --discover-schema      : Analyze data and discover schemas", file=sys.stderr)
         print("  --generate-descriptor  : Generate descriptor from data", file=sys.stderr)
         print("  --parse-only           : Parse data without importing", file=sys.stderr)
+        print("  --load                 : Import data using existing descriptor", file=sys.stderr)
         sys.exit(1)

     try:
@@ -961,15 +1065,18 @@ For more information on the descriptor format, see:
             api_url=args.api_url,
             input_file=args.input,
             descriptor_file=args.descriptor,
-            suggest_schema=args.suggest_schema,
+            discover_schema=args.discover_schema,
             generate_descriptor=args.generate_descriptor,
             parse_only=args.parse_only,
+            load=args.load,
             auto=args.auto,
             output_file=args.output,
             sample_size=args.sample_size,
             sample_chars=args.sample_chars,
             schema_name=args.schema_name,
             flow=args.flow,
+            user=args.user,
+            collection=args.collection,
             dry_run=args.dry_run,
             verbose=args.verbose
         )