"""
|
|
Load structured data into TrustGraph using a descriptor configuration.
|
|
|
|
This utility can:
|
|
1. Analyze data samples to discover appropriate schemas
|
|
2. Generate descriptor configurations from data samples
|
|
3. Parse and transform data using descriptor configurations
|
|
4. Import processed data into TrustGraph
|
|
|
|
The tool supports running all steps automatically or individual steps for
|
|
validation and debugging. The descriptor language allows for complex
|
|
transformations, validations, and mappings without requiring custom code.
|
|
"""

import argparse
import os
import sys
import json
import logging

# Module logger
logger = logging.getLogger(__name__)

default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)


def load_structured_data(
    api_url: str,
    input_file: str,
    descriptor_file: str = None,
    discover_schema: bool = False,
    generate_descriptor: bool = False,
    parse_only: bool = False,
    load: bool = False,
    auto: bool = False,
    output_file: str = None,
    sample_size: int = 100,
    sample_chars: int = 500,
    schema_name: str = None,
    flow: str = 'default',
    user: str = 'trustgraph',
    collection: str = 'default',
    dry_run: bool = False,
    verbose: bool = False,
    token: str = None
):
    """
    Load structured data using a descriptor configuration.

    Args:
        api_url: TrustGraph API URL
        input_file: Path to input data file
        descriptor_file: Path to JSON descriptor configuration
        discover_schema: Analyze data and discover matching schemas
        generate_descriptor: Generate descriptor from data sample
        parse_only: Parse data but don't import to TrustGraph
        load: Load data to TrustGraph using existing descriptor
        auto: Run full automatic pipeline (discover schema + generate descriptor + import)
        output_file: Path to write output (descriptor/parsed data)
        sample_size: Number of records to sample for analysis
        sample_chars: Maximum characters to read for sampling
        schema_name: Target schema name for generation
        flow: TrustGraph flow name to use for prompts
        user: User name for metadata (default: trustgraph)
        collection: Collection name for metadata (default: default)
        dry_run: If True, validate but don't import data
        verbose: Enable verbose logging
        token: Authentication token for the TrustGraph API
    """
    if verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    # Determine operation mode
    if auto:
        logger.info(f"🚀 Starting automatic pipeline for {input_file}...")
        logger.info("Step 1: Analyzing data to discover best matching schema...")

        # Step 1: Auto-discover schema (reuse discover_schema logic)
        discovered_schema = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger)
        if not discovered_schema:
            logger.error("Failed to discover suitable schema automatically")
            print("❌ Could not automatically determine the best schema for your data.")
            print("💡 Try running with --discover-schema first to see available options.")
            return None

        logger.info(f"✅ Discovered schema: {discovered_schema}")
        print(f"🎯 Auto-selected schema: {discovered_schema}")

        # Step 2: Auto-generate descriptor
        logger.info("Step 2: Generating descriptor configuration...")
        auto_descriptor = _auto_generate_descriptor(api_url, input_file, discovered_schema, sample_chars, flow, logger)
        if not auto_descriptor:
            logger.error("Failed to generate descriptor automatically")
            print("❌ Could not automatically generate descriptor configuration.")
            return None

        logger.info("✅ Generated descriptor configuration")
        print("📝 Generated descriptor configuration")

        # Step 3: Parse and preview data using shared pipeline
        logger.info("Step 3: Parsing and validating data...")

        # Create temporary descriptor file for validation
        import tempfile
        temp_descriptor = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
        json.dump(auto_descriptor, temp_descriptor, indent=2)
        temp_descriptor.close()

        try:
            # Use shared pipeline for preview (small sample)
            preview_objects, _ = _process_data_pipeline(input_file, temp_descriptor.name, user, collection, sample_size=5)

            # Show preview
            print("📊 Data Preview (first few records):")
            print("=" * 50)
            for i, obj in enumerate(preview_objects[:3], 1):
                values = obj.get('values', {})
                print(f"Record {i}: {values}")
            print("=" * 50)

            # Step 4: Import (unless dry_run)
            if dry_run:
                logger.info("✅ Dry run complete - data is ready for import")
                print("✅ Dry run successful! Data is ready for import.")
                print("💡 Run without --dry-run to import data to TrustGraph.")
                return None
            else:
                logger.info("Step 4: Importing data to TrustGraph...")
                print("🚀 Importing data to TrustGraph...")

                # Use shared pipeline for full processing (no sample limit)
                output_objects, descriptor = _process_data_pipeline(input_file, temp_descriptor.name, user, collection)

                # Get batch size from descriptor
                batch_size = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)

                # Send to TrustGraph using shared function
                imported_count = _send_to_trustgraph(output_objects, api_url, flow, batch_size, token=token)

                # Summary
                format_info = descriptor.get('format', {})
                format_type = format_info.get('type', 'csv').lower()
                schema_name = descriptor.get('output', {}).get('schema_name', 'default')

                print("\n🎉 Auto-Import Complete!")
                print(f"- Input format: {format_type}")
                print(f"- Target schema: {schema_name}")
                print(f"- Records imported: {imported_count}")
                print(f"- Flow used: {flow}")

                logger.info("Auto-import pipeline completed successfully")
                return imported_count

        except Exception as e:
            logger.error(f"Auto-import failed: {e}")
            print(f"❌ Auto-import failed: {e}")
            return None

        finally:
            # Clean up temp descriptor file
            try:
                os.unlink(temp_descriptor.name)
            except OSError:
                pass

    elif discover_schema:
        logger.info(f"Analyzing {input_file} to discover schemas...")
        logger.info(f"Sample size: {sample_size} records")
        logger.info(f"Sample chars: {sample_chars} characters")

        # Use the helper function to discover schema (get raw response for display)
        response = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger, return_raw_response=True)

        if response:
            # Debug: print response type and content
            logger.debug(f"Response type: {type(response)}, content: {response}")
            if isinstance(response, list) and len(response) == 1:
                # Just print the schema name for clean output
                print(f"Best matching schema: {response[0]}")
            elif isinstance(response, list):
                # Multiple schemas - show the list
                print("Multiple schemas found:")
                for schema in response:
                    print(f" - {schema}")
            else:
                # Show full response for debugging
                print("Schema Discovery Results:")
                print("=" * 50)
                print(response)
                print("=" * 50)
        else:
            print("Could not determine the best matching schema for your data.")
            print("Available schemas can be viewed using: tg-config-list schema")

    elif generate_descriptor:
        logger.info(f"Generating descriptor from {input_file}...")
        logger.info(f"Sample size: {sample_size} records")
        logger.info(f"Sample chars: {sample_chars} characters")

        # If no schema specified, discover it first
        if not schema_name:
            logger.info("No schema specified, auto-discovering...")
            schema_name = _auto_discover_schema(api_url, input_file, sample_chars, flow, logger)
            if not schema_name:
                print("Error: Could not determine schema automatically.")
                print("Please specify a schema using --schema-name or run --discover-schema first.")
                return
            logger.info(f"Auto-selected schema: {schema_name}")
        else:
            logger.info(f"Target schema: {schema_name}")

        # Generate descriptor using helper function
        descriptor = _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, flow, logger)

        if descriptor:
            # Output the generated descriptor
            if output_file:
                try:
                    with open(output_file, 'w', encoding='utf-8') as f:
                        f.write(json.dumps(descriptor, indent=2))
                    print(f"Generated descriptor saved to: {output_file}")
                    logger.info(f"Descriptor saved to {output_file}")
                except Exception as e:
                    logger.error(f"Failed to save descriptor to {output_file}: {e}")
                    print(f"Error saving descriptor: {e}")
            else:
                print("Generated Descriptor:")
                print("=" * 50)
                print(json.dumps(descriptor, indent=2))
                print("=" * 50)
                print("Use this descriptor with --parse-only to validate or --load to import.")
        else:
            print("Error: Failed to generate descriptor.")
            print("Check the logs for details or try --discover-schema to verify schema availability.")

    elif parse_only:
        if not descriptor_file:
            raise ValueError("--descriptor is required when using --parse-only")
        logger.info(f"Parsing {input_file} with descriptor {descriptor_file}...")

        # Use shared pipeline
        output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, user, collection, sample_size)

        # Output results
        if output_file:
            try:
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(output_records, f, indent=2)
                print(f"Parsed data saved to: {output_file}")
                logger.info(f"Parsed {len(output_records)} records saved to {output_file}")
            except Exception as e:
                logger.error(f"Failed to save parsed data to {output_file}: {e}")
                print(f"Error saving parsed data: {e}")
        else:
            print("Parsed Data Preview:")
            print("=" * 50)
            # Show first few records for preview
            preview_count = min(3, len(output_records))
            for i in range(preview_count):
                print(f"Record {i+1}:")
                print(json.dumps(output_records[i], indent=2))
                print()

            if len(output_records) > preview_count:
                print(f"... and {len(output_records) - preview_count} more records")
            print(f"Total records processed: {len(output_records)}")

        # Get summary info from descriptor
        format_info = descriptor.get('format', {})
        format_type = format_info.get('type', 'csv').lower()
        schema_name = descriptor.get('output', {}).get('schema_name', 'default')
        mappings = descriptor.get('mappings', [])

        print("\nParsing Summary:")
        print(f"- Input format: {format_type}")
        print(f"- Records processed: {len(output_records)}")
        print(f"- Target schema: {schema_name}")
        print(f"- Field mappings: {len(mappings)}")

    elif load:
        if not descriptor_file:
            raise ValueError("--descriptor is required when using --load")
        logger.info(f"Loading {input_file} to TrustGraph using descriptor {descriptor_file}...")

        # Use shared pipeline (no sample_size limit for full load)
        output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, user, collection)

        # Get batch size from descriptor or use default
        batch_size = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)

        # Send to TrustGraph
        print(f"🚀 Importing {len(output_records)} records to TrustGraph...")
        imported_count = _send_to_trustgraph(output_records, api_url, flow, batch_size, token=token)

        # Get summary info from descriptor
        format_info = descriptor.get('format', {})
        format_type = format_info.get('type', 'csv').lower()
        schema_name = descriptor.get('output', {}).get('schema_name', 'default')

        print("\n🎉 Load Complete!")
        print(f"- Input format: {format_type}")
        print(f"- Target schema: {schema_name}")
        print(f"- Records imported: {imported_count}")
        print(f"- Flow used: {flow}")


# Shared core functions
def _load_descriptor(descriptor_file):
    """Load and validate descriptor configuration"""
    try:
        with open(descriptor_file, 'r', encoding='utf-8') as f:
            descriptor = json.load(f)
        logger.info(f"Loaded descriptor configuration from {descriptor_file}")
        return descriptor
    except Exception as e:
        logger.error(f"Failed to load descriptor file: {e}")
        raise


def _read_input_data(input_file, format_info):
    """Read raw data based on format type"""
    try:
        encoding = format_info.get('encoding', 'utf-8')

        with open(input_file, 'r', encoding=encoding) as f:
            raw_data = f.read()

        logger.info(f"Read {len(raw_data)} characters from input file")
        return raw_data

    except Exception as e:
        logger.error(f"Failed to read input file: {e}")
        raise


def _parse_data_by_format(raw_data, format_info, sample_size=None):
    """Parse raw data into records based on format (CSV/JSON/XML)"""
    format_type = format_info.get('type', 'csv').lower()
    parsed_records = []

    logger.info(f"Input format: {format_type}")

    if format_type == 'csv':
        import csv
        from io import StringIO

        options = format_info.get('options', {})
        delimiter = options.get('delimiter', ',')
        has_header = options.get('has_header', options.get('header', True))

        logger.info(f"CSV options - delimiter: '{delimiter}', has_header: {has_header}")

        try:
            reader = csv.DictReader(StringIO(raw_data), delimiter=delimiter)
            if not has_header:
                # If no header, create generic field names from the first row
                first_row = next(reader)
                fieldnames = [f"field_{i+1}" for i in range(len(first_row))]
                reader = csv.DictReader(StringIO(raw_data), fieldnames=fieldnames, delimiter=delimiter)

            for row_num, row in enumerate(reader, start=1):
                # Respect sample_size limit if provided
                if sample_size and row_num > sample_size:
                    logger.info(f"Reached sample size limit of {sample_size} records")
                    break
                parsed_records.append(row)

        except Exception as e:
            logger.error(f"Failed to parse CSV data: {e}")
            raise

    elif format_type == 'json':
        try:
            data = json.loads(raw_data)
            if isinstance(data, list):
                parsed_records = data[:sample_size] if sample_size else data
            elif isinstance(data, dict):
                # Handle single object or extract array from root path
                root_path = format_info.get('options', {}).get('root_path')
                if root_path:
                    # Simple JSONPath-like extraction (basic implementation)
                    if root_path.startswith('$.'):
                        key = root_path[2:]
                        data = data.get(key, data)

                if isinstance(data, list):
                    parsed_records = data[:sample_size] if sample_size else data
                else:
                    parsed_records = [data]

        except Exception as e:
            logger.error(f"Failed to parse JSON data: {e}")
            raise

    elif format_type == 'xml':
        import xml.etree.ElementTree as ET

        options = format_info.get('options', {})
        record_path = options.get('record_path', '//record')  # XPath to find record elements
        field_attribute = options.get('field_attribute')  # Attribute name for field names (e.g., "name")
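        # Illustrative examples of the XML options this branch understands
        # (element and attribute names here are made up, not required values):
        #   {"record_path": "//data/record", "field_attribute": "name"}
        # or, in the legacy form handled just below:
        #   {"root_element": "data", "record_element": "record"}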

        # Legacy support for old options format
        if 'root_element' in options or 'record_element' in options:
            root_element = options.get('root_element')
            record_element = options.get('record_element', 'record')
            if root_element:
                record_path = f"//{root_element}/{record_element}"
            else:
                record_path = f"//{record_element}"

        logger.info(f"XML options - record_path: '{record_path}', field_attribute: '{field_attribute}'")

        try:
            root = ET.fromstring(raw_data)

            # Find record elements using XPath.
            # ElementTree XPath support is limited, so convert absolute paths to relative.
            xpath_expr = record_path
            if xpath_expr.startswith('/ROOT/'):
                # Remove /ROOT/ prefix since we're already at the root
                xpath_expr = xpath_expr[6:]
            elif xpath_expr.startswith('/'):
                # Convert an absolute path to a relative one by prefixing with '.'
                xpath_expr = '.' + xpath_expr

            records = root.findall(xpath_expr)
            logger.info(f"Found {len(records)} records using XPath: {record_path} (converted to: {xpath_expr})")

            # Convert XML elements to dictionaries
            record_count = 0
            for element in records:
                if sample_size and record_count >= sample_size:
                    logger.info(f"Reached sample size limit of {sample_size} records")
                    break

                record = {}

                if field_attribute:
                    # Handle field elements with name attributes (UN data format)
                    # <field name="Country or Area">Albania</field>
                    for child in element:
                        if child.tag == 'field' and field_attribute in child.attrib:
                            field_name = child.attrib[field_attribute]
                            field_value = child.text.strip() if child.text else ""
                            record[field_name] = field_value
                else:
                    # Handle standard XML structure
                    # Convert element attributes to fields
                    record.update(element.attrib)

                    # Convert child elements to fields
                    for child in element:
                        if child.text:
                            record[child.tag] = child.text.strip()
                        else:
                            record[child.tag] = ""

                # If no children or attributes, use element text as single field
                if not record and element.text:
                    record['value'] = element.text.strip()

                parsed_records.append(record)
                record_count += 1

        except ET.ParseError as e:
            logger.error(f"Failed to parse XML data: {e}")
            raise
        except Exception as e:
            logger.error(f"Failed to process XML data: {e}")
            raise

    else:
        raise ValueError(f"Unsupported format type: {format_type}")

    logger.info(f"Successfully parsed {len(parsed_records)} records")
    return parsed_records


def _apply_transformations(records, mappings):
    """Apply descriptor mappings and transformations"""
    processed_records = []

    for record_num, record in enumerate(records, start=1):
        processed_record = {}

        for mapping in mappings:
            source_field = mapping.get('source_field') or mapping.get('source')
            target_field = mapping.get('target_field') or mapping.get('target')

            if source_field in record:
                value = record[source_field]

                # Apply basic transforms (simplified)
                transforms = mapping.get('transforms', [])
                for transform in transforms:
                    transform_type = transform.get('type')

                    if transform_type == 'trim' and isinstance(value, str):
                        value = value.strip()
                    elif transform_type == 'upper' and isinstance(value, str):
                        value = value.upper()
                    elif transform_type == 'lower' and isinstance(value, str):
                        value = value.lower()
                    elif transform_type == 'title_case' and isinstance(value, str):
                        value = value.title()
                    elif transform_type == 'to_int':
                        try:
                            value = int(value) if value != '' else None
                        except (ValueError, TypeError):
                            logger.warning(f"Failed to convert '{value}' to int in record {record_num}")
                    elif transform_type == 'to_float':
                        try:
                            value = float(value) if value != '' else None
                        except (ValueError, TypeError):
                            logger.warning(f"Failed to convert '{value}' to float in record {record_num}")

                # Convert all values to strings as required by ExtractedObject schema
                processed_record[target_field] = str(value) if value is not None else ""
            else:
                logger.warning(f"Source field '{source_field}' not found in record {record_num}")

        processed_records.append(processed_record)

    return processed_records


def _format_extracted_objects(processed_records, descriptor, user, collection):
    """Convert to TrustGraph ExtractedObject format"""
    output_records = []
    schema_name = descriptor.get('output', {}).get('schema_name', 'default')
    confidence = descriptor.get('output', {}).get('options', {}).get('confidence', 0.9)

    for record in processed_records:
        output_record = {
            "metadata": {
                "id": f"parsed-{len(output_records)+1}",
                "metadata": [],  # Empty metadata triples
                "user": user,
                "collection": collection
            },
            "schema_name": schema_name,
            "values": record,
            "confidence": confidence,
            "source_span": ""
        }
        output_records.append(output_record)

    return output_records


def _process_data_pipeline(input_file, descriptor_file, user, collection, sample_size=None):
    """Shared pipeline: load descriptor → read → parse → transform → format"""
    # Load descriptor configuration
    descriptor = _load_descriptor(descriptor_file)

    # Read input data based on format in descriptor
    format_info = descriptor.get('format', {})
    raw_data = _read_input_data(input_file, format_info)

    # Parse data based on format type
    parsed_records = _parse_data_by_format(raw_data, format_info, sample_size)

    # Apply transformations and validation
    mappings = descriptor.get('mappings', [])
    processed_records = _apply_transformations(parsed_records, mappings)

    # Format output for TrustGraph ExtractedObject structure
    output_records = _format_extracted_objects(processed_records, descriptor, user, collection)

    return output_records, descriptor


def _send_to_trustgraph(rows, api_url, flow, batch_size=1000, token=None):
    """Send ExtractedObject records to TrustGraph using Python API"""
    from trustgraph.api import Api

    try:
        total_records = len(rows)
        logger.info(f"Importing {total_records} records to TrustGraph...")

        # Use Python API bulk import
        api = Api(api_url, token=token)
        bulk = api.bulk()

        bulk.import_rows(flow=flow, rows=iter(rows))

        logger.info(f"Successfully imported {total_records} records to TrustGraph")

        # Summary
        print("\n📊 Import Summary:")
        print(f"- Total records: {total_records}")
        print(f"- Successfully imported: {total_records}")
        print("✅ All records imported successfully!")

        return total_records

    except Exception as e:
        logger.error(f"Failed to import data to TrustGraph: {e}")
        print(f"Import failed: {e}")
        raise


# Helper functions for auto mode
def _auto_discover_schema(api_url, input_file, sample_chars, flow, logger, return_raw_response=False):
    """Auto-discover the best matching schema for the input data

    Args:
        api_url: TrustGraph API URL
        input_file: Path to input data file
        sample_chars: Number of characters to sample from file
        flow: TrustGraph flow name to use for prompts
        logger: Logger instance
        return_raw_response: If True, return raw prompt response; if False, parse to extract schema name

    Returns:
        Schema name (str) if return_raw_response=False, or full response if True
    """
    try:
        # Read sample data
        with open(input_file, 'r', encoding='utf-8') as f:
            sample_data = f.read(sample_chars)
        logger.info(f"Read {len(sample_data)} characters for analysis")

        # Import API modules
        from trustgraph.api import Api
        from trustgraph.api.types import ConfigKey
        api = Api(api_url)
        config_api = api.config()

        # Get available schemas
        logger.info("Fetching available schemas from Config API...")
        schema_keys = config_api.list("schema")
        logger.info(f"Found {len(schema_keys)} schemas: {schema_keys}")

        if not schema_keys:
            logger.error("No schemas available in TrustGraph configuration")
            return None

        # Get schema definitions
        schemas = {}
        for key in schema_keys:
            try:
                config_key = ConfigKey(type="schema", key=key)
                schema_values = config_api.get([config_key])
                if schema_values:
                    schema_def = json.loads(schema_values[0].value) if isinstance(schema_values[0].value, str) else schema_values[0].value
                    schemas[key] = schema_def
                    logger.debug(f"Loaded schema: {key}")
            except Exception as e:
                logger.warning(f"Could not load schema {key}: {e}")

        if not schemas:
            logger.error("No valid schemas could be loaded")
            return None

        logger.info(f"Successfully loaded {len(schemas)} schema definitions")

        # Use prompt service for schema selection
        flow_api = api.flow().id(flow)

        # Call schema-selection prompt with actual schemas and data sample
        logger.info("Calling TrustGraph schema-selection prompt...")
        response = flow_api.prompt(
            id="schema-selection",
            variables={
                "schemas": list(schemas.values()),  # Array of actual schema definitions
                "question": sample_data  # Data sample (already limited to sample_chars)
            }
        )

        # Return raw response if requested (for discover_schema mode)
        if return_raw_response:
            return response

        # Extract schema name from response
        if isinstance(response, dict) and 'schema' in response:
            return response['schema']
        elif isinstance(response, list) and len(response) > 0:
            # If response is a list, use the first element
            logger.info(f"Extracted schema '{response[0]}' from list response")
            return response[0]
        elif isinstance(response, str):
            # Try to extract schema name from text response
            response_lower = response.lower().strip()
            for schema_key in schema_keys:
                if schema_key.lower() in response_lower:
                    return schema_key

            # If no exact match, try first mentioned schema
            words = response.split()
            for word in words:
                clean_word = word.strip('.,!?":').lower()
                if clean_word in [s.lower() for s in schema_keys]:
                    matching_schema = next(s for s in schema_keys if s.lower() == clean_word)
                    return matching_schema

        logger.warning(f"Could not parse schema selection from response: {response}")

        # Fallback: return first available schema
        logger.info(f"Using fallback: first available schema '{schema_keys[0]}'")
        return schema_keys[0]

    except Exception as e:
        logger.error(f"Schema discovery failed: {e}")
        return None


def _auto_generate_descriptor(api_url, input_file, schema_name, sample_chars, flow, logger):
    """Auto-generate descriptor configuration for the discovered schema"""
    try:
        # Read sample data
        with open(input_file, 'r', encoding='utf-8') as f:
            sample_data = f.read(sample_chars)

        # Import API modules
        from trustgraph.api import Api
        from trustgraph.api.types import ConfigKey
        api = Api(api_url)
        config_api = api.config()

        # Get schema definition
        config_key = ConfigKey(type="schema", key=schema_name)
        schema_values = config_api.get([config_key])
        if not schema_values:
            logger.error(f"Schema '{schema_name}' not found")
            return None
        schema_def = json.loads(schema_values[0].value) if isinstance(schema_values[0].value, str) else schema_values[0].value

        # Use prompt service for descriptor generation
        flow_api = api.flow().id(flow)

        # Call diagnose-structured-data prompt with schema and data sample
        response = flow_api.prompt(
            id="diagnose-structured-data",
            variables={
                "schemas": [schema_def],  # Array with single schema definition
                "sample": sample_data  # Data sample for analysis
            }
        )

        if isinstance(response, str):
            try:
                return json.loads(response)
            except json.JSONDecodeError:
                logger.error("Generated descriptor is not valid JSON")
                return None
        else:
            return response

    except Exception as e:
        logger.error(f"Descriptor generation failed: {e}")
        return None


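# Note: this helper appears to be unused now that previewing goes through
# _process_data_pipeline; it is retained as a self-contained preview of the
# CSV/JSON parsing and mapping behaviour.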
def _auto_parse_preview(input_file, descriptor, max_records, logger):
    """Parse and preview data using the auto-generated descriptor"""
    try:
        # Simplified parsing logic for preview (reuse existing logic)
        format_info = descriptor.get('format', {})
        format_type = format_info.get('type', 'csv').lower()
        encoding = format_info.get('encoding', 'utf-8')

        with open(input_file, 'r', encoding=encoding) as f:
            raw_data = f.read()

        parsed_records = []

        if format_type == 'csv':
            import csv
            from io import StringIO

            options = format_info.get('options', {})
            delimiter = options.get('delimiter', ',')
            has_header = options.get('has_header', options.get('header', True))

            reader = csv.DictReader(StringIO(raw_data), delimiter=delimiter)
            if not has_header:
                first_row = next(reader)
                fieldnames = [f"field_{i+1}" for i in range(len(first_row))]
                reader = csv.DictReader(StringIO(raw_data), fieldnames=fieldnames, delimiter=delimiter)

            count = 0
            for row in reader:
                if count >= max_records:
                    break
                parsed_records.append(dict(row))
                count += 1

        elif format_type == 'json':
            data = json.loads(raw_data)

            if isinstance(data, list):
                parsed_records = data[:max_records]
            else:
                parsed_records = [data]

        # Apply basic field mappings for preview
        mappings = descriptor.get('mappings', [])
        preview_records = []

        for record in parsed_records:
            processed_record = {}
            for mapping in mappings:
                source_field = mapping.get('source_field')
                target_field = mapping.get('target_field', source_field)

                if source_field in record:
                    value = record[source_field]
                    processed_record[target_field] = str(value) if value is not None else ""

            if processed_record:  # Only add if we got some data
                preview_records.append(processed_record)

        return preview_records if preview_records else parsed_records

    except Exception as e:
        logger.error(f"Preview parsing failed: {e}")
        return None


def main():
    """Main entry point for the CLI."""

    parser = argparse.ArgumentParser(
        prog='tg-load-structured-data',
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Step 1: Analyze data and discover matching schemas
  %(prog)s --input customers.csv --discover-schema
  %(prog)s --input products.xml --discover-schema --sample-chars 1000

  # Step 2: Generate descriptor configuration from data sample
  %(prog)s --input customers.csv --generate-descriptor --schema-name customer --output descriptor.json
  %(prog)s --input products.xml --generate-descriptor --schema-name product --output xml_descriptor.json

  # Generate descriptor with custom sampling (more data for better analysis)
  %(prog)s --input large_dataset.csv --generate-descriptor --schema-name product --sample-chars 100000 --sample-size 500

  # Step 3: Parse data and review output without importing (supports CSV, JSON, XML)
  %(prog)s --input customers.csv --descriptor descriptor.json --parse-only --output parsed.json
  %(prog)s --input products.xml --descriptor xml_descriptor.json --parse-only

  # Step 4: Import data to TrustGraph using descriptor
  %(prog)s --input customers.csv --descriptor descriptor.json --load
  %(prog)s --input products.xml --descriptor xml_descriptor.json --load

  # FULLY AUTOMATIC: Discover schema + generate descriptor + import (zero manual steps!)
  %(prog)s --input customers.csv --auto
  %(prog)s --input products.xml --auto --dry-run  # Preview before importing

  # Dry run to validate without importing
  %(prog)s --input customers.csv --auto --dry-run

Use Cases:
  --auto               : 🚀 FULLY AUTOMATIC: Discover schema + generate descriptor + import data
                         (zero manual configuration required!)
  --discover-schema    : Diagnose which TrustGraph schemas might match your data
                         (uses --sample-chars to limit data sent for analysis)
  --generate-descriptor: Create/review the structured data language configuration
                         (uses --sample-chars to limit data sent for analysis)
  --parse-only         : Validate that parsed data looks correct before import
                         (uses --sample-size to limit records processed, ignores --sample-chars)
  --load               : Import data to TrustGraph using an existing descriptor
                         (requires --descriptor)

For more information on the descriptor format, see:
  docs/tech-specs/structured-data-descriptor.md
""",
    )

    # Required arguments
    parser.add_argument(
        '-u', '--api-url',
        default=default_url,
        help=f'TrustGraph API URL (default: {default_url})'
    )

    parser.add_argument(
        '-f', '--flow',
        default='default',
        help='TrustGraph flow name to use for prompts and import (default: default)'
    )

    parser.add_argument(
        '--user',
        default='trustgraph',
        help='User name for metadata (default: trustgraph)'
    )

    parser.add_argument(
        '--collection',
        default='default',
        help='Collection name for metadata (default: default)'
    )

    parser.add_argument(
        '-i', '--input',
        required=True,
        help='Path to input data file to process'
    )

    parser.add_argument(
        '-d', '--descriptor',
        help='Path to JSON descriptor configuration file (required for --load and --parse-only)'
    )

    # Operation modes (mutually exclusive)
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        '--discover-schema',
        action='store_true',
        help='Analyze data sample and discover matching TrustGraph schemas'
    )
    mode_group.add_argument(
        '--generate-descriptor',
        action='store_true',
        help='Generate descriptor configuration from data sample'
    )
    mode_group.add_argument(
        '--parse-only',
        action='store_true',
        help='Parse data using descriptor but don\'t import to TrustGraph'
    )
    mode_group.add_argument(
        '--load',
        action='store_true',
        help='Load data to TrustGraph using existing descriptor'
    )
    mode_group.add_argument(
        '--auto',
        action='store_true',
        help='Run full automatic pipeline: discover schema + generate descriptor + import data'
    )

    parser.add_argument(
        '-o', '--output',
        help='Output file path (for generated descriptors or parsed data)'
    )

    parser.add_argument(
        '--sample-size',
        type=int,
        default=100,
        help='Number of records to process (parse-only mode) or sample for analysis (default: 100)'
    )

    parser.add_argument(
        '--sample-chars',
        type=int,
        default=500,
        help='Maximum characters to read for sampling (discover-schema/generate-descriptor modes only, default: 500)'
    )

    parser.add_argument(
        '--schema-name',
        help='Target schema name for descriptor generation'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Validate configuration and data without importing (--auto mode only)'
    )

    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Enable verbose output for debugging'
    )

    parser.add_argument(
        '--batch-size',
        type=int,
        default=1000,
        help='Number of records to process in each batch (default: 1000)'
    )

    parser.add_argument(
        '--max-errors',
        type=int,
        default=100,
        help='Maximum number of errors before stopping (default: 100)'
    )

    parser.add_argument(
        '--error-file',
        help='Path to write error records (optional)'
    )

    parser.add_argument(
        '-t', '--token',
        default=default_token,
        help='Authentication token (default: $TRUSTGRAPH_TOKEN)',
    )

    args = parser.parse_args()

    # Input validation
    if not os.path.exists(args.input):
        print(f"Error: Input file not found: {args.input}", file=sys.stderr)
        sys.exit(1)

    # Mode-specific validation
    if args.parse_only and not args.descriptor:
        print("Error: --descriptor is required when using --parse-only", file=sys.stderr)
        sys.exit(1)

    if args.load and not args.descriptor:
        print("Error: --descriptor is required when using --load", file=sys.stderr)
        sys.exit(1)

    # Warn about irrelevant parameters
    if args.parse_only and args.sample_chars != 500:  # 500 is the default
        print("Warning: --sample-chars is ignored in --parse-only mode (entire file is processed)", file=sys.stderr)

    if (args.discover_schema or args.generate_descriptor) and args.sample_size != 100:  # 100 is the default
        print("Warning: --sample-size is ignored in analysis modes, use --sample-chars instead", file=sys.stderr)

    # Require explicit mode selection - no implicit behavior
    if not any([args.discover_schema, args.generate_descriptor, args.parse_only, args.load, args.auto]):
        print("Error: Must specify an operation mode", file=sys.stderr)
        print("Available modes:", file=sys.stderr)
        print("  --auto                : Discover schema + generate descriptor + import", file=sys.stderr)
        print("  --discover-schema     : Analyze data and discover schemas", file=sys.stderr)
        print("  --generate-descriptor : Generate descriptor from data", file=sys.stderr)
        print("  --parse-only          : Parse data without importing", file=sys.stderr)
        print("  --load                : Import data using existing descriptor", file=sys.stderr)
        sys.exit(1)

    try:
        load_structured_data(
            api_url=args.api_url,
            input_file=args.input,
            descriptor_file=args.descriptor,
            discover_schema=args.discover_schema,
            generate_descriptor=args.generate_descriptor,
            parse_only=args.parse_only,
            load=args.load,
            auto=args.auto,
            output_file=args.output,
            sample_size=args.sample_size,
            sample_chars=args.sample_chars,
            schema_name=args.schema_name,
            flow=args.flow,
            user=args.user,
            collection=args.collection,
            dry_run=args.dry_run,
            verbose=args.verbose,
            token=args.token
        )
    except FileNotFoundError as e:
        print(f"Error: File not found - {e}", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in descriptor - {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()