Additional user fixes and test fixes

This commit is contained in:
Cyber MacGeddon 2026-04-21 10:53:15 +01:00
parent db05427d0e
commit 7f0f79dd15
62 changed files with 1078 additions and 1315 deletions

View file

@ -12,7 +12,6 @@ from trustgraph.api import Api
from trustgraph.api.types import hash, Uri, Literal, Triple
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
default_user = 'trustgraph'
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
default_workspace = os.getenv("TRUSTGRAPH_WORKSPACE", "default")

View file

@ -40,7 +40,6 @@ def load_structured_data(
sample_chars: int = 500,
schema_name: str = None,
flow: str = 'default',
user: str = 'trustgraph',
collection: str = 'default',
dry_run: bool = False,
verbose: bool = False,
@ -64,7 +63,6 @@ def load_structured_data(
sample_chars: Maximum characters to read for sampling
schema_name: Target schema name for generation
flow: TrustGraph flow name to use for prompts
user: User name for metadata (default: trustgraph)
collection: Collection name for metadata (default: default)
dry_run: If True, validate but don't import data
verbose: Enable verbose logging
@ -112,7 +110,7 @@ def load_structured_data(
try:
# Use shared pipeline for preview (small sample)
preview_objects, _ = _process_data_pipeline(input_file, temp_descriptor.name, user, collection, sample_size=5)
preview_objects, _ = _process_data_pipeline(input_file, temp_descriptor.name, collection, sample_size=5)
# Show preview
print("📊 Data Preview (first few records):")
@ -133,7 +131,7 @@ def load_structured_data(
print("🚀 Importing data to TrustGraph...")
# Use shared pipeline for full processing (no sample limit)
output_objects, descriptor = _process_data_pipeline(input_file, temp_descriptor.name, user, collection)
output_objects, descriptor = _process_data_pipeline(input_file, temp_descriptor.name, collection)
# Get batch size from descriptor
batch_size = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)
@ -244,7 +242,7 @@ def load_structured_data(
logger.info(f"Parsing {input_file} with descriptor {descriptor_file}...")
# Use shared pipeline
output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, user, collection, sample_size)
output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, collection, sample_size)
# Output results
if output_file:
@ -288,7 +286,7 @@ def load_structured_data(
logger.info(f"Loading {input_file} to TrustGraph using descriptor {descriptor_file}...")
# Use shared pipeline (no sample_size limit for full load)
output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, user, collection)
output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, collection)
# Get batch size from descriptor or use default
batch_size = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)
@ -529,18 +527,17 @@ def _apply_transformations(records, mappings):
return processed_records
def _format_extracted_objects(processed_records, descriptor, user, collection):
def _format_extracted_objects(processed_records, descriptor, collection):
"""Convert to TrustGraph ExtractedObject format"""
output_records = []
schema_name = descriptor.get('output', {}).get('schema_name', 'default')
confidence = descriptor.get('output', {}).get('options', {}).get('confidence', 0.9)
for record in processed_records:
output_record = {
"metadata": {
"id": f"parsed-{len(output_records)+1}",
"metadata": [], # Empty metadata triples
"user": user,
"collection": collection
},
"schema_name": schema_name,
@ -553,7 +550,7 @@ def _format_extracted_objects(processed_records, descriptor, user, collection):
return output_records
def _process_data_pipeline(input_file, descriptor_file, user, collection, sample_size=None):
def _process_data_pipeline(input_file, descriptor_file, collection, sample_size=None):
"""Shared pipeline: load descriptor → read → parse → transform → format"""
# Load descriptor configuration
descriptor = _load_descriptor(descriptor_file)
@ -570,7 +567,7 @@ def _process_data_pipeline(input_file, descriptor_file, user, collection, sample
processed_records = _apply_transformations(parsed_records, mappings)
# Format output for TrustGraph ExtractedObject structure
output_records = _format_extracted_objects(processed_records, descriptor, user, collection)
output_records = _format_extracted_objects(processed_records, descriptor, collection)
return output_records, descriptor
@ -1048,7 +1045,6 @@ For more information on the descriptor format, see:
sample_chars=args.sample_chars,
schema_name=args.schema_name,
flow=args.flow,
user=args.user,
collection=args.collection,
dry_run=args.dry_run,
verbose=args.verbose,