mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-06-13 08:45:13 +02:00
Additional user fixes and test fixes
This commit is contained in:
parent
db05427d0e
commit
7f0f79dd15
62 changed files with 1078 additions and 1315 deletions
|
|
@@ -12,7 +12,6 @@ from trustgraph.api import Api
|
|||
from trustgraph.api.types import hash, Uri, Literal, Triple
|
||||
|
||||
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
|
||||
default_user = 'trustgraph'
|
||||
default_token = os.getenv("TRUSTGRAPH_TOKEN", None)
|
||||
default_workspace = os.getenv("TRUSTGRAPH_WORKSPACE", "default")
|
||||
|
||||
|
|
|
|||
|
|
@@ -40,7 +40,6 @@ def load_structured_data(
|
|||
sample_chars: int = 500,
|
||||
schema_name: str = None,
|
||||
flow: str = 'default',
|
||||
user: str = 'trustgraph',
|
||||
collection: str = 'default',
|
||||
dry_run: bool = False,
|
||||
verbose: bool = False,
|
||||
|
|
@@ -64,7 +63,6 @@ def load_structured_data(
|
|||
sample_chars: Maximum characters to read for sampling
|
||||
schema_name: Target schema name for generation
|
||||
flow: TrustGraph flow name to use for prompts
|
||||
user: User name for metadata (default: trustgraph)
|
||||
collection: Collection name for metadata (default: default)
|
||||
dry_run: If True, validate but don't import data
|
||||
verbose: Enable verbose logging
|
||||
|
|
@@ -112,7 +110,7 @@ def load_structured_data(
|
|||
|
||||
try:
|
||||
# Use shared pipeline for preview (small sample)
|
||||
preview_objects, _ = _process_data_pipeline(input_file, temp_descriptor.name, user, collection, sample_size=5)
|
||||
preview_objects, _ = _process_data_pipeline(input_file, temp_descriptor.name, collection, sample_size=5)
|
||||
|
||||
# Show preview
|
||||
print("📊 Data Preview (first few records):")
|
||||
|
|
@@ -133,7 +131,7 @@ def load_structured_data(
|
|||
print("🚀 Importing data to TrustGraph...")
|
||||
|
||||
# Use shared pipeline for full processing (no sample limit)
|
||||
output_objects, descriptor = _process_data_pipeline(input_file, temp_descriptor.name, user, collection)
|
||||
output_objects, descriptor = _process_data_pipeline(input_file, temp_descriptor.name, collection)
|
||||
|
||||
# Get batch size from descriptor
|
||||
batch_size = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)
|
||||
|
|
@@ -244,7 +242,7 @@ def load_structured_data(
|
|||
logger.info(f"Parsing {input_file} with descriptor {descriptor_file}...")
|
||||
|
||||
# Use shared pipeline
|
||||
output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, user, collection, sample_size)
|
||||
output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, collection, sample_size)
|
||||
|
||||
# Output results
|
||||
if output_file:
|
||||
|
|
@@ -288,7 +286,7 @@ def load_structured_data(
|
|||
logger.info(f"Loading {input_file} to TrustGraph using descriptor {descriptor_file}...")
|
||||
|
||||
# Use shared pipeline (no sample_size limit for full load)
|
||||
output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, user, collection)
|
||||
output_records, descriptor = _process_data_pipeline(input_file, descriptor_file, collection)
|
||||
|
||||
# Get batch size from descriptor or use default
|
||||
batch_size = descriptor.get('output', {}).get('options', {}).get('batch_size', 1000)
|
||||
|
|
@@ -529,18 +527,17 @@ def _apply_transformations(records, mappings):
|
|||
return processed_records
|
||||
|
||||
|
||||
def _format_extracted_objects(processed_records, descriptor, user, collection):
|
||||
def _format_extracted_objects(processed_records, descriptor, collection):
|
||||
"""Convert to TrustGraph ExtractedObject format"""
|
||||
output_records = []
|
||||
schema_name = descriptor.get('output', {}).get('schema_name', 'default')
|
||||
confidence = descriptor.get('output', {}).get('options', {}).get('confidence', 0.9)
|
||||
|
||||
|
||||
for record in processed_records:
|
||||
output_record = {
|
||||
"metadata": {
|
||||
"id": f"parsed-{len(output_records)+1}",
|
||||
"metadata": [], # Empty metadata triples
|
||||
"user": user,
|
||||
"collection": collection
|
||||
},
|
||||
"schema_name": schema_name,
|
||||
|
|
@@ -553,7 +550,7 @@ def _format_extracted_objects(processed_records, descriptor, user, collection):
|
|||
return output_records
|
||||
|
||||
|
||||
def _process_data_pipeline(input_file, descriptor_file, user, collection, sample_size=None):
|
||||
def _process_data_pipeline(input_file, descriptor_file, collection, sample_size=None):
|
||||
"""Shared pipeline: load descriptor → read → parse → transform → format"""
|
||||
# Load descriptor configuration
|
||||
descriptor = _load_descriptor(descriptor_file)
|
||||
|
|
@@ -570,7 +567,7 @@ def _process_data_pipeline(input_file, descriptor_file, user, collection, sample
|
|||
processed_records = _apply_transformations(parsed_records, mappings)
|
||||
|
||||
# Format output for TrustGraph ExtractedObject structure
|
||||
output_records = _format_extracted_objects(processed_records, descriptor, user, collection)
|
||||
output_records = _format_extracted_objects(processed_records, descriptor, collection)
|
||||
|
||||
return output_records, descriptor
|
||||
|
||||
|
|
@@ -1048,7 +1045,6 @@ For more information on the descriptor format, see:
|
|||
sample_chars=args.sample_chars,
|
||||
schema_name=args.schema_name,
|
||||
flow=args.flow,
|
||||
user=args.user,
|
||||
collection=args.collection,
|
||||
dry_run=args.dry_run,
|
||||
verbose=args.verbose,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue