mirror of https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-29 18:36:22 +02:00
parent ebca467ed8
commit 0b7620bc04
12 changed files with 946 additions and 107 deletions
@@ -256,31 +256,34 @@ class Processor(FlowProcessor):
                 flow
             )
 
-            # Emit each extracted object
-            for obj in objects:
+            # Emit extracted objects as a batch if any were found
+            if objects:
 
                 # Calculate confidence (could be enhanced with actual confidence from prompt)
                 confidence = 0.8  # Default confidence
 
-                # Convert all values to strings for Pulsar compatibility
-                string_values = convert_values_to_strings(obj)
+                # Convert all objects' values to strings for Pulsar compatibility
+                batch_values = []
+                for obj in objects:
+                    string_values = convert_values_to_strings(obj)
+                    batch_values.append(string_values)
 
-                # Create ExtractedObject
+                # Create ExtractedObject with batched values
                 extracted = ExtractedObject(
                     metadata=Metadata(
-                        id=f"{v.metadata.id}:{schema_name}:{hash(str(obj))}",
+                        id=f"{v.metadata.id}:{schema_name}",
                         metadata=[],
                         user=v.metadata.user,
                         collection=v.metadata.collection,
                     ),
                     schema_name=schema_name,
-                    values=string_values,
+                    values=batch_values,  # Array of objects
                     confidence=confidence,
                     source_span=chunk_text[:100]  # First 100 chars as source reference
                 )
 
                 await flow("output").send(extracted)
-                logger.debug(f"Emitted extracted object for schema {schema_name}")
+                logger.debug(f"Emitted batch of {len(objects)} objects for schema {schema_name}")
 
         except Exception as e:
             logger.error(f"Object extraction exception: {e}", exc_info=True)
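Net effect of this hunk: one Pulsar message now carries every object extracted from a chunk for a given schema, instead of one message per object, and the message id drops the per-object hash() suffix so it is stable per chunk and schema. A minimal sketch of the batching step, with convert_values_to_strings reduced to a hypothetical str() walk (the real helper is defined elsewhere in this module):

    # Hypothetical stand-in for convert_values_to_strings: the Pulsar-side
    # record fields are string-typed, so scalars are coerced before sending.
    def convert_values_to_strings(obj: dict) -> dict:
        return {k: str(v) for k, v in obj.items()}

    objects = [{"name": "Ada", "born": 1815}, {"name": "Alan", "born": 1912}]

    # One list, one send - this replaces the old per-object loop around send().
    batch_values = [convert_values_to_strings(obj) for obj in objects]
    assert batch_values[0]["born"] == "1815"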
@@ -44,6 +44,12 @@ class ObjectsImport:
 
         data = msg.json()
 
+        # Handle both single object and array of objects for backward compatibility
+        values_data = data["values"]
+        if not isinstance(values_data, list):
+            # Single object - wrap in array
+            values_data = [values_data]
+
         elt = ExtractedObject(
             metadata=Metadata(
                 id=data["metadata"]["id"],
@@ -52,7 +58,7 @@ class ObjectsImport:
                 collection=data["metadata"]["collection"],
             ),
             schema_name=data["schema_name"],
-            values=data["values"],
+            values=values_data,
             confidence=data.get("confidence", 1.0),
             source_span=data.get("source_span", ""),
         )
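These two importer hunks keep old exports loadable: a bare dict under "values" is treated as a legacy single object and wrapped, so everything downstream can iterate over a list of value maps. A self-contained sketch of that rule, with both payload shapes assumed from this diff:

    # Assumed payload shapes: "values" was a single map, and is now a list.
    legacy  = {"values": {"name": "Ada"}}
    batched = {"values": [{"name": "Ada"}, {"name": "Alan"}]}

    def normalize_values(data: dict) -> list:
        values = data["values"]
        # A bare dict is a legacy single object; wrap it so callers
        # always see a list of value maps.
        return values if isinstance(values, list) else [values]

    assert normalize_values(legacy) == [{"name": "Ada"}]
    assert len(normalize_values(batched)) == 2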
@@ -311,7 +311,7 @@ class Processor(FlowProcessor):
         """Process incoming ExtractedObject and store in Cassandra"""
 
         obj = msg.value()
-        logger.info(f"Storing object for schema {obj.schema_name} from {obj.metadata.id}")
+        logger.info(f"Storing {len(obj.values)} objects for schema {obj.schema_name} from {obj.metadata.id}")
 
         # Get schema definition
         schema = self.schemas.get(obj.schema_name)
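The storage side now reads obj.values as a list, hence the count in the log line. For orientation, an illustrative before/after of the message body (only fields visible in this diff; the schema name is made up):

    # Before this commit: "values" held one value map per message.
    before = {"schema_name": "person", "values": {"name": "Ada"}, "confidence": 0.8}

    # After: "values" holds every object extracted from the chunk.
    after = {"schema_name": "person",
             "values": [{"name": "Ada"}, {"name": "Alan"}],
             "confidence": 0.8}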
@@ -328,59 +328,67 @@ class Processor(FlowProcessor):
         safe_keyspace = self.sanitize_name(keyspace)
         safe_table = self.sanitize_table(table_name)
 
-        # Build column names and values
-        columns = ["collection"]
-        values = [obj.metadata.collection]
-        placeholders = ["%s"]
-
-        # Check if we need a synthetic ID
-        has_primary_key = any(field.primary for field in schema.fields)
-        if not has_primary_key:
-            import uuid
-            columns.append("synthetic_id")
-            values.append(uuid.uuid4())
-            placeholders.append("%s")
-
-        # Process fields
-        for field in schema.fields:
-            safe_field_name = self.sanitize_name(field.name)
-            raw_value = obj.values.get(field.name)
-
-            # Handle required fields
-            if field.required and raw_value is None:
-                logger.warning(f"Required field {field.name} is missing in object")
-                # Continue anyway - Cassandra doesn't enforce NOT NULL
-
-            # Check if primary key field is NULL
-            if field.primary and raw_value is None:
-                logger.error(f"Primary key field {field.name} cannot be NULL - skipping object")
-                return
-
-            # Convert value to appropriate type
-            converted_value = self.convert_value(raw_value, field.type)
-
-            columns.append(safe_field_name)
-            values.append(converted_value)
-            placeholders.append("%s")
-
-        # Build and execute insert query
-        insert_cql = f"""
-            INSERT INTO {safe_keyspace}.{safe_table} ({', '.join(columns)})
-            VALUES ({', '.join(placeholders)})
-        """
-
-        # Debug: Show data being inserted
-        logger.debug(f"Storing {obj.schema_name}: {dict(zip(columns, values))}")
-
-        if len(columns) != len(values) or len(columns) != len(placeholders):
-            raise ValueError(f"Mismatch in counts - columns: {len(columns)}, values: {len(values)}, placeholders: {len(placeholders)}")
-
-        try:
-            # Convert to tuple - Cassandra driver requires tuple for parameters
-            self.session.execute(insert_cql, tuple(values))
-        except Exception as e:
-            logger.error(f"Failed to insert object: {e}", exc_info=True)
-            raise
+        # Process each object in the batch
+        for obj_index, value_map in enumerate(obj.values):
+            # Build column names and values for this object
+            columns = ["collection"]
+            values = [obj.metadata.collection]
+            placeholders = ["%s"]
+
+            # Check if we need a synthetic ID
+            has_primary_key = any(field.primary for field in schema.fields)
+            if not has_primary_key:
+                import uuid
+                columns.append("synthetic_id")
+                values.append(uuid.uuid4())
+                placeholders.append("%s")
+
+            # Process fields for this object
+            skip_object = False
+            for field in schema.fields:
+                safe_field_name = self.sanitize_name(field.name)
+                raw_value = value_map.get(field.name)
+
+                # Handle required fields
+                if field.required and raw_value is None:
+                    logger.warning(f"Required field {field.name} is missing in object {obj_index}")
+                    # Continue anyway - Cassandra doesn't enforce NOT NULL
+
+                # Check if primary key field is NULL
+                if field.primary and raw_value is None:
+                    logger.error(f"Primary key field {field.name} cannot be NULL - skipping object {obj_index}")
+                    skip_object = True
+                    break
+
+                # Convert value to appropriate type
+                converted_value = self.convert_value(raw_value, field.type)
+
+                columns.append(safe_field_name)
+                values.append(converted_value)
+                placeholders.append("%s")
+
+            # Skip this object if primary key validation failed
+            if skip_object:
+                continue
+
+            # Build and execute insert query for this object
+            insert_cql = f"""
+                INSERT INTO {safe_keyspace}.{safe_table} ({', '.join(columns)})
+                VALUES ({', '.join(placeholders)})
+            """
+
+            # Debug: Show data being inserted
+            logger.debug(f"Storing {obj.schema_name} object {obj_index}: {dict(zip(columns, values))}")
+
+            if len(columns) != len(values) or len(columns) != len(placeholders):
+                raise ValueError(f"Mismatch in counts - columns: {len(columns)}, values: {len(values)}, placeholders: {len(placeholders)}")
+
+            try:
+                # Convert to tuple - Cassandra driver requires tuple for parameters
+                self.session.execute(insert_cql, tuple(values))
+            except Exception as e:
+                logger.error(f"Failed to insert object {obj_index}: {e}", exc_info=True)
+                raise
 
     def close(self):
         """Clean up Cassandra connections"""
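The storage hunk moves the column/value assembly inside a per-object loop: each value map in the batch becomes its own parameterized INSERT, and a NULL primary key now skips just that object (skip_object + continue) instead of aborting the whole message with return. A standalone sketch of the same loop against the DataStax Python driver, with a made-up demo.person table standing in for the processor's sanitized, schema-driven table handling:

    from cassandra.cluster import Cluster

    # Hypothetical table for illustration:
    # CREATE TABLE demo.person (collection text, name text, born text,
    #                           PRIMARY KEY (collection, name))
    cluster = Cluster(["localhost"])
    session = cluster.connect()

    insert_cql = "INSERT INTO demo.person (collection, name, born) VALUES (%s, %s, %s)"

    batch_values = [
        {"name": "Ada", "born": "1815"},
        {"born": "1912"},                 # missing primary key -> skipped
    ]

    for obj_index, value_map in enumerate(batch_values):
        if value_map.get("name") is None:
            # Mirrors the diff: a NULL key skips this object, not the batch
            continue
        # The driver takes a tuple (or list) of parameters for %s placeholders
        session.execute(insert_cql, ("default", value_map["name"], value_map["born"]))

One consequence worth noting: the inserts stay independent, so if object i fails, objects 0..i-1 are already written and the raise leaves the rest of the batch unprocessed. The driver's BatchStatement could group them, though Cassandra batches are meant for atomicity across related rows rather than bulk throughput, so per-row inserts (or a prepared statement to avoid re-parsing the CQL each time) are a defensible choice here.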