Object batching (#499)

* Object batching

* Update tests
cybermaggedon 2025-09-05 15:59:06 +01:00 committed by GitHub
parent ebca467ed8
commit 0b7620bc04
12 changed files with 946 additions and 107 deletions


@@ -256,31 +256,34 @@ class Processor(FlowProcessor):
                     flow
                 )
 
-                # Emit each extracted object
-                for obj in objects:
+                # Emit extracted objects as a batch if any were found
+                if objects:
 
                     # Calculate confidence (could be enhanced with actual confidence from prompt)
                     confidence = 0.8  # Default confidence
 
-                    # Convert all values to strings for Pulsar compatibility
-                    string_values = convert_values_to_strings(obj)
+                    # Convert all objects' values to strings for Pulsar compatibility
+                    batch_values = []
+                    for obj in objects:
+                        string_values = convert_values_to_strings(obj)
+                        batch_values.append(string_values)
 
-                    # Create ExtractedObject
+                    # Create ExtractedObject with batched values
                     extracted = ExtractedObject(
                         metadata=Metadata(
-                            id=f"{v.metadata.id}:{schema_name}:{hash(str(obj))}",
+                            id=f"{v.metadata.id}:{schema_name}",
                             metadata=[],
                             user=v.metadata.user,
                             collection=v.metadata.collection,
                         ),
                         schema_name=schema_name,
-                        values=string_values,
+                        values=batch_values,  # Array of objects
                         confidence=confidence,
                         source_span=chunk_text[:100]  # First 100 chars as source reference
                     )
 
                     await flow("output").send(extracted)
-                    logger.debug(f"Emitted extracted object for schema {schema_name}")
+                    logger.debug(f"Emitted batch of {len(objects)} objects for schema {schema_name}")
 
         except Exception as e:
             logger.error(f"Object extraction exception: {e}", exc_info=True)


@@ -44,6 +44,12 @@ class ObjectsImport:
 
         data = msg.json()
 
+        # Handle both single object and array of objects for backward compatibility
+        values_data = data["values"]
+        if not isinstance(values_data, list):
+            # Single object - wrap in array
+            values_data = [values_data]
+
         elt = ExtractedObject(
             metadata=Metadata(
                 id=data["metadata"]["id"],
@@ -52,7 +58,7 @@ class ObjectsImport:
                 collection=data["metadata"]["collection"],
             ),
             schema_name=data["schema_name"],
-            values=data["values"],
+            values=values_data,
             confidence=data.get("confidence", 1.0),
             source_span=data.get("source_span", ""),
         )
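
The import path keeps old producers working by normalising both wire formats to a list. A small sketch of that normalisation, with assumed payloads rather than actual wire captures:

    def normalise_values(data: dict) -> list:
        values_data = data["values"]
        if not isinstance(values_data, list):
            # Legacy single-object message - wrap it in an array
            values_data = [values_data]
        return values_data

    legacy_msg = {"values": {"name": "Alice"}}
    batched_msg = {"values": [{"name": "Alice"}, {"name": "Bob"}]}

    assert normalise_values(legacy_msg) == [{"name": "Alice"}]
    assert normalise_values(batched_msg) == [{"name": "Alice"}, {"name": "Bob"}]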


@@ -311,7 +311,7 @@ class Processor(FlowProcessor):
         """Process incoming ExtractedObject and store in Cassandra"""
         obj = msg.value()
 
-        logger.info(f"Storing object for schema {obj.schema_name} from {obj.metadata.id}")
+        logger.info(f"Storing {len(obj.values)} objects for schema {obj.schema_name} from {obj.metadata.id}")
 
         # Get schema definition
         schema = self.schemas.get(obj.schema_name)
@@ -328,59 +328,67 @@ class Processor(FlowProcessor):
         safe_keyspace = self.sanitize_name(keyspace)
         safe_table = self.sanitize_table(table_name)
 
-        # Build column names and values
-        columns = ["collection"]
-        values = [obj.metadata.collection]
-        placeholders = ["%s"]
-
-        # Check if we need a synthetic ID
-        has_primary_key = any(field.primary for field in schema.fields)
-        if not has_primary_key:
-            import uuid
-            columns.append("synthetic_id")
-            values.append(uuid.uuid4())
-            placeholders.append("%s")
-
-        # Process fields
-        for field in schema.fields:
-            safe_field_name = self.sanitize_name(field.name)
-            raw_value = obj.values.get(field.name)
-
-            # Handle required fields
-            if field.required and raw_value is None:
-                logger.warning(f"Required field {field.name} is missing in object")
-                # Continue anyway - Cassandra doesn't enforce NOT NULL
-
-            # Check if primary key field is NULL
-            if field.primary and raw_value is None:
-                logger.error(f"Primary key field {field.name} cannot be NULL - skipping object")
-                return
-
-            # Convert value to appropriate type
-            converted_value = self.convert_value(raw_value, field.type)
-
-            columns.append(safe_field_name)
-            values.append(converted_value)
-            placeholders.append("%s")
-
-        # Build and execute insert query
-        insert_cql = f"""
-        INSERT INTO {safe_keyspace}.{safe_table} ({', '.join(columns)})
-        VALUES ({', '.join(placeholders)})
-        """
-
-        # Debug: Show data being inserted
-        logger.debug(f"Storing {obj.schema_name}: {dict(zip(columns, values))}")
-        if len(columns) != len(values) or len(columns) != len(placeholders):
-            raise ValueError(f"Mismatch in counts - columns: {len(columns)}, values: {len(values)}, placeholders: {len(placeholders)}")
-
-        try:
-            # Convert to tuple - Cassandra driver requires tuple for parameters
-            self.session.execute(insert_cql, tuple(values))
-        except Exception as e:
-            logger.error(f"Failed to insert object: {e}", exc_info=True)
-            raise
+        # Process each object in the batch
+        for obj_index, value_map in enumerate(obj.values):
+
+            # Build column names and values for this object
+            columns = ["collection"]
+            values = [obj.metadata.collection]
+            placeholders = ["%s"]
+
+            # Check if we need a synthetic ID
+            has_primary_key = any(field.primary for field in schema.fields)
+            if not has_primary_key:
+                import uuid
+                columns.append("synthetic_id")
+                values.append(uuid.uuid4())
+                placeholders.append("%s")
+
+            # Process fields for this object
+            skip_object = False
+            for field in schema.fields:
+                safe_field_name = self.sanitize_name(field.name)
+                raw_value = value_map.get(field.name)
+
+                # Handle required fields
+                if field.required and raw_value is None:
+                    logger.warning(f"Required field {field.name} is missing in object {obj_index}")
+                    # Continue anyway - Cassandra doesn't enforce NOT NULL
+
+                # Check if primary key field is NULL
+                if field.primary and raw_value is None:
+                    logger.error(f"Primary key field {field.name} cannot be NULL - skipping object {obj_index}")
+                    skip_object = True
+                    break
+
+                # Convert value to appropriate type
+                converted_value = self.convert_value(raw_value, field.type)
+
+                columns.append(safe_field_name)
+                values.append(converted_value)
+                placeholders.append("%s")
+
+            # Skip this object if primary key validation failed
+            if skip_object:
+                continue
+
+            # Build and execute insert query for this object
+            insert_cql = f"""
+            INSERT INTO {safe_keyspace}.{safe_table} ({', '.join(columns)})
+            VALUES ({', '.join(placeholders)})
+            """
+
+            # Debug: Show data being inserted
+            logger.debug(f"Storing {obj.schema_name} object {obj_index}: {dict(zip(columns, values))}")
+            if len(columns) != len(values) or len(columns) != len(placeholders):
+                raise ValueError(f"Mismatch in counts - columns: {len(columns)}, values: {len(values)}, placeholders: {len(placeholders)}")
+
+            try:
+                # Convert to tuple - Cassandra driver requires tuple for parameters
+                self.session.execute(insert_cql, tuple(values))
+            except Exception as e:
+                logger.error(f"Failed to insert object {obj_index}: {e}", exc_info=True)
+                raise
 
     def close(self):
         """Clean up Cassandra connections"""