Feature/prompts jsonl (#619)

* Tech spec

* JSONL implementation complete

* Updated prompt client users

* Fix tests
This commit is contained in:
cybermaggedon 2026-01-26 17:38:00 +00:00 committed by GitHub
parent e4f0013841
commit e214eb4e02
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 1292 additions and 463 deletions

View file

@ -126,16 +126,42 @@ class Processor(FlowProcessor):
await pub.send(ecs)
def parse_json(self, text):
json_match = re.search(r'```(?:json)?(.*?)```', text, re.DOTALL)
if json_match:
json_str = json_match.group(1).strip()
else:
# If no delimiters, assume the entire output is JSON
json_str = text.strip()
def parse_jsonl(self, text):
    """
    Parse JSONL response, returning list of valid objects.

    Invalid lines (malformed JSON, empty lines) are skipped with warnings.
    This provides truncation resilience - partial output yields partial
    results.
    """
    results = []
    # Strip markdown code fences if present
    text = text.strip()
    if text.startswith('```'):
        # Remove opening fence (possibly with language hint).  Try the
        # longer 'jsonl' alternative first: with (?:json|jsonl)? the regex
        # engine commits to 'json' and leaves a stray 'l' on the first
        # data line, which then fails to parse.
        text = re.sub(r'^```(?:jsonl|json)?\s*\n?', '', text)
    if text.endswith('```'):
        text = text[:-3]
    for line_num, line in enumerate(text.strip().split('\n'), 1):
        line = line.strip()
        # Skip empty lines
        if not line:
            continue
        # Skip any remaining fence markers
        if line.startswith('```'):
            continue
        try:
            results.append(json.loads(line))
        except json.JSONDecodeError as e:
            # Log warning but continue - this provides truncation resilience
            logger.warning(f"JSONL parse error on line {line_num}: {e}")
    return results
async def on_message(self, msg, consumer, flow):
@ -178,11 +204,12 @@ class Processor(FlowProcessor):
question = prompt
)
# Parse JSON response
try:
extraction_data = self.parse_json(agent_response)
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON response from agent: {e}")
# Parse JSONL response
extraction_data = self.parse_jsonl(agent_response)
if not extraction_data:
logger.warning("JSONL parse returned no valid objects")
return
# Process extraction data
triples, entity_contexts = self.process_extraction_data(
@ -209,12 +236,21 @@ class Processor(FlowProcessor):
raise
def process_extraction_data(self, data, metadata):
"""Process combined extraction data to generate triples and entity contexts"""
"""Process JSONL extraction data to generate triples and entity contexts.
Data is a flat list of objects with 'type' discriminator field:
- {"type": "definition", "entity": "...", "definition": "..."}
- {"type": "relationship", "subject": "...", "predicate": "...", "object": "...", "object-entity": bool}
"""
triples = []
entity_contexts = []
# Categorize items by type
definitions = [item for item in data if item.get("type") == "definition"]
relationships = [item for item in data if item.get("type") == "relationship"]
# Process definitions
for defn in data.get("definitions", []):
for defn in definitions:
entity_uri = self.to_uri(defn["entity"])
@ -247,17 +283,18 @@ class Processor(FlowProcessor):
))
# Process relationships
for rel in data.get("relationships", []):
for rel in relationships:
subject_uri = self.to_uri(rel["subject"])
predicate_uri = self.to_uri(rel["predicate"])
subject_value = Value(value=subject_uri, is_uri=True)
predicate_value = Value(value=predicate_uri, is_uri=True)
if data.get("object-entity", False):
object_value = Value(value=predicate_uri, is_uri=True)
if rel.get("object-entity", True):
object_uri = self.to_uri(rel["object"])
object_value = Value(value=object_uri, is_uri=True)
else:
object_value = Value(value=predicate_uri, is_uri=False)
object_value = Value(value=rel["object"], is_uri=False)
# Add subject and predicate labels
triples.append(Triple(

View file

@ -49,8 +49,17 @@ class ExtractionResult:
def parse_extraction_response(response: Any) -> Optional[ExtractionResult]:
"""Parse LLM extraction response into structured format.
Supports two formats:
1. JSONL format (list): Flat list of objects with 'type' discriminator field
[{"type": "entity", ...}, {"type": "relationship", ...}, {"type": "attribute", ...}]
2. Legacy format (dict): Nested structure with separate arrays
{"entities": [...], "relationships": [...], "attributes": [...]}
Args:
response: LLM response (string JSON or already parsed dict)
response: LLM response - can be:
- string (JSON to parse)
- dict (legacy nested format)
- list (JSONL format - flat list with type discriminators)
Returns:
ExtractionResult with parsed entities/relationships/attributes,
@ -64,17 +73,89 @@ def parse_extraction_response(response: Any) -> Optional[ExtractionResult]:
logger.error(f"Failed to parse JSON response: {e}")
logger.debug(f"Response was: {response[:500]}")
return None
elif isinstance(response, dict):
elif isinstance(response, (dict, list)):
data = response
else:
logger.error(f"Unexpected response type: {type(response)}")
return None
# Validate structure
if not isinstance(data, dict):
logger.error(f"Expected dict, got {type(data)}")
return None
# Handle JSONL format (flat list with type discriminators)
if isinstance(data, list):
return parse_jsonl_format(data)
# Handle legacy format (nested dict)
if isinstance(data, dict):
return parse_legacy_format(data)
logger.error(f"Expected dict or list, got {type(data)}")
return None
def parse_jsonl_format(data: List[Dict[str, Any]]) -> ExtractionResult:
    """Parse a JSONL-format response (flat list with type discriminators).

    Each item carries a 'type' field naming its kind: 'entity',
    'relationship', or 'attribute'.  Unknown kinds and non-dict items
    are logged and skipped; a parser failure skips just that item.

    Args:
        data: List of dicts, each with a 'type' discriminator

    Returns:
        ExtractionResult with items sorted into their categories
    """
    # One bucket and one parser per discriminator value.
    buckets: Dict[str, list] = {
        'entity': [],
        'relationship': [],
        'attribute': [],
    }
    parsers = {
        'entity': parse_entity_jsonl,
        'relationship': parse_relationship,
        'attribute': parse_attribute,
    }

    for item in data:
        if not isinstance(item, dict):
            logger.warning(f"Skipping non-dict item: {type(item)}")
            continue

        kind = item.get('type')
        parser = parsers.get(kind)
        if parser is None:
            logger.warning(f"Unknown item type '{kind}': {item}")
            continue

        try:
            parsed = parser(item)
            if parsed:
                buckets[kind].append(parsed)
        except Exception as e:
            logger.warning(f"Failed to parse {kind} {item}: {e}")

    return ExtractionResult(
        entities=buckets['entity'],
        relationships=buckets['relationship'],
        attributes=buckets['attribute']
    )
def parse_legacy_format(data: Dict[str, Any]) -> ExtractionResult:
"""Parse legacy format response (nested dict with arrays).
Args:
data: Dict with 'entities', 'relationships', 'attributes' arrays
Returns:
ExtractionResult with parsed items
"""
# Parse entities
entities = []
entities_data = data.get('entities', [])
@ -127,6 +208,37 @@ def parse_extraction_response(response: Any) -> Optional[ExtractionResult]:
)
def parse_entity_jsonl(data: Dict[str, Any]) -> Optional[Entity]:
    """Parse an entity record from a JSONL-format dict.

    The JSONL representation stores the entity's own type under
    'entity_type', because 'type' is reserved for the record
    discriminator.

    Args:
        data: Dict expected to carry 'entity' and 'entity_type' strings

    Returns:
        Entity on success, or None when the record is malformed
    """
    if not isinstance(data, dict):
        logger.warning(f"Entity data is not a dict: {type(data)}")
        return None

    name = data.get('entity')
    # 'entity_type' rather than 'type' - see docstring.
    kind = data.get('entity_type')

    # Reject missing or empty fields before type-checking them.
    if not name or not kind:
        logger.warning(f"Missing required fields in entity: {data}")
        return None

    if not (isinstance(name, str) and isinstance(kind, str)):
        logger.warning(f"Entity fields must be strings: {data}")
        return None

    return Entity(entity=name, type=kind)
def parse_entity(data: Dict[str, Any]) -> Optional[Entity]:
"""Parse entity from dict.

View file

@ -83,7 +83,7 @@ class PromptManager:
def parse_json(self, text):
json_match = re.search(r'```(?:json)?(.*?)```', text, re.DOTALL)
if json_match:
json_str = json_match.group(1).strip()
else:
@ -92,6 +92,43 @@ class PromptManager:
return json.loads(json_str)
def parse_jsonl(self, text):
    """
    Parse JSONL response, returning list of valid objects.

    Invalid lines (malformed JSON, empty lines) are skipped with warnings.
    This provides truncation resilience - partial output yields partial
    results.
    """
    results = []
    # Strip markdown code fences if present
    text = text.strip()
    if text.startswith('```'):
        # Remove opening fence (possibly with language hint).  Try the
        # longer 'jsonl' alternative first: with (?:json|jsonl)? the regex
        # engine commits to 'json' and leaves a stray 'l' on the first
        # data line, which then fails to parse.
        text = re.sub(r'^```(?:jsonl|json)?\s*\n?', '', text)
    if text.endswith('```'):
        text = text[:-3]
    for line_num, line in enumerate(text.strip().split('\n'), 1):
        line = line.strip()
        # Skip empty lines
        if not line:
            continue
        # Skip any remaining fence markers
        if line.startswith('```'):
            continue
        try:
            results.append(json.loads(line))
        except json.JSONDecodeError as e:
            # Log warning but continue - this provides truncation resilience
            logger.warning(f"JSONL parse error on line {line_num}: {e}")
    return results
def render(self, id, input):
if id not in self.prompts:
@ -121,21 +158,41 @@ class PromptManager:
if resp_type == "text":
return resp
if resp_type != "json":
raise RuntimeError(f"Response type {resp_type} not known")
try:
obj = self.parse_json(resp)
except:
logger.error(f"JSON parse failed: {resp}")
raise RuntimeError("JSON parse fail")
if self.prompts[id].schema:
if resp_type == "json":
try:
validate(instance=obj, schema=self.prompts[id].schema)
logger.debug("Schema validation successful")
except Exception as e:
raise RuntimeError(f"Schema validation fail: {e}")
obj = self.parse_json(resp)
except:
logger.error(f"JSON parse failed: {resp}")
raise RuntimeError("JSON parse fail")
return obj
if self.prompts[id].schema:
try:
validate(instance=obj, schema=self.prompts[id].schema)
logger.debug("Schema validation successful")
except Exception as e:
raise RuntimeError(f"Schema validation fail: {e}")
return obj
if resp_type == "jsonl":
objects = self.parse_jsonl(resp)
if not objects:
logger.warning("JSONL parse returned no valid objects")
return []
# Validate each object against schema if provided
if self.prompts[id].schema:
validated = []
for i, obj in enumerate(objects):
try:
validate(instance=obj, schema=self.prompts[id].schema)
validated.append(obj)
except Exception as e:
logger.warning(f"Object {i} failed schema validation: {e}")
return validated
return objects
raise RuntimeError(f"Response type {resp_type} not known")