Feature/prompts jsonl (#619)

* Tech spec

* JSONL implementation complete

* Updated prompt client users

* Fix tests
This commit is contained in:
cybermaggedon 2026-01-26 17:38:00 +00:00 committed by GitHub
parent e4f0013841
commit e214eb4e02
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 1292 additions and 463 deletions

View file

@ -126,16 +126,42 @@ class Processor(FlowProcessor):
await pub.send(ecs)
def parse_json(self, text):
json_match = re.search(r'```(?:json)?(.*?)```', text, re.DOTALL)
if json_match:
json_str = json_match.group(1).strip()
else:
# If no delimiters, assume the entire output is JSON
json_str = text.strip()
def parse_jsonl(self, text):
    """
    Parse JSONL response, returning list of valid objects.

    Invalid lines (malformed JSON, empty lines) are skipped with warnings.
    This provides truncation resilience - partial output yields partial
    results.
    """
    results = []
    # Strip markdown code fences if present
    text = text.strip()
    if text.startswith('```'):
        # Remove opening fence (possibly with language hint).  Try the
        # longer 'jsonl' alternative first: with (?:json|jsonl)? the regex
        # engine commits to 'json' and leaves a stray 'l' on the first
        # data line, which then fails to parse.
        text = re.sub(r'^```(?:jsonl|json)?\s*\n?', '', text)
    if text.endswith('```'):
        text = text[:-3]
    for line_num, line in enumerate(text.strip().split('\n'), 1):
        line = line.strip()
        # Skip empty lines
        if not line:
            continue
        # Skip any remaining fence markers
        if line.startswith('```'):
            continue
        try:
            results.append(json.loads(line))
        except json.JSONDecodeError as e:
            # Log warning but continue - this provides truncation resilience
            logger.warning(f"JSONL parse error on line {line_num}: {e}")
    return results
async def on_message(self, msg, consumer, flow):
@ -178,11 +204,12 @@ class Processor(FlowProcessor):
question = prompt
)
# Parse JSON response
try:
extraction_data = self.parse_json(agent_response)
except json.JSONDecodeError as e:
raise ValueError(f"Invalid JSON response from agent: {e}")
# Parse JSONL response
extraction_data = self.parse_jsonl(agent_response)
if not extraction_data:
logger.warning("JSONL parse returned no valid objects")
return
# Process extraction data
triples, entity_contexts = self.process_extraction_data(
@ -209,12 +236,21 @@ class Processor(FlowProcessor):
raise
def process_extraction_data(self, data, metadata):
"""Process combined extraction data to generate triples and entity contexts"""
"""Process JSONL extraction data to generate triples and entity contexts.
Data is a flat list of objects with 'type' discriminator field:
- {"type": "definition", "entity": "...", "definition": "..."}
- {"type": "relationship", "subject": "...", "predicate": "...", "object": "...", "object-entity": bool}
"""
triples = []
entity_contexts = []
# Categorize items by type
definitions = [item for item in data if item.get("type") == "definition"]
relationships = [item for item in data if item.get("type") == "relationship"]
# Process definitions
for defn in data.get("definitions", []):
for defn in definitions:
entity_uri = self.to_uri(defn["entity"])
@ -247,17 +283,18 @@ class Processor(FlowProcessor):
))
# Process relationships
for rel in data.get("relationships", []):
for rel in relationships:
subject_uri = self.to_uri(rel["subject"])
predicate_uri = self.to_uri(rel["predicate"])
subject_value = Value(value=subject_uri, is_uri=True)
predicate_value = Value(value=predicate_uri, is_uri=True)
if data.get("object-entity", False):
object_value = Value(value=predicate_uri, is_uri=True)
if rel.get("object-entity", True):
object_uri = self.to_uri(rel["object"])
object_value = Value(value=object_uri, is_uri=True)
else:
object_value = Value(value=predicate_uri, is_uri=False)
object_value = Value(value=rel["object"], is_uri=False)
# Add subject and predicate labels
triples.append(Triple(

View file

@ -49,8 +49,17 @@ class ExtractionResult:
def parse_extraction_response(response: Any) -> Optional[ExtractionResult]:
"""Parse LLM extraction response into structured format.
Supports two formats:
1. JSONL format (list): Flat list of objects with 'type' discriminator field
[{"type": "entity", ...}, {"type": "relationship", ...}, {"type": "attribute", ...}]
2. Legacy format (dict): Nested structure with separate arrays
{"entities": [...], "relationships": [...], "attributes": [...]}
Args:
response: LLM response (string JSON or already parsed dict)
response: LLM response - can be:
- string (JSON to parse)
- dict (legacy nested format)
- list (JSONL format - flat list with type discriminators)
Returns:
ExtractionResult with parsed entities/relationships/attributes,
@ -64,17 +73,89 @@ def parse_extraction_response(response: Any) -> Optional[ExtractionResult]:
logger.error(f"Failed to parse JSON response: {e}")
logger.debug(f"Response was: {response[:500]}")
return None
elif isinstance(response, dict):
elif isinstance(response, (dict, list)):
data = response
else:
logger.error(f"Unexpected response type: {type(response)}")
return None
# Validate structure
if not isinstance(data, dict):
logger.error(f"Expected dict, got {type(data)}")
return None
# Handle JSONL format (flat list with type discriminators)
if isinstance(data, list):
return parse_jsonl_format(data)
# Handle legacy format (nested dict)
if isinstance(data, dict):
return parse_legacy_format(data)
logger.error(f"Expected dict or list, got {type(data)}")
return None
def parse_jsonl_format(data: List[Dict[str, Any]]) -> ExtractionResult:
    """Parse a JSONL-format response (flat list with type discriminators).

    Each item carries a 'type' field naming its kind: 'entity',
    'relationship', or 'attribute'.  Unknown kinds and non-dict items
    are logged and skipped; a parser failure skips just that item.

    Args:
        data: List of dicts, each with a 'type' discriminator

    Returns:
        ExtractionResult with items sorted into their categories
    """
    # One bucket and one parser per discriminator value.
    buckets: Dict[str, list] = {
        'entity': [],
        'relationship': [],
        'attribute': [],
    }
    parsers = {
        'entity': parse_entity_jsonl,
        'relationship': parse_relationship,
        'attribute': parse_attribute,
    }

    for item in data:
        if not isinstance(item, dict):
            logger.warning(f"Skipping non-dict item: {type(item)}")
            continue

        kind = item.get('type')
        parser = parsers.get(kind)
        if parser is None:
            logger.warning(f"Unknown item type '{kind}': {item}")
            continue

        try:
            parsed = parser(item)
            if parsed:
                buckets[kind].append(parsed)
        except Exception as e:
            logger.warning(f"Failed to parse {kind} {item}: {e}")

    return ExtractionResult(
        entities=buckets['entity'],
        relationships=buckets['relationship'],
        attributes=buckets['attribute']
    )
def parse_legacy_format(data: Dict[str, Any]) -> ExtractionResult:
"""Parse legacy format response (nested dict with arrays).
Args:
data: Dict with 'entities', 'relationships', 'attributes' arrays
Returns:
ExtractionResult with parsed items
"""
# Parse entities
entities = []
entities_data = data.get('entities', [])
@ -127,6 +208,37 @@ def parse_extraction_response(response: Any) -> Optional[ExtractionResult]:
)
def parse_entity_jsonl(data: Dict[str, Any]) -> Optional[Entity]:
    """Parse an entity record from a JSONL-format dict.

    The JSONL representation stores the entity's own type under
    'entity_type', because 'type' is reserved for the record
    discriminator.

    Args:
        data: Dict expected to carry 'entity' and 'entity_type' strings

    Returns:
        Entity on success, or None when the record is malformed
    """
    if not isinstance(data, dict):
        logger.warning(f"Entity data is not a dict: {type(data)}")
        return None

    name = data.get('entity')
    # 'entity_type' rather than 'type' - see docstring.
    kind = data.get('entity_type')

    # Reject missing or empty fields before type-checking them.
    if not name or not kind:
        logger.warning(f"Missing required fields in entity: {data}")
        return None

    if not (isinstance(name, str) and isinstance(kind, str)):
        logger.warning(f"Entity fields must be strings: {data}")
        return None

    return Entity(entity=name, type=kind)
def parse_entity(data: Dict[str, Any]) -> Optional[Entity]:
"""Parse entity from dict.

View file

@ -83,7 +83,7 @@ class PromptManager:
def parse_json(self, text):
json_match = re.search(r'```(?:json)?(.*?)```', text, re.DOTALL)
if json_match:
json_str = json_match.group(1).strip()
else:
@ -92,6 +92,43 @@ class PromptManager:
return json.loads(json_str)
def parse_jsonl(self, text):
    """
    Parse JSONL response, returning list of valid objects.

    Invalid lines (malformed JSON, empty lines) are skipped with warnings.
    This provides truncation resilience - partial output yields partial
    results.
    """
    results = []
    # Strip markdown code fences if present
    text = text.strip()
    if text.startswith('```'):
        # Remove opening fence (possibly with language hint).  Try the
        # longer 'jsonl' alternative first: with (?:json|jsonl)? the regex
        # engine commits to 'json' and leaves a stray 'l' on the first
        # data line, which then fails to parse.
        text = re.sub(r'^```(?:jsonl|json)?\s*\n?', '', text)
    if text.endswith('```'):
        text = text[:-3]
    for line_num, line in enumerate(text.strip().split('\n'), 1):
        line = line.strip()
        # Skip empty lines
        if not line:
            continue
        # Skip any remaining fence markers
        if line.startswith('```'):
            continue
        try:
            results.append(json.loads(line))
        except json.JSONDecodeError as e:
            # Log warning but continue - this provides truncation resilience
            logger.warning(f"JSONL parse error on line {line_num}: {e}")
    return results
def render(self, id, input):
if id not in self.prompts:
@ -121,21 +158,41 @@ class PromptManager:
if resp_type == "text":
return resp
if resp_type != "json":
raise RuntimeError(f"Response type {resp_type} not known")
try:
obj = self.parse_json(resp)
except:
logger.error(f"JSON parse failed: {resp}")
raise RuntimeError("JSON parse fail")
if self.prompts[id].schema:
if resp_type == "json":
try:
validate(instance=obj, schema=self.prompts[id].schema)
logger.debug("Schema validation successful")
except Exception as e:
raise RuntimeError(f"Schema validation fail: {e}")
obj = self.parse_json(resp)
except:
logger.error(f"JSON parse failed: {resp}")
raise RuntimeError("JSON parse fail")
return obj
if self.prompts[id].schema:
try:
validate(instance=obj, schema=self.prompts[id].schema)
logger.debug("Schema validation successful")
except Exception as e:
raise RuntimeError(f"Schema validation fail: {e}")
return obj
if resp_type == "jsonl":
objects = self.parse_jsonl(resp)
if not objects:
logger.warning("JSONL parse returned no valid objects")
return []
# Validate each object against schema if provided
if self.prompts[id].schema:
validated = []
for i, obj in enumerate(objects):
try:
validate(instance=obj, schema=self.prompts[id].schema)
validated.append(obj)
except Exception as e:
logger.warning(f"Object {i} failed schema validation: {e}")
return validated
return objects
raise RuntimeError(f"Response type {resp_type} not known")