mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-27 01:16:22 +02:00
Feature/prompts jsonl (#619)
* Tech spec * JSONL implementation complete * Updated prompt client users * Fix tests
This commit is contained in:
parent
e4f0013841
commit
e214eb4e02
8 changed files with 1292 additions and 463 deletions
|
|
@ -126,16 +126,42 @@ class Processor(FlowProcessor):
|
|||
|
||||
await pub.send(ecs)
|
||||
|
||||
def parse_json(self, text):
    """Parse a JSON response, unwrapping an optional markdown code fence.

    Args:
        text: Raw LLM output, either bare JSON or JSON wrapped in
            ``` / ```json fences.

    Returns:
        The parsed JSON value.

    Raises:
        json.JSONDecodeError: If the extracted text is not valid JSON.
    """
    json_match = re.search(r'```(?:json)?(.*?)```', text, re.DOTALL)

    if json_match:
        json_str = json_match.group(1).strip()
    else:
        # If no delimiters, assume the entire output is JSON
        json_str = text.strip()

    return json.loads(json_str)

def parse_jsonl(self, text):
    """
    Parse JSONL response, returning list of valid objects.

    Invalid lines (malformed JSON, empty lines) are skipped with warnings.
    This provides truncation resilience - partial output yields partial results.
    """
    results = []

    # Strip markdown code fences if present
    text = text.strip()
    if text.startswith('```'):
        # Remove opening fence (possibly with language hint).
        # NOTE: 'jsonl' must precede 'json' in the alternation — regex
        # alternation is ordered, so matching 'json' first against a
        # ```jsonl fence would leave a stray 'l' on the first line.
        text = re.sub(r'^```(?:jsonl|json)?\s*\n?', '', text)
    if text.endswith('```'):
        text = text[:-3]

    for line_num, line in enumerate(text.strip().split('\n'), 1):
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        # Skip any remaining fence markers
        if line.startswith('```'):
            continue

        try:
            obj = json.loads(line)
            results.append(obj)
        except json.JSONDecodeError as e:
            # Log warning but continue - this provides truncation resilience
            logger.warning(f"JSONL parse error on line {line_num}: {e}")

    return results
|
||||
|
||||
async def on_message(self, msg, consumer, flow):
|
||||
|
||||
|
|
@ -178,11 +204,12 @@ class Processor(FlowProcessor):
|
|||
question = prompt
|
||||
)
|
||||
|
||||
# Parse JSON response
|
||||
try:
|
||||
extraction_data = self.parse_json(agent_response)
|
||||
except json.JSONDecodeError as e:
|
||||
raise ValueError(f"Invalid JSON response from agent: {e}")
|
||||
# Parse JSONL response
|
||||
extraction_data = self.parse_jsonl(agent_response)
|
||||
|
||||
if not extraction_data:
|
||||
logger.warning("JSONL parse returned no valid objects")
|
||||
return
|
||||
|
||||
# Process extraction data
|
||||
triples, entity_contexts = self.process_extraction_data(
|
||||
|
|
@ -209,12 +236,21 @@ class Processor(FlowProcessor):
|
|||
raise
|
||||
|
||||
def process_extraction_data(self, data, metadata):
|
||||
"""Process combined extraction data to generate triples and entity contexts"""
|
||||
"""Process JSONL extraction data to generate triples and entity contexts.
|
||||
|
||||
Data is a flat list of objects with 'type' discriminator field:
|
||||
- {"type": "definition", "entity": "...", "definition": "..."}
|
||||
- {"type": "relationship", "subject": "...", "predicate": "...", "object": "...", "object-entity": bool}
|
||||
"""
|
||||
triples = []
|
||||
entity_contexts = []
|
||||
|
||||
# Categorize items by type
|
||||
definitions = [item for item in data if item.get("type") == "definition"]
|
||||
relationships = [item for item in data if item.get("type") == "relationship"]
|
||||
|
||||
# Process definitions
|
||||
for defn in data.get("definitions", []):
|
||||
for defn in definitions:
|
||||
|
||||
entity_uri = self.to_uri(defn["entity"])
|
||||
|
||||
|
|
@ -247,17 +283,18 @@ class Processor(FlowProcessor):
|
|||
))
|
||||
|
||||
# Process relationships
|
||||
for rel in data.get("relationships", []):
|
||||
for rel in relationships:
|
||||
|
||||
subject_uri = self.to_uri(rel["subject"])
|
||||
predicate_uri = self.to_uri(rel["predicate"])
|
||||
|
||||
subject_value = Value(value=subject_uri, is_uri=True)
|
||||
predicate_value = Value(value=predicate_uri, is_uri=True)
|
||||
if data.get("object-entity", False):
|
||||
object_value = Value(value=predicate_uri, is_uri=True)
|
||||
if rel.get("object-entity", True):
|
||||
object_uri = self.to_uri(rel["object"])
|
||||
object_value = Value(value=object_uri, is_uri=True)
|
||||
else:
|
||||
object_value = Value(value=predicate_uri, is_uri=False)
|
||||
object_value = Value(value=rel["object"], is_uri=False)
|
||||
|
||||
# Add subject and predicate labels
|
||||
triples.append(Triple(
|
||||
|
|
|
|||
|
|
@ -49,8 +49,17 @@ class ExtractionResult:
|
|||
def parse_extraction_response(response: Any) -> Optional[ExtractionResult]:
|
||||
"""Parse LLM extraction response into structured format.
|
||||
|
||||
Supports two formats:
|
||||
1. JSONL format (list): Flat list of objects with 'type' discriminator field
|
||||
[{"type": "entity", ...}, {"type": "relationship", ...}, {"type": "attribute", ...}]
|
||||
2. Legacy format (dict): Nested structure with separate arrays
|
||||
{"entities": [...], "relationships": [...], "attributes": [...]}
|
||||
|
||||
Args:
|
||||
response: LLM response (string JSON or already parsed dict)
|
||||
response: LLM response - can be:
|
||||
- string (JSON to parse)
|
||||
- dict (legacy nested format)
|
||||
- list (JSONL format - flat list with type discriminators)
|
||||
|
||||
Returns:
|
||||
ExtractionResult with parsed entities/relationships/attributes,
|
||||
|
|
@ -64,17 +73,89 @@ def parse_extraction_response(response: Any) -> Optional[ExtractionResult]:
|
|||
logger.error(f"Failed to parse JSON response: {e}")
|
||||
logger.debug(f"Response was: {response[:500]}")
|
||||
return None
|
||||
elif isinstance(response, dict):
|
||||
elif isinstance(response, (dict, list)):
|
||||
data = response
|
||||
else:
|
||||
logger.error(f"Unexpected response type: {type(response)}")
|
||||
return None
|
||||
|
||||
# Validate structure
|
||||
if not isinstance(data, dict):
|
||||
logger.error(f"Expected dict, got {type(data)}")
|
||||
return None
|
||||
# Handle JSONL format (flat list with type discriminators)
|
||||
if isinstance(data, list):
|
||||
return parse_jsonl_format(data)
|
||||
|
||||
# Handle legacy format (nested dict)
|
||||
if isinstance(data, dict):
|
||||
return parse_legacy_format(data)
|
||||
|
||||
logger.error(f"Expected dict or list, got {type(data)}")
|
||||
return None
|
||||
|
||||
|
||||
def parse_jsonl_format(data: List[Dict[str, Any]]) -> ExtractionResult:
    """Parse JSONL format response (flat list with type discriminators).

    Each item has a 'type' field: 'entity', 'relationship', or 'attribute'.

    Args:
        data: List of dicts with type discriminator

    Returns:
        ExtractionResult with categorized items
    """
    entities: list = []
    relationships: list = []
    attributes: list = []

    # Discriminator value -> (parser, destination bucket, label for logs).
    dispatch = {
        'entity': (parse_entity_jsonl, entities, 'entity'),
        'relationship': (parse_relationship, relationships, 'relationship'),
        'attribute': (parse_attribute, attributes, 'attribute'),
    }

    for item in data:
        if not isinstance(item, dict):
            logger.warning(f"Skipping non-dict item: {type(item)}")
            continue

        item_type = item.get('type')
        handler = dispatch.get(item_type)

        if handler is None:
            logger.warning(f"Unknown item type '{item_type}': {item}")
            continue

        parser, bucket, label = handler
        try:
            parsed = parser(item)
        except Exception as e:
            logger.warning(f"Failed to parse {label} {item}: {e}")
            continue

        if parsed:
            bucket.append(parsed)

    return ExtractionResult(
        entities=entities,
        relationships=relationships,
        attributes=attributes
    )
|
||||
|
||||
|
||||
def parse_legacy_format(data: Dict[str, Any]) -> ExtractionResult:
|
||||
"""Parse legacy format response (nested dict with arrays).
|
||||
|
||||
Args:
|
||||
data: Dict with 'entities', 'relationships', 'attributes' arrays
|
||||
|
||||
Returns:
|
||||
ExtractionResult with parsed items
|
||||
"""
|
||||
# Parse entities
|
||||
entities = []
|
||||
entities_data = data.get('entities', [])
|
||||
|
|
@ -127,6 +208,37 @@ def parse_extraction_response(response: Any) -> Optional[ExtractionResult]:
|
|||
)
|
||||
|
||||
|
||||
def parse_entity_jsonl(data: Dict[str, Any]) -> Optional[Entity]:
    """Parse entity from JSONL format dict.

    JSONL format uses 'entity_type' instead of 'type' for the entity's type
    (since 'type' is the discriminator field).

    Args:
        data: Entity dict with 'entity' and 'entity_type' fields

    Returns:
        Entity object or None if invalid
    """
    if not isinstance(data, dict):
        logger.warning(f"Entity data is not a dict: {type(data)}")
        return None

    # 'entity_type' (not 'type') carries the entity's class here, because
    # 'type' is already taken by the JSONL discriminator.
    name, kind = data.get('entity'), data.get('entity_type')

    if not (name and kind):
        logger.warning(f"Missing required fields in entity: {data}")
        return None

    if not (isinstance(name, str) and isinstance(kind, str)):
        logger.warning(f"Entity fields must be strings: {data}")
        return None

    return Entity(entity=name, type=kind)
|
||||
|
||||
|
||||
def parse_entity(data: Dict[str, Any]) -> Optional[Entity]:
|
||||
"""Parse entity from dict.
|
||||
|
||||
|
|
|
|||
|
|
@ -83,7 +83,7 @@ class PromptManager:
|
|||
|
||||
def parse_json(self, text):
    """Extract and parse a JSON payload, unwrapping a markdown code fence if present."""
    fenced = re.search(r'```(?:json)?(.*?)```', text, re.DOTALL)

    if fenced:
        payload = fenced.group(1).strip()
    else:
        # If no delimiters, assume the entire output is JSON
        payload = text.strip()

    return json.loads(payload)
|
||||
|
||||
def parse_jsonl(self, text):
    """
    Parse JSONL response, returning list of valid objects.

    Invalid lines (malformed JSON, empty lines) are skipped with warnings.
    This provides truncation resilience - partial output yields partial results.
    """
    results = []

    # Strip markdown code fences if present
    text = text.strip()
    if text.startswith('```'):
        # Remove opening fence (possibly with language hint).
        # NOTE: 'jsonl' must precede 'json' in the alternation — regex
        # alternation is ordered, so matching 'json' first against a
        # ```jsonl fence would leave a stray 'l' on the first line.
        text = re.sub(r'^```(?:jsonl|json)?\s*\n?', '', text)
    if text.endswith('```'):
        text = text[:-3]

    for line_num, line in enumerate(text.strip().split('\n'), 1):
        line = line.strip()

        # Skip empty lines
        if not line:
            continue

        # Skip any remaining fence markers
        if line.startswith('```'):
            continue

        try:
            obj = json.loads(line)
            results.append(obj)
        except json.JSONDecodeError as e:
            # Log warning but continue - this provides truncation resilience
            logger.warning(f"JSONL parse error on line {line_num}: {e}")

    return results
|
||||
|
||||
def render(self, id, input):
|
||||
|
||||
if id not in self.prompts:
|
||||
|
|
@ -121,21 +158,41 @@ class PromptManager:
|
|||
if resp_type == "text":
|
||||
return resp
|
||||
|
||||
if resp_type != "json":
|
||||
raise RuntimeError(f"Response type {resp_type} not known")
|
||||
|
||||
try:
|
||||
obj = self.parse_json(resp)
|
||||
except:
|
||||
logger.error(f"JSON parse failed: {resp}")
|
||||
raise RuntimeError("JSON parse fail")
|
||||
|
||||
if self.prompts[id].schema:
|
||||
if resp_type == "json":
|
||||
try:
|
||||
validate(instance=obj, schema=self.prompts[id].schema)
|
||||
logger.debug("Schema validation successful")
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Schema validation fail: {e}")
|
||||
obj = self.parse_json(resp)
|
||||
except:
|
||||
logger.error(f"JSON parse failed: {resp}")
|
||||
raise RuntimeError("JSON parse fail")
|
||||
|
||||
return obj
|
||||
if self.prompts[id].schema:
|
||||
try:
|
||||
validate(instance=obj, schema=self.prompts[id].schema)
|
||||
logger.debug("Schema validation successful")
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Schema validation fail: {e}")
|
||||
|
||||
return obj
|
||||
|
||||
if resp_type == "jsonl":
|
||||
objects = self.parse_jsonl(resp)
|
||||
|
||||
if not objects:
|
||||
logger.warning("JSONL parse returned no valid objects")
|
||||
return []
|
||||
|
||||
# Validate each object against schema if provided
|
||||
if self.prompts[id].schema:
|
||||
validated = []
|
||||
for i, obj in enumerate(objects):
|
||||
try:
|
||||
validate(instance=obj, schema=self.prompts[id].schema)
|
||||
validated.append(obj)
|
||||
except Exception as e:
|
||||
logger.warning(f"Object {i} failed schema validation: {e}")
|
||||
return validated
|
||||
|
||||
return objects
|
||||
|
||||
raise RuntimeError(f"Response type {resp_type} not known")
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue