Feature/prompts jsonl (#619)

* Tech spec

* JSONL implementation complete

* Updated prompt client users

* Fix tests
This commit is contained in:
cybermaggedon 2026-01-26 17:38:00 +00:00 committed by GitHub
parent e4f0013841
commit e214eb4e02
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 1292 additions and 463 deletions

View file

@ -33,7 +33,7 @@ class TestAgentKgExtractor:
# Set up the methods we want to test
extractor.to_uri = real_extractor.to_uri
extractor.parse_json = real_extractor.parse_json
extractor.parse_jsonl = real_extractor.parse_jsonl
extractor.process_extraction_data = real_extractor.process_extraction_data
extractor.emit_triples = real_extractor.emit_triples
extractor.emit_entity_contexts = real_extractor.emit_entity_contexts
@ -62,39 +62,40 @@ class TestAgentKgExtractor:
@pytest.fixture
def sample_extraction_data(self):
"""Sample extraction data in expected format"""
return {
"definitions": [
{
"entity": "Machine Learning",
"definition": "A subset of artificial intelligence that enables computers to learn from data without explicit programming."
},
{
"entity": "Neural Networks",
"definition": "Computing systems inspired by biological neural networks that process information."
}
],
"relationships": [
{
"subject": "Machine Learning",
"predicate": "is_subset_of",
"object": "Artificial Intelligence",
"object-entity": True
},
{
"subject": "Neural Networks",
"predicate": "used_in",
"object": "Machine Learning",
"object-entity": True
},
{
"subject": "Deep Learning",
"predicate": "accuracy",
"object": "95%",
"object-entity": False
}
]
}
"""Sample extraction data in JSONL format (list with type discriminators)"""
return [
{
"type": "definition",
"entity": "Machine Learning",
"definition": "A subset of artificial intelligence that enables computers to learn from data without explicit programming."
},
{
"type": "definition",
"entity": "Neural Networks",
"definition": "Computing systems inspired by biological neural networks that process information."
},
{
"type": "relationship",
"subject": "Machine Learning",
"predicate": "is_subset_of",
"object": "Artificial Intelligence",
"object-entity": True
},
{
"type": "relationship",
"subject": "Neural Networks",
"predicate": "used_in",
"object": "Machine Learning",
"object-entity": True
},
{
"type": "relationship",
"subject": "Deep Learning",
"predicate": "accuracy",
"object": "95%",
"object-entity": False
}
]
def test_to_uri_conversion(self, agent_extractor):
"""Test URI conversion for entities"""
@ -113,61 +114,67 @@ class TestAgentKgExtractor:
expected = f"{TRUSTGRAPH_ENTITIES}"
assert uri == expected
def test_parse_json_with_code_blocks(self, agent_extractor):
"""Test JSON parsing from code blocks"""
# Test JSON in code blocks
def test_parse_jsonl_with_code_blocks(self, agent_extractor):
"""Test JSONL parsing from code blocks"""
# Test JSONL in code blocks - note: JSON uses lowercase true/false
response = '''```json
{
"definitions": [{"entity": "AI", "definition": "Artificial Intelligence"}],
"relationships": []
}
```'''
result = agent_extractor.parse_json(response)
assert result["definitions"][0]["entity"] == "AI"
assert result["definitions"][0]["definition"] == "Artificial Intelligence"
assert result["relationships"] == []
{"type": "definition", "entity": "AI", "definition": "Artificial Intelligence"}
{"type": "relationship", "subject": "AI", "predicate": "is", "object": "technology", "object-entity": false}
```'''
def test_parse_json_without_code_blocks(self, agent_extractor):
"""Test JSON parsing without code blocks"""
response = '''{"definitions": [{"entity": "ML", "definition": "Machine Learning"}], "relationships": []}'''
result = agent_extractor.parse_json(response)
assert result["definitions"][0]["entity"] == "ML"
assert result["definitions"][0]["definition"] == "Machine Learning"
result = agent_extractor.parse_jsonl(response)
def test_parse_json_invalid_format(self, agent_extractor):
"""Test JSON parsing with invalid format"""
invalid_response = "This is not JSON at all"
with pytest.raises(json.JSONDecodeError):
agent_extractor.parse_json(invalid_response)
assert len(result) == 2
assert result[0]["entity"] == "AI"
assert result[0]["definition"] == "Artificial Intelligence"
assert result[1]["type"] == "relationship"
def test_parse_json_malformed_code_blocks(self, agent_extractor):
"""Test JSON parsing with malformed code blocks"""
# Missing closing backticks
response = '''```json
{"definitions": [], "relationships": []}
'''
# Should still parse the JSON content
with pytest.raises(json.JSONDecodeError):
agent_extractor.parse_json(response)
def test_parse_jsonl_without_code_blocks(self, agent_extractor):
    """Bare JSONL text (no markdown fence) yields one object per line."""
    raw_text = '''{"type": "definition", "entity": "ML", "definition": "Machine Learning"}
{"type": "definition", "entity": "AI", "definition": "Artificial Intelligence"}'''

    parsed = agent_extractor.parse_jsonl(raw_text)

    # Two input lines -> two objects, in input order.
    assert len(parsed) == 2
    for position, entity_name in enumerate(("ML", "AI")):
        assert parsed[position]["entity"] == entity_name
def test_parse_jsonl_invalid_lines_skipped(self, agent_extractor):
    """A non-JSON line between two valid records is dropped, not fatal."""
    raw_text = '''{"type": "definition", "entity": "Valid", "definition": "Valid def"}
This is not JSON at all
{"type": "definition", "entity": "Also Valid", "definition": "Another def"}'''

    parsed = agent_extractor.parse_jsonl(raw_text)

    # Only the two well-formed lines survive; the middle line is skipped.
    assert len(parsed) == 2
    for position, entity_name in enumerate(("Valid", "Also Valid")):
        assert parsed[position]["entity"] == entity_name
def test_parse_jsonl_truncation_resilience(self, agent_extractor):
    """A response cut off mid-line still returns the complete records."""
    # The second line ends in the middle of a string value.
    raw_text = '''{"type": "definition", "entity": "Complete", "definition": "Full def"}
{"type": "definition", "entity": "Trunca'''

    parsed = agent_extractor.parse_jsonl(raw_text)

    # The truncated trailing line is skipped, leaving a single object.
    assert len(parsed) == 1
    only_record = parsed[0]
    assert only_record["entity"] == "Complete"
def test_process_extraction_data_definitions(self, agent_extractor, sample_metadata):
"""Test processing of definition data"""
data = {
"definitions": [
{
"entity": "Machine Learning",
"definition": "A subset of AI that enables learning from data."
}
],
"relationships": []
}
data = [
{
"type": "definition",
"entity": "Machine Learning",
"definition": "A subset of AI that enables learning from data."
}
]
triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
# Check entity label triple
@ -196,18 +203,16 @@ class TestAgentKgExtractor:
def test_process_extraction_data_relationships(self, agent_extractor, sample_metadata):
"""Test processing of relationship data"""
data = {
"definitions": [],
"relationships": [
{
"subject": "Machine Learning",
"predicate": "is_subset_of",
"object": "Artificial Intelligence",
"object-entity": True
}
]
}
data = [
{
"type": "relationship",
"subject": "Machine Learning",
"predicate": "is_subset_of",
"object": "Artificial Intelligence",
"object-entity": True
}
]
triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
# Check that subject, predicate, and object labels are created
@ -223,15 +228,12 @@ class TestAgentKgExtractor:
assert predicate_label is not None
assert predicate_label.o.value == "is_subset_of"
# Check main relationship triple
# NOTE: Current implementation has bugs:
# 1. Uses data.get("object-entity") instead of rel.get("object-entity")
# 2. Sets object_value to predicate_uri instead of actual object URI
# This test documents the current buggy behavior
# Check main relationship triple
object_uri = f"{TRUSTGRAPH_ENTITIES}Artificial%20Intelligence"
rel_triple = next((t for t in triples if t.s.value == subject_uri and t.p.value == predicate_uri), None)
assert rel_triple is not None
# Due to bug, object value is set to predicate_uri
assert rel_triple.o.value == predicate_uri
assert rel_triple.o.value == object_uri
assert rel_triple.o.is_uri == True
# Check subject-of relationships
subject_of_triples = [t for t in triples if t.p.value == SUBJECT_OF and t.o.value == "doc123"]
@ -239,20 +241,18 @@ class TestAgentKgExtractor:
def test_process_extraction_data_literal_object(self, agent_extractor, sample_metadata):
"""Test processing of relationships with literal objects"""
data = {
"definitions": [],
"relationships": [
{
"subject": "Deep Learning",
"predicate": "accuracy",
"object": "95%",
"object-entity": False
}
]
}
data = [
{
"type": "relationship",
"subject": "Deep Learning",
"predicate": "accuracy",
"object": "95%",
"object-entity": False
}
]
triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
# Check that object labels are not created for literal objects
object_labels = [t for t in triples if t.p.value == RDF_LABEL and t.o.value == "95%"]
# Based on the code logic, it should not create object labels for non-entity objects
@ -275,63 +275,50 @@ class TestAgentKgExtractor:
def test_process_extraction_data_no_metadata_id(self, agent_extractor):
"""Test processing when metadata has no ID"""
metadata = Metadata(id=None, metadata=[])
data = {
"definitions": [
{"entity": "Test Entity", "definition": "Test definition"}
],
"relationships": []
}
data = [
{"type": "definition", "entity": "Test Entity", "definition": "Test definition"}
]
triples, entity_contexts = agent_extractor.process_extraction_data(data, metadata)
# Should not create subject-of relationships when no metadata ID
subject_of_triples = [t for t in triples if t.p.value == SUBJECT_OF]
assert len(subject_of_triples) == 0
# Should still create entity contexts
assert len(entity_contexts) == 1
def test_process_extraction_data_empty_data(self, agent_extractor, sample_metadata):
"""Test processing of empty extraction data"""
data = {"definitions": [], "relationships": []}
triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
# Should only have metadata triples
assert len(entity_contexts) == 0
# Triples should only contain metadata triples if any
data = []
def test_process_extraction_data_missing_keys(self, agent_extractor, sample_metadata):
"""Test processing data with missing keys"""
# Test missing definitions key
data = {"relationships": []}
triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
# Should have no entity contexts
assert len(entity_contexts) == 0
# Test missing relationships key
data = {"definitions": []}
# Triples should be empty
assert len(triples) == 0
def test_process_extraction_data_unknown_types_ignored(self, agent_extractor, sample_metadata):
"""Test processing data with unknown type values"""
data = [
{"type": "definition", "entity": "Valid", "definition": "Valid def"},
{"type": "unknown_type", "foo": "bar"}, # Unknown type - should be ignored
{"type": "relationship", "subject": "A", "predicate": "rel", "object": "B", "object-entity": True}
]
triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
assert len(entity_contexts) == 0
# Test completely missing keys
data = {}
triples, entity_contexts = agent_extractor.process_extraction_data(data, sample_metadata)
assert len(entity_contexts) == 0
# Should process valid items and ignore unknown types
assert len(entity_contexts) == 1 # Only the definition creates entity context
def test_process_extraction_data_malformed_entries(self, agent_extractor, sample_metadata):
"""Test processing data with malformed entries"""
# Test definition missing required fields
data = {
"definitions": [
{"entity": "Test"}, # Missing definition
{"definition": "Test def"} # Missing entity
],
"relationships": [
{"subject": "A", "predicate": "rel"}, # Missing object
{"subject": "B", "object": "C"} # Missing predicate
]
}
# Test items missing required fields - should raise KeyError
data = [
{"type": "definition", "entity": "Test"}, # Missing definition
]
# Should handle gracefully or raise appropriate errors
with pytest.raises(KeyError):
agent_extractor.process_extraction_data(data, sample_metadata)

View file

@ -32,11 +32,11 @@ class TestAgentKgExtractionEdgeCases:
# Set up the methods we want to test
extractor.to_uri = real_extractor.to_uri
extractor.parse_json = real_extractor.parse_json
extractor.parse_jsonl = real_extractor.parse_jsonl
extractor.process_extraction_data = real_extractor.process_extraction_data
extractor.emit_triples = real_extractor.emit_triples
extractor.emit_entity_contexts = real_extractor.emit_entity_contexts
return extractor
def test_to_uri_special_characters(self, agent_extractor):
@ -85,138 +85,108 @@ class TestAgentKgExtractionEdgeCases:
# Verify the URI is properly encoded
assert unicode_text not in uri # Original unicode should be encoded
def test_parse_json_whitespace_variations(self, agent_extractor):
"""Test JSON parsing with various whitespace patterns"""
# Test JSON with different whitespace patterns
def test_parse_jsonl_whitespace_variations(self, agent_extractor):
"""Test JSONL parsing with various whitespace patterns"""
# Test JSONL with different whitespace patterns
test_cases = [
# Extra whitespace around code blocks
" ```json\n{\"test\": true}\n``` ",
# Tabs and mixed whitespace
"\t\t```json\n\t{\"test\": true}\n\t```\t",
# Multiple newlines
"\n\n\n```json\n\n{\"test\": true}\n\n```\n\n",
# JSON without code blocks but with whitespace
" {\"test\": true} ",
# Mixed line endings
"```json\r\n{\"test\": true}\r\n```",
' ```json\n{"type": "definition", "entity": "test", "definition": "def"}\n``` ',
# Multiple newlines between lines
'{"type": "definition", "entity": "A", "definition": "def A"}\n\n\n{"type": "definition", "entity": "B", "definition": "def B"}',
# JSONL without code blocks but with whitespace
' {"type": "definition", "entity": "test", "definition": "def"} ',
]
for response in test_cases:
result = agent_extractor.parse_json(response)
assert result == {"test": True}
def test_parse_json_code_block_variations(self, agent_extractor):
"""Test JSON parsing with different code block formats"""
for response in test_cases:
result = agent_extractor.parse_jsonl(response)
assert len(result) >= 1
assert result[0].get("type") == "definition"
def test_parse_jsonl_code_block_variations(self, agent_extractor):
"""Test JSONL parsing with different code block formats"""
test_cases = [
# Standard json code block
"```json\n{\"valid\": true}\n```",
'```json\n{"type": "definition", "entity": "A", "definition": "def"}\n```',
# jsonl code block
'```jsonl\n{"type": "definition", "entity": "A", "definition": "def"}\n```',
# Code block without language
"```\n{\"valid\": true}\n```",
# Uppercase JSON
"```JSON\n{\"valid\": true}\n```",
# Mixed case
"```Json\n{\"valid\": true}\n```",
# Multiple code blocks (should take first one)
"```json\n{\"first\": true}\n```\n```json\n{\"second\": true}\n```",
# Code block with extra content
"Here's the result:\n```json\n{\"valid\": true}\n```\nDone!",
'```\n{"type": "definition", "entity": "A", "definition": "def"}\n```',
# Code block with extra content before/after
'Here\'s the result:\n```json\n{"type": "definition", "entity": "A", "definition": "def"}\n```\nDone!',
]
for i, response in enumerate(test_cases):
try:
result = agent_extractor.parse_json(response)
assert result.get("valid") == True or result.get("first") == True
except json.JSONDecodeError:
# Some cases may fail due to regex extraction issues
# This documents current behavior - the regex may not match all cases
print(f"Case {i} failed JSON parsing: {response[:50]}...")
pass
result = agent_extractor.parse_jsonl(response)
assert len(result) >= 1, f"Case {i} failed"
assert result[0].get("entity") == "A"
def test_parse_json_malformed_code_blocks(self, agent_extractor):
"""Test JSON parsing with malformed code block formats"""
# These should still work by falling back to treating entire text as JSON
test_cases = [
# Unclosed code block
"```json\n{\"test\": true}",
# No opening backticks
"{\"test\": true}\n```",
# Wrong number of backticks
"`json\n{\"test\": true}\n`",
# Nested backticks (should handle gracefully)
"```json\n{\"code\": \"```\", \"test\": true}\n```",
]
for response in test_cases:
try:
result = agent_extractor.parse_json(response)
assert "test" in result # Should successfully parse
except json.JSONDecodeError:
# This is also acceptable for malformed cases
pass
def test_parse_jsonl_truncation_resilience(self, agent_extractor):
"""Test JSONL parsing with truncated responses"""
# Simulates LLM output being cut off mid-line
response = '''{"type": "definition", "entity": "Complete1", "definition": "Full definition"}
{"type": "definition", "entity": "Complete2", "definition": "Another full def"}
{"type": "definition", "entity": "Trunca'''
def test_parse_json_large_responses(self, agent_extractor):
"""Test JSON parsing with very large responses"""
# Create a large JSON structure
large_data = {
"definitions": [
{
"entity": f"Entity {i}",
"definition": f"Definition {i} " + "with more content " * 100
}
for i in range(100)
],
"relationships": [
{
"subject": f"Subject {i}",
"predicate": f"predicate_{i}",
"object": f"Object {i}",
"object-entity": i % 2 == 0
}
for i in range(50)
]
}
large_json_str = json.dumps(large_data)
response = f"```json\n{large_json_str}\n```"
result = agent_extractor.parse_json(response)
assert len(result["definitions"]) == 100
assert len(result["relationships"]) == 50
assert result["definitions"][0]["entity"] == "Entity 0"
result = agent_extractor.parse_jsonl(response)
# Should get 2 valid objects, the truncated line is skipped
assert len(result) == 2
assert result[0]["entity"] == "Complete1"
assert result[1]["entity"] == "Complete2"
def test_parse_jsonl_large_responses(self, agent_extractor):
    """A fenced response of 100 definitions and 50 relationships parses fully."""
    # One JSON document per line: definitions first, then relationships.
    definition_lines = [
        json.dumps({
            "type": "definition",
            "entity": f"Entity {i}",
            "definition": f"Definition {i} " + "with more content " * 100
        })
        for i in range(100)
    ]
    relationship_lines = [
        json.dumps({
            "type": "relationship",
            "subject": f"Subject {i}",
            "predicate": f"predicate_{i}",
            "object": f"Object {i}",
            "object-entity": i % 2 == 0
        })
        for i in range(50)
    ]

    payload = "\n".join(definition_lines + relationship_lines)
    response = f"```json\n{payload}\n```"

    parsed = agent_extractor.parse_jsonl(response)

    # Split the flat result back out by its type discriminator.
    found_definitions = [item for item in parsed if item.get("type") == "definition"]
    found_relationships = [item for item in parsed if item.get("type") == "relationship"]

    assert len(found_definitions) == 100
    assert len(found_relationships) == 50
    assert found_definitions[0]["entity"] == "Entity 0"
def test_process_extraction_data_empty_metadata(self, agent_extractor):
"""Test processing with empty or minimal metadata"""
# Test with None metadata - may not raise AttributeError depending on implementation
try:
triples, contexts = agent_extractor.process_extraction_data(
{"definitions": [], "relationships": []},
None
)
triples, contexts = agent_extractor.process_extraction_data([], None)
# If it doesn't raise, check the results
assert len(triples) == 0
assert len(contexts) == 0
except (AttributeError, TypeError):
# This is expected behavior when metadata is None
pass
# Test with metadata without ID
metadata = Metadata(id=None, metadata=[])
triples, contexts = agent_extractor.process_extraction_data(
{"definitions": [], "relationships": []},
metadata
)
triples, contexts = agent_extractor.process_extraction_data([], metadata)
assert len(triples) == 0
assert len(contexts) == 0
# Test with metadata with empty string ID
metadata = Metadata(id="", metadata=[])
data = {
"definitions": [{"entity": "Test", "definition": "Test def"}],
"relationships": []
}
data = [{"type": "definition", "entity": "Test", "definition": "Test def"}]
triples, contexts = agent_extractor.process_extraction_data(data, metadata)
# Should not create subject-of triples when ID is empty string
subject_of_triples = [t for t in triples if t.p.value == SUBJECT_OF]
assert len(subject_of_triples) == 0
@ -224,7 +194,7 @@ class TestAgentKgExtractionEdgeCases:
def test_process_extraction_data_special_entity_names(self, agent_extractor):
"""Test processing with special characters in entity names"""
metadata = Metadata(id="doc123", metadata=[])
special_entities = [
"Entity with spaces",
"Entity & Co.",
@ -237,20 +207,17 @@ class TestAgentKgExtractionEdgeCases:
"Quotes: \"test\"",
"Parentheses: (test)",
]
data = {
"definitions": [
{"entity": entity, "definition": f"Definition for {entity}"}
for entity in special_entities
],
"relationships": []
}
data = [
{"type": "definition", "entity": entity, "definition": f"Definition for {entity}"}
for entity in special_entities
]
triples, contexts = agent_extractor.process_extraction_data(data, metadata)
# Verify all entities were processed
assert len(contexts) == len(special_entities)
# Verify URIs were properly encoded
for i, entity in enumerate(special_entities):
expected_uri = f"{TRUSTGRAPH_ENTITIES}{urllib.parse.quote(entity)}"
@ -259,23 +226,20 @@ class TestAgentKgExtractionEdgeCases:
def test_process_extraction_data_very_long_definitions(self, agent_extractor):
"""Test processing with very long entity definitions"""
metadata = Metadata(id="doc123", metadata=[])
# Create very long definition
long_definition = "This is a very long definition. " * 1000
data = {
"definitions": [
{"entity": "Test Entity", "definition": long_definition}
],
"relationships": []
}
data = [
{"type": "definition", "entity": "Test Entity", "definition": long_definition}
]
triples, contexts = agent_extractor.process_extraction_data(data, metadata)
# Should handle long definitions without issues
assert len(contexts) == 1
assert contexts[0].context == long_definition
# Find definition triple
def_triple = next((t for t in triples if t.p.value == DEFINITION), None)
assert def_triple is not None
@ -284,22 +248,19 @@ class TestAgentKgExtractionEdgeCases:
def test_process_extraction_data_duplicate_entities(self, agent_extractor):
"""Test processing with duplicate entity names"""
metadata = Metadata(id="doc123", metadata=[])
data = {
"definitions": [
{"entity": "Machine Learning", "definition": "First definition"},
{"entity": "Machine Learning", "definition": "Second definition"}, # Duplicate
{"entity": "AI", "definition": "AI definition"},
{"entity": "AI", "definition": "Another AI definition"}, # Duplicate
],
"relationships": []
}
data = [
{"type": "definition", "entity": "Machine Learning", "definition": "First definition"},
{"type": "definition", "entity": "Machine Learning", "definition": "Second definition"}, # Duplicate
{"type": "definition", "entity": "AI", "definition": "AI definition"},
{"type": "definition", "entity": "AI", "definition": "Another AI definition"}, # Duplicate
]
triples, contexts = agent_extractor.process_extraction_data(data, metadata)
# Should process all entries (including duplicates)
assert len(contexts) == 4
# Check that both definitions for "Machine Learning" are present
ml_contexts = [ec for ec in contexts if "Machine%20Learning" in ec.entity.value]
assert len(ml_contexts) == 2
@ -309,25 +270,21 @@ class TestAgentKgExtractionEdgeCases:
def test_process_extraction_data_empty_strings(self, agent_extractor):
"""Test processing with empty strings in data"""
metadata = Metadata(id="doc123", metadata=[])
data = {
"definitions": [
{"entity": "", "definition": "Definition for empty entity"},
{"entity": "Valid Entity", "definition": ""},
{"entity": " ", "definition": " "}, # Whitespace only
],
"relationships": [
{"subject": "", "predicate": "test", "object": "test", "object-entity": True},
{"subject": "test", "predicate": "", "object": "test", "object-entity": True},
{"subject": "test", "predicate": "test", "object": "", "object-entity": True},
]
}
data = [
{"type": "definition", "entity": "", "definition": "Definition for empty entity"},
{"type": "definition", "entity": "Valid Entity", "definition": ""},
{"type": "definition", "entity": " ", "definition": " "}, # Whitespace only
{"type": "relationship", "subject": "", "predicate": "test", "object": "test", "object-entity": True},
{"type": "relationship", "subject": "test", "predicate": "", "object": "test", "object-entity": True},
{"type": "relationship", "subject": "test", "predicate": "test", "object": "", "object-entity": True},
]
triples, contexts = agent_extractor.process_extraction_data(data, metadata)
# Should handle empty strings by creating URIs (even if empty)
assert len(contexts) == 3
# Empty entity should create empty URI after encoding
empty_entity_context = next((ec for ec in contexts if ec.entity.value == TRUSTGRAPH_ENTITIES), None)
assert empty_entity_context is not None
@ -335,23 +292,22 @@ class TestAgentKgExtractionEdgeCases:
def test_process_extraction_data_nested_json_in_strings(self, agent_extractor):
"""Test processing when definitions contain JSON-like strings"""
metadata = Metadata(id="doc123", metadata=[])
data = {
"definitions": [
{
"entity": "JSON Entity",
"definition": 'Definition with JSON: {"key": "value", "nested": {"inner": true}}'
},
{
"entity": "Array Entity",
"definition": 'Contains array: [1, 2, 3, "string"]'
}
],
"relationships": []
}
data = [
{
"type": "definition",
"entity": "JSON Entity",
"definition": 'Definition with JSON: {"key": "value", "nested": {"inner": true}}'
},
{
"type": "definition",
"entity": "Array Entity",
"definition": 'Contains array: [1, 2, 3, "string"]'
}
]
triples, contexts = agent_extractor.process_extraction_data(data, metadata)
# Should handle JSON strings in definitions without parsing them
assert len(contexts) == 2
assert '{"key": "value"' in contexts[0].context
@ -360,29 +316,26 @@ class TestAgentKgExtractionEdgeCases:
def test_process_extraction_data_boolean_object_entity_variations(self, agent_extractor):
"""Test processing with various boolean values for object-entity"""
metadata = Metadata(id="doc123", metadata=[])
data = {
"definitions": [],
"relationships": [
# Explicit True
{"subject": "A", "predicate": "rel1", "object": "B", "object-entity": True},
# Explicit False
{"subject": "A", "predicate": "rel2", "object": "literal", "object-entity": False},
# Missing object-entity (should default to True based on code)
{"subject": "A", "predicate": "rel3", "object": "C"},
# String "true" (should be treated as truthy)
{"subject": "A", "predicate": "rel4", "object": "D", "object-entity": "true"},
# String "false" (should be treated as truthy in Python)
{"subject": "A", "predicate": "rel5", "object": "E", "object-entity": "false"},
# Number 0 (falsy)
{"subject": "A", "predicate": "rel6", "object": "literal2", "object-entity": 0},
# Number 1 (truthy)
{"subject": "A", "predicate": "rel7", "object": "F", "object-entity": 1},
]
}
data = [
# Explicit True
{"type": "relationship", "subject": "A", "predicate": "rel1", "object": "B", "object-entity": True},
# Explicit False
{"type": "relationship", "subject": "A", "predicate": "rel2", "object": "literal", "object-entity": False},
# Missing object-entity (should default to True based on code)
{"type": "relationship", "subject": "A", "predicate": "rel3", "object": "C"},
# String "true" (should be treated as truthy)
{"type": "relationship", "subject": "A", "predicate": "rel4", "object": "D", "object-entity": "true"},
# String "false" (should be treated as truthy in Python)
{"type": "relationship", "subject": "A", "predicate": "rel5", "object": "E", "object-entity": "false"},
# Number 0 (falsy)
{"type": "relationship", "subject": "A", "predicate": "rel6", "object": "literal2", "object-entity": 0},
# Number 1 (truthy)
{"type": "relationship", "subject": "A", "predicate": "rel7", "object": "F", "object-entity": 1},
]
triples, contexts = agent_extractor.process_extraction_data(data, metadata)
# Should process all relationships
# Note: The current implementation has some logic issues that these tests document
assert len([t for t in triples if t.p.value != RDF_LABEL and t.p.value != SUBJECT_OF]) >= 7
@ -437,41 +390,40 @@ class TestAgentKgExtractionEdgeCases:
def test_process_extraction_data_performance_large_dataset(self, agent_extractor):
"""Test performance with large extraction datasets"""
metadata = Metadata(id="large-doc", metadata=[])
# Create large dataset
# Create large dataset in JSONL format
num_definitions = 1000
num_relationships = 2000
large_data = {
"definitions": [
{
"entity": f"Entity_{i:04d}",
"definition": f"Definition for entity {i} with some detailed explanation."
}
for i in range(num_definitions)
],
"relationships": [
{
"subject": f"Entity_{i % num_definitions:04d}",
"predicate": f"predicate_{i % 10}",
"object": f"Entity_{(i + 1) % num_definitions:04d}",
"object-entity": True
}
for i in range(num_relationships)
]
}
large_data = [
{
"type": "definition",
"entity": f"Entity_{i:04d}",
"definition": f"Definition for entity {i} with some detailed explanation."
}
for i in range(num_definitions)
] + [
{
"type": "relationship",
"subject": f"Entity_{i % num_definitions:04d}",
"predicate": f"predicate_{i % 10}",
"object": f"Entity_{(i + 1) % num_definitions:04d}",
"object-entity": True
}
for i in range(num_relationships)
]
import time
start_time = time.time()
triples, contexts = agent_extractor.process_extraction_data(large_data, metadata)
end_time = time.time()
processing_time = end_time - start_time
# Should complete within reasonable time (adjust threshold as needed)
assert processing_time < 10.0 # 10 seconds threshold
# Verify results
assert len(contexts) == num_definitions
# Triples include labels, definitions, relationships, and subject-of relations

View file

@ -339,7 +339,250 @@ class TestPromptManager:
"""Test PromptManager with minimal configuration"""
pm = PromptManager()
pm.load_config({}) # Empty config
assert pm.config.system_template == "Be helpful." # Default system
assert pm.terms == {} # Default empty terms
assert len(pm.prompts) == 0
assert len(pm.prompts) == 0
@pytest.mark.unit
class TestPromptManagerJsonl:
"""Unit tests for PromptManager JSONL functionality"""
@pytest.fixture
def jsonl_config(self):
    """Build a raw config mapping holding three JSONL-response prompts.

    Every value is JSON-encoded text. The templates cover: no schema,
    a single object schema, and a ``oneOf`` schema whose alternatives
    are distinguished by a constant "type" property.
    """
    # Schema used by "extract_with_schema": plain entity/definition pairs.
    definition_only_schema = {
        "type": "object",
        "properties": {
            "entity": {"type": "string"},
            "definition": {"type": "string"}
        },
        "required": ["entity", "definition"]
    }

    # Schema used by "extract_mixed": either a definition or a relationship.
    tagged_definition = {
        "type": "object",
        "properties": {
            "type": {"const": "definition"},
            "entity": {"type": "string"},
            "definition": {"type": "string"}
        },
        "required": ["type", "entity", "definition"]
    }
    tagged_relationship = {
        "type": "object",
        "properties": {
            "type": {"const": "relationship"},
            "subject": {"type": "string"},
            "predicate": {"type": "string"},
            "object": {"type": "string"}
        },
        "required": ["type", "subject", "predicate", "object"]
    }
    mixed_schema = {"oneOf": [tagged_definition, tagged_relationship]}

    config = {}
    config["system"] = json.dumps("You are an extraction assistant.")
    config["template-index"] = json.dumps(
        ["extract_simple", "extract_with_schema", "extract_mixed"]
    )
    config["template.extract_simple"] = json.dumps({
        "prompt": "Extract entities from: {{ text }}",
        "response-type": "jsonl"
    })
    config["template.extract_with_schema"] = json.dumps({
        "prompt": "Extract definitions from: {{ text }}",
        "response-type": "jsonl",
        "schema": definition_only_schema
    })
    config["template.extract_mixed"] = json.dumps({
        "prompt": "Extract knowledge from: {{ text }}",
        "response-type": "jsonl",
        "schema": mixed_schema
    })
    return config
@pytest.fixture
def prompt_manager(self, jsonl_config):
    """Return a new PromptManager that has loaded ``jsonl_config``."""
    manager = PromptManager()
    manager.load_config(jsonl_config)
    return manager
def test_parse_jsonl_basic(self, prompt_manager):
    """Two newline-separated JSON objects parse into a two-item list."""
    jsonl_text = (
        '{"entity": "cat", "definition": "A small furry animal"}\n'
        '{"entity": "dog", "definition": "A loyal pet"}'
    )

    parsed = prompt_manager.parse_jsonl(jsonl_text)

    assert len(parsed) == 2
    for position, entity_name in enumerate(("cat", "dog")):
        assert parsed[position]["entity"] == entity_name
def test_parse_jsonl_with_empty_lines(self, prompt_manager):
    """Blank lines between and after records produce no entries."""
    padded_text = '{"entity": "cat"}\n\n\n{"entity": "dog"}\n'

    parsed = prompt_manager.parse_jsonl(padded_text)

    # Only the two non-empty lines count.
    assert len(parsed) == 2
def test_parse_jsonl_with_markdown_fences(self, prompt_manager):
"""Test JSONL parsing strips markdown code fences"""
text = '''```json
{"entity": "cat", "definition": "A furry animal"}
{"entity": "dog", "definition": "A loyal pet"}
```'''
result = prompt_manager.parse_jsonl(text)
assert len(result) == 2
assert result[0]["entity"] == "cat"
assert result[1]["entity"] == "dog"
def test_parse_jsonl_with_jsonl_fence(self, prompt_manager):
"""Test JSONL parsing strips jsonl-marked code fences"""
text = '''```jsonl
{"entity": "cat"}
{"entity": "dog"}
```'''
result = prompt_manager.parse_jsonl(text)
assert len(result) == 2
def test_parse_jsonl_truncation_resilience(self, prompt_manager):
"""Test JSONL parsing handles truncated final line"""
text = '{"entity": "cat", "definition": "Complete"}\n{"entity": "dog", "defi'
result = prompt_manager.parse_jsonl(text)
# Should get the first valid object, skip the truncated one
assert len(result) == 1
assert result[0]["entity"] == "cat"
def test_parse_jsonl_invalid_lines_skipped(self, prompt_manager):
"""Test JSONL parsing skips invalid JSON lines"""
text = '''{"entity": "valid1"}
not json at all
{"entity": "valid2"}
{broken json
{"entity": "valid3"}'''
result = prompt_manager.parse_jsonl(text)
assert len(result) == 3
assert result[0]["entity"] == "valid1"
assert result[1]["entity"] == "valid2"
assert result[2]["entity"] == "valid3"
def test_parse_jsonl_empty_input(self, prompt_manager):
"""Test JSONL parsing with empty input"""
result = prompt_manager.parse_jsonl("")
assert result == []
result = prompt_manager.parse_jsonl("\n\n\n")
assert result == []
@pytest.mark.asyncio
async def test_invoke_jsonl_response(self, prompt_manager):
"""Test invoking a prompt with JSONL response"""
mock_llm = AsyncMock()
mock_llm.return_value = '{"entity": "photosynthesis", "definition": "Plant process"}\n{"entity": "mitosis", "definition": "Cell division"}'
result = await prompt_manager.invoke(
"extract_simple",
{"text": "Biology text"},
mock_llm
)
assert isinstance(result, list)
assert len(result) == 2
assert result[0]["entity"] == "photosynthesis"
assert result[1]["entity"] == "mitosis"
@pytest.mark.asyncio
async def test_invoke_jsonl_with_schema_validation(self, prompt_manager):
"""Test JSONL response with schema validation"""
mock_llm = AsyncMock()
mock_llm.return_value = '{"entity": "cat", "definition": "A pet"}\n{"entity": "dog", "definition": "Another pet"}'
result = await prompt_manager.invoke(
"extract_with_schema",
{"text": "Animal text"},
mock_llm
)
assert len(result) == 2
assert all("entity" in obj and "definition" in obj for obj in result)
@pytest.mark.asyncio
async def test_invoke_jsonl_schema_filters_invalid(self, prompt_manager):
"""Test JSONL schema validation filters out invalid objects"""
mock_llm = AsyncMock()
# Second object is missing required 'definition' field
mock_llm.return_value = '{"entity": "valid", "definition": "Has both fields"}\n{"entity": "invalid_missing_definition"}\n{"entity": "also_valid", "definition": "Complete"}'
result = await prompt_manager.invoke(
"extract_with_schema",
{"text": "Test text"},
mock_llm
)
# Only the two valid objects should be returned
assert len(result) == 2
assert result[0]["entity"] == "valid"
assert result[1]["entity"] == "also_valid"
@pytest.mark.asyncio
async def test_invoke_jsonl_mixed_types(self, prompt_manager):
"""Test JSONL with discriminated union schema (oneOf)"""
mock_llm = AsyncMock()
mock_llm.return_value = '''{"type": "definition", "entity": "DNA", "definition": "Genetic material"}
{"type": "relationship", "subject": "DNA", "predicate": "found_in", "object": "nucleus"}
{"type": "definition", "entity": "RNA", "definition": "Messenger molecule"}'''
result = await prompt_manager.invoke(
"extract_mixed",
{"text": "Biology text"},
mock_llm
)
assert len(result) == 3
# Check definitions
definitions = [r for r in result if r.get("type") == "definition"]
assert len(definitions) == 2
# Check relationships
relationships = [r for r in result if r.get("type") == "relationship"]
assert len(relationships) == 1
assert relationships[0]["subject"] == "DNA"
@pytest.mark.asyncio
async def test_invoke_jsonl_empty_result(self, prompt_manager):
"""Test JSONL response that yields no valid objects"""
mock_llm = AsyncMock()
mock_llm.return_value = "No JSON here at all"
result = await prompt_manager.invoke(
"extract_simple",
{"text": "Test"},
mock_llm
)
assert result == []
@pytest.mark.asyncio
async def test_invoke_jsonl_without_schema(self, prompt_manager):
"""Test JSONL response without schema validation"""
mock_llm = AsyncMock()
mock_llm.return_value = '{"any": "structure"}\n{"completely": "different"}'
result = await prompt_manager.invoke(
"extract_simple",
{"text": "Test"},
mock_llm
)
assert len(result) == 2
assert result[0] == {"any": "structure"}
assert result[1] == {"completely": "different"}