Feature: document metadata (#123)

* Rework metadata structure in processing messages to be a subgraph
* Add subgraph creation for tg-load-pdf and tg-load-text based on command-line passing of doc attributes
* Document metadata is added to knowledge graph with subjectOf linkage to extracted entities
This commit is contained in:
cybermaggedon 2024-10-23 18:04:04 +01:00 committed by GitHub
parent b8818e28d0
commit 7954e863cc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 625 additions and 98 deletions

View file

@ -63,16 +63,8 @@ class Processor(ConsumerProducer):
for ix, chunk in enumerate(texts):
id = v.metadata.id + "-c" + str(ix)
r = Chunk(
metadata=Metadata(
source=v.metadata.source,
id=id,
title=v.metadata.title,
user=v.metadata.user,
collection=v.metadata.collection,
),
metadata=v.metadata,
chunk=chunk.page_content.encode("utf-8"),
)

View file

@ -62,16 +62,8 @@ class Processor(ConsumerProducer):
for ix, chunk in enumerate(texts):
id = v.metadata.id + "-c" + str(ix)
r = Chunk(
metadata=Metadata(
source=v.metadata.source,
id=id,
title=v.metadata.title,
user=v.metadata.user,
collection=v.metadata.collection,
),
metadata=v.metadata,
chunk=chunk.page_content.encode("utf-8"),
)

View file

@ -59,15 +59,8 @@ class Processor(ConsumerProducer):
for ix, page in enumerate(pages):
id = v.metadata.id + "-p" + str(ix)
r = TextDocument(
metadata=Metadata(
source=v.metadata.source,
title=v.metadata.title,
id=id,
user=v.metadata.user,
collection=v.metadata.collection,
),
metadata=v.metadata,
text=page.page_content.encode("utf-8"),
)

View file

@ -7,16 +7,18 @@ get entity definitions which are output as graph edges.
import urllib.parse
import json
from .... schema import ChunkEmbeddings, Triple, Metadata, Value
from .... schema import ChunkEmbeddings, Triple, Triples, Metadata, Value
from .... schema import chunk_embeddings_ingest_queue, triples_store_queue
from .... schema import prompt_request_queue
from .... schema import prompt_response_queue
from .... log_level import LogLevel
from .... clients.prompt_client import PromptClient
from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION
from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
from .... base import ConsumerProducer
DEFINITION_VALUE = Value(value=DEFINITION, is_uri=True)
RDF_LABEL_VALUE = Value(value=RDF_LABEL, is_uri=True)
SUBJECT_OF_VALUE = Value(value=SUBJECT_OF, is_uri=True)
module = ".".join(__name__.split(".")[1:-1])
@ -44,7 +46,7 @@ class Processor(ConsumerProducer):
"output_queue": output_queue,
"subscriber": subscriber,
"input_schema": ChunkEmbeddings,
"output_schema": Triple,
"output_schema": Triples,
"prompt_request_queue": pr_request_queue,
"prompt_response_queue": pr_response_queue,
}
@ -69,9 +71,12 @@ class Processor(ConsumerProducer):
return self.prompt.request_definitions(chunk)
def emit_edge(self, metadata, s, p, o):
def emit_edges(self, metadata, triples):
t = Triple(metadata=metadata, s=s, p=p, o=o)
t = Triples(
metadata=metadata,
triples=triples,
)
self.producer.send(t)
def handle(self, msg):
@ -85,6 +90,13 @@ class Processor(ConsumerProducer):
defs = self.get_definitions(chunk)
triples = []
# FIXME: Putting metadata into triples store is duplicated in
# relationships extractor too
for t in v.metadata.metadata:
triples.append(t)
for defn in defs:
s = defn.name
@ -101,7 +113,31 @@ class Processor(ConsumerProducer):
s_value = Value(value=str(s_uri), is_uri=True)
o_value = Value(value=str(o), is_uri=False)
self.emit_edge(v.metadata, s_value, DEFINITION_VALUE, o_value)
triples.append(Triple(
s=s_value,
p=RDF_LABEL_VALUE,
o=Value(value=s, is_uri=False),
))
triples.append(Triple(
s=s_value, p=DEFINITION_VALUE, o=o_value
))
triples.append(Triple(
s=s_value,
p=SUBJECT_OF_VALUE,
o=Value(value=v.metadata.id, is_uri=True)
))
self.emit_edges(
Metadata(
id=v.metadata.id,
metadata=[],
user=v.metadata.user,
collection=v.metadata.collection,
),
triples
)
except Exception as e:
print("Exception: ", e, flush=True)

View file

@ -9,7 +9,7 @@ import urllib.parse
import os
from pulsar.schema import JsonSchema
from .... schema import ChunkEmbeddings, Triple, GraphEmbeddings
from .... schema import ChunkEmbeddings, Triple, Triples, GraphEmbeddings
from .... schema import Metadata, Value
from .... schema import chunk_embeddings_ingest_queue, triples_store_queue
from .... schema import graph_embeddings_store_queue
@ -17,10 +17,11 @@ from .... schema import prompt_request_queue
from .... schema import prompt_response_queue
from .... log_level import LogLevel
from .... clients.prompt_client import PromptClient
from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES
from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF
from .... base import ConsumerProducer
RDF_LABEL_VALUE = Value(value=RDF_LABEL, is_uri=True)
SUBJECT_OF_VALUE = Value(value=SUBJECT_OF, is_uri=True)
module = ".".join(__name__.split(".")[1:-1])
@ -50,7 +51,7 @@ class Processor(ConsumerProducer):
"output_queue": output_queue,
"subscriber": subscriber,
"input_schema": ChunkEmbeddings,
"output_schema": Triple,
"output_schema": Triples,
"prompt_request_queue": pr_request_queue,
"prompt_response_queue": pr_response_queue,
}
@ -69,7 +70,7 @@ class Processor(ConsumerProducer):
"prompt_response_queue": pr_response_queue,
"subscriber": subscriber,
"input_schema": ChunkEmbeddings.__name__,
"output_schema": Triple.__name__,
"output_schema": Triples.__name__,
"vector_schema": GraphEmbeddings.__name__,
})
@ -92,9 +93,12 @@ class Processor(ConsumerProducer):
return self.prompt.request_relationships(chunk)
def emit_edge(self, metadata, s, p, o):
def emit_edges(self, metadata, triples):
t = Triple(metadata=metadata, s=s, p=p, o=o)
t = Triples(
metadata=metadata,
triples=triples,
)
self.producer.send(t)
def emit_vec(self, metadata, ent, vec):
@ -113,6 +117,13 @@ class Processor(ConsumerProducer):
rels = self.get_relationships(chunk)
triples = []
# FIXME: Putting metadata into triples store is duplicated in
# definitions extractor too
for t in v.metadata.metadata:
triples.append(t)
for rel in rels:
s = rel.s
@ -139,43 +150,65 @@ class Processor(ConsumerProducer):
else:
o_value = Value(value=str(o), is_uri=False)
self.emit_edge(
v.metadata,
s_value,
p_value,
o_value
)
triples.append(Triple(
s=s_value,
p=p_value,
o=o_value
))
# Label for s
self.emit_edge(
v.metadata,
s_value,
RDF_LABEL_VALUE,
Value(value=str(s), is_uri=False)
)
triples.append(Triple(
s=s_value,
p=RDF_LABEL_VALUE,
o=Value(value=str(s), is_uri=False)
))
# Label for p
self.emit_edge(
v.metadata,
p_value,
RDF_LABEL_VALUE,
Value(value=str(p), is_uri=False)
)
triples.append(Triple(
s=p_value,
p=RDF_LABEL_VALUE,
o=Value(value=str(p), is_uri=False)
))
if rel.o_entity:
# Label for o
self.emit_edge(
v.metadata,
o_value,
RDF_LABEL_VALUE,
Value(value=str(o), is_uri=False)
)
triples.append(Triple(
s=o_value,
p=RDF_LABEL_VALUE,
o=Value(value=str(o), is_uri=False)
))
# 'Subject of' for s
triples.append(Triple(
s=s_value,
p=SUBJECT_OF_VALUE,
o=Value(value=v.metadata.id, is_uri=True)
))
if rel.o_entity:
    # 'Subject of' for o: only emitted when the object is an entity.
    # Uses the SUBJECT_OF predicate, mirroring the 'Subject of' triple
    # emitted for s; using RDF_LABEL here would instead attach the
    # metadata id to the object as a second label.
    triples.append(Triple(
        s=o_value,
        p=SUBJECT_OF_VALUE,
        o=Value(value=v.metadata.id, is_uri=True)
    ))
self.emit_vec(v.metadata, s_value, v.vectors)
self.emit_vec(v.metadata, p_value, v.vectors)
if rel.o_entity:
self.emit_vec(v.metadata, o_value, v.vectors)
self.emit_edges(
Metadata(
id=v.metadata.id,
metadata=[],
user=v.metadata.user,
collection=v.metadata.collection,
),
triples
)
except Exception as e:
print("Exception: ", e, flush=True)

View file

@ -7,7 +7,7 @@ get entity definitions which are output as graph edges.
import urllib.parse
import json
from .... schema import ChunkEmbeddings, Triple, Metadata, Value
from .... schema import ChunkEmbeddings, Triple, Triples, Metadata, Value
from .... schema import chunk_embeddings_ingest_queue, triples_store_queue
from .... schema import prompt_request_queue
from .... schema import prompt_response_queue
@ -44,7 +44,7 @@ class Processor(ConsumerProducer):
"output_queue": output_queue,
"subscriber": subscriber,
"input_schema": ChunkEmbeddings,
"output_schema": Triple,
"output_schema": Triples,
"prompt_request_queue": pr_request_queue,
"prompt_response_queue": pr_response_queue,
}
@ -71,7 +71,10 @@ class Processor(ConsumerProducer):
def emit_edge(self, metadata, s, p, o):
t = Triple(metadata=metadata, s=s, p=p, o=o)
t = Triples(
metadata=metadata,
triples=[Triple(s=s, p=p, o=o)],
)
self.producer.send(t)
def handle(self, msg):

View file

@ -10,7 +10,7 @@ import argparse
import time
from .... direct.cassandra import TrustGraph
from .... schema import Triple
from .... schema import Triple, Triples
from .... schema import triples_store_queue
from .... log_level import LogLevel
from .... base import Consumer
@ -33,7 +33,7 @@ class Processor(Consumer):
**params | {
"input_queue": input_queue,
"subscriber": subscriber,
"input_schema": Triple,
"input_schema": Triples,
"graph_host": graph_host,
}
)
@ -62,12 +62,13 @@ class Processor(Consumer):
raise e
self.table = table
self.tg.insert(
v.s.value,
v.p.value,
v.o.value
)
for t in v.triples:
self.tg.insert(
t.s.value,
t.p.value,
t.o.value
)
@staticmethod
def add_args(parser):

View file

@ -116,14 +116,16 @@ class Processor(Consumer):
v = msg.value()
self.create_node(v.s.value)
for t in v.triples:
if v.o.is_uri:
self.create_node(v.o.value)
self.relate_node(v.s.value, v.p.value, v.o.value)
else:
self.create_literal(v.o.value)
self.relate_literal(v.s.value, v.p.value, v.o.value)
self.create_node(t.s.value)
# Branch on the current triple's object. v is the whole batch
# (iterated via v.triples); the per-triple s/p/o live on t.
if t.o.is_uri:
    # Object is a URI: store it as a node and relate node-to-node.
    self.create_node(t.o.value)
    self.relate_node(t.s.value, t.p.value, t.o.value)
else:
    # Object is not a URI: store it as a literal instead.
    self.create_literal(t.o.value)
    self.relate_literal(t.s.value, t.p.value, t.o.value)
@staticmethod
def add_args(parser):