mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-28 09:56:22 +02:00
Feature: document metadata (#123)
* Rework the metadata structure in processing messages to be a subgraph.
* Add subgraph creation to tg-load-pdf and tg-load-text, based on document attributes passed on the command line.
* Add document metadata to the knowledge graph, with subjectOf linkage to extracted entities.
This commit is contained in:
parent
b8818e28d0
commit
7954e863cc
21 changed files with 625 additions and 98 deletions
|
|
@ -63,16 +63,8 @@ class Processor(ConsumerProducer):
|
|||
|
||||
for ix, chunk in enumerate(texts):
|
||||
|
||||
id = v.metadata.id + "-c" + str(ix)
|
||||
|
||||
r = Chunk(
|
||||
metadata=Metadata(
|
||||
source=v.metadata.source,
|
||||
id=id,
|
||||
title=v.metadata.title,
|
||||
user=v.metadata.user,
|
||||
collection=v.metadata.collection,
|
||||
),
|
||||
metadata=v.metadata,
|
||||
chunk=chunk.page_content.encode("utf-8"),
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -62,16 +62,8 @@ class Processor(ConsumerProducer):
|
|||
|
||||
for ix, chunk in enumerate(texts):
|
||||
|
||||
id = v.metadata.id + "-c" + str(ix)
|
||||
|
||||
r = Chunk(
|
||||
metadata=Metadata(
|
||||
source=v.metadata.source,
|
||||
id=id,
|
||||
title=v.metadata.title,
|
||||
user=v.metadata.user,
|
||||
collection=v.metadata.collection,
|
||||
),
|
||||
metadata=v.metadata,
|
||||
chunk=chunk.page_content.encode("utf-8"),
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -59,15 +59,8 @@ class Processor(ConsumerProducer):
|
|||
|
||||
for ix, page in enumerate(pages):
|
||||
|
||||
id = v.metadata.id + "-p" + str(ix)
|
||||
r = TextDocument(
|
||||
metadata=Metadata(
|
||||
source=v.metadata.source,
|
||||
title=v.metadata.title,
|
||||
id=id,
|
||||
user=v.metadata.user,
|
||||
collection=v.metadata.collection,
|
||||
),
|
||||
metadata=v.metadata,
|
||||
text=page.page_content.encode("utf-8"),
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -7,16 +7,18 @@ get entity definitions which are output as graph edges.
|
|||
import urllib.parse
|
||||
import json
|
||||
|
||||
from .... schema import ChunkEmbeddings, Triple, Metadata, Value
|
||||
from .... schema import ChunkEmbeddings, Triple, Triples, Metadata, Value
|
||||
from .... schema import chunk_embeddings_ingest_queue, triples_store_queue
|
||||
from .... schema import prompt_request_queue
|
||||
from .... schema import prompt_response_queue
|
||||
from .... log_level import LogLevel
|
||||
from .... clients.prompt_client import PromptClient
|
||||
from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION
|
||||
from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
|
||||
from .... base import ConsumerProducer
|
||||
|
||||
DEFINITION_VALUE = Value(value=DEFINITION, is_uri=True)
|
||||
RDF_LABEL_VALUE = Value(value=RDF_LABEL, is_uri=True)
|
||||
SUBJECT_OF_VALUE = Value(value=SUBJECT_OF, is_uri=True)
|
||||
|
||||
module = ".".join(__name__.split(".")[1:-1])
|
||||
|
||||
|
|
@ -44,7 +46,7 @@ class Processor(ConsumerProducer):
|
|||
"output_queue": output_queue,
|
||||
"subscriber": subscriber,
|
||||
"input_schema": ChunkEmbeddings,
|
||||
"output_schema": Triple,
|
||||
"output_schema": Triples,
|
||||
"prompt_request_queue": pr_request_queue,
|
||||
"prompt_response_queue": pr_response_queue,
|
||||
}
|
||||
|
|
@ -69,9 +71,12 @@ class Processor(ConsumerProducer):
|
|||
|
||||
return self.prompt.request_definitions(chunk)
|
||||
|
||||
def emit_edge(self, metadata, s, p, o):
|
||||
def emit_edges(self, metadata, triples):
|
||||
|
||||
t = Triple(metadata=metadata, s=s, p=p, o=o)
|
||||
t = Triples(
|
||||
metadata=metadata,
|
||||
triples=triples,
|
||||
)
|
||||
self.producer.send(t)
|
||||
|
||||
def handle(self, msg):
|
||||
|
|
@ -85,6 +90,13 @@ class Processor(ConsumerProducer):
|
|||
|
||||
defs = self.get_definitions(chunk)
|
||||
|
||||
triples = []
|
||||
|
||||
# FIXME: Putting metadata into triples store is duplicated in
|
||||
# relationships extractor too
|
||||
for t in v.metadata.metadata:
|
||||
triples.append(t)
|
||||
|
||||
for defn in defs:
|
||||
|
||||
s = defn.name
|
||||
|
|
@ -101,7 +113,31 @@ class Processor(ConsumerProducer):
|
|||
s_value = Value(value=str(s_uri), is_uri=True)
|
||||
o_value = Value(value=str(o), is_uri=False)
|
||||
|
||||
self.emit_edge(v.metadata, s_value, DEFINITION_VALUE, o_value)
|
||||
triples.append(Triple(
|
||||
s=s_value,
|
||||
p=RDF_LABEL_VALUE,
|
||||
o=Value(value=s, is_uri=False),
|
||||
))
|
||||
|
||||
triples.append(Triple(
|
||||
s=s_value, p=DEFINITION_VALUE, o=o_value
|
||||
))
|
||||
|
||||
triples.append(Triple(
|
||||
s=s_value,
|
||||
p=SUBJECT_OF_VALUE,
|
||||
o=Value(value=v.metadata.id, is_uri=True)
|
||||
))
|
||||
|
||||
self.emit_edges(
|
||||
Metadata(
|
||||
id=v.metadata.id,
|
||||
metadata=[],
|
||||
user=v.metadata.user,
|
||||
collection=v.metadata.collection,
|
||||
),
|
||||
triples
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
print("Exception: ", e, flush=True)
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ import urllib.parse
|
|||
import os
|
||||
from pulsar.schema import JsonSchema
|
||||
|
||||
from .... schema import ChunkEmbeddings, Triple, GraphEmbeddings
|
||||
from .... schema import ChunkEmbeddings, Triple, Triples, GraphEmbeddings
|
||||
from .... schema import Metadata, Value
|
||||
from .... schema import chunk_embeddings_ingest_queue, triples_store_queue
|
||||
from .... schema import graph_embeddings_store_queue
|
||||
|
|
@ -17,10 +17,11 @@ from .... schema import prompt_request_queue
|
|||
from .... schema import prompt_response_queue
|
||||
from .... log_level import LogLevel
|
||||
from .... clients.prompt_client import PromptClient
|
||||
from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES
|
||||
from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF
|
||||
from .... base import ConsumerProducer
|
||||
|
||||
RDF_LABEL_VALUE = Value(value=RDF_LABEL, is_uri=True)
|
||||
SUBJECT_OF_VALUE = Value(value=SUBJECT_OF, is_uri=True)
|
||||
|
||||
module = ".".join(__name__.split(".")[1:-1])
|
||||
|
||||
|
|
@ -50,7 +51,7 @@ class Processor(ConsumerProducer):
|
|||
"output_queue": output_queue,
|
||||
"subscriber": subscriber,
|
||||
"input_schema": ChunkEmbeddings,
|
||||
"output_schema": Triple,
|
||||
"output_schema": Triples,
|
||||
"prompt_request_queue": pr_request_queue,
|
||||
"prompt_response_queue": pr_response_queue,
|
||||
}
|
||||
|
|
@ -69,7 +70,7 @@ class Processor(ConsumerProducer):
|
|||
"prompt_response_queue": pr_response_queue,
|
||||
"subscriber": subscriber,
|
||||
"input_schema": ChunkEmbeddings.__name__,
|
||||
"output_schema": Triple.__name__,
|
||||
"output_schema": Triples.__name__,
|
||||
"vector_schema": GraphEmbeddings.__name__,
|
||||
})
|
||||
|
||||
|
|
@ -92,9 +93,12 @@ class Processor(ConsumerProducer):
|
|||
|
||||
return self.prompt.request_relationships(chunk)
|
||||
|
||||
def emit_edge(self, metadata, s, p, o):
|
||||
def emit_edges(self, metadata, triples):
|
||||
|
||||
t = Triple(metadata=metadata, s=s, p=p, o=o)
|
||||
t = Triples(
|
||||
metadata=metadata,
|
||||
triples=triples,
|
||||
)
|
||||
self.producer.send(t)
|
||||
|
||||
def emit_vec(self, metadata, ent, vec):
|
||||
|
|
@ -113,6 +117,13 @@ class Processor(ConsumerProducer):
|
|||
|
||||
rels = self.get_relationships(chunk)
|
||||
|
||||
triples = []
|
||||
|
||||
# FIXME: Putting metadata into triples store is duplicated in
|
||||
# definitions extractor too
|
||||
for t in v.metadata.metadata:
|
||||
triples.append(t)
|
||||
|
||||
for rel in rels:
|
||||
|
||||
s = rel.s
|
||||
|
|
@ -139,43 +150,65 @@ class Processor(ConsumerProducer):
|
|||
else:
|
||||
o_value = Value(value=str(o), is_uri=False)
|
||||
|
||||
self.emit_edge(
|
||||
v.metadata,
|
||||
s_value,
|
||||
p_value,
|
||||
o_value
|
||||
)
|
||||
triples.append(Triple(
|
||||
s=s_value,
|
||||
p=p_value,
|
||||
o=o_value
|
||||
))
|
||||
|
||||
# Label for s
|
||||
self.emit_edge(
|
||||
v.metadata,
|
||||
s_value,
|
||||
RDF_LABEL_VALUE,
|
||||
Value(value=str(s), is_uri=False)
|
||||
)
|
||||
triples.append(Triple(
|
||||
s=s_value,
|
||||
p=RDF_LABEL_VALUE,
|
||||
o=Value(value=str(s), is_uri=False)
|
||||
))
|
||||
|
||||
# Label for p
|
||||
self.emit_edge(
|
||||
v.metadata,
|
||||
p_value,
|
||||
RDF_LABEL_VALUE,
|
||||
Value(value=str(p), is_uri=False)
|
||||
)
|
||||
triples.append(Triple(
|
||||
s=p_value,
|
||||
p=RDF_LABEL_VALUE,
|
||||
o=Value(value=str(p), is_uri=False)
|
||||
))
|
||||
|
||||
if rel.o_entity:
|
||||
# Label for o
|
||||
self.emit_edge(
|
||||
v.metadata,
|
||||
o_value,
|
||||
RDF_LABEL_VALUE,
|
||||
Value(value=str(o), is_uri=False)
|
||||
)
|
||||
triples.append(Triple(
|
||||
s=o_value,
|
||||
p=RDF_LABEL_VALUE,
|
||||
o=Value(value=str(o), is_uri=False)
|
||||
))
|
||||
|
||||
# 'Subject of' for s
|
||||
triples.append(Triple(
|
||||
s=s_value,
|
||||
p=SUBJECT_OF_VALUE,
|
||||
o=Value(value=v.metadata.id, is_uri=True)
|
||||
))
|
||||
|
||||
if rel.o_entity:
|
||||
# 'Subject of' for o
|
||||
triples.append(Triple(
|
||||
s=o_value,
|
||||
# 'Subject of' link: ties the object entity to the source document id,
# mirroring the triple emitted for the subject entity (not a label).
p=SUBJECT_OF_VALUE,
|
||||
o=Value(value=v.metadata.id, is_uri=True)
|
||||
))
|
||||
|
||||
self.emit_vec(v.metadata, s_value, v.vectors)
|
||||
self.emit_vec(v.metadata, p_value, v.vectors)
|
||||
|
||||
if rel.o_entity:
|
||||
self.emit_vec(v.metadata, o_value, v.vectors)
|
||||
|
||||
self.emit_edges(
|
||||
Metadata(
|
||||
id=v.metadata.id,
|
||||
metadata=[],
|
||||
user=v.metadata.user,
|
||||
collection=v.metadata.collection,
|
||||
),
|
||||
triples
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
print("Exception: ", e, flush=True)
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ get entity definitions which are output as graph edges.
|
|||
import urllib.parse
|
||||
import json
|
||||
|
||||
from .... schema import ChunkEmbeddings, Triple, Metadata, Value
|
||||
from .... schema import ChunkEmbeddings, Triple, Triples, Metadata, Value
|
||||
from .... schema import chunk_embeddings_ingest_queue, triples_store_queue
|
||||
from .... schema import prompt_request_queue
|
||||
from .... schema import prompt_response_queue
|
||||
|
|
@ -44,7 +44,7 @@ class Processor(ConsumerProducer):
|
|||
"output_queue": output_queue,
|
||||
"subscriber": subscriber,
|
||||
"input_schema": ChunkEmbeddings,
|
||||
"output_schema": Triple,
|
||||
"output_schema": Triples,
|
||||
"prompt_request_queue": pr_request_queue,
|
||||
"prompt_response_queue": pr_response_queue,
|
||||
}
|
||||
|
|
@ -71,7 +71,10 @@ class Processor(ConsumerProducer):
|
|||
|
||||
def emit_edge(self, metadata, s, p, o):
|
||||
|
||||
t = Triple(metadata=metadata, s=s, p=p, o=o)
|
||||
t = Triples(
|
||||
metadata=metadata,
|
||||
triples=[Triple(s=s, p=p, o=o)],
|
||||
)
|
||||
self.producer.send(t)
|
||||
|
||||
def handle(self, msg):
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ import argparse
|
|||
import time
|
||||
|
||||
from .... direct.cassandra import TrustGraph
|
||||
from .... schema import Triple
|
||||
from .... schema import Triple, Triples
|
||||
from .... schema import triples_store_queue
|
||||
from .... log_level import LogLevel
|
||||
from .... base import Consumer
|
||||
|
|
@ -33,7 +33,7 @@ class Processor(Consumer):
|
|||
**params | {
|
||||
"input_queue": input_queue,
|
||||
"subscriber": subscriber,
|
||||
"input_schema": Triple,
|
||||
"input_schema": Triples,
|
||||
"graph_host": graph_host,
|
||||
}
|
||||
)
|
||||
|
|
@ -62,12 +62,13 @@ class Processor(Consumer):
|
|||
raise e
|
||||
|
||||
self.table = table
|
||||
|
||||
self.tg.insert(
|
||||
v.s.value,
|
||||
v.p.value,
|
||||
v.o.value
|
||||
)
|
||||
|
||||
for t in v.triples:
|
||||
self.tg.insert(
|
||||
t.s.value,
|
||||
t.p.value,
|
||||
t.o.value
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
|
|
|||
|
|
@ -116,14 +116,16 @@ class Processor(Consumer):
|
|||
|
||||
v = msg.value()
|
||||
|
||||
self.create_node(v.s.value)
|
||||
for t in v.triples:
|
||||
|
||||
if v.o.is_uri:
|
||||
self.create_node(v.o.value)
|
||||
self.relate_node(v.s.value, v.p.value, v.o.value)
|
||||
else:
|
||||
self.create_literal(v.o.value)
|
||||
self.relate_literal(v.s.value, v.p.value, v.o.value)
|
||||
self.create_node(t.s.value)
|
||||
|
||||
if v.o.is_uri:
|
||||
self.create_node(t.o.value)
|
||||
self.relate_node(t.s.value, t.p.value, t.o.value)
|
||||
else:
|
||||
self.create_literal(t.o.value)
|
||||
self.relate_literal(t.s.value, t.p.value, t.o.value)
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue