Feature: document metadata (#123)

* Rework metadata structure in processing messages to be a subgraph
* Add subgraph creation for tg-load-pdf and tg-load-text based on command-line passing of doc attributes
* Document metadata is added to knowledge graph with subjectOf linkage to extracted entities
This commit is contained in:
cybermaggedon 2024-10-23 18:04:04 +01:00 committed by GitHub
parent b8818e28d0
commit 7954e863cc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 625 additions and 98 deletions

View file

@ -0,0 +1,6 @@
from . identifier import *
from . publication import *
from . document import *
from . organization import *

View file

@ -0,0 +1,25 @@
# RDF / RDFS core vocabulary
IS_A = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
LABEL = 'http://www.w3.org/2000/01/rdf-schema#label'

# schema.org types
DIGITAL_DOCUMENT = 'https://schema.org/DigitalDocument'
PUBLICATION_EVENT = 'https://schema.org/PublicationEvent'
ORGANIZATION = 'https://schema.org/Organization'

# schema.org properties.  Each constant is defined exactly once; the
# original had duplicate PUBLICATION and DATE_PUBLISHED definitions.
NAME = 'https://schema.org/name'
DESCRIPTION = 'https://schema.org/description'
COPYRIGHT_NOTICE = 'https://schema.org/copyrightNotice'
COPYRIGHT_HOLDER = 'https://schema.org/copyrightHolder'
COPYRIGHT_YEAR = 'https://schema.org/copyrightYear'
LICENSE = 'https://schema.org/license'
PUBLICATION = 'https://schema.org/publication'
START_DATE = 'https://schema.org/startDate'
END_DATE = 'https://schema.org/endDate'
PUBLISHED_BY = 'https://schema.org/publishedBy'
DATE_PUBLISHED = 'https://schema.org/datePublished'
URL = 'https://schema.org/url'
IDENTIFIER = 'https://schema.org/identifier'
KEYWORD = 'https://schema.org/keywords'

View file

@ -0,0 +1,119 @@
from . defs import *
from .. schema import Triple, Value
class DigitalDocument:
    """A schema.org DigitalDocument that emits its metadata as RDF triples.

    All properties are optional except *id*, the URI that identifies the
    document node in the knowledge graph.  *publication*, if given, is a
    PublicationEvent whose own triples are emitted alongside the
    document's.
    """

    def __init__(
            self, id, name=None, description=None, copyright_notice=None,
            copyright_holder=None, copyright_year=None, license=None,
            identifier=None,
            publication=None, url=None, keywords=None
    ):
        self.id = id
        self.name = name
        self.description = description
        self.copyright_notice = copyright_notice
        self.copyright_holder = copyright_holder
        self.copyright_year = copyright_year
        self.license = license
        self.publication = publication
        self.url = url
        self.identifier = identifier
        # Fix: the original used a mutable default argument (keywords=[])
        # which would be shared across instances
        self.keywords = keywords if keywords is not None else []

    def emit(self, emit):
        """Call emit(Triple) once for the type edge and once per set property."""
        emit(Triple(
            s=Value(value=self.id, is_uri=True),
            p=Value(value=IS_A, is_uri=True),
            o=Value(value=DIGITAL_DOCUMENT, is_uri=True)
        ))
        if self.name:
            # The name is emitted twice: as an rdfs:label and as schema:name
            emit(Triple(
                s=Value(value=self.id, is_uri=True),
                p=Value(value=LABEL, is_uri=True),
                o=Value(value=self.name, is_uri=False)
            ))
            emit(Triple(
                s=Value(value=self.id, is_uri=True),
                p=Value(value=NAME, is_uri=True),
                o=Value(value=self.name, is_uri=False)
            ))
        if self.identifier:
            emit(Triple(
                # Fix: original passed Value(value=id, ...), which referenced
                # the *builtin* id function rather than this document's URI
                s=Value(value=self.id, is_uri=True),
                p=Value(value=IDENTIFIER, is_uri=True),
                o=Value(value=self.identifier, is_uri=False)
            ))
        if self.description:
            emit(Triple(
                s=Value(value=self.id, is_uri=True),
                p=Value(value=DESCRIPTION, is_uri=True),
                o=Value(value=self.description, is_uri=False)
            ))
        if self.copyright_notice:
            emit(Triple(
                s=Value(value=self.id, is_uri=True),
                p=Value(value=COPYRIGHT_NOTICE, is_uri=True),
                o=Value(value=self.copyright_notice, is_uri=False)
            ))
        if self.copyright_holder:
            emit(Triple(
                s=Value(value=self.id, is_uri=True),
                p=Value(value=COPYRIGHT_HOLDER, is_uri=True),
                o=Value(value=self.copyright_holder, is_uri=False)
            ))
        if self.copyright_year:
            emit(Triple(
                s=Value(value=self.id, is_uri=True),
                p=Value(value=COPYRIGHT_YEAR, is_uri=True),
                o=Value(value=self.copyright_year, is_uri=False)
            ))
        if self.license:
            emit(Triple(
                s=Value(value=self.id, is_uri=True),
                p=Value(value=LICENSE, is_uri=True),
                o=Value(value=self.license, is_uri=False)
            ))
        if self.keywords:
            # One schema:keywords triple per keyword
            for k in self.keywords:
                emit(Triple(
                    s=Value(value=self.id, is_uri=True),
                    p=Value(value=KEYWORD, is_uri=True),
                    o=Value(value=k, is_uri=False)
                ))
        if self.publication:
            emit(Triple(
                s=Value(value=self.id, is_uri=True),
                p=Value(value=PUBLICATION, is_uri=True),
                o=Value(value=self.publication.id, is_uri=True)
            ))
            # Recursively emit the publication event's own subgraph
            self.publication.emit(emit)
        if self.url:
            emit(Triple(
                s=Value(value=self.id, is_uri=True),
                p=Value(value=URL, is_uri=True),
                o=Value(value=self.url, is_uri=True)
            ))

View file

@ -0,0 +1,23 @@
import uuid
import hashlib
def hash(data):
    """Return a deterministic UUID string derived from *data*.

    Accepts str or bytes; str input is UTF-8 encoded first.  NOTE: this
    deliberately shadows the builtin hash() within this module.
    """
    if isinstance(data, str):
        data = data.encode("utf-8")
    # SHA256 digest rendered as 64 hex characters
    digest = hashlib.sha256(data).hexdigest()
    # Keep every other hex character: 64 hex chars (256 bits) become
    # 32 hex chars (128 bits), exactly the size of a UUID
    return str(uuid.UUID(digest[::2]))
def to_uri(pref, id):
    """Build a trustgraph.ai URI from a kind prefix and an identifier."""
    return "/".join(("https://trustgraph.ai", pref, id))

# URI prefixes for the different knowledge-graph node kinds
PREF_PUBEV = "pubev"   # publication events
PREF_ORG = "org"       # organizations
PREF_DOC = "doc"       # documents

View file

@ -0,0 +1,40 @@
from . defs import *
from .. schema import Triple, Value
class Organization:
    """A schema.org Organization that emits its metadata as RDF triples."""

    def __init__(self, id, name=None, description=None):
        self.id = id
        self.name = name
        self.description = description

    def emit(self, emit):
        """Call emit(Triple) for the type edge plus any properties set."""

        subject = Value(value=self.id, is_uri=True)

        def edge(pred, obj):
            # Emit one triple rooted at this organization's URI
            emit(Triple(s=subject, p=Value(value=pred, is_uri=True), o=obj))

        edge(IS_A, Value(value=ORGANIZATION, is_uri=True))

        if self.name:
            # The name doubles as rdfs:label and schema:name
            edge(LABEL, Value(value=self.name, is_uri=False))
            edge(NAME, Value(value=self.name, is_uri=False))

        if self.description:
            edge(DESCRIPTION, Value(value=self.description, is_uri=False))

View file

@ -0,0 +1,70 @@
from . defs import *
from .. schema import Triple, Value
class PublicationEvent:
    """A schema.org PublicationEvent that emits its metadata as RDF triples.

    If *organization* is set, a publishedBy edge is emitted and the
    organization's own triples are emitted as well.
    """

    def __init__(
            self, id, organization=None, name=None, description=None,
            start_date=None, end_date=None,
    ):
        self.id = id
        self.organization = organization
        self.name = name
        self.description = description
        self.start_date = start_date
        self.end_date = end_date

    def emit(self, emit):
        """Call emit(Triple) for the type edge plus any properties set."""

        subject = Value(value=self.id, is_uri=True)

        def literal(text):
            return Value(value=text, is_uri=False)

        def edge(pred, obj):
            # Emit one triple rooted at this event's URI
            emit(Triple(s=subject, p=Value(value=pred, is_uri=True), o=obj))

        edge(IS_A, Value(value=PUBLICATION_EVENT, is_uri=True))

        if self.name:
            # The name doubles as rdfs:label and schema:name
            edge(LABEL, literal(self.name))
            edge(NAME, literal(self.name))

        if self.description:
            edge(DESCRIPTION, literal(self.description))

        if self.organization:
            # Link to the publisher, then emit the publisher's subgraph
            edge(PUBLISHED_BY, Value(value=self.organization.id, is_uri=True))
            self.organization.emit(emit)

        if self.start_date:
            edge(START_DATE, literal(self.start_date))

        if self.end_date:
            edge(END_DATE, literal(self.end_date))

View file

@ -1,6 +1,7 @@
RDF_LABEL = "http://www.w3.org/2000/01/rdf-schema#label" RDF_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
DEFINITION = "http://www.w3.org/2004/02/skos/core#definition" DEFINITION = "http://www.w3.org/2004/02/skos/core#definition"
SUBJECT_OF = "https://schema.org/subjectOf"
TRUSTGRAPH_ENTITIES = "http://trustgraph.ai/e/" TRUSTGRAPH_ENTITIES = "http://trustgraph.ai/e/"

View file

@ -9,5 +9,3 @@ from . graph import *
from . retrieval import * from . retrieval import *
from . metadata import * from . metadata import *

View file

@ -1,7 +1,7 @@
from pulsar.schema import Record, Bytes, String, Boolean, Integer, Array, Double from pulsar.schema import Record, Bytes, String, Boolean, Integer, Array, Double
from . types import Error, Value from . types import Error, Value, Triple
from . topic import topic from . topic import topic
from . metadata import Metadata from . metadata import Metadata
@ -41,11 +41,9 @@ graph_embeddings_response_queue = topic(
# Graph triples # Graph triples
class Triple(Record): class Triples(Record):
metadata = Metadata() metadata = Metadata()
s = Value() triples = Array(Triple())
p = Value()
o = Value()
triples_store_queue = topic('triples-store') triples_store_queue = topic('triples-store')

View file

@ -1,10 +1,16 @@
from pulsar.schema import Record, String from pulsar.schema import Record, String, Array
from . types import Triple
class Metadata(Record): class Metadata(Record):
source = String()
# Source identifier
id = String() id = String()
title = String()
# Subgraph
metadata = Array(Triple())
# Collection management
user = String() user = String()
collection = String() collection = String()

View file

@ -10,6 +10,11 @@ class Value(Record):
is_uri = Boolean() is_uri = Boolean()
type = String() type = String()
class Triple(Record):
s = Value()
p = Value()
o = Value()
class Field(Record): class Field(Record):
name = String() name = String()
# int, string, long, bool, float, double # int, string, long, bool, float, double

View file

@ -6,14 +6,20 @@ Loads a PDF document into TrustGraph processing.
import pulsar import pulsar
from pulsar.schema import JsonSchema from pulsar.schema import JsonSchema
from trustgraph.schema import Document, document_ingest_queue, Metadata
import base64 import base64
import hashlib import hashlib
import argparse import argparse
import os import os
import time import time
import uuid
from trustgraph.schema import Document, document_ingest_queue
from trustgraph.schema import Metadata
from trustgraph.log_level import LogLevel from trustgraph.log_level import LogLevel
from trustgraph.knowledge import hash, to_uri
from trustgraph.knowledge import PREF_PUBEV, PREF_DOC, PREF_ORG
from trustgraph.knowledge import Organization, PublicationEvent
from trustgraph.knowledge import DigitalDocument
default_user = 'trustgraph' default_user = 'trustgraph'
default_collection = 'default' default_collection = 'default'
@ -27,6 +33,7 @@ class Loader:
user, user,
collection, collection,
log_level, log_level,
metadata,
): ):
self.client = pulsar.Client( self.client = pulsar.Client(
@ -42,6 +49,7 @@ class Loader:
self.user = user self.user = user
self.collection = collection self.collection = collection
self.metadata = metadata
def load(self, files): def load(self, files):
@ -55,13 +63,23 @@ class Loader:
path = file path = file
data = open(path, "rb").read() data = open(path, "rb").read()
id = hashlib.sha256(path.encode("utf-8")).hexdigest()[0:8] # Create a SHA256 hash from the data
id = hash(data)
id = to_uri(PREF_DOC, id)
triples = []
def emit(t):
triples.append(t)
self.metadata.id = id
self.metadata.emit(emit)
r = Document( r = Document(
metadata=Metadata( metadata=Metadata(
source=path,
title=path,
id=id, id=id,
metadata=triples,
user=self.user, user=self.user,
collection=self.collection, collection=self.collection,
), ),
@ -112,6 +130,54 @@ def main():
help=f'Collection ID (default: {default_collection})' help=f'Collection ID (default: {default_collection})'
) )
parser.add_argument(
'--name', help=f'Document name'
)
parser.add_argument(
'--description', help=f'Document description'
)
parser.add_argument(
'--copyright-notice', help=f'Copyright notice'
)
parser.add_argument(
'--copyright-holder', help=f'Copyright holder'
)
parser.add_argument(
'--copyright-year', help=f'Copyright year'
)
parser.add_argument(
'--license', help=f'Copyright license'
)
parser.add_argument(
'--publication-organization', help=f'Publication organization'
)
parser.add_argument(
'--publication-description', help=f'Publication description'
)
parser.add_argument(
'--publication-date', help=f'Publication date'
)
parser.add_argument(
'--url', help=f'Document URL'
)
parser.add_argument(
'--keyword', nargs='+', help=f'Keyword'
)
parser.add_argument(
'--identifier', '--id', help=f'Document ID'
)
parser.add_argument( parser.add_argument(
'-l', '--log-level', '-l', '--log-level',
type=LogLevel, type=LogLevel,
@ -131,12 +197,38 @@ def main():
try: try:
document = DigitalDocument(
id,
name=args.name,
description=args.description,
copyright_notice=args.copyright_notice,
copyright_holder=args.copyright_holder,
copyright_year=args.copyright_year,
license=args.license,
identifier=args.identifier,
url=args.url,
keywords=args.keyword,
)
if args.publication_organization:
org = Organization(
id=to_uri(PREF_ORG, hash(args.publication_organization)),
name=args.publication_organization,
)
document.publication = PublicationEvent(
id = to_uri(PREF_PUBEV, str(uuid.uuid4())),
organization=org,
description=args.publication_description,
start_date=args.publication_date,
end_date=args.publication_date,
)
p = Loader( p = Loader(
pulsar_host=args.pulsar_host, pulsar_host=args.pulsar_host,
output_queue=args.output_queue, output_queue=args.output_queue,
user=args.user, user=args.user,
collection=args.collection, collection=args.collection,
log_level=args.log_level, log_level=args.log_level,
metadata=document,
) )
p.load(args.files) p.load(args.files)

View file

@ -6,14 +6,20 @@ Loads a text document into TrustGraph processing.
import pulsar import pulsar
from pulsar.schema import JsonSchema from pulsar.schema import JsonSchema
from trustgraph.schema import TextDocument, text_ingest_queue, Metadata
import base64 import base64
import hashlib import hashlib
import argparse import argparse
import os import os
import time import time
import uuid
from trustgraph.schema import TextDocument, text_ingest_queue
from trustgraph.schema import Metadata
from trustgraph.log_level import LogLevel from trustgraph.log_level import LogLevel
from trustgraph.knowledge import hash, to_uri
from trustgraph.knowledge import PREF_PUBEV, PREF_DOC, PREF_ORG
from trustgraph.knowledge import Organization, PublicationEvent
from trustgraph.knowledge import DigitalDocument
default_user = 'trustgraph' default_user = 'trustgraph'
default_collection = 'default' default_collection = 'default'
@ -27,6 +33,7 @@ class Loader:
user, user,
collection, collection,
log_level, log_level,
metadata,
): ):
self.client = pulsar.Client( self.client = pulsar.Client(
@ -42,6 +49,7 @@ class Loader:
self.user = user self.user = user
self.collection = collection self.collection = collection
self.metadata = metadata
def load(self, files): def load(self, files):
@ -55,13 +63,23 @@ class Loader:
path = file path = file
data = open(path, "rb").read() data = open(path, "rb").read()
id = hashlib.sha256(path.encode("utf-8")).hexdigest()[0:8] # Create a SHA256 hash from the data
id = hash(data)
id = to_uri(PREF_DOC, id)
triples = []
def emit(t):
triples.append(t)
self.metadata.id = id
self.metadata.emit(emit)
r = TextDocument( r = TextDocument(
metadata=Metadata( metadata=Metadata(
source=path,
title=path,
id=id, id=id,
metadata=triples,
user=self.user, user=self.user,
collection=self.collection, collection=self.collection,
), ),
@ -112,6 +130,54 @@ def main():
help=f'Collection ID (default: {default_collection})' help=f'Collection ID (default: {default_collection})'
) )
parser.add_argument(
'--name', help=f'Document name'
)
parser.add_argument(
'--description', help=f'Document description'
)
parser.add_argument(
'--copyright-notice', help=f'Copyright notice'
)
parser.add_argument(
'--copyright-holder', help=f'Copyright holder'
)
parser.add_argument(
'--copyright-year', help=f'Copyright year'
)
parser.add_argument(
'--license', help=f'Copyright license'
)
parser.add_argument(
'--publication-organization', help=f'Publication organization'
)
parser.add_argument(
'--publication-description', help=f'Publication description'
)
parser.add_argument(
'--publication-date', help=f'Publication date'
)
parser.add_argument(
'--url', help=f'Document URL'
)
parser.add_argument(
'--keyword', nargs='+', help=f'Keyword'
)
parser.add_argument(
'--identifier', '--id', help=f'Document ID'
)
parser.add_argument( parser.add_argument(
'-l', '--log-level', '-l', '--log-level',
type=LogLevel, type=LogLevel,
@ -131,12 +197,38 @@ def main():
try: try:
document = DigitalDocument(
id,
name=args.name,
description=args.description,
copyright_notice=args.copyright_notice,
copyright_holder=args.copyright_holder,
copyright_year=args.copyright_year,
license=args.license,
identifier=args.identifier,
url=args.url,
keywords=args.keyword,
)
if args.publication_organization:
org = Organization(
id=to_uri(PREF_ORG, hash(args.publication_organization)),
name=args.publication_organization,
)
document.publication = PublicationEvent(
id = to_uri(PREF_PUBEV, str(uuid.uuid4())),
organization=org,
description=args.publication_description,
start_date=args.publication_date,
end_date=args.publication_date,
)
p = Loader( p = Loader(
pulsar_host=args.pulsar_host, pulsar_host=args.pulsar_host,
output_queue=args.output_queue, output_queue=args.output_queue,
user=args.user, user=args.user,
collection=args.collection, collection=args.collection,
log_level=args.log_level, log_level=args.log_level,
metadata=document,
) )
p.load(args.files) p.load(args.files)

View file

@ -63,16 +63,8 @@ class Processor(ConsumerProducer):
for ix, chunk in enumerate(texts): for ix, chunk in enumerate(texts):
id = v.metadata.id + "-c" + str(ix)
r = Chunk( r = Chunk(
metadata=Metadata( metadata=v.metadata,
source=v.metadata.source,
id=id,
title=v.metadata.title,
user=v.metadata.user,
collection=v.metadata.collection,
),
chunk=chunk.page_content.encode("utf-8"), chunk=chunk.page_content.encode("utf-8"),
) )

View file

@ -62,16 +62,8 @@ class Processor(ConsumerProducer):
for ix, chunk in enumerate(texts): for ix, chunk in enumerate(texts):
id = v.metadata.id + "-c" + str(ix)
r = Chunk( r = Chunk(
metadata=Metadata( metadata=v.metadata,
source=v.metadata.source,
id=id,
title=v.metadata.title,
user=v.metadata.user,
collection=v.metadata.collection,
),
chunk=chunk.page_content.encode("utf-8"), chunk=chunk.page_content.encode("utf-8"),
) )

View file

@ -59,15 +59,8 @@ class Processor(ConsumerProducer):
for ix, page in enumerate(pages): for ix, page in enumerate(pages):
id = v.metadata.id + "-p" + str(ix)
r = TextDocument( r = TextDocument(
metadata=Metadata( metadata=v.metadata,
source=v.metadata.source,
title=v.metadata.title,
id=id,
user=v.metadata.user,
collection=v.metadata.collection,
),
text=page.page_content.encode("utf-8"), text=page.page_content.encode("utf-8"),
) )

View file

@ -7,16 +7,18 @@ get entity definitions which are output as graph edges.
import urllib.parse import urllib.parse
import json import json
from .... schema import ChunkEmbeddings, Triple, Metadata, Value from .... schema import ChunkEmbeddings, Triple, Triples, Metadata, Value
from .... schema import chunk_embeddings_ingest_queue, triples_store_queue from .... schema import chunk_embeddings_ingest_queue, triples_store_queue
from .... schema import prompt_request_queue from .... schema import prompt_request_queue
from .... schema import prompt_response_queue from .... schema import prompt_response_queue
from .... log_level import LogLevel from .... log_level import LogLevel
from .... clients.prompt_client import PromptClient from .... clients.prompt_client import PromptClient
from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
from .... base import ConsumerProducer from .... base import ConsumerProducer
DEFINITION_VALUE = Value(value=DEFINITION, is_uri=True) DEFINITION_VALUE = Value(value=DEFINITION, is_uri=True)
RDF_LABEL_VALUE = Value(value=RDF_LABEL, is_uri=True)
SUBJECT_OF_VALUE = Value(value=SUBJECT_OF, is_uri=True)
module = ".".join(__name__.split(".")[1:-1]) module = ".".join(__name__.split(".")[1:-1])
@ -44,7 +46,7 @@ class Processor(ConsumerProducer):
"output_queue": output_queue, "output_queue": output_queue,
"subscriber": subscriber, "subscriber": subscriber,
"input_schema": ChunkEmbeddings, "input_schema": ChunkEmbeddings,
"output_schema": Triple, "output_schema": Triples,
"prompt_request_queue": pr_request_queue, "prompt_request_queue": pr_request_queue,
"prompt_response_queue": pr_response_queue, "prompt_response_queue": pr_response_queue,
} }
@ -69,9 +71,12 @@ class Processor(ConsumerProducer):
return self.prompt.request_definitions(chunk) return self.prompt.request_definitions(chunk)
def emit_edge(self, metadata, s, p, o): def emit_edges(self, metadata, triples):
t = Triple(metadata=metadata, s=s, p=p, o=o) t = Triples(
metadata=metadata,
triples=triples,
)
self.producer.send(t) self.producer.send(t)
def handle(self, msg): def handle(self, msg):
@ -85,6 +90,13 @@ class Processor(ConsumerProducer):
defs = self.get_definitions(chunk) defs = self.get_definitions(chunk)
triples = []
# FIXME: Putting metadata into triples store is duplicated in
# relationships extractor too
for t in v.metadata.metadata:
triples.append(t)
for defn in defs: for defn in defs:
s = defn.name s = defn.name
@ -101,7 +113,31 @@ class Processor(ConsumerProducer):
s_value = Value(value=str(s_uri), is_uri=True) s_value = Value(value=str(s_uri), is_uri=True)
o_value = Value(value=str(o), is_uri=False) o_value = Value(value=str(o), is_uri=False)
self.emit_edge(v.metadata, s_value, DEFINITION_VALUE, o_value) triples.append(Triple(
s=s_value,
p=RDF_LABEL_VALUE,
o=Value(value=s, is_uri=False),
))
triples.append(Triple(
s=s_value, p=DEFINITION_VALUE, o=o_value
))
triples.append(Triple(
s=s_value,
p=SUBJECT_OF_VALUE,
o=Value(value=v.metadata.id, is_uri=True)
))
self.emit_edges(
Metadata(
id=v.metadata.id,
metadata=[],
user=v.metadata.user,
collection=v.metadata.collection,
),
triples
)
except Exception as e: except Exception as e:
print("Exception: ", e, flush=True) print("Exception: ", e, flush=True)

View file

@ -9,7 +9,7 @@ import urllib.parse
import os import os
from pulsar.schema import JsonSchema from pulsar.schema import JsonSchema
from .... schema import ChunkEmbeddings, Triple, GraphEmbeddings from .... schema import ChunkEmbeddings, Triple, Triples, GraphEmbeddings
from .... schema import Metadata, Value from .... schema import Metadata, Value
from .... schema import chunk_embeddings_ingest_queue, triples_store_queue from .... schema import chunk_embeddings_ingest_queue, triples_store_queue
from .... schema import graph_embeddings_store_queue from .... schema import graph_embeddings_store_queue
@ -17,10 +17,11 @@ from .... schema import prompt_request_queue
from .... schema import prompt_response_queue from .... schema import prompt_response_queue
from .... log_level import LogLevel from .... log_level import LogLevel
from .... clients.prompt_client import PromptClient from .... clients.prompt_client import PromptClient
from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF
from .... base import ConsumerProducer from .... base import ConsumerProducer
RDF_LABEL_VALUE = Value(value=RDF_LABEL, is_uri=True) RDF_LABEL_VALUE = Value(value=RDF_LABEL, is_uri=True)
SUBJECT_OF_VALUE = Value(value=SUBJECT_OF, is_uri=True)
module = ".".join(__name__.split(".")[1:-1]) module = ".".join(__name__.split(".")[1:-1])
@ -50,7 +51,7 @@ class Processor(ConsumerProducer):
"output_queue": output_queue, "output_queue": output_queue,
"subscriber": subscriber, "subscriber": subscriber,
"input_schema": ChunkEmbeddings, "input_schema": ChunkEmbeddings,
"output_schema": Triple, "output_schema": Triples,
"prompt_request_queue": pr_request_queue, "prompt_request_queue": pr_request_queue,
"prompt_response_queue": pr_response_queue, "prompt_response_queue": pr_response_queue,
} }
@ -69,7 +70,7 @@ class Processor(ConsumerProducer):
"prompt_response_queue": pr_response_queue, "prompt_response_queue": pr_response_queue,
"subscriber": subscriber, "subscriber": subscriber,
"input_schema": ChunkEmbeddings.__name__, "input_schema": ChunkEmbeddings.__name__,
"output_schema": Triple.__name__, "output_schema": Triples.__name__,
"vector_schema": GraphEmbeddings.__name__, "vector_schema": GraphEmbeddings.__name__,
}) })
@ -92,9 +93,12 @@ class Processor(ConsumerProducer):
return self.prompt.request_relationships(chunk) return self.prompt.request_relationships(chunk)
def emit_edge(self, metadata, s, p, o): def emit_edges(self, metadata, triples):
t = Triple(metadata=metadata, s=s, p=p, o=o) t = Triples(
metadata=metadata,
triples=triples,
)
self.producer.send(t) self.producer.send(t)
def emit_vec(self, metadata, ent, vec): def emit_vec(self, metadata, ent, vec):
@ -113,6 +117,13 @@ class Processor(ConsumerProducer):
rels = self.get_relationships(chunk) rels = self.get_relationships(chunk)
triples = []
# FIXME: Putting metadata into triples store is duplicated in
# the definitions extractor too
for t in v.metadata.metadata:
triples.append(t)
for rel in rels: for rel in rels:
s = rel.s s = rel.s
@ -139,43 +150,65 @@ class Processor(ConsumerProducer):
else: else:
o_value = Value(value=str(o), is_uri=False) o_value = Value(value=str(o), is_uri=False)
self.emit_edge( triples.append(Triple(
v.metadata, s=s_value,
s_value, p=p_value,
p_value, o=o_value
o_value ))
)
# Label for s # Label for s
self.emit_edge( triples.append(Triple(
v.metadata, s=s_value,
s_value, p=RDF_LABEL_VALUE,
RDF_LABEL_VALUE, o=Value(value=str(s), is_uri=False)
Value(value=str(s), is_uri=False) ))
)
# Label for p # Label for p
self.emit_edge( triples.append(Triple(
v.metadata, s=p_value,
p_value, p=RDF_LABEL_VALUE,
RDF_LABEL_VALUE, o=Value(value=str(p), is_uri=False)
Value(value=str(p), is_uri=False) ))
)
if rel.o_entity: if rel.o_entity:
# Label for o # Label for o
self.emit_edge( triples.append(Triple(
v.metadata, s=o_value,
o_value, p=RDF_LABEL_VALUE,
RDF_LABEL_VALUE, o=Value(value=str(o), is_uri=False)
Value(value=str(o), is_uri=False) ))
)
# 'Subject of' for s
triples.append(Triple(
s=s_value,
p=SUBJECT_OF_VALUE,
o=Value(value=v.metadata.id, is_uri=True)
))
if rel.o_entity:
# 'Subject of' for o
triples.append(Triple(
s=o_value,
p=SUBJECT_OF_VALUE,
o=Value(value=v.metadata.id, is_uri=True)
))
self.emit_vec(v.metadata, s_value, v.vectors) self.emit_vec(v.metadata, s_value, v.vectors)
self.emit_vec(v.metadata, p_value, v.vectors) self.emit_vec(v.metadata, p_value, v.vectors)
if rel.o_entity: if rel.o_entity:
self.emit_vec(v.metadata, o_value, v.vectors) self.emit_vec(v.metadata, o_value, v.vectors)
self.emit_edges(
Metadata(
id=v.metadata.id,
metadata=[],
user=v.metadata.user,
collection=v.metadata.collection,
),
triples
)
except Exception as e: except Exception as e:
print("Exception: ", e, flush=True) print("Exception: ", e, flush=True)

View file

@ -7,7 +7,7 @@ get entity definitions which are output as graph edges.
import urllib.parse import urllib.parse
import json import json
from .... schema import ChunkEmbeddings, Triple, Metadata, Value from .... schema import ChunkEmbeddings, Triple, Triples, Metadata, Value
from .... schema import chunk_embeddings_ingest_queue, triples_store_queue from .... schema import chunk_embeddings_ingest_queue, triples_store_queue
from .... schema import prompt_request_queue from .... schema import prompt_request_queue
from .... schema import prompt_response_queue from .... schema import prompt_response_queue
@ -44,7 +44,7 @@ class Processor(ConsumerProducer):
"output_queue": output_queue, "output_queue": output_queue,
"subscriber": subscriber, "subscriber": subscriber,
"input_schema": ChunkEmbeddings, "input_schema": ChunkEmbeddings,
"output_schema": Triple, "output_schema": Triples,
"prompt_request_queue": pr_request_queue, "prompt_request_queue": pr_request_queue,
"prompt_response_queue": pr_response_queue, "prompt_response_queue": pr_response_queue,
} }
@ -71,7 +71,10 @@ class Processor(ConsumerProducer):
def emit_edge(self, metadata, s, p, o): def emit_edge(self, metadata, s, p, o):
t = Triple(metadata=metadata, s=s, p=p, o=o) t = Triples(
metadata=metadata,
triples=[Triple(s=s, p=p, o=o)],
)
self.producer.send(t) self.producer.send(t)
def handle(self, msg): def handle(self, msg):

View file

@ -10,7 +10,7 @@ import argparse
import time import time
from .... direct.cassandra import TrustGraph from .... direct.cassandra import TrustGraph
from .... schema import Triple from .... schema import Triple, Triples
from .... schema import triples_store_queue from .... schema import triples_store_queue
from .... log_level import LogLevel from .... log_level import LogLevel
from .... base import Consumer from .... base import Consumer
@ -33,7 +33,7 @@ class Processor(Consumer):
**params | { **params | {
"input_queue": input_queue, "input_queue": input_queue,
"subscriber": subscriber, "subscriber": subscriber,
"input_schema": Triple, "input_schema": Triples,
"graph_host": graph_host, "graph_host": graph_host,
} }
) )
@ -63,11 +63,12 @@ class Processor(Consumer):
self.table = table self.table = table
self.tg.insert( for t in v.triples:
v.s.value, self.tg.insert(
v.p.value, t.s.value,
v.o.value t.p.value,
) t.o.value
)
@staticmethod @staticmethod
def add_args(parser): def add_args(parser):

View file

@ -116,14 +116,16 @@ class Processor(Consumer):
v = msg.value() v = msg.value()
self.create_node(v.s.value) for t in v.triples:
if v.o.is_uri: self.create_node(t.s.value)
self.create_node(v.o.value)
self.relate_node(v.s.value, v.p.value, v.o.value) if t.o.is_uri:
else: self.create_node(t.o.value)
self.create_literal(v.o.value) self.relate_node(t.s.value, t.p.value, t.o.value)
self.relate_literal(v.s.value, v.p.value, v.o.value) else:
self.create_literal(t.o.value)
self.relate_literal(t.s.value, t.p.value, t.o.value)
@staticmethod @staticmethod
def add_args(parser): def add_args(parser):