mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-26 17:06:22 +02:00
Feature: document metadata (#123)
* Rework metadata structure in processing messages to be a subgraph * Add subgraph creation for tg-load-pdf and tg-load-text based on command-line passing of doc attributes * Document metadata is added to knowledge graph with subjectOf linkage to extracted entities
This commit is contained in:
parent
b8818e28d0
commit
7954e863cc
21 changed files with 625 additions and 98 deletions
6
trustgraph-base/trustgraph/knowledge/__init__.py
Normal file
6
trustgraph-base/trustgraph/knowledge/__init__.py
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
|
||||||
|
from . identifier import *
|
||||||
|
from . publication import *
|
||||||
|
from . document import *
|
||||||
|
from . organization import *
|
||||||
|
|
||||||
25
trustgraph-base/trustgraph/knowledge/defs.py
Normal file
25
trustgraph-base/trustgraph/knowledge/defs.py
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
|
||||||
|
# Predicate / class URIs used when serialising document metadata as triples.
# Each name is defined exactly once (PUBLICATION and DATE_PUBLISHED were
# previously assigned twice with the same value).

# RDF / RDFS core vocabulary
IS_A = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
LABEL = 'http://www.w3.org/2000/01/rdf-schema#label'

# schema.org classes
DIGITAL_DOCUMENT = 'https://schema.org/DigitalDocument'
PUBLICATION_EVENT = 'https://schema.org/PublicationEvent'
ORGANIZATION = 'https://schema.org/Organization'

# schema.org properties
NAME = 'https://schema.org/name'
DESCRIPTION = 'https://schema.org/description'
COPYRIGHT_NOTICE = 'https://schema.org/copyrightNotice'
COPYRIGHT_HOLDER = 'https://schema.org/copyrightHolder'
COPYRIGHT_YEAR = 'https://schema.org/copyrightYear'
LICENSE = 'https://schema.org/license'
PUBLICATION = 'https://schema.org/publication'
START_DATE = 'https://schema.org/startDate'
END_DATE = 'https://schema.org/endDate'
PUBLISHED_BY = 'https://schema.org/publishedBy'
DATE_PUBLISHED = 'https://schema.org/datePublished'
URL = 'https://schema.org/url'
IDENTIFIER = 'https://schema.org/identifier'
KEYWORD = 'https://schema.org/keywords'
|
||||||
|
|
||||||
119
trustgraph-base/trustgraph/knowledge/document.py
Normal file
119
trustgraph-base/trustgraph/knowledge/document.py
Normal file
|
|
@ -0,0 +1,119 @@
|
||||||
|
|
||||||
|
from . defs import *
|
||||||
|
from .. schema import Triple, Value
|
||||||
|
|
||||||
|
class DigitalDocument:
    """A schema.org DigitalDocument that can serialise itself as triples.

    Attributes mirror the constructor arguments. Only attributes with a
    truthy value produce triples when `emit` is called.
    """

    def __init__(
            self, id, name=None, description=None, copyright_notice=None,
            copyright_holder=None, copyright_year=None, license=None,
            identifier=None,
            publication=None, url=None, keywords=None
    ):
        """Store the document attributes.

        Args:
            id: URI used as the subject of every emitted triple.
            name: Document name; emitted as both label and name.
            description: Free-text description.
            copyright_notice: Copyright notice text.
            copyright_holder: Copyright holder, emitted as a literal.
            copyright_year: Copyright year, emitted as a literal.
            license: License, emitted as a literal.
            identifier: Identifier value, emitted as a literal.
            publication: Optional object exposing `.id` and `.emit(emit)`;
                it is linked from the document and asked to emit itself.
            url: Document URL, emitted as a URI.
            keywords: Iterable of keyword strings. Defaults to a fresh
                empty list per instance.
        """
        self.id = id
        self.name = name
        self.description = description
        self.copyright_notice = copyright_notice
        self.copyright_holder = copyright_holder
        self.copyright_year = copyright_year
        self.license = license
        self.publication = publication
        self.url = url
        self.identifier = identifier
        # A literal [] default would be one list shared by every instance;
        # create a new list whenever the caller supplies nothing.
        self.keywords = [] if keywords is None else keywords

    def emit(self, emit):
        """Pass one Triple per populated attribute to the `emit` callable.

        Args:
            emit: Callable accepting a single Triple.
        """

        def add(predicate, obj, is_uri=False):
            # Every statement has this document as its subject. self.id is
            # read at call time, so an id assigned after construction is
            # honoured.
            emit(Triple(
                s=Value(value=self.id, is_uri=True),
                p=Value(value=predicate, is_uri=True),
                o=Value(value=obj, is_uri=is_uri)
            ))

        add(IS_A, DIGITAL_DOCUMENT, is_uri=True)

        if self.name:
            add(LABEL, self.name)
            add(NAME, self.name)

        if self.identifier:
            # The subject is self.id; the bare name `id` used here before
            # resolved to the builtin function, not the document URI.
            add(IDENTIFIER, self.identifier)

        if self.description:
            add(DESCRIPTION, self.description)

        if self.copyright_notice:
            add(COPYRIGHT_NOTICE, self.copyright_notice)

        if self.copyright_holder:
            add(COPYRIGHT_HOLDER, self.copyright_holder)

        if self.copyright_year:
            add(COPYRIGHT_YEAR, self.copyright_year)

        if self.license:
            add(LICENSE, self.license)

        if self.keywords:
            for k in self.keywords:
                add(KEYWORD, k)

        if self.publication:
            # Link to the publication node, then let it describe itself.
            add(PUBLICATION, self.publication.id, is_uri=True)
            self.publication.emit(emit)

        if self.url:
            add(URL, self.url, is_uri=True)
|
||||||
|
|
||||||
23
trustgraph-base/trustgraph/knowledge/identifier.py
Normal file
23
trustgraph-base/trustgraph/knowledge/identifier.py
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
def hash(data):
    """Return a deterministic UUID-formatted string derived from `data`.

    `data` may be a str (encoded as UTF-8 first) or anything
    hashlib.sha256 accepts directly. The same input always yields the
    same identifier.
    """

    payload = data.encode("utf-8") if isinstance(data, str) else data

    # SHA-256 gives 64 hex digits; a UUID needs 32, so keep every second
    # digit of the digest.
    digest = hashlib.sha256(payload).hexdigest()

    return str(uuid.UUID(digest[::2]))
|
||||||
|
|
||||||
|
def to_uri(pref, id):
    """Build a trustgraph.ai URI from a prefix segment and an identifier."""
    return "https://trustgraph.ai/{}/{}".format(pref, id)
|
||||||
|
|
||||||
|
# Prefix segments passed as the `pref` argument of to_uri(), one per kind
# of node.
PREF_DOC = "doc"
PREF_ORG = "org"
PREF_PUBEV = "pubev"
|
||||||
40
trustgraph-base/trustgraph/knowledge/organization.py
Normal file
40
trustgraph-base/trustgraph/knowledge/organization.py
Normal file
|
|
@ -0,0 +1,40 @@
|
||||||
|
|
||||||
|
from . defs import *
|
||||||
|
from .. schema import Triple, Value
|
||||||
|
|
||||||
|
class Organization:
    """A schema.org Organization that can serialise itself as triples."""

    def __init__(self, id, name=None, description=None):
        """Remember the organization URI and its optional name/description."""
        self.id, self.name, self.description = id, name, description

    def emit(self, emit):
        """Hand one Triple per populated attribute to the `emit` callable.

        The type statement is always produced; name (as label and as
        name) and description are produced only when truthy.
        """

        def state(predicate, target, target_is_uri):
            # Each statement is about this organization.
            subject = Value(value=self.id, is_uri=True)
            verb = Value(value=predicate, is_uri=True)
            obj = Value(value=target, is_uri=target_is_uri)
            emit(Triple(s=subject, p=verb, o=obj))

        state(IS_A, ORGANIZATION, True)

        if self.name:
            for predicate in (LABEL, NAME):
                state(predicate, self.name, False)

        if self.description:
            state(DESCRIPTION, self.description, False)
|
||||||
|
|
||||||
70
trustgraph-base/trustgraph/knowledge/publication.py
Normal file
70
trustgraph-base/trustgraph/knowledge/publication.py
Normal file
|
|
@ -0,0 +1,70 @@
|
||||||
|
|
||||||
|
from . defs import *
|
||||||
|
from .. schema import Triple, Value
|
||||||
|
|
||||||
|
class PublicationEvent:
    """A schema.org PublicationEvent that can serialise itself as triples."""

    def __init__(
            self, id, organization=None, name=None, description=None,
            start_date=None, end_date=None,
    ):
        """Record the event URI and its optional attributes.

        `organization`, when given, must expose `.id` and `.emit(emit)`.
        """
        self.id = id
        self.organization = organization
        self.name = name
        self.description = description
        self.start_date = start_date
        self.end_date = end_date

    def emit(self, emit):
        """Hand one Triple per populated attribute to the `emit` callable.

        The type statement always comes first. A truthy organization is
        linked from the event and then asked to emit its own triples.
        """

        def state(predicate, target, target_is_uri=False):
            # Each statement is about this event.
            emit(Triple(
                s=Value(value=self.id, is_uri=True),
                p=Value(value=predicate, is_uri=True),
                o=Value(value=target, is_uri=target_is_uri),
            ))

        state(IS_A, PUBLICATION_EVENT, True)

        if self.name:
            state(LABEL, self.name)
            state(NAME, self.name)

        if self.description:
            state(DESCRIPTION, self.description)

        publisher = self.organization
        if publisher:
            state(PUBLISHED_BY, publisher.id, True)
            publisher.emit(emit)

        if self.start_date:
            state(START_DATE, self.start_date)

        if self.end_date:
            state(END_DATE, self.end_date)
|
||||||
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
|
|
||||||
RDF_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
|
RDF_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
|
||||||
DEFINITION = "http://www.w3.org/2004/02/skos/core#definition"
|
DEFINITION = "http://www.w3.org/2004/02/skos/core#definition"
|
||||||
|
SUBJECT_OF = "https://schema.org/subjectOf"
|
||||||
|
|
||||||
TRUSTGRAPH_ENTITIES = "http://trustgraph.ai/e/"
|
TRUSTGRAPH_ENTITIES = "http://trustgraph.ai/e/"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -9,5 +9,3 @@ from . graph import *
|
||||||
from . retrieval import *
|
from . retrieval import *
|
||||||
from . metadata import *
|
from . metadata import *
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
|
|
||||||
from pulsar.schema import Record, Bytes, String, Boolean, Integer, Array, Double
|
from pulsar.schema import Record, Bytes, String, Boolean, Integer, Array, Double
|
||||||
|
|
||||||
from . types import Error, Value
|
from . types import Error, Value, Triple
|
||||||
from . topic import topic
|
from . topic import topic
|
||||||
from . metadata import Metadata
|
from . metadata import Metadata
|
||||||
|
|
||||||
|
|
@ -41,11 +41,9 @@ graph_embeddings_response_queue = topic(
|
||||||
|
|
||||||
# Graph triples
|
# Graph triples
|
||||||
|
|
||||||
class Triple(Record):
|
class Triples(Record):
|
||||||
metadata = Metadata()
|
metadata = Metadata()
|
||||||
s = Value()
|
triples = Array(Triple())
|
||||||
p = Value()
|
|
||||||
o = Value()
|
|
||||||
|
|
||||||
triples_store_queue = topic('triples-store')
|
triples_store_queue = topic('triples-store')
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,16 @@
|
||||||
|
|
||||||
from pulsar.schema import Record, String
|
from pulsar.schema import Record, String, Array
|
||||||
|
from . types import Triple
|
||||||
|
|
||||||
class Metadata(Record):
|
class Metadata(Record):
|
||||||
source = String()
|
|
||||||
|
# Source identifier
|
||||||
id = String()
|
id = String()
|
||||||
title = String()
|
|
||||||
|
# Subgraph
|
||||||
|
metadata = Array(Triple())
|
||||||
|
|
||||||
|
# Collection management
|
||||||
user = String()
|
user = String()
|
||||||
collection = String()
|
collection = String()
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,11 @@ class Value(Record):
|
||||||
is_uri = Boolean()
|
is_uri = Boolean()
|
||||||
type = String()
|
type = String()
|
||||||
|
|
||||||
|
class Triple(Record):
|
||||||
|
s = Value()
|
||||||
|
p = Value()
|
||||||
|
o = Value()
|
||||||
|
|
||||||
class Field(Record):
|
class Field(Record):
|
||||||
name = String()
|
name = String()
|
||||||
# int, string, long, bool, float, double
|
# int, string, long, bool, float, double
|
||||||
|
|
|
||||||
|
|
@ -6,14 +6,20 @@ Loads a PDF document into TrustGraph processing.
|
||||||
|
|
||||||
import pulsar
|
import pulsar
|
||||||
from pulsar.schema import JsonSchema
|
from pulsar.schema import JsonSchema
|
||||||
from trustgraph.schema import Document, document_ingest_queue, Metadata
|
|
||||||
import base64
|
import base64
|
||||||
import hashlib
|
import hashlib
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
from trustgraph.schema import Document, document_ingest_queue
|
||||||
|
from trustgraph.schema import Metadata
|
||||||
from trustgraph.log_level import LogLevel
|
from trustgraph.log_level import LogLevel
|
||||||
|
from trustgraph.knowledge import hash, to_uri
|
||||||
|
from trustgraph.knowledge import PREF_PUBEV, PREF_DOC, PREF_ORG
|
||||||
|
from trustgraph.knowledge import Organization, PublicationEvent
|
||||||
|
from trustgraph.knowledge import DigitalDocument
|
||||||
|
|
||||||
default_user = 'trustgraph'
|
default_user = 'trustgraph'
|
||||||
default_collection = 'default'
|
default_collection = 'default'
|
||||||
|
|
@ -27,6 +33,7 @@ class Loader:
|
||||||
user,
|
user,
|
||||||
collection,
|
collection,
|
||||||
log_level,
|
log_level,
|
||||||
|
metadata,
|
||||||
):
|
):
|
||||||
|
|
||||||
self.client = pulsar.Client(
|
self.client = pulsar.Client(
|
||||||
|
|
@ -42,6 +49,7 @@ class Loader:
|
||||||
|
|
||||||
self.user = user
|
self.user = user
|
||||||
self.collection = collection
|
self.collection = collection
|
||||||
|
self.metadata = metadata
|
||||||
|
|
||||||
def load(self, files):
|
def load(self, files):
|
||||||
|
|
||||||
|
|
@ -55,13 +63,23 @@ class Loader:
|
||||||
path = file
|
path = file
|
||||||
data = open(path, "rb").read()
|
data = open(path, "rb").read()
|
||||||
|
|
||||||
id = hashlib.sha256(path.encode("utf-8")).hexdigest()[0:8]
|
# Create a SHA256 hash from the data
|
||||||
|
id = hash(data)
|
||||||
|
|
||||||
|
id = to_uri(PREF_DOC, id)
|
||||||
|
|
||||||
|
triples = []
|
||||||
|
|
||||||
|
def emit(t):
|
||||||
|
triples.append(t)
|
||||||
|
|
||||||
|
self.metadata.id = id
|
||||||
|
self.metadata.emit(emit)
|
||||||
|
|
||||||
r = Document(
|
r = Document(
|
||||||
metadata=Metadata(
|
metadata=Metadata(
|
||||||
source=path,
|
|
||||||
title=path,
|
|
||||||
id=id,
|
id=id,
|
||||||
|
metadata=triples,
|
||||||
user=self.user,
|
user=self.user,
|
||||||
collection=self.collection,
|
collection=self.collection,
|
||||||
),
|
),
|
||||||
|
|
@ -112,6 +130,54 @@ def main():
|
||||||
help=f'Collection ID (default: {default_collection})'
|
help=f'Collection ID (default: {default_collection})'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--name', help=f'Document name'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--description', help=f'Document description'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--copyright-notice', help=f'Copyright notice'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--copyright-holder', help=f'Copyright holder'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--copyright-year', help=f'Copyright year'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--license', help=f'Copyright license'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--publication-organization', help=f'Publication organization'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--publication-description', help=f'Publication description'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--publication-date', help=f'Publication date'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--url', help=f'Document URL'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--keyword', nargs='+', help=f'Keyword'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--identifier', '--id', help=f'Document ID'
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-l', '--log-level',
|
'-l', '--log-level',
|
||||||
type=LogLevel,
|
type=LogLevel,
|
||||||
|
|
@ -131,12 +197,38 @@ def main():
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
||||||
|
document = DigitalDocument(
|
||||||
|
id,
|
||||||
|
name=args.name,
|
||||||
|
description=args.description,
|
||||||
|
copyright_notice=args.copyright_notice,
|
||||||
|
copyright_holder=args.copyright_holder,
|
||||||
|
copyright_year=args.copyright_year,
|
||||||
|
license=args.license,
|
||||||
|
url=args.url,
|
||||||
|
keywords=args.keyword,
|
||||||
|
)
|
||||||
|
|
||||||
|
if args.publication_organization:
|
||||||
|
org = Organization(
|
||||||
|
id=to_uri(PREF_ORG, hash(args.publication_organization)),
|
||||||
|
name=args.publication_organization,
|
||||||
|
)
|
||||||
|
document.publication = PublicationEvent(
|
||||||
|
id = to_uri(PREF_PUBEV, str(uuid.uuid4())),
|
||||||
|
organization=org,
|
||||||
|
description=args.publication_description,
|
||||||
|
start_date=args.publication_date,
|
||||||
|
end_date=args.publication_date,
|
||||||
|
)
|
||||||
|
|
||||||
p = Loader(
|
p = Loader(
|
||||||
pulsar_host=args.pulsar_host,
|
pulsar_host=args.pulsar_host,
|
||||||
output_queue=args.output_queue,
|
output_queue=args.output_queue,
|
||||||
user=args.user,
|
user=args.user,
|
||||||
collection=args.collection,
|
collection=args.collection,
|
||||||
log_level=args.log_level,
|
log_level=args.log_level,
|
||||||
|
metadata=document,
|
||||||
)
|
)
|
||||||
|
|
||||||
p.load(args.files)
|
p.load(args.files)
|
||||||
|
|
|
||||||
|
|
@ -6,14 +6,20 @@ Loads a text document into TrustGraph processing.
|
||||||
|
|
||||||
import pulsar
|
import pulsar
|
||||||
from pulsar.schema import JsonSchema
|
from pulsar.schema import JsonSchema
|
||||||
from trustgraph.schema import TextDocument, text_ingest_queue, Metadata
|
|
||||||
import base64
|
import base64
|
||||||
import hashlib
|
import hashlib
|
||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
from trustgraph.schema import TextDocument, text_ingest_queue
|
||||||
|
from trustgraph.schema import Metadata
|
||||||
from trustgraph.log_level import LogLevel
|
from trustgraph.log_level import LogLevel
|
||||||
|
from trustgraph.knowledge import hash, to_uri
|
||||||
|
from trustgraph.knowledge import PREF_PUBEV, PREF_DOC, PREF_ORG
|
||||||
|
from trustgraph.knowledge import Organization, PublicationEvent
|
||||||
|
from trustgraph.knowledge import DigitalDocument
|
||||||
|
|
||||||
default_user = 'trustgraph'
|
default_user = 'trustgraph'
|
||||||
default_collection = 'default'
|
default_collection = 'default'
|
||||||
|
|
@ -27,6 +33,7 @@ class Loader:
|
||||||
user,
|
user,
|
||||||
collection,
|
collection,
|
||||||
log_level,
|
log_level,
|
||||||
|
metadata,
|
||||||
):
|
):
|
||||||
|
|
||||||
self.client = pulsar.Client(
|
self.client = pulsar.Client(
|
||||||
|
|
@ -42,6 +49,7 @@ class Loader:
|
||||||
|
|
||||||
self.user = user
|
self.user = user
|
||||||
self.collection = collection
|
self.collection = collection
|
||||||
|
self.metadata = metadata
|
||||||
|
|
||||||
def load(self, files):
|
def load(self, files):
|
||||||
|
|
||||||
|
|
@ -55,13 +63,23 @@ class Loader:
|
||||||
path = file
|
path = file
|
||||||
data = open(path, "rb").read()
|
data = open(path, "rb").read()
|
||||||
|
|
||||||
id = hashlib.sha256(path.encode("utf-8")).hexdigest()[0:8]
|
# Create a SHA256 hash from the data
|
||||||
|
id = hash(data)
|
||||||
|
|
||||||
|
id = to_uri(PREF_DOC, id)
|
||||||
|
|
||||||
|
triples = []
|
||||||
|
|
||||||
|
def emit(t):
|
||||||
|
triples.append(t)
|
||||||
|
|
||||||
|
self.metadata.id = id
|
||||||
|
self.metadata.emit(emit)
|
||||||
|
|
||||||
r = TextDocument(
|
r = TextDocument(
|
||||||
metadata=Metadata(
|
metadata=Metadata(
|
||||||
source=path,
|
|
||||||
title=path,
|
|
||||||
id=id,
|
id=id,
|
||||||
|
metadata=triples,
|
||||||
user=self.user,
|
user=self.user,
|
||||||
collection=self.collection,
|
collection=self.collection,
|
||||||
),
|
),
|
||||||
|
|
@ -112,6 +130,54 @@ def main():
|
||||||
help=f'Collection ID (default: {default_collection})'
|
help=f'Collection ID (default: {default_collection})'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--name', help=f'Document name'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--description', help=f'Document description'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--copyright-notice', help=f'Copyright notice'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--copyright-holder', help=f'Copyright holder'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--copyright-year', help=f'Copyright year'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--license', help=f'Copyright license'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--publication-organization', help=f'Publication organization'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--publication-description', help=f'Publication description'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--publication-date', help=f'Publication date'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--url', help=f'Document URL'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--keyword', nargs='+', help=f'Keyword'
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--identifier', '--id', help=f'Document ID'
|
||||||
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'-l', '--log-level',
|
'-l', '--log-level',
|
||||||
type=LogLevel,
|
type=LogLevel,
|
||||||
|
|
@ -131,12 +197,38 @@ def main():
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
||||||
|
document = DigitalDocument(
|
||||||
|
id,
|
||||||
|
name=args.name,
|
||||||
|
description=args.description,
|
||||||
|
copyright_notice=args.copyright_notice,
|
||||||
|
copyright_holder=args.copyright_holder,
|
||||||
|
copyright_year=args.copyright_year,
|
||||||
|
license=args.license,
|
||||||
|
url=args.url,
|
||||||
|
keywords=args.keyword,
|
||||||
|
)
|
||||||
|
|
||||||
|
if args.publication_organization:
|
||||||
|
org = Organization(
|
||||||
|
id=to_uri(PREF_ORG, hash(args.publication_organization)),
|
||||||
|
name=args.publication_organization,
|
||||||
|
)
|
||||||
|
document.publication = PublicationEvent(
|
||||||
|
id = to_uri(PREF_PUBEV, str(uuid.uuid4())),
|
||||||
|
organization=org,
|
||||||
|
description=args.publication_description,
|
||||||
|
start_date=args.publication_date,
|
||||||
|
end_date=args.publication_date,
|
||||||
|
)
|
||||||
|
|
||||||
p = Loader(
|
p = Loader(
|
||||||
pulsar_host=args.pulsar_host,
|
pulsar_host=args.pulsar_host,
|
||||||
output_queue=args.output_queue,
|
output_queue=args.output_queue,
|
||||||
user=args.user,
|
user=args.user,
|
||||||
collection=args.collection,
|
collection=args.collection,
|
||||||
log_level=args.log_level,
|
log_level=args.log_level,
|
||||||
|
metadata=document,
|
||||||
)
|
)
|
||||||
|
|
||||||
p.load(args.files)
|
p.load(args.files)
|
||||||
|
|
|
||||||
|
|
@ -63,16 +63,8 @@ class Processor(ConsumerProducer):
|
||||||
|
|
||||||
for ix, chunk in enumerate(texts):
|
for ix, chunk in enumerate(texts):
|
||||||
|
|
||||||
id = v.metadata.id + "-c" + str(ix)
|
|
||||||
|
|
||||||
r = Chunk(
|
r = Chunk(
|
||||||
metadata=Metadata(
|
metadata=v.metadata,
|
||||||
source=v.metadata.source,
|
|
||||||
id=id,
|
|
||||||
title=v.metadata.title,
|
|
||||||
user=v.metadata.user,
|
|
||||||
collection=v.metadata.collection,
|
|
||||||
),
|
|
||||||
chunk=chunk.page_content.encode("utf-8"),
|
chunk=chunk.page_content.encode("utf-8"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -62,16 +62,8 @@ class Processor(ConsumerProducer):
|
||||||
|
|
||||||
for ix, chunk in enumerate(texts):
|
for ix, chunk in enumerate(texts):
|
||||||
|
|
||||||
id = v.metadata.id + "-c" + str(ix)
|
|
||||||
|
|
||||||
r = Chunk(
|
r = Chunk(
|
||||||
metadata=Metadata(
|
metadata=v.metadata,
|
||||||
source=v.metadata.source,
|
|
||||||
id=id,
|
|
||||||
title=v.metadata.title,
|
|
||||||
user=v.metadata.user,
|
|
||||||
collection=v.metadata.collection,
|
|
||||||
),
|
|
||||||
chunk=chunk.page_content.encode("utf-8"),
|
chunk=chunk.page_content.encode("utf-8"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -59,15 +59,8 @@ class Processor(ConsumerProducer):
|
||||||
|
|
||||||
for ix, page in enumerate(pages):
|
for ix, page in enumerate(pages):
|
||||||
|
|
||||||
id = v.metadata.id + "-p" + str(ix)
|
|
||||||
r = TextDocument(
|
r = TextDocument(
|
||||||
metadata=Metadata(
|
metadata=v.metadata,
|
||||||
source=v.metadata.source,
|
|
||||||
title=v.metadata.title,
|
|
||||||
id=id,
|
|
||||||
user=v.metadata.user,
|
|
||||||
collection=v.metadata.collection,
|
|
||||||
),
|
|
||||||
text=page.page_content.encode("utf-8"),
|
text=page.page_content.encode("utf-8"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,16 +7,18 @@ get entity definitions which are output as graph edges.
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from .... schema import ChunkEmbeddings, Triple, Metadata, Value
|
from .... schema import ChunkEmbeddings, Triple, Triples, Metadata, Value
|
||||||
from .... schema import chunk_embeddings_ingest_queue, triples_store_queue
|
from .... schema import chunk_embeddings_ingest_queue, triples_store_queue
|
||||||
from .... schema import prompt_request_queue
|
from .... schema import prompt_request_queue
|
||||||
from .... schema import prompt_response_queue
|
from .... schema import prompt_response_queue
|
||||||
from .... log_level import LogLevel
|
from .... log_level import LogLevel
|
||||||
from .... clients.prompt_client import PromptClient
|
from .... clients.prompt_client import PromptClient
|
||||||
from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION
|
from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
|
||||||
from .... base import ConsumerProducer
|
from .... base import ConsumerProducer
|
||||||
|
|
||||||
DEFINITION_VALUE = Value(value=DEFINITION, is_uri=True)
|
DEFINITION_VALUE = Value(value=DEFINITION, is_uri=True)
|
||||||
|
RDF_LABEL_VALUE = Value(value=RDF_LABEL, is_uri=True)
|
||||||
|
SUBJECT_OF_VALUE = Value(value=SUBJECT_OF, is_uri=True)
|
||||||
|
|
||||||
module = ".".join(__name__.split(".")[1:-1])
|
module = ".".join(__name__.split(".")[1:-1])
|
||||||
|
|
||||||
|
|
@ -44,7 +46,7 @@ class Processor(ConsumerProducer):
|
||||||
"output_queue": output_queue,
|
"output_queue": output_queue,
|
||||||
"subscriber": subscriber,
|
"subscriber": subscriber,
|
||||||
"input_schema": ChunkEmbeddings,
|
"input_schema": ChunkEmbeddings,
|
||||||
"output_schema": Triple,
|
"output_schema": Triples,
|
||||||
"prompt_request_queue": pr_request_queue,
|
"prompt_request_queue": pr_request_queue,
|
||||||
"prompt_response_queue": pr_response_queue,
|
"prompt_response_queue": pr_response_queue,
|
||||||
}
|
}
|
||||||
|
|
@ -69,9 +71,12 @@ class Processor(ConsumerProducer):
|
||||||
|
|
||||||
return self.prompt.request_definitions(chunk)
|
return self.prompt.request_definitions(chunk)
|
||||||
|
|
||||||
def emit_edge(self, metadata, s, p, o):
|
def emit_edges(self, metadata, triples):
|
||||||
|
|
||||||
t = Triple(metadata=metadata, s=s, p=p, o=o)
|
t = Triples(
|
||||||
|
metadata=metadata,
|
||||||
|
triples=triples,
|
||||||
|
)
|
||||||
self.producer.send(t)
|
self.producer.send(t)
|
||||||
|
|
||||||
def handle(self, msg):
|
def handle(self, msg):
|
||||||
|
|
@ -85,6 +90,13 @@ class Processor(ConsumerProducer):
|
||||||
|
|
||||||
defs = self.get_definitions(chunk)
|
defs = self.get_definitions(chunk)
|
||||||
|
|
||||||
|
triples = []
|
||||||
|
|
||||||
|
# FIXME: Putting metadata into triples store is duplicated in
|
||||||
|
# relationships extractor too
|
||||||
|
for t in v.metadata.metadata:
|
||||||
|
triples.append(t)
|
||||||
|
|
||||||
for defn in defs:
|
for defn in defs:
|
||||||
|
|
||||||
s = defn.name
|
s = defn.name
|
||||||
|
|
@ -101,7 +113,31 @@ class Processor(ConsumerProducer):
|
||||||
s_value = Value(value=str(s_uri), is_uri=True)
|
s_value = Value(value=str(s_uri), is_uri=True)
|
||||||
o_value = Value(value=str(o), is_uri=False)
|
o_value = Value(value=str(o), is_uri=False)
|
||||||
|
|
||||||
self.emit_edge(v.metadata, s_value, DEFINITION_VALUE, o_value)
|
triples.append(Triple(
|
||||||
|
s=s_value,
|
||||||
|
p=RDF_LABEL_VALUE,
|
||||||
|
o=Value(value=s, is_uri=False),
|
||||||
|
))
|
||||||
|
|
||||||
|
triples.append(Triple(
|
||||||
|
s=s_value, p=DEFINITION_VALUE, o=o_value
|
||||||
|
))
|
||||||
|
|
||||||
|
triples.append(Triple(
|
||||||
|
s=s_value,
|
||||||
|
p=SUBJECT_OF_VALUE,
|
||||||
|
o=Value(value=v.metadata.id, is_uri=True)
|
||||||
|
))
|
||||||
|
|
||||||
|
self.emit_edges(
|
||||||
|
Metadata(
|
||||||
|
id=v.metadata.id,
|
||||||
|
metadata=[],
|
||||||
|
user=v.metadata.user,
|
||||||
|
collection=v.metadata.collection,
|
||||||
|
),
|
||||||
|
triples
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Exception: ", e, flush=True)
|
print("Exception: ", e, flush=True)
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ import urllib.parse
|
||||||
import os
|
import os
|
||||||
from pulsar.schema import JsonSchema
|
from pulsar.schema import JsonSchema
|
||||||
|
|
||||||
from .... schema import ChunkEmbeddings, Triple, GraphEmbeddings
|
from .... schema import ChunkEmbeddings, Triple, Triples, GraphEmbeddings
|
||||||
from .... schema import Metadata, Value
|
from .... schema import Metadata, Value
|
||||||
from .... schema import chunk_embeddings_ingest_queue, triples_store_queue
|
from .... schema import chunk_embeddings_ingest_queue, triples_store_queue
|
||||||
from .... schema import graph_embeddings_store_queue
|
from .... schema import graph_embeddings_store_queue
|
||||||
|
|
@ -17,10 +17,11 @@ from .... schema import prompt_request_queue
|
||||||
from .... schema import prompt_response_queue
|
from .... schema import prompt_response_queue
|
||||||
from .... log_level import LogLevel
|
from .... log_level import LogLevel
|
||||||
from .... clients.prompt_client import PromptClient
|
from .... clients.prompt_client import PromptClient
|
||||||
from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES
|
from .... rdf import RDF_LABEL, TRUSTGRAPH_ENTITIES, SUBJECT_OF
|
||||||
from .... base import ConsumerProducer
|
from .... base import ConsumerProducer
|
||||||
|
|
||||||
RDF_LABEL_VALUE = Value(value=RDF_LABEL, is_uri=True)
|
RDF_LABEL_VALUE = Value(value=RDF_LABEL, is_uri=True)
|
||||||
|
SUBJECT_OF_VALUE = Value(value=SUBJECT_OF, is_uri=True)
|
||||||
|
|
||||||
module = ".".join(__name__.split(".")[1:-1])
|
module = ".".join(__name__.split(".")[1:-1])
|
||||||
|
|
||||||
|
|
@ -50,7 +51,7 @@ class Processor(ConsumerProducer):
|
||||||
"output_queue": output_queue,
|
"output_queue": output_queue,
|
||||||
"subscriber": subscriber,
|
"subscriber": subscriber,
|
||||||
"input_schema": ChunkEmbeddings,
|
"input_schema": ChunkEmbeddings,
|
||||||
"output_schema": Triple,
|
"output_schema": Triples,
|
||||||
"prompt_request_queue": pr_request_queue,
|
"prompt_request_queue": pr_request_queue,
|
||||||
"prompt_response_queue": pr_response_queue,
|
"prompt_response_queue": pr_response_queue,
|
||||||
}
|
}
|
||||||
|
|
@ -69,7 +70,7 @@ class Processor(ConsumerProducer):
|
||||||
"prompt_response_queue": pr_response_queue,
|
"prompt_response_queue": pr_response_queue,
|
||||||
"subscriber": subscriber,
|
"subscriber": subscriber,
|
||||||
"input_schema": ChunkEmbeddings.__name__,
|
"input_schema": ChunkEmbeddings.__name__,
|
||||||
"output_schema": Triple.__name__,
|
"output_schema": Triples.__name__,
|
||||||
"vector_schema": GraphEmbeddings.__name__,
|
"vector_schema": GraphEmbeddings.__name__,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
@ -92,9 +93,12 @@ class Processor(ConsumerProducer):
|
||||||
|
|
||||||
return self.prompt.request_relationships(chunk)
|
return self.prompt.request_relationships(chunk)
|
||||||
|
|
||||||
def emit_edge(self, metadata, s, p, o):
|
def emit_edges(self, metadata, triples):
|
||||||
|
|
||||||
t = Triple(metadata=metadata, s=s, p=p, o=o)
|
t = Triples(
|
||||||
|
metadata=metadata,
|
||||||
|
triples=triples,
|
||||||
|
)
|
||||||
self.producer.send(t)
|
self.producer.send(t)
|
||||||
|
|
||||||
def emit_vec(self, metadata, ent, vec):
|
def emit_vec(self, metadata, ent, vec):
|
||||||
|
|
@ -113,6 +117,13 @@ class Processor(ConsumerProducer):
|
||||||
|
|
||||||
rels = self.get_relationships(chunk)
|
rels = self.get_relationships(chunk)
|
||||||
|
|
||||||
|
triples = []
|
||||||
|
|
||||||
|
# FIXME: Putting metadata into triples store is duplicated in
|
||||||
|
# definitions extractor too
|
||||||
|
for t in v.metadata.metadata:
|
||||||
|
triples.append(t)
|
||||||
|
|
||||||
for rel in rels:
|
for rel in rels:
|
||||||
|
|
||||||
s = rel.s
|
s = rel.s
|
||||||
|
|
@ -139,43 +150,65 @@ class Processor(ConsumerProducer):
|
||||||
else:
|
else:
|
||||||
o_value = Value(value=str(o), is_uri=False)
|
o_value = Value(value=str(o), is_uri=False)
|
||||||
|
|
||||||
self.emit_edge(
|
triples.append(Triple(
|
||||||
v.metadata,
|
s=s_value,
|
||||||
s_value,
|
p=p_value,
|
||||||
p_value,
|
o=o_value
|
||||||
o_value
|
))
|
||||||
)
|
|
||||||
|
|
||||||
# Label for s
|
# Label for s
|
||||||
self.emit_edge(
|
triples.append(Triple(
|
||||||
v.metadata,
|
s=s_value,
|
||||||
s_value,
|
p=RDF_LABEL_VALUE,
|
||||||
RDF_LABEL_VALUE,
|
o=Value(value=str(s), is_uri=False)
|
||||||
Value(value=str(s), is_uri=False)
|
))
|
||||||
)
|
|
||||||
|
|
||||||
# Label for p
|
# Label for p
|
||||||
self.emit_edge(
|
triples.append(Triple(
|
||||||
v.metadata,
|
s=p_value,
|
||||||
p_value,
|
p=RDF_LABEL_VALUE,
|
||||||
RDF_LABEL_VALUE,
|
o=Value(value=str(p), is_uri=False)
|
||||||
Value(value=str(p), is_uri=False)
|
))
|
||||||
)
|
|
||||||
|
|
||||||
if rel.o_entity:
|
if rel.o_entity:
|
||||||
# Label for o
|
# Label for o
|
||||||
self.emit_edge(
|
triples.append(Triple(
|
||||||
v.metadata,
|
s=o_value,
|
||||||
o_value,
|
p=RDF_LABEL_VALUE,
|
||||||
RDF_LABEL_VALUE,
|
o=Value(value=str(o), is_uri=False)
|
||||||
Value(value=str(o), is_uri=False)
|
))
|
||||||
)
|
|
||||||
|
# 'Subject of' for s
|
||||||
|
triples.append(Triple(
|
||||||
|
s=s_value,
|
||||||
|
p=SUBJECT_OF_VALUE,
|
||||||
|
o=Value(value=v.metadata.id, is_uri=True)
|
||||||
|
))
|
||||||
|
|
||||||
|
if rel.o_entity:
    # 'Subject of' for o: link the object entity to the source document,
    # mirroring the 'Subject of' triple emitted for s.  The predicate
    # must be SUBJECT_OF_VALUE; RDF_LABEL_VALUE would record the
    # document URI as the entity's label instead.
    triples.append(Triple(
        s=o_value,
        p=SUBJECT_OF_VALUE,
        o=Value(value=v.metadata.id, is_uri=True)
    ))
|
||||||
|
|
||||||
self.emit_vec(v.metadata, s_value, v.vectors)
|
self.emit_vec(v.metadata, s_value, v.vectors)
|
||||||
self.emit_vec(v.metadata, p_value, v.vectors)
|
self.emit_vec(v.metadata, p_value, v.vectors)
|
||||||
|
|
||||||
if rel.o_entity:
|
if rel.o_entity:
|
||||||
self.emit_vec(v.metadata, o_value, v.vectors)
|
self.emit_vec(v.metadata, o_value, v.vectors)
|
||||||
|
|
||||||
|
self.emit_edges(
|
||||||
|
Metadata(
|
||||||
|
id=v.metadata.id,
|
||||||
|
metadata=[],
|
||||||
|
user=v.metadata.user,
|
||||||
|
collection=v.metadata.collection,
|
||||||
|
),
|
||||||
|
triples
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Exception: ", e, flush=True)
|
print("Exception: ", e, flush=True)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,7 @@ get entity definitions which are output as graph edges.
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from .... schema import ChunkEmbeddings, Triple, Metadata, Value
|
from .... schema import ChunkEmbeddings, Triple, Triples, Metadata, Value
|
||||||
from .... schema import chunk_embeddings_ingest_queue, triples_store_queue
|
from .... schema import chunk_embeddings_ingest_queue, triples_store_queue
|
||||||
from .... schema import prompt_request_queue
|
from .... schema import prompt_request_queue
|
||||||
from .... schema import prompt_response_queue
|
from .... schema import prompt_response_queue
|
||||||
|
|
@ -44,7 +44,7 @@ class Processor(ConsumerProducer):
|
||||||
"output_queue": output_queue,
|
"output_queue": output_queue,
|
||||||
"subscriber": subscriber,
|
"subscriber": subscriber,
|
||||||
"input_schema": ChunkEmbeddings,
|
"input_schema": ChunkEmbeddings,
|
||||||
"output_schema": Triple,
|
"output_schema": Triples,
|
||||||
"prompt_request_queue": pr_request_queue,
|
"prompt_request_queue": pr_request_queue,
|
||||||
"prompt_response_queue": pr_response_queue,
|
"prompt_response_queue": pr_response_queue,
|
||||||
}
|
}
|
||||||
|
|
@ -71,7 +71,10 @@ class Processor(ConsumerProducer):
|
||||||
|
|
||||||
def emit_edge(self, metadata, s, p, o):
|
def emit_edge(self, metadata, s, p, o):
|
||||||
|
|
||||||
t = Triple(metadata=metadata, s=s, p=p, o=o)
|
t = Triples(
|
||||||
|
metadata=metadata,
|
||||||
|
triples=[Triple(s=s, p=p, o=o)],
|
||||||
|
)
|
||||||
self.producer.send(t)
|
self.producer.send(t)
|
||||||
|
|
||||||
def handle(self, msg):
|
def handle(self, msg):
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ import argparse
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from .... direct.cassandra import TrustGraph
|
from .... direct.cassandra import TrustGraph
|
||||||
from .... schema import Triple
|
from .... schema import Triple, Triples
|
||||||
from .... schema import triples_store_queue
|
from .... schema import triples_store_queue
|
||||||
from .... log_level import LogLevel
|
from .... log_level import LogLevel
|
||||||
from .... base import Consumer
|
from .... base import Consumer
|
||||||
|
|
@ -33,7 +33,7 @@ class Processor(Consumer):
|
||||||
**params | {
|
**params | {
|
||||||
"input_queue": input_queue,
|
"input_queue": input_queue,
|
||||||
"subscriber": subscriber,
|
"subscriber": subscriber,
|
||||||
"input_schema": Triple,
|
"input_schema": Triples,
|
||||||
"graph_host": graph_host,
|
"graph_host": graph_host,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
@ -63,10 +63,11 @@ class Processor(Consumer):
|
||||||
|
|
||||||
self.table = table
|
self.table = table
|
||||||
|
|
||||||
|
for t in v.triples:
|
||||||
self.tg.insert(
|
self.tg.insert(
|
||||||
v.s.value,
|
t.s.value,
|
||||||
v.p.value,
|
t.p.value,
|
||||||
v.o.value
|
t.o.value
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
|
||||||
|
|
@ -116,14 +116,16 @@ class Processor(Consumer):
|
||||||
|
|
||||||
v = msg.value()
|
v = msg.value()
|
||||||
|
|
||||||
self.create_node(v.s.value)
|
for t in v.triples:
|
||||||
|
|
||||||
|
self.create_node(t.s.value)
|
||||||
|
|
||||||
if v.o.is_uri:
|
if v.o.is_uri:
|
||||||
self.create_node(v.o.value)
|
self.create_node(t.o.value)
|
||||||
self.relate_node(v.s.value, v.p.value, v.o.value)
|
self.relate_node(t.s.value, t.p.value, t.o.value)
|
||||||
else:
|
else:
|
||||||
self.create_literal(v.o.value)
|
self.create_literal(t.o.value)
|
||||||
self.relate_literal(v.s.value, v.p.value, v.o.value)
|
self.relate_literal(t.s.value, t.p.value, t.o.value)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def add_args(parser):
|
def add_args(parser):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue