Feature: document metadata (#123)

* Rework metadata structure in processing messages to be a subgraph
* Add subgraph creation for tg-load-pdf and tg-load-text based on command-line passing of doc attributes
* Document metadata is added to knowledge graph with subjectOf linkage to extracted entities
This commit is contained in:
cybermaggedon 2024-10-23 18:04:04 +01:00 committed by GitHub
parent b8818e28d0
commit 7954e863cc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 625 additions and 98 deletions

View file

@ -0,0 +1,6 @@
from . identifier import *
from . publication import *
from . document import *
from . organization import *

View file

@ -0,0 +1,25 @@
# Vocabulary URIs used when emitting document-metadata triples.
# Each name is defined exactly once; the second assignments of PUBLICATION
# and DATE_PUBLISHED repeated the same values and have been dropped.

# RDF / RDFS terms
IS_A = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
LABEL = 'http://www.w3.org/2000/01/rdf-schema#label'

# schema.org classes
DIGITAL_DOCUMENT = 'https://schema.org/DigitalDocument'
PUBLICATION_EVENT = 'https://schema.org/PublicationEvent'
ORGANIZATION = 'https://schema.org/Organization'

# schema.org properties
NAME = 'https://schema.org/name'
DESCRIPTION = 'https://schema.org/description'
COPYRIGHT_NOTICE = 'https://schema.org/copyrightNotice'
COPYRIGHT_HOLDER = 'https://schema.org/copyrightHolder'
COPYRIGHT_YEAR = 'https://schema.org/copyrightYear'
LICENSE = 'https://schema.org/license'
PUBLICATION = 'https://schema.org/publication'
START_DATE = 'https://schema.org/startDate'
END_DATE = 'https://schema.org/endDate'
PUBLISHED_BY = 'https://schema.org/publishedBy'
DATE_PUBLISHED = 'https://schema.org/datePublished'
URL = 'https://schema.org/url'
IDENTIFIER = 'https://schema.org/identifier'
KEYWORD = 'https://schema.org/keywords'

View file

@ -0,0 +1,119 @@
from . defs import *
from .. schema import Triple, Value
class DigitalDocument:
    """Document metadata that can be emitted as a set of triples.

    The constructor stores its arguments as same-named attributes. ``id`` is
    used as the subject URI of every triple this object emits.
    ``publication``, when given, must expose an ``id`` attribute and an
    ``emit(emit)`` method, because ``emit`` uses both.
    """

    def __init__(
            self, id, name=None, description=None, copyright_notice=None,
            copyright_holder=None, copyright_year=None, license=None,
            identifier=None,
            publication=None, url=None, keywords=None
    ):
        self.id = id
        self.name = name
        self.description = description
        self.copyright_notice = copyright_notice
        self.copyright_holder = copyright_holder
        self.copyright_year = copyright_year
        self.license = license
        self.publication = publication
        self.url = url
        self.identifier = identifier
        # Fresh list per instance: a literal `[]` default would be one list
        # shared (and mutated) across every document created without keywords.
        self.keywords = [] if keywords is None else keywords

    def _triple(self, predicate, obj, is_uri=False):
        """Build a triple about this document (subject is always self.id)."""
        return Triple(
            s=Value(value=self.id, is_uri=True),
            p=Value(value=predicate, is_uri=True),
            o=Value(value=obj, is_uri=is_uri),
        )

    def emit(self, emit):
        """Pass one Triple per populated attribute to the ``emit`` callable.

        The type triple is always emitted; every other attribute is emitted
        only when truthy. A publication is linked by its id and then asked
        to emit its own triples through the same callable.
        """
        emit(self._triple(IS_A, DIGITAL_DOCUMENT, is_uri=True))

        if self.name:
            # The name is published both as the RDFS label and schema.org name
            emit(self._triple(LABEL, self.name))
            emit(self._triple(NAME, self.name))

        # Plain literal properties, in emission order. The identifier triple
        # previously used the builtin `id` as its subject instead of self.id.
        literal_properties = (
            (IDENTIFIER, self.identifier),
            (DESCRIPTION, self.description),
            (COPYRIGHT_NOTICE, self.copyright_notice),
            (COPYRIGHT_HOLDER, self.copyright_holder),
            (COPYRIGHT_YEAR, self.copyright_year),
            (LICENSE, self.license),
        )
        for predicate, value in literal_properties:
            if value:
                emit(self._triple(predicate, value))

        # `or ()` keeps an explicit keywords=None working as "no keywords"
        for keyword in self.keywords or ():
            emit(self._triple(KEYWORD, keyword))

        if self.publication:
            emit(self._triple(PUBLICATION, self.publication.id, is_uri=True))
            self.publication.emit(emit)

        if self.url:
            emit(self._triple(URL, self.url, is_uri=True))

View file

@ -0,0 +1,23 @@
import uuid
import hashlib
def hash(data):
    """Return a deterministic UUID-formatted string derived from *data*.

    *data* may be ``str`` (encoded as UTF-8 first) or a bytes-like object
    accepted by ``hashlib.sha256``.
    """
    payload = data.encode("utf-8") if isinstance(data, str) else data

    # SHA-256 as 64 hex digits
    digest_hex = hashlib.sha256(payload).hexdigest()

    # A UUID takes 32 hex digits, so keep every second digit of the digest
    return str(uuid.UUID(digest_hex[::2]))
def to_uri(pref, id):
    """Return the trustgraph.ai URI for identifier *id* under prefix *pref*."""
    uri = f"https://trustgraph.ai/{pref}/{id}"
    return uri
# Path prefixes intended for the first argument of to_uri(), one per kind of
# identifier.
PREF_DOC = "doc"
PREF_ORG = "org"
PREF_PUBEV = "pubev"

View file

@ -0,0 +1,40 @@
from . defs import *
from .. schema import Triple, Value
class Organization:
    """Organization metadata that can be emitted as a set of triples.

    ``id`` is used as the subject URI of every emitted triple; ``name`` and
    ``description`` are optional literals.
    """

    def __init__(self, id, name=None, description=None):
        self.id, self.name, self.description = id, name, description

    def emit(self, emit):
        """Pass the type triple, then one per truthy attribute, to ``emit``."""

        def about_self(predicate, obj, obj_is_uri=False):
            # Every statement has this organization as its subject
            return Triple(
                s=Value(value=self.id, is_uri=True),
                p=Value(value=predicate, is_uri=True),
                o=Value(value=obj, is_uri=obj_is_uri),
            )

        emit(about_self(IS_A, ORGANIZATION, obj_is_uri=True))

        if self.name:
            # Same literal under both the label and the name predicate
            for predicate in (LABEL, NAME):
                emit(about_self(predicate, self.name))

        if self.description:
            emit(about_self(DESCRIPTION, self.description))

View file

@ -0,0 +1,70 @@
from . defs import *
from .. schema import Triple, Value
class PublicationEvent:
    """Publication-event metadata that can be emitted as a set of triples.

    ``id`` is used as the subject URI of every emitted triple.
    ``organization``, when given, must expose an ``id`` attribute and an
    ``emit(emit)`` method, because ``emit`` uses both.
    """

    def __init__(
            self, id, organization=None, name=None, description=None,
            start_date=None, end_date=None,
    ):
        self.id, self.organization = id, organization
        self.name, self.description = name, description
        self.start_date, self.end_date = start_date, end_date

    def emit(self, emit):
        """Pass the type triple, then one per truthy attribute, to ``emit``.

        A linked organization is referenced by its id and then asked to emit
        its own triples through the same callable.
        """

        def statement(predicate, obj, obj_is_uri=False):
            # Every statement has this event as its subject
            return Triple(
                s=Value(value=self.id, is_uri=True),
                p=Value(value=predicate, is_uri=True),
                o=Value(value=obj, is_uri=obj_is_uri),
            )

        emit(statement(IS_A, PUBLICATION_EVENT, obj_is_uri=True))

        if self.name:
            # Same literal under both the label and the name predicate
            for predicate in (LABEL, NAME):
                emit(statement(predicate, self.name))

        if self.description:
            emit(statement(DESCRIPTION, self.description))

        if self.organization:
            emit(statement(
                PUBLISHED_BY, self.organization.id, obj_is_uri=True
            ))
            self.organization.emit(emit)

        # Dates are emitted as plain literals, start before end
        for predicate, when in (
                (START_DATE, self.start_date),
                (END_DATE, self.end_date),
        ):
            if when:
                emit(statement(predicate, when))

View file

@ -1,6 +1,7 @@
# Namespace prefix for entity URIs
TRUSTGRAPH_ENTITIES = "http://trustgraph.ai/e/"

# Predicate URIs: RDFS label, SKOS definition, schema.org subjectOf
RDF_LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
DEFINITION = "http://www.w3.org/2004/02/skos/core#definition"
SUBJECT_OF = "https://schema.org/subjectOf"

View file

@ -9,5 +9,3 @@ from . graph import *
from . retrieval import *
from . metadata import *

View file

@ -1,7 +1,7 @@
from pulsar.schema import Record, Bytes, String, Boolean, Integer, Array, Double
from . types import Error, Value
from . types import Error, Value, Triple
from . topic import topic
from . metadata import Metadata
@ -41,11 +41,9 @@ graph_embeddings_response_queue = topic(
# Graph triples
# NOTE(review): this span is a rendered diff hunk (-41,11 +41,9) whose +/-
# markers were stripped, so removed and added lines appear interleaved.
# Presumably `class Triple` with the s/p/o fields is the old definition and
# `class Triples` with the `triples` array is the new one - confirm against
# the repository before editing.
class Triple(Record):
class Triples(Record):
# Metadata record accompanying the triples
metadata = Metadata()
s = Value()
p = Value()
o = Value()
# Array of Triple records
triples = Array(Triple())
# Topic built by topic() from the name 'triples-store'; presumably the queue
# the triple store consumes from - confirm against the consumers.
triples_store_queue = topic('triples-store')

View file

@ -1,10 +1,16 @@
from pulsar.schema import Record, String
from pulsar.schema import Record, String, Array
from . types import Triple
# NOTE(review): this span is a rendered diff hunk (-1,10 +1,16) whose +/-
# markers were stripped, so fields removed by the commit may still be shown
# among the fields below - confirm the final field list against the
# repository. Field order is left exactly as shown (presumably it shapes the
# serialized record schema - TODO confirm).
class Metadata(Record):
source = String()
# Source identifier
id = String()
title = String()
# Subgraph: metadata carried as an array of Triple records
metadata = Array(Triple())
# Collection management
user = String()
collection = String()

View file

@ -10,6 +10,11 @@ class Value(Record):
is_uri = Boolean()
type = String()
# A single statement made of three Value fields: s, p and o (used elsewhere in
# this change as subject, predicate and object). Field order is left as
# declared (presumably it shapes the serialized record schema - TODO confirm).
class Triple(Record):
s = Value()
p = Value()
o = Value()
class Field(Record):
name = String()
# int, string, long, bool, float, double