Feature: document metadata (#123)

* Rework metadata structure in processing messages to be a subgraph
* Add subgraph creation to tg-load-pdf and tg-load-text, built from document attributes passed on the command line
* Document metadata is added to knowledge graph with subjectOf linkage to extracted entities
This commit is contained in:
cybermaggedon 2024-10-23 18:04:04 +01:00 committed by GitHub
parent b8818e28d0
commit 7954e863cc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 625 additions and 98 deletions

View file

@ -0,0 +1,6 @@
from . identifier import *
from . publication import *
from . document import *
from . organization import *

View file

@ -0,0 +1,25 @@
IS_A = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
LABEL = 'http://www.w3.org/2000/01/rdf-schema#label'
DIGITAL_DOCUMENT = 'https://schema.org/DigitalDocument'
PUBLICATION_EVENT = 'https://schema.org/PublicationEvent'
ORGANIZATION = 'https://schema.org/Organization'
NAME = 'https://schema.org/name'
DESCRIPTION = 'https://schema.org/description'
COPYRIGHT_NOTICE = 'https://schema.org/copyrightNotice'
COPYRIGHT_HOLDER = 'https://schema.org/copyrightHolder'
COPYRIGHT_YEAR = 'https://schema.org/copyrightYear'
LICENSE = 'https://schema.org/license'
PUBLICATION = 'https://schema.org/publication'
START_DATE = 'https://schema.org/startDate'
END_DATE = 'https://schema.org/endDate'
PUBLISHED_BY = 'https://schema.org/publishedBy'
DATE_PUBLISHED = 'https://schema.org/datePublished'
PUBLICATION = 'https://schema.org/publication'
DATE_PUBLISHED = 'https://schema.org/datePublished'
URL = 'https://schema.org/url'
IDENTIFIER = 'https://schema.org/identifier'
KEYWORD = 'https://schema.org/keywords'

View file

@ -0,0 +1,119 @@
from . defs import *
from .. schema import Triple, Value
class DigitalDocument:
def __init__(
self, id, name=None, description=None, copyright_notice=None,
copyright_holder=None, copyright_year=None, license=None,
identifier=None,
publication=None, url=None, keywords=[]
):
self.id = id
self.name = name
self.description = description
self.copyright_notice = copyright_notice
self.copyright_holder = copyright_holder
self.copyright_year = copyright_year
self.license = license
self.publication = publication
self.url = url
self.identifier = identifier
self.keywords = keywords
def emit(self, emit):
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=IS_A, is_uri=True),
o=Value(value=DIGITAL_DOCUMENT, is_uri=True)
))
if self.name:
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=LABEL, is_uri=True),
o=Value(value=self.name, is_uri=False)
))
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=NAME, is_uri=True),
o=Value(value=self.name, is_uri=False)
))
if self.identifier:
emit(Triple(
s=Value(value=id, is_uri=True),
p=Value(value=IDENTIFIER, is_uri=True),
o=Value(value=self.identifier, is_uri=False)
))
if self.description:
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=DESCRIPTION, is_uri=True),
o=Value(value=self.description, is_uri=False)
))
if self.copyright_notice:
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=COPYRIGHT_NOTICE, is_uri=True),
o=Value(value=self.copyright_notice, is_uri=False)
))
if self.copyright_holder:
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=COPYRIGHT_HOLDER, is_uri=True),
o=Value(value=self.copyright_holder, is_uri=False)
))
if self.copyright_year:
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=COPYRIGHT_YEAR, is_uri=True),
o=Value(value=self.copyright_year, is_uri=False)
))
if self.license:
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=LICENSE, is_uri=True),
o=Value(value=self.license, is_uri=False)
))
if self.keywords:
for k in self.keywords:
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=KEYWORD, is_uri=True),
o=Value(value=k, is_uri=False)
))
if self.publication:
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=PUBLICATION, is_uri=True),
o=Value(value=self.publication.id, is_uri=True)
))
self.publication.emit(emit)
if self.url:
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=URL, is_uri=True),
o=Value(value=self.url, is_uri=True)
))

View file

@ -0,0 +1,23 @@
import uuid
import hashlib
def hash(data):
if isinstance(data, str):
data = data.encode("utf-8")
# Create a SHA256 hash from the data
id = hashlib.sha256(data).hexdigest()
# Convert into a UUID, 64-byte hash becomes 32-byte UUID
id = str(uuid.UUID(id[::2]))
return id
def to_uri(pref, id):
return f"https://trustgraph.ai/{pref}/{id}"
PREF_PUBEV = "pubev"
PREF_ORG = "org"
PREF_DOC = "doc"

View file

@ -0,0 +1,40 @@
from . defs import *
from .. schema import Triple, Value
class Organization:
def __init__(self, id, name=None, description=None):
self.id = id
self.name = name
self.description = description
def emit(self, emit):
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=IS_A, is_uri=True),
o=Value(value=ORGANIZATION, is_uri=True)
))
if self.name:
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=LABEL, is_uri=True),
o=Value(value=self.name, is_uri=False)
))
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=NAME, is_uri=True),
o=Value(value=self.name, is_uri=False)
))
if self.description:
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=DESCRIPTION, is_uri=True),
o=Value(value=self.description, is_uri=False)
))

View file

@ -0,0 +1,70 @@
from . defs import *
from .. schema import Triple, Value
class PublicationEvent:
def __init__(
self, id, organization=None, name=None, description=None,
start_date=None, end_date=None,
):
self.id = id
self.organization = organization
self.name = name
self.description = description
self.start_date = start_date
self.end_date = end_date
def emit(self, emit):
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=IS_A, is_uri=True),
o=Value(value=PUBLICATION_EVENT, is_uri=True)))
if self.name:
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=LABEL, is_uri=True),
o=Value(value=self.name, is_uri=False)
))
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=NAME, is_uri=True),
o=Value(value=self.name, is_uri=False)
))
if self.description:
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=DESCRIPTION, is_uri=True),
o=Value(value=self.description, is_uri=False)
))
if self.organization:
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=PUBLISHED_BY, is_uri=True),
o=Value(value=self.organization.id, is_uri=True)
))
self.organization.emit(emit)
if self.start_date:
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=START_DATE, is_uri=True),
o=Value(value=self.start_date, is_uri=False)
))
if self.end_date:
emit(Triple(
s=Value(value=self.id, is_uri=True),
p=Value(value=END_DATE, is_uri=True),
o=Value(value=self.end_date, is_uri=False)))