mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 16:36:21 +02:00
Feature: document metadata (#123)
* Rework metadata structure in processing messages to be a subgraph
* Add subgraph creation for tg-load-pdf and tg-load-text based on command-line passing of doc attributes
* Document metadata is added to knowledge graph with subjectOf linkage to extracted entities
This commit is contained in:
parent
b8818e28d0
commit
7954e863cc
21 changed files with 625 additions and 98 deletions
6
trustgraph-base/trustgraph/knowledge/__init__.py
Normal file
6
trustgraph-base/trustgraph/knowledge/__init__.py
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
|
||||
from . identifier import *
|
||||
from . publication import *
|
||||
from . document import *
|
||||
from . organization import *
|
||||
|
||||
25
trustgraph-base/trustgraph/knowledge/defs.py
Normal file
25
trustgraph-base/trustgraph/knowledge/defs.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
|
||||
# Vocabulary URIs used when building knowledge-graph triples.
# Each name is defined exactly once; the values are plain strings.

# RDF / RDFS terms.
IS_A = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
LABEL = 'http://www.w3.org/2000/01/rdf-schema#label'

# schema.org type URIs.
DIGITAL_DOCUMENT = 'https://schema.org/DigitalDocument'
PUBLICATION_EVENT = 'https://schema.org/PublicationEvent'
ORGANIZATION = 'https://schema.org/Organization'

# schema.org property URIs.
NAME = 'https://schema.org/name'
DESCRIPTION = 'https://schema.org/description'
COPYRIGHT_NOTICE = 'https://schema.org/copyrightNotice'
COPYRIGHT_HOLDER = 'https://schema.org/copyrightHolder'
COPYRIGHT_YEAR = 'https://schema.org/copyrightYear'
LICENSE = 'https://schema.org/license'
PUBLICATION = 'https://schema.org/publication'
START_DATE = 'https://schema.org/startDate'
END_DATE = 'https://schema.org/endDate'
PUBLISHED_BY = 'https://schema.org/publishedBy'
DATE_PUBLISHED = 'https://schema.org/datePublished'
URL = 'https://schema.org/url'
IDENTIFIER = 'https://schema.org/identifier'
# Singular name, plural schema.org property.
KEYWORD = 'https://schema.org/keywords'
|
||||
|
||||
119
trustgraph-base/trustgraph/knowledge/document.py
Normal file
119
trustgraph-base/trustgraph/knowledge/document.py
Normal file
|
|
@ -0,0 +1,119 @@
|
|||
|
||||
from . defs import *
|
||||
from .. schema import Triple, Value
|
||||
|
||||
class DigitalDocument:
    """Document metadata that can be emitted as DIGITAL_DOCUMENT triples.

    Only ``id`` is required.  Any attribute that is falsy (``None``, empty
    string, empty list, ...) produces no triple when ``emit`` is called.
    """

    def __init__(
            self, id, name=None, description=None, copyright_notice=None,
            copyright_holder=None, copyright_year=None, license=None,
            identifier=None,
            publication=None, url=None, keywords=None
    ):
        """Store the document attributes.

        Args:
            id: URI of the document; the subject of every triple emitted
                for this document.
            name: Emitted under both the LABEL and NAME predicates.
            description: Emitted as a literal under DESCRIPTION.
            copyright_notice: Emitted as a literal under COPYRIGHT_NOTICE.
            copyright_holder: Emitted as a literal under COPYRIGHT_HOLDER.
            copyright_year: Emitted as a literal under COPYRIGHT_YEAR.
            license: Emitted as a literal under LICENSE.
            identifier: Emitted as a literal under IDENTIFIER.
            publication: Optional object with an ``id`` attribute and an
                ``emit(emit)`` method; linked via PUBLICATION and then
                asked to emit its own triples.
            url: Emitted as a URI (not a literal) under URL.
            keywords: Iterable of keyword literals, one KEYWORD triple
                each.  Defaults to a fresh empty list per instance.
        """
        self.id = id
        self.name = name
        self.description = description
        self.copyright_notice = copyright_notice
        self.copyright_holder = copyright_holder
        self.copyright_year = copyright_year
        self.license = license
        self.publication = publication
        self.url = url
        self.identifier = identifier
        # A literal ``[]`` default would be one list shared by every
        # instance created without keywords; build a new one instead.
        self.keywords = [] if keywords is None else keywords

    def _triple(self, predicate, obj, is_uri=False):
        """Build a Triple with this document as subject."""
        return Triple(
            s=Value(value=self.id, is_uri=True),
            p=Value(value=predicate, is_uri=True),
            o=Value(value=obj, is_uri=is_uri),
        )

    def emit(self, emit):
        """Pass one Triple per populated attribute to the ``emit`` callable.

        Args:
            emit: Callable accepting a single Triple.  It is also handed
                on to ``self.publication.emit`` when a publication is set.
        """
        emit(self._triple(IS_A, DIGITAL_DOCUMENT, is_uri=True))

        if self.name:
            emit(self._triple(LABEL, self.name))
            emit(self._triple(NAME, self.name))

        # Simple literal properties, in emission order.  The identifier
        # triple uses self.id as its subject like all the others (it
        # previously referenced the builtin ``id`` function by mistake).
        literal_properties = (
            (IDENTIFIER, self.identifier),
            (DESCRIPTION, self.description),
            (COPYRIGHT_NOTICE, self.copyright_notice),
            (COPYRIGHT_HOLDER, self.copyright_holder),
            (COPYRIGHT_YEAR, self.copyright_year),
            (LICENSE, self.license),
        )
        for predicate, value in literal_properties:
            if value:
                emit(self._triple(predicate, value))

        # ``or ()`` tolerates keywords having been reset to None.
        for keyword in self.keywords or ():
            emit(self._triple(KEYWORD, keyword))

        if self.publication:
            # Link to the publication, then let it describe itself.
            emit(self._triple(PUBLICATION, self.publication.id, is_uri=True))
            self.publication.emit(emit)

        if self.url:
            emit(self._triple(URL, self.url, is_uri=True))
|
||||
|
||||
23
trustgraph-base/trustgraph/knowledge/identifier.py
Normal file
23
trustgraph-base/trustgraph/knowledge/identifier.py
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
|
||||
import uuid
|
||||
import hashlib
|
||||
|
||||
def hash(data):
    """Return a deterministic UUID-formatted string derived from ``data``.

    Args:
        data: ``str`` (encoded as UTF-8 before hashing) or a bytes-like
            object accepted by ``hashlib.sha256``.

    Returns:
        The canonical 36-character string form of a UUID built from half
        of the SHA-256 hex digest of ``data``.

    Note: this function intentionally keeps its public name, which
    shadows the builtin ``hash`` wherever it is imported.
    """
    if isinstance(data, str):
        data = data.encode("utf-8")

    # SHA-256 digest rendered as 64 hexadecimal characters.
    digest = hashlib.sha256(data).hexdigest()

    # A UUID is 32 hex characters (128 bits): keep every second character
    # of the digest to halve it.
    return str(uuid.UUID(digest[::2]))
|
||||
|
||||
def to_uri(pref, id):
    """Build a trustgraph.ai URI of the form ``<base>/<pref>/<id>``.

    Both arguments are interpolated with default string formatting, so
    non-string values are accepted.
    """
    base = "https://trustgraph.ai"
    return f"{base}/{pref}/{id}"
|
||||
|
||||
# Short path-segment prefixes, one per kind of entity.
# NOTE(review): presumably intended as the ``pref`` argument of to_uri();
# confirm against callers.
PREF_PUBEV = "pubev"  # publication events
PREF_ORG = "org"      # organizations
PREF_DOC = "doc"      # documents
|
||||
40
trustgraph-base/trustgraph/knowledge/organization.py
Normal file
40
trustgraph-base/trustgraph/knowledge/organization.py
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
|
||||
from . defs import *
|
||||
from .. schema import Triple, Value
|
||||
|
||||
class Organization:
    """An organization that can describe itself as ORGANIZATION triples."""

    def __init__(self, id, name=None, description=None):
        """Record the organization URI and its optional name/description."""
        self.id = id
        self.name = name
        self.description = description

    def emit(self, emit):
        """Send one Triple per populated attribute to the ``emit`` callable.

        The type triple is always sent; ``name`` (under LABEL and NAME) and
        ``description`` are sent only when truthy.
        """

        def send(predicate, obj, obj_is_uri=False):
            # Every triple has this organization as its subject.
            emit(Triple(
                s=Value(value=self.id, is_uri=True),
                p=Value(value=predicate, is_uri=True),
                o=Value(value=obj, is_uri=obj_is_uri),
            ))

        send(IS_A, ORGANIZATION, True)

        if self.name:
            for predicate in (LABEL, NAME):
                send(predicate, self.name)

        if self.description:
            send(DESCRIPTION, self.description)
|
||||
|
||||
70
trustgraph-base/trustgraph/knowledge/publication.py
Normal file
70
trustgraph-base/trustgraph/knowledge/publication.py
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
|
||||
from . defs import *
|
||||
from .. schema import Triple, Value
|
||||
|
||||
class PublicationEvent:
    """A publication event that can describe itself as PUBLICATION_EVENT triples."""

    def __init__(
            self, id, organization=None, name=None, description=None,
            start_date=None, end_date=None,
    ):
        """Record the event URI and its optional attributes.

        ``organization``, when given, must expose an ``id`` attribute and an
        ``emit(emit)`` method; both are used by ``emit`` below.
        """
        self.id = id
        self.organization = organization
        self.name = name
        self.description = description
        self.start_date = start_date
        self.end_date = end_date

    def emit(self, emit):
        """Send one Triple per populated attribute to the ``emit`` callable.

        The type triple is always sent.  A set organization is linked via
        PUBLISHED_BY and then asked to emit its own triples through the same
        callable, before the start/end dates are sent.
        """

        def send(predicate, obj, obj_is_uri=False):
            # Every triple has this event as its subject.
            emit(Triple(
                s=Value(value=self.id, is_uri=True),
                p=Value(value=predicate, is_uri=True),
                o=Value(value=obj, is_uri=obj_is_uri),
            ))

        send(IS_A, PUBLICATION_EVENT, True)

        if self.name:
            for predicate in (LABEL, NAME):
                send(predicate, self.name)

        if self.description:
            send(DESCRIPTION, self.description)

        publisher = self.organization
        if publisher:
            send(PUBLISHED_BY, publisher.id, True)
            publisher.emit(emit)

        if self.start_date:
            send(START_DATE, self.start_date)

        if self.end_date:
            send(END_DATE, self.end_date)
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue