trustgraph/trustgraph-cli/scripts/tg-load-text

#!/usr/bin/env python3

"""
Loads a text document into TrustGraph processing.
"""

import pulsar
from pulsar.schema import JsonSchema
import hashlib
import argparse
import os
import time
import uuid

from trustgraph.api import Api
from trustgraph.knowledge import hash, to_uri
from trustgraph.knowledge import PREF_PUBEV, PREF_DOC, PREF_ORG
from trustgraph.knowledge import Organization, PublicationEvent
from trustgraph.knowledge import DigitalDocument

# API URL used when --url is not given; the TRUSTGRAPH_URL environment
# variable takes precedence over the built-in localhost address.
default_url = os.environ.get("TRUSTGRAPH_URL", "http://localhost:8088/")

# Fallback values for the --user and --collection options.
default_user = "trustgraph"
default_collection = "default"

class Loader:
    """Uploads text files through the TrustGraph API, one document per file.

    A single metadata object is shared across all files; before each upload
    its ``id`` attribute is overwritten with an ID derived from that file's
    content.
    """

    def __init__(
            self,
            url,
            user,
            collection,
            metadata,
    ):
        """Create a loader.

        Args:
            url: API URL handed to ``Api``.
            user: User ID. Stored on the instance; not currently forwarded
                to the ``load_text`` call.
            collection: Collection ID. Stored on the instance; not currently
                forwarded to the ``load_text`` call.
            metadata: Document metadata object; its ``id`` attribute is
                reassigned for every file loaded.
        """

        self.api = Api(url)

        self.user = user
        self.collection = collection
        self.metadata = metadata

    def load(self, files):
        """Load each path in ``files`` in order, via ``load_file``."""

        for file in files:
            self.load_file(file)

    def load_file(self, file):
        """Read one file and submit its content to the API.

        Failures (unreadable file, API error, ...) are reported on stdout
        and swallowed so that the remaining files are still attempted.

        Args:
            file: Path of the file to load.
        """

        try:

            # Context manager so the handle is closed even if read() fails.
            with open(file, "rb") as f:
                data = f.read()

            # Derive the document ID from the content.  `hash` here is the
            # helper imported from trustgraph.knowledge (not the builtin).
            doc_id = to_uri(PREF_DOC, hash(data))

            self.metadata.id = doc_id

            # NOTE: user/collection are intentionally not passed here, as in
            # the original call.
            self.api.load_text(
                text=data, id=doc_id, metadata=self.metadata,
            )

            print(f"{file}: Loaded successfully.")

        except Exception as e:
            # Best-effort per file: report and carry on with the next one.
            print(f"{file}: Failed: {str(e)}", flush=True)

def _build_parser():
    """Create the argument parser for the tg-load-text command line."""

    parser = argparse.ArgumentParser(
        prog='tg-load-text',
        description=__doc__,
    )

    parser.add_argument(
        '-u', '--url',
        default=default_url,
        help=f'API URL (default: {default_url})',
    )

    parser.add_argument(
        '-U', '--user',
        default=default_user,
        help=f'User ID (default: {default_user})'
    )

    parser.add_argument(
        '-C', '--collection',
        default=default_collection,
        help=f'Collection ID (default: {default_collection})'
    )

    parser.add_argument(
        '--name', help='Document name'
    )

    parser.add_argument(
        '--description', help='Document description'
    )

    parser.add_argument(
        '--copyright-notice', help='Copyright notice'
    )

    parser.add_argument(
        '--copyright-holder', help='Copyright holder'
    )

    parser.add_argument(
        '--copyright-year', help='Copyright year'
    )

    parser.add_argument(
        '--license', help='Copyright license'
    )

    parser.add_argument(
        '--publication-organization', help='Publication organization'
    )

    parser.add_argument(
        '--publication-description', help='Publication description'
    )

    parser.add_argument(
        '--publication-date', help='Publication date'
    )

    parser.add_argument(
        '--document-url', help='Document URL'
    )

    parser.add_argument(
        '--keyword', nargs='+', help='Keyword'
    )

    parser.add_argument(
        '--identifier', '--id', help='Document ID'
    )

    parser.add_argument(
        'files', nargs='+',
        help='File to load'
    )

    return parser


def _build_document(args):
    """Assemble the document metadata described by the parsed arguments."""

    # Pass the user-supplied identifier (None when not given) rather than
    # the builtin `id` function.  Loader.load_file reassigns the document's
    # id from each file's content, so this is only the initial value.
    document = DigitalDocument(
        args.identifier,
        name=args.name,
        description=args.description,
        copyright_notice=args.copyright_notice,
        copyright_holder=args.copyright_holder,
        copyright_year=args.copyright_year,
        license=args.license,
        url=args.document_url,
        keywords=args.keyword,
    )

    # Publication details are attached only when an organization is named.
    if args.publication_organization:
        org = Organization(
            id=to_uri(PREF_ORG, hash(args.publication_organization)),
            name=args.publication_organization,
        )
        document.publication = PublicationEvent(
            id=to_uri(PREF_PUBEV, str(uuid.uuid4())),
            organization=org,
            description=args.publication_description,
            start_date=args.publication_date,
            end_date=args.publication_date,
        )

    return document


def main():
    """Parse the command line and load the named files.

    Builds the document metadata from the command-line options, then loads
    every file through a ``Loader``.  If building the metadata or the loader
    raises, the error is printed and the whole attempt is repeated after a
    10 second pause; per-file failures are handled inside the loader and do
    not trigger a retry.
    """

    args = _build_parser().parse_args()

    while True:

        try:

            document = _build_document(args)

            p = Loader(
                url=args.url,
                user=args.user,
                collection=args.collection,
                metadata=document,
            )

            p.load(args.files)

            print("All done.")
            break

        except Exception as e:

            print("Exception:", e, flush=True)
            print("Will retry...", flush=True)

        # Back off before the next attempt.
        time.sleep(10)

# Run the command only when executed as a script, so that importing this
# module does not trigger argument parsing or any uploads.
if __name__ == "__main__":
    main()
Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00			`#!/usr/bin/env python3`

			`"""`
			`Loads a text document into TrustGraph processing.`
			`"""`

			`import pulsar`
			`from pulsar.schema import JsonSchema`
			`import hashlib`
			`import argparse`
			`import os`
			`import time`
Feature: document metadata (#123) * Rework metadata structure in processing messages to be a subgraph * Add subgraph creation for tg-load-pdf and tg-load-text based on command-line passing of doc attributes * Document metadata is added to knowledge graph with subjectOf linkage to extracted entities 2024-10-23 18:04:04 +01:00			`import uuid`
Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00
Migrate cli utils to REST API (#239) * Port a number of commands to use API gateway instead of Pulsar * Ported tg-invoke-agent to websockets API * Rename the 2 RAG commands: tg-query-... to tg-invoke-... 2025-01-02 19:49:22 +00:00			`from trustgraph.api import Api`
			`from trustgraph.knowledge import hash, to_uri`
Feature: document metadata (#123) * Rework metadata structure in processing messages to be a subgraph * Add subgraph creation for tg-load-pdf and tg-load-text based on command-line passing of doc attributes * Document metadata is added to knowledge graph with subjectOf linkage to extracted entities 2024-10-23 18:04:04 +01:00			`from trustgraph.knowledge import PREF_PUBEV, PREF_DOC, PREF_ORG`
			`from trustgraph.knowledge import Organization, PublicationEvent`
			`from trustgraph.knowledge import DigitalDocument`
Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00
Migrate cli utils to REST API (#239) * Port a number of commands to use API gateway instead of Pulsar * Ported tg-invoke-agent to websockets API * Rename the 2 RAG commands: tg-query-... to tg-invoke-... 2025-01-02 19:49:22 +00:00			`default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')`
Feature / collections (#96) * Update schema defs for source -> metadata * Migrate to use metadata part of schema, also add metadata to triples & vecs * Add user/collection metadata to query * Use user/collection in RAG * Write and query working on triples 2024-10-02 18:14:29 +01:00			`default_user = 'trustgraph'`
			`default_collection = 'default'`

Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00			`class Loader:`

			`def __init__(`
			`self,`
Migrate cli utils to REST API (#239) * Port a number of commands to use API gateway instead of Pulsar * Ported tg-invoke-agent to websockets API * Rename the 2 RAG commands: tg-query-... to tg-invoke-... 2025-01-02 19:49:22 +00:00			`url,`
Feature / collections (#96) * Update schema defs for source -> metadata * Migrate to use metadata part of schema, also add metadata to triples & vecs * Add user/collection metadata to query * Use user/collection in RAG * Write and query working on triples 2024-10-02 18:14:29 +01:00			`user,`
			`collection,`
Feature: document metadata (#123) * Rework metadata structure in processing messages to be a subgraph * Add subgraph creation for tg-load-pdf and tg-load-text based on command-line passing of doc attributes * Document metadata is added to knowledge graph with subjectOf linkage to extracted entities 2024-10-23 18:04:04 +01:00			`metadata,`
Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00			`):`

Migrate cli utils to REST API (#239) * Port a number of commands to use API gateway instead of Pulsar * Ported tg-invoke-agent to websockets API * Rename the 2 RAG commands: tg-query-... to tg-invoke-... 2025-01-02 19:49:22 +00:00			`self.api = Api(url)`
Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00
Feature / collections (#96) * Update schema defs for source -> metadata * Migrate to use metadata part of schema, also add metadata to triples & vecs * Add user/collection metadata to query * Use user/collection in RAG * Write and query working on triples 2024-10-02 18:14:29 +01:00			`self.user = user`
			`self.collection = collection`
Feature: document metadata (#123) * Rework metadata structure in processing messages to be a subgraph * Add subgraph creation for tg-load-pdf and tg-load-text based on command-line passing of doc attributes * Document metadata is added to knowledge graph with subjectOf linkage to extracted entities 2024-10-23 18:04:04 +01:00			`self.metadata = metadata`
Feature / collections (#96) * Update schema defs for source -> metadata * Migrate to use metadata part of schema, also add metadata to triples & vecs * Add user/collection metadata to query * Use user/collection in RAG * Write and query working on triples 2024-10-02 18:14:29 +01:00
Replace... (#91) Replace tg-load-* -f file with tg-load-* file1 file2 ... 2024-10-01 19:34:35 +01:00			`def load(self, files):`
Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00
Replace... (#91) Replace tg-load-* -f file with tg-load-* file1 file2 ... 2024-10-01 19:34:35 +01:00			`for file in files:`
			`self.load_file(file)`

			`def load_file(self, file):`
Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00
			`try:`

Replace... (#91) Replace tg-load-* -f file with tg-load-* file1 file2 ... 2024-10-01 19:34:35 +01:00			`path = file`
Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00			`data = open(path, "rb").read()`

Feature: document metadata (#123) * Rework metadata structure in processing messages to be a subgraph * Add subgraph creation for tg-load-pdf and tg-load-text based on command-line passing of doc attributes * Document metadata is added to knowledge graph with subjectOf linkage to extracted entities 2024-10-23 18:04:04 +01:00			`# Create a SHA256 hash from the data`
			`id = hash(data)`

			`id = to_uri(PREF_DOC, id)`

			`self.metadata.id = id`
Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00
Migrate cli utils to REST API (#239) * Port a number of commands to use API gateway instead of Pulsar * Ported tg-invoke-agent to websockets API * Rename the 2 RAG commands: tg-query-... to tg-invoke-... 2025-01-02 19:49:22 +00:00			`self.api.load_text(`
			`text=data, id=id, metadata=self.metadata,`
			`# user=self.user,`
			`# collection=self.collection,`
			`)`
Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00
Replace... (#91) Replace tg-load-* -f file with tg-load-* file1 file2 ... 2024-10-01 19:34:35 +01:00			`print(f"{file}: Loaded successfully.")`

Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00			`except Exception as e:`
Replace... (#91) Replace tg-load-* -f file with tg-load-* file1 file2 ... 2024-10-01 19:34:35 +01:00			`print(f"{file}: Failed: {str(e)}", flush=True)`
Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00
			`def main():`

			`parser = argparse.ArgumentParser(`
Fix/improve command line help (#145) * Make command line consistent, fix incorrect documentation. * Improve tg-invoke-prompt help 2024-11-08 18:14:14 +00:00			`prog='tg-load-text',`
Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00			`description=__doc__,`
			`)`

			`parser.add_argument(`
Migrate cli utils to REST API (#239) * Port a number of commands to use API gateway instead of Pulsar * Ported tg-invoke-agent to websockets API * Rename the 2 RAG commands: tg-query-... to tg-invoke-... 2025-01-02 19:49:22 +00:00			`'-u', '--url',`
			`default=default_url,`
			`help=f'API URL (default: {default_url})',`
Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00			`)`

Feature / collections (#96) * Update schema defs for source -> metadata * Migrate to use metadata part of schema, also add metadata to triples & vecs * Add user/collection metadata to query * Use user/collection in RAG * Write and query working on triples 2024-10-02 18:14:29 +01:00			`parser.add_argument(`
Migrate cli utils to REST API (#239) * Port a number of commands to use API gateway instead of Pulsar * Ported tg-invoke-agent to websockets API * Rename the 2 RAG commands: tg-query-... to tg-invoke-... 2025-01-02 19:49:22 +00:00			`'-U', '--user',`
Feature / collections (#96) * Update schema defs for source -> metadata * Migrate to use metadata part of schema, also add metadata to triples & vecs * Add user/collection metadata to query * Use user/collection in RAG * Write and query working on triples 2024-10-02 18:14:29 +01:00			`default=default_user,`
			`help=f'User ID (default: {default_user})'`
			`)`

			`parser.add_argument(`
Migrate cli utils to REST API (#239) * Port a number of commands to use API gateway instead of Pulsar * Ported tg-invoke-agent to websockets API * Rename the 2 RAG commands: tg-query-... to tg-invoke-... 2025-01-02 19:49:22 +00:00			`'-C', '--collection',`
Feature / collections (#96) * Update schema defs for source -> metadata * Migrate to use metadata part of schema, also add metadata to triples & vecs * Add user/collection metadata to query * Use user/collection in RAG * Write and query working on triples 2024-10-02 18:14:29 +01:00			`default=default_collection,`
			`help=f'Collection ID (default: {default_collection})'`
			`)`

Feature: document metadata (#123) * Rework metadata structure in processing messages to be a subgraph * Add subgraph creation for tg-load-pdf and tg-load-text based on command-line passing of doc attributes * Document metadata is added to knowledge graph with subjectOf linkage to extracted entities 2024-10-23 18:04:04 +01:00			`parser.add_argument(`
			`'--name', help=f'Document name'`
			`)`

			`parser.add_argument(`
			`'--description', help=f'Document description'`
			`)`

			`parser.add_argument(`
			`'--copyright-notice', help=f'Copyright notice'`
			`)`

			`parser.add_argument(`
			`'--copyright-holder', help=f'Copyright holder'`
			`)`

			`parser.add_argument(`
			`'--copyright-year', help=f'Copyright year'`
			`)`

			`parser.add_argument(`
			`'--license', help=f'Copyright license'`
			`)`

			`parser.add_argument(`
			`'--publication-organization', help=f'Publication organization'`
			`)`

			`parser.add_argument(`
			`'--publication-description', help=f'Publication description'`
			`)`

			`parser.add_argument(`
			`'--publication-date', help=f'Publication date'`
			`)`

			`parser.add_argument(`
Migrate cli utils to REST API (#239) * Port a number of commands to use API gateway instead of Pulsar * Ported tg-invoke-agent to websockets API * Rename the 2 RAG commands: tg-query-... to tg-invoke-... 2025-01-02 19:49:22 +00:00			`'--document-url', help=f'Document URL'`
Feature: document metadata (#123) * Rework metadata structure in processing messages to be a subgraph * Add subgraph creation for tg-load-pdf and tg-load-text based on command-line passing of doc attributes * Document metadata is added to knowledge graph with subjectOf linkage to extracted entities 2024-10-23 18:04:04 +01:00			`)`

			`parser.add_argument(`
			`'--keyword', nargs='+', help=f'Keyword'`
			`)`

			`parser.add_argument(`
			`'--identifier', '--id', help=f'Document ID'`
			`)`

Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00			`parser.add_argument(`
Replace... (#91) Replace tg-load-* -f file with tg-load-* file1 file2 ... 2024-10-01 19:34:35 +01:00			`'files', nargs='+',`
Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00			`help=f'File to load'`
			`)`

			`args = parser.parse_args()`

			`while True:`

			`try:`
Replace... (#91) Replace tg-load-* -f file with tg-load-* file1 file2 ... 2024-10-01 19:34:35 +01:00
Feature: document metadata (#123) * Rework metadata structure in processing messages to be a subgraph * Add subgraph creation for tg-load-pdf and tg-load-text based on command-line passing of doc attributes * Document metadata is added to knowledge graph with subjectOf linkage to extracted entities 2024-10-23 18:04:04 +01:00			`document = DigitalDocument(`
			`id,`
			`name=args.name,`
			`description=args.description,`
			`copyright_notice=args.copyright_notice,`
			`copyright_holder=args.copyright_holder,`
			`copyright_year=args.copyright_year,`
			`license=args.license,`
Migrate cli utils to REST API (#239) * Port a number of commands to use API gateway instead of Pulsar * Ported tg-invoke-agent to websockets API * Rename the 2 RAG commands: tg-query-... to tg-invoke-... 2025-01-02 19:49:22 +00:00			`url=args.document_url,`
Feature: document metadata (#123) * Rework metadata structure in processing messages to be a subgraph * Add subgraph creation for tg-load-pdf and tg-load-text based on command-line passing of doc attributes * Document metadata is added to knowledge graph with subjectOf linkage to extracted entities 2024-10-23 18:04:04 +01:00			`keywords=args.keyword,`
			`)`

			`if args.publication_organization:`
			`org = Organization(`
			`id=to_uri(PREF_ORG, hash(args.publication_organization)),`
			`name=args.publication_organization,`
			`)`
			`document.publication = PublicationEvent(`
			`id = to_uri(PREF_PUBEV, str(uuid.uuid4())),`
			`organization=org,`
			`description=args.publication_description,`
			`start_date=args.publication_date,`
			`end_date=args.publication_date,`
			`)`

Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00			`p = Loader(`
Migrate cli utils to REST API (#239) * Port a number of commands to use API gateway instead of Pulsar * Ported tg-invoke-agent to websockets API * Rename the 2 RAG commands: tg-query-... to tg-invoke-... 2025-01-02 19:49:22 +00:00			`url=args.url,`
Feature / collections (#96) * Update schema defs for source -> metadata * Migrate to use metadata part of schema, also add metadata to triples & vecs * Add user/collection metadata to query * Use user/collection in RAG * Write and query working on triples 2024-10-02 18:14:29 +01:00			`user=args.user,`
			`collection=args.collection,`
Feature: document metadata (#123) * Rework metadata structure in processing messages to be a subgraph * Add subgraph creation for tg-load-pdf and tg-load-text based on command-line passing of doc attributes * Document metadata is added to knowledge graph with subjectOf linkage to extracted entities 2024-10-23 18:04:04 +01:00			`metadata=document,`
Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00			`)`

Replace... (#91) Replace tg-load-* -f file with tg-load-* file1 file2 ... 2024-10-01 19:34:35 +01:00			`p.load(args.files)`
Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00
Replace... (#91) Replace tg-load-* -f file with tg-load-* file1 file2 ... 2024-10-01 19:34:35 +01:00			`print("All done.")`
Refactor templates (#52) * Switching from docker compose to abstract form - should be easier to k8s later * Text loader util * Recreate templates 2024-09-05 16:40:47 +01:00			`break`

			`except Exception as e:`

			`print("Exception:", e, flush=True)`
			`print("Will retry...", flush=True)`

			`time.sleep(10)`

			`main()`