mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 08:26:21 +02:00
Incremental / large document loading (#659)
Tech spec
BlobStore (trustgraph-flow/trustgraph/librarian/blob_store.py):
- get_stream() - yields document content in chunks for streaming retrieval
- create_multipart_upload() - initializes S3 multipart upload, returns
upload_id
- upload_part() - uploads a single part, returns etag
- complete_multipart_upload() - finalizes upload with part etags
- abort_multipart_upload() - cancels and cleans up
Cassandra schema (trustgraph-flow/trustgraph/tables/library.py):
- New upload_session table with 24-hour TTL
- Index on user for listing sessions
- Prepared statements for all operations
- Methods: create_upload_session(), get_upload_session(),
update_upload_session_chunk(), delete_upload_session(),
list_upload_sessions()
- Schema extended with UploadSession, UploadProgress, and new
request/response fields
- Librarian methods: begin_upload, upload_chunk, complete_upload,
abort_upload, get_upload_status, list_uploads
- Service routing for all new operations
- Python SDK with transparent chunked upload:
- add_document() auto-switches to chunked for files > 10MB
- Progress callback support (on_progress)
- get_pending_uploads(), get_upload_status(), abort_upload(),
resume_upload()
- Document table: Added parent_id and document_type columns with index
- Document schema (knowledge/document.py): Added document_id field for
streaming retrieval
- Librarian operations:
- add-child-document for extracted PDF pages
- list-children to get child documents
- stream-document for chunked content retrieval
- Cascade delete removes children when parent is deleted
- list-documents filters children by default
- PDF decoder (decoding/pdf/pdf_decoder.py): Updated to stream large
documents from librarian API to temp file
- Librarian service (librarian/service.py): Sends document_id instead of
content for large PDFs (>2MB)
- Deprecated tools (load_pdf.py, load_text.py): Added deprecation
warnings directing users to tg-add-library-document +
tg-start-library-processing
Remove load_pdf and load_text utils
Move chunker/librarian comms to base class
Updating tests
This commit is contained in:
parent
a38ca9474f
commit
a630e143ef
21 changed files with 3164 additions and 650 deletions
|
|
@ -1,200 +0,0 @@
|
|||
"""
|
||||
Loads a PDF document into TrustGraph processing by directing to
|
||||
the pdf-decoder queue.
|
||||
Consider using tg-add-library-document to load
|
||||
a document, followed by tg-start-library-processing to initiate processing.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
|
||||
from trustgraph.api import Api
|
||||
from trustgraph.knowledge import hash, to_uri
|
||||
from trustgraph.knowledge import PREF_PUBEV, PREF_DOC, PREF_ORG
|
||||
from trustgraph.knowledge import Organization, PublicationEvent
|
||||
from trustgraph.knowledge import DigitalDocument
|
||||
|
||||
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
|
||||
default_user = 'trustgraph'
|
||||
default_collection = 'default'
|
||||
|
||||
class Loader:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
url,
|
||||
flow_id,
|
||||
user,
|
||||
collection,
|
||||
metadata,
|
||||
):
|
||||
|
||||
self.api = Api(url).flow().id(flow_id)
|
||||
|
||||
self.user = user
|
||||
self.collection = collection
|
||||
self.metadata = metadata
|
||||
|
||||
def load(self, files):
|
||||
|
||||
for file in files:
|
||||
self.load_file(file)
|
||||
|
||||
def load_file(self, file):
|
||||
|
||||
try:
|
||||
|
||||
path = file
|
||||
data = open(path, "rb").read()
|
||||
|
||||
# Create a SHA256 hash from the data
|
||||
id = hash(data)
|
||||
|
||||
id = to_uri(PREF_DOC, id)
|
||||
|
||||
self.metadata.id = id
|
||||
|
||||
self.api.load_document(
|
||||
document=data, id=id, metadata=self.metadata,
|
||||
user=self.user,
|
||||
collection=self.collection,
|
||||
)
|
||||
|
||||
print(f"{file}: Loaded successfully.")
|
||||
|
||||
except Exception as e:
|
||||
print(f"{file}: Failed: {str(e)}", flush=True)
|
||||
raise e
|
||||
|
||||
def main():
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
prog='tg-load-pdf',
|
||||
description=__doc__,
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-u', '--url',
|
||||
default=default_url,
|
||||
help=f'API URL (default: {default_url})',
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-f', '--flow-id',
|
||||
default="default",
|
||||
help=f'Flow ID (default: default)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-U', '--user',
|
||||
default=default_user,
|
||||
help=f'User ID (default: {default_user})'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-C', '--collection',
|
||||
default=default_collection,
|
||||
help=f'Collection ID (default: {default_collection})'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--name', help=f'Document name'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--description', help=f'Document description'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--copyright-notice', help=f'Copyright notice'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--copyright-holder', help=f'Copyright holder'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--copyright-year', help=f'Copyright year'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--license', help=f'Copyright license'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--publication-organization', help=f'Publication organization'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--publication-description', help=f'Publication description'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--publication-date', help=f'Publication date'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--document-url', help=f'Document URL'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--keyword', nargs='+', help=f'Keyword'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--identifier', '--id', help=f'Document ID'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'files', nargs='+',
|
||||
help=f'File to load'
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
try:
|
||||
|
||||
document = DigitalDocument(
|
||||
id,
|
||||
name=args.name,
|
||||
description=args.description,
|
||||
copyright_notice=args.copyright_notice,
|
||||
copyright_holder=args.copyright_holder,
|
||||
copyright_year=args.copyright_year,
|
||||
license=args.license,
|
||||
url=args.document_url,
|
||||
keywords=args.keyword,
|
||||
)
|
||||
|
||||
if args.publication_organization:
|
||||
org = Organization(
|
||||
id=to_uri(PREF_ORG, hash(args.publication_organization)),
|
||||
name=args.publication_organization,
|
||||
)
|
||||
document.publication = PublicationEvent(
|
||||
id = to_uri(PREF_PUBEV, str(uuid.uuid4())),
|
||||
organization=org,
|
||||
description=args.publication_description,
|
||||
start_date=args.publication_date,
|
||||
end_date=args.publication_date,
|
||||
)
|
||||
|
||||
p = Loader(
|
||||
url=args.url,
|
||||
flow_id = args.flow_id,
|
||||
user=args.user,
|
||||
collection=args.collection,
|
||||
metadata=document,
|
||||
)
|
||||
|
||||
p.load(args.files)
|
||||
|
||||
except Exception as e:
|
||||
|
||||
print("Exception:", e, flush=True)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,205 +0,0 @@
|
|||
"""
|
||||
Loads a text document into TrustGraph processing by directing to a text
|
||||
loader queue.
|
||||
Consider using tg-add-library-document to load
|
||||
a document, followed by tg-start-library-processing to initiate processing.
|
||||
"""
|
||||
|
||||
import pulsar
|
||||
from pulsar.schema import JsonSchema
|
||||
import hashlib
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
|
||||
from trustgraph.api import Api
|
||||
from trustgraph.knowledge import hash, to_uri
|
||||
from trustgraph.knowledge import PREF_PUBEV, PREF_DOC, PREF_ORG
|
||||
from trustgraph.knowledge import Organization, PublicationEvent
|
||||
from trustgraph.knowledge import DigitalDocument
|
||||
|
||||
default_url = os.getenv("TRUSTGRAPH_URL", 'http://localhost:8088/')
|
||||
default_user = 'trustgraph'
|
||||
default_collection = 'default'
|
||||
|
||||
class Loader:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
url,
|
||||
flow_id,
|
||||
user,
|
||||
collection,
|
||||
metadata,
|
||||
):
|
||||
|
||||
self.api = Api(url).flow().id(flow_id)
|
||||
|
||||
self.user = user
|
||||
self.collection = collection
|
||||
self.metadata = metadata
|
||||
|
||||
def load(self, files):
|
||||
|
||||
for file in files:
|
||||
self.load_file(file)
|
||||
|
||||
def load_file(self, file):
|
||||
|
||||
try:
|
||||
|
||||
path = file
|
||||
data = open(path, "rb").read()
|
||||
|
||||
# Create a SHA256 hash from the data
|
||||
id = hash(data)
|
||||
|
||||
id = to_uri(PREF_DOC, id)
|
||||
|
||||
self.metadata.id = id
|
||||
|
||||
self.api.load_text(
|
||||
text=data, id=id, metadata=self.metadata,
|
||||
user=self.user,
|
||||
collection=self.collection,
|
||||
)
|
||||
|
||||
print(f"{file}: Loaded successfully.")
|
||||
|
||||
except Exception as e:
|
||||
print(f"{file}: Failed: {str(e)}", flush=True)
|
||||
raise e
|
||||
|
||||
def main():
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
prog='tg-load-text',
|
||||
description=__doc__,
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-u', '--url',
|
||||
default=default_url,
|
||||
help=f'API URL (default: {default_url})',
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-f', '--flow-id',
|
||||
default="default",
|
||||
help=f'Flow ID (default: default)'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-U', '--user',
|
||||
default=default_user,
|
||||
help=f'User ID (default: {default_user})'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-C', '--collection',
|
||||
default=default_collection,
|
||||
help=f'Collection ID (default: {default_collection})'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--name', help=f'Document name'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--description', help=f'Document description'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--copyright-notice', help=f'Copyright notice'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--copyright-holder', help=f'Copyright holder'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--copyright-year', help=f'Copyright year'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--license', help=f'Copyright license'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--publication-organization', help=f'Publication organization'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--publication-description', help=f'Publication description'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--publication-date', help=f'Publication date'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--document-url', help=f'Document URL'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--keyword', nargs='+', help=f'Keyword'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--identifier', '--id', help=f'Document ID'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'files', nargs='+',
|
||||
help=f'File to load'
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
try:
|
||||
|
||||
document = DigitalDocument(
|
||||
id,
|
||||
name=args.name,
|
||||
description=args.description,
|
||||
copyright_notice=args.copyright_notice,
|
||||
copyright_holder=args.copyright_holder,
|
||||
copyright_year=args.copyright_year,
|
||||
license=args.license,
|
||||
url=args.document_url,
|
||||
keywords=args.keyword,
|
||||
)
|
||||
|
||||
if args.publication_organization:
|
||||
org = Organization(
|
||||
id=to_uri(PREF_ORG, hash(args.publication_organization)),
|
||||
name=args.publication_organization,
|
||||
)
|
||||
document.publication = PublicationEvent(
|
||||
id = to_uri(PREF_PUBEV, str(uuid.uuid4())),
|
||||
organization=org,
|
||||
description=args.publication_description,
|
||||
start_date=args.publication_date,
|
||||
end_date=args.publication_date,
|
||||
)
|
||||
|
||||
p = Loader(
|
||||
url = args.url,
|
||||
flow_id = args.flow_id,
|
||||
user = args.user,
|
||||
collection = args.collection,
|
||||
metadata = document,
|
||||
)
|
||||
|
||||
p.load(args.files)
|
||||
|
||||
print("All done.")
|
||||
|
||||
except Exception as e:
|
||||
|
||||
print("Exception:", e, flush=True)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue