diff --git a/Containerfile b/Containerfile
index 45a152c0..7283a06b 100644
--- a/Containerfile
+++ b/Containerfile
@@ -45,6 +45,7 @@ COPY trustgraph-vertexai/ /root/build/trustgraph-vertexai/
 COPY trustgraph-bedrock/ /root/build/trustgraph-bedrock/
 COPY trustgraph-embeddings-hf/ /root/build/trustgraph-embeddings-hf/
 COPY trustgraph-cli/ /root/build/trustgraph-cli/
+COPY trustgraph-ocr/ /root/build/trustgraph-ocr/
 
 WORKDIR /root/build/
 
@@ -54,6 +55,7 @@ RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-vertexai/
 RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-bedrock/
 RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-embeddings-hf/
 RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-cli/
+RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-ocr/
 
 RUN ls /root/wheels
 
@@ -72,6 +74,7 @@ RUN \
     pip3 install /root/wheels/trustgraph_bedrock-* && \
     pip3 install /root/wheels/trustgraph_embeddings_hf-* && \
     pip3 install /root/wheels/trustgraph_cli-* && \
+    pip3 install /root/wheels/trustgraph_ocr-* && \
     pip3 cache purge && \
     rm -rf /root/wheels
 
diff --git a/Makefile b/Makefile
index 0defca58..1fae97f6 100644
--- a/Makefile
+++ b/Makefile
@@ -16,6 +16,7 @@ wheels:
 	pip3 wheel --no-deps --wheel-dir dist trustgraph-bedrock/
 	pip3 wheel --no-deps --wheel-dir dist trustgraph-embeddings-hf/
 	pip3 wheel --no-deps --wheel-dir dist trustgraph-cli/
+	pip3 wheel --no-deps --wheel-dir dist trustgraph-ocr/
 
 packages: update-package-versions
 	rm -rf dist/
@@ -26,6 +27,7 @@ packages: update-package-versions
 	cd trustgraph-bedrock && python3 setup.py sdist --dist-dir ../dist/
 	cd trustgraph-embeddings-hf && python3 setup.py sdist --dist-dir ../dist/
 	cd trustgraph-cli && python3 setup.py sdist --dist-dir ../dist/
+	cd trustgraph-ocr && python3 setup.py sdist --dist-dir ../dist/
 
 pypi-upload:
 	twine upload dist/*-${VERSION}.*
@@ -41,6 +43,7 @@ update-package-versions:
 	echo __version__ = \"${VERSION}\" > trustgraph-bedrock/trustgraph/bedrock_version.py
 	echo __version__ = \"${VERSION}\" > trustgraph-embeddings-hf/trustgraph/embeddings_hf_version.py
 	echo __version__ = \"${VERSION}\" > trustgraph-cli/trustgraph/cli_version.py
+	echo __version__ = \"${VERSION}\" > trustgraph-ocr/trustgraph/ocr_version.py
 	echo __version__ = \"${VERSION}\" > trustgraph/trustgraph/trustgraph_version.py
 
 container: update-package-versions
@@ -54,6 +57,12 @@ container: update-package-versions
 	    -t ${CONTAINER_BASE}/trustgraph-vertexai:${VERSION} .
 	${DOCKER} build -f containers/Containerfile.hf \
 	    -t ${CONTAINER_BASE}/trustgraph-hf:${VERSION} .
+	${DOCKER} build -f containers/Containerfile.ocr \
+	    -t ${CONTAINER_BASE}/trustgraph-ocr:${VERSION} .
+
+container.ocr: update-package-versions
+	${DOCKER} build -f containers/Containerfile.ocr \
+	    -t ${CONTAINER_BASE}/trustgraph-ocr:${VERSION} .
 
 push:
 	${DOCKER} push ${CONTAINER_BASE}/trustgraph-base:${VERSION}
@@ -61,6 +70,7 @@ push:
 	${DOCKER} push ${CONTAINER_BASE}/trustgraph-bedrock:${VERSION}
 	${DOCKER} push ${CONTAINER_BASE}/trustgraph-vertexai:${VERSION}
 	${DOCKER} push ${CONTAINER_BASE}/trustgraph-hf:${VERSION}
+	${DOCKER} push ${CONTAINER_BASE}/trustgraph-ocr:${VERSION}
 
 clean:
 	rm -rf wheels/
diff --git a/containers/Containerfile.ocr b/containers/Containerfile.ocr
new file mode 100644
index 00000000..1b1fa4b0
--- /dev/null
+++ b/containers/Containerfile.ocr
@@ -0,0 +1,48 @@
+
+# ----------------------------------------------------------------------------
+# Build the OCR base container.  This installs Tesseract, Poppler and the
+# Python dependencies; it is a separate stage so the result can be re-used.
+# ----------------------------------------------------------------------------
+
+FROM docker.io/fedora:40 AS base
+
+ENV PIP_BREAK_SYSTEM_PACKAGES=1
+
+RUN dnf install -y python3 python3-pip python3-wheel python3-aiohttp \
+    python3-rdflib tesseract poppler poppler-utils
+
+RUN pip3 install --no-cache-dir pytesseract pulsar-client==3.5.0
+
+# ----------------------------------------------------------------------------
+# Build a container which contains the built Python packages.  The build
+# creates a bunch of left-over cruft, a separate phase means this is only
+# needed to support package build
+# ----------------------------------------------------------------------------
+
+FROM base AS build
+
+COPY trustgraph-base/ /root/build/trustgraph-base/
+COPY trustgraph-ocr/ /root/build/trustgraph-ocr/
+
+WORKDIR /root/build/
+
+RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-base/
+RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-ocr/
+
+RUN ls /root/wheels
+
+# ----------------------------------------------------------------------------
+# Finally, the target container.  Start with base and add the package.
+# ----------------------------------------------------------------------------
+
+FROM base
+
+COPY --from=build /root/wheels /root/wheels
+
+RUN \
+    pip3 install --no-cache-dir /root/wheels/trustgraph_base-* && \
+    pip3 install --no-cache-dir /root/wheels/trustgraph_ocr-* && \
+    rm -rf /root/wheels
+
+WORKDIR /
+
diff --git a/trustgraph-ocr/README.md b/trustgraph-ocr/README.md
new file mode 100644
index 00000000..7a2ce130
--- /dev/null
+++ b/trustgraph-ocr/README.md
@@ -0,0 +1 @@
+See https://trustgraph.ai/
diff --git a/trustgraph-ocr/scripts/pdf-ocr b/trustgraph-ocr/scripts/pdf-ocr
new file mode 100755
index 00000000..1417351f
--- /dev/null
+++ b/trustgraph-ocr/scripts/pdf-ocr
@@ -0,0 +1,6 @@
+#!/usr/bin/env python3
+
+from trustgraph.decoding.ocr import run
+
+run()
+
diff --git a/trustgraph-ocr/setup.py b/trustgraph-ocr/setup.py
new file mode 100644
index 00000000..43e15061
--- /dev/null
+++ b/trustgraph-ocr/setup.py
@@ -0,0 +1,47 @@
+import setuptools
+import os
+import importlib.util
+
+with open("README.md", "r") as fh:
+    long_description = fh.read()
+
+# Load a version number module
+spec = importlib.util.spec_from_file_location(
+    'version', 'trustgraph/ocr_version.py'
+)
+version_module = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(version_module)
+
+version = version_module.__version__
+
+setuptools.setup(
+    name="trustgraph-ocr",
+    version=version,
+    author="trustgraph.ai",
+    author_email="security@trustgraph.ai",
+    description="TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/trustgraph-ai/trustgraph",
+    packages=setuptools.find_namespace_packages(
+        where='./',
+    ),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
+        "Operating System :: OS Independent",
+    ],
+    python_requires='>=3.8',
+    download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz",
+    install_requires=[
+        "trustgraph-base>=0.21,<0.22",
+        "pulsar-client",
+        "prometheus-client",
+        "boto3",
+        "pdf2image",
+        "pytesseract",
+    ],
+    scripts=[
+        "scripts/pdf-ocr",
+    ]
+)
diff --git a/trustgraph-ocr/trustgraph/decoding/ocr/__init__.py b/trustgraph-ocr/trustgraph/decoding/ocr/__init__.py
new file mode 100644
index 00000000..0d8d9c78
--- /dev/null
+++ b/trustgraph-ocr/trustgraph/decoding/ocr/__init__.py
@@ -0,0 +1,3 @@
+
+from . pdf_decoder import *
+
diff --git a/trustgraph-ocr/trustgraph/decoding/ocr/__main__.py b/trustgraph-ocr/trustgraph/decoding/ocr/__main__.py
new file mode 100755
index 00000000..44dd026d
--- /dev/null
+++ b/trustgraph-ocr/trustgraph/decoding/ocr/__main__.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+
+from . pdf_decoder import run
+
+if __name__ == '__main__':
+    run()
+
diff --git a/trustgraph-ocr/trustgraph/decoding/ocr/pdf_decoder.py b/trustgraph-ocr/trustgraph/decoding/ocr/pdf_decoder.py
new file mode 100755
index 00000000..f8926589
--- /dev/null
+++ b/trustgraph-ocr/trustgraph/decoding/ocr/pdf_decoder.py
@@ -0,0 +1,104 @@
+
+"""
+Simple decoder, accepts PDF documents on input, outputs pages from the
+PDF document as text as separate output objects.
+"""
+
+import tempfile
+import base64
+import pytesseract
+from pdf2image import convert_from_bytes
+
+from ... schema import Document, TextDocument, Metadata
+from ... schema import document_ingest_queue, text_ingest_queue
+from ... log_level import LogLevel
+from ... base import ConsumerProducer
+
+# Processor identity derived from the module path by dropping the first and
+# last dotted components of __name__.
+module = ".".join(__name__.split(".")[1:-1])
+
+default_input_queue = document_ingest_queue
+default_output_queue = text_ingest_queue
+default_subscriber = module
+
+class Processor(ConsumerProducer):
+    """
+    Consumes Document messages carrying base64-encoded PDF data, OCRs each
+    page with Tesseract and emits one TextDocument per page.
+    """
+
+    def __init__(self, **params):
+        """
+        Fill in queue and subscriber defaults, then initialise the base
+        class with the Document input and TextDocument output schemas.
+        """
+
+        input_queue = params.get("input_queue", default_input_queue)
+        output_queue = params.get("output_queue", default_output_queue)
+        subscriber = params.get("subscriber", default_subscriber)
+
+        super(Processor, self).__init__(
+            **params | {
+                "input_queue": input_queue,
+                "output_queue": output_queue,
+                "subscriber": subscriber,
+                "input_schema": Document,
+                "output_schema": TextDocument,
+            }
+        )
+
+        print("PDF OCR inited")
+
+    async def handle(self, msg):
+        """
+        Decode the PDF in the message, OCR it page by page and send the
+        text of each page as a separate TextDocument.
+        """
+
+        print("PDF message received")
+
+        v = msg.value()
+
+        print(f"Decoding {v.metadata.id}...", flush=True)
+
+        blob = base64.b64decode(v.data)
+
+        pages = convert_from_bytes(blob)
+
+        for ix, page in enumerate(pages):
+
+            # Best effort: a page which fails OCR is reported and skipped
+            # so that the remaining pages are still delivered.
+            try:
+                text = pytesseract.image_to_string(page, lang='eng')
+            except Exception as e:
+                print(f"Page {ix + 1} did not OCR: {e}", flush=True)
+                continue
+
+            r = TextDocument(
+                metadata=v.metadata,
+                text=text.encode("utf-8"),
+            )
+
+            await self.send(r)
+
+        print("Done.", flush=True)
+
+    @staticmethod
+    def add_args(parser):
+        """
+        Delegate argument registration to ConsumerProducer.add_args,
+        supplying this module's default queues and subscriber name.
+        """
+
+        ConsumerProducer.add_args(
+            parser, default_input_queue, default_subscriber,
+            default_output_queue,
+        )
+
+def run():
+    """Start the processor via Processor.launch(module, __doc__)."""
+
+    Processor.launch(module, __doc__)
+