Added module which does OCR for PDF, pdf-ocr in a separate package (#324)

(has a lot of dependencies).  Uses Tesseract.
This commit is contained in:
cybermaggedon 2025-03-20 09:29:40 +00:00 committed by GitHub
parent cbfe37fec7
commit c759d55734
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 208 additions and 0 deletions

View file

@ -0,0 +1,48 @@
# ----------------------------------------------------------------------------
# Build an AI container. This does the torch install which is huge, and I
# like to avoid re-doing this.
# ----------------------------------------------------------------------------
FROM docker.io/fedora:40 AS base
ENV PIP_BREAK_SYSTEM_PACKAGES=1
RUN dnf install -y python3 python3-pip python3-wheel python3-aiohttp \
python3-rdflib tesseract poppler poppler-utils
RUN pip3 install --no-cache-dir pytesseract pulsar-client==3.5.0
# ----------------------------------------------------------------------------
# Build a container which contains the built Python packages. The build
# creates a bunch of left-over cruft, a separate phase means this is only
# needed to support package build
# ----------------------------------------------------------------------------
FROM base AS build
COPY trustgraph-base/ /root/build/trustgraph-base/
COPY trustgraph-tesseract/ /root/build/trustgraph-tesseract/
WORKDIR /root/build/
RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-base/
RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-tesseract/
RUN ls /root/wheels
# ----------------------------------------------------------------------------
# Finally, the target container. Start with base and add the package.
# ----------------------------------------------------------------------------
FROM base
COPY --from=build /root/wheels /root/wheels
RUN \
pip3 install --no-cache-dir /root/wheels/trustgraph_base-* && \
pip3 install --no-cache-dir /root/wheels/trustgraph_ocr-* && \
rm -rf /root/wheels
WORKDIR /