mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 00:16:23 +02:00
Added module which does OCR for PDF, pdf-ocr in a separate package (#324)
(has a lot of dependencies). Uses Tesseract.
This commit is contained in:
parent
cbfe37fec7
commit
c759d55734
9 changed files with 208 additions and 0 deletions
48
containers/Containerfile.ocr
Normal file
48
containers/Containerfile.ocr
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Build an AI container. This does the torch install which is huge, and I
|
||||
# like to avoid re-doing this.
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
FROM docker.io/fedora:40 AS base
|
||||
|
||||
ENV PIP_BREAK_SYSTEM_PACKAGES=1
|
||||
|
||||
RUN dnf install -y python3 python3-pip python3-wheel python3-aiohttp \
|
||||
python3-rdflib tesseract poppler poppler-utils
|
||||
|
||||
RUN pip3 install --no-cache-dir pytesseract pulsar-client==3.5.0
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Build a container which contains the built Python packages. The build
|
||||
# creates a bunch of left-over cruft, a separate phase means this is only
|
||||
# needed to support package build
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
FROM base AS build
|
||||
|
||||
COPY trustgraph-base/ /root/build/trustgraph-base/
|
||||
COPY trustgraph-tesseract/ /root/build/trustgraph-tesseract/
|
||||
|
||||
WORKDIR /root/build/
|
||||
|
||||
RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-base/
|
||||
RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-tesseract/
|
||||
|
||||
RUN ls /root/wheels
|
||||
|
||||
# ----------------------------------------------------------------------------
|
||||
# Finally, the target container. Start with base and add the package.
|
||||
# ----------------------------------------------------------------------------
|
||||
|
||||
FROM base
|
||||
|
||||
COPY --from=build /root/wheels /root/wheels
|
||||
|
||||
RUN \
|
||||
pip3 install --no-cache-dir /root/wheels/trustgraph_base-* && \
|
||||
pip3 install --no-cache-dir /root/wheels/trustgraph_ocr-* && \
|
||||
rm -rf /root/wheels
|
||||
|
||||
WORKDIR /
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue