# Enable PDF support in the unstructured container and package.
#
# containers/Containerfile.unstructured:
#   - adds libxcb and mesa-libGL to the dnf install in the base stage
#   - installs torch==2.11.0+cpu from the PyTorch CPU wheel index in the
#     final stage, before the built wheels are copied in
# trustgraph-unstructured/pyproject.toml:
#   - adds the "pdf" extra to the unstructured dependency
#
# NOTE(review): this patch was recovered from a copy whose newlines had been
# collapsed. Blank context lines were placed according to the hunk line
# counts and continuation-line indentation is assumed to be 4 spaces --
# verify against the target files (or apply with --ignore-whitespace).
diff --git a/containers/Containerfile.unstructured b/containers/Containerfile.unstructured
index 22ee05b2..7284901e 100644
--- a/containers/Containerfile.unstructured
+++ b/containers/Containerfile.unstructured
@@ -7,7 +7,7 @@ FROM docker.io/fedora:42 AS base
 
 ENV PIP_BREAK_SYSTEM_PACKAGES=1
 
-RUN dnf install -y python3.13 && \
+RUN dnf install -y python3.13 libxcb mesa-libGL && \
     alternatives --install /usr/bin/python python /usr/bin/python3.13 1 && \
     python -m ensurepip --upgrade && \
     pip3 install --no-cache-dir build wheel aiohttp && \
@@ -38,6 +38,11 @@ RUN ls /root/wheels
 
 FROM base
 
+# Pre-install CPU-only PyTorch so that unstructured[pdf]'s torch
+# dependency is satisfied without pulling in CUDA (~190MB vs ~2GB+)
+RUN pip3 install --no-cache-dir torch==2.11.0+cpu \
+    --index-url https://download.pytorch.org/whl/cpu
+
 COPY --from=build /root/wheels /root/wheels
 
 RUN \
diff --git a/trustgraph-unstructured/pyproject.toml b/trustgraph-unstructured/pyproject.toml
index 35597398..33265edb 100644
--- a/trustgraph-unstructured/pyproject.toml
+++ b/trustgraph-unstructured/pyproject.toml
@@ -14,7 +14,7 @@ dependencies = [
     "pulsar-client",
     "prometheus-client",
     "python-magic",
-    "unstructured[csv,docx,epub,md,odt,pptx,rst,rtf,tsv,xlsx]",
+    "unstructured[csv,docx,epub,md,odt,pdf,pptx,rst,rtf,tsv,xlsx]",
 ]
 classifiers = [
     "Programming Language :: Python :: 3",