diff --git a/containers/Containerfile.unstructured b/containers/Containerfile.unstructured index 22ee05b2..f159eb88 100644 --- a/containers/Containerfile.unstructured +++ b/containers/Containerfile.unstructured @@ -38,6 +38,11 @@ RUN ls /root/wheels FROM base +# Pre-install CPU-only PyTorch so that unstructured[pdf]'s torch +# dependency is satisfied without pulling in CUDA (~190MB vs ~2GB+) +RUN pip3 install --no-cache-dir torch==2.11.0+cpu \ + --index-url https://download.pytorch.org/whl/cpu + COPY --from=build /root/wheels /root/wheels RUN \ diff --git a/trustgraph-unstructured/pyproject.toml b/trustgraph-unstructured/pyproject.toml index c0e9d025..33265edb 100644 --- a/trustgraph-unstructured/pyproject.toml +++ b/trustgraph-unstructured/pyproject.toml @@ -14,13 +14,7 @@ dependencies = [ "pulsar-client", "prometheus-client", "python-magic", - "unstructured[csv,docx,epub,md,odt,pptx,rst,rtf,tsv,xlsx]", - "pdfminer.six", - "pdf2image", - "pikepdf", - "pi_heif", - "pypdfium2", - "unstructured.pytesseract", + "unstructured[csv,docx,epub,md,odt,pdf,pptx,rst,rtf,tsv,xlsx]", ] classifiers = [ "Programming Language :: Python :: 3",