trustgraph/containers/Containerfile.unstructured
cybermaggedon 6b1dd16f9f
fix: large document handling and Cassandra query pagination (#969)
- Paginate heavy Cassandra reads (triples, graph/document embeddings)
  using synchronous session.execute() in run_in_executor with fetch_size
  paging, preventing materialization hang on large result sets
- Fix document stream endpoint to use workspace-scoped librarian queues
- Add decoder error handling for PDF/OCR/unstructured processors
- Add WebSocket mux guards for missing auth fields
- Add null check in librarian document streaming
- Rewrite get_document_content CLI to stream via librarian
- Add Poppler dependency to unstructured container
2026-06-01 22:39:30 +01:00

54 lines
2 KiB
Text

# ----------------------------------------------------------------------------
# Base container with system dependencies
# ----------------------------------------------------------------------------
FROM docker.io/fedora:42 AS base
ENV PIP_BREAK_SYSTEM_PACKAGES=1
RUN dnf install -y python3.13 libxcb mesa-libGL poppler-utils && \
alternatives --install /usr/bin/python python /usr/bin/python3.13 1 && \
python -m ensurepip --upgrade && \
pip3 install --no-cache-dir --upgrade 'pip>=26.0' 'setuptools>=78.1.1' && \
pip3 install --no-cache-dir build wheel aiohttp && \
pip3 install --no-cache-dir pulsar-client==3.11.0 && \
dnf clean all
# ----------------------------------------------------------------------------
# Build a container which contains the built Python packages. The build
# creates a bunch of left-over cruft, a separate phase means this is only
# needed to support package build
# ----------------------------------------------------------------------------
FROM base AS build
COPY trustgraph-base/ /root/build/trustgraph-base/
COPY trustgraph-unstructured/ /root/build/trustgraph-unstructured/
WORKDIR /root/build/
RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-base/
RUN pip3 wheel -w /root/wheels/ --no-deps ./trustgraph-unstructured/
RUN ls /root/wheels
# ----------------------------------------------------------------------------
# Finally, the target container. Start with base and add the package.
# ----------------------------------------------------------------------------
FROM base
# Pre-install CPU-only PyTorch so that unstructured[pdf]'s torch
# dependency is satisfied without pulling in CUDA (~190MB vs ~2GB+)
RUN pip3 install --no-cache-dir torch==2.11.0+cpu \
--index-url https://download.pytorch.org/whl/cpu
COPY --from=build /root/wheels /root/wheels
RUN \
pip3 install --no-cache-dir /root/wheels/trustgraph_base-* && \
pip3 install --no-cache-dir /root/wheels/trustgraph_unstructured-* && \
rm -rf /root/wheels
WORKDIR /