chore: refactor file upload mechanism to avoid NFS dependency (#496)

* chore: refactor file upload mechanism to avoid NFS dependency
* add regression test for deregistration of calls
* fix: fix minio upload issue
* fix: make transcript upload async
2026-07-04 10:52:17 +02:00 · 2026-07-03 20:01:52 +05:30 · 2026-07-03 20:01:52 +05:30 · a54ab519b8
commit a54ab519b8
parent 79a4a3c9f1
23 changed files with 370 additions and 401 deletions
--- a/api/services/filesystem/base.py
+++ b/api/services/filesystem/base.py
@ -1,23 +1,51 @@
 from abc import ABC, abstractmethod
-from typing import Any, BinaryIO, Dict, Optional
+from typing import Any, Dict, Optional, Protocol
+
+
+class AsyncReadable(Protocol):
+    """Structural type for anything exposing ``await .read() -> bytes``.
+
+    Examples: aiofiles handles, in-memory wrappers. ``read()`` is declared
+    without a size argument, so callers obtain the content with a single
+    awaited call.
+    """
+
+    # Protocol stub only; implementers supply the real coroutine.
+    async def read(self) -> bytes: ...
+
+
+class _AsyncBytesReader:
+    """Async file-like wrapper over in-memory bytes for acreate_file().
+
+    Provides the ``await .read() -> bytes`` shape that ``acreate_file()``
+    expects, backed by a bytes object instead of a file.
+    """
+
+    def __init__(self, data: bytes):
+        # Stored by reference; no copy of the payload is made here.
+        self._data = data
+
+    async def read(self) -> bytes:
+        """Return the full payload.
+
+        No read position is tracked, so every call returns the same bytes.
+        """
+        return self._data


 class BaseFileSystem(ABC):
    """Abstract base class for filesystem operations."""

    @abstractmethod
-    async def acreate_file(self, file_path: str, content: BinaryIO) -> bool:
+    async def acreate_file(self, file_path: str, content: AsyncReadable) -> bool:
        """Create a new file with the given content.

        Args:
            file_path: Path where the file should be created
-            content: File content as a binary stream
+            content: File content readable via ``await content.read()``

        Returns:
            bool: True if file was created successfully, False otherwise
        """
        pass

+    async def acreate_file_from_bytes(self, file_path: str, data: bytes) -> bool:
+        """Create a file directly from in-memory bytes (no local file needed).
+
+        Concrete (non-abstract) helper: it wraps ``data`` in an async reader
+        and delegates to ``acreate_file()``, so the result is whatever that
+        method returns.
+
+        Args:
+            file_path: Path where the file should be created
+            data: File content as bytes
+
+        Returns:
+            bool: True if file was created successfully, False otherwise
+        """
+        # _AsyncBytesReader gives the bytes the ``await .read()`` interface
+        # that acreate_file() takes.
+        return await self.acreate_file(file_path, _AsyncBytesReader(data))
+
    @abstractmethod
    async def aupload_file(self, local_path: str, destination_path: str) -> bool:
        """Upload a file from local path to destination.
--- a/api/services/filesystem/local.py
+++ b/api/services/filesystem/local.py
@ -1,11 +1,11 @@
 import asyncio
 import os
 from datetime import datetime
-from typing import BinaryIO, Optional
+from typing import Optional

 import aiofiles

-from .base import BaseFileSystem
+from .base import AsyncReadable, BaseFileSystem


 class LocalFileSystem(BaseFileSystem):
@ -24,7 +24,7 @@ class LocalFileSystem(BaseFileSystem):
        """Get the full path by joining with base path."""
        return os.path.join(self.base_path, file_path)

-    async def acreate_file(self, file_path: str, content: BinaryIO) -> bool:
+    async def acreate_file(self, file_path: str, content: AsyncReadable) -> bool:
        try:
            full_path = self._get_full_path(file_path)
            os.makedirs(os.path.dirname(full_path), exist_ok=True)
--- a/api/services/filesystem/minio.py
+++ b/api/services/filesystem/minio.py
@ -1,12 +1,13 @@
 import asyncio
+import io
 import json
-from typing import Any, BinaryIO, Dict, Optional
+from typing import Any, Dict, Optional

 from loguru import logger
 from minio import Minio
 from minio.error import S3Error

-from .base import BaseFileSystem
+from .base import AsyncReadable, BaseFileSystem


 class MinioFileSystem(BaseFileSystem):
@ -89,15 +90,16 @@ class MinioFileSystem(BaseFileSystem):
            logger.debug(f"Bucket setup note: {e}")
            pass

-    async def acreate_file(self, file_path: str, content: BinaryIO) -> bool:
+    async def acreate_file(self, file_path: str, content: AsyncReadable) -> bool:
        try:
            data = await content.read()

            def _put():
+                # The MinIO SDK requires a stream with .read(), not raw bytes.
                self.client.put_object(
                    self.bucket_name,
                    file_path,
-                    data=bytes(data),
+                    data=io.BytesIO(data),
                    length=len(data),
                )

--- a/api/services/filesystem/null.py
+++ b/api/services/filesystem/null.py
@ -1,6 +1,6 @@
-from typing import Any, BinaryIO, Dict, NoReturn, Optional
+from typing import Any, Dict, NoReturn, Optional

-from .base import BaseFileSystem
+from .base import AsyncReadable, BaseFileSystem


 class NullFileSystem(BaseFileSystem):
@ -16,7 +16,7 @@ class NullFileSystem(BaseFileSystem):
            "Set ENVIRONMENT to a non-test value or inject a real filesystem fixture."
        )

-    async def acreate_file(self, file_path: str, content: BinaryIO) -> bool:
+    async def acreate_file(self, file_path: str, content: AsyncReadable) -> bool:
        self._fail("acreate_file")

    async def aupload_file(self, local_path: str, destination_path: str) -> bool:
--- a/api/services/filesystem/s3.py
+++ b/api/services/filesystem/s3.py
@ -1,10 +1,10 @@
-from typing import Any, BinaryIO, Dict, Optional
+from typing import Any, Dict, Optional

 import aioboto3
 from botocore.config import Config
 from botocore.exceptions import ClientError

-from .base import BaseFileSystem
+from .base import AsyncReadable, BaseFileSystem


 class S3FileSystem(BaseFileSystem):
@ -57,7 +57,7 @@ class S3FileSystem(BaseFileSystem):
            kwargs["config"] = self._config
        return kwargs

-    async def acreate_file(self, file_path: str, content: BinaryIO) -> bool:
+    async def acreate_file(self, file_path: str, content: AsyncReadable) -> bool:
        try:
            async with self.session.client("s3", **self._client_kwargs()) as s3_client:
                await s3_client.put_object(
--- a/api/services/pipecat/event_handlers.py
+++ b/api/services/pipecat/event_handlers.py
@ -16,6 +16,7 @@ from api.services.pipecat.pipeline_metrics_aggregator import PipelineMetricsAggr
 from api.services.pipecat.tracing_config import get_trace_url
 from api.services.posthog_client import capture_event
 from api.services.workflow.pipecat_engine import PipecatEngine
+from api.services.workflow_run_artifacts import upload_workflow_run_artifacts
 from api.tasks.arq import enqueue_job
 from api.tasks.function_names import FunctionNames
 from pipecat.frames.frames import (
@ -361,50 +362,49 @@ def register_event_handlers(
            except Exception as e:
                logger.error(f"Error saving workflow run logs: {e}", exc_info=True)

-        # Write buffers to temp files and enqueue combined processing task
-        audio_temp_path = None
-        user_audio_temp_path = None
-        bot_audio_temp_path = None
-        transcript_temp_path = None
-
+        # Upload artifacts straight from the in-memory buffers so nothing has
+        # to cross a process/host boundary via temp files. Must complete
+        # before the completion job is enqueued so QA and webhooks see the
+        # artifacts in storage.
        try:
+            mixed_audio_wav = None
+            user_audio_wav = None
+            bot_audio_wav = None
+
            if not in_memory_audio_buffers.mixed.is_empty:
-                audio_temp_path = (
-                    await in_memory_audio_buffers.mixed.write_to_temp_file()
-                )
+                mixed_audio_wav = await in_memory_audio_buffers.mixed.to_wav_bytes()
            else:
                logger.debug("Audio buffer is empty, skipping upload")

            if not in_memory_audio_buffers.user.is_empty:
-                user_audio_temp_path = (
-                    await in_memory_audio_buffers.user.write_to_temp_file()
-                )
+                user_audio_wav = await in_memory_audio_buffers.user.to_wav_bytes()
            else:
                logger.debug("User audio buffer is empty, skipping upload")

            if not in_memory_audio_buffers.bot.is_empty:
-                bot_audio_temp_path = (
-                    await in_memory_audio_buffers.bot.write_to_temp_file()
-                )
+                bot_audio_wav = await in_memory_audio_buffers.bot.to_wav_bytes()
            else:
                logger.debug("Bot audio buffer is empty, skipping upload")

-            transcript_temp_path = in_memory_logs_buffer.write_transcript_to_temp_file()
-            if not transcript_temp_path:
+            transcript_text = in_memory_logs_buffer.generate_transcript_text()
+            if not transcript_text:
                logger.debug("No transcript events in logs buffer, skipping upload")

+            await upload_workflow_run_artifacts(
+                workflow_run_id,
+                mixed_audio_wav=mixed_audio_wav,
+                user_audio_wav=user_audio_wav,
+                bot_audio_wav=bot_audio_wav,
+                transcript_text=transcript_text,
+            )
        except Exception as e:
-            logger.error(f"Error preparing buffers for S3 upload: {e}", exc_info=True)
+            logger.error(f"Error uploading call artifacts: {e}", exc_info=True)

-        # Combined task: uploads artifacts, runs integrations (including QA),
-        # then calculates cost (so QA token usage is captured in usage_info)
+        # Combined task: runs integrations (including QA), then calculates
+        # cost (so QA token usage is captured in usage_info)
        await enqueue_job(
            FunctionNames.PROCESS_WORKFLOW_COMPLETION,
            workflow_run_id,
-            audio_temp_path,
-            transcript_temp_path,
-            user_audio_temp_path,
-            bot_audio_temp_path,
        )

    # Return the buffer so it can be passed to other handlers
--- a/api/services/pipecat/in_memory_buffers.py
+++ b/api/services/pipecat/in_memory_buffers.py
@ -1,5 +1,5 @@
 import asyncio
-import tempfile
+import io
 import wave
 from datetime import UTC, datetime
 from typing import List, Optional
@ -15,7 +15,7 @@ from pipecat.utils.enums import RealtimeFeedbackType


 class InMemoryAudioBuffer:
-    """Buffer audio data in memory during a call, then write to temp file on disconnect."""
+    """Buffer audio data in memory during a call, then encode to WAV bytes on disconnect."""

    def __init__(self, workflow_run_id: int, sample_rate: int, num_channels: int = 1):
        self._workflow_run_id = workflow_run_id
@ -41,28 +41,30 @@ class InMemoryAudioBuffer:
                f"Appended {len(pcm_data)} bytes to audio buffer. Total size: {self._total_size}"
            )

-    async def write_to_temp_file(self) -> str:
-        """Write audio data to a temporary WAV file and return the path."""
+    async def to_wav_bytes(self) -> bytes:
+        """Encode the buffered PCM data as an in-memory WAV file."""
        async with self._lock:
-            temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-            logger.debug(
-                f"Writing audio buffer to temp file {temp_file.name} for workflow {self._workflow_run_id}"
-            )
+            chunks = list(self._chunks)

-            # Write WAV header and PCM data
-            with wave.open(temp_file.name, "wb") as wf:
+        def _encode() -> bytes:
+            wav_io = io.BytesIO()
+            with wave.open(wav_io, "wb") as wf:
                wf.setnchannels(self._num_channels)
                wf.setsampwidth(2)  # 16-bit audio
                wf.setframerate(self._sample_rate)

                # Concatenate all chunks
-                for chunk in self._chunks:
+                for chunk in chunks:
                    wf.writeframes(chunk)
+            return wav_io.getvalue()

-            logger.info(
-                f"Successfully wrote {self._total_size} bytes of audio to {temp_file.name}"
-            )
-            return temp_file.name
+        # Encoding is mostly memcpy but can touch ~100MB; keep it off the event loop
+        data = await asyncio.to_thread(_encode)
+        logger.info(
+            f"Encoded {self._total_size} bytes of audio to {len(data)} WAV bytes "
+            f"for workflow {self._workflow_run_id}"
+        )
+        return data

    @property
    def is_empty(self) -> bool:
@ -172,27 +174,6 @@ class InMemoryLogsBuffer:
        """
        return _generate_transcript_text(self._sorted_events())

-    def write_transcript_to_temp_file(self) -> Optional[str]:
-        """Write transcript to a temporary text file and return the path.
-
-        Returns None if there are no transcript events.
-        """
-        content = self.generate_transcript_text()
-        if not content:
-            return None
-
-        temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False)
-        logger.debug(
-            f"Writing transcript to temp file {temp_file.name} for workflow {self._workflow_run_id}"
-        )
-        temp_file.write(content)
-        temp_file.close()
-
-        logger.info(
-            f"Successfully wrote {len(content)} chars of transcript to {temp_file.name}"
-        )
-        return temp_file.name
-
    @property
    def is_empty(self) -> bool:
        """Check if the buffer is empty."""
--- a/api/services/workflow_run_artifacts.py
+++ b/api/services/workflow_run_artifacts.py
@ -0,0 +1,126 @@
+"""Upload end-of-call artifacts (recordings, transcript) to object storage.
+
+Called from the pipeline process itself, straight from the in-memory call
+buffers, so no local file ever has to cross a process/host boundary (no
+shared /tmp between web and ARQ workers). Uploads happen before the
+workflow-completion job is enqueued so QA and webhooks see the artifacts
+in storage.
+"""
+
+from loguru import logger
+
+from api.db import db_client
+from api.services.storage import get_current_storage_backend, storage_fs
+
+
+def _recording_metadata(storage_key: str, storage_backend: str, track: str) -> dict:
+    """Build the metadata entry stored for one uploaded recording track.
+
+    Args:
+        storage_key: Object key the recording was uploaded under
+        storage_backend: Backend identifier recorded alongside the key
+        track: Track name, e.g. "mixed", "user" or "bot"
+
+    Returns:
+        dict: The three arguments plus a fixed ``"format": "wav"`` entry
+    """
+    return {
+        "storage_key": storage_key,
+        "storage_backend": storage_backend,
+        "format": "wav",
+        "track": track,
+    }
+
+
+async def _upload_bytes(
+    workflow_run_id: int,
+    data: bytes,
+    storage_key: str,
+    label: str,
+) -> bool:
+    """Upload one in-memory artifact and report success without raising.
+
+    Args:
+        workflow_run_id: Run the artifact belongs to (used in log messages only)
+        data: Artifact content
+        storage_key: Destination key passed to the storage filesystem
+        label: Human-readable artifact name used in log messages
+
+    Returns:
+        bool: True if the storage layer reported success; False if it
+        returned a falsy result or raised (both cases are logged)
+    """
+    try:
+        logger.debug(f"{label} size: {len(data)} bytes")
+        if await storage_fs.acreate_file_from_bytes(storage_key, data):
+            logger.info(f"Successfully uploaded {label}: {storage_key}")
+            return True
+        # Falsy return: the backend reported failure without raising.
+        logger.error(
+            f"Storage backend rejected {label} upload for workflow "
+            f"{workflow_run_id}: {storage_key}"
+        )
+        return False
+    except Exception as e:
+        # Swallowed on purpose so the caller can continue with other artifacts.
+        logger.error(f"Error uploading {label} for workflow {workflow_run_id}: {e}")
+        return False
+
+
+async def _persist_run_update(workflow_run_id: int, label: str, **fields) -> bool:
+    """Write artifact fields onto the workflow run, logging instead of raising.
+
+    Args:
+        workflow_run_id: Run whose record is updated
+        label: Human-readable artifact name used in the error log
+        **fields: Keyword arguments forwarded to ``db_client.update_workflow_run``
+
+    Returns:
+        bool: True if the update call completed, False if it raised
+    """
+    try:
+        await db_client.update_workflow_run(run_id=workflow_run_id, **fields)
+        return True
+    except Exception as e:
+        logger.error(
+            f"Error persisting {label} metadata for workflow {workflow_run_id}: {e}"
+        )
+        return False
+
+
+async def upload_workflow_run_artifacts(
+    workflow_run_id: int,
+    *,
+    mixed_audio_wav: bytes | None = None,
+    user_audio_wav: bytes | None = None,
+    bot_audio_wav: bytes | None = None,
+    transcript_text: str | None = None,
+) -> None:
+    """Upload call artifacts to object storage and persist their metadata.
+
+    Each artifact is handled independently: an upload failure or a metadata
+    write failure is logged and the remaining artifacts are still attempted.
+    Artifacts that are ``None`` or empty are skipped.
+
+    Args:
+        workflow_run_id: Run the artifacts belong to
+        mixed_audio_wav: WAV bytes of the mixed recording
+        user_audio_wav: WAV bytes of the user track
+        bot_audio_wav: WAV bytes of the bot track
+        transcript_text: Transcript text, uploaded UTF-8 encoded
+    """
+    storage_backend = get_current_storage_backend()
+
+    # (track, payload, storage key). The mixed track uses the flat
+    # recordings/<id>.wav key; per-speaker tracks live under recordings/<id>/.
+    audio_tracks = (
+        ("mixed", mixed_audio_wav, f"recordings/{workflow_run_id}.wav"),
+        ("user", user_audio_wav, f"recordings/{workflow_run_id}/user.wav"),
+        ("bot", bot_audio_wav, f"recordings/{workflow_run_id}/bot.wav"),
+    )
+
+    recordings_metadata: dict[str, dict] = {}
+
+    for track, wav_bytes, storage_key in audio_tracks:
+        if not wav_bytes:
+            continue
+        label = f"{track} audio"
+        logger.info(
+            f"Uploading {label} to {storage_backend.name} - workflow_run_id: {workflow_run_id}"
+        )
+        if not await _upload_bytes(workflow_run_id, wav_bytes, storage_key, label):
+            continue
+        recordings_metadata[track] = _recording_metadata(
+            storage_key, storage_backend.value, track
+        )
+        if track == "mixed":
+            # Only the mixed recording is written to the run's recording_url.
+            await _persist_run_update(
+                workflow_run_id,
+                label,
+                recording_url=storage_key,
+                storage_backend=storage_backend.value,
+            )
+
+    if recordings_metadata:
+        await _persist_run_update(
+            workflow_run_id,
+            "recordings",
+            storage_backend=storage_backend.value,
+            extra={"recordings": recordings_metadata},
+        )
+
+    if transcript_text:
+        transcript_url = f"transcripts/{workflow_run_id}.txt"
+        logger.info(
+            f"Uploading transcript to {storage_backend.name} - workflow_run_id: {workflow_run_id}"
+        )
+        if await _upload_bytes(
+            workflow_run_id,
+            transcript_text.encode("utf-8"),
+            transcript_url,
+            "transcript",
+        ):
+            await _persist_run_update(
+                workflow_run_id,
+                "transcript",
+                transcript_url=transcript_url,
+                storage_backend=storage_backend.value,
+            )