chore: refactor file upload mechanism to avoid NFS dependency (#496)

* chore: refactor file upload mechanism to avoid NFS dependency

* add regression test for deregistration of calls

* fix: fix minio upload issue

* fix: make transcript upload async
This commit is contained in:
Abhishek 2026-07-03 20:01:52 +05:30 committed by GitHub
parent 79a4a3c9f1
commit a54ab519b8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
23 changed files with 370 additions and 401 deletions

View file

@ -45,7 +45,6 @@ from api.tasks.campaign_tasks import (
)
from api.tasks.knowledge_base_processing import process_knowledge_base_document
from api.tasks.run_integrations import run_integrations_post_workflow_run
from api.tasks.s3_upload import upload_voicemail_audio_to_s3
from api.tasks.webhook_delivery import deliver_webhook, sweep_webhook_deliveries
from api.tasks.workflow_completion import process_workflow_completion
@ -53,7 +52,6 @@ from api.tasks.workflow_completion import process_workflow_completion
class WorkerSettings:
functions = [
run_integrations_post_workflow_run,
upload_voicemail_audio_to_s3,
process_workflow_completion,
sync_campaign_source,
process_campaign_batch,

View file

@ -1,7 +1,6 @@
class FunctionNames:
RUN_INTEGRATIONS_POST_WORKFLOW_RUN = "run_integrations_post_workflow_run"
PROCESS_WORKFLOW_COMPLETION = "process_workflow_completion"
UPLOAD_VOICEMAIL_AUDIO_TO_S3 = "upload_voicemail_audio_to_s3"
SYNC_CAMPAIGN_SOURCE = "sync_campaign_source"
PROCESS_CAMPAIGN_BATCH = "process_campaign_batch"
PROCESS_KNOWLEDGE_BASE_DOCUMENT = "process_knowledge_base_document"

View file

@ -1,67 +0,0 @@
import os
from loguru import logger
from pipecat.utils.run_context import set_current_run_id
from api.services.storage import storage_fs
async def upload_voicemail_audio_to_s3(
_ctx,
workflow_run_id: int,
temp_file_path: str,
s3_key: str,
):
"""Upload voicemail detection audio from temp file to S3.
Handles voicemail-specific paths and doesn't update the workflow run's
recording_url field.
Args:
_ctx: ARQ context (unused)
workflow_run_id: The workflow run ID
temp_file_path: Path to the temporary WAV file
s3_key: The S3 key where the file should be uploaded
"""
run_id = str(workflow_run_id)
set_current_run_id(run_id)
logger.info(f"Starting voicemail audio upload to S3 from {temp_file_path}")
try:
# Verify temp file exists
if not os.path.exists(temp_file_path):
logger.error(f"Temp voicemail audio file not found: {temp_file_path}")
raise FileNotFoundError(
f"Temp voicemail audio file not found: {temp_file_path}"
)
file_size = os.path.getsize(temp_file_path)
logger.debug(f"Voicemail audio file size: {file_size} bytes")
# Upload to S3
upload_ok = await storage_fs.aupload_file(temp_file_path, s3_key)
if upload_ok:
logger.info(f"Successfully uploaded voicemail audio to S3: {s3_key}")
else:
logger.error(
f"Failed to upload voicemail audio to S3 for workflow {workflow_run_id}"
)
raise Exception(f"S3 upload failed for {s3_key}")
except Exception as e:
logger.error(
f"Error uploading voicemail audio to S3 for workflow {workflow_run_id}: {e}"
)
raise
finally:
# Clean up temp file
if os.path.exists(temp_file_path):
try:
os.remove(temp_file_path)
logger.debug(f"Cleaned up temp voicemail audio file: {temp_file_path}")
except Exception as e:
logger.warning(
f"Failed to clean up temp voicemail audio file {temp_file_path}: {e}"
)

View file

@ -1,178 +1,118 @@
import asyncio
import os
from typing import Optional
from loguru import logger
from pipecat.utils.run_context import set_current_run_id
from api.db import db_client
from api.services.storage import get_current_storage_backend, storage_fs
from api.services.workflow_run_artifacts import upload_workflow_run_artifacts
from api.services.workflow_run_billing import (
report_completed_workflow_run_platform_usage,
)
from api.tasks.run_integrations import run_integrations_post_workflow_run
def _recording_metadata(storage_key: str, storage_backend: str, track: str) -> dict:
return {
"storage_key": storage_key,
"storage_backend": storage_backend,
"format": "wav",
"track": track,
}
async def _upload_temp_file(
workflow_run_id: int,
temp_file_path: str,
storage_key: str,
label: str,
) -> bool:
def _read_and_remove_temp_file(temp_file_path: str | None, label: str) -> bytes | None:
if not temp_file_path:
return None
try:
if not os.path.exists(temp_file_path):
logger.warning(f"{label} temp file not found: {temp_file_path}")
return False
file_size = os.path.getsize(temp_file_path)
logger.debug(f"{label} file size: {file_size} bytes")
await storage_fs.aupload_file(temp_file_path, storage_key)
logger.info(f"Successfully uploaded {label}: {storage_key}")
return True
return None
with open(temp_file_path, "rb") as f:
data = f.read()
os.remove(temp_file_path)
return data
except Exception as e:
logger.error(f"Error uploading {label} for workflow {workflow_run_id}: {e}")
return False
finally:
if os.path.exists(temp_file_path):
try:
os.remove(temp_file_path)
logger.debug(f"Cleaned up temp {label} file: {temp_file_path}")
except Exception as e:
logger.warning(f"Failed to clean up temp {label} file: {e}")
logger.error(f"Error reading legacy {label} temp file {temp_file_path}: {e}")
return None
async def _upload_legacy_temp_artifacts(
workflow_run_id: int,
audio_temp_path: str | None,
transcript_temp_path: str | None,
user_audio_temp_path: str | None,
bot_audio_temp_path: str | None,
) -> None:
"""Handle jobs enqueued before uploads moved into the pipeline process.
Pre-refactor web workers passed local temp-file paths; upload them if this
worker can still see the files (same host / shared volume).
Deprecated: remove once no pre-refactor jobs can remain in the queue.
"""
logger.info(
f"Processing legacy temp-file artifacts for workflow run {workflow_run_id}"
)
transcript_bytes = await asyncio.to_thread(
_read_and_remove_temp_file, transcript_temp_path, "transcript"
)
await upload_workflow_run_artifacts(
workflow_run_id,
mixed_audio_wav=await asyncio.to_thread(
_read_and_remove_temp_file, audio_temp_path, "mixed audio"
),
user_audio_wav=await asyncio.to_thread(
_read_and_remove_temp_file, user_audio_temp_path, "user audio"
),
bot_audio_wav=await asyncio.to_thread(
_read_and_remove_temp_file, bot_audio_temp_path, "bot audio"
),
transcript_text=(
transcript_bytes.decode("utf-8") if transcript_bytes else None
),
)
async def process_workflow_completion(
_ctx,
workflow_run_id: int,
audio_temp_path: Optional[str] = None,
transcript_temp_path: Optional[str] = None,
user_audio_temp_path: Optional[str] = None,
bot_audio_temp_path: Optional[str] = None,
audio_temp_path: str | None = None,
transcript_temp_path: str | None = None,
user_audio_temp_path: str | None = None,
bot_audio_temp_path: str | None = None,
):
"""Process workflow completion: upload artifacts and run integrations.
"""Process workflow completion: run integrations and report billing.
This task combines audio upload, transcript upload, and webhook integrations
into a single sequential task to ensure integrations run after uploads complete.
Recording/transcript uploads happen in the pipeline process itself
(api/services/workflow_run_artifacts.py) before this job is enqueued,
so this task needs no shared filesystem with the web tier. The temp-path
arguments only exist for jobs enqueued by pre-refactor web workers.
Args:
_ctx: ARQ context (unused)
workflow_run_id: The workflow run ID
audio_temp_path: Optional path to temp audio file
transcript_temp_path: Optional path to temp transcript file
user_audio_temp_path: Optional path to temp user-track audio file
bot_audio_temp_path: Optional path to temp bot-track audio file
audio_temp_path: Deprecated, pre-refactor jobs only
transcript_temp_path: Deprecated, pre-refactor jobs only
user_audio_temp_path: Deprecated, pre-refactor jobs only
bot_audio_temp_path: Deprecated, pre-refactor jobs only
"""
run_id = str(workflow_run_id)
set_current_run_id(run_id)
logger.info(f"Processing workflow completion for run {workflow_run_id}")
storage_backend = get_current_storage_backend()
# Step 1: Upload audio if provided
recordings_metadata: dict[str, dict] = {}
if audio_temp_path:
recording_url = f"recordings/{workflow_run_id}.wav"
logger.info(
f"Uploading mixed audio to {storage_backend.name} - workflow_run_id: {workflow_run_id}"
)
if await _upload_temp_file(
workflow_run_id, audio_temp_path, recording_url, "mixed audio"
):
recordings_metadata["mixed"] = _recording_metadata(
recording_url, storage_backend.value, "mixed"
)
await db_client.update_workflow_run(
run_id=workflow_run_id,
recording_url=recording_url,
storage_backend=storage_backend.value,
)
if user_audio_temp_path:
user_recording_url = f"recordings/{workflow_run_id}/user.wav"
logger.info(
f"Uploading user audio to {storage_backend.name} - workflow_run_id: {workflow_run_id}"
)
if await _upload_temp_file(
workflow_run_id, user_audio_temp_path, user_recording_url, "user audio"
):
recordings_metadata["user"] = _recording_metadata(
user_recording_url, storage_backend.value, "user"
)
if bot_audio_temp_path:
bot_recording_url = f"recordings/{workflow_run_id}/bot.wav"
logger.info(
f"Uploading bot audio to {storage_backend.name} - workflow_run_id: {workflow_run_id}"
)
if await _upload_temp_file(
workflow_run_id, bot_audio_temp_path, bot_recording_url, "bot audio"
):
recordings_metadata["bot"] = _recording_metadata(
bot_recording_url, storage_backend.value, "bot"
)
if recordings_metadata:
await db_client.update_workflow_run(
run_id=workflow_run_id,
storage_backend=storage_backend.value,
extra={"recordings": recordings_metadata},
if (
audio_temp_path
or transcript_temp_path
or user_audio_temp_path
or bot_audio_temp_path
):
await _upload_legacy_temp_artifacts(
workflow_run_id,
audio_temp_path,
transcript_temp_path,
user_audio_temp_path,
bot_audio_temp_path,
)
# Step 2: Upload transcript if provided
if transcript_temp_path:
try:
if os.path.exists(transcript_temp_path):
file_size = os.path.getsize(transcript_temp_path)
logger.debug(f"Transcript file size: {file_size} bytes")
transcript_url = f"transcripts/{workflow_run_id}.txt"
logger.info(
f"Uploading transcript to {storage_backend.name} - workflow_run_id: {workflow_run_id}"
)
await storage_fs.aupload_file(transcript_temp_path, transcript_url)
await db_client.update_workflow_run(
run_id=workflow_run_id,
transcript_url=transcript_url,
storage_backend=storage_backend.value,
)
logger.info(f"Successfully uploaded transcript: {transcript_url}")
else:
logger.warning(
f"Transcript temp file not found: {transcript_temp_path}"
)
except Exception as e:
logger.error(
f"Error uploading transcript for workflow {workflow_run_id}: {e}"
)
finally:
if transcript_temp_path and os.path.exists(transcript_temp_path):
try:
os.remove(transcript_temp_path)
logger.debug(
f"Cleaned up temp transcript file: {transcript_temp_path}"
)
except Exception as e:
logger.warning(f"Failed to clean up temp transcript file: {e}")
# Step 3: Run integrations including QA analysis (after uploads are complete)
# Run integrations including QA analysis (after uploads are complete)
try:
await run_integrations_post_workflow_run(_ctx, workflow_run_id)
except Exception as e:
logger.error(f"Error running integrations for workflow {workflow_run_id}: {e}")
# Step 4: Notify MPS after completion. MPS owns credit accounting.
# Notify MPS after completion. MPS owns credit accounting.
try:
await report_completed_workflow_run_platform_usage(workflow_run_id)
except Exception as e: