mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-07-04 10:52:17 +02:00
chore: refactor file upload mechanism to avoid NFS dependency (#496)
* chore: refactor file upload mechanism to avoid NFS dependency
* add regression test for deregistration of calls
* fix: fix minio upload issue
* fix: make transcript upload async
This commit is contained in:
parent
79a4a3c9f1
commit
a54ab519b8
23 changed files with 370 additions and 401 deletions
|
|
@ -45,7 +45,6 @@ from api.tasks.campaign_tasks import (
|
|||
)
|
||||
from api.tasks.knowledge_base_processing import process_knowledge_base_document
|
||||
from api.tasks.run_integrations import run_integrations_post_workflow_run
|
||||
from api.tasks.s3_upload import upload_voicemail_audio_to_s3
|
||||
from api.tasks.webhook_delivery import deliver_webhook, sweep_webhook_deliveries
|
||||
from api.tasks.workflow_completion import process_workflow_completion
|
||||
|
||||
|
|
@ -53,7 +52,6 @@ from api.tasks.workflow_completion import process_workflow_completion
|
|||
class WorkerSettings:
|
||||
functions = [
|
||||
run_integrations_post_workflow_run,
|
||||
upload_voicemail_audio_to_s3,
|
||||
process_workflow_completion,
|
||||
sync_campaign_source,
|
||||
process_campaign_batch,
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
class FunctionNames:
|
||||
RUN_INTEGRATIONS_POST_WORKFLOW_RUN = "run_integrations_post_workflow_run"
|
||||
PROCESS_WORKFLOW_COMPLETION = "process_workflow_completion"
|
||||
UPLOAD_VOICEMAIL_AUDIO_TO_S3 = "upload_voicemail_audio_to_s3"
|
||||
SYNC_CAMPAIGN_SOURCE = "sync_campaign_source"
|
||||
PROCESS_CAMPAIGN_BATCH = "process_campaign_batch"
|
||||
PROCESS_KNOWLEDGE_BASE_DOCUMENT = "process_knowledge_base_document"
|
||||
|
|
|
|||
|
|
@ -1,67 +0,0 @@
|
|||
import os
|
||||
|
||||
from loguru import logger
|
||||
from pipecat.utils.run_context import set_current_run_id
|
||||
|
||||
from api.services.storage import storage_fs
|
||||
|
||||
|
||||
async def upload_voicemail_audio_to_s3(
    _ctx,
    workflow_run_id: int,
    temp_file_path: str,
    s3_key: str,
):
    """Upload voicemail detection audio from a temp file to S3.

    Handles voicemail-specific paths and doesn't update the workflow run's
    recording_url field. The temp file is removed afterwards whether or not
    the upload succeeded.

    Args:
        _ctx: ARQ context (unused)
        workflow_run_id: The workflow run ID
        temp_file_path: Path to the temporary WAV file
        s3_key: The S3 key where the file should be uploaded

    Raises:
        FileNotFoundError: If ``temp_file_path`` does not exist.
        RuntimeError: If the storage layer reports the upload as failed.
        Exception: Anything raised by the storage layer is logged and
            re-raised unchanged.
    """
    run_id = str(workflow_run_id)
    set_current_run_id(run_id)

    logger.info(f"Starting voicemail audio upload to S3 from {temp_file_path}")

    try:
        # Fail fast with a specific error; the handler below logs it once.
        if not os.path.exists(temp_file_path):
            raise FileNotFoundError(
                f"Temp voicemail audio file not found: {temp_file_path}"
            )

        file_size = os.path.getsize(temp_file_path)
        logger.debug(f"Voicemail audio file size: {file_size} bytes")

        upload_ok = await storage_fs.aupload_file(temp_file_path, s3_key)
        if not upload_ok:
            # RuntimeError is still an Exception, so existing handlers that
            # catch Exception keep working.
            raise RuntimeError(f"S3 upload failed for {s3_key}")

        logger.info(f"Successfully uploaded voicemail audio to S3: {s3_key}")
    except Exception as e:
        # Single logging point for every failure path above.
        logger.error(
            f"Error uploading voicemail audio to S3 for workflow {workflow_run_id}: {e}"
        )
        raise
    finally:
        # Best-effort cleanup: a missing file is fine, and any other failure
        # is only a warning so it never masks the upload result.
        try:
            os.remove(temp_file_path)
        except FileNotFoundError:
            pass
        except Exception as e:
            logger.warning(
                f"Failed to clean up temp voicemail audio file {temp_file_path}: {e}"
            )
        else:
            logger.debug(f"Cleaned up temp voicemail audio file: {temp_file_path}")
|
||||
|
|
@ -1,178 +1,118 @@
|
|||
import asyncio
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
from loguru import logger
|
||||
from pipecat.utils.run_context import set_current_run_id
|
||||
|
||||
from api.db import db_client
|
||||
from api.services.storage import get_current_storage_backend, storage_fs
|
||||
from api.services.workflow_run_artifacts import upload_workflow_run_artifacts
|
||||
from api.services.workflow_run_billing import (
|
||||
report_completed_workflow_run_platform_usage,
|
||||
)
|
||||
from api.tasks.run_integrations import run_integrations_post_workflow_run
|
||||
|
||||
|
||||
def _recording_metadata(storage_key: str, storage_backend: str, track: str) -> dict:
|
||||
return {
|
||||
"storage_key": storage_key,
|
||||
"storage_backend": storage_backend,
|
||||
"format": "wav",
|
||||
"track": track,
|
||||
}
|
||||
|
||||
|
||||
async def _upload_temp_file(
|
||||
workflow_run_id: int,
|
||||
temp_file_path: str,
|
||||
storage_key: str,
|
||||
label: str,
|
||||
) -> bool:
|
||||
def _read_and_remove_temp_file(temp_file_path: str | None, label: str) -> bytes | None:
|
||||
if not temp_file_path:
|
||||
return None
|
||||
try:
|
||||
if not os.path.exists(temp_file_path):
|
||||
logger.warning(f"{label} temp file not found: {temp_file_path}")
|
||||
return False
|
||||
|
||||
file_size = os.path.getsize(temp_file_path)
|
||||
logger.debug(f"{label} file size: {file_size} bytes")
|
||||
|
||||
await storage_fs.aupload_file(temp_file_path, storage_key)
|
||||
logger.info(f"Successfully uploaded {label}: {storage_key}")
|
||||
return True
|
||||
return None
|
||||
with open(temp_file_path, "rb") as f:
|
||||
data = f.read()
|
||||
os.remove(temp_file_path)
|
||||
return data
|
||||
except Exception as e:
|
||||
logger.error(f"Error uploading {label} for workflow {workflow_run_id}: {e}")
|
||||
return False
|
||||
finally:
|
||||
if os.path.exists(temp_file_path):
|
||||
try:
|
||||
os.remove(temp_file_path)
|
||||
logger.debug(f"Cleaned up temp {label} file: {temp_file_path}")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to clean up temp {label} file: {e}")
|
||||
logger.error(f"Error reading legacy {label} temp file {temp_file_path}: {e}")
|
||||
return None
|
||||
|
||||
|
||||
async def _upload_legacy_temp_artifacts(
    workflow_run_id: int,
    audio_temp_path: str | None,
    transcript_temp_path: str | None,
    user_audio_temp_path: str | None,
    bot_audio_temp_path: str | None,
) -> None:
    """Handle jobs enqueued before uploads moved into the pipeline process.

    Pre-refactor web workers passed local temp-file paths; upload them if this
    worker can still see the files (same host / shared volume).

    Each path is handed to ``_read_and_remove_temp_file`` on a worker thread
    so the blocking file I/O stays off the event loop. Whatever it returns is
    forwarded to ``upload_workflow_run_artifacts``; the transcript is decoded
    as UTF-8 first, and an empty or missing transcript is passed as ``None``.

    Deprecated: remove once no pre-refactor jobs can remain in the queue.

    Args:
        workflow_run_id: The workflow run ID.
        audio_temp_path: Temp path of the mixed recording, if any.
        transcript_temp_path: Temp path of the transcript, if any.
        user_audio_temp_path: Temp path of the user-track recording, if any.
        bot_audio_temp_path: Temp path of the bot-track recording, if any.
    """
    logger.info(
        f"Processing legacy temp-file artifacts for workflow run {workflow_run_id}"
    )

    # Files are read in the same order as before: transcript, mixed, user, bot.
    transcript_bytes = await asyncio.to_thread(
        _read_and_remove_temp_file, transcript_temp_path, "transcript"
    )
    mixed_audio = await asyncio.to_thread(
        _read_and_remove_temp_file, audio_temp_path, "mixed audio"
    )
    user_audio = await asyncio.to_thread(
        _read_and_remove_temp_file, user_audio_temp_path, "user audio"
    )
    bot_audio = await asyncio.to_thread(
        _read_and_remove_temp_file, bot_audio_temp_path, "bot audio"
    )

    transcript_text = transcript_bytes.decode("utf-8") if transcript_bytes else None

    await upload_workflow_run_artifacts(
        workflow_run_id,
        mixed_audio_wav=mixed_audio,
        user_audio_wav=user_audio,
        bot_audio_wav=bot_audio,
        transcript_text=transcript_text,
    )
|
||||
|
||||
|
||||
async def process_workflow_completion(
|
||||
_ctx,
|
||||
workflow_run_id: int,
|
||||
audio_temp_path: Optional[str] = None,
|
||||
transcript_temp_path: Optional[str] = None,
|
||||
user_audio_temp_path: Optional[str] = None,
|
||||
bot_audio_temp_path: Optional[str] = None,
|
||||
audio_temp_path: str | None = None,
|
||||
transcript_temp_path: str | None = None,
|
||||
user_audio_temp_path: str | None = None,
|
||||
bot_audio_temp_path: str | None = None,
|
||||
):
|
||||
"""Process workflow completion: upload artifacts and run integrations.
|
||||
"""Process workflow completion: run integrations and report billing.
|
||||
|
||||
This task combines audio upload, transcript upload, and webhook integrations
|
||||
into a single sequential task to ensure integrations run after uploads complete.
|
||||
Recording/transcript uploads happen in the pipeline process itself
|
||||
(api/services/workflow_run_artifacts.py) before this job is enqueued,
|
||||
so this task needs no shared filesystem with the web tier. The temp-path
|
||||
arguments only exist for jobs enqueued by pre-refactor web workers.
|
||||
|
||||
Args:
|
||||
_ctx: ARQ context (unused)
|
||||
workflow_run_id: The workflow run ID
|
||||
audio_temp_path: Optional path to temp audio file
|
||||
transcript_temp_path: Optional path to temp transcript file
|
||||
user_audio_temp_path: Optional path to temp user-track audio file
|
||||
bot_audio_temp_path: Optional path to temp bot-track audio file
|
||||
audio_temp_path: Deprecated, pre-refactor jobs only
|
||||
transcript_temp_path: Deprecated, pre-refactor jobs only
|
||||
user_audio_temp_path: Deprecated, pre-refactor jobs only
|
||||
bot_audio_temp_path: Deprecated, pre-refactor jobs only
|
||||
"""
|
||||
run_id = str(workflow_run_id)
|
||||
set_current_run_id(run_id)
|
||||
|
||||
logger.info(f"Processing workflow completion for run {workflow_run_id}")
|
||||
|
||||
storage_backend = get_current_storage_backend()
|
||||
|
||||
# Step 1: Upload audio if provided
|
||||
recordings_metadata: dict[str, dict] = {}
|
||||
|
||||
if audio_temp_path:
|
||||
recording_url = f"recordings/{workflow_run_id}.wav"
|
||||
logger.info(
|
||||
f"Uploading mixed audio to {storage_backend.name} - workflow_run_id: {workflow_run_id}"
|
||||
)
|
||||
if await _upload_temp_file(
|
||||
workflow_run_id, audio_temp_path, recording_url, "mixed audio"
|
||||
):
|
||||
recordings_metadata["mixed"] = _recording_metadata(
|
||||
recording_url, storage_backend.value, "mixed"
|
||||
)
|
||||
await db_client.update_workflow_run(
|
||||
run_id=workflow_run_id,
|
||||
recording_url=recording_url,
|
||||
storage_backend=storage_backend.value,
|
||||
)
|
||||
|
||||
if user_audio_temp_path:
|
||||
user_recording_url = f"recordings/{workflow_run_id}/user.wav"
|
||||
logger.info(
|
||||
f"Uploading user audio to {storage_backend.name} - workflow_run_id: {workflow_run_id}"
|
||||
)
|
||||
if await _upload_temp_file(
|
||||
workflow_run_id, user_audio_temp_path, user_recording_url, "user audio"
|
||||
):
|
||||
recordings_metadata["user"] = _recording_metadata(
|
||||
user_recording_url, storage_backend.value, "user"
|
||||
)
|
||||
|
||||
if bot_audio_temp_path:
|
||||
bot_recording_url = f"recordings/{workflow_run_id}/bot.wav"
|
||||
logger.info(
|
||||
f"Uploading bot audio to {storage_backend.name} - workflow_run_id: {workflow_run_id}"
|
||||
)
|
||||
if await _upload_temp_file(
|
||||
workflow_run_id, bot_audio_temp_path, bot_recording_url, "bot audio"
|
||||
):
|
||||
recordings_metadata["bot"] = _recording_metadata(
|
||||
bot_recording_url, storage_backend.value, "bot"
|
||||
)
|
||||
|
||||
if recordings_metadata:
|
||||
await db_client.update_workflow_run(
|
||||
run_id=workflow_run_id,
|
||||
storage_backend=storage_backend.value,
|
||||
extra={"recordings": recordings_metadata},
|
||||
if (
|
||||
audio_temp_path
|
||||
or transcript_temp_path
|
||||
or user_audio_temp_path
|
||||
or bot_audio_temp_path
|
||||
):
|
||||
await _upload_legacy_temp_artifacts(
|
||||
workflow_run_id,
|
||||
audio_temp_path,
|
||||
transcript_temp_path,
|
||||
user_audio_temp_path,
|
||||
bot_audio_temp_path,
|
||||
)
|
||||
|
||||
# Step 2: Upload transcript if provided
|
||||
if transcript_temp_path:
|
||||
try:
|
||||
if os.path.exists(transcript_temp_path):
|
||||
file_size = os.path.getsize(transcript_temp_path)
|
||||
logger.debug(f"Transcript file size: {file_size} bytes")
|
||||
|
||||
transcript_url = f"transcripts/{workflow_run_id}.txt"
|
||||
logger.info(
|
||||
f"Uploading transcript to {storage_backend.name} - workflow_run_id: {workflow_run_id}"
|
||||
)
|
||||
|
||||
await storage_fs.aupload_file(transcript_temp_path, transcript_url)
|
||||
await db_client.update_workflow_run(
|
||||
run_id=workflow_run_id,
|
||||
transcript_url=transcript_url,
|
||||
storage_backend=storage_backend.value,
|
||||
)
|
||||
logger.info(f"Successfully uploaded transcript: {transcript_url}")
|
||||
else:
|
||||
logger.warning(
|
||||
f"Transcript temp file not found: {transcript_temp_path}"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error uploading transcript for workflow {workflow_run_id}: {e}"
|
||||
)
|
||||
finally:
|
||||
if transcript_temp_path and os.path.exists(transcript_temp_path):
|
||||
try:
|
||||
os.remove(transcript_temp_path)
|
||||
logger.debug(
|
||||
f"Cleaned up temp transcript file: {transcript_temp_path}"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to clean up temp transcript file: {e}")
|
||||
|
||||
# Step 3: Run integrations including QA analysis (after uploads are complete)
|
||||
# Run integrations including QA analysis (after uploads are complete)
|
||||
try:
|
||||
await run_integrations_post_workflow_run(_ctx, workflow_run_id)
|
||||
except Exception as e:
|
||||
logger.error(f"Error running integrations for workflow {workflow_run_id}: {e}")
|
||||
|
||||
# Step 4: Notify MPS after completion. MPS owns credit accounting.
|
||||
# Notify MPS after completion. MPS owns credit accounting.
|
||||
try:
|
||||
await report_completed_workflow_run_platform_usage(workflow_run_id)
|
||||
except Exception as e:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue