mirror of
https://github.com/dograh-hq/dograh.git
synced 2026-06-22 08:38:13 +02:00
feat: refactor node spec and add mcp tools (#244)
* refactor: carve out extraction panel * refactor: create spec versions for node types * refactor: create a GenericNode and remove custom nodes * feat: add python and typescript sdk * add dograh sdk * fix: fetch draft workflow definition over published one * fix: fix routes of SDKs to use code gen * chore: remove doclink dependency to reduce image size * chore: format files * chore: bump pipecat * feat: let mcp fetch archived workflows on demand * chore: fix tests * feat: add sdk documentation * chore: change banner and add badge
This commit is contained in:
parent
0a61ef295f
commit
00a1a22b74
162 changed files with 14355 additions and 3554 deletions
|
|
@@ -487,6 +487,71 @@ class MPSServiceKeyClient:
|
|||
response=response,
|
||||
)
|
||||
|
||||
async def process_document(
    self,
    file_path: str,
    filename: str,
    content_type: str,
    retrieval_mode: str = "chunked",
    max_tokens: int = 128,
    chunk_overlap_tokens: int = 0,
    merge_peers: bool = True,
    tokenizer_model: Optional[str] = None,
    correlation_id: Optional[str] = None,
    organization_id: Optional[int] = None,
    created_by: Optional[str] = None,
) -> dict:
    """Convert + chunk a document via MPS /document/process.

    The file at ``file_path`` is uploaded as a multipart form together with
    the chunking options, which are sent as string form fields.

    Args:
        file_path: Path of the local file to upload.
        filename: File name reported in the multipart ``file`` part.
        content_type: MIME type reported in the multipart ``file`` part.
        retrieval_mode: Sent as the ``retrieval_mode`` form field.
        max_tokens: Sent as the ``max_tokens`` form field (stringified).
        chunk_overlap_tokens: Sent as the ``chunk_overlap_tokens`` form
            field (stringified).
        merge_peers: Sent as the ``merge_peers`` form field, encoded as
            lowercase ``"true"`` / ``"false"``.
        tokenizer_model: Sent as ``tokenizer_model`` unless it is ``None``.
        correlation_id: Sent as ``correlation_id`` only when non-empty.
        organization_id: Forwarded to ``self._get_headers``.
        created_by: Forwarded to ``self._get_headers``.

    Returns:
        The decoded JSON body, a dict matching DocumentProcessResponse in
        MPS:
            {
                "mode": "chunked" | "full_document",
                "docling_metadata": {...},
                "full_text": str | None,  # populated only in full_document mode
                "chunks": [...],          # populated only in chunked mode
            }

    Raises:
        httpx.HTTPStatusError: On any non-2xx response.
        OSError: If ``file_path`` cannot be opened or read.

    Timeout is 300s to match the ALB idle_timeout configured in
    infrastructure/mps/main.tf.
    """
    data = {
        "retrieval_mode": retrieval_mode,
        "max_tokens": str(max_tokens),
        "chunk_overlap_tokens": str(chunk_overlap_tokens),
        "merge_peers": str(merge_peers).lower(),
    }
    if tokenizer_model is not None:
        data["tokenizer_model"] = tokenizer_model
    if correlation_id:
        data["correlation_id"] = correlation_id

    headers = self._get_headers(organization_id, created_by)
    # Remove JSON content-type so httpx sets the correct multipart boundary.
    headers.pop("Content-Type", None)

    # Read the payload before opening the HTTP client so an unreadable file
    # fails fast without allocating a connection pool.
    # NOTE(review): this is a blocking read inside a coroutine; for large
    # documents consider offloading it to a worker thread.
    with open(file_path, "rb") as fh:
        payload = fh.read()
    files = {"file": (filename, payload, content_type)}

    async with httpx.AsyncClient(timeout=httpx.Timeout(300.0)) as client:
        response = await client.post(
            f"{self.base_url}/api/v1/document/process",
            files=files,
            data=data,
            headers=headers,
        )

    # Any 2xx status is a success, matching the documented contract
    # ("raises on non-2xx"); previously only an exact 200 was accepted.
    if 200 <= response.status_code < 300:
        return response.json()

    logger.error(
        f"Failed to process document: {response.status_code} - {response.text}"
    )
    raise httpx.HTTPStatusError(
        f"Failed to process document: {response.text}",
        request=response.request,
        response=response,
    )
|
||||
|
||||
async def call_workflow_api(
|
||||
self,
|
||||
call_type: str,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue