mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-06-03 20:05:13 +02:00
Release/v1.2 (#457)
* Bump setup.py versions for 1.1 * PoC MCP server (#419) * Very initial MCP server PoC for TrustGraph * Put service on port 8000 * Add MCP container and packages to buildout * Update docs for API/CLI changes in 1.0 (#421) * Update some API basics for the 0.23/1.0 API change * Add MCP container push (#425) * Add command args to the MCP server (#426) * Host and port parameters * Added websocket arg * More docs * MCP client support (#427) - MCP client service - Tool request/response schema - API gateway support for mcp-tool - Message translation for tool request & response - Make mcp-tool using configuration service for information about where the MCP services are. * Feature/react call mcp (#428) Key Features - MCP Tool Integration: Added core MCP tool support with ToolClientSpec and ToolClient classes - API Enhancement: New mcp_tool method for flow-specific tool invocation - CLI Tooling: New tg-invoke-mcp-tool command for testing MCP integration - React Agent Enhancement: Fixed and improved multi-tool invocation capabilities - Tool Management: Enhanced CLI for tool configuration and management Changes - Added MCP tool invocation to API with flow-specific integration - Implemented ToolClientSpec and ToolClient for tool call handling - Updated agent-manager-react to invoke MCP tools with configurable types - Enhanced CLI with new commands and improved help text - Added comprehensive documentation for new CLI commands - Improved tool configuration management Testing - Added tg-invoke-mcp-tool CLI command for isolated MCP integration testing - Enhanced agent capability to invoke multiple tools simultaneously * Test suite executed from CI pipeline (#433) * Test strategy & test cases * Unit tests * Integration tests * Extending test coverage (#434) * Contract tests * Testing embeedings * Agent unit tests * Knowledge pipeline tests * Turn on contract tests * Increase storage test coverage (#435) * Fixing storage and adding tests * PR pipeline only runs quick tests * Empty configuration is returned as empty list, previously was not in response (#436) * Update config util to take files as well as command-line text (#437) * Updated CLI invocation and config model for tools and mcp (#438) * Updated CLI invocation and config model for tools and mcp * CLI anomalies * Tweaked the MCP tool implementation for new model * Update agent implementation to match the new model * Fix agent tools, now all tested * Fixed integration tests * Fix MCP delete tool params * Update Python deps to 1.2 * Update to enable knowledge extraction using the agent framework (#439) * Implement KG extraction agent (kg-extract-agent) * Using ReAct framework (agent-manager-react) * ReAct manager had an issue when emitting JSON, which conflicts which ReAct manager's own JSON messages, so refactored ReAct manager to use traditional ReAct messages, non-JSON structure. * Minor refactor to take the prompt template client out of prompt-template so it can be more readily used by other modules. kg-extract-agent uses this framework. * Migrate from setup.py to pyproject.toml (#440) * Converted setup.py to pyproject.toml * Modern package infrastructure as recommended by py docs * Install missing build deps (#441) * Install missing build deps (#442) * Implement logging strategy (#444) * Logging strategy and convert all prints() to logging invocations * Fix/startup failure (#445) * Fix loggin startup problems * Fix logging startup problems (#446) * Fix logging startup problems (#447) * Fixed Mistral OCR to use current API (#448) * Fixed Mistral OCR to use current API * Added PDF decoder tests * Fix Mistral OCR ident to be standard pdf-decoder (#450) * Fix Mistral OCR ident to be standard pdf-decoder * Correct test * Schema structure refactor (#451) * Write schema refactor spec * Implemented schema refactor spec * Structure data mvp (#452) * Structured data tech spec * Architecture principles * New schemas * Updated schemas and specs * Object extractor * Add .coveragerc * New tests * Cassandra object storage * Trying to object extraction working, issues exist * Validate librarian collection (#453) * Fix token chunker, broken API invocation (#454) * Fix token chunker, broken API invocation (#455) * Knowledge load utility CLI (#456) * Knowledge loader * More tests
This commit is contained in:
parent
c85ba197be
commit
89be656990
509 changed files with 49632 additions and 5159 deletions
123
trustgraph-flow/pyproject.toml
Normal file
123
trustgraph-flow/pyproject.toml
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
[build-system]
|
||||
requires = ["setuptools>=61.0", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "trustgraph-flow"
|
||||
dynamic = ["version"]
|
||||
authors = [{name = "trustgraph.ai", email = "security@trustgraph.ai"}]
|
||||
description = "TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline."
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.8"
|
||||
dependencies = [
|
||||
"trustgraph-base>=1.2,<1.3",
|
||||
"aiohttp",
|
||||
"anthropic",
|
||||
"cassandra-driver",
|
||||
"cohere",
|
||||
"cryptography",
|
||||
"falkordb",
|
||||
"fastembed",
|
||||
"google-genai",
|
||||
"ibis",
|
||||
"jsonschema",
|
||||
"langchain",
|
||||
"langchain-community",
|
||||
"langchain-core",
|
||||
"langchain-text-splitters",
|
||||
"mcp",
|
||||
"minio",
|
||||
"mistralai",
|
||||
"neo4j",
|
||||
"ollama",
|
||||
"openai",
|
||||
"pinecone[grpc]",
|
||||
"prometheus-client",
|
||||
"pulsar-client",
|
||||
"pymilvus",
|
||||
"pypdf",
|
||||
"pyyaml",
|
||||
"qdrant-client",
|
||||
"rdflib",
|
||||
"requests",
|
||||
"tabulate",
|
||||
"tiktoken",
|
||||
"urllib3",
|
||||
]
|
||||
classifiers = [
|
||||
"Programming Language :: Python :: 3",
|
||||
"Operating System :: OS Independent",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Homepage = "https://github.com/trustgraph-ai/trustgraph"
|
||||
|
||||
[project.scripts]
|
||||
agent-manager-react = "trustgraph.agent.react:run"
|
||||
api-gateway = "trustgraph.gateway:run"
|
||||
chunker-recursive = "trustgraph.chunking.recursive:run"
|
||||
chunker-token = "trustgraph.chunking.token:run"
|
||||
config-svc = "trustgraph.config.service:run"
|
||||
de-query-milvus = "trustgraph.query.doc_embeddings.milvus:run"
|
||||
de-query-pinecone = "trustgraph.query.doc_embeddings.pinecone:run"
|
||||
de-query-qdrant = "trustgraph.query.doc_embeddings.qdrant:run"
|
||||
de-write-milvus = "trustgraph.storage.doc_embeddings.milvus:run"
|
||||
de-write-pinecone = "trustgraph.storage.doc_embeddings.pinecone:run"
|
||||
de-write-qdrant = "trustgraph.storage.doc_embeddings.qdrant:run"
|
||||
document-embeddings = "trustgraph.embeddings.document_embeddings:run"
|
||||
document-rag = "trustgraph.retrieval.document_rag:run"
|
||||
embeddings-fastembed = "trustgraph.embeddings.fastembed:run"
|
||||
embeddings-ollama = "trustgraph.embeddings.ollama:run"
|
||||
ge-query-milvus = "trustgraph.query.graph_embeddings.milvus:run"
|
||||
ge-query-pinecone = "trustgraph.query.graph_embeddings.pinecone:run"
|
||||
ge-query-qdrant = "trustgraph.query.graph_embeddings.qdrant:run"
|
||||
ge-write-milvus = "trustgraph.storage.graph_embeddings.milvus:run"
|
||||
ge-write-pinecone = "trustgraph.storage.graph_embeddings.pinecone:run"
|
||||
ge-write-qdrant = "trustgraph.storage.graph_embeddings.qdrant:run"
|
||||
graph-embeddings = "trustgraph.embeddings.graph_embeddings:run"
|
||||
graph-rag = "trustgraph.retrieval.graph_rag:run"
|
||||
kg-extract-agent = "trustgraph.extract.kg.agent:run"
|
||||
kg-extract-definitions = "trustgraph.extract.kg.definitions:run"
|
||||
kg-extract-objects = "trustgraph.extract.kg.objects:run"
|
||||
kg-extract-relationships = "trustgraph.extract.kg.relationships:run"
|
||||
kg-extract-topics = "trustgraph.extract.kg.topics:run"
|
||||
kg-manager = "trustgraph.cores:run"
|
||||
kg-store = "trustgraph.storage.knowledge:run"
|
||||
librarian = "trustgraph.librarian:run"
|
||||
mcp-tool = "trustgraph.agent.mcp_tool:run"
|
||||
metering = "trustgraph.metering:run"
|
||||
objects-write-cassandra = "trustgraph.storage.objects.cassandra:run"
|
||||
oe-write-milvus = "trustgraph.storage.object_embeddings.milvus:run"
|
||||
pdf-decoder = "trustgraph.decoding.pdf:run"
|
||||
pdf-ocr-mistral = "trustgraph.decoding.mistral_ocr:run"
|
||||
prompt-template = "trustgraph.prompt.template:run"
|
||||
rev-gateway = "trustgraph.rev_gateway:run"
|
||||
rows-write-cassandra = "trustgraph.storage.rows.cassandra:run"
|
||||
run-processing = "trustgraph.processing:run"
|
||||
text-completion-azure = "trustgraph.model.text_completion.azure:run"
|
||||
text-completion-azure-openai = "trustgraph.model.text_completion.azure_openai:run"
|
||||
text-completion-claude = "trustgraph.model.text_completion.claude:run"
|
||||
text-completion-cohere = "trustgraph.model.text_completion.cohere:run"
|
||||
text-completion-googleaistudio = "trustgraph.model.text_completion.googleaistudio:run"
|
||||
text-completion-llamafile = "trustgraph.model.text_completion.llamafile:run"
|
||||
text-completion-lmstudio = "trustgraph.model.text_completion.lmstudio:run"
|
||||
text-completion-mistral = "trustgraph.model.text_completion.mistral:run"
|
||||
text-completion-ollama = "trustgraph.model.text_completion.ollama:run"
|
||||
text-completion-openai = "trustgraph.model.text_completion.openai:run"
|
||||
text-completion-tgi = "trustgraph.model.text_completion.tgi:run"
|
||||
text-completion-vllm = "trustgraph.model.text_completion.vllm:run"
|
||||
triples-query-cassandra = "trustgraph.query.triples.cassandra:run"
|
||||
triples-query-falkordb = "trustgraph.query.triples.falkordb:run"
|
||||
triples-query-memgraph = "trustgraph.query.triples.memgraph:run"
|
||||
triples-query-neo4j = "trustgraph.query.triples.neo4j:run"
|
||||
triples-write-cassandra = "trustgraph.storage.triples.cassandra:run"
|
||||
triples-write-falkordb = "trustgraph.storage.triples.falkordb:run"
|
||||
triples-write-memgraph = "trustgraph.storage.triples.memgraph:run"
|
||||
triples-write-neo4j = "trustgraph.storage.triples.neo4j:run"
|
||||
wikipedia-lookup = "trustgraph.external.wikipedia:run"
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
include = ["trustgraph*"]
|
||||
|
||||
[tool.setuptools.dynamic]
|
||||
version = {attr = "trustgraph.flow_version.__version__"}
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.agent.react import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.gateway import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.chunking.recursive import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.chunking.token import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.config.service import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.query.doc_embeddings.milvus import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.query.doc_embeddings.pinecone import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.query.doc_embeddings.qdrant import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.storage.doc_embeddings.milvus import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.storage.doc_embeddings.pinecone import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.storage.doc_embeddings.qdrant import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.embeddings.document_embeddings import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.retrieval.document_rag import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.embeddings.fastembed import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.embeddings.ollama import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.query.graph_embeddings.milvus import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.query.graph_embeddings.pinecone import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.query.graph_embeddings.qdrant import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.storage.graph_embeddings.milvus import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.storage.graph_embeddings.pinecone import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.storage.graph_embeddings.qdrant import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.embeddings.graph_embeddings import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.retrieval.graph_rag import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.extract.kg.definitions import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.extract.kg.relationships import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.extract.kg.topics import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.cores import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.storage.knowledge import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.librarian import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,5 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.metering import run
|
||||
|
||||
run()
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.extract.object.row import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.storage.object_embeddings.milvus import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.decoding.pdf import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.decoding.mistral_ocr import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.model.prompt.generic import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.model.prompt.template import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.rev_gateway import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.storage.rows.cassandra import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.processing import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.model.text_completion.azure import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.model.text_completion.azure_openai import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.model.text_completion.claude import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.model.text_completion.cohere import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.model.text_completion.googleaistudio import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.model.text_completion.llamafile import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.model.text_completion.lmstudio import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.model.text_completion.mistral import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.model.text_completion.ollama import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.model.text_completion.openai import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.model.text_completion.tgi import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.model.text_completion.vllm import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.query.triples.cassandra import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.query.triples.falkordb import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.query.triples.memgraph import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.query.triples.neo4j import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.storage.triples.cassandra import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.storage.triples.falkordb import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.storage.triples.memgraph import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.storage.triples.neo4j import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from trustgraph.external.wikipedia import run
|
||||
|
||||
run()
|
||||
|
||||
|
|
@ -1,133 +0,0 @@
|
|||
import setuptools
|
||||
import os
|
||||
import importlib
|
||||
|
||||
with open("README.md", "r") as fh:
|
||||
long_description = fh.read()
|
||||
|
||||
# Load a version number module
|
||||
spec = importlib.util.spec_from_file_location(
|
||||
'version', 'trustgraph/flow_version.py'
|
||||
)
|
||||
version_module = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(version_module)
|
||||
|
||||
version = version_module.__version__
|
||||
|
||||
setuptools.setup(
|
||||
name="trustgraph-flow",
|
||||
version=version,
|
||||
author="trustgraph.ai",
|
||||
author_email="security@trustgraph.ai",
|
||||
description="TrustGraph provides a means to run a pipeline of flexible AI processing components in a flexible means to achieve a processing pipeline.",
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
url="https://github.com/trustgraph-ai/trustgraph",
|
||||
packages=setuptools.find_namespace_packages(
|
||||
where='./',
|
||||
),
|
||||
classifiers=[
|
||||
"Programming Language :: Python :: 3",
|
||||
"License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
|
||||
"Operating System :: OS Independent",
|
||||
],
|
||||
python_requires='>=3.8',
|
||||
download_url = "https://github.com/trustgraph-ai/trustgraph/archive/refs/tags/v" + version + ".tar.gz",
|
||||
install_requires=[
|
||||
"trustgraph-base>=1.0,<1.1",
|
||||
"aiohttp",
|
||||
"anthropic",
|
||||
"cassandra-driver",
|
||||
"cohere",
|
||||
"cryptography",
|
||||
"falkordb",
|
||||
"fastembed",
|
||||
"google-genai",
|
||||
"ibis",
|
||||
"jsonschema",
|
||||
"langchain",
|
||||
"langchain-community",
|
||||
"langchain-core",
|
||||
"langchain-text-splitters",
|
||||
"minio",
|
||||
"mistralai",
|
||||
"neo4j",
|
||||
"ollama",
|
||||
"openai",
|
||||
"pinecone[grpc]",
|
||||
"prometheus-client",
|
||||
"pulsar-client",
|
||||
"pymilvus",
|
||||
"pypdf",
|
||||
"mistralai",
|
||||
"pyyaml",
|
||||
"qdrant-client",
|
||||
"rdflib",
|
||||
"requests",
|
||||
"tabulate",
|
||||
"tiktoken",
|
||||
"urllib3",
|
||||
],
|
||||
scripts=[
|
||||
"scripts/agent-manager-react",
|
||||
"scripts/api-gateway",
|
||||
"scripts/rev-gateway",
|
||||
"scripts/chunker-recursive",
|
||||
"scripts/chunker-token",
|
||||
"scripts/config-svc",
|
||||
"scripts/de-query-milvus",
|
||||
"scripts/de-query-pinecone",
|
||||
"scripts/de-query-qdrant",
|
||||
"scripts/de-write-milvus",
|
||||
"scripts/de-write-pinecone",
|
||||
"scripts/de-write-qdrant",
|
||||
"scripts/document-embeddings",
|
||||
"scripts/document-rag",
|
||||
"scripts/embeddings-fastembed",
|
||||
"scripts/embeddings-ollama",
|
||||
"scripts/ge-query-milvus",
|
||||
"scripts/ge-query-pinecone",
|
||||
"scripts/ge-query-qdrant",
|
||||
"scripts/ge-write-milvus",
|
||||
"scripts/ge-write-pinecone",
|
||||
"scripts/ge-write-qdrant",
|
||||
"scripts/graph-embeddings",
|
||||
"scripts/graph-rag",
|
||||
"scripts/kg-extract-definitions",
|
||||
"scripts/kg-extract-relationships",
|
||||
"scripts/kg-extract-topics",
|
||||
"scripts/kg-store",
|
||||
"scripts/kg-manager",
|
||||
"scripts/librarian",
|
||||
"scripts/metering",
|
||||
"scripts/object-extract-row",
|
||||
"scripts/oe-write-milvus",
|
||||
"scripts/pdf-decoder",
|
||||
"scripts/pdf-ocr-mistral",
|
||||
"scripts/prompt-generic",
|
||||
"scripts/prompt-template",
|
||||
"scripts/rows-write-cassandra",
|
||||
"scripts/run-processing",
|
||||
"scripts/text-completion-azure",
|
||||
"scripts/text-completion-azure-openai",
|
||||
"scripts/text-completion-claude",
|
||||
"scripts/text-completion-cohere",
|
||||
"scripts/text-completion-googleaistudio",
|
||||
"scripts/text-completion-llamafile",
|
||||
"scripts/text-completion-lmstudio",
|
||||
"scripts/text-completion-mistral",
|
||||
"scripts/text-completion-ollama",
|
||||
"scripts/text-completion-openai",
|
||||
"scripts/text-completion-tgi",
|
||||
"scripts/text-completion-vllm",
|
||||
"scripts/triples-query-cassandra",
|
||||
"scripts/triples-query-falkordb",
|
||||
"scripts/triples-query-memgraph",
|
||||
"scripts/triples-query-neo4j",
|
||||
"scripts/triples-write-cassandra",
|
||||
"scripts/triples-write-falkordb",
|
||||
"scripts/triples-write-memgraph",
|
||||
"scripts/triples-write-neo4j",
|
||||
"scripts/wikipedia-lookup",
|
||||
]
|
||||
)
|
||||
0
trustgraph-flow/trustgraph/model/prompt/generic/__main__.py → trustgraph-flow/trustgraph/agent/mcp_tool/__main__.py
Executable file → Normal file
0
trustgraph-flow/trustgraph/model/prompt/generic/__main__.py → trustgraph-flow/trustgraph/agent/mcp_tool/__main__.py
Executable file → Normal file
109
trustgraph-flow/trustgraph/agent/mcp_tool/service.py
Executable file
109
trustgraph-flow/trustgraph/agent/mcp_tool/service.py
Executable file
|
|
@ -0,0 +1,109 @@
|
|||
|
||||
"""
|
||||
MCP tool-calling service, calls an external MCP tool. Input is
|
||||
name + parameters, output is the response, either a string or an object.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from mcp.client.streamable_http import streamablehttp_client
|
||||
from mcp import ClientSession
|
||||
|
||||
from ... base import ToolService
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
default_ident = "mcp-tool"
|
||||
|
||||
class Service(ToolService):
|
||||
|
||||
def __init__(self, **params):
|
||||
|
||||
super(Service, self).__init__(
|
||||
**params
|
||||
)
|
||||
|
||||
self.register_config_handler(self.on_mcp_config)
|
||||
|
||||
self.mcp_services = {}
|
||||
|
||||
async def on_mcp_config(self, config, version):
|
||||
|
||||
logger.info(f"Got config version {version}")
|
||||
|
||||
if "mcp" not in config: return
|
||||
|
||||
self.mcp_services = {
|
||||
k: json.loads(v)
|
||||
for k, v in config["mcp"].items()
|
||||
}
|
||||
|
||||
async def invoke_tool(self, name, parameters):
|
||||
|
||||
try:
|
||||
|
||||
if name not in self.mcp_services:
|
||||
raise RuntimeError(f"MCP service {name} not known")
|
||||
|
||||
if "url" not in self.mcp_services[name]:
|
||||
raise RuntimeError(f"MCP service {name} URL not defined")
|
||||
|
||||
url = self.mcp_services[name]["url"]
|
||||
|
||||
if "remote-name" in self.mcp_services[name]:
|
||||
remote_name = self.mcp_services[name]["remote-name"]
|
||||
else:
|
||||
remote_name = name
|
||||
|
||||
logger.info(f"Invoking {remote_name} at {url}")
|
||||
|
||||
# Connect to a streamable HTTP server
|
||||
async with streamablehttp_client(url) as (
|
||||
read_stream,
|
||||
write_stream,
|
||||
_,
|
||||
):
|
||||
|
||||
# Create a session using the client streams
|
||||
async with ClientSession(read_stream, write_stream) as session:
|
||||
|
||||
# Initialize the connection
|
||||
await session.initialize()
|
||||
|
||||
# Call a tool
|
||||
result = await session.call_tool(
|
||||
remote_name,
|
||||
parameters
|
||||
)
|
||||
|
||||
if result.structuredContent:
|
||||
return result.structuredContent
|
||||
elif hasattr(result, "content"):
|
||||
return "".join([
|
||||
x.text
|
||||
for x in result.content
|
||||
])
|
||||
else:
|
||||
return "No content"
|
||||
|
||||
except BaseExceptionGroup as e:
|
||||
|
||||
for child in e.exceptions:
|
||||
logger.debug(f"Child: {child}")
|
||||
|
||||
raise e.exceptions[0]
|
||||
|
||||
except Exception as e:
|
||||
|
||||
logger.error(f"Error invoking MCP tool: {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
ToolService.add_args(parser)
|
||||
|
||||
def run():
|
||||
Service.launch(default_ident, __doc__)
|
||||
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
|
||||
import logging
|
||||
import json
|
||||
import re
|
||||
|
||||
from . types import Action, Final
|
||||
|
||||
|
|
@ -12,14 +13,170 @@ class AgentManager:
|
|||
self.tools = tools
|
||||
self.additional_context = additional_context
|
||||
|
||||
def parse_react_response(self, text):
|
||||
"""Parse text-based ReAct response format.
|
||||
|
||||
Expected format:
|
||||
Thought: [reasoning about what to do next]
|
||||
Action: [tool_name]
|
||||
Args: {
|
||||
"param": "value"
|
||||
}
|
||||
|
||||
OR
|
||||
|
||||
Thought: [reasoning about the final answer]
|
||||
Final Answer: [the answer]
|
||||
"""
|
||||
if not isinstance(text, str):
|
||||
raise ValueError(f"Expected string response, got {type(text)}")
|
||||
|
||||
# Remove any markdown code blocks that might wrap the response
|
||||
text = re.sub(r'^```[^\n]*\n', '', text.strip())
|
||||
text = re.sub(r'\n```$', '', text.strip())
|
||||
|
||||
lines = text.strip().split('\n')
|
||||
|
||||
thought = None
|
||||
action = None
|
||||
args = None
|
||||
final_answer = None
|
||||
|
||||
i = 0
|
||||
while i < len(lines):
|
||||
line = lines[i].strip()
|
||||
|
||||
# Parse Thought
|
||||
if line.startswith("Thought:"):
|
||||
thought = line[8:].strip()
|
||||
# Handle multi-line thoughts
|
||||
i += 1
|
||||
while i < len(lines):
|
||||
next_line = lines[i].strip()
|
||||
if next_line.startswith(("Action:", "Final Answer:", "Args:")):
|
||||
break
|
||||
thought += " " + next_line
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Parse Final Answer
|
||||
if line.startswith("Final Answer:"):
|
||||
final_answer = line[13:].strip()
|
||||
# Handle multi-line final answers (including JSON)
|
||||
i += 1
|
||||
|
||||
# Check if the answer might be JSON
|
||||
if final_answer.startswith('{') or (i < len(lines) and lines[i].strip().startswith('{')):
|
||||
# Collect potential JSON answer
|
||||
json_text = final_answer if final_answer.startswith('{') else ""
|
||||
brace_count = json_text.count('{') - json_text.count('}')
|
||||
|
||||
while i < len(lines) and (brace_count > 0 or not json_text):
|
||||
current_line = lines[i].strip()
|
||||
if current_line.startswith(("Thought:", "Action:")) and brace_count == 0:
|
||||
break
|
||||
json_text += ("\n" if json_text else "") + current_line
|
||||
brace_count += current_line.count('{') - current_line.count('}')
|
||||
i += 1
|
||||
|
||||
# Try to parse as JSON
|
||||
# try:
|
||||
# final_answer = json.loads(json_text)
|
||||
# except json.JSONDecodeError:
|
||||
# # Not valid JSON, treat as regular text
|
||||
# final_answer = json_text
|
||||
final_answer = json_text
|
||||
else:
|
||||
# Regular text answer
|
||||
while i < len(lines):
|
||||
next_line = lines[i].strip()
|
||||
if next_line.startswith(("Thought:", "Action:")):
|
||||
break
|
||||
final_answer += " " + next_line
|
||||
i += 1
|
||||
|
||||
# If we have a final answer, return Final object
|
||||
return Final(
|
||||
thought=thought or "",
|
||||
final=final_answer
|
||||
)
|
||||
|
||||
# Parse Action
|
||||
if line.startswith("Action:"):
|
||||
action = line[7:].strip()
|
||||
|
||||
# Parse Args
|
||||
if line.startswith("Args:"):
|
||||
# Check if JSON starts on the same line
|
||||
args_on_same_line = line[5:].strip()
|
||||
if args_on_same_line:
|
||||
args_text = args_on_same_line
|
||||
brace_count = args_on_same_line.count('{') - args_on_same_line.count('}')
|
||||
else:
|
||||
args_text = ""
|
||||
brace_count = 0
|
||||
|
||||
# Collect all lines that form the JSON arguments
|
||||
i += 1
|
||||
started = bool(args_on_same_line and '{' in args_on_same_line)
|
||||
|
||||
while i < len(lines) and (not started or brace_count > 0):
|
||||
current_line = lines[i]
|
||||
args_text += ("\n" if args_text else "") + current_line
|
||||
|
||||
# Count braces to determine when JSON is complete
|
||||
for char in current_line:
|
||||
if char == '{':
|
||||
brace_count += 1
|
||||
started = True
|
||||
elif char == '}':
|
||||
brace_count -= 1
|
||||
|
||||
# If we've started and braces are balanced, we're done
|
||||
if started and brace_count == 0:
|
||||
break
|
||||
|
||||
i += 1
|
||||
|
||||
# Parse the JSON arguments
|
||||
try:
|
||||
args = json.loads(args_text.strip())
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(f"Failed to parse JSON arguments: {args_text}")
|
||||
raise ValueError(f"Invalid JSON in Args: {e}")
|
||||
|
||||
i += 1
|
||||
|
||||
# If we have an action, return Action object
|
||||
if action:
|
||||
return Action(
|
||||
thought=thought or "",
|
||||
name=action,
|
||||
arguments=args or {},
|
||||
observation=""
|
||||
)
|
||||
|
||||
# If we only have a thought but no action or final answer
|
||||
if thought and not action and not final_answer:
|
||||
raise ValueError(f"Response has thought but no action or final answer: {text}")
|
||||
|
||||
raise ValueError(f"Could not parse response: {text}")
|
||||
|
||||
async def reason(self, question, history, context):
|
||||
|
||||
logger.debug(f"calling reason: {question}")
|
||||
|
||||
tools = self.tools
|
||||
|
||||
logger.debug("in reason")
|
||||
logger.debug(f"tools: {tools}")
|
||||
|
||||
tool_names = ",".join([
|
||||
t for t in self.tools.keys()
|
||||
])
|
||||
|
||||
logger.debug(f"Tool names: {tool_names}")
|
||||
|
||||
variables = {
|
||||
"question": question,
|
||||
"tools": [
|
||||
|
|
@ -32,7 +189,7 @@ class AgentManager:
|
|||
"type": arg.type,
|
||||
"description": arg.description
|
||||
}
|
||||
for arg in tool.arguments.values()
|
||||
for arg in tool.arguments
|
||||
]
|
||||
}
|
||||
for tool in self.tools.values()
|
||||
|
|
@ -51,38 +208,32 @@ class AgentManager:
|
|||
]
|
||||
}
|
||||
|
||||
print(json.dumps(variables, indent=4), flush=True)
|
||||
logger.debug(f"Variables: {json.dumps(variables, indent=4)}")
|
||||
|
||||
logger.info(f"prompt: {variables}")
|
||||
|
||||
obj = await context("prompt-request").agent_react(variables)
|
||||
# Get text response from prompt service
|
||||
response_text = await context("prompt-request").agent_react(variables)
|
||||
|
||||
print(json.dumps(obj, indent=4), flush=True)
|
||||
logger.debug(f"Response text:\n{response_text}")
|
||||
|
||||
logger.info(f"response: {obj}")
|
||||
logger.info(f"response: {response_text}")
|
||||
|
||||
if obj.get("final-answer"):
|
||||
|
||||
a = Final(
|
||||
thought = obj.get("thought"),
|
||||
final = obj.get("final-answer"),
|
||||
)
|
||||
|
||||
return a
|
||||
|
||||
else:
|
||||
|
||||
a = Action(
|
||||
thought = obj.get("thought"),
|
||||
name = obj.get("action"),
|
||||
arguments = obj.get("arguments"),
|
||||
observation = ""
|
||||
)
|
||||
|
||||
return a
|
||||
# Parse the text response
|
||||
try:
|
||||
result = self.parse_react_response(response_text)
|
||||
logger.info(f"Parsed result: {result}")
|
||||
return result
|
||||
except ValueError as e:
|
||||
logger.error(f"Failed to parse response: {e}")
|
||||
# Try to provide a helpful error message
|
||||
logger.error(f"Response was: {response_text}")
|
||||
raise RuntimeError(f"Failed to parse agent response: {e}")
|
||||
|
||||
async def react(self, question, history, think, observe, context):
|
||||
|
||||
logger.info(f"question: {question}")
|
||||
|
||||
act = await self.reason(
|
||||
question = question,
|
||||
history = history,
|
||||
|
|
@ -104,14 +255,17 @@ class AgentManager:
|
|||
else:
|
||||
raise RuntimeError(f"No action for {act.name}!")
|
||||
|
||||
print("TOOL>>>", act)
|
||||
logger.debug(f"TOOL>>> {act}")
|
||||
|
||||
resp = await action.implementation(context).invoke(
|
||||
**act.arguments
|
||||
)
|
||||
|
||||
print("RSETUL", resp)
|
||||
|
||||
resp = resp.strip()
|
||||
if isinstance(resp, str):
|
||||
resp = resp.strip()
|
||||
else:
|
||||
resp = str(resp)
|
||||
resp = resp.strip()
|
||||
|
||||
logger.info(f"resp: {resp}")
|
||||
|
||||
|
|
|
|||
|
|
@ -5,13 +5,18 @@ Simple agent infrastructure broadly implements the ReAct flow.
|
|||
import json
|
||||
import re
|
||||
import sys
|
||||
import functools
|
||||
import logging
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from ... base import AgentService, TextCompletionClientSpec, PromptClientSpec
|
||||
from ... base import GraphRagClientSpec
|
||||
from ... base import GraphRagClientSpec, ToolClientSpec
|
||||
|
||||
from ... schema import AgentRequest, AgentResponse, AgentStep, Error
|
||||
|
||||
from . tools import KnowledgeQueryImpl, TextCompletionImpl
|
||||
from . tools import KnowledgeQueryImpl, TextCompletionImpl, McpToolImpl, PromptImpl
|
||||
from . agent_manager import AgentManager
|
||||
|
||||
from . types import Final, Action, Tool, Argument
|
||||
|
|
@ -67,69 +72,92 @@ class Processor(AgentService):
|
|||
)
|
||||
)
|
||||
|
||||
self.register_specification(
|
||||
ToolClientSpec(
|
||||
request_name = "mcp-tool-request",
|
||||
response_name = "mcp-tool-response",
|
||||
)
|
||||
)
|
||||
|
||||
async def on_tools_config(self, config, version):
|
||||
|
||||
print("Loading configuration version", version)
|
||||
|
||||
if self.config_key not in config:
|
||||
print(f"No key {self.config_key} in config", flush=True)
|
||||
return
|
||||
|
||||
config = config[self.config_key]
|
||||
logger.info(f"Loading configuration version {version}")
|
||||
|
||||
try:
|
||||
|
||||
# This is some extra stuff to put in the prompt
|
||||
additional = config.get("additional-context", None)
|
||||
|
||||
ix = json.loads(config["tool-index"])
|
||||
|
||||
tools = {}
|
||||
|
||||
for k in ix:
|
||||
|
||||
pc = config[f"tool.{k}"]
|
||||
data = json.loads(pc)
|
||||
|
||||
arguments = {
|
||||
v.get("name"): Argument(
|
||||
name = v.get("name"),
|
||||
type = v.get("type"),
|
||||
description = v.get("description")
|
||||
# Load tool configurations from the new location
|
||||
if "tool" in config:
|
||||
for tool_id, tool_value in config["tool"].items():
|
||||
data = json.loads(tool_value)
|
||||
|
||||
impl_id = data.get("type")
|
||||
name = data.get("name")
|
||||
|
||||
# Create the appropriate implementation
|
||||
if impl_id == "knowledge-query":
|
||||
impl = functools.partial(
|
||||
KnowledgeQueryImpl,
|
||||
collection=data.get("collection")
|
||||
)
|
||||
arguments = KnowledgeQueryImpl.get_arguments()
|
||||
elif impl_id == "text-completion":
|
||||
impl = TextCompletionImpl
|
||||
arguments = TextCompletionImpl.get_arguments()
|
||||
elif impl_id == "mcp-tool":
|
||||
impl = functools.partial(
|
||||
McpToolImpl,
|
||||
mcp_tool_id=data.get("mcp-tool")
|
||||
)
|
||||
arguments = McpToolImpl.get_arguments()
|
||||
elif impl_id == "prompt":
|
||||
# For prompt tools, arguments come from config
|
||||
config_args = data.get("arguments", [])
|
||||
arguments = [
|
||||
Argument(
|
||||
name=arg.get("name"),
|
||||
type=arg.get("type"),
|
||||
description=arg.get("description")
|
||||
)
|
||||
for arg in config_args
|
||||
]
|
||||
impl = functools.partial(
|
||||
PromptImpl,
|
||||
template_id=data.get("template"),
|
||||
arguments=arguments
|
||||
)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Tool type {impl_id} not known"
|
||||
)
|
||||
|
||||
tools[name] = Tool(
|
||||
name=name,
|
||||
description=data.get("description"),
|
||||
implementation=impl,
|
||||
config=data, # Store full config for reference
|
||||
arguments=arguments,
|
||||
)
|
||||
for v in data["arguments"]
|
||||
}
|
||||
|
||||
impl_id = data.get("type")
|
||||
|
||||
if impl_id == "knowledge-query":
|
||||
impl = KnowledgeQueryImpl
|
||||
elif impl_id == "text-completion":
|
||||
impl = TextCompletionImpl
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Tool-kind {impl_id} not known"
|
||||
)
|
||||
|
||||
tools[data.get("name")] = Tool(
|
||||
name = data.get("name"),
|
||||
description = data.get("description"),
|
||||
implementation = impl,
|
||||
config=data.get("config", {}),
|
||||
arguments = arguments,
|
||||
)
|
||||
|
||||
|
||||
# Load additional context from agent config if it exists
|
||||
additional = None
|
||||
if self.config_key in config:
|
||||
agent_config = config[self.config_key]
|
||||
additional = agent_config.get("additional-context", None)
|
||||
|
||||
self.agent = AgentManager(
|
||||
tools=tools,
|
||||
additional_context=additional
|
||||
)
|
||||
|
||||
print("Prompt configuration reloaded.", flush=True)
|
||||
logger.info(f"Loaded {len(tools)} tools")
|
||||
logger.info("Tool configuration reloaded.")
|
||||
|
||||
except Exception as e:
|
||||
|
||||
print("on_tools_config Exception:", e, flush=True)
|
||||
print("Configuration reload failed", flush=True)
|
||||
logger.error(f"on_tools_config Exception: {e}", exc_info=True)
|
||||
logger.error("Configuration reload failed")
|
||||
|
||||
async def agent_request(self, request, respond, next, flow):
|
||||
|
||||
|
|
@ -148,16 +176,16 @@ class Processor(AgentService):
|
|||
else:
|
||||
history = []
|
||||
|
||||
print(f"Question: {request.question}", flush=True)
|
||||
logger.info(f"Question: {request.question}")
|
||||
|
||||
if len(history) >= self.max_iterations:
|
||||
raise RuntimeError("Too many agent iterations")
|
||||
|
||||
print(f"History: {history}", flush=True)
|
||||
logger.debug(f"History: {history}")
|
||||
|
||||
async def think(x):
|
||||
|
||||
print(f"Think: {x}", flush=True)
|
||||
logger.debug(f"Think: {x}")
|
||||
|
||||
r = AgentResponse(
|
||||
answer=None,
|
||||
|
|
@ -170,7 +198,7 @@ class Processor(AgentService):
|
|||
|
||||
async def observe(x):
|
||||
|
||||
print(f"Observe: {x}", flush=True)
|
||||
logger.debug(f"Observe: {x}")
|
||||
|
||||
r = AgentResponse(
|
||||
answer=None,
|
||||
|
|
@ -181,6 +209,8 @@ class Processor(AgentService):
|
|||
|
||||
await respond(r)
|
||||
|
||||
logger.debug("Call React")
|
||||
|
||||
act = await self.agent.react(
|
||||
question = request.question,
|
||||
history = history,
|
||||
|
|
@ -189,11 +219,16 @@ class Processor(AgentService):
|
|||
context = flow,
|
||||
)
|
||||
|
||||
print(f"Action: {act}", flush=True)
|
||||
logger.debug(f"Action: {act}")
|
||||
|
||||
if isinstance(act, Final):
|
||||
|
||||
print("Send final response...", flush=True)
|
||||
logger.debug("Send final response...")
|
||||
|
||||
if isinstance(act.final, str):
|
||||
f = act.final
|
||||
else:
|
||||
f = json.dumps(act.final)
|
||||
|
||||
r = AgentResponse(
|
||||
answer=act.final,
|
||||
|
|
@ -203,11 +238,11 @@ class Processor(AgentService):
|
|||
|
||||
await respond(r)
|
||||
|
||||
print("Done.", flush=True)
|
||||
logger.debug("Done.")
|
||||
|
||||
return
|
||||
|
||||
print("Send next...", flush=True)
|
||||
logger.debug("Send next...")
|
||||
|
||||
history.append(act)
|
||||
|
||||
|
|
@ -228,15 +263,15 @@ class Processor(AgentService):
|
|||
|
||||
await next(r)
|
||||
|
||||
print("Done.", flush=True)
|
||||
logger.debug("React agent processing complete")
|
||||
|
||||
return
|
||||
|
||||
except Exception as e:
|
||||
|
||||
print(f"agent_request Exception: {e}")
|
||||
logger.error(f"agent_request Exception: {e}", exc_info=True)
|
||||
|
||||
print("Send error response...", flush=True)
|
||||
logger.debug("Send error response...")
|
||||
|
||||
r = AgentResponse(
|
||||
error=Error(
|
||||
|
|
@ -266,6 +301,5 @@ class Processor(AgentService):
|
|||
)
|
||||
|
||||
def run():
|
||||
|
||||
Processor.launch(default_ident, __doc__)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,12 +1,31 @@
|
|||
|
||||
import json
|
||||
import logging
|
||||
from .types import Argument
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# This tool implementation knows how to put a question to the graph RAG
|
||||
# service
|
||||
class KnowledgeQueryImpl:
|
||||
def __init__(self, context):
|
||||
def __init__(self, context, collection=None):
|
||||
self.context = context
|
||||
self.collection = collection
|
||||
|
||||
@staticmethod
|
||||
def get_arguments():
|
||||
return [
|
||||
Argument(
|
||||
name="question",
|
||||
type="string",
|
||||
description="The question to ask the knowledge base"
|
||||
)
|
||||
]
|
||||
|
||||
async def invoke(self, **arguments):
|
||||
client = self.context("graph-rag-request")
|
||||
print("Graph RAG question...", flush=True)
|
||||
logger.debug("Graph RAG question...")
|
||||
return await client.rag(
|
||||
arguments.get("question")
|
||||
)
|
||||
|
|
@ -16,10 +35,71 @@ class KnowledgeQueryImpl:
|
|||
class TextCompletionImpl:
|
||||
def __init__(self, context):
|
||||
self.context = context
|
||||
|
||||
@staticmethod
|
||||
def get_arguments():
|
||||
return [
|
||||
Argument(
|
||||
name="question",
|
||||
type="string",
|
||||
description="The text prompt or question for completion"
|
||||
)
|
||||
]
|
||||
|
||||
async def invoke(self, **arguments):
|
||||
client = self.context("prompt-request")
|
||||
print("Prompt question...", flush=True)
|
||||
logger.debug("Prompt question...")
|
||||
return await client.question(
|
||||
arguments.get("question")
|
||||
)
|
||||
|
||||
# This tool implementation knows how to do MCP tool invocation. This uses
|
||||
# the mcp-tool service.
|
||||
class McpToolImpl:
|
||||
|
||||
def __init__(self, context, mcp_tool_id):
|
||||
self.context = context
|
||||
self.mcp_tool_id = mcp_tool_id
|
||||
|
||||
@staticmethod
|
||||
def get_arguments():
|
||||
# MCP tools define their own arguments dynamically
|
||||
# For now, we return empty list and let the MCP service handle validation
|
||||
return []
|
||||
|
||||
async def invoke(self, **arguments):
|
||||
|
||||
client = self.context("mcp-tool-request")
|
||||
|
||||
logger.debug(f"MCP tool invocation: {self.mcp_tool_id}...")
|
||||
output = await client.invoke(
|
||||
name = self.mcp_tool_id,
|
||||
parameters = arguments, # Pass the actual arguments
|
||||
)
|
||||
|
||||
logger.debug(f"MCP tool output: {output}")
|
||||
|
||||
if isinstance(output, str):
|
||||
return output
|
||||
else:
|
||||
return json.dumps(output)
|
||||
|
||||
|
||||
# This tool implementation knows how to execute prompt templates
|
||||
class PromptImpl:
|
||||
def __init__(self, context, template_id, arguments=None):
|
||||
self.context = context
|
||||
self.template_id = template_id
|
||||
self.arguments = arguments or [] # These come from config
|
||||
|
||||
def get_arguments(self):
|
||||
# For prompt tools, arguments are defined in configuration
|
||||
return self.arguments
|
||||
|
||||
async def invoke(self, **arguments):
|
||||
client = self.context("prompt-request")
|
||||
logger.debug(f"Prompt template invocation: {self.template_id}...")
|
||||
return await client.prompt(
|
||||
id=self.template_id,
|
||||
variables=arguments
|
||||
)
|
||||
|
|
|
|||
|
|
@ -4,12 +4,16 @@ Simple decoder, accepts text documents on input, outputs chunks from the
|
|||
as text as separate output objects.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
from prometheus_client import Histogram
|
||||
|
||||
from ... schema import TextDocument, Chunk
|
||||
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
default_ident = "chunker"
|
||||
|
||||
class Processor(FlowProcessor):
|
||||
|
|
@ -54,12 +58,12 @@ class Processor(FlowProcessor):
|
|||
)
|
||||
)
|
||||
|
||||
print("Chunker initialised", flush=True)
|
||||
logger.info("Recursive chunker initialized")
|
||||
|
||||
async def on_message(self, msg, consumer, flow):
|
||||
|
||||
v = msg.value()
|
||||
print(f"Chunking {v.metadata.id}...", flush=True)
|
||||
logger.info(f"Chunking document {v.metadata.id}...")
|
||||
|
||||
texts = self.text_splitter.create_documents(
|
||||
[v.text.decode("utf-8")]
|
||||
|
|
@ -67,7 +71,7 @@ class Processor(FlowProcessor):
|
|||
|
||||
for ix, chunk in enumerate(texts):
|
||||
|
||||
print("Chunk", len(chunk.page_content), flush=True)
|
||||
logger.debug(f"Created chunk of size {len(chunk.page_content)}")
|
||||
|
||||
r = Chunk(
|
||||
metadata=v.metadata,
|
||||
|
|
@ -80,7 +84,7 @@ class Processor(FlowProcessor):
|
|||
|
||||
await flow("output").send(r)
|
||||
|
||||
print("Done.", flush=True)
|
||||
logger.debug("Document chunking complete")
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
|
|
|||
|
|
@ -4,11 +4,15 @@ Simple decoder, accepts text documents on input, outputs chunks from the
|
|||
as text as separate output objects.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from langchain_text_splitters import TokenTextSplitter
|
||||
from prometheus_client import Histogram
|
||||
|
||||
from ... schema import TextDocument, Chunk
|
||||
from ... base import FlowProcessor
|
||||
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
default_ident = "chunker"
|
||||
|
||||
|
|
@ -16,7 +20,7 @@ class Processor(FlowProcessor):
|
|||
|
||||
def __init__(self, **params):
|
||||
|
||||
id = params.get("id")
|
||||
id = params.get("id", default_ident)
|
||||
chunk_size = params.get("chunk_size", 250)
|
||||
chunk_overlap = params.get("chunk_overlap", 15)
|
||||
|
||||
|
|
@ -53,12 +57,12 @@ class Processor(FlowProcessor):
|
|||
)
|
||||
)
|
||||
|
||||
print("Chunker initialised", flush=True)
|
||||
logger.info("Token chunker initialized")
|
||||
|
||||
async def on_message(self, msg, consumer, flow):
|
||||
|
||||
v = msg.value()
|
||||
print(f"Chunking {v.metadata.id}...", flush=True)
|
||||
logger.info(f"Chunking document {v.metadata.id}...")
|
||||
|
||||
texts = self.text_splitter.create_documents(
|
||||
[v.text.decode("utf-8")]
|
||||
|
|
@ -66,7 +70,7 @@ class Processor(FlowProcessor):
|
|||
|
||||
for ix, chunk in enumerate(texts):
|
||||
|
||||
print("Chunk", len(chunk.page_content), flush=True)
|
||||
logger.debug(f"Created chunk of size {len(chunk.page_content)}")
|
||||
|
||||
r = Chunk(
|
||||
metadata=v.metadata,
|
||||
|
|
@ -79,7 +83,7 @@ class Processor(FlowProcessor):
|
|||
|
||||
await flow("output").send(r)
|
||||
|
||||
print("Done.", flush=True)
|
||||
logger.debug("Document chunking complete")
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
|
|
|||
|
|
@ -1,9 +1,14 @@
|
|||
|
||||
import logging
|
||||
|
||||
from trustgraph.schema import ConfigResponse
|
||||
from trustgraph.schema import ConfigValue, Error
|
||||
|
||||
from ... tables.config import ConfigTableStore
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class ConfigurationClass:
|
||||
|
||||
async def keys(self):
|
||||
|
|
@ -228,7 +233,7 @@ class Configuration:
|
|||
|
||||
async def handle(self, msg):
|
||||
|
||||
print("Handle message ", msg.operation)
|
||||
logger.debug(f"Handling config message: {msg.operation}")
|
||||
|
||||
if msg.operation == "get":
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,10 @@
|
|||
|
||||
from trustgraph.schema import FlowResponse, Error
|
||||
import json
|
||||
import logging
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class FlowConfig:
|
||||
def __init__(self, config):
|
||||
|
|
@ -41,7 +45,7 @@ class FlowConfig:
|
|||
|
||||
async def handle_delete_class(self, msg):
|
||||
|
||||
print(msg)
|
||||
logger.debug(f"Flow config message: {msg}")
|
||||
|
||||
await self.config.get("flow-classes").delete(msg.class_name)
|
||||
|
||||
|
|
@ -218,7 +222,7 @@ class FlowConfig:
|
|||
|
||||
async def handle(self, msg):
|
||||
|
||||
print("Handle message ", msg.operation)
|
||||
logger.debug(f"Handling flow message: {msg.operation}")
|
||||
|
||||
if msg.operation == "list-classes":
|
||||
resp = await self.handle_list_classes(msg)
|
||||
|
|
|
|||
|
|
@ -3,6 +3,8 @@
|
|||
Config service. Manages system global configuration state
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from trustgraph.schema import Error
|
||||
|
||||
from trustgraph.schema import ConfigRequest, ConfigResponse, ConfigPush
|
||||
|
|
@ -20,6 +22,9 @@ from . flow import FlowConfig
|
|||
from ... base import ProcessorMetrics, ConsumerMetrics, ProducerMetrics
|
||||
from ... base import Consumer, Producer
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# FIXME: How to ensure this doesn't conflict with other usage?
|
||||
keyspace = "config"
|
||||
|
||||
|
|
@ -146,7 +151,7 @@ class Processor(AsyncProcessor):
|
|||
|
||||
self.flow = FlowConfig(self.config)
|
||||
|
||||
print("Service initialised.")
|
||||
logger.info("Config service initialized")
|
||||
|
||||
async def start(self):
|
||||
|
||||
|
|
@ -172,7 +177,7 @@ class Processor(AsyncProcessor):
|
|||
|
||||
# Race condition, should make sure version & config sync
|
||||
|
||||
print("Pushed version ", await self.config.get_version())
|
||||
logger.info(f"Pushed configuration version {await self.config.get_version()}")
|
||||
|
||||
async def on_config_request(self, msg, consumer, flow):
|
||||
|
||||
|
|
@ -183,7 +188,7 @@ class Processor(AsyncProcessor):
|
|||
# Sender-produced ID
|
||||
id = msg.properties()["id"]
|
||||
|
||||
print(f"Handling {id}...", flush=True)
|
||||
logger.info(f"Handling config request {id}...")
|
||||
|
||||
resp = await self.config.handle(v)
|
||||
|
||||
|
|
@ -214,7 +219,7 @@ class Processor(AsyncProcessor):
|
|||
# Sender-produced ID
|
||||
id = msg.properties()["id"]
|
||||
|
||||
print(f"Handling {id}...", flush=True)
|
||||
logger.info(f"Handling flow request {id}...")
|
||||
|
||||
resp = await self.flow.handle(v)
|
||||
|
||||
|
|
|
|||
|
|
@ -8,6 +8,10 @@ from .. base import Publisher
|
|||
import base64
|
||||
import asyncio
|
||||
import uuid
|
||||
import logging
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class KnowledgeManager:
|
||||
|
||||
|
|
@ -26,7 +30,7 @@ class KnowledgeManager:
|
|||
|
||||
async def delete_kg_core(self, request, respond):
|
||||
|
||||
print("Deleting core...", flush=True)
|
||||
logger.info("Deleting knowledge core...")
|
||||
|
||||
await self.table_store.delete_kg_core(
|
||||
request.user, request.id
|
||||
|
|
@ -44,7 +48,7 @@ class KnowledgeManager:
|
|||
|
||||
async def get_kg_core(self, request, respond):
|
||||
|
||||
print("Get core...", flush=True)
|
||||
logger.info("Getting knowledge core...")
|
||||
|
||||
async def publish_triples(t):
|
||||
await respond(
|
||||
|
|
@ -82,7 +86,7 @@ class KnowledgeManager:
|
|||
publish_ge,
|
||||
)
|
||||
|
||||
print("Get complete", flush=True)
|
||||
logger.debug("Knowledge core retrieval complete")
|
||||
|
||||
await respond(
|
||||
KnowledgeResponse(
|
||||
|
|
@ -158,13 +162,13 @@ class KnowledgeManager:
|
|||
|
||||
async def core_loader(self):
|
||||
|
||||
print("Running...", flush=True)
|
||||
logger.info("Knowledge background processor running...")
|
||||
while True:
|
||||
|
||||
print("Wait for next load...", flush=True)
|
||||
logger.debug("Waiting for next load...")
|
||||
request, respond = await self.loader_queue.get()
|
||||
|
||||
print("Loading...", request.id, flush=True)
|
||||
logger.info(f"Loading knowledge: {request.id}")
|
||||
|
||||
try:
|
||||
|
||||
|
|
@ -204,7 +208,7 @@ class KnowledgeManager:
|
|||
|
||||
except Exception as e:
|
||||
|
||||
print("Exception:", e, flush=True)
|
||||
logger.error(f"Knowledge exception: {e}", exc_info=True)
|
||||
await respond(
|
||||
KnowledgeResponse(
|
||||
error = Error(
|
||||
|
|
@ -219,15 +223,15 @@ class KnowledgeManager:
|
|||
)
|
||||
|
||||
|
||||
print("Going to start loading...", flush=True)
|
||||
logger.debug("Starting knowledge loading process...")
|
||||
|
||||
try:
|
||||
|
||||
t_pub = None
|
||||
ge_pub = None
|
||||
|
||||
print(t_q, flush=True)
|
||||
print(ge_q, flush=True)
|
||||
logger.debug(f"Triples queue: {t_q}")
|
||||
logger.debug(f"Graph embeddings queue: {ge_q}")
|
||||
|
||||
t_pub = Publisher(
|
||||
self.flow_config.pulsar_client, t_q,
|
||||
|
|
@ -238,7 +242,7 @@ class KnowledgeManager:
|
|||
schema=GraphEmbeddings
|
||||
)
|
||||
|
||||
print("Start publishers...", flush=True)
|
||||
logger.debug("Starting publishers...")
|
||||
|
||||
await t_pub.start()
|
||||
await ge_pub.start()
|
||||
|
|
@ -246,7 +250,7 @@ class KnowledgeManager:
|
|||
async def publish_triples(t):
|
||||
await t_pub.send(None, t)
|
||||
|
||||
print("Publish triples...", flush=True)
|
||||
logger.debug("Publishing triples...")
|
||||
|
||||
# Remove doc table row
|
||||
await self.table_store.get_triples(
|
||||
|
|
@ -258,7 +262,7 @@ class KnowledgeManager:
|
|||
async def publish_ge(g):
|
||||
await ge_pub.send(None, g)
|
||||
|
||||
print("Publish GEs...", flush=True)
|
||||
logger.debug("Publishing graph embeddings...")
|
||||
|
||||
# Remove doc table row
|
||||
await self.table_store.get_graph_embeddings(
|
||||
|
|
@ -267,19 +271,19 @@ class KnowledgeManager:
|
|||
publish_ge,
|
||||
)
|
||||
|
||||
print("Completed that.", flush=True)
|
||||
logger.debug("Knowledge loading completed")
|
||||
|
||||
except Exception as e:
|
||||
|
||||
print("Exception:", e, flush=True)
|
||||
logger.error(f"Knowledge exception: {e}", exc_info=True)
|
||||
|
||||
finally:
|
||||
|
||||
print("Stopping publishers...", flush=True)
|
||||
logger.debug("Stopping publishers...")
|
||||
|
||||
if t_pub: await t_pub.stop()
|
||||
if ge_pub: await ge_pub.stop()
|
||||
|
||||
print("Done", flush=True)
|
||||
logger.debug("Knowledge processing done")
|
||||
|
||||
continue
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ from functools import partial
|
|||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
|
||||
from .. base import AsyncProcessor, Consumer, Producer, Publisher, Subscriber
|
||||
from .. base import ConsumerMetrics, ProducerMetrics
|
||||
|
|
@ -21,6 +22,9 @@ from .. exceptions import RequestError
|
|||
|
||||
from . knowledge import KnowledgeManager
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
default_ident = "knowledge"
|
||||
|
||||
default_knowledge_request_queue = knowledge_request_queue
|
||||
|
|
@ -96,7 +100,7 @@ class Processor(AsyncProcessor):
|
|||
|
||||
self.flows = {}
|
||||
|
||||
print("Initialised.", flush=True)
|
||||
logger.info("Knowledge service initialized")
|
||||
|
||||
async def start(self):
|
||||
|
||||
|
|
@ -106,7 +110,7 @@ class Processor(AsyncProcessor):
|
|||
|
||||
async def on_knowledge_config(self, config, version):
|
||||
|
||||
print("config version", version)
|
||||
logger.info(f"Configuration version: {version}")
|
||||
|
||||
if "flows" in config:
|
||||
|
||||
|
|
@ -115,14 +119,14 @@ class Processor(AsyncProcessor):
|
|||
for k, v in config["flows"].items()
|
||||
}
|
||||
|
||||
print(self.flows)
|
||||
logger.debug(f"Flows: {self.flows}")
|
||||
|
||||
async def process_request(self, v, id):
|
||||
|
||||
if v.operation is None:
|
||||
raise RequestError("Null operation")
|
||||
|
||||
print("request", v.operation)
|
||||
logger.debug(f"Knowledge request: {v.operation}")
|
||||
|
||||
impls = {
|
||||
"list-kg-cores": self.knowledge.list_kg_cores,
|
||||
|
|
@ -150,7 +154,7 @@ class Processor(AsyncProcessor):
|
|||
|
||||
id = msg.properties()["id"]
|
||||
|
||||
print(f"Handling input {id}...", flush=True)
|
||||
logger.info(f"Handling knowledge input {id}...")
|
||||
|
||||
try:
|
||||
|
||||
|
|
@ -187,7 +191,7 @@ class Processor(AsyncProcessor):
|
|||
|
||||
return
|
||||
|
||||
print("Done.", flush=True)
|
||||
logger.debug("Knowledge input processing complete")
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
|
|
|||
|
|
@ -15,13 +15,13 @@ from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
|
|||
from mistralai.models import OCRResponse
|
||||
|
||||
from ... schema import Document, TextDocument, Metadata
|
||||
from ... schema import document_ingest_queue, text_ingest_queue
|
||||
from ... log_level import LogLevel
|
||||
from ... base import InputOutputProcessor
|
||||
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
|
||||
|
||||
module = "ocr"
|
||||
import logging
|
||||
|
||||
default_subscriber = module
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
default_ident = "pdf-decoder"
|
||||
default_api_key = os.getenv("MISTRAL_TOKEN")
|
||||
|
||||
pages_per_chunk = 5
|
||||
|
|
@ -69,23 +69,34 @@ def get_combined_markdown(ocr_response: OCRResponse) -> str:
|
|||
|
||||
return "\n\n".join(markdowns)
|
||||
|
||||
class Processor(InputOutputProcessor):
|
||||
class Processor(FlowProcessor):
|
||||
|
||||
def __init__(self, **params):
|
||||
|
||||
id = params.get("id")
|
||||
subscriber = params.get("subscriber", default_subscriber)
|
||||
id = params.get("id", default_ident)
|
||||
api_key = params.get("api_key", default_api_key)
|
||||
|
||||
super(Processor, self).__init__(
|
||||
**params | {
|
||||
"id": id,
|
||||
"subscriber": subscriber,
|
||||
"input_schema": Document,
|
||||
"output_schema": TextDocument,
|
||||
}
|
||||
)
|
||||
|
||||
self.register_specification(
|
||||
ConsumerSpec(
|
||||
name = "input",
|
||||
schema = Document,
|
||||
handler = self.on_message,
|
||||
)
|
||||
)
|
||||
|
||||
self.register_specification(
|
||||
ProducerSpec(
|
||||
name = "output",
|
||||
schema = TextDocument,
|
||||
)
|
||||
)
|
||||
|
||||
if api_key is None:
|
||||
raise RuntimeError("Mistral API key not specified")
|
||||
|
||||
|
|
@ -94,18 +105,18 @@ class Processor(InputOutputProcessor):
|
|||
# Used with Mistral doc upload
|
||||
self.unique_id = str(uuid.uuid4())
|
||||
|
||||
print("PDF inited")
|
||||
logger.info("Mistral OCR processor initialized")
|
||||
|
||||
def ocr(self, blob):
|
||||
|
||||
print("Parse PDF...", flush=True)
|
||||
logger.debug("Parse PDF...")
|
||||
|
||||
pdfbuf = BytesIO(blob)
|
||||
pdf = PdfReader(pdfbuf)
|
||||
|
||||
for chunk in chunks(pdf.pages, pages_per_chunk):
|
||||
|
||||
print("Get next pages...", flush=True)
|
||||
logger.debug("Get next pages...")
|
||||
|
||||
part = PdfWriter()
|
||||
for page in chunk:
|
||||
|
|
@ -114,7 +125,7 @@ class Processor(InputOutputProcessor):
|
|||
buf = BytesIO()
|
||||
part.write_stream(buf)
|
||||
|
||||
print("Upload chunk...", flush=True)
|
||||
logger.debug("Upload chunk...")
|
||||
|
||||
uploaded_file = self.mistral.files.upload(
|
||||
file={
|
||||
|
|
@ -128,7 +139,7 @@ class Processor(InputOutputProcessor):
|
|||
file_id=uploaded_file.id, expiry=1
|
||||
)
|
||||
|
||||
print("OCR...", flush=True)
|
||||
logger.debug("OCR...")
|
||||
|
||||
processed = self.mistral.ocr.process(
|
||||
model="mistral-ocr-latest",
|
||||
|
|
@ -139,21 +150,21 @@ class Processor(InputOutputProcessor):
|
|||
}
|
||||
)
|
||||
|
||||
print("Extract markdown...", flush=True)
|
||||
logger.debug("Extract markdown...")
|
||||
|
||||
markdown = get_combined_markdown(processed)
|
||||
|
||||
print("OCR complete.", flush=True)
|
||||
logger.info("OCR complete.")
|
||||
|
||||
return markdown
|
||||
|
||||
async def on_message(self, msg, consumer):
|
||||
async def on_message(self, msg, consumer, flow):
|
||||
|
||||
print("PDF message received")
|
||||
logger.debug("PDF message received")
|
||||
|
||||
v = msg.value()
|
||||
|
||||
print(f"Decoding {v.metadata.id}...", flush=True)
|
||||
logger.info(f"Decoding {v.metadata.id}...")
|
||||
|
||||
markdown = self.ocr(base64.b64decode(v.data))
|
||||
|
||||
|
|
@ -162,14 +173,14 @@ class Processor(InputOutputProcessor):
|
|||
text=markdown.encode("utf-8"),
|
||||
)
|
||||
|
||||
await consumer.q.output.send(r)
|
||||
await flow("output").send(r)
|
||||
|
||||
print("Done.", flush=True)
|
||||
logger.info("Done.")
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
InputOutputProcessor.add_args(parser, default_subscriber)
|
||||
FlowProcessor.add_args(parser)
|
||||
|
||||
parser.add_argument(
|
||||
'-k', '--api-key',
|
||||
|
|
@ -179,5 +190,5 @@ class Processor(InputOutputProcessor):
|
|||
|
||||
def run():
|
||||
|
||||
Processor.launch(module, __doc__)
|
||||
Processor.launch(default_ident, __doc__)
|
||||
|
||||
|
|
|
|||
|
|
@ -6,11 +6,15 @@ PDF document as text as separate output objects.
|
|||
|
||||
import tempfile
|
||||
import base64
|
||||
import logging
|
||||
from langchain_community.document_loaders import PyPDFLoader
|
||||
|
||||
from ... schema import Document, TextDocument, Metadata
|
||||
from ... base import FlowProcessor, ConsumerSpec, ProducerSpec
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
default_ident = "pdf-decoder"
|
||||
|
||||
class Processor(FlowProcessor):
|
||||
|
|
@ -40,15 +44,15 @@ class Processor(FlowProcessor):
|
|||
)
|
||||
)
|
||||
|
||||
print("PDF inited", flush=True)
|
||||
logger.info("PDF decoder initialized")
|
||||
|
||||
async def on_message(self, msg, consumer, flow):
|
||||
|
||||
print("PDF message received", flush=True)
|
||||
logger.debug("PDF message received")
|
||||
|
||||
v = msg.value()
|
||||
|
||||
print(f"Decoding {v.metadata.id}...", flush=True)
|
||||
logger.info(f"Decoding PDF {v.metadata.id}...")
|
||||
|
||||
with tempfile.NamedTemporaryFile(delete_on_close=False) as fp:
|
||||
|
||||
|
|
@ -62,7 +66,7 @@ class Processor(FlowProcessor):
|
|||
|
||||
for ix, page in enumerate(pages):
|
||||
|
||||
print("page", ix, flush=True)
|
||||
logger.debug(f"Processing page {ix}")
|
||||
|
||||
r = TextDocument(
|
||||
metadata=v.metadata,
|
||||
|
|
@ -71,7 +75,7 @@ class Processor(FlowProcessor):
|
|||
|
||||
await flow("output").send(r)
|
||||
|
||||
print("Done.", flush=True)
|
||||
logger.debug("PDF decoding complete")
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
|
|
|||
|
|
@ -3,6 +3,9 @@ from cassandra.cluster import Cluster
|
|||
from cassandra.auth import PlainTextAuthProvider
|
||||
from ssl import SSLContext, PROTOCOL_TLSv1_2
|
||||
|
||||
# Global list to track clusters for cleanup
|
||||
_active_clusters = []
|
||||
|
||||
class TrustGraph:
|
||||
|
||||
def __init__(
|
||||
|
|
@ -24,6 +27,9 @@ class TrustGraph:
|
|||
else:
|
||||
self.cluster = Cluster(hosts)
|
||||
self.session = self.cluster.connect()
|
||||
|
||||
# Track this cluster globally
|
||||
_active_clusters.append(self.cluster)
|
||||
|
||||
self.init()
|
||||
|
||||
|
|
@ -119,3 +125,13 @@ class TrustGraph:
|
|||
f"""select s as x from {self.table} where s = %s and p = %s and o = %s limit {limit}""",
|
||||
(s, p, o)
|
||||
)
|
||||
|
||||
def close(self):
|
||||
"""Close the Cassandra session and cluster connections properly"""
|
||||
if hasattr(self, 'session') and self.session:
|
||||
self.session.shutdown()
|
||||
if hasattr(self, 'cluster') and self.cluster:
|
||||
self.cluster.shutdown()
|
||||
# Remove from global tracking
|
||||
if self.cluster in _active_clusters:
|
||||
_active_clusters.remove(self.cluster)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
|
||||
from pymilvus import MilvusClient, CollectionSchema, FieldSchema, DataType
|
||||
import time
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DocVectors:
|
||||
|
||||
|
|
@ -21,7 +24,7 @@ class DocVectors:
|
|||
|
||||
# Next time to reload - this forces a reload at next window
|
||||
self.next_reload = time.time() + self.reload_time
|
||||
print("Reload at", self.next_reload)
|
||||
logger.debug(f"Reload at {self.next_reload}")
|
||||
|
||||
def init_collection(self, dimension):
|
||||
|
||||
|
|
@ -110,12 +113,12 @@ class DocVectors:
|
|||
}
|
||||
}
|
||||
|
||||
print("Loading...")
|
||||
logger.debug("Loading...")
|
||||
self.client.load_collection(
|
||||
collection_name=coll,
|
||||
)
|
||||
|
||||
print("Searching...")
|
||||
logger.debug("Searching...")
|
||||
|
||||
res = self.client.search(
|
||||
collection_name=coll,
|
||||
|
|
@ -128,7 +131,7 @@ class DocVectors:
|
|||
|
||||
# If reload time has passed, unload collection
|
||||
if time.time() > self.next_reload:
|
||||
print("Unloading, reload at", self.next_reload)
|
||||
logger.debug(f"Unloading, reload at {self.next_reload}")
|
||||
self.client.release_collection(
|
||||
collection_name=coll,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
|
||||
from pymilvus import MilvusClient, CollectionSchema, FieldSchema, DataType
|
||||
import time
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class EntityVectors:
|
||||
|
||||
|
|
@ -21,7 +24,7 @@ class EntityVectors:
|
|||
|
||||
# Next time to reload - this forces a reload at next window
|
||||
self.next_reload = time.time() + self.reload_time
|
||||
print("Reload at", self.next_reload)
|
||||
logger.debug(f"Reload at {self.next_reload}")
|
||||
|
||||
def init_collection(self, dimension):
|
||||
|
||||
|
|
@ -110,12 +113,12 @@ class EntityVectors:
|
|||
}
|
||||
}
|
||||
|
||||
print("Loading...")
|
||||
logger.debug("Loading...")
|
||||
self.client.load_collection(
|
||||
collection_name=coll,
|
||||
)
|
||||
|
||||
print("Searching...")
|
||||
logger.debug("Searching...")
|
||||
|
||||
res = self.client.search(
|
||||
collection_name=coll,
|
||||
|
|
@ -128,7 +131,7 @@ class EntityVectors:
|
|||
|
||||
# If reload time has passed, unload collection
|
||||
if time.time() > self.next_reload:
|
||||
print("Unloading, reload at", self.next_reload)
|
||||
logger.debug(f"Unloading, reload at {self.next_reload}")
|
||||
self.client.release_collection(
|
||||
collection_name=coll,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
|
||||
from pymilvus import MilvusClient, CollectionSchema, FieldSchema, DataType
|
||||
import time
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class ObjectVectors:
|
||||
|
||||
|
|
@ -21,7 +24,7 @@ class ObjectVectors:
|
|||
|
||||
# Next time to reload - this forces a reload at next window
|
||||
self.next_reload = time.time() + self.reload_time
|
||||
print("Reload at", self.next_reload)
|
||||
logger.debug(f"Reload at {self.next_reload}")
|
||||
|
||||
def init_collection(self, dimension, name):
|
||||
|
||||
|
|
@ -126,12 +129,12 @@ class ObjectVectors:
|
|||
}
|
||||
}
|
||||
|
||||
print("Loading...")
|
||||
logger.debug("Loading...")
|
||||
self.client.load_collection(
|
||||
collection_name=coll,
|
||||
)
|
||||
|
||||
print("Searching...")
|
||||
logger.debug("Searching...")
|
||||
|
||||
res = self.client.search(
|
||||
collection_name=coll,
|
||||
|
|
@ -144,7 +147,7 @@ class ObjectVectors:
|
|||
|
||||
# If reload time has passed, unload collection
|
||||
if time.time() > self.next_reload:
|
||||
print("Unloading, reload at", self.next_reload)
|
||||
logger.debug(f"Unloading, reload at {self.next_reload}")
|
||||
self.client.release_collection(
|
||||
collection_name=coll,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -11,6 +11,10 @@ from ... schema import EmbeddingsRequest, EmbeddingsResponse
|
|||
from ... base import FlowProcessor, RequestResponseSpec, ConsumerSpec
|
||||
from ... base import ProducerSpec
|
||||
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
default_ident = "document-embeddings"
|
||||
|
||||
class Processor(FlowProcessor):
|
||||
|
|
@ -52,7 +56,7 @@ class Processor(FlowProcessor):
|
|||
async def on_message(self, msg, consumer, flow):
|
||||
|
||||
v = msg.value()
|
||||
print(f"Indexing {v.metadata.id}...", flush=True)
|
||||
logger.info(f"Indexing {v.metadata.id}...")
|
||||
|
||||
try:
|
||||
|
||||
|
|
@ -79,12 +83,12 @@ class Processor(FlowProcessor):
|
|||
await flow("output").send(r)
|
||||
|
||||
except Exception as e:
|
||||
print("Exception:", e, flush=True)
|
||||
logger.error("Exception occurred", exc_info=True)
|
||||
|
||||
# Retry
|
||||
raise e
|
||||
|
||||
print("Done.", flush=True)
|
||||
logger.info("Done.")
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
|
|
|||
|
|
@ -4,10 +4,15 @@ Embeddings service, applies an embeddings model using fastembed
|
|||
Input is text, output is embeddings vector.
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
from ... base import EmbeddingsService
|
||||
|
||||
from fastembed import TextEmbedding
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
default_ident = "embeddings"
|
||||
|
||||
default_model="sentence-transformers/all-MiniLM-L6-v2"
|
||||
|
|
@ -22,7 +27,7 @@ class Processor(EmbeddingsService):
|
|||
**params | { "model": model }
|
||||
)
|
||||
|
||||
print("Get model...", flush=True)
|
||||
logger.info("Loading FastEmbed model...")
|
||||
self.embeddings = TextEmbedding(model_name = model)
|
||||
|
||||
async def on_embeddings(self, text):
|
||||
|
|
|
|||
|
|
@ -11,6 +11,10 @@ from ... schema import EmbeddingsRequest, EmbeddingsResponse
|
|||
from ... base import FlowProcessor, EmbeddingsClientSpec, ConsumerSpec
|
||||
from ... base import ProducerSpec
|
||||
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
default_ident = "graph-embeddings"
|
||||
|
||||
class Processor(FlowProcessor):
|
||||
|
|
@ -50,7 +54,7 @@ class Processor(FlowProcessor):
|
|||
async def on_message(self, msg, consumer, flow):
|
||||
|
||||
v = msg.value()
|
||||
print(f"Indexing {v.metadata.id}...", flush=True)
|
||||
logger.info(f"Indexing {v.metadata.id}...")
|
||||
|
||||
entities = []
|
||||
|
||||
|
|
@ -77,12 +81,12 @@ class Processor(FlowProcessor):
|
|||
await flow("output").send(r)
|
||||
|
||||
except Exception as e:
|
||||
print("Exception:", e, flush=True)
|
||||
logger.error("Exception occurred", exc_info=True)
|
||||
|
||||
# Retry
|
||||
raise e
|
||||
|
||||
print("Done.", flush=True)
|
||||
logger.info("Done.")
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
|
|
|||
|
|
@ -3,81 +3,46 @@
|
|||
Embeddings service, applies an embeddings model hosted on a local Ollama.
|
||||
Input is text, output is embeddings vector.
|
||||
"""
|
||||
from ... base import EmbeddingsService
|
||||
|
||||
from ... schema import EmbeddingsRequest, EmbeddingsResponse
|
||||
from ... schema import embeddings_request_queue, embeddings_response_queue
|
||||
from ... log_level import LogLevel
|
||||
from ... base import ConsumerProducer
|
||||
from ollama import Client
|
||||
import os
|
||||
|
||||
module = "embeddings"
|
||||
default_ident = "embeddings"
|
||||
|
||||
default_input_queue = embeddings_request_queue
|
||||
default_output_queue = embeddings_response_queue
|
||||
default_subscriber = module
|
||||
default_model="mxbai-embed-large"
|
||||
default_ollama = os.getenv("OLLAMA_HOST", 'http://localhost:11434')
|
||||
|
||||
class Processor(ConsumerProducer):
|
||||
class Processor(EmbeddingsService):
|
||||
|
||||
def __init__(self, **params):
|
||||
|
||||
input_queue = params.get("input_queue", default_input_queue)
|
||||
output_queue = params.get("output_queue", default_output_queue)
|
||||
subscriber = params.get("subscriber", default_subscriber)
|
||||
|
||||
ollama = params.get("ollama", default_ollama)
|
||||
model = params.get("model", default_model)
|
||||
ollama = params.get("ollama", default_ollama)
|
||||
|
||||
super(Processor, self).__init__(
|
||||
**params | {
|
||||
"input_queue": input_queue,
|
||||
"output_queue": output_queue,
|
||||
"subscriber": subscriber,
|
||||
"input_schema": EmbeddingsRequest,
|
||||
"output_schema": EmbeddingsResponse,
|
||||
"ollama": ollama,
|
||||
"model": model,
|
||||
"model": model
|
||||
}
|
||||
)
|
||||
|
||||
self.client = Client(host=ollama)
|
||||
self.model = model
|
||||
|
||||
async def handle(self, msg):
|
||||
async def on_embeddings(self, text):
|
||||
|
||||
v = msg.value()
|
||||
|
||||
# Sender-produced ID
|
||||
|
||||
id = msg.properties()["id"]
|
||||
|
||||
print(f"Handling input {id}...", flush=True)
|
||||
|
||||
text = v.text
|
||||
embeds = self.client.embed(
|
||||
model = self.model,
|
||||
input = text
|
||||
)
|
||||
|
||||
print("Send response...", flush=True)
|
||||
r = EmbeddingsResponse(
|
||||
vectors=embeds.embeddings,
|
||||
error=None,
|
||||
)
|
||||
|
||||
await self.send(r, properties={"id": id})
|
||||
|
||||
print("Done.", flush=True)
|
||||
return embeds.embeddings
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
ConsumerProducer.add_args(
|
||||
parser, default_input_queue, default_subscriber,
|
||||
default_output_queue,
|
||||
)
|
||||
EmbeddingsService.add_args(parser)
|
||||
|
||||
parser.add_argument(
|
||||
'-m', '--model',
|
||||
|
|
@ -93,5 +58,6 @@ class Processor(ConsumerProducer):
|
|||
|
||||
def run():
|
||||
|
||||
Processor.launch(module, __doc__)
|
||||
Processor.launch(default_ident, __doc__)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -10,6 +10,9 @@ from trustgraph.schema import encyclopedia_lookup_response_queue
|
|||
from trustgraph.log_level import LogLevel
|
||||
from trustgraph.base import ConsumerProducer
|
||||
import requests
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
module = "wikipedia"
|
||||
|
||||
|
|
@ -46,7 +49,7 @@ class Processor(ConsumerProducer):
|
|||
# Sender-produced ID
|
||||
id = msg.properties()["id"]
|
||||
|
||||
print(f"Handling {v.kind} / {v.term}...", flush=True)
|
||||
logger.info(f"Handling {v.kind} / {v.term}...")
|
||||
|
||||
try:
|
||||
|
||||
|
|
|
|||
1
trustgraph-flow/trustgraph/extract/kg/agent/__init__.py
Normal file
1
trustgraph-flow/trustgraph/extract/kg/agent/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
from .extract import *
|
||||
4
trustgraph-flow/trustgraph/extract/kg/agent/__main__.py
Normal file
4
trustgraph-flow/trustgraph/extract/kg/agent/__main__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
from .extract import Processor
|
||||
|
||||
if __name__ == "__main__":
|
||||
Processor.run()
|
||||
340
trustgraph-flow/trustgraph/extract/kg/agent/extract.py
Normal file
340
trustgraph-flow/trustgraph/extract/kg/agent/extract.py
Normal file
|
|
@ -0,0 +1,340 @@
|
|||
import re
|
||||
import json
|
||||
import urllib.parse
|
||||
import logging
|
||||
|
||||
from ....schema import Chunk, Triple, Triples, Metadata, Value
|
||||
from ....schema import EntityContext, EntityContexts
|
||||
|
||||
from ....rdf import TRUSTGRAPH_ENTITIES, RDF_LABEL, SUBJECT_OF, DEFINITION
|
||||
|
||||
from ....base import FlowProcessor, ConsumerSpec, ProducerSpec
|
||||
from ....base import AgentClientSpec
|
||||
|
||||
from ....template import PromptManager
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
default_ident = "kg-extract-agent"
|
||||
default_concurrency = 1
|
||||
default_template_id = "agent-kg-extract"
|
||||
default_config_type = "prompt"
|
||||
|
||||
class Processor(FlowProcessor):
|
||||
|
||||
def __init__(self, **params):
|
||||
|
||||
id = params.get("id")
|
||||
concurrency = params.get("concurrency", 1)
|
||||
template_id = params.get("template-id", default_template_id)
|
||||
config_key = params.get("config-type", default_config_type)
|
||||
|
||||
super().__init__(**params | {
|
||||
"id": id,
|
||||
"template-id": template_id,
|
||||
"config-type": config_key,
|
||||
"concurrency": concurrency,
|
||||
})
|
||||
|
||||
self.concurrency = concurrency
|
||||
self.template_id = template_id
|
||||
self.config_key = config_key
|
||||
|
||||
self.register_config_handler(self.on_prompt_config)
|
||||
|
||||
self.register_specification(
|
||||
ConsumerSpec(
|
||||
name = "input",
|
||||
schema = Chunk,
|
||||
handler = self.on_message,
|
||||
concurrency = self.concurrency,
|
||||
)
|
||||
)
|
||||
|
||||
self.register_specification(
|
||||
AgentClientSpec(
|
||||
request_name = "agent-request",
|
||||
response_name = "agent-response",
|
||||
)
|
||||
)
|
||||
|
||||
self.register_specification(
|
||||
ProducerSpec(
|
||||
name="triples",
|
||||
schema=Triples,
|
||||
)
|
||||
)
|
||||
|
||||
self.register_specification(
|
||||
ProducerSpec(
|
||||
name="entity-contexts",
|
||||
schema=EntityContexts,
|
||||
)
|
||||
)
|
||||
|
||||
# Null configuration, should reload quickly
|
||||
self.manager = PromptManager()
|
||||
|
||||
async def on_prompt_config(self, config, version):
|
||||
|
||||
logger.info(f"Loading configuration version {version}")
|
||||
|
||||
if self.config_key not in config:
|
||||
logger.warning(f"No key {self.config_key} in config")
|
||||
return
|
||||
|
||||
config = config[self.config_key]
|
||||
|
||||
try:
|
||||
|
||||
self.manager.load_config(config)
|
||||
|
||||
logger.info("Prompt configuration reloaded")
|
||||
|
||||
except Exception as e:
|
||||
|
||||
logger.error(f"Configuration reload exception: {e}", exc_info=True)
|
||||
logger.error("Configuration reload failed")
|
||||
|
||||
def to_uri(self, text):
|
||||
return TRUSTGRAPH_ENTITIES + urllib.parse.quote(text)
|
||||
|
||||
async def emit_triples(self, pub, metadata, triples):
|
||||
tpls = Triples(
|
||||
metadata = Metadata(
|
||||
id = metadata.id,
|
||||
metadata = [],
|
||||
user = metadata.user,
|
||||
collection = metadata.collection,
|
||||
),
|
||||
triples = triples,
|
||||
)
|
||||
|
||||
await pub.send(tpls)
|
||||
|
||||
async def emit_entity_contexts(self, pub, metadata, entity_contexts):
|
||||
ecs = EntityContexts(
|
||||
metadata = Metadata(
|
||||
id = metadata.id,
|
||||
metadata = [],
|
||||
user = metadata.user,
|
||||
collection = metadata.collection,
|
||||
),
|
||||
entities = entity_contexts,
|
||||
)
|
||||
|
||||
await pub.send(ecs)
|
||||
|
||||
def parse_json(self, text):
|
||||
json_match = re.search(r'```(?:json)?(.*?)```', text, re.DOTALL)
|
||||
|
||||
if json_match:
|
||||
json_str = json_match.group(1).strip()
|
||||
else:
|
||||
# If no delimiters, assume the entire output is JSON
|
||||
json_str = text.strip()
|
||||
|
||||
return json.loads(json_str)
|
||||
|
||||
async def on_message(self, msg, consumer, flow):
|
||||
|
||||
try:
|
||||
|
||||
v = msg.value()
|
||||
|
||||
# Extract chunk text
|
||||
chunk_text = v.chunk.decode('utf-8')
|
||||
|
||||
logger.debug("Processing chunk for agent extraction")
|
||||
|
||||
prompt = self.manager.render(
|
||||
self.template_id,
|
||||
{
|
||||
"text": chunk_text
|
||||
}
|
||||
)
|
||||
|
||||
logger.debug(f"Agent prompt: {prompt}")
|
||||
|
||||
async def handle(response):
|
||||
|
||||
logger.debug(f"Agent response: {response}")
|
||||
|
||||
if response.error is not None:
|
||||
if response.error.message:
|
||||
raise RuntimeError(str(response.error.message))
|
||||
else:
|
||||
raise RuntimeError(str(response.error))
|
||||
|
||||
if response.answer is not None:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
# Send to agent API
|
||||
agent_response = await flow("agent-request").invoke(
|
||||
recipient = handle,
|
||||
question = prompt
|
||||
)
|
||||
|
||||
# Parse JSON response
|
||||
try:
|
||||
extraction_data = self.parse_json(agent_response)
|
||||
except json.JSONDecodeError as e:
|
||||
raise ValueError(f"Invalid JSON response from agent: {e}")
|
||||
|
||||
# Process extraction data
|
||||
triples, entity_contexts = self.process_extraction_data(
|
||||
extraction_data, v.metadata
|
||||
)
|
||||
|
||||
# Put document metadata into triples
|
||||
for t in v.metadata.metadata:
|
||||
triples.append(t)
|
||||
|
||||
# Emit outputs
|
||||
if triples:
|
||||
await self.emit_triples(flow("triples"), v.metadata, triples)
|
||||
|
||||
if entity_contexts:
|
||||
await self.emit_entity_contexts(
|
||||
flow("entity-contexts"),
|
||||
v.metadata,
|
||||
entity_contexts
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing chunk: {e}", exc_info=True)
|
||||
raise
|
||||
|
||||
def process_extraction_data(self, data, metadata):
|
||||
"""Process combined extraction data to generate triples and entity contexts"""
|
||||
triples = []
|
||||
entity_contexts = []
|
||||
|
||||
# Process definitions
|
||||
for defn in data.get("definitions", []):
|
||||
|
||||
entity_uri = self.to_uri(defn["entity"])
|
||||
|
||||
# Add entity label
|
||||
triples.append(Triple(
|
||||
s = Value(value=entity_uri, is_uri=True),
|
||||
p = Value(value=RDF_LABEL, is_uri=True),
|
||||
o = Value(value=defn["entity"], is_uri=False),
|
||||
))
|
||||
|
||||
# Add definition
|
||||
triples.append(Triple(
|
||||
s = Value(value=entity_uri, is_uri=True),
|
||||
p = Value(value=DEFINITION, is_uri=True),
|
||||
o = Value(value=defn["definition"], is_uri=False),
|
||||
))
|
||||
|
||||
# Add subject-of relationship to document
|
||||
if metadata.id:
|
||||
triples.append(Triple(
|
||||
s = Value(value=entity_uri, is_uri=True),
|
||||
p = Value(value=SUBJECT_OF, is_uri=True),
|
||||
o = Value(value=metadata.id, is_uri=True),
|
||||
))
|
||||
|
||||
# Create entity context for embeddings
|
||||
entity_contexts.append(EntityContext(
|
||||
entity=Value(value=entity_uri, is_uri=True),
|
||||
context=defn["definition"]
|
||||
))
|
||||
|
||||
# Process relationships
|
||||
for rel in data.get("relationships", []):
|
||||
|
||||
subject_uri = self.to_uri(rel["subject"])
|
||||
predicate_uri = self.to_uri(rel["predicate"])
|
||||
|
||||
subject_value = Value(value=subject_uri, is_uri=True)
|
||||
predicate_value = Value(value=predicate_uri, is_uri=True)
|
||||
if data.get("object-entity", False):
|
||||
object_value = Value(value=predicate_uri, is_uri=True)
|
||||
else:
|
||||
object_value = Value(value=predicate_uri, is_uri=False)
|
||||
|
||||
# Add subject and predicate labels
|
||||
triples.append(Triple(
|
||||
s = subject_value,
|
||||
p = Value(value=RDF_LABEL, is_uri=True),
|
||||
o = Value(value=rel["subject"], is_uri=False),
|
||||
))
|
||||
|
||||
triples.append(Triple(
|
||||
s = predicate_value,
|
||||
p = Value(value=RDF_LABEL, is_uri=True),
|
||||
o = Value(value=rel["predicate"], is_uri=False),
|
||||
))
|
||||
|
||||
# Handle object (entity vs literal)
|
||||
if rel.get("object-entity", True):
|
||||
triples.append(Triple(
|
||||
s = object_value,
|
||||
p = Value(value=RDF_LABEL, is_uri=True),
|
||||
o = Value(value=rel["object"], is_uri=True),
|
||||
))
|
||||
|
||||
# Add the main relationship triple
|
||||
triples.append(Triple(
|
||||
s = subject_value,
|
||||
p = predicate_value,
|
||||
o = object_value
|
||||
))
|
||||
|
||||
# Add subject-of relationships to document
|
||||
if metadata.id:
|
||||
triples.append(Triple(
|
||||
s = subject_value,
|
||||
p = Value(value=SUBJECT_OF, is_uri=True),
|
||||
o = Value(value=metadata.id, is_uri=True),
|
||||
))
|
||||
|
||||
triples.append(Triple(
|
||||
s = predicate_value,
|
||||
p = Value(value=SUBJECT_OF, is_uri=True),
|
||||
o = Value(value=metadata.id, is_uri=True),
|
||||
))
|
||||
|
||||
if rel.get("object-entity", True):
|
||||
triples.append(Triple(
|
||||
s = object_value,
|
||||
p = Value(value=SUBJECT_OF, is_uri=True),
|
||||
o = Value(value=metadata.id, is_uri=True),
|
||||
))
|
||||
|
||||
return triples, entity_contexts
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
parser.add_argument(
|
||||
'-c', '--concurrency',
|
||||
type=int,
|
||||
default=default_concurrency,
|
||||
help=f'Concurrent processing threads (default: {default_concurrency})'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--template-id",
|
||||
type=str,
|
||||
default=default_template_id,
|
||||
help="Template ID to use for agent extraction"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--config-type',
|
||||
default="prompt",
|
||||
help=f'Configuration key for prompts (default: prompt)',
|
||||
)
|
||||
|
||||
FlowProcessor.add_args(parser)
|
||||
|
||||
def run():
|
||||
|
||||
Processor.launch(default_ident, __doc__)
|
||||
|
|
@ -7,8 +7,12 @@ entity/context definitions for embedding.
|
|||
|
||||
import json
|
||||
import urllib.parse
|
||||
import logging
|
||||
|
||||
from .... schema import Chunk, Triple, Triples, Metadata, Value
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
from .... schema import EntityContext, EntityContexts
|
||||
from .... schema import PromptRequest, PromptResponse
|
||||
from .... rdf import TRUSTGRAPH_ENTITIES, DEFINITION, RDF_LABEL, SUBJECT_OF
|
||||
|
|
@ -94,11 +98,11 @@ class Processor(FlowProcessor):
|
|||
async def on_message(self, msg, consumer, flow):
|
||||
|
||||
v = msg.value()
|
||||
print(f"Indexing {v.metadata.id}...", flush=True)
|
||||
logger.info(f"Extracting definitions from {v.metadata.id}...")
|
||||
|
||||
chunk = v.chunk.decode("utf-8")
|
||||
|
||||
print(chunk, flush=True)
|
||||
logger.debug(f"Processing chunk: {chunk[:200]}...") # Log first 200 chars
|
||||
|
||||
try:
|
||||
|
||||
|
|
@ -108,13 +112,13 @@ class Processor(FlowProcessor):
|
|||
text = chunk
|
||||
)
|
||||
|
||||
print("Response", defs, flush=True)
|
||||
logger.debug(f"Definitions response: {defs}")
|
||||
|
||||
if type(defs) != list:
|
||||
raise RuntimeError("Expecting array in prompt response")
|
||||
|
||||
except Exception as e:
|
||||
print("Prompt exception:", e, flush=True)
|
||||
logger.error(f"Prompt exception: {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
triples = []
|
||||
|
|
@ -187,9 +191,9 @@ class Processor(FlowProcessor):
|
|||
)
|
||||
|
||||
except Exception as e:
|
||||
print("Exception: ", e, flush=True)
|
||||
logger.error(f"Definitions extraction exception: {e}", exc_info=True)
|
||||
|
||||
print("Done.", flush=True)
|
||||
logger.debug("Definitions extraction complete")
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
|
|
|||
|
|
@ -0,0 +1,3 @@
|
|||
|
||||
from . processor import *
|
||||
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
from . extract import run
|
||||
from . processor import run
|
||||
|
||||
if __name__ == '__main__':
|
||||
run()
|
||||
241
trustgraph-flow/trustgraph/extract/kg/objects/processor.py
Normal file
241
trustgraph-flow/trustgraph/extract/kg/objects/processor.py
Normal file
|
|
@ -0,0 +1,241 @@
|
|||
"""
|
||||
Object extraction service - extracts structured objects from text chunks
|
||||
based on configured schemas.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, List, Any
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from .... schema import Chunk, ExtractedObject, Metadata
|
||||
from .... schema import PromptRequest, PromptResponse
|
||||
from .... schema import RowSchema, Field
|
||||
|
||||
from .... base import FlowProcessor, ConsumerSpec, ProducerSpec
|
||||
from .... base import PromptClientSpec
|
||||
from .... messaging.translators import row_schema_translator
|
||||
|
||||
default_ident = "kg-extract-objects"
|
||||
|
||||
|
||||
def convert_values_to_strings(obj: Dict[str, Any]) -> Dict[str, str]:
|
||||
"""Convert all values in a dictionary to strings for Pulsar Map(String()) compatibility"""
|
||||
result = {}
|
||||
for key, value in obj.items():
|
||||
if value is None:
|
||||
result[key] = ""
|
||||
elif isinstance(value, str):
|
||||
result[key] = value
|
||||
elif isinstance(value, (int, float, bool)):
|
||||
result[key] = str(value)
|
||||
elif isinstance(value, (list, dict)):
|
||||
# For complex types, serialize as JSON
|
||||
result[key] = json.dumps(value)
|
||||
else:
|
||||
# For any other type, convert to string
|
||||
result[key] = str(value)
|
||||
return result
|
||||
default_concurrency = 1
|
||||
|
||||
class Processor(FlowProcessor):
|
||||
|
||||
def __init__(self, **params):
|
||||
|
||||
id = params.get("id")
|
||||
concurrency = params.get("concurrency", 1)
|
||||
|
||||
# Config key for schemas
|
||||
self.config_key = params.get("config_type", "schema")
|
||||
|
||||
super(Processor, self).__init__(
|
||||
**params | {
|
||||
"id": id,
|
||||
"config-type": self.config_key,
|
||||
"concurrency": concurrency,
|
||||
}
|
||||
)
|
||||
|
||||
self.register_specification(
|
||||
ConsumerSpec(
|
||||
name = "input",
|
||||
schema = Chunk,
|
||||
handler = self.on_chunk,
|
||||
concurrency = concurrency,
|
||||
)
|
||||
)
|
||||
|
||||
self.register_specification(
|
||||
PromptClientSpec(
|
||||
request_name = "prompt-request",
|
||||
response_name = "prompt-response",
|
||||
)
|
||||
)
|
||||
|
||||
self.register_specification(
|
||||
ProducerSpec(
|
||||
name = "output",
|
||||
schema = ExtractedObject
|
||||
)
|
||||
)
|
||||
|
||||
# Register config handler for schema updates
|
||||
self.register_config_handler(self.on_schema_config)
|
||||
|
||||
# Schema storage: name -> RowSchema
|
||||
self.schemas: Dict[str, RowSchema] = {}
|
||||
|
||||
async def on_schema_config(self, config, version):
|
||||
"""Handle schema configuration updates"""
|
||||
|
||||
logger.info(f"Loading schema configuration version {version}")
|
||||
|
||||
# Clear existing schemas
|
||||
self.schemas = {}
|
||||
|
||||
# Check if our config type exists
|
||||
if self.config_key not in config:
|
||||
logger.warning(f"No '{self.config_key}' type in configuration")
|
||||
return
|
||||
|
||||
# Get the schemas dictionary for our type
|
||||
schemas_config = config[self.config_key]
|
||||
|
||||
# Process each schema in the schemas config
|
||||
for schema_name, schema_json in schemas_config.items():
|
||||
|
||||
try:
|
||||
# Parse the JSON schema definition
|
||||
schema_def = json.loads(schema_json)
|
||||
|
||||
# Create Field objects
|
||||
fields = []
|
||||
for field_def in schema_def.get("fields", []):
|
||||
field = Field(
|
||||
name=field_def["name"],
|
||||
type=field_def["type"],
|
||||
size=field_def.get("size", 0),
|
||||
primary=field_def.get("primary_key", False),
|
||||
description=field_def.get("description", ""),
|
||||
required=field_def.get("required", False),
|
||||
enum_values=field_def.get("enum", []),
|
||||
indexed=field_def.get("indexed", False)
|
||||
)
|
||||
fields.append(field)
|
||||
|
||||
# Create RowSchema
|
||||
row_schema = RowSchema(
|
||||
name=schema_def.get("name", schema_name),
|
||||
description=schema_def.get("description", ""),
|
||||
fields=fields
|
||||
)
|
||||
|
||||
self.schemas[schema_name] = row_schema
|
||||
logger.info(f"Loaded schema: {schema_name} with {len(fields)} fields")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to parse schema {schema_name}: {e}", exc_info=True)
|
||||
|
||||
logger.info(f"Schema configuration loaded: {len(self.schemas)} schemas")
|
||||
|
||||
async def extract_objects_for_schema(self, text: str, schema_name: str, schema: RowSchema, flow) -> List[Dict[str, Any]]:
|
||||
"""Extract objects from text for a specific schema"""
|
||||
|
||||
try:
|
||||
# Convert Pulsar RowSchema to JSON-serializable dict
|
||||
schema_dict = row_schema_translator.from_pulsar(schema)
|
||||
|
||||
# Use prompt client to extract rows based on schema
|
||||
objects = await flow("prompt-request").extract_objects(
|
||||
schema=schema_dict,
|
||||
text=text
|
||||
)
|
||||
|
||||
return objects if isinstance(objects, list) else []
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to extract objects for schema {schema_name}: {e}", exc_info=True)
|
||||
return []
|
||||
|
||||
async def on_chunk(self, msg, consumer, flow):
|
||||
"""Process incoming chunk and extract objects"""
|
||||
|
||||
v = msg.value()
|
||||
logger.info(f"Extracting objects from chunk {v.metadata.id}...")
|
||||
|
||||
chunk_text = v.chunk.decode("utf-8")
|
||||
|
||||
# If no schemas configured, log warning and return
|
||||
if not self.schemas:
|
||||
logger.warning("No schemas configured - skipping extraction")
|
||||
return
|
||||
|
||||
try:
|
||||
# Extract objects for each configured schema
|
||||
for schema_name, schema in self.schemas.items():
|
||||
|
||||
logger.debug(f"Extracting {schema_name} objects from chunk")
|
||||
|
||||
# Extract objects using prompt
|
||||
objects = await self.extract_objects_for_schema(
|
||||
chunk_text,
|
||||
schema_name,
|
||||
schema,
|
||||
flow
|
||||
)
|
||||
|
||||
# Emit each extracted object
|
||||
for obj in objects:
|
||||
|
||||
# Calculate confidence (could be enhanced with actual confidence from prompt)
|
||||
confidence = 0.8 # Default confidence
|
||||
|
||||
# Convert all values to strings for Pulsar compatibility
|
||||
string_values = convert_values_to_strings(obj)
|
||||
|
||||
# Create ExtractedObject
|
||||
extracted = ExtractedObject(
|
||||
metadata=Metadata(
|
||||
id=f"{v.metadata.id}:{schema_name}:{hash(str(obj))}",
|
||||
metadata=[],
|
||||
user=v.metadata.user,
|
||||
collection=v.metadata.collection,
|
||||
),
|
||||
schema_name=schema_name,
|
||||
values=string_values,
|
||||
confidence=confidence,
|
||||
source_span=chunk_text[:100] # First 100 chars as source reference
|
||||
)
|
||||
|
||||
await flow("output").send(extracted)
|
||||
logger.debug(f"Emitted extracted object for schema {schema_name}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Object extraction exception: {e}", exc_info=True)
|
||||
|
||||
logger.debug("Object extraction complete")
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
"""Add command-line arguments"""
|
||||
|
||||
parser.add_argument(
|
||||
'-c', '--concurrency',
|
||||
type=int,
|
||||
default=default_concurrency,
|
||||
help=f'Concurrent processing threads (default: {default_concurrency})'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--config-type',
|
||||
default='schema',
|
||||
help='Configuration type prefix for schemas (default: schema)'
|
||||
)
|
||||
|
||||
FlowProcessor.add_args(parser)
|
||||
|
||||
def run():
|
||||
"""Entry point for kg-extract-objects command"""
|
||||
Processor.launch(default_ident, __doc__)
|
||||
|
|
@ -6,8 +6,12 @@ graph edges.
|
|||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import urllib.parse
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from .... schema import Chunk, Triple, Triples
|
||||
from .... schema import Metadata, Value
|
||||
from .... schema import PromptRequest, PromptResponse
|
||||
|
|
@ -78,11 +82,11 @@ class Processor(FlowProcessor):
|
|||
async def on_message(self, msg, consumer, flow):
|
||||
|
||||
v = msg.value()
|
||||
print(f"Indexing {v.metadata.id}...", flush=True)
|
||||
logger.info(f"Extracting relationships from {v.metadata.id}...")
|
||||
|
||||
chunk = v.chunk.decode("utf-8")
|
||||
|
||||
print(chunk, flush=True)
|
||||
logger.debug(f"Processing chunk: {chunk[:100]}..." if len(chunk) > 100 else f"Processing chunk: {chunk}")
|
||||
|
||||
try:
|
||||
|
||||
|
|
@ -92,13 +96,13 @@ class Processor(FlowProcessor):
|
|||
text = chunk
|
||||
)
|
||||
|
||||
print("Response", rels, flush=True)
|
||||
logger.debug(f"Prompt response: {rels}")
|
||||
|
||||
if type(rels) != list:
|
||||
raise RuntimeError("Expecting array in prompt response")
|
||||
|
||||
except Exception as e:
|
||||
print("Prompt exception:", e, flush=True)
|
||||
logger.error(f"Prompt exception: {e}", exc_info=True)
|
||||
raise e
|
||||
|
||||
triples = []
|
||||
|
|
@ -189,9 +193,9 @@ class Processor(FlowProcessor):
|
|||
)
|
||||
|
||||
except Exception as e:
|
||||
print("Exception: ", e, flush=True)
|
||||
logger.error(f"Relationship extraction exception: {e}", exc_info=True)
|
||||
|
||||
print("Done.", flush=True)
|
||||
logger.debug("Relationship extraction complete")
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
|
|
|||
|
|
@ -6,6 +6,10 @@ get topics which are output as graph edges.
|
|||
|
||||
import urllib.parse
|
||||
import json
|
||||
import logging
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from .... schema import Chunk, Triple, Triples, Metadata, Value
|
||||
from .... schema import chunk_ingest_queue, triples_store_queue
|
||||
|
|
@ -81,7 +85,7 @@ class Processor(ConsumerProducer):
|
|||
async def handle(self, msg):
|
||||
|
||||
v = msg.value()
|
||||
print(f"Indexing {v.metadata.id}...", flush=True)
|
||||
logger.info(f"Extracting topics from {v.metadata.id}...")
|
||||
|
||||
chunk = v.chunk.decode("utf-8")
|
||||
|
||||
|
|
@ -110,9 +114,9 @@ class Processor(ConsumerProducer):
|
|||
)
|
||||
|
||||
except Exception as e:
|
||||
print("Exception: ", e, flush=True)
|
||||
logger.error(f"Topic extraction exception: {e}", exc_info=True)
|
||||
|
||||
print("Done.", flush=True)
|
||||
logger.debug("Topic extraction complete")
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
|
|
|||
|
|
@ -1,3 +0,0 @@
|
|||
|
||||
from . extract import *
|
||||
|
||||
|
|
@ -1,221 +0,0 @@
|
|||
|
||||
"""
|
||||
Simple decoder, accepts vector+text chunks input, applies analysis to pull
|
||||
out a row of fields. Output as a vector plus object.
|
||||
"""
|
||||
|
||||
import urllib.parse
|
||||
import os
|
||||
from pulsar.schema import JsonSchema
|
||||
|
||||
from .... schema import ChunkEmbeddings, Rows, ObjectEmbeddings, Metadata
|
||||
from .... schema import RowSchema, Field
|
||||
from .... schema import chunk_embeddings_ingest_queue, rows_store_queue
|
||||
from .... schema import object_embeddings_store_queue
|
||||
from .... schema import prompt_request_queue
|
||||
from .... schema import prompt_response_queue
|
||||
from .... log_level import LogLevel
|
||||
from .... clients.prompt_client import PromptClient
|
||||
from .... base import ConsumerProducer
|
||||
|
||||
from .... objects.field import Field as FieldParser
|
||||
from .... objects.object import Schema
|
||||
|
||||
module = ".".join(__name__.split(".")[1:-1])
|
||||
|
||||
default_input_queue = chunk_embeddings_ingest_queue
|
||||
default_output_queue = rows_store_queue
|
||||
default_vector_queue = object_embeddings_store_queue
|
||||
default_subscriber = module
|
||||
|
||||
class Processor(ConsumerProducer):
|
||||
|
||||
def __init__(self, **params):
|
||||
|
||||
input_queue = params.get("input_queue", default_input_queue)
|
||||
output_queue = params.get("output_queue", default_output_queue)
|
||||
vector_queue = params.get("vector_queue", default_vector_queue)
|
||||
subscriber = params.get("subscriber", default_subscriber)
|
||||
pr_request_queue = params.get(
|
||||
"prompt_request_queue", prompt_request_queue
|
||||
)
|
||||
pr_response_queue = params.get(
|
||||
"prompt_response_queue", prompt_response_queue
|
||||
)
|
||||
|
||||
super(Processor, self).__init__(
|
||||
**params | {
|
||||
"input_queue": input_queue,
|
||||
"output_queue": output_queue,
|
||||
"subscriber": subscriber,
|
||||
"input_schema": ChunkEmbeddings,
|
||||
"output_schema": Rows,
|
||||
"prompt_request_queue": pr_request_queue,
|
||||
"prompt_response_queue": pr_response_queue,
|
||||
}
|
||||
)
|
||||
|
||||
self.vec_prod = self.client.create_producer(
|
||||
topic=vector_queue,
|
||||
schema=JsonSchema(ObjectEmbeddings),
|
||||
)
|
||||
|
||||
__class__.pubsub_metric.info({
|
||||
"input_queue": input_queue,
|
||||
"output_queue": output_queue,
|
||||
"vector_queue": vector_queue,
|
||||
"prompt_request_queue": pr_request_queue,
|
||||
"prompt_response_queue": pr_response_queue,
|
||||
"subscriber": subscriber,
|
||||
"input_schema": ChunkEmbeddings.__name__,
|
||||
"output_schema": Rows.__name__,
|
||||
"vector_schema": ObjectEmbeddings.__name__,
|
||||
})
|
||||
|
||||
flds = __class__.parse_fields(params["field"])
|
||||
|
||||
for fld in flds:
|
||||
print(fld)
|
||||
|
||||
self.primary = None
|
||||
|
||||
for f in flds:
|
||||
if f.primary:
|
||||
if self.primary:
|
||||
raise RuntimeError(
|
||||
"Only one primary key field is supported"
|
||||
)
|
||||
self.primary = f
|
||||
|
||||
if self.primary == None:
|
||||
raise RuntimeError(
|
||||
"Must have exactly one primary key field"
|
||||
)
|
||||
|
||||
self.schema = Schema(
|
||||
name = params["name"],
|
||||
description = params["description"],
|
||||
fields = flds
|
||||
)
|
||||
|
||||
self.row_schema=RowSchema(
|
||||
name=self.schema.name,
|
||||
description=self.schema.description,
|
||||
fields=[
|
||||
Field(
|
||||
name=f.name, type=str(f.type), size=f.size,
|
||||
primary=f.primary, description=f.description,
|
||||
)
|
||||
for f in self.schema.fields
|
||||
]
|
||||
)
|
||||
|
||||
self.prompt = PromptClient(
|
||||
pulsar_host=self.pulsar_host,
|
||||
pulsar_api_key=self.pulsar_api_key,
|
||||
input_queue=pr_request_queue,
|
||||
output_queue=pr_response_queue,
|
||||
subscriber = module + "-prompt",
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def parse_fields(fields):
|
||||
return [ FieldParser.parse(f) for f in fields ]
|
||||
|
||||
def get_rows(self, chunk):
|
||||
return self.prompt.request_rows(self.schema, chunk)
|
||||
|
||||
def emit_rows(self, metadata, rows):
|
||||
|
||||
t = Rows(
|
||||
metadata=metadata, row_schema=self.row_schema, rows=rows
|
||||
)
|
||||
await self.send(t)
|
||||
|
||||
def emit_vec(self, metadata, name, vec, key_name, key):
|
||||
|
||||
r = ObjectEmbeddings(
|
||||
metadata=metadata, vectors=vec, name=name, key_name=key_name, id=key
|
||||
)
|
||||
self.vec_prod.send(r)
|
||||
|
||||
async def handle(self, msg):
|
||||
|
||||
v = msg.value()
|
||||
print(f"Indexing {v.metadata.id}...", flush=True)
|
||||
|
||||
chunk = v.chunk.decode("utf-8")
|
||||
|
||||
try:
|
||||
|
||||
rows = self.get_rows(chunk)
|
||||
|
||||
self.emit_rows(
|
||||
metadata=v.metadata,
|
||||
rows=rows
|
||||
)
|
||||
|
||||
for row in rows:
|
||||
self.emit_vec(
|
||||
metadata=v.metadata, vec=v.vectors,
|
||||
name=self.schema.name, key_name=self.primary.name,
|
||||
key=row[self.primary.name]
|
||||
)
|
||||
|
||||
for row in rows:
|
||||
print(row)
|
||||
|
||||
except Exception as e:
|
||||
print("Exception: ", e, flush=True)
|
||||
|
||||
print("Done.", flush=True)
|
||||
|
||||
@staticmethod
|
||||
def add_args(parser):
|
||||
|
||||
ConsumerProducer.add_args(
|
||||
parser, default_input_queue, default_subscriber,
|
||||
default_output_queue,
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-c', '--vector-queue',
|
||||
default=default_vector_queue,
|
||||
help=f'Vector output queue (default: {default_vector_queue})'
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--prompt-request-queue',
|
||||
default=prompt_request_queue,
|
||||
help=f'Prompt request queue (default: {prompt_request_queue})',
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--prompt-response-queue',
|
||||
default=prompt_response_queue,
|
||||
help=f'Prompt response queue (default: {prompt_response_queue})',
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-f', '--field',
|
||||
required=True,
|
||||
action='append',
|
||||
help=f'Field definition, format name:type:size:pri:descriptionn',
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-n', '--name',
|
||||
required=True,
|
||||
help=f'Name of row object',
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-d', '--description',
|
||||
required=True,
|
||||
help=f'Description of object',
|
||||
)
|
||||
|
||||
def run():
|
||||
|
||||
Processor.launch(module, __doc__)
|
||||
|
||||
|
|
@ -18,6 +18,9 @@ import logging
|
|||
import os
|
||||
import base64
|
||||
import uuid
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
import json
|
||||
|
||||
import pulsar
|
||||
|
|
@ -48,7 +51,7 @@ class ConfigReceiver:
|
|||
|
||||
v = msg.value()
|
||||
|
||||
print(f"Config version", v.version)
|
||||
logger.info(f"Config version: {v.version}")
|
||||
|
||||
if "flows" in v.config:
|
||||
|
||||
|
|
@ -68,29 +71,29 @@ class ConfigReceiver:
|
|||
del self.flows[k]
|
||||
|
||||
except Exception as e:
|
||||
print(f"Exception: {e}", flush=True)
|
||||
logger.error(f"Config processing exception: {e}", exc_info=True)
|
||||
|
||||
async def start_flow(self, id, flow):
|
||||
|
||||
print("Start flow", id)
|
||||
logger.info(f"Starting flow: {id}")
|
||||
|
||||
for handler in self.flow_handlers:
|
||||
|
||||
try:
|
||||
await handler.start_flow(id, flow)
|
||||
except Exception as e:
|
||||
print(f"Exception: {e}", flush=True)
|
||||
logger.error(f"Config processing exception: {e}", exc_info=True)
|
||||
|
||||
async def stop_flow(self, id, flow):
|
||||
|
||||
print("Stop flow", id)
|
||||
logger.info(f"Stopping flow: {id}")
|
||||
|
||||
for handler in self.flow_handlers:
|
||||
|
||||
try:
|
||||
await handler.stop_flow(id, flow)
|
||||
except Exception as e:
|
||||
print(f"Exception: {e}", flush=True)
|
||||
logger.error(f"Config processing exception: {e}", exc_info=True)
|
||||
|
||||
async def config_loader(self):
|
||||
|
||||
|
|
@ -111,9 +114,9 @@ class ConfigReceiver:
|
|||
|
||||
await self.config_cons.start()
|
||||
|
||||
print("Waiting...")
|
||||
logger.debug("Waiting for config updates...")
|
||||
|
||||
print("Config consumer done. :/")
|
||||
logger.info("Config consumer finished")
|
||||
|
||||
async def start(self):
|
||||
|
||||
|
|
|
|||
|
|
@ -2,8 +2,12 @@
|
|||
import asyncio
|
||||
import uuid
|
||||
import msgpack
|
||||
import logging
|
||||
from . knowledge import KnowledgeRequestor
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class CoreExport:
|
||||
|
||||
def __init__(self, pulsar_client):
|
||||
|
|
@ -84,7 +88,7 @@ class CoreExport:
|
|||
|
||||
except Exception as e:
|
||||
|
||||
print("Exception:", e)
|
||||
logger.error(f"Core export exception: {e}", exc_info=True)
|
||||
|
||||
finally:
|
||||
|
||||
|
|
|
|||
|
|
@ -3,8 +3,12 @@ import asyncio
|
|||
import json
|
||||
import uuid
|
||||
import msgpack
|
||||
import logging
|
||||
from . knowledge import KnowledgeRequestor
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class CoreImport:
|
||||
|
||||
def __init__(self, pulsar_client):
|
||||
|
|
@ -80,14 +84,14 @@ class CoreImport:
|
|||
await kr.process(msg)
|
||||
|
||||
except Exception as e:
|
||||
print("Exception:", e)
|
||||
logger.error(f"Core import exception: {e}", exc_info=True)
|
||||
await error(str(e))
|
||||
|
||||
finally:
|
||||
|
||||
await kr.stop()
|
||||
|
||||
print("All done.")
|
||||
logger.info("Core import completed")
|
||||
response = await ok()
|
||||
await response.write_eof()
|
||||
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue