trustgraph/tests/unit/test_chunking/conftest.py

import pytest
from unittest.mock import AsyncMock, Mock, patch
from trustgraph.schema import TextDocument, Metadata
from trustgraph.chunking.recursive.chunker import Processor as RecursiveChunker
from trustgraph.chunking.token.chunker import Processor as TokenChunker
from prometheus_client import REGISTRY


@pytest.fixture
def mock_flow():
    """Build a stand-in flow callable together with the producer it returns.

    Returns:
        A ``(flow, producer)`` tuple. ``flow`` is a ``Mock`` that returns
        ``producer`` on every call; ``producer`` is an ``AsyncMock``.
    """
    producer = AsyncMock()
    return Mock(return_value=producer), producer


@pytest.fixture
def mock_consumer():
    """Return a ``Mock`` consumer with fixed ``id`` and ``flow`` strings."""
    # Extra keyword arguments to Mock() are applied as attributes.
    return Mock(id="test-consumer", flow="test-flow")


@pytest.fixture
def sample_text_document():
    """Build a moderately sized ``TextDocument`` for chunking tests.

    The body is a single English sentence repeated 20 times and encoded as
    UTF-8; the metadata carries fixed test identifiers.
    """
    sentence = "The quick brown fox jumps over the lazy dog. "
    return TextDocument(
        text=(sentence * 20).encode("utf-8"),
        metadata=Metadata(
            collection="test-collection",
            user="test-user",
            metadata=[],
            id="test-doc-1",
        ),
    )


@pytest.fixture
def long_text_document():
    """Build a long ``TextDocument`` meant to produce multiple chunks.

    The body is 200 numbered sentences joined by single spaces and encoded
    as UTF-8.
    """
    doc_info = Metadata(
        collection="test-collection",
        user="test-user",
        metadata=[],
        id="test-doc-long",
    )
    # 200 numbered sentences, intended to be long enough to force chunking
    sentences = (
        f"Sentence number {i}. This is part of a long document."
        for i in range(200)
    )
    body = " ".join(sentences)
    return TextDocument(text=body.encode("utf-8"), metadata=doc_info)


@pytest.fixture
def unicode_text_document():
    """Build a ``TextDocument`` whose body mixes many scripts and symbols.

    The text covers Latin, CJK, Hangul, Arabic and Cyrillic samples plus
    emoji, math and currency/legal symbols, encoded as UTF-8.
    """
    # Kept as one literal so the exact whitespace and characters are preserved
    multilingual = """
    English: Hello World!
    Chinese: 你好世界
    Japanese: こんにちは世界
    Korean: 안녕하세요 세계
    Arabic: مرحبا بالعالم
    Russian: Привет мир
    Emoji: 🌍 🌎 🌏 😀 🎉
    Math: ∑ ∏ ∫ ∞ √ π
    Symbols: © ® ™ € £ ¥
    """
    return TextDocument(
        text=multilingual.encode("utf-8"),
        metadata=Metadata(
            collection="test-collection",
            user="test-user",
            metadata=[],
            id="test-doc-unicode",
        ),
    )


@pytest.fixture
def empty_text_document():
    """Build a ``TextDocument`` with zero-length text for edge-case tests."""
    doc_info = Metadata(
        collection="test-collection",
        user="test-user",
        metadata=[],
        id="test-doc-empty",
    )
    return TextDocument(text=b"", metadata=doc_info)


@pytest.fixture
def mock_message(sample_text_document):
    """Return a ``Mock`` message whose ``value()`` yields the given document.

    Args:
        sample_text_document: The object that ``msg.value()`` should return.
    """
    # A dotted key configures the auto-created child mock ``value``.
    return Mock(**{"value.return_value": sample_text_document})


@pytest.fixture(autouse=True)
def clear_metrics():
    """Clear the chunkers' ``chunk_metric`` before and after each test.

    For both ``RecursiveChunker`` and ``TokenChunker``: if the class has a
    ``chunk_metric`` attribute, the metric is unregistered from the
    Prometheus ``REGISTRY`` and the attribute is deleted from the class.
    This runs once before the test (to avoid duplicates) and once more
    after it as cleanup.
    """
    def _reset_chunk_metrics():
        # One shared routine replaces four copies of the same cleanup.
        for chunker_cls in (RecursiveChunker, TokenChunker):
            if not hasattr(chunker_cls, 'chunk_metric'):
                continue
            # Unregister from the Prometheus registry before deleting
            try:
                REGISTRY.unregister(chunker_cls.chunk_metric)
            except KeyError:
                pass  # Already unregistered
            delattr(chunker_cls, 'chunk_metric')

    _reset_chunk_metrics()
    yield
    # Clean up after the test as well
    _reset_chunk_metrics()


@pytest.fixture
def mock_async_processor_init():
    """Replace ``AsyncProcessor.__init__`` with a stub while the test runs.

    The stub avoids the real constructor's taskgroup requirement. The patch
    is active until the generator is resumed after the ``yield``.
    """
    target = 'trustgraph.base.async_processor.AsyncProcessor.__init__'

    def _stub_init(self, **kwargs):
        # Deliberately never calls the real __init__; it only seeds the
        # attributes AsyncProcessor would normally set.
        self.id = kwargs.get('id', 'test-processor')
        self.flows = {}
        self.specifications = []
        self.config_handlers = []

    with patch(target, _stub_init):
        yield
Release/v1.2 (#457) * Bump setup.py versions for 1.1 * PoC MCP server (#419) * Very initial MCP server PoC for TrustGraph * Put service on port 8000 * Add MCP container and packages to buildout * Update docs for API/CLI changes in 1.0 (#421) * Update some API basics for the 0.23/1.0 API change * Add MCP container push (#425) * Add command args to the MCP server (#426) * Host and port parameters * Added websocket arg * More docs * MCP client support (#427) - MCP client service - Tool request/response schema - API gateway support for mcp-tool - Message translation for tool request & response - Make mcp-tool using configuration service for information about where the MCP services are. * Feature/react call mcp (#428) Key Features - MCP Tool Integration: Added core MCP tool support with ToolClientSpec and ToolClient classes - API Enhancement: New mcp_tool method for flow-specific tool invocation - CLI Tooling: New tg-invoke-mcp-tool command for testing MCP integration - React Agent Enhancement: Fixed and improved multi-tool invocation capabilities - Tool Management: Enhanced CLI for tool configuration and management Changes - Added MCP tool invocation to API with flow-specific integration - Implemented ToolClientSpec and ToolClient for tool call handling - Updated agent-manager-react to invoke MCP tools with configurable types - Enhanced CLI with new commands and improved help text - Added comprehensive documentation for new CLI commands - Improved tool configuration management Testing - Added tg-invoke-mcp-tool CLI command for isolated MCP integration testing - Enhanced agent capability to invoke multiple tools simultaneously * Test suite executed from CI pipeline (#433) * Test strategy & test cases * Unit tests * Integration tests * Extending test coverage (#434) * Contract tests * Testing embeedings * Agent unit tests * Knowledge pipeline tests * Turn on contract tests * Increase storage test coverage (#435) * Fixing storage and adding tests * PR pipeline only runs 
quick tests * Empty configuration is returned as empty list, previously was not in response (#436) * Update config util to take files as well as command-line text (#437) * Updated CLI invocation and config model for tools and mcp (#438) * Updated CLI invocation and config model for tools and mcp * CLI anomalies * Tweaked the MCP tool implementation for new model * Update agent implementation to match the new model * Fix agent tools, now all tested * Fixed integration tests * Fix MCP delete tool params * Update Python deps to 1.2 * Update to enable knowledge extraction using the agent framework (#439) * Implement KG extraction agent (kg-extract-agent) * Using ReAct framework (agent-manager-react) * ReAct manager had an issue when emitting JSON, which conflicts with the ReAct manager's own JSON messages, so refactored ReAct manager to use traditional ReAct messages, non-JSON structure. * Minor refactor to take the prompt template client out of prompt-template so it can be more readily used by other modules. kg-extract-agent uses this framework. 
* Migrate from setup.py to pyproject.toml (#440) * Converted setup.py to pyproject.toml * Modern package infrastructure as recommended by py docs * Install missing build deps (#441) * Install missing build deps (#442) * Implement logging strategy (#444) * Logging strategy and convert all prints() to logging invocations * Fix/startup failure (#445) * Fix loggin startup problems * Fix logging startup problems (#446) * Fix logging startup problems (#447) * Fixed Mistral OCR to use current API (#448) * Fixed Mistral OCR to use current API * Added PDF decoder tests * Fix Mistral OCR ident to be standard pdf-decoder (#450) * Fix Mistral OCR ident to be standard pdf-decoder * Correct test * Schema structure refactor (#451) * Write schema refactor spec * Implemented schema refactor spec * Structure data mvp (#452) * Structured data tech spec * Architecture principles * New schemas * Updated schemas and specs * Object extractor * Add .coveragerc * New tests * Cassandra object storage * Trying to object extraction working, issues exist * Validate librarian collection (#453) * Fix token chunker, broken API invocation (#454) * Fix token chunker, broken API invocation (#455) * Knowledge load utility CLI (#456) * Knowledge loader * More tests 2025-08-18 20:56:09 +01:00			`import pytest`
			`from unittest.mock import AsyncMock, Mock, patch`
			`from trustgraph.schema import TextDocument, Metadata`
			`from trustgraph.chunking.recursive.chunker import Processor as RecursiveChunker`
			`from trustgraph.chunking.token.chunker import Processor as TokenChunker`
			`from prometheus_client import REGISTRY`


			`@pytest.fixture`
			`def mock_flow():`
			`"""Mock flow function that returns a mock output producer."""`
			`output_mock = AsyncMock()`
			`flow_mock = Mock(return_value=output_mock)`
			`return flow_mock, output_mock`


			`@pytest.fixture`
			`def mock_consumer():`
			`"""Mock consumer with test attributes."""`
			`consumer = Mock()`
			`consumer.id = "test-consumer"`
			`consumer.flow = "test-flow"`
			`return consumer`


			`@pytest.fixture`
			`def sample_text_document():`
			`"""Sample document with moderate length text."""`
			`metadata = Metadata(`
			`id="test-doc-1",`
			`metadata=[],`
			`user="test-user",`
			`collection="test-collection"`
			`)`
			`text = "The quick brown fox jumps over the lazy dog. " * 20`
			`return TextDocument(`
			`metadata=metadata,`
			`text=text.encode("utf-8")`
			`)`


			`@pytest.fixture`
			`def long_text_document():`
			`"""Long document for testing multiple chunks."""`
			`metadata = Metadata(`
			`id="test-doc-long",`
			`metadata=[],`
			`user="test-user",`
			`collection="test-collection"`
			`)`
			`# Create a long text that will definitely be chunked`
			`text = " ".join([f"Sentence number {i}. This is part of a long document." for i in range(200)])`
			`return TextDocument(`
			`metadata=metadata,`
			`text=text.encode("utf-8")`
			`)`


			`@pytest.fixture`
			`def unicode_text_document():`
			`"""Document with various unicode characters."""`
			`metadata = Metadata(`
			`id="test-doc-unicode",`
			`metadata=[],`
			`user="test-user",`
			`collection="test-collection"`
			`)`
			`text = """`
			`English: Hello World!`
			`Chinese: 你好世界`
			`Japanese: こんにちは世界`
			`Korean: 안녕하세요 세계`
			`Arabic: مرحبا بالعالم`
			`Russian: Привет мир`
			`Emoji: 🌍 🌎 🌏 😀 🎉`
			`Math: ∑ ∏ ∫ ∞ √ π`
			`Symbols: © ® ™ € £ ¥`
			`"""`
			`return TextDocument(`
			`metadata=metadata,`
			`text=text.encode("utf-8")`
			`)`


			`@pytest.fixture`
			`def empty_text_document():`
			`"""Empty document for edge case testing."""`
			`metadata = Metadata(`
			`id="test-doc-empty",`
			`metadata=[],`
			`user="test-user",`
			`collection="test-collection"`
			`)`
			`return TextDocument(`
			`metadata=metadata,`
			`text=b""`
			`)`


			`@pytest.fixture`
			`def mock_message(sample_text_document):`
			`"""Mock message containing a document."""`
			`msg = Mock()`
			`msg.value.return_value = sample_text_document`
			`return msg`


			`@pytest.fixture(autouse=True)`
			`def clear_metrics():`
			`"""Clear metrics before each test to avoid duplicates."""`
			`# Clear the chunk_metric class attribute if it exists`
			`if hasattr(RecursiveChunker, 'chunk_metric'):`
			`# Unregister from Prometheus registry first`
			`try:`
			`REGISTRY.unregister(RecursiveChunker.chunk_metric)`
			`except KeyError:`
			`pass # Already unregistered`
			`delattr(RecursiveChunker, 'chunk_metric')`
			`if hasattr(TokenChunker, 'chunk_metric'):`
			`try:`
			`REGISTRY.unregister(TokenChunker.chunk_metric)`
			`except KeyError:`
			`pass # Already unregistered`
			`delattr(TokenChunker, 'chunk_metric')`
			`yield`
			`# Clean up after test as well`
			`if hasattr(RecursiveChunker, 'chunk_metric'):`
			`try:`
			`REGISTRY.unregister(RecursiveChunker.chunk_metric)`
			`except KeyError:`
			`pass`
			`delattr(RecursiveChunker, 'chunk_metric')`
			`if hasattr(TokenChunker, 'chunk_metric'):`
			`try:`
			`REGISTRY.unregister(TokenChunker.chunk_metric)`
			`except KeyError:`
			`pass`
			`delattr(TokenChunker, 'chunk_metric')`


			`@pytest.fixture`
			`def mock_async_processor_init():`
			`"""Mock AsyncProcessor.__init__ to avoid taskgroup requirement."""`
			`def init_mock(self, **kwargs):`
			`# Set attributes that AsyncProcessor would normally set`
			`self.config_handlers = []`
			`self.specifications = []`
			`self.flows = {}`
			`self.id = kwargs.get('id', 'test-processor')`
			`# Don't call the real __init__`

			`with patch('trustgraph.base.async_processor.AsyncProcessor.__init__', init_mock):`
			`yield`