diff --git a/docs/tech-specs/SCHEMA_REFACTORING_PROPOSAL.md b/docs/tech-specs/SCHEMA_REFACTORING_PROPOSAL.md new file mode 100644 index 00000000..07265e6c --- /dev/null +++ b/docs/tech-specs/SCHEMA_REFACTORING_PROPOSAL.md @@ -0,0 +1,91 @@ +# Schema Directory Refactoring Proposal + +## Current Issues + +1. **Flat structure** - Keeping all schemas in one directory makes it hard to understand their relationships +2. **Mixed concerns** - Core types, domain objects, and API contracts are all mixed together +3. **Unclear naming** - Files like "object.py", "types.py", "topic.py" don't clearly indicate their purpose +4. **No clear layering** - Can't easily see what depends on what + +## Proposed Structure + +``` +trustgraph-base/trustgraph/schema/ +├── __init__.py +├── core/ # Core primitive types used everywhere +│ ├── __init__.py +│ ├── primitives.py # Error, Value, Triple, Field, RowSchema +│ ├── metadata.py # Metadata record +│ └── topic.py # Topic utilities +│ +├── knowledge/ # Knowledge domain models and extraction +│ ├── __init__.py +│ ├── graph.py # EntityContext, EntityContexts, Triples +│ ├── document.py # Document, TextDocument, Chunk +│ ├── knowledge.py # Knowledge extraction types +│ ├── embeddings.py # All embedding-related types (moved from multiple files) +│ └── nlp.py # Definition, Topic, Relationship, Fact types +│ +└── services/ # Service request/response contracts + ├── __init__.py + ├── llm.py # TextCompletion, Embeddings, Tool requests/responses + ├── retrieval.py # GraphRAG, DocumentRAG queries/responses + ├── query.py # GraphEmbeddings, TriplesQuery, DocumentEmbeddings requests/responses + ├── agent.py # Agent requests/responses + ├── flow.py # Flow requests/responses + ├── prompt.py # Prompt service requests/responses + ├── config.py # Configuration service + ├── library.py # Librarian service + └── lookup.py # Lookup service +``` + +## Key Changes + +1. **Hierarchical organization** - Clear separation between core types, knowledge models, and service contracts +2. 
**Better naming**: + - `types.py` → `core/primitives.py` (clearer purpose) + - `object.py` → Split: `ObjectEmbeddings` to `knowledge/embeddings.py`, `Rows` to `knowledge/rows.py` + - `documents.py` → `knowledge/document.py` (singular, consistent); embedding types to `knowledge/embeddings.py`, query request/response pairs to `services/query.py` + - `models.py` → `services/llm.py` (clearer what kind of models) + - `prompt.py` → Split: service parts to `services/prompt.py`, data types to `knowledge/nlp.py` + +3. **Logical grouping**: + - All embedding types consolidated in `knowledge/embeddings.py` + - All LLM-related service contracts in `services/llm.py` + - Clear separation of request/response pairs in the services directory + - Knowledge extraction types grouped with other knowledge domain models + +4. **Dependency clarity**: + - Core types do not depend on the knowledge or services packages + - Knowledge models depend only on core + - Service contracts can depend on both core and knowledge models + +## Migration Benefits + +1. **Easier navigation** - Developers can quickly find what they need +2. **Better modularity** - Clear boundaries between different concerns +3. **Simpler imports** - More intuitive import paths +4. **Future-proof** - Easy to add new knowledge types or services without cluttering the schema root directory + +## Example Import Changes + +```python +# Before +from trustgraph.schema import Error, Triple, GraphEmbeddings, TextCompletionRequest + +# After (the root-level import above keeps working; these paths are more explicit) +from trustgraph.schema.core import Error, Triple +from trustgraph.schema.knowledge import GraphEmbeddings +from trustgraph.schema.services import TextCompletionRequest +``` + +## Implementation Notes + +1. Keep backward compatibility by maintaining imports in the root `__init__.py` +2. Move files gradually, updating imports as needed +3. Consider adding a `legacy.py` that imports everything for the transition period +4. 
Update documentation to reflect new structure + + + +[{"id": "1", "content": "Examine current schema directory structure", "status": "completed", "priority": "high"}, {"id": "2", "content": "Analyze schema files and their purposes", "status": "completed", "priority": "high"}, {"id": "3", "content": "Propose improved naming and structure", "status": "completed", "priority": "high"}] \ No newline at end of file diff --git a/trustgraph-base/trustgraph/schema/README.flows b/trustgraph-base/trustgraph/schema/README.flows new file mode 100644 index 00000000..d418b1f5 --- /dev/null +++ b/trustgraph-base/trustgraph/schema/README.flows @@ -0,0 +1,35 @@ + + pdf- + decoder + + | + v + + chunker + + | + ,------------------+----------- . . . + | | + v v + + extract- extract- + relationships definitions + + | | | + +----------------' | + | v + v + vectorize + triple- + store | + v + + ge-write + +Refactor: + +[] Change vectorize +[] Re-route chunker to extract-* +[] Re-route vectorize to ge-write* +[] Re-route extract-definitions to ge-write* +[] Remove extract-relationships to ge-write routing diff --git a/trustgraph-base/trustgraph/schema/__init__.py b/trustgraph-base/trustgraph/schema/__init__.py index 957ebcbd..387d39e0 100644 --- a/trustgraph-base/trustgraph/schema/__init__.py +++ b/trustgraph-base/trustgraph/schema/__init__.py @@ -1,17 +1,10 @@ -from . types import * -from . prompt import * -from . documents import * -from . models import * -from . object import * -from . topic import * -from . graph import * -from . retrieval import * -from . metadata import * -from . agent import * -from . lookup import * -from . library import * -from . config import * -from . flows import * -from . 
knowledge import * +# Import core types and primitives +from .core import * + +# Import knowledge schemas +from .knowledge import * + +# Import service schemas +from .services import * diff --git a/trustgraph-base/trustgraph/schema/core/__init__.py b/trustgraph-base/trustgraph/schema/core/__init__.py new file mode 100644 index 00000000..989869bb --- /dev/null +++ b/trustgraph-base/trustgraph/schema/core/__init__.py @@ -0,0 +1,3 @@ +from .primitives import * +from .metadata import * +from .topic import * \ No newline at end of file diff --git a/trustgraph-base/trustgraph/schema/metadata.py b/trustgraph-base/trustgraph/schema/core/metadata.py similarity index 88% rename from trustgraph-base/trustgraph/schema/metadata.py rename to trustgraph-base/trustgraph/schema/core/metadata.py index 5922db26..cb2022ac 100644 --- a/trustgraph-base/trustgraph/schema/metadata.py +++ b/trustgraph-base/trustgraph/schema/core/metadata.py @@ -1,6 +1,6 @@ from pulsar.schema import Record, String, Array -from . types import Triple +from .primitives import Triple class Metadata(Record): diff --git a/trustgraph-base/trustgraph/schema/types.py b/trustgraph-base/trustgraph/schema/core/primitives.py similarity index 100% rename from trustgraph-base/trustgraph/schema/types.py rename to trustgraph-base/trustgraph/schema/core/primitives.py diff --git a/trustgraph-base/trustgraph/schema/topic.py b/trustgraph-base/trustgraph/schema/core/topic.py similarity index 100% rename from trustgraph-base/trustgraph/schema/topic.py rename to trustgraph-base/trustgraph/schema/core/topic.py diff --git a/trustgraph-base/trustgraph/schema/documents.py b/trustgraph-base/trustgraph/schema/documents.py deleted file mode 100644 index e479371d..00000000 --- a/trustgraph-base/trustgraph/schema/documents.py +++ /dev/null @@ -1,56 +0,0 @@ - -from pulsar.schema import Record, Bytes, String, Boolean, Integer, Array, Double -from . topic import topic -from . types import Error -from . 
metadata import Metadata - -############################################################################ - -# PDF docs etc. -class Document(Record): - metadata = Metadata() - data = Bytes() - -############################################################################ - -# Text documents / text from PDF - -class TextDocument(Record): - metadata = Metadata() - text = Bytes() - -############################################################################ - -# Chunks of text - -class Chunk(Record): - metadata = Metadata() - chunk = Bytes() - -############################################################################ - -# Document embeddings are embeddings associated with a chunk - -class ChunkEmbeddings(Record): - chunk = Bytes() - vectors = Array(Array(Double())) - -# This is a 'batching' mechanism for the above data -class DocumentEmbeddings(Record): - metadata = Metadata() - chunks = Array(ChunkEmbeddings()) - -############################################################################ - -# Doc embeddings query - -class DocumentEmbeddingsRequest(Record): - vectors = Array(Array(Double())) - limit = Integer() - user = String() - collection = String() - -class DocumentEmbeddingsResponse(Record): - error = Error() - documents = Array(Bytes()) - diff --git a/trustgraph-base/trustgraph/schema/graph.py b/trustgraph-base/trustgraph/schema/graph.py deleted file mode 100644 index 97a99fbd..00000000 --- a/trustgraph-base/trustgraph/schema/graph.py +++ /dev/null @@ -1,71 +0,0 @@ - -from pulsar.schema import Record, Bytes, String, Boolean, Integer, Array, Double - -from . types import Error, Value, Triple -from . topic import topic -from . 
metadata import Metadata - -############################################################################ - -# Entity context are an entity associated with textual context - -class EntityContext(Record): - entity = Value() - context = String() - -# This is a 'batching' mechanism for the above data -class EntityContexts(Record): - metadata = Metadata() - entities = Array(EntityContext()) - -############################################################################ - -# Graph embeddings are embeddings associated with a graph entity - -class EntityEmbeddings(Record): - entity = Value() - vectors = Array(Array(Double())) - -# This is a 'batching' mechanism for the above data -class GraphEmbeddings(Record): - metadata = Metadata() - entities = Array(EntityEmbeddings()) - -############################################################################ - -# Graph embeddings query - -class GraphEmbeddingsRequest(Record): - vectors = Array(Array(Double())) - limit = Integer() - user = String() - collection = String() - -class GraphEmbeddingsResponse(Record): - error = Error() - entities = Array(Value()) - -############################################################################ - -# Graph triples - -class Triples(Record): - metadata = Metadata() - triples = Array(Triple()) - -############################################################################ - -# Triples query - -class TriplesQueryRequest(Record): - s = Value() - p = Value() - o = Value() - limit = Integer() - user = String() - collection = String() - -class TriplesQueryResponse(Record): - error = Error() - triples = Array(Triple()) - diff --git a/trustgraph-base/trustgraph/schema/knowledge/__init__.py b/trustgraph-base/trustgraph/schema/knowledge/__init__.py new file mode 100644 index 00000000..e58e9f25 --- /dev/null +++ b/trustgraph-base/trustgraph/schema/knowledge/__init__.py @@ -0,0 +1,6 @@ +from .graph import * +from .document import * +from .embeddings import * +from .knowledge import * +from .nlp import * 
+from .rows import * \ No newline at end of file diff --git a/trustgraph-base/trustgraph/schema/knowledge/document.py b/trustgraph-base/trustgraph/schema/knowledge/document.py new file mode 100644 index 00000000..f41ee8a6 --- /dev/null +++ b/trustgraph-base/trustgraph/schema/knowledge/document.py @@ -0,0 +1,29 @@ +from pulsar.schema import Record, Bytes + +from ..core.metadata import Metadata +from ..core.topic import topic + +############################################################################ + +# PDF docs etc. +class Document(Record): + metadata = Metadata() + data = Bytes() + +############################################################################ + +# Text documents / text from PDF + +class TextDocument(Record): + metadata = Metadata() + text = Bytes() + +############################################################################ + +# Chunks of text + +class Chunk(Record): + metadata = Metadata() + chunk = Bytes() + +############################################################################ \ No newline at end of file diff --git a/trustgraph-base/trustgraph/schema/knowledge/embeddings.py b/trustgraph-base/trustgraph/schema/knowledge/embeddings.py new file mode 100644 index 00000000..c1b55eba --- /dev/null +++ b/trustgraph-base/trustgraph/schema/knowledge/embeddings.py @@ -0,0 +1,43 @@ +from pulsar.schema import Record, Bytes, String, Boolean, Integer, Array, Double, Map + +from ..core.metadata import Metadata +from ..core.primitives import Value, RowSchema +from ..core.topic import topic + +############################################################################ + +# Graph embeddings are embeddings associated with a graph entity + +class EntityEmbeddings(Record): + entity = Value() + vectors = Array(Array(Double())) + +# This is a 'batching' mechanism for the above data +class GraphEmbeddings(Record): + metadata = Metadata() + entities = Array(EntityEmbeddings()) + 
+############################################################################ + +# Document embeddings are embeddings associated with a chunk + +class ChunkEmbeddings(Record): + chunk = Bytes() + vectors = Array(Array(Double())) + +# This is a 'batching' mechanism for the above data +class DocumentEmbeddings(Record): + metadata = Metadata() + chunks = Array(ChunkEmbeddings()) + +############################################################################ + +# Object embeddings are embeddings associated with the primary key of an +# object + +class ObjectEmbeddings(Record): + metadata = Metadata() + vectors = Array(Array(Double())) + name = String() + key_name = String() + id = String() \ No newline at end of file diff --git a/trustgraph-base/trustgraph/schema/knowledge/graph.py b/trustgraph-base/trustgraph/schema/knowledge/graph.py new file mode 100644 index 00000000..1d55c8f0 --- /dev/null +++ b/trustgraph-base/trustgraph/schema/knowledge/graph.py @@ -0,0 +1,28 @@ +from pulsar.schema import Record, String, Array + +from ..core.primitives import Value, Triple +from ..core.metadata import Metadata +from ..core.topic import topic + +############################################################################ + +# Entity context are an entity associated with textual context + +class EntityContext(Record): + entity = Value() + context = String() + +# This is a 'batching' mechanism for the above data +class EntityContexts(Record): + metadata = Metadata() + entities = Array(EntityContext()) + +############################################################################ + +# Graph triples + +class Triples(Record): + metadata = Metadata() + triples = Array(Triple()) + +############################################################################ \ No newline at end of file diff --git a/trustgraph-base/trustgraph/schema/knowledge.py b/trustgraph-base/trustgraph/schema/knowledge/knowledge.py similarity index 83% rename from trustgraph-base/trustgraph/schema/knowledge.py 
rename to trustgraph-base/trustgraph/schema/knowledge/knowledge.py index 21217153..7cd5450e 100644 --- a/trustgraph-base/trustgraph/schema/knowledge.py +++ b/trustgraph-base/trustgraph/schema/knowledge/knowledge.py @@ -1,11 +1,11 @@ from pulsar.schema import Record, Bytes, String, Array, Long, Boolean -from . types import Triple -from . topic import topic -from . types import Error -from . metadata import Metadata -from . documents import Document, TextDocument -from . graph import Triples, GraphEmbeddings +from ..core.primitives import Triple, Error +from ..core.topic import topic +from ..core.metadata import Metadata +from .document import Document, TextDocument +from .graph import Triples +from .embeddings import GraphEmbeddings # get-kg-core # -> (???) diff --git a/trustgraph-base/trustgraph/schema/knowledge/nlp.py b/trustgraph-base/trustgraph/schema/knowledge/nlp.py new file mode 100644 index 00000000..0ffc3ba1 --- /dev/null +++ b/trustgraph-base/trustgraph/schema/knowledge/nlp.py @@ -0,0 +1,26 @@ +from pulsar.schema import Record, String, Boolean + +from ..core.topic import topic + +############################################################################ + +# NLP extraction data types + +class Definition(Record): + name = String() + definition = String() + +class Topic(Record): + name = String() + definition = String() + +class Relationship(Record): + s = String() + p = String() + o = String() + o_entity = Boolean() + +class Fact(Record): + s = String() + p = String() + o = String() \ No newline at end of file diff --git a/trustgraph-base/trustgraph/schema/knowledge/rows.py b/trustgraph-base/trustgraph/schema/knowledge/rows.py new file mode 100644 index 00000000..8b1c79ef --- /dev/null +++ b/trustgraph-base/trustgraph/schema/knowledge/rows.py @@ -0,0 +1,16 @@ +from pulsar.schema import Record, Array, Map, String + +from ..core.metadata import Metadata +from ..core.primitives import RowSchema +from ..core.topic import topic + 
+############################################################################ + +# Stores rows of information + +class Rows(Record): + metadata = Metadata() + row_schema = RowSchema() + rows = Array(Map(String())) + +############################################################################ \ No newline at end of file diff --git a/trustgraph-base/trustgraph/schema/object.py b/trustgraph-base/trustgraph/schema/object.py deleted file mode 100644 index 6667fdf3..00000000 --- a/trustgraph-base/trustgraph/schema/object.py +++ /dev/null @@ -1,31 +0,0 @@ - -from pulsar.schema import Record, Bytes, String, Boolean, Integer, Array -from pulsar.schema import Double, Map - -from . metadata import Metadata -from . types import Value, RowSchema -from . topic import topic - -############################################################################ - -# Object embeddings are embeddings associated with the primary key of an -# object - -class ObjectEmbeddings(Record): - metadata = Metadata() - vectors = Array(Array(Double())) - name = String() - key_name = String() - id = String() - -############################################################################ - -# Stores rows of information - -class Rows(Record): - metadata = Metadata() - row_schema = RowSchema() - rows = Array(Map(String())) - - - diff --git a/trustgraph-base/trustgraph/schema/services/__init__.py b/trustgraph-base/trustgraph/schema/services/__init__.py new file mode 100644 index 00000000..4fb66b4d --- /dev/null +++ b/trustgraph-base/trustgraph/schema/services/__init__.py @@ -0,0 +1,9 @@ +from .llm import * +from .retrieval import * +from .query import * +from .agent import * +from .flow import * +from .prompt import * +from .config import * +from .library import * +from .lookup import * \ No newline at end of file diff --git a/trustgraph-base/trustgraph/schema/agent.py b/trustgraph-base/trustgraph/schema/services/agent.py similarity index 90% rename from trustgraph-base/trustgraph/schema/agent.py rename to 
trustgraph-base/trustgraph/schema/services/agent.py index ee20a9aa..21d2fe1f 100644 --- a/trustgraph-base/trustgraph/schema/agent.py +++ b/trustgraph-base/trustgraph/schema/services/agent.py @@ -1,8 +1,8 @@ from pulsar.schema import Record, String, Array, Map -from . topic import topic -from . types import Error +from ..core.topic import topic +from ..core.primitives import Error ############################################################################ diff --git a/trustgraph-base/trustgraph/schema/config.py b/trustgraph-base/trustgraph/schema/services/config.py similarity index 95% rename from trustgraph-base/trustgraph/schema/config.py rename to trustgraph-base/trustgraph/schema/services/config.py index 3be63aa3..a0955eab 100644 --- a/trustgraph-base/trustgraph/schema/config.py +++ b/trustgraph-base/trustgraph/schema/services/config.py @@ -1,8 +1,8 @@ from pulsar.schema import Record, Bytes, String, Boolean, Array, Map, Integer -from . topic import topic -from . types import Error +from ..core.topic import topic +from ..core.primitives import Error ############################################################################ diff --git a/trustgraph-base/trustgraph/schema/flows.py b/trustgraph-base/trustgraph/schema/services/flow.py similarity index 95% rename from trustgraph-base/trustgraph/schema/flows.py rename to trustgraph-base/trustgraph/schema/services/flow.py index 28b90f5d..0b5c1bfd 100644 --- a/trustgraph-base/trustgraph/schema/flows.py +++ b/trustgraph-base/trustgraph/schema/services/flow.py @@ -1,8 +1,8 @@ from pulsar.schema import Record, Bytes, String, Boolean, Array, Map, Integer -from . topic import topic -from . 
types import Error +from ..core.topic import topic +from ..core.primitives import Error ############################################################################ diff --git a/trustgraph-base/trustgraph/schema/library.py b/trustgraph-base/trustgraph/schema/services/library.py similarity index 93% rename from trustgraph-base/trustgraph/schema/library.py rename to trustgraph-base/trustgraph/schema/services/library.py index 6504fa78..d9678a90 100644 --- a/trustgraph-base/trustgraph/schema/library.py +++ b/trustgraph-base/trustgraph/schema/services/library.py @@ -1,10 +1,9 @@ from pulsar.schema import Record, Bytes, String, Array, Long -from . types import Triple -from . topic import topic -from . types import Error -from . metadata import Metadata -from . documents import Document, TextDocument +from ..core.primitives import Triple, Error +from ..core.topic import topic +from ..core.metadata import Metadata +from ..knowledge.document import Document, TextDocument # add-document # -> (document_id, document_metadata, content) diff --git a/trustgraph-base/trustgraph/schema/models.py b/trustgraph-base/trustgraph/schema/services/llm.py similarity index 93% rename from trustgraph-base/trustgraph/schema/models.py rename to trustgraph-base/trustgraph/schema/services/llm.py index a3b37e4e..4665bc8a 100644 --- a/trustgraph-base/trustgraph/schema/models.py +++ b/trustgraph-base/trustgraph/schema/services/llm.py @@ -1,8 +1,8 @@ from pulsar.schema import Record, String, Array, Double, Integer -from . topic import topic -from . 
types import Error +from ..core.topic import topic +from ..core.primitives import Error ############################################################################ diff --git a/trustgraph-base/trustgraph/schema/lookup.py b/trustgraph-base/trustgraph/schema/services/lookup.py similarity index 74% rename from trustgraph-base/trustgraph/schema/lookup.py rename to trustgraph-base/trustgraph/schema/services/lookup.py index a88d188e..7cc0bd03 100644 --- a/trustgraph-base/trustgraph/schema/lookup.py +++ b/trustgraph-base/trustgraph/schema/services/lookup.py @@ -1,9 +1,9 @@ from pulsar.schema import Record, String -from . types import Error, Value, Triple -from . topic import topic -from . metadata import Metadata +from ..core.primitives import Error, Value, Triple +from ..core.topic import topic +from ..core.metadata import Metadata ############################################################################ diff --git a/trustgraph-base/trustgraph/schema/prompt.py b/trustgraph-base/trustgraph/schema/services/prompt.py similarity index 59% rename from trustgraph-base/trustgraph/schema/prompt.py rename to trustgraph-base/trustgraph/schema/services/prompt.py index 369ace53..2567f471 100644 --- a/trustgraph-base/trustgraph/schema/prompt.py +++ b/trustgraph-base/trustgraph/schema/services/prompt.py @@ -1,32 +1,12 @@ +from pulsar.schema import Record, String, Map -from pulsar.schema import Record, Bytes, String, Boolean, Array, Map, Integer - -from . topic import topic -from . 
types import Error, RowSchema +from ..core.primitives import Error +from ..core.topic import topic ############################################################################ # Prompt services, abstract the prompt generation -class Definition(Record): - name = String() - definition = String() - -class Topic(Record): - name = String() - definition = String() - -class Relationship(Record): - s = String() - p = String() - o = String() - o_entity = Boolean() - -class Fact(Record): - s = String() - p = String() - o = String() - # extract-definitions: # chunk -> definitions # extract-relationships: @@ -55,5 +35,4 @@ class PromptResponse(Record): # JSON encoded object = String() -############################################################################ - +############################################################################ \ No newline at end of file diff --git a/trustgraph-base/trustgraph/schema/services/query.py b/trustgraph-base/trustgraph/schema/services/query.py new file mode 100644 index 00000000..214a1d4b --- /dev/null +++ b/trustgraph-base/trustgraph/schema/services/query.py @@ -0,0 +1,48 @@ +from pulsar.schema import Record, String, Integer, Array, Double + +from ..core.primitives import Error, Value, Triple +from ..core.topic import topic + +############################################################################ + +# Graph embeddings query + +class GraphEmbeddingsRequest(Record): + vectors = Array(Array(Double())) + limit = Integer() + user = String() + collection = String() + +class GraphEmbeddingsResponse(Record): + error = Error() + entities = Array(Value()) + +############################################################################ + +# Graph triples query + +class TriplesQueryRequest(Record): + user = String() + collection = String() + s = Value() + p = Value() + o = Value() + limit = Integer() + +class TriplesQueryResponse(Record): + error = Error() + triples = Array(Triple()) + 
+############################################################################ + +# Doc embeddings query + +class DocumentEmbeddingsRequest(Record): + vectors = Array(Array(Double())) + limit = Integer() + user = String() + collection = String() + +class DocumentEmbeddingsResponse(Record): + error = Error() + chunks = Array(String()) \ No newline at end of file diff --git a/trustgraph-base/trustgraph/schema/retrieval.py b/trustgraph-base/trustgraph/schema/services/retrieval.py similarity index 91% rename from trustgraph-base/trustgraph/schema/retrieval.py rename to trustgraph-base/trustgraph/schema/services/retrieval.py index 1077e4f9..ee96bb1e 100644 --- a/trustgraph-base/trustgraph/schema/retrieval.py +++ b/trustgraph-base/trustgraph/schema/services/retrieval.py @@ -1,7 +1,7 @@ from pulsar.schema import Record, Bytes, String, Boolean, Integer, Array, Double -from . topic import topic -from . types import Error, Value +from ..core.topic import topic +from ..core.primitives import Error, Value ############################################################################