gomcp/internal/domain/intent/distiller.go

231 lines
7.1 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Package intent provides the Intent Distiller — recursive compression
// of user input into a pure intent vector (DIP H0.2).
//
// The distillation process:
// 1. Embed raw text → surface vector
// 2. Compress the text (drop filler words, keep the leading core words)
// 3. Re-embed compressed text → deep vector
// 4. Compute cosine similarity(previous vector, deep vector); on the first pass the previous vector is the surface vector
// 5. If similarity >= threshold → converged (intent = deep vector)
// 6. If similarity < threshold → iterate with further compression
// 7. Final sincerity check: high divergence between surface and deep = manipulation
package intent
import (
"context"
"fmt"
"math"
"strings"
"time"
)
// EmbeddingFunc abstracts the embedding computation: given a context and a
// piece of text it returns the embedding vector for that text, or an error.
// Per the original note, implementations bridge to a Python NLP backend.
// Distill calls it once for the raw text and once per compression step.
type EmbeddingFunc func(ctx context.Context, text string) ([]float64, error)
// DistillConfig configures the distillation pipeline.
//
// NewDistiller treats any field that is zero or negative as unset and keeps
// the corresponding value from DefaultConfig for it.
type DistillConfig struct {
	MaxIterations int // Upper bound on compression/re-embedding rounds (default: 5)
	ConvergenceThreshold float64 // Cosine similarity between successive embeddings at or above which the loop stops (default: 0.92)
	SincerityThreshold float64 // Max surface-deep divergence (1 - cosine similarity) before flagging manipulation (default: 0.35)
	MinTextLength int // Minimum trimmed text length Distill accepts; also the floor below which compression stops (default: 10)
}
// DefaultConfig returns the baseline configuration: at most 5 iterations,
// convergence at a cosine similarity of 0.92, a sincerity divergence limit
// of 0.35, and a minimum text length of 10.
func DefaultConfig() DistillConfig {
	var cfg DistillConfig
	cfg.MaxIterations = 5
	cfg.ConvergenceThreshold = 0.92
	cfg.SincerityThreshold = 0.35
	cfg.MinTextLength = 10
	return cfg
}
// DistillResult holds the output of intent distillation as produced by
// Distiller.Distill. All fields are serialized to JSON under the tag names
// shown.
type DistillResult struct {
	// Core outputs
	IntentVector []float64 `json:"intent_vector"` // Embedding of the final compressed text (the surface vector if no compression happened)
	SurfaceVector []float64 `json:"surface_vector"` // Embedding of the raw, uncompressed input text
	CompressedText string `json:"compressed_text"` // Final compressed form of the input
	// Metrics
	Iterations int `json:"iterations"` // Number of loop iterations entered, including one that stopped because nothing more could be compressed
	Convergence float64 `json:"convergence"` // Cosine similarity from the last convergence check (0 if no check ran)
	SincerityScore float64 `json:"sincerity_score"` // Cosine similarity of surface vs. intent vector: 1.0 = sincere, 0.0 = manipulative
	IsSincere bool `json:"is_sincere"` // True when 1 - SincerityScore is within the configured sincerity threshold
	IsManipulation bool `json:"is_manipulation"` // Always the negation of IsSincere
	// Timing
	DurationMs int64 `json:"duration_ms"` // Wall-clock duration of the Distill call in milliseconds
}
// Distiller performs recursive intent extraction. Create one with
// NewDistiller; the zero value has no embedding function and an all-zero
// configuration.
type Distiller struct {
	cfg DistillConfig // effective settings (defaults merged with caller overrides by NewDistiller)
	embed EmbeddingFunc // embedding backend invoked for the raw text and each compressed form
}
// NewDistiller builds an Intent Distiller around embedFn.
//
// The distiller starts from DefaultConfig. When cfg is non-nil, every field
// of cfg that is strictly positive replaces the matching default; zero or
// negative fields are ignored. cfg itself is never retained or modified.
func NewDistiller(embedFn EmbeddingFunc, cfg *DistillConfig) *Distiller {
	d := &Distiller{cfg: DefaultConfig(), embed: embedFn}
	if cfg == nil {
		return d
	}

	if v := cfg.MaxIterations; v > 0 {
		d.cfg.MaxIterations = v
	}
	if v := cfg.ConvergenceThreshold; v > 0 {
		d.cfg.ConvergenceThreshold = v
	}
	if v := cfg.SincerityThreshold; v > 0 {
		d.cfg.SincerityThreshold = v
	}
	if v := cfg.MinTextLength; v > 0 {
		d.cfg.MinTextLength = v
	}
	return d
}
// Distill performs recursive intent distillation on the input text.
//
// The raw text is embedded once (the surface vector). The text is then
// repeatedly compressed with compressText and re-embedded; after each round
// the new embedding is compared with the previous one, and the loop stops
// when their cosine similarity reaches ConvergenceThreshold, when the text
// cannot be compressed further (or would drop below MinTextLength), or
// after MaxIterations rounds. Finally the surface vector is compared with
// the last embedding: if 1 - similarity exceeds SincerityThreshold the
// input is flagged as manipulation.
//
// ctx is passed through unchanged to every embedding call.
//
// Distill returns an error when the trimmed text has fewer than
// MinTextLength characters, when the distiller has no embedding function,
// or when an embedding call fails (the underlying error is wrapped).
func (d *Distiller) Distill(ctx context.Context, text string) (*DistillResult, error) {
	start := time.Now()

	// Lengths are measured in runes, not bytes, so multi-byte text (for
	// example Cyrillic) is held to the same character limit as ASCII text
	// and the "chars" in the message below is accurate.
	if len([]rune(strings.TrimSpace(text))) < d.cfg.MinTextLength {
		return nil, fmt.Errorf("text too short for distillation (min %d chars)", d.cfg.MinTextLength)
	}

	// A distiller created with a nil embedding function would panic on the
	// first call below; surface that as an ordinary error instead.
	if d.embed == nil {
		return nil, fmt.Errorf("distiller has no embedding function")
	}

	// Step 1: Surface embedding (raw text as-is).
	surfaceVec, err := d.embed(ctx, text)
	if err != nil {
		return nil, fmt.Errorf("surface embedding: %w", err)
	}

	// Step 2: Iterative compression loop.
	currentText := text
	currentVec := surfaceVec
	iterations := 0
	convergence := 0.0
	for i := 0; i < d.cfg.MaxIterations; i++ {
		// NOTE(review): the counter is bumped before the "cannot compress"
		// exit, so an iteration that does no embedding is still counted.
		iterations = i + 1

		// Compress text: extract core phrases.
		compressed := compressText(currentText)
		if compressed == currentText || len([]rune(compressed)) < d.cfg.MinTextLength {
			break // Cannot compress further
		}

		// Re-embed compressed text. The index in the message is zero-based.
		prevVec := currentVec
		currentVec, err = d.embed(ctx, compressed)
		if err != nil {
			return nil, fmt.Errorf("iteration %d embedding: %w", i, err)
		}
		currentText = compressed

		// Check convergence against the embedding from the previous round.
		convergence = cosineSimilarity(prevVec, currentVec)
		if convergence >= d.cfg.ConvergenceThreshold {
			break // Intent has stabilized
		}
	}

	// Step 3: Sincerity check — how far the distilled meaning drifted from
	// the meaning of the text as originally written.
	surfaceDeepSim := cosineSimilarity(surfaceVec, currentVec)
	divergence := 1.0 - surfaceDeepSim
	isSincere := divergence <= d.cfg.SincerityThreshold

	return &DistillResult{
		IntentVector:   currentVec,
		SurfaceVector:  surfaceVec,
		CompressedText: currentText,
		Iterations:     iterations,
		Convergence:    convergence,
		SincerityScore: surfaceDeepSim,
		IsSincere:      isSincere,
		IsManipulation: !isSincere,
		DurationMs:     time.Since(start).Milliseconds(),
	}, nil
}
// compressFillers is the set of lower-cased English and Russian tokens that
// compressText discards: politeness markers, hypothetical framing, articles,
// auxiliaries, pronouns and short prepositions. It is built once and only
// ever read.
var compressFillers = map[string]bool{
	"please": true, "пожалуйста": true, "kindly": true,
	"just": true, "simply": true, "только": true,
	"imagine": true, "представь": true, "pretend": true,
	"suppose": true, "допустим": true, "assuming": true,
	"hypothetically": true, "гипотетически": true,
	"for": true, "для": true, "as": true, "как": true,
	"the": true, "a": true, "an": true, "и": true,
	"is": true, "are": true, "was": true, "were": true,
	"that": true, "this": true, "these": true, "those": true,
	"будь": true, "будьте": true, "можешь": true,
	"could": true, "would": true, "should": true,
	"actually": true, "really": true, "very": true,
	"you": true, "your": true, "ты": true, "твой": true,
	"my": true, "мой": true, "i": true, "я": true,
	"в": true, "на": true, "с": true, "к": true,
	"не": true, "но": true, "из": true, "от": true,
}

// compressText extracts the semantic core of text by removing
// filler words, decorations, and social engineering wrappers.
//
// The text is split on whitespace. Inputs of three words or fewer are
// returned unchanged. Otherwise each word is lower-cased and stripped of
// surrounding punctuation for matching only; words that are fillers or at
// most one character long are dropped, and the surviving words (in their
// original spelling) are kept in order. The result is capped at 70% of the
// original word count (never fewer than three words) and joined with single
// spaces. If every word would be dropped, the original text is returned.
func compressText(text string) string {
	words := strings.Fields(text)
	if len(words) <= 3 {
		return text
	}

	core := make([]string, 0, len(words))
	for _, w := range words {
		// Strip punctuation for the check, keep the original word.
		cleaned := strings.Trim(strings.ToLower(w), ".,!?;:'\"()-[]{}«»")
		// Count runes rather than bytes so single-letter non-ASCII tokens
		// are treated the same as single-letter ASCII ones.
		if compressFillers[cleaned] || len([]rune(cleaned)) <= 1 {
			continue
		}
		core = append(core, w)
	}
	if len(core) == 0 {
		return text // Don't compress to nothing
	}

	// Keep max 70% of original words (progressive compression): repeated
	// calls keep shrinking the text even after all fillers are gone.
	maxWords := int(float64(len(words)) * 0.7)
	if maxWords < 3 {
		maxWords = 3
	}
	if len(core) > maxWords {
		core = core[:maxWords]
	}
	return strings.Join(core, " ")
}
// cosineSimilarity returns the cosine of the angle between vectors a and b,
// i.e. their dot product divided by the product of their Euclidean norms.
//
// It returns 0 when the vectors are empty, differ in length, or when either
// has zero magnitude.
func cosineSimilarity(a, b []float64) float64 {
	n := len(a)
	if n == 0 || n != len(b) {
		return 0
	}

	var (
		dot  float64
		sumA float64
		sumB float64
	)
	for i, x := range a {
		y := b[i]
		dot += x * y
		sumA += x * x
		sumB += y * y
	}

	if denom := math.Sqrt(sumA) * math.Sqrt(sumB); denom != 0 {
		return dot / denom
	}
	return 0
}