mirror of
https://github.com/syntrex-lab/gomcp.git
synced 2026-05-07 10:32:36 +02:00
232 lines
7.1 KiB
Go
232 lines
7.1 KiB
Go
|
|
// Package intent provides the Intent Distiller — recursive compression
|
|||
|
|
// of user input into a pure intent vector (DIP H0.2).
|
|||
|
|
//
|
|||
|
|
// The distillation process:
|
|||
|
|
// 1. Embed raw text → surface vector
|
|||
|
|
// 2. Compress the text (drop filler words, keep at most 70% of the words)
|
|||
|
|
// 3. Re-embed compressed text → deep vector
|
|||
|
|
// 4. Compute cosine similarity(surface, deep)
|
|||
|
|
// 5. If similarity >= threshold → converged (intent = deep vector)
|
|||
|
|
// 6. If similarity < threshold → iterate with further compression
|
|||
|
|
// 7. Final sincerity check: high divergence between surface and deep = manipulation
|
|||
|
|
package intent
|
|||
|
|
|
|||
|
|
import (
|
|||
|
|
"context"
|
|||
|
|
"fmt"
|
|||
|
|
"math"
|
|||
|
|
"strings"
|
|||
|
|
"time"
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
// EmbeddingFunc is the hook through which the distiller obtains an embedding
// vector for a piece of text. It hides the actual embedding backend (described
// by the original author as a bridge to Python NLP) behind a plain function.
//
// It receives the caller's context and the text to embed, and returns the
// embedding as a float64 slice or an error.
type EmbeddingFunc func(ctx context.Context, text string) ([]float64, error)
|
|||
|
|
|
|||
|
|
// DistillConfig holds the tunable parameters of the distillation pipeline.
type DistillConfig struct {
	// MaxIterations is the maximum number of distillation iterations
	// (default: 5).
	MaxIterations int

	// ConvergenceThreshold is the cosine similarity at or above which
	// distillation is considered converged (default: 0.92).
	ConvergenceThreshold float64

	// SincerityThreshold is the maximum surface-deep divergence tolerated
	// before the input is flagged as manipulation (default: 0.35).
	SincerityThreshold float64

	// MinTextLength is the minimum text length required to attempt
	// distillation (default: 10).
	MinTextLength int
}
|
|||
|
|
|
|||
|
|
// DefaultConfig returns sensible defaults.
|
|||
|
|
func DefaultConfig() DistillConfig {
|
|||
|
|
return DistillConfig{
|
|||
|
|
MaxIterations: 5,
|
|||
|
|
ConvergenceThreshold: 0.92,
|
|||
|
|
SincerityThreshold: 0.35,
|
|||
|
|
MinTextLength: 10,
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// DistillResult is the outcome of a single intent distillation run.
type DistillResult struct {
	// Core outputs.

	// IntentVector is the pure intent embedding.
	IntentVector []float64 `json:"intent_vector"`
	// SurfaceVector is the embedding of the raw input text.
	SurfaceVector []float64 `json:"surface_vector"`
	// CompressedText is the final compressed form of the input.
	CompressedText string `json:"compressed_text"`

	// Metrics.

	// Iterations is the number of distillation iterations used.
	Iterations int `json:"iterations"`
	// Convergence is the final cosine similarity between successive embeddings.
	Convergence float64 `json:"convergence"`
	// SincerityScore rates the input: 1.0 = sincere, 0.0 = manipulative.
	SincerityScore float64 `json:"sincerity_score"`
	// IsSincere reports that the sincerity check passed.
	IsSincere bool `json:"is_sincere"`
	// IsManipulation reports that the sincerity check failed.
	IsManipulation bool `json:"is_manipulation"`

	// Timing.

	// DurationMs is the duration of the run in milliseconds.
	DurationMs int64 `json:"duration_ms"`
}
|
|||
|
|
|
|||
|
|
// Distiller performs recursive intent extraction.
|
|||
|
|
type Distiller struct {
|
|||
|
|
cfg DistillConfig
|
|||
|
|
embed EmbeddingFunc
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// NewDistiller creates a new Intent Distiller.
|
|||
|
|
func NewDistiller(embedFn EmbeddingFunc, cfg *DistillConfig) *Distiller {
|
|||
|
|
c := DefaultConfig()
|
|||
|
|
if cfg != nil {
|
|||
|
|
if cfg.MaxIterations > 0 {
|
|||
|
|
c.MaxIterations = cfg.MaxIterations
|
|||
|
|
}
|
|||
|
|
if cfg.ConvergenceThreshold > 0 {
|
|||
|
|
c.ConvergenceThreshold = cfg.ConvergenceThreshold
|
|||
|
|
}
|
|||
|
|
if cfg.SincerityThreshold > 0 {
|
|||
|
|
c.SincerityThreshold = cfg.SincerityThreshold
|
|||
|
|
}
|
|||
|
|
if cfg.MinTextLength > 0 {
|
|||
|
|
c.MinTextLength = cfg.MinTextLength
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
return &Distiller{cfg: c, embed: embedFn}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Distill performs recursive intent distillation on the input text.
|
|||
|
|
//
|
|||
|
|
// The process iteratively compresses the text and compares embeddings
|
|||
|
|
// until convergence (the meaning stabilizes) or max iterations.
|
|||
|
|
// A sincerity check compares the original surface embedding against
|
|||
|
|
// the final deep embedding — high divergence signals manipulation.
|
|||
|
|
func (d *Distiller) Distill(ctx context.Context, text string) (*DistillResult, error) {
|
|||
|
|
start := time.Now()
|
|||
|
|
|
|||
|
|
if len(strings.TrimSpace(text)) < d.cfg.MinTextLength {
|
|||
|
|
return nil, fmt.Errorf("text too short for distillation (min %d chars)", d.cfg.MinTextLength)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Step 1: Surface embedding (raw text as-is).
|
|||
|
|
surfaceVec, err := d.embed(ctx, text)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("surface embedding: %w", err)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Step 2: Iterative compression loop.
|
|||
|
|
currentText := text
|
|||
|
|
var prevVec []float64
|
|||
|
|
currentVec := surfaceVec
|
|||
|
|
iterations := 0
|
|||
|
|
convergence := 0.0
|
|||
|
|
|
|||
|
|
for i := 0; i < d.cfg.MaxIterations; i++ {
|
|||
|
|
iterations = i + 1
|
|||
|
|
|
|||
|
|
// Compress text: extract core phrases.
|
|||
|
|
compressed := compressText(currentText)
|
|||
|
|
if compressed == currentText || len(compressed) < d.cfg.MinTextLength {
|
|||
|
|
break // Cannot compress further
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Re-embed compressed text.
|
|||
|
|
prevVec = currentVec
|
|||
|
|
currentVec, err = d.embed(ctx, compressed)
|
|||
|
|
if err != nil {
|
|||
|
|
return nil, fmt.Errorf("iteration %d embedding: %w", i, err)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Check convergence.
|
|||
|
|
convergence = cosineSimilarity(prevVec, currentVec)
|
|||
|
|
if convergence >= d.cfg.ConvergenceThreshold {
|
|||
|
|
currentText = compressed
|
|||
|
|
break // Intent has stabilized
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
currentText = compressed
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Step 3: Sincerity check.
|
|||
|
|
surfaceDeepSim := cosineSimilarity(surfaceVec, currentVec)
|
|||
|
|
divergence := 1.0 - surfaceDeepSim
|
|||
|
|
isSincere := divergence <= d.cfg.SincerityThreshold
|
|||
|
|
|
|||
|
|
result := &DistillResult{
|
|||
|
|
IntentVector: currentVec,
|
|||
|
|
SurfaceVector: surfaceVec,
|
|||
|
|
CompressedText: currentText,
|
|||
|
|
Iterations: iterations,
|
|||
|
|
Convergence: convergence,
|
|||
|
|
SincerityScore: surfaceDeepSim,
|
|||
|
|
IsSincere: isSincere,
|
|||
|
|
IsManipulation: !isSincere,
|
|||
|
|
DurationMs: time.Since(start).Milliseconds(),
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
return result, nil
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// compressFillerWords lists the lower-cased words (English and Russian) that
// compressText treats as filler, politeness or framing and removes. It is
// built once and only ever read.
var compressFillerWords = map[string]bool{
	"please": true, "пожалуйста": true, "kindly": true,
	"just": true, "simply": true, "только": true,
	"imagine": true, "представь": true, "pretend": true,
	"suppose": true, "допустим": true, "assuming": true,
	"hypothetically": true, "гипотетически": true,
	"for": true, "для": true, "as": true, "как": true,
	"the": true, "a": true, "an": true, "и": true,
	"is": true, "are": true, "was": true, "were": true,
	"that": true, "this": true, "these": true, "those": true,
	"будь": true, "будьте": true, "можешь": true,
	"could": true, "would": true, "should": true,
	"actually": true, "really": true, "very": true,
	"you": true, "your": true, "ты": true, "твой": true,
	"my": true, "мой": true, "i": true, "я": true,
	"в": true, "на": true, "с": true, "к": true,
	"не": true, "но": true, "из": true, "от": true,
}

// compressText extracts the semantic core of text by removing
// filler words, decorations, and social engineering wrappers.
//
// The text is split on whitespace. A word is dropped when, after
// lower-casing and trimming surrounding punctuation, it is a known filler
// word or is at most one character long (counted in runes, so single-letter
// non-ASCII words are dropped as well). Kept words retain their original
// spelling and punctuation. At most 70% of the original word count (but
// never fewer than 3 words) is kept, taking words from the start. The
// result is joined with single spaces.
//
// Texts of three words or fewer, and texts that would compress to nothing,
// are returned unchanged.
func compressText(text string) string {
	words := strings.Fields(text)
	if len(words) <= 3 {
		return text
	}

	core := make([]string, 0, len(words))
	for _, w := range words {
		// Strip punctuation for the check only; the original word is kept.
		cleaned := strings.Trim(strings.ToLower(w), ".,!?;:'\"()-[]{}«»")
		if compressFillerWords[cleaned] {
			continue
		}
		// Rune count, not byte length: a one-letter Cyrillic word is two bytes.
		if len([]rune(cleaned)) <= 1 {
			continue
		}
		core = append(core, w)
	}

	if len(core) == 0 {
		return text // Don't compress to nothing
	}

	// Keep max 70% of original words (progressive compression).
	maxWords := int(float64(len(words)) * 0.7)
	if maxWords < 3 {
		maxWords = 3
	}
	if len(core) > maxWords {
		core = core[:maxWords]
	}

	return strings.Join(core, " ")
}
|
|||
|
|
|
|||
|
|
// cosineSimilarity computes cosine similarity between two vectors.
//
// It returns 0 when the vectors differ in length, are empty, or either has
// zero magnitude. The result is always a finite value in [-1, 1]: rounding
// overshoot is clamped, and a NaN outcome (from NaN or infinite components,
// or from overflow while accumulating) is reported as 0 so that it cannot
// leak into downstream scores.
func cosineSimilarity(a, b []float64) float64 {
	if len(a) != len(b) || len(a) == 0 {
		return 0
	}

	var dot, normA, normB float64
	for i := range a {
		dot += a[i] * b[i]
		normA += a[i] * a[i]
		normB += b[i] * b[i]
	}

	denom := math.Sqrt(normA) * math.Sqrt(normB)
	if denom == 0 {
		return 0
	}

	sim := dot / denom
	switch {
	case math.IsNaN(sim):
		return 0 // undefined similarity is treated as "no similarity"
	case sim > 1:
		return 1 // floating-point rounding overshoot
	case sim < -1:
		return -1
	}
	return sim
}
|