mirror of
https://github.com/syntrex-lab/gomcp.git
synced 2026-05-08 19:12:37 +02:00
initial: Syntrex extraction from sentinel-community (615 files)
This commit is contained in:
commit
2c50c993b1
175 changed files with 32396 additions and 0 deletions
231
internal/domain/intent/distiller.go
Normal file
231
internal/domain/intent/distiller.go
Normal file
|
|
@ -0,0 +1,231 @@
|
|||
// Package intent provides the Intent Distiller — recursive compression
|
||||
// of user input into a pure intent vector (DIP H0.2).
|
||||
//
|
||||
// The distillation process:
|
||||
// 1. Embed raw text → surface vector
|
||||
// 2. Extract key phrases (top-N by TF weight)
|
||||
// 3. Re-embed compressed text → deep vector
|
||||
// 4. Compute cosine similarity(surface, deep)
|
||||
// 5. If similarity >= threshold → converged (intent = deep vector)
|
||||
// 6. If similarity < threshold → iterate with further compression
|
||||
// 7. Final sincerity check: high divergence between surface and deep = manipulation
|
||||
package intent
|
||||
|
||||
import (
	"context"
	"fmt"
	"math"
	"strings"
	"time"
	"unicode/utf8"
)
|
||||
|
||||
// EmbeddingFunc turns a piece of text into an embedding vector.
//
// It is the seam through which the distiller obtains embeddings (the
// implementation is expected to bridge to the Python NLP side). A non-nil
// error aborts the distillation that requested the embedding.
type EmbeddingFunc func(ctx context.Context, text string) ([]float64, error)
|
||||
|
||||
// DistillConfig holds the tunable parameters of the distillation pipeline.
type DistillConfig struct {
	// MaxIterations is the maximum number of distillation iterations
	// (default: 5).
	MaxIterations int

	// ConvergenceThreshold is the cosine similarity at or above which the
	// intent is considered converged (default: 0.92).
	ConvergenceThreshold float64

	// SincerityThreshold is the maximum surface-deep divergence tolerated
	// before the input is flagged as manipulation (default: 0.35).
	SincerityThreshold float64

	// MinTextLength is the minimum text length required to attempt
	// distillation (default: 10).
	MinTextLength int
}
|
||||
|
||||
// DefaultConfig returns sensible defaults.
|
||||
func DefaultConfig() DistillConfig {
|
||||
return DistillConfig{
|
||||
MaxIterations: 5,
|
||||
ConvergenceThreshold: 0.92,
|
||||
SincerityThreshold: 0.35,
|
||||
MinTextLength: 10,
|
||||
}
|
||||
}
|
||||
|
||||
// DistillResult is the outcome of a single intent distillation run.
type DistillResult struct {
	// Core outputs.

	// IntentVector is the pure intent embedding (the last embedding produced).
	IntentVector []float64 `json:"intent_vector"`
	// SurfaceVector is the embedding of the raw, uncompressed text.
	SurfaceVector []float64 `json:"surface_vector"`
	// CompressedText is the final compressed form of the input.
	CompressedText string `json:"compressed_text"`

	// Metrics.

	// Iterations is the number of distillation iterations used.
	Iterations int `json:"iterations"`
	// Convergence is the final cosine similarity between consecutive embeddings.
	Convergence float64 `json:"convergence"`
	// SincerityScore rates sincerity: 1.0 = sincere, 0.0 = manipulative.
	SincerityScore float64 `json:"sincerity_score"`
	// IsSincere reports that the sincerity check passed.
	IsSincere bool `json:"is_sincere"`
	// IsManipulation reports that the sincerity check failed.
	IsManipulation bool `json:"is_manipulation"`

	// Timing.

	// DurationMs is the wall-clock duration of the run in milliseconds.
	DurationMs int64 `json:"duration_ms"`
}
|
||||
|
||||
// Distiller performs recursive intent extraction.
|
||||
type Distiller struct {
|
||||
cfg DistillConfig
|
||||
embed EmbeddingFunc
|
||||
}
|
||||
|
||||
// NewDistiller creates a new Intent Distiller.
|
||||
func NewDistiller(embedFn EmbeddingFunc, cfg *DistillConfig) *Distiller {
|
||||
c := DefaultConfig()
|
||||
if cfg != nil {
|
||||
if cfg.MaxIterations > 0 {
|
||||
c.MaxIterations = cfg.MaxIterations
|
||||
}
|
||||
if cfg.ConvergenceThreshold > 0 {
|
||||
c.ConvergenceThreshold = cfg.ConvergenceThreshold
|
||||
}
|
||||
if cfg.SincerityThreshold > 0 {
|
||||
c.SincerityThreshold = cfg.SincerityThreshold
|
||||
}
|
||||
if cfg.MinTextLength > 0 {
|
||||
c.MinTextLength = cfg.MinTextLength
|
||||
}
|
||||
}
|
||||
return &Distiller{cfg: c, embed: embedFn}
|
||||
}
|
||||
|
||||
// Distill performs recursive intent distillation on the input text.
|
||||
//
|
||||
// The process iteratively compresses the text and compares embeddings
|
||||
// until convergence (the meaning stabilizes) or max iterations.
|
||||
// A sincerity check compares the original surface embedding against
|
||||
// the final deep embedding — high divergence signals manipulation.
|
||||
func (d *Distiller) Distill(ctx context.Context, text string) (*DistillResult, error) {
|
||||
start := time.Now()
|
||||
|
||||
if len(strings.TrimSpace(text)) < d.cfg.MinTextLength {
|
||||
return nil, fmt.Errorf("text too short for distillation (min %d chars)", d.cfg.MinTextLength)
|
||||
}
|
||||
|
||||
// Step 1: Surface embedding (raw text as-is).
|
||||
surfaceVec, err := d.embed(ctx, text)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("surface embedding: %w", err)
|
||||
}
|
||||
|
||||
// Step 2: Iterative compression loop.
|
||||
currentText := text
|
||||
var prevVec []float64
|
||||
currentVec := surfaceVec
|
||||
iterations := 0
|
||||
convergence := 0.0
|
||||
|
||||
for i := 0; i < d.cfg.MaxIterations; i++ {
|
||||
iterations = i + 1
|
||||
|
||||
// Compress text: extract core phrases.
|
||||
compressed := compressText(currentText)
|
||||
if compressed == currentText || len(compressed) < d.cfg.MinTextLength {
|
||||
break // Cannot compress further
|
||||
}
|
||||
|
||||
// Re-embed compressed text.
|
||||
prevVec = currentVec
|
||||
currentVec, err = d.embed(ctx, compressed)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("iteration %d embedding: %w", i, err)
|
||||
}
|
||||
|
||||
// Check convergence.
|
||||
convergence = cosineSimilarity(prevVec, currentVec)
|
||||
if convergence >= d.cfg.ConvergenceThreshold {
|
||||
currentText = compressed
|
||||
break // Intent has stabilized
|
||||
}
|
||||
|
||||
currentText = compressed
|
||||
}
|
||||
|
||||
// Step 3: Sincerity check.
|
||||
surfaceDeepSim := cosineSimilarity(surfaceVec, currentVec)
|
||||
divergence := 1.0 - surfaceDeepSim
|
||||
isSincere := divergence <= d.cfg.SincerityThreshold
|
||||
|
||||
result := &DistillResult{
|
||||
IntentVector: currentVec,
|
||||
SurfaceVector: surfaceVec,
|
||||
CompressedText: currentText,
|
||||
Iterations: iterations,
|
||||
Convergence: convergence,
|
||||
SincerityScore: surfaceDeepSim,
|
||||
IsSincere: isSincere,
|
||||
IsManipulation: !isSincere,
|
||||
DurationMs: time.Since(start).Milliseconds(),
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// distillFillerWords is the set of lower-cased English and Russian tokens that
// compressText discards: politeness markers, hypothetical framing, articles,
// auxiliaries, pronouns and short prepositions. It is read-only after
// initialization.
var distillFillerWords = map[string]bool{
	"please": true, "пожалуйста": true, "kindly": true,
	"just": true, "simply": true, "только": true,
	"imagine": true, "представь": true, "pretend": true,
	"suppose": true, "допустим": true, "assuming": true,
	"hypothetically": true, "гипотетически": true,
	"for": true, "для": true, "as": true, "как": true,
	"the": true, "a": true, "an": true, "и": true,
	"is": true, "are": true, "was": true, "were": true,
	"that": true, "this": true, "these": true, "those": true,
	"будь": true, "будьте": true, "можешь": true,
	"could": true, "would": true, "should": true,
	"actually": true, "really": true, "very": true,
	"you": true, "your": true, "ты": true, "твой": true,
	"my": true, "мой": true, "i": true, "я": true,
	"в": true, "на": true, "с": true, "к": true,
	"не": true, "но": true, "из": true, "от": true,
}

// compressText extracts the semantic core of text by removing filler words,
// decorations, and social engineering wrappers.
//
// The text is split on whitespace. Inputs of three words or fewer are returned
// unchanged. Otherwise each word is lower-cased and stripped of surrounding
// punctuation for matching only; it is dropped when the stripped form is a
// known filler or is at most one character long. Length is counted in runes,
// so one-letter words are dropped regardless of script. Surviving words keep
// their original spelling and punctuation, are capped at 70% of the original
// word count (but never fewer than three), and are joined with single spaces.
//
// If every word would be dropped, the original text is returned unchanged.
func compressText(text string) string {
	words := strings.Fields(text)
	if len(words) <= 3 {
		return text
	}

	core := make([]string, 0, len(words))
	for _, w := range words {
		// Strip punctuation for the check only; the original token is kept.
		cleaned := strings.Trim(strings.ToLower(w), ".,!?;:'\"()-[]{}«»")
		if utf8.RuneCountInString(cleaned) <= 1 || distillFillerWords[cleaned] {
			continue
		}
		core = append(core, w)
	}

	if len(core) == 0 {
		return text // Don't compress to nothing
	}

	// Keep max 70% of original words (progressive compression).
	maxWords := int(float64(len(words)) * 0.7)
	if maxWords < 3 {
		maxWords = 3
	}
	if len(core) > maxWords {
		core = core[:maxWords]
	}

	return strings.Join(core, " ")
}
|
||||
|
||||
// cosineSimilarity returns the cosine of the angle between a and b, i.e. their
// dot product divided by the product of their Euclidean norms.
//
// It returns 0 when the vectors are empty, differ in length, or when either
// one has zero magnitude.
func cosineSimilarity(a, b []float64) float64 {
	n := len(a)
	if n == 0 || n != len(b) {
		return 0
	}

	var (
		inner float64 // running dot product
		sqA   float64 // running squared norm of a
		sqB   float64 // running squared norm of b
	)
	for i, x := range a {
		y := b[i]
		inner += x * y
		sqA += x * x
		sqB += y * y
	}

	if scale := math.Sqrt(sqA) * math.Sqrt(sqB); scale != 0 {
		return inner / scale
	}
	return 0
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue