gomcp/internal/application/shadow_ai/doc_bridge.go

357 lines
9.8 KiB
Go

// Copyright 2026 Syntrex Lab. All rights reserved.
// Use of this source code is governed by an Apache-2.0 license
// that can be found in the LICENSE file.
package shadow_ai
import (
"crypto/sha256"
"fmt"
"regexp"
"strings"
"sync"
"time"
)
// --- Document Review Bridge ---
// Controlled gateway for AI access: scans documents for secrets and PII,
// supports content redaction, and routes through the approval workflow.
// DocReviewStatus tracks the lifecycle of a document review.
type DocReviewStatus string
const (
DocReviewPending DocReviewStatus = "pending"
DocReviewScanning DocReviewStatus = "scanning"
DocReviewClean DocReviewStatus = "clean"
DocReviewRedacted DocReviewStatus = "redacted"
DocReviewBlocked DocReviewStatus = "blocked"
DocReviewApproved DocReviewStatus = "approved"
)
// ScanResult contains the results of scanning a document.
type ScanResult struct {
DocumentID string `json:"document_id"`
Status DocReviewStatus `json:"status"`
PIIFound []PIIMatch `json:"pii_found,omitempty"`
SecretsFound []SecretMatch `json:"secrets_found,omitempty"`
DataClass DataClassification `json:"data_classification"`
ContentHash string `json:"content_hash"`
ScannedAt time.Time `json:"scanned_at"`
SizeBytes int `json:"size_bytes"`
}
// PIIMatch represents a detected PII pattern in content.
type PIIMatch struct {
Type string `json:"type"` // "email", "phone", "ssn", "credit_card", "passport"
Location int `json:"location"` // Character offset
Length int `json:"length"`
Masked string `json:"masked"` // Redacted value, e.g., "j***@example.com"
}
// SecretMatch represents a detected secret/API key in content.
type SecretMatch struct {
Type string `json:"type"` // "api_key", "password", "token", "private_key"
Location int `json:"location"`
Length int `json:"length"`
Provider string `json:"provider"` // "OpenAI", "AWS", "GitHub", etc.
}
// DocBridge manages document scanning, redaction, and review workflow.
type DocBridge struct {
mu sync.RWMutex
reviews map[string]*ScanResult
piiPatterns []*piiPattern
secretPats []secretPattern // Cached compiled patterns
signatures *AISignatureDB // Reused across scans
maxDocSize int // bytes
}
type piiPattern struct {
name string
regex *regexp.Regexp
maskFn func(string) string
}
// NewDocBridge creates a new Document Review Bridge.
func NewDocBridge() *DocBridge {
return &DocBridge{
reviews: make(map[string]*ScanResult),
piiPatterns: defaultPIIPatterns(),
secretPats: secretPatterns(),
signatures: NewAISignatureDB(),
maxDocSize: 10 * 1024 * 1024, // 10 MB
}
}
// ScanDocument scans content for PII and secrets, classifies data, returns result.
func (db *DocBridge) ScanDocument(docID, content, userID string) *ScanResult {
result := &ScanResult{
DocumentID: docID,
Status: DocReviewScanning,
ScannedAt: time.Now(),
SizeBytes: len(content),
}
// Content hash for dedup.
h := sha256.Sum256([]byte(content))
result.ContentHash = fmt.Sprintf("%x", h[:])
// Size check.
if len(content) > db.maxDocSize {
result.Status = DocReviewBlocked
result.DataClass = DataCritical
db.store(result)
return result
}
// Scan for PII.
result.PIIFound = db.scanPII(content)
// Scan for secrets (reuse cached signature DB).
if keyType := db.signatures.ScanForAPIKeys(content); keyType != "" {
result.SecretsFound = append(result.SecretsFound, SecretMatch{
Type: "api_key",
Provider: keyType,
})
}
// Scan for additional secret patterns.
result.SecretsFound = append(result.SecretsFound, db.scanSecrets(content)...)
// Classify data based on findings.
result.DataClass = db.classifyData(result)
// Set status based on findings.
if len(result.SecretsFound) > 0 {
result.Status = DocReviewBlocked
} else if len(result.PIIFound) > 0 {
result.Status = DocReviewRedacted
} else {
result.Status = DocReviewClean
}
db.store(result)
return result
}
// RedactContent replaces PII and secrets in content with masked values.
func (db *DocBridge) RedactContent(content string) string {
for _, p := range db.piiPatterns {
content = p.regex.ReplaceAllStringFunc(content, p.maskFn)
}
// Redact common secret patterns (cached).
for _, sp := range db.secretPats {
content = sp.regex.ReplaceAllString(content, sp.replacement)
}
return content
}
// GetReview returns a scan result by document ID.
func (db *DocBridge) GetReview(docID string) (*ScanResult, bool) {
db.mu.RLock()
defer db.mu.RUnlock()
r, ok := db.reviews[docID]
if !ok {
return nil, false
}
cp := *r
return &cp, true
}
// RecentReviews returns the N most recent reviews.
func (db *DocBridge) RecentReviews(limit int) []ScanResult {
db.mu.RLock()
defer db.mu.RUnlock()
results := make([]ScanResult, 0, len(db.reviews))
for _, r := range db.reviews {
results = append(results, *r)
}
// Sort by time desc (simple bubble for bounded set).
for i := 0; i < len(results); i++ {
for j := i + 1; j < len(results); j++ {
if results[j].ScannedAt.After(results[i].ScannedAt) {
results[i], results[j] = results[j], results[i]
}
}
}
if len(results) > limit {
results = results[:limit]
}
return results
}
// Stats returns aggregate document review statistics.
func (db *DocBridge) Stats() map[string]int {
db.mu.RLock()
defer db.mu.RUnlock()
stats := map[string]int{
"total": len(db.reviews),
"clean": 0,
"redacted": 0,
"blocked": 0,
}
for _, r := range db.reviews {
switch r.Status {
case DocReviewClean:
stats["clean"]++
case DocReviewRedacted:
stats["redacted"]++
case DocReviewBlocked:
stats["blocked"]++
}
}
return stats
}
func (db *DocBridge) store(result *ScanResult) {
db.mu.Lock()
defer db.mu.Unlock()
db.reviews[result.DocumentID] = result
}
// scanPII runs all PII patterns against content.
func (db *DocBridge) scanPII(content string) []PIIMatch {
var matches []PIIMatch
for _, p := range db.piiPatterns {
locs := p.regex.FindAllStringIndex(content, -1)
for _, loc := range locs {
matched := content[loc[0]:loc[1]]
matches = append(matches, PIIMatch{
Type: p.name,
Location: loc[0],
Length: loc[1] - loc[0],
Masked: p.maskFn(matched),
})
}
}
return matches
}
// scanSecrets scans for common secret patterns beyond AI API keys.
func (db *DocBridge) scanSecrets(content string) []SecretMatch {
var matches []SecretMatch
for _, sp := range db.secretPats {
locs := sp.regex.FindAllStringIndex(content, -1)
for _, loc := range locs {
matches = append(matches, SecretMatch{
Type: sp.secretType,
Location: loc[0],
Length: loc[1] - loc[0],
Provider: sp.provider,
})
}
}
return matches
}
// classifyData determines the data classification level based on scan results.
func (db *DocBridge) classifyData(result *ScanResult) DataClassification {
if len(result.SecretsFound) > 0 {
return DataCritical
}
hasSensitivePII := false
for _, pii := range result.PIIFound {
switch pii.Type {
case "ssn", "credit_card", "passport":
return DataCritical
case "email", "phone":
hasSensitivePII = true
}
}
if hasSensitivePII {
return DataConfidential
}
if result.SizeBytes > 1024*1024 { // >1MB
return DataInternal
}
return DataPublic
}
// --- PII Patterns ---
func defaultPIIPatterns() []*piiPattern {
return []*piiPattern{
{
name: "email",
regex: regexp.MustCompile(`[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}`),
maskFn: func(s string) string {
parts := strings.SplitN(s, "@", 2)
if len(parts) != 2 {
return "***@***"
}
if len(parts[0]) <= 1 {
return "*@" + parts[1]
}
return string(parts[0][0]) + "***@" + parts[1]
},
},
{
name: "phone",
regex: regexp.MustCompile(`\+?[1-9]\d{0,2}[\s\-]?\(?\d{3}\)?[\s\-]?\d{3}[\s\-]?\d{2,4}`),
maskFn: func(s string) string {
if len(s) < 4 {
return "***"
}
return s[:2] + strings.Repeat("*", len(s)-4) + s[len(s)-2:]
},
},
{
name: "ssn",
regex: regexp.MustCompile(`\b\d{3}-\d{2}-\d{4}\b`),
maskFn: func(_ string) string {
return "***-**-****"
},
},
{
name: "credit_card",
regex: regexp.MustCompile(`\b(?:\d{4}[\s\-]?){3}\d{4}\b`),
maskFn: func(s string) string {
clean := strings.ReplaceAll(strings.ReplaceAll(s, "-", ""), " ", "")
if len(clean) < 4 {
return "****"
}
return strings.Repeat("*", len(clean)-4) + clean[len(clean)-4:]
},
},
{
name: "passport",
regex: regexp.MustCompile(`\b[A-Z]{1,2}\d{6,9}\b`),
maskFn: func(s string) string {
if len(s) <= 2 {
return "**"
}
return s[:2] + strings.Repeat("*", len(s)-2)
},
},
}
}
type secretPattern struct {
secretType string
provider string
regex *regexp.Regexp
replacement string
}
func secretPatterns() []secretPattern {
return []secretPattern{
{secretType: "aws_key", provider: "AWS", regex: regexp.MustCompile(`AKIA[0-9A-Z]{16}`), replacement: "[AWS_KEY_REDACTED]"},
{secretType: "github_token", provider: "GitHub", regex: regexp.MustCompile(`ghp_[a-zA-Z0-9]{36}`), replacement: "[GITHUB_TOKEN_REDACTED]"},
{secretType: "github_token", provider: "GitHub", regex: regexp.MustCompile(`github_pat_[a-zA-Z0-9_]{82}`), replacement: "[GITHUB_PAT_REDACTED]"},
{secretType: "slack_token", provider: "Slack", regex: regexp.MustCompile(`xoxb-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24}`), replacement: "[SLACK_TOKEN_REDACTED]"},
{secretType: "private_key", provider: "Generic", regex: regexp.MustCompile(`-----BEGIN (?:RSA |EC |DSA )?PRIVATE KEY-----`), replacement: "[PRIVATE_KEY_REDACTED]"},
{secretType: "password", provider: "Generic", regex: regexp.MustCompile(`(?i)password\s*[=:]\s*['"]?[^\s'"]{8,}`), replacement: "[PASSWORD_REDACTED]"},
{secretType: "connection_string", provider: "Database", regex: regexp.MustCompile(`(?i)(?:mysql|postgres|mongodb)://[^\s]+`), replacement: "[DB_CONN_REDACTED]"},
}
}