mirror of
https://github.com/syntrex-lab/gomcp.git
synced 2026-04-24 20:06:21 +02:00
357 lines
9.8 KiB
Go
357 lines
9.8 KiB
Go
// Copyright 2026 Syntrex Lab. All rights reserved.
|
|
// Use of this source code is governed by an Apache-2.0 license
|
|
// that can be found in the LICENSE file.
|
|
|
|
package shadow_ai
|
|
|
|
import (
|
|
"crypto/sha256"
|
|
"fmt"
|
|
"regexp"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// --- Document Review Bridge ---
|
|
// Controlled gateway for AI access: scans documents for secrets and PII,
|
|
// supports content redaction, and routes through the approval workflow.
|
|
|
|
// DocReviewStatus names the lifecycle state of a document review. It is a
// plain string type, so each value serializes as its literal text.
type DocReviewStatus string

// Review lifecycle states. ScanDocument starts a result in
// DocReviewScanning and finishes it as DocReviewClean, DocReviewRedacted or
// DocReviewBlocked; DocReviewPending and DocReviewApproved are not assigned
// in this file (presumably set by the approval workflow — confirm).
const (
	DocReviewPending  = DocReviewStatus("pending")
	DocReviewScanning = DocReviewStatus("scanning")
	DocReviewClean    = DocReviewStatus("clean")
	DocReviewRedacted = DocReviewStatus("redacted")
	DocReviewBlocked  = DocReviewStatus("blocked")
	DocReviewApproved = DocReviewStatus("approved")
)
|
|
|
|
// ScanResult contains the results of scanning a document.
|
|
type ScanResult struct {
|
|
DocumentID string `json:"document_id"`
|
|
Status DocReviewStatus `json:"status"`
|
|
PIIFound []PIIMatch `json:"pii_found,omitempty"`
|
|
SecretsFound []SecretMatch `json:"secrets_found,omitempty"`
|
|
DataClass DataClassification `json:"data_classification"`
|
|
ContentHash string `json:"content_hash"`
|
|
ScannedAt time.Time `json:"scanned_at"`
|
|
SizeBytes int `json:"size_bytes"`
|
|
}
|
|
|
|
// PIIMatch describes one occurrence of a PII pattern inside scanned content.
type PIIMatch struct {
	// Type is the pattern name: "email", "phone", "ssn", "credit_card" or
	// "passport".
	Type string `json:"type"`
	// Location is the byte offset of the match within the content.
	Location int `json:"location"`
	// Length is the size of the match in bytes.
	Length int `json:"length"`
	// Masked is the redacted rendering of the match, e.g. "j***@example.com".
	Masked string `json:"masked"`
}
|
|
|
|
// SecretMatch describes one detected secret or credential inside scanned
// content.
type SecretMatch struct {
	// Type is the secret category. ScanDocument emits "api_key" for hits
	// from the AI signature database; the pattern table emits "aws_key",
	// "github_token", "slack_token", "private_key", "password" and
	// "connection_string".
	Type string `json:"type"`
	// Location is the byte offset of the match. It is left at zero for
	// "api_key" entries, which carry no position.
	Location int `json:"location"`
	// Length is the size of the match in bytes (zero for "api_key" entries).
	Length int `json:"length"`
	// Provider names the issuing service, e.g. "AWS", "GitHub", "Slack",
	// "Database" or "Generic".
	Provider string `json:"provider"`
}
|
|
|
|
// DocBridge manages document scanning, redaction, and review workflow.
|
|
type DocBridge struct {
|
|
mu sync.RWMutex
|
|
reviews map[string]*ScanResult
|
|
piiPatterns []*piiPattern
|
|
secretPats []secretPattern // Cached compiled patterns
|
|
signatures *AISignatureDB // Reused across scans
|
|
maxDocSize int // bytes
|
|
}
|
|
|
|
// piiPattern bundles a PII detector with the function that masks its hits.
type piiPattern struct {
	// name is reported as PIIMatch.Type.
	name string
	// regex locates occurrences of the pattern in content.
	regex *regexp.Regexp
	// maskFn turns one matched substring into its redacted form.
	maskFn func(string) string
}
|
|
|
|
// NewDocBridge creates a new Document Review Bridge.
|
|
func NewDocBridge() *DocBridge {
|
|
return &DocBridge{
|
|
reviews: make(map[string]*ScanResult),
|
|
piiPatterns: defaultPIIPatterns(),
|
|
secretPats: secretPatterns(),
|
|
signatures: NewAISignatureDB(),
|
|
maxDocSize: 10 * 1024 * 1024, // 10 MB
|
|
}
|
|
}
|
|
|
|
// ScanDocument scans content for PII and secrets, classifies data, returns result.
|
|
func (db *DocBridge) ScanDocument(docID, content, userID string) *ScanResult {
|
|
result := &ScanResult{
|
|
DocumentID: docID,
|
|
Status: DocReviewScanning,
|
|
ScannedAt: time.Now(),
|
|
SizeBytes: len(content),
|
|
}
|
|
|
|
// Content hash for dedup.
|
|
h := sha256.Sum256([]byte(content))
|
|
result.ContentHash = fmt.Sprintf("%x", h[:])
|
|
|
|
// Size check.
|
|
if len(content) > db.maxDocSize {
|
|
result.Status = DocReviewBlocked
|
|
result.DataClass = DataCritical
|
|
db.store(result)
|
|
return result
|
|
}
|
|
|
|
// Scan for PII.
|
|
result.PIIFound = db.scanPII(content)
|
|
|
|
// Scan for secrets (reuse cached signature DB).
|
|
if keyType := db.signatures.ScanForAPIKeys(content); keyType != "" {
|
|
result.SecretsFound = append(result.SecretsFound, SecretMatch{
|
|
Type: "api_key",
|
|
Provider: keyType,
|
|
})
|
|
}
|
|
|
|
// Scan for additional secret patterns.
|
|
result.SecretsFound = append(result.SecretsFound, db.scanSecrets(content)...)
|
|
|
|
// Classify data based on findings.
|
|
result.DataClass = db.classifyData(result)
|
|
|
|
// Set status based on findings.
|
|
if len(result.SecretsFound) > 0 {
|
|
result.Status = DocReviewBlocked
|
|
} else if len(result.PIIFound) > 0 {
|
|
result.Status = DocReviewRedacted
|
|
} else {
|
|
result.Status = DocReviewClean
|
|
}
|
|
|
|
db.store(result)
|
|
return result
|
|
}
|
|
|
|
// RedactContent replaces PII and secrets in content with masked values.
|
|
func (db *DocBridge) RedactContent(content string) string {
|
|
for _, p := range db.piiPatterns {
|
|
content = p.regex.ReplaceAllStringFunc(content, p.maskFn)
|
|
}
|
|
|
|
// Redact common secret patterns (cached).
|
|
for _, sp := range db.secretPats {
|
|
content = sp.regex.ReplaceAllString(content, sp.replacement)
|
|
}
|
|
|
|
return content
|
|
}
|
|
|
|
// GetReview returns a scan result by document ID.
|
|
func (db *DocBridge) GetReview(docID string) (*ScanResult, bool) {
|
|
db.mu.RLock()
|
|
defer db.mu.RUnlock()
|
|
r, ok := db.reviews[docID]
|
|
if !ok {
|
|
return nil, false
|
|
}
|
|
cp := *r
|
|
return &cp, true
|
|
}
|
|
|
|
// RecentReviews returns the N most recent reviews.
|
|
func (db *DocBridge) RecentReviews(limit int) []ScanResult {
|
|
db.mu.RLock()
|
|
defer db.mu.RUnlock()
|
|
|
|
results := make([]ScanResult, 0, len(db.reviews))
|
|
for _, r := range db.reviews {
|
|
results = append(results, *r)
|
|
}
|
|
|
|
// Sort by time desc (simple bubble for bounded set).
|
|
for i := 0; i < len(results); i++ {
|
|
for j := i + 1; j < len(results); j++ {
|
|
if results[j].ScannedAt.After(results[i].ScannedAt) {
|
|
results[i], results[j] = results[j], results[i]
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(results) > limit {
|
|
results = results[:limit]
|
|
}
|
|
return results
|
|
}
|
|
|
|
// Stats returns aggregate document review statistics.
|
|
func (db *DocBridge) Stats() map[string]int {
|
|
db.mu.RLock()
|
|
defer db.mu.RUnlock()
|
|
|
|
stats := map[string]int{
|
|
"total": len(db.reviews),
|
|
"clean": 0,
|
|
"redacted": 0,
|
|
"blocked": 0,
|
|
}
|
|
for _, r := range db.reviews {
|
|
switch r.Status {
|
|
case DocReviewClean:
|
|
stats["clean"]++
|
|
case DocReviewRedacted:
|
|
stats["redacted"]++
|
|
case DocReviewBlocked:
|
|
stats["blocked"]++
|
|
}
|
|
}
|
|
return stats
|
|
}
|
|
|
|
func (db *DocBridge) store(result *ScanResult) {
|
|
db.mu.Lock()
|
|
defer db.mu.Unlock()
|
|
db.reviews[result.DocumentID] = result
|
|
}
|
|
|
|
// scanPII runs all PII patterns against content.
|
|
func (db *DocBridge) scanPII(content string) []PIIMatch {
|
|
var matches []PIIMatch
|
|
for _, p := range db.piiPatterns {
|
|
locs := p.regex.FindAllStringIndex(content, -1)
|
|
for _, loc := range locs {
|
|
matched := content[loc[0]:loc[1]]
|
|
matches = append(matches, PIIMatch{
|
|
Type: p.name,
|
|
Location: loc[0],
|
|
Length: loc[1] - loc[0],
|
|
Masked: p.maskFn(matched),
|
|
})
|
|
}
|
|
}
|
|
return matches
|
|
}
|
|
|
|
// scanSecrets scans for common secret patterns beyond AI API keys.
|
|
func (db *DocBridge) scanSecrets(content string) []SecretMatch {
|
|
var matches []SecretMatch
|
|
for _, sp := range db.secretPats {
|
|
locs := sp.regex.FindAllStringIndex(content, -1)
|
|
for _, loc := range locs {
|
|
matches = append(matches, SecretMatch{
|
|
Type: sp.secretType,
|
|
Location: loc[0],
|
|
Length: loc[1] - loc[0],
|
|
Provider: sp.provider,
|
|
})
|
|
}
|
|
}
|
|
return matches
|
|
}
|
|
|
|
// classifyData determines the data classification level based on scan results.
|
|
func (db *DocBridge) classifyData(result *ScanResult) DataClassification {
|
|
if len(result.SecretsFound) > 0 {
|
|
return DataCritical
|
|
}
|
|
|
|
hasSensitivePII := false
|
|
for _, pii := range result.PIIFound {
|
|
switch pii.Type {
|
|
case "ssn", "credit_card", "passport":
|
|
return DataCritical
|
|
case "email", "phone":
|
|
hasSensitivePII = true
|
|
}
|
|
}
|
|
|
|
if hasSensitivePII {
|
|
return DataConfidential
|
|
}
|
|
|
|
if result.SizeBytes > 1024*1024 { // >1MB
|
|
return DataInternal
|
|
}
|
|
|
|
return DataPublic
|
|
}
|
|
|
|
// --- PII Patterns ---
|
|
|
|
func defaultPIIPatterns() []*piiPattern {
|
|
return []*piiPattern{
|
|
{
|
|
name: "email",
|
|
regex: regexp.MustCompile(`[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}`),
|
|
maskFn: func(s string) string {
|
|
parts := strings.SplitN(s, "@", 2)
|
|
if len(parts) != 2 {
|
|
return "***@***"
|
|
}
|
|
if len(parts[0]) <= 1 {
|
|
return "*@" + parts[1]
|
|
}
|
|
return string(parts[0][0]) + "***@" + parts[1]
|
|
},
|
|
},
|
|
{
|
|
name: "phone",
|
|
regex: regexp.MustCompile(`\+?[1-9]\d{0,2}[\s\-]?\(?\d{3}\)?[\s\-]?\d{3}[\s\-]?\d{2,4}`),
|
|
maskFn: func(s string) string {
|
|
if len(s) < 4 {
|
|
return "***"
|
|
}
|
|
return s[:2] + strings.Repeat("*", len(s)-4) + s[len(s)-2:]
|
|
},
|
|
},
|
|
{
|
|
name: "ssn",
|
|
regex: regexp.MustCompile(`\b\d{3}-\d{2}-\d{4}\b`),
|
|
maskFn: func(_ string) string {
|
|
return "***-**-****"
|
|
},
|
|
},
|
|
{
|
|
name: "credit_card",
|
|
regex: regexp.MustCompile(`\b(?:\d{4}[\s\-]?){3}\d{4}\b`),
|
|
maskFn: func(s string) string {
|
|
clean := strings.ReplaceAll(strings.ReplaceAll(s, "-", ""), " ", "")
|
|
if len(clean) < 4 {
|
|
return "****"
|
|
}
|
|
return strings.Repeat("*", len(clean)-4) + clean[len(clean)-4:]
|
|
},
|
|
},
|
|
{
|
|
name: "passport",
|
|
regex: regexp.MustCompile(`\b[A-Z]{1,2}\d{6,9}\b`),
|
|
maskFn: func(s string) string {
|
|
if len(s) <= 2 {
|
|
return "**"
|
|
}
|
|
return s[:2] + strings.Repeat("*", len(s)-2)
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
// secretPattern couples a compiled secret detector with its reporting labels
// and the placeholder substituted for every match during redaction.
type secretPattern struct {
	secretType  string         // reported as SecretMatch.Type
	provider    string         // reported as SecretMatch.Provider
	regex       *regexp.Regexp // locates the secret in content
	replacement string         // placeholder written over each match
}

// secretPatterns builds the built-in secret detectors. Every call compiles a
// fresh set of regular expressions, so callers should cache the result.
func secretPatterns() []secretPattern {
	// PEM labels recognised in front of "PRIVATE KEY". The group is optional,
	// so the bare PKCS #8 "PRIVATE KEY" label is covered as well.
	const pemLabel = `(?:RSA |EC |DSA |OPENSSH |ENCRYPTED )?`

	// Match the BEGIN line and, when present, everything through the first
	// following END line. Matching only the BEGIN line would make redaction
	// replace the header and leave the key material itself in the output.
	// The body group is optional, so a header without an END marker is still
	// detected on its own.
	privateKey := `-----BEGIN ` + pemLabel + `PRIVATE KEY-----` +
		`(?:(?s:.*?)-----END ` + pemLabel + `PRIVATE KEY-----)?`

	return []secretPattern{
		{
			secretType:  "aws_key",
			provider:    "AWS",
			regex:       regexp.MustCompile(`AKIA[0-9A-Z]{16}`),
			replacement: "[AWS_KEY_REDACTED]",
		},
		{
			secretType:  "github_token",
			provider:    "GitHub",
			regex:       regexp.MustCompile(`ghp_[a-zA-Z0-9]{36}`),
			replacement: "[GITHUB_TOKEN_REDACTED]",
		},
		{
			secretType:  "github_token",
			provider:    "GitHub",
			regex:       regexp.MustCompile(`github_pat_[a-zA-Z0-9_]{82}`),
			replacement: "[GITHUB_PAT_REDACTED]",
		},
		{
			secretType:  "slack_token",
			provider:    "Slack",
			regex:       regexp.MustCompile(`xoxb-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24}`),
			replacement: "[SLACK_TOKEN_REDACTED]",
		},
		{
			secretType:  "private_key",
			provider:    "Generic",
			regex:       regexp.MustCompile(privateKey),
			replacement: "[PRIVATE_KEY_REDACTED]",
		},
		{
			secretType:  "password",
			provider:    "Generic",
			regex:       regexp.MustCompile(`(?i)password\s*[=:]\s*['"]?[^\s'"]{8,}`),
			replacement: "[PASSWORD_REDACTED]",
		},
		{
			secretType:  "connection_string",
			provider:    "Database",
			regex:       regexp.MustCompile(`(?i)(?:mysql|postgres|mongodb)://[^\s]+`),
			replacement: "[DB_CONN_REDACTED]",
		},
	}
}
|