mirror of
https://github.com/syntrex-lab/gomcp.git
synced 2026-04-26 21:06:21 +02:00
Release prep: 54 engines, self-hosted signatures, i18n, dashboard updates
This commit is contained in:
parent
694e32be26
commit
41cbfd6e0a
178 changed files with 36008 additions and 399 deletions
353
internal/application/shadow_ai/doc_bridge.go
Normal file
353
internal/application/shadow_ai/doc_bridge.go
Normal file
|
|
@ -0,0 +1,353 @@
|
|||
package shadow_ai
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// --- Document Review Bridge ---
|
||||
// Controlled gateway for AI access: scans documents for secrets and PII,
|
||||
// supports content redaction, and routes through the approval workflow.
|
||||
|
||||
// DocReviewStatus tracks the lifecycle of a document review.
|
||||
type DocReviewStatus string
|
||||
|
||||
const (
|
||||
DocReviewPending DocReviewStatus = "pending"
|
||||
DocReviewScanning DocReviewStatus = "scanning"
|
||||
DocReviewClean DocReviewStatus = "clean"
|
||||
DocReviewRedacted DocReviewStatus = "redacted"
|
||||
DocReviewBlocked DocReviewStatus = "blocked"
|
||||
DocReviewApproved DocReviewStatus = "approved"
|
||||
)
|
||||
|
||||
// ScanResult contains the results of scanning a document.
|
||||
type ScanResult struct {
|
||||
DocumentID string `json:"document_id"`
|
||||
Status DocReviewStatus `json:"status"`
|
||||
PIIFound []PIIMatch `json:"pii_found,omitempty"`
|
||||
SecretsFound []SecretMatch `json:"secrets_found,omitempty"`
|
||||
DataClass DataClassification `json:"data_classification"`
|
||||
ContentHash string `json:"content_hash"`
|
||||
ScannedAt time.Time `json:"scanned_at"`
|
||||
SizeBytes int `json:"size_bytes"`
|
||||
}
|
||||
|
||||
// PIIMatch represents a detected PII pattern in content.
|
||||
type PIIMatch struct {
|
||||
Type string `json:"type"` // "email", "phone", "ssn", "credit_card", "passport"
|
||||
Location int `json:"location"` // Character offset
|
||||
Length int `json:"length"`
|
||||
Masked string `json:"masked"` // Redacted value, e.g., "j***@example.com"
|
||||
}
|
||||
|
||||
// SecretMatch represents a detected secret/API key in content.
|
||||
type SecretMatch struct {
|
||||
Type string `json:"type"` // "api_key", "password", "token", "private_key"
|
||||
Location int `json:"location"`
|
||||
Length int `json:"length"`
|
||||
Provider string `json:"provider"` // "OpenAI", "AWS", "GitHub", etc.
|
||||
}
|
||||
|
||||
// DocBridge manages document scanning, redaction, and review workflow.
|
||||
type DocBridge struct {
|
||||
mu sync.RWMutex
|
||||
reviews map[string]*ScanResult
|
||||
piiPatterns []*piiPattern
|
||||
secretPats []secretPattern // Cached compiled patterns
|
||||
signatures *AISignatureDB // Reused across scans
|
||||
maxDocSize int // bytes
|
||||
}
|
||||
|
||||
type piiPattern struct {
|
||||
name string
|
||||
regex *regexp.Regexp
|
||||
maskFn func(string) string
|
||||
}
|
||||
|
||||
// NewDocBridge creates a new Document Review Bridge.
|
||||
func NewDocBridge() *DocBridge {
|
||||
return &DocBridge{
|
||||
reviews: make(map[string]*ScanResult),
|
||||
piiPatterns: defaultPIIPatterns(),
|
||||
secretPats: secretPatterns(),
|
||||
signatures: NewAISignatureDB(),
|
||||
maxDocSize: 10 * 1024 * 1024, // 10 MB
|
||||
}
|
||||
}
|
||||
|
||||
// ScanDocument scans content for PII and secrets, classifies data, returns result.
|
||||
func (db *DocBridge) ScanDocument(docID, content, userID string) *ScanResult {
|
||||
result := &ScanResult{
|
||||
DocumentID: docID,
|
||||
Status: DocReviewScanning,
|
||||
ScannedAt: time.Now(),
|
||||
SizeBytes: len(content),
|
||||
}
|
||||
|
||||
// Content hash for dedup.
|
||||
h := sha256.Sum256([]byte(content))
|
||||
result.ContentHash = fmt.Sprintf("%x", h[:])
|
||||
|
||||
// Size check.
|
||||
if len(content) > db.maxDocSize {
|
||||
result.Status = DocReviewBlocked
|
||||
result.DataClass = DataCritical
|
||||
db.store(result)
|
||||
return result
|
||||
}
|
||||
|
||||
// Scan for PII.
|
||||
result.PIIFound = db.scanPII(content)
|
||||
|
||||
// Scan for secrets (reuse cached signature DB).
|
||||
if keyType := db.signatures.ScanForAPIKeys(content); keyType != "" {
|
||||
result.SecretsFound = append(result.SecretsFound, SecretMatch{
|
||||
Type: "api_key",
|
||||
Provider: keyType,
|
||||
})
|
||||
}
|
||||
|
||||
// Scan for additional secret patterns.
|
||||
result.SecretsFound = append(result.SecretsFound, db.scanSecrets(content)...)
|
||||
|
||||
// Classify data based on findings.
|
||||
result.DataClass = db.classifyData(result)
|
||||
|
||||
// Set status based on findings.
|
||||
if len(result.SecretsFound) > 0 {
|
||||
result.Status = DocReviewBlocked
|
||||
} else if len(result.PIIFound) > 0 {
|
||||
result.Status = DocReviewRedacted
|
||||
} else {
|
||||
result.Status = DocReviewClean
|
||||
}
|
||||
|
||||
db.store(result)
|
||||
return result
|
||||
}
|
||||
|
||||
// RedactContent replaces PII and secrets in content with masked values.
|
||||
func (db *DocBridge) RedactContent(content string) string {
|
||||
for _, p := range db.piiPatterns {
|
||||
content = p.regex.ReplaceAllStringFunc(content, p.maskFn)
|
||||
}
|
||||
|
||||
// Redact common secret patterns (cached).
|
||||
for _, sp := range db.secretPats {
|
||||
content = sp.regex.ReplaceAllString(content, sp.replacement)
|
||||
}
|
||||
|
||||
return content
|
||||
}
|
||||
|
||||
// GetReview returns a scan result by document ID.
|
||||
func (db *DocBridge) GetReview(docID string) (*ScanResult, bool) {
|
||||
db.mu.RLock()
|
||||
defer db.mu.RUnlock()
|
||||
r, ok := db.reviews[docID]
|
||||
if !ok {
|
||||
return nil, false
|
||||
}
|
||||
cp := *r
|
||||
return &cp, true
|
||||
}
|
||||
|
||||
// RecentReviews returns the N most recent reviews.
|
||||
func (db *DocBridge) RecentReviews(limit int) []ScanResult {
|
||||
db.mu.RLock()
|
||||
defer db.mu.RUnlock()
|
||||
|
||||
results := make([]ScanResult, 0, len(db.reviews))
|
||||
for _, r := range db.reviews {
|
||||
results = append(results, *r)
|
||||
}
|
||||
|
||||
// Sort by time desc (simple bubble for bounded set).
|
||||
for i := 0; i < len(results); i++ {
|
||||
for j := i + 1; j < len(results); j++ {
|
||||
if results[j].ScannedAt.After(results[i].ScannedAt) {
|
||||
results[i], results[j] = results[j], results[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(results) > limit {
|
||||
results = results[:limit]
|
||||
}
|
||||
return results
|
||||
}
|
||||
|
||||
// Stats returns aggregate document review statistics.
|
||||
func (db *DocBridge) Stats() map[string]int {
|
||||
db.mu.RLock()
|
||||
defer db.mu.RUnlock()
|
||||
|
||||
stats := map[string]int{
|
||||
"total": len(db.reviews),
|
||||
"clean": 0,
|
||||
"redacted": 0,
|
||||
"blocked": 0,
|
||||
}
|
||||
for _, r := range db.reviews {
|
||||
switch r.Status {
|
||||
case DocReviewClean:
|
||||
stats["clean"]++
|
||||
case DocReviewRedacted:
|
||||
stats["redacted"]++
|
||||
case DocReviewBlocked:
|
||||
stats["blocked"]++
|
||||
}
|
||||
}
|
||||
return stats
|
||||
}
|
||||
|
||||
func (db *DocBridge) store(result *ScanResult) {
|
||||
db.mu.Lock()
|
||||
defer db.mu.Unlock()
|
||||
db.reviews[result.DocumentID] = result
|
||||
}
|
||||
|
||||
// scanPII runs all PII patterns against content.
|
||||
func (db *DocBridge) scanPII(content string) []PIIMatch {
|
||||
var matches []PIIMatch
|
||||
for _, p := range db.piiPatterns {
|
||||
locs := p.regex.FindAllStringIndex(content, -1)
|
||||
for _, loc := range locs {
|
||||
matched := content[loc[0]:loc[1]]
|
||||
matches = append(matches, PIIMatch{
|
||||
Type: p.name,
|
||||
Location: loc[0],
|
||||
Length: loc[1] - loc[0],
|
||||
Masked: p.maskFn(matched),
|
||||
})
|
||||
}
|
||||
}
|
||||
return matches
|
||||
}
|
||||
|
||||
// scanSecrets scans for common secret patterns beyond AI API keys.
|
||||
func (db *DocBridge) scanSecrets(content string) []SecretMatch {
|
||||
var matches []SecretMatch
|
||||
for _, sp := range db.secretPats {
|
||||
locs := sp.regex.FindAllStringIndex(content, -1)
|
||||
for _, loc := range locs {
|
||||
matches = append(matches, SecretMatch{
|
||||
Type: sp.secretType,
|
||||
Location: loc[0],
|
||||
Length: loc[1] - loc[0],
|
||||
Provider: sp.provider,
|
||||
})
|
||||
}
|
||||
}
|
||||
return matches
|
||||
}
|
||||
|
||||
// classifyData determines the data classification level based on scan results.
|
||||
func (db *DocBridge) classifyData(result *ScanResult) DataClassification {
|
||||
if len(result.SecretsFound) > 0 {
|
||||
return DataCritical
|
||||
}
|
||||
|
||||
hasSensitivePII := false
|
||||
for _, pii := range result.PIIFound {
|
||||
switch pii.Type {
|
||||
case "ssn", "credit_card", "passport":
|
||||
return DataCritical
|
||||
case "email", "phone":
|
||||
hasSensitivePII = true
|
||||
}
|
||||
}
|
||||
|
||||
if hasSensitivePII {
|
||||
return DataConfidential
|
||||
}
|
||||
|
||||
if result.SizeBytes > 1024*1024 { // >1MB
|
||||
return DataInternal
|
||||
}
|
||||
|
||||
return DataPublic
|
||||
}
|
||||
|
||||
// --- PII Patterns ---
|
||||
|
||||
func defaultPIIPatterns() []*piiPattern {
|
||||
return []*piiPattern{
|
||||
{
|
||||
name: "email",
|
||||
regex: regexp.MustCompile(`[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}`),
|
||||
maskFn: func(s string) string {
|
||||
parts := strings.SplitN(s, "@", 2)
|
||||
if len(parts) != 2 {
|
||||
return "***@***"
|
||||
}
|
||||
if len(parts[0]) <= 1 {
|
||||
return "*@" + parts[1]
|
||||
}
|
||||
return string(parts[0][0]) + "***@" + parts[1]
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "phone",
|
||||
regex: regexp.MustCompile(`\+?[1-9]\d{0,2}[\s\-]?\(?\d{3}\)?[\s\-]?\d{3}[\s\-]?\d{2,4}`),
|
||||
maskFn: func(s string) string {
|
||||
if len(s) < 4 {
|
||||
return "***"
|
||||
}
|
||||
return s[:2] + strings.Repeat("*", len(s)-4) + s[len(s)-2:]
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "ssn",
|
||||
regex: regexp.MustCompile(`\b\d{3}-\d{2}-\d{4}\b`),
|
||||
maskFn: func(_ string) string {
|
||||
return "***-**-****"
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "credit_card",
|
||||
regex: regexp.MustCompile(`\b(?:\d{4}[\s\-]?){3}\d{4}\b`),
|
||||
maskFn: func(s string) string {
|
||||
clean := strings.ReplaceAll(strings.ReplaceAll(s, "-", ""), " ", "")
|
||||
if len(clean) < 4 {
|
||||
return "****"
|
||||
}
|
||||
return strings.Repeat("*", len(clean)-4) + clean[len(clean)-4:]
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "passport",
|
||||
regex: regexp.MustCompile(`\b[A-Z]{1,2}\d{6,9}\b`),
|
||||
maskFn: func(s string) string {
|
||||
if len(s) <= 2 {
|
||||
return "**"
|
||||
}
|
||||
return s[:2] + strings.Repeat("*", len(s)-2)
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
type secretPattern struct {
|
||||
secretType string
|
||||
provider string
|
||||
regex *regexp.Regexp
|
||||
replacement string
|
||||
}
|
||||
|
||||
func secretPatterns() []secretPattern {
|
||||
return []secretPattern{
|
||||
{secretType: "aws_key", provider: "AWS", regex: regexp.MustCompile(`AKIA[0-9A-Z]{16}`), replacement: "[AWS_KEY_REDACTED]"},
|
||||
{secretType: "github_token", provider: "GitHub", regex: regexp.MustCompile(`ghp_[a-zA-Z0-9]{36}`), replacement: "[GITHUB_TOKEN_REDACTED]"},
|
||||
{secretType: "github_token", provider: "GitHub", regex: regexp.MustCompile(`github_pat_[a-zA-Z0-9_]{82}`), replacement: "[GITHUB_PAT_REDACTED]"},
|
||||
{secretType: "slack_token", provider: "Slack", regex: regexp.MustCompile(`xoxb-[0-9]{10,13}-[0-9]{10,13}-[a-zA-Z0-9]{24}`), replacement: "[SLACK_TOKEN_REDACTED]"},
|
||||
{secretType: "private_key", provider: "Generic", regex: regexp.MustCompile(`-----BEGIN (?:RSA |EC |DSA )?PRIVATE KEY-----`), replacement: "[PRIVATE_KEY_REDACTED]"},
|
||||
{secretType: "password", provider: "Generic", regex: regexp.MustCompile(`(?i)password\s*[=:]\s*['"]?[^\s'"]{8,}`), replacement: "[PASSWORD_REDACTED]"},
|
||||
{secretType: "connection_string", provider: "Database", regex: regexp.MustCompile(`(?i)(?:mysql|postgres|mongodb)://[^\s]+`), replacement: "[DB_CONN_REDACTED]"},
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue