mirror of
https://github.com/rushil-thareja/dp-fusion-lib.git
synced 2026-04-25 12:16:22 +02:00
150 lines
4.7 KiB
Python
150 lines
4.7 KiB
Python
|
|
"""
|
|||
|
|
Basic usage example for DP-Fusion-Lib with Tagger API
|
|||
|
|
|
|||
|
|
Demonstrates the Tagger integration for fine-grained privacy redaction.
|
|||
|
|
|
|||
|
|
Requirements:
|
|||
|
|
pip install dp-fusion-lib transformers torch
|
|||
|
|
|
|||
|
|
Note: This example requires a GPU for reasonable performance.
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import torch
|
|||
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|||
|
|
from dp_fusion_lib import DPFusion, Tagger, compute_epsilon_single_group
|
|||
|
|
|
|||
|
|
# Model config
# Hugging Face model ID passed to the tokenizer/model loaders in main().
# This model works well for the example; swap in another causal LM if needed.
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

# API config - Get your free key at console.documentprivacy.com
# Supply the key through the DP_FUSION_API_KEY environment variable so that it
# never has to be written into (or committed with) this file. The placeholder
# is only a fallback and must be replaced before the Tagger can be used.
API_KEY = os.environ.get("DP_FUSION_API_KEY", "put ure key here!")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _print_banner(title, fill="-", leading_blank=True):
    """Print *title* framed by two 60-character rules made of *fill*.

    Args:
        title: Text printed between the two rules.
        fill: Single character repeated to build each rule.
        leading_blank: When true, a newline is emitted before the first rule.
    """
    rule = fill * 60
    print("\n" + rule if leading_blank else rule)
    print(title)
    print(rule)


def _print_stats(label, values):
    """Print mean, minimum and maximum of a non-empty numeric sequence."""
    mean = sum(values) / len(values)
    print(f"{label} stats: Mean={mean:.4f}, Min={min(values):.4f}, Max={max(values):.4f}")


def main():
    """Run the DP-Fusion example end to end and print every intermediate result.

    The function loads the tokenizer and model named by ``MODEL_ID``,
    configures a ``Tagger`` with ``API_KEY``, builds a message context that
    contains one private user message, runs the tagger, prints the private and
    public contexts, generates text with ``DPFusion.generate`` and finally
    reports lambda/divergence statistics and the (ε, δ)-DP guarantee computed
    by ``compute_epsilon_single_group``.
    """
    # Privacy parameters are defined exactly once and shared by generation and
    # by the epsilon computation, so the reported guarantee always matches the
    # settings that were actually used to generate the text.
    alpha = 2.0   # Rényi order
    beta = 0.01   # Divergence bound
    delta = 1e-5  # Target δ for (ε, δ)-DP

    _print_banner(
        "DP-Fusion Library Example (with Tagger API)",
        fill="=",
        leading_blank=False,
    )

    # Load tokenizer
    print(f"\nLoading tokenizer: {MODEL_ID}")
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
    )

    # Load model on GPU
    print(f"Loading model: {MODEL_ID}")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    model.eval()

    print("Model loaded successfully")

    # Initialize Tagger with API key (verbose=True to see input/output)
    print("\nInitializing Tagger...")
    tagger = Tagger(api_key=API_KEY, verbose=True)

    # List available models
    print("\nAvailable models:")
    for model_name in tagger.get_available_models():
        print(f" - {model_name}")

    # Configure tagger; gpt-oss-120b is a strong general-purpose choice.
    tagger.set_model("gpt-oss-120b")
    tagger.set_constitution("LEGAL")

    # Initialize DPFusion with tagger
    print("Initializing DPFusion with Tagger...")
    dpf = DPFusion(model=model, tokenizer=tokenizer, max_tokens=100, tagger=tagger)

    # Example private text (ECHR style legal document)
    private_text = (
        "The applicant was born in 1973 and currently resides in "
        "Les Salles-sur-Verdon, France.\n"
        "In the early 1990s, a new criminal phenomenon emerged in Denmark "
        "known as 'tax asset stripping cases' (selskabstømmersager)."
    )

    print(f"\nPrivate text ({len(private_text)} characters):")
    print(private_text)

    # Build context using message API; only the user message is private.
    dpf.add_message("system", "You are a helpful assistant that paraphrases text.", is_private=False)
    dpf.add_message("user", private_text, is_private=True)
    dpf.add_message("system", "Now paraphrase this text for privacy", is_private=False)
    dpf.add_message("assistant", "Sure, here is the paraphrase of the above text that ensures privacy:", is_private=False)

    # Run tagger to extract and redact private phrases
    _print_banner("Running Tagger API to extract private phrases...")
    dpf.run_tagger()

    # Show both contexts
    _print_banner("Private Context (full text):")
    print(dpf.private_context)

    _print_banner("Public Context (redacted):")
    print(dpf.public_context)

    # Run DP-Fusion generation
    _print_banner("Running DP-Fusion generation...")

    output = dpf.generate(
        alpha=alpha,
        beta=beta,
        temperature=1.0,
        max_new_tokens=100,
        debug=True,
    )

    _print_banner("Results:", fill="=")
    print(f"\nGenerated text:\n{output['text']}\n")

    # Print some stats (skipped when the PRIVATE group is missing or empty)
    private_lambdas = output['lambdas'].get('PRIVATE')
    if private_lambdas:
        _print_stats("Lambda", private_lambdas)

    private_divergences = output['divergences'].get('PRIVATE')
    if private_divergences:
        _print_stats("Divergence", private_divergences)

    # Compute (ε, δ)-DP guarantee
    _print_banner("Computing (ε, δ)-DP guarantees:")

    if private_divergences:
        eps_result = compute_epsilon_single_group(
            divergences=private_divergences,
            alpha=alpha,
            delta=delta,
            beta=beta,
        )
        print(f"\n(ε, δ)-DP guarantees (α={alpha}, δ={delta}, T={eps_result['T']} tokens):")
        print(f" Empirical ε = {eps_result['empirical']:.4f} (from actual divergences)")
        print(f" Theoretical ε = {eps_result['theoretical']:.4f} (worst-case, β={beta} per step)")

    print("\nExample completed successfully!")


# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()
|