dp-fusion-lib/examples/basic_usage.py
rushil-thareja d012046d85 Initial release v0.1.0
- Token-level differential privacy for LLMs
  - Integration with Document Privacy API
  - Comprehensive test suite and documentation
  - Examples and Jupyter notebook included
2025-12-23 17:02:06 +04:00

149 lines
4.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Basic usage example for DP-Fusion-Lib with Tagger API
Demonstrates the Tagger integration for fine-grained privacy redaction.
Requirements:
pip install dp-fusion-lib transformers torch
Note: This example requires a GPU for reasonable performance.
"""
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from dp_fusion_lib import DPFusion, Tagger, compute_epsilon_single_group
# Model configuration: a 7B instruct model known to work well with DP-Fusion.
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

# Document Privacy API key — get a free key at console.documentprivacy.com.
# Read from the environment so the key is never committed to source control;
# the inline placeholder is kept only as a backward-compatible fallback.
API_KEY = os.environ.get("DOCUMENT_PRIVACY_API_KEY", "put ure key here!")
def main():
    """Run the DP-Fusion + Tagger end-to-end demo.

    Loads the Qwen model and tokenizer, uses the Document Privacy
    Tagger API to redact private phrases from a legal-style text,
    generates a privacy-preserving paraphrase with DP-Fusion, and
    reports empirical and theoretical (ε, δ)-DP guarantees.
    """

    def section(title, rule="-"):
        # Titled separator block printed between demo stages.
        print("\n" + rule * 60)
        print(title)
        print(rule * 60)

    print("=" * 60)
    print("DP-Fusion Library Example (with Tagger API)")
    print("=" * 60)

    # --- Tokenizer --------------------------------------------------------
    print(f"\nLoading tokenizer: {MODEL_ID}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    # --- Model (fp16, sharded across available devices) -------------------
    print(f"Loading model: {MODEL_ID}")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    model.eval()
    print("Model loaded successfully")

    # --- Tagger setup (verbose=True echoes API input/output) --------------
    print("\nInitializing Tagger...")
    tagger = Tagger(api_key=API_KEY, verbose=True)

    print("\nAvailable models:")
    for model_name in tagger.get_available_models():
        print(f" - {model_name}")

    # Strong tagging backend paired with the legal-domain constitution.
    tagger.set_model("gpt-oss-120b")
    tagger.set_constitution("LEGAL")

    print("Initializing DPFusion with Tagger...")
    dpf = DPFusion(model=model, tokenizer=tokenizer, max_tokens=100, tagger=tagger)

    # Example private text (ECHR-style legal document).
    private_text = """The applicant was born in 1973 and currently resides in Les Salles-sur-Verdon, France.
In the early 1990s, a new criminal phenomenon emerged in Denmark known as 'tax asset stripping cases' (selskabstømmersager)."""
    print(f"\nPrivate text ({len(private_text)} characters):")
    print(private_text)

    # Build the mixed private/public chat context via the message API.
    dpf.add_message("system", "You are a helpful assistant that paraphrases text.", is_private=False)
    dpf.add_message("user", private_text, is_private=True)
    dpf.add_message("system", "Now paraphrase this text for privacy", is_private=False)
    dpf.add_message("assistant", "Sure, here is the paraphrase of the above text that ensures privacy:", is_private=False)

    # Extract and redact private phrases, then show both context variants.
    section("Running Tagger API to extract private phrases...")
    dpf.run_tagger()

    section("Private Context (full text):")
    print(dpf.private_context)

    section("Public Context (redacted):")
    print(dpf.public_context)

    # --- DP-Fusion generation ---------------------------------------------
    section("Running DP-Fusion generation...")
    output = dpf.generate(
        alpha=2.0,
        beta=0.01,
        temperature=1.0,
        max_new_tokens=100,
        debug=True,
    )

    print("\n" + "=" * 60)
    print("Results:")
    print("=" * 60)
    print(f"\nGenerated text:\n{output['text']}\n")

    # Per-token mixing weights and divergences for the private group,
    # when the generation produced any.
    if lambdas := output['lambdas'].get('PRIVATE'):
        print(f"Lambda stats: Mean={sum(lambdas)/len(lambdas):.4f}, Min={min(lambdas):.4f}, Max={max(lambdas):.4f}")
    if divs := output['divergences'].get('PRIVATE'):
        print(f"Divergence stats: Mean={sum(divs)/len(divs):.4f}, Min={min(divs):.4f}, Max={max(divs):.4f}")

    # --- (ε, δ)-DP accounting ---------------------------------------------
    section("Computing (ε, δ)-DP guarantees:")
    alpha = 2.0   # Rényi order (must match the value used in generation)
    beta = 0.01   # per-step divergence bound (must match generation)
    delta = 1e-5  # target δ for the (ε, δ)-DP conversion
    if output['divergences'].get('PRIVATE'):
        eps_result = compute_epsilon_single_group(
            divergences=output['divergences']['PRIVATE'],
            alpha=alpha,
            delta=delta,
            beta=beta,
        )
        print(f"\n(ε, δ)-DP guarantees (α={alpha}, δ={delta}, T={eps_result['T']} tokens):")
        print(f" Empirical ε = {eps_result['empirical']:.4f} (from actual divergences)")
        print(f" Theoretical ε = {eps_result['theoretical']:.4f} (worst-case, β={beta} per step)")

    print("\nExample completed successfully!")


if __name__ == "__main__":
    main()