Initial release v0.1.0

- Token-level differential privacy for LLMs
- Integration with Document Privacy API
- Comprehensive test suite and documentation
- Examples and Jupyter notebook included
This commit is contained in:
rushil-thareja 2025-12-23 17:02:06 +04:00
commit d012046d85
31 changed files with 4480 additions and 0 deletions

149
examples/basic_usage.py Normal file
View file

@ -0,0 +1,149 @@
"""
Basic usage example for DP-Fusion-Lib with Tagger API

Demonstrates the Tagger integration for fine-grained privacy redaction.

Requirements:
    pip install dp-fusion-lib transformers torch

Note: This example requires a GPU for reasonable performance.
"""
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from dp_fusion_lib import DPFusion, Tagger, compute_epsilon_single_group

# Model config: an instruction-tuned model known to work well with DP-Fusion.
MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

# API config - Get your free key at console.documentprivacy.com
# Read the key from the environment so it is never committed to source
# control; the placeholder fallback preserves the original behavior when
# the variable is unset.
API_KEY = os.environ.get("DOCUMENT_PRIVACY_API_KEY", "put ure key here!")
def main():
    """Run the end-to-end DP-Fusion example.

    Steps: load the tokenizer and model, initialize the Tagger API client,
    build a mixed private/public message context, redact private phrases via
    the Tagger, generate text with DP-Fusion, and report per-token lambda /
    divergence statistics plus the resulting (ε, δ)-DP guarantees.

    Side effects only (prints to stdout, loads a model onto GPU, calls the
    Document Privacy API); returns ``None``.
    """
    print("=" * 60)
    print("DP-Fusion Library Example (with Tagger API)")
    print("=" * 60)

    # Load tokenizer
    print(f"\nLoading tokenizer: {MODEL_ID}")
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        trust_remote_code=True
    )

    # Load model on GPU (device_map="auto" places layers across available devices)
    print(f"Loading model: {MODEL_ID}")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    model.eval()
    print("Model loaded successfully")

    # Initialize Tagger with API key (verbose=True to see input/output)
    print("\nInitializing Tagger...")
    tagger = Tagger(api_key=API_KEY, verbose=True)

    # List available models
    print("\nAvailable models:")
    available_models = tagger.get_available_models()
    for m in available_models:
        print(f" - {m}")

    # Configure tagger: gpt-oss-120b is a strong model for phrase tagging,
    # and the LEGAL constitution targets legal-domain private information.
    tagger.set_model("gpt-oss-120b")
    tagger.set_constitution("LEGAL")

    # Initialize DPFusion with tagger
    print("Initializing DPFusion with Tagger...")
    dpf = DPFusion(model=model, tokenizer=tokenizer, max_tokens=100, tagger=tagger)

    # Example private text (ECHR style legal document)
    private_text = """The applicant was born in 1973 and currently resides in Les Salles-sur-Verdon, France.
In the early 1990s, a new criminal phenomenon emerged in Denmark known as 'tax asset stripping cases' (selskabstømmersager)."""
    print(f"\nPrivate text ({len(private_text)} characters):")
    print(private_text)

    # Build context using the message API; only the user message is private.
    dpf.add_message("system", "You are a helpful assistant that paraphrases text.", is_private=False)
    dpf.add_message("user", private_text, is_private=True)
    dpf.add_message("system", "Now paraphrase this text for privacy", is_private=False)
    dpf.add_message("assistant", "Sure, here is the paraphrase of the above text that ensures privacy:", is_private=False)

    # Run tagger to extract and redact private phrases
    print("\n" + "-" * 60)
    print("Running Tagger API to extract private phrases...")
    print("-" * 60)
    dpf.run_tagger()

    # Show both contexts: the full private context and its redacted public twin.
    print("\n" + "-" * 60)
    print("Private Context (full text):")
    print("-" * 60)
    print(dpf.private_context)
    print("\n" + "-" * 60)
    print("Public Context (redacted):")
    print("-" * 60)
    print(dpf.public_context)

    # Run DP-Fusion generation
    print("\n" + "-" * 60)
    print("Running DP-Fusion generation...")
    print("-" * 60)
    output = dpf.generate(
        alpha=2.0,
        beta=0.01,
        temperature=1.0,
        max_new_tokens=100,
        debug=True
    )

    print("\n" + "=" * 60)
    print("Results:")
    print("=" * 60)
    print(f"\nGenerated text:\n{output['text']}\n")

    # Print per-token mixing-weight and divergence statistics, when present.
    if output['lambdas'].get('PRIVATE'):
        lambdas = output['lambdas']['PRIVATE']
        print(f"Lambda stats: Mean={sum(lambdas)/len(lambdas):.4f}, Min={min(lambdas):.4f}, Max={max(lambdas):.4f}")
    if output['divergences'].get('PRIVATE'):
        divs = output['divergences']['PRIVATE']
        print(f"Divergence stats: Mean={sum(divs)/len(divs):.4f}, Min={min(divs):.4f}, Max={max(divs):.4f}")

    # Compute (ε, δ)-DP guarantee from the observed per-step divergences.
    print("\n" + "-" * 60)
    print("Computing (ε, δ)-DP guarantees:")
    print("-" * 60)
    alpha = 2.0  # Rényi order (same as used in generation)
    beta = 0.01  # Divergence bound (same as used in generation)
    delta = 1e-5  # Target δ for (ε, δ)-DP
    if output['divergences'].get('PRIVATE'):
        eps_result = compute_epsilon_single_group(
            divergences=output['divergences']['PRIVATE'],
            alpha=alpha,
            delta=delta,
            beta=beta
        )
        print(f"\n(ε, δ)-DP guarantees (α={alpha}, δ={delta}, T={eps_result['T']} tokens):")
        print(f"  Empirical ε = {eps_result['empirical']:.4f} (from actual divergences)")
        print(f"  Theoretical ε = {eps_result['theoretical']:.4f} (worst-case, β={beta} per step)")

    print("\nExample completed successfully!")


if __name__ == "__main__":
    main()