# dograh/api/services/gender/build_model.py

"""
Build gender prediction model from SSA baby names data.
Generates a compact (minified) JSON model file.
"""

import json
from collections import defaultdict
from datetime import datetime
from math import log10
from pathlib import Path

from api.services.gender.constants import CONFIDENCE_THRESHOLD


def calculate_confidence(male_count: int, female_count: int) -> float:
    """Score how trustworthy a gender prediction is for one name.

    The score is the share of the majority gender multiplied by a
    sample-size weight that grows with ``log10`` of the total count and
    is capped at 1.0 (reached at 100,000 occurrences).

    Args:
        male_count: Number of male occurrences of the name.
        female_count: Number of female occurrences of the name.

    Returns:
        The confidence rounded to 4 decimal places, or ``0.0`` when the
        name has fewer than 100 occurrences in total.
    """
    occurrences = male_count + female_count

    # Too few samples to say anything meaningful.
    if occurrences < 100:
        return 0.0

    # Share of the dominant gender among all occurrences.
    dominant_share = max(male_count, female_count) / occurrences

    # log10(100_000) == 5, so the weight saturates at 100,000 occurrences.
    size_weight = log10(occurrences) / 5
    if size_weight > 1.0:
        size_weight = 1.0

    return round(dominant_share * size_weight, 4)


def build_model():
    """Aggregate the yearly SSA name files into a gender prediction model.

    Reads every ``yob*.txt`` file in the ``names`` directory next to this
    module, sums the male/female counts per lower-cased name across all
    files (no year weighting), and scores each name with
    ``calculate_confidence``.

    Returns:
        dict: A mapping with ``version``, ``metadata`` and ``names`` keys,
        where ``names`` maps each name to
        ``[male_count, female_count, confidence]``.

    Raises:
        FileNotFoundError: If the ``names`` directory does not exist.
    """
    source_dir = Path(__file__).parent / "names"
    if not source_dir.exists():
        raise FileNotFoundError(f"Names directory not found: {source_dir}")

    # Per-name running totals, keyed by the gender code found in the data.
    totals = defaultdict(lambda: {"M": 0, "F": 0})

    yearly_files = sorted(source_dir.glob("yob*.txt"))
    for yearly_path in yearly_files:
        with open(yearly_path, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                fields = raw_line.strip().split(",")
                # Blank lines split into a single empty field, so this one
                # check rejects both blank and malformed rows.
                if len(fields) != 3:
                    continue

                raw_name, gender_code, raw_count = fields
                key = raw_name.lower()
                # Parse the count before touching the totals so a bad number
                # fails before any entry is created.
                occurrences = int(raw_count)

                # Plain sum across years - no year weighting.
                totals[key][gender_code] += occurrences

    file_count = len(yearly_files)
    print(f"Processed {file_count} year files")

    # Compact per-name records: [male_count, female_count, confidence]
    names_data = {}
    for key, tally in totals.items():
        males, females = tally["M"], tally["F"]
        if not (males or females):
            continue
        names_data[key] = [males, females, calculate_confidence(males, females)]

    # Number of names whose confidence meets the configured threshold.
    high_confidence_count = sum(
        1 for record in names_data.values() if record[2] >= CONFIDENCE_THRESHOLD
    )

    metadata = {
        "confidence_threshold": CONFIDENCE_THRESHOLD,
        "total_names": len(names_data),
        "high_confidence_names": high_confidence_count,
        "build_date": datetime.now().isoformat(),
        "source_files": file_count,
    }

    return {
        "version": "1.0",
        "metadata": metadata,
        "names": names_data,
    }


def save_model(model, output_path="model.txt"):
    """Save the model as compact (minified) JSON and print a summary.

    Args:
        model: JSON-serializable model dictionary. Its ``metadata`` entry
            must provide ``total_names`` and ``high_confidence_names``.
        output_path: Destination file, resolved relative to this module's
            directory. An absolute path is used as given.

    Raises:
        KeyError: If ``model`` lacks the metadata fields used by the summary.
        OSError: If the output file cannot be written.
    """
    output_file = Path(__file__).parent / output_path

    # Compact separators drop the whitespace json.dump inserts by default,
    # which keeps the file small (this is minification, not compression).
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(model, f, separators=(",", ":"))

    # Calculate file size
    file_size_mb = output_file.stat().st_size / (1024 * 1024)

    metadata = model["metadata"]
    total_names = metadata["total_names"]
    high_confidence_names = metadata["high_confidence_names"]

    # An empty model (e.g. no yob*.txt files were found) must not abort the
    # summary with ZeroDivisionError after the file was already written.
    high_confidence_pct = (
        high_confidence_names / total_names * 100 if total_names else 0.0
    )

    print(f"\nModel saved to: {output_file}")
    print(f"File size: {file_size_mb:.2f} MB")
    print("\nModel statistics:")
    print(f"  Total names: {total_names:,}")
    print(
        f"  High confidence names (≥{CONFIDENCE_THRESHOLD}): {high_confidence_names:,}"
    )
    print(f"  Confidence percentage: {high_confidence_pct:.1f}%")


def test_model(model):
    """Print predictions for a fixed list of sample names.

    Args:
        model: Model dictionary whose ``names`` entry maps a lower-cased
            name to ``[male_count, female_count, confidence]``.

    A name is reported as male only when its male count is strictly
    greater than its female count; ties are reported as female.
    """
    print("\nSample predictions:")

    sample_names = (
        "john",
        "mary",
        "alex",
        "taylor",
        "michael",
        "sarah",
        "jordan",
        "casey",
    )
    known_names = model["names"]

    for sample in sample_names:
        label = sample.capitalize()

        # Names missing from the model get a placeholder line.
        if sample not in known_names:
            print(f"  {label:10} -> not found in dataset")
            continue

        males, females, score = known_names[sample]
        if males > females:
            verdict = "male"
        else:
            verdict = "female"

        print(
            f"  {label:10} -> {verdict:6} (confidence: {score:.3f}, M:{males:,} F:{females:,})"
        )


if __name__ == "__main__":
    # Command-line entry point: build, persist, then spot-check the model.
    banner_rule = "=" * 50
    print("Building gender prediction model from SSA data...")
    print(banner_rule)

    try:
        built_model = build_model()
        save_model(built_model)
        test_model(built_model)
        print("\n✓ Model build complete!")

    except Exception as err:
        # Print a short summary, then re-raise so the full traceback is
        # still shown and the error is not swallowed.
        print(f"\n✗ Error building model: {err}")
        raise