# dograh/api/services/gender/build_model.py
# Abhishek Kumar 4f2a629340 Initial Commit 🚀 🚀
# 2025-09-09 14:37:32 +05:30

# 164 lines
# 4.7 KiB
# Python

"""
Build gender prediction model from SSA baby names data.
Generates a compact (whitespace-free) JSON model file.
"""
import json
from collections import defaultdict
from datetime import datetime
from math import log10
from pathlib import Path
from api.services.gender.constants import CONFIDENCE_THRESHOLD
def calculate_confidence(male_count: int, female_count: int) -> float:
    """Score how strongly a name's counts point to a single gender.

    The score is the majority gender's share of all occurrences multiplied
    by a sample-size weight of ``log10(total) / 5`` capped at 1.0, so the
    weight saturates once a name has 100,000 occurrences.

    Args:
        male_count: Number of occurrences recorded as male.
        female_count: Number of occurrences recorded as female.

    Returns:
        The confidence rounded to four decimal places, or 0.0 when the name
        has fewer than 100 occurrences in total.
    """
    occurrences = male_count + female_count

    # Below the minimum sample size nothing useful can be said.
    if occurrences < 100:
        return 0.0

    # Share of occurrences held by the more common gender.
    majority_share = max(male_count, female_count) / occurrences

    # Logarithmic sample-size weight; reaches 1.0 at 10**5 occurrences.
    size_weight = min(1.0, log10(occurrences) / 5)

    return round(majority_share * size_weight, 4)
def build_model():
    """Build the gender prediction model from SSA baby-name data.

    Reads every ``yob*.txt`` file in the ``names`` directory located next to
    this module. Each data line is expected to have the form
    ``name,gender,count`` with gender ``M`` or ``F``. Counts are summed per
    lowercased name across all files (no year weighting) and a confidence
    score is computed for each name.

    Malformed lines (wrong number of fields, empty name, unrecognised gender
    code, or a non-integer count) are skipped and counted instead of aborting
    the whole build; the number skipped is printed when it is non-zero.

    Returns:
        dict: A mapping with keys ``"version"``, ``"metadata"`` and
        ``"names"``. ``"names"`` maps each lowercased name to the compact
        list ``[male_count, female_count, confidence]``.

    Raises:
        FileNotFoundError: If the ``names`` directory does not exist.
    """
    # Running totals per lowercased name.
    name_stats = defaultdict(lambda: {"M": 0, "F": 0})

    # The data files live in a "names" directory beside this module.
    names_dir = Path(__file__).parent / "names"
    if not names_dir.exists():
        raise FileNotFoundError(f"Names directory not found: {names_dir}")

    file_count = 0
    skipped_lines = 0

    # Process all year files in a deterministic order.
    for year_file in sorted(names_dir.glob("yob*.txt")):
        file_count += 1
        with open(year_file, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                parts = line.split(",")
                if len(parts) != 3:
                    skipped_lines += 1
                    continue

                name = parts[0].strip().lower()
                gender = parts[1].strip().upper()
                # The per-name stats dict only has "M" and "F" keys, so any
                # other code would raise KeyError on the += below.
                if not name or gender not in ("M", "F"):
                    skipped_lines += 1
                    continue

                try:
                    count = int(parts[2])
                except ValueError:
                    # Non-numeric count: treat like any other malformed line.
                    skipped_lines += 1
                    continue

                # Simple aggregation - no year weighting
                name_stats[name][gender] += count

    print(f"Processed {file_count} year files")
    if skipped_lines:
        print(f"Skipped {skipped_lines} malformed lines")

    # Build compressed model format
    names_data = {}
    high_confidence_count = 0

    for name, stats in name_stats.items():
        male_count = stats["M"]
        female_count = stats["F"]

        # Names whose rows all carried a zero count add nothing to the model.
        if male_count == 0 and female_count == 0:
            continue

        confidence = calculate_confidence(male_count, female_count)

        # Store as compact array: [male_count, female_count, confidence]
        names_data[name] = [male_count, female_count, confidence]

        if confidence >= CONFIDENCE_THRESHOLD:
            high_confidence_count += 1

    # Final model structure with metadata describing this build.
    return {
        "version": "1.0",
        "metadata": {
            "confidence_threshold": CONFIDENCE_THRESHOLD,
            "total_names": len(names_data),
            "high_confidence_names": high_confidence_count,
            "build_date": datetime.now().isoformat(),
            "source_files": file_count,
        },
        "names": names_data,
    }
def save_model(model, output_path="model.txt"):
    """Write the model as compact JSON and print a short summary.

    Args:
        model: JSON-serialisable model dict. Its ``"metadata"`` entry must
            contain ``"total_names"`` and ``"high_confidence_names"``.
        output_path: File name (or path) joined onto this module's
            directory. Defaults to ``"model.txt"``.

    The summary printed to stdout includes the output location, the file
    size in MB, and the count and percentage of high-confidence names.
    """
    output_file = Path(__file__).parent / output_path

    # Write compact JSON (no indentation or separator spaces) for smaller size.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(model, f, separators=(",", ":"))

    file_size_mb = output_file.stat().st_size / (1024 * 1024)

    metadata = model["metadata"]
    total_names = metadata["total_names"]
    high_confidence_names = metadata["high_confidence_names"]

    # An empty model (no names at all) would otherwise divide by zero here.
    if total_names:
        high_confidence_pct = high_confidence_names / total_names * 100
    else:
        high_confidence_pct = 0.0

    print(f"\nModel saved to: {output_file}")
    print(f"File size: {file_size_mb:.2f} MB")
    print("\nModel statistics:")
    print(f" Total names: {total_names:,}")
    print(
        f" High confidence names (≥{CONFIDENCE_THRESHOLD}): {high_confidence_names:,}"
    )
    print(f" Confidence percentage: {high_confidence_pct:.1f}%")
def test_model(model):
"""Test model with sample names."""
print("\nSample predictions:")
test_names = [
"john",
"mary",
"alex",
"taylor",
"michael",
"sarah",
"jordan",
"casey",
]
for name in test_names:
if name in model["names"]:
male_count, female_count, confidence = model["names"][name]
gender = "male" if male_count > female_count else "female"
print(
f" {name.capitalize():10} -> {gender:6} (confidence: {confidence:.3f}, M:{male_count:,} F:{female_count:,})"
)
else:
print(f" {name.capitalize():10} -> not found in dataset")
if __name__ == "__main__":
    # Script entry point: build the model, write it to disk, then print a
    # few sample predictions as a quick sanity check.
    print("Building gender prediction model from SSA data...")
    print("=" * 50)

    try:
        model = build_model()
        save_model(model)
        test_model(model)
        print("\n✓ Model build complete!")
    except Exception as build_error:
        # Report the failure in the same console style, then re-raise so the
        # traceback and a non-zero exit status are preserved.
        print(f"\n✗ Error building model: {build_error}")
        raise