Doc-to-LoRA release

This commit is contained in:
51616 2026-02-27 03:47:04 +00:00
commit 1abe8ae16d
92 changed files with 22131 additions and 0 deletions

31
webui/SELF_GEN_VIEWER.md Normal file
View file

@ -0,0 +1,31 @@
# Self-Gen Data Viewer
Thanks Claude.
## Running the viewer
```bash
uv run self_gen_viewer.py
```
Then open your browser and go to: **http://localhost:5001**
## Usage
1. **Select a Model Folder**: Choose from the dropdown list (e.g., `google/gemma-2-2b-it_temp_0.0_closed_qa_prob_1.0`)
2. **Select a Parquet File**: Once a folder is selected, available parquet files will appear
3. **Set Number of Samples**: Adjust the sample count (default: 100, max: 1000)
4. **Click "Load Data"**: View the visualized data with context and Q&A pairs
## Data Structure
The viewer expects data in the following structure:
```
data/raw_datasets/self_gen/
├── google/
│   └── gemma-2-2b-it_temp_0.0_closed_qa_prob_1.0/
│       └── fw_qa_v2/
│           └── *.parquet
└── mistralai/
    └── Mistral-7B-Instruct-v0.2_temp_0.0_closed_qa_prob_1.0/
        └── *.parquet
```

170
webui/self_gen_viewer.py Normal file
View file

@ -0,0 +1,170 @@
import traceback
from pathlib import Path
from datasets import load_dataset
from flask import Flask, jsonify, render_template, request
from transformers import AutoTokenizer
# Flask application object; the @app.route decorators in this module register
# their handlers on it.
app = Flask(__name__)
# Base path for self_gen data: two directory levels above this file, then
# data/raw_datasets/self_gen.
BASE_DATA_PATH = Path(__file__).parent.parent / "data" / "raw_datasets" / "self_gen"
# Cache for tokenizers, keyed by the model path handed to get_tokenizer(), so
# each tokenizer is loaded at most once per process.
tokenizer_cache = {}
def get_tokenizer(model_path):
    """Return the tokenizer for ``model_path``, loading it on first use.

    Successfully loaded tokenizers are stored in ``tokenizer_cache`` and reused
    on later calls. A failed load is reported on stdout, is not cached, and
    yields ``None`` so the caller can decide how to respond.
    """
    try:
        return tokenizer_cache[model_path]
    except KeyError:
        pass  # Cache miss: fall through and load the tokenizer.

    try:
        loaded = AutoTokenizer.from_pretrained(model_path)
    except Exception as err:
        print(f"Error loading tokenizer for {model_path}: {err}")
        return None

    tokenizer_cache[model_path] = loaded
    return loaded
def discover_folders(base_path=None):
    """Discover all ``<vendor>/<model>`` folders in the self_gen directory.

    Args:
        base_path: Directory to scan. Defaults to ``BASE_DATA_PATH``; accepting
            an override lets the scan run against any data root.

    Returns:
        A sorted list of folder paths relative to the scanned root. Paths always
        use forward slashes, regardless of platform, because they are later
        split on "/" to recover the model name and are sent to the browser as
        identifiers. An empty list is returned when the root is missing or is
        not a directory.
    """
    root = BASE_DATA_PATH if base_path is None else Path(base_path)
    # is_dir() (rather than exists()) also guards against the root being a file,
    # which would make iterdir() raise.
    if not root.is_dir():
        return []
    return sorted(
        model_dir.relative_to(root).as_posix()
        for vendor_dir in root.iterdir()
        if vendor_dir.is_dir()
        for model_dir in vendor_dir.iterdir()
        if model_dir.is_dir()
    )
def discover_parquet_files(folder_path, base_path=None):
    """Discover all parquet files below a model folder.

    Args:
        folder_path: Folder path relative to the data root. This value arrives
            from an HTTP query parameter, so it is treated as untrusted.
        base_path: Data root to resolve ``folder_path`` against. Defaults to
            ``BASE_DATA_PATH``.

    Returns:
        A sorted list of parquet file paths relative to the folder, searched
        recursively and written with forward slashes. An empty list is returned
        when the folder does not exist or when ``folder_path`` tries to leave
        the data root.
    """
    root = BASE_DATA_PATH if base_path is None else Path(base_path)
    relative = Path(folder_path)
    # Reject absolute/drive-anchored paths and any ".." component so a request
    # cannot enumerate files outside the data root. The check is purely lexical
    # on purpose: symlinked dataset directories inside the root keep working.
    if relative.anchor or ".." in relative.parts:
        return []

    target = root / relative
    if not target.is_dir():
        return []
    return sorted(
        found.relative_to(target).as_posix() for found in target.glob("**/*.parquet")
    )
def extract_model_name_from_folder(folder_path):
    """Derive the base model identifier from a ``<vendor>/<model>`` folder path.

    Everything from the first ``_temp_`` marker onward in the model segment is
    dropped, e.g.
    "google/gemma-2-2b-it_temp_0.0_closed_qa_prob_1.0" -> "google/gemma-2-2b-it".
    Path segments after the model segment are ignored. Returns ``None`` when
    the path contains no "/" separator.
    """
    vendor, separator, remainder = folder_path.partition("/")
    if not separator:
        return None
    # Only the segment directly after the vendor names the model.
    model_segment = remainder.partition("/")[0]
    base_model = model_segment.partition("_temp_")[0]
    return f"{vendor}/{base_model}"
@app.route("/")
def index():
    """Render the viewer page, passing the discovered model folders to the template."""
    return render_template("self_gen_viewer.html", folders=discover_folders())
@app.route("/api/folders")
def api_folders():
    """Return the discovered model folders as JSON under the ``folders`` key."""
    payload = {"folders": discover_folders()}
    return jsonify(payload)
@app.route("/api/parquet_files")
def api_parquet_files():
    """Return the parquet files of the folder named by the ``folder`` query argument.

    Responds with ``{"files": [...]}`` on success, or with an ``error`` payload
    and HTTP 400 when the argument is missing or empty.
    """
    requested = request.args.get("folder", "")
    if requested:
        return jsonify({"files": discover_parquet_files(requested)})
    return jsonify({"error": "No folder specified"}), 400
def _decode_sample(tokenizer, index, sample):
    """Decode one dataset row into the dict shape the viewer front end renders.

    Args:
        tokenizer: Object exposing ``decode(ids, skip_special_tokens=...)``.
        index: Position of the row within the loaded slice.
        sample: Mapping for one row; ``ctx_ids`` and ``input_ids`` are optional.

    Returns:
        A dict with ``index``, ``ctx`` (decoded context, or "N/A" when the row
        has no context ids) and ``questions`` (a list of decoded strings).
    """
    ctx_ids = sample["ctx_ids"] if "ctx_ids" in sample else None
    input_ids = sample["input_ids"] if "input_ids" in sample else None

    if input_ids is None or len(input_ids) == 0:
        # Missing or empty column: nothing to decode. Probing input_ids[0]
        # here would raise and fail the whole request.
        sequences = []
    elif isinstance(input_ids[0], list):
        # Multiple Q&A pairs: one token-id list per pair.
        sequences = input_ids
    else:
        # Single item: a flat list of token ids.
        sequences = [input_ids]

    return {
        "index": index,
        "ctx": tokenizer.decode(ctx_ids, skip_special_tokens=False)
        if ctx_ids is not None
        else "N/A",
        "questions": [
            tokenizer.decode(seq, skip_special_tokens=False) for seq in sequences
        ],
    }


@app.route("/api/load_data")
def api_load_data():
    """API endpoint to load and decode samples from a parquet file.

    Query arguments:
        folder: Model folder relative to ``BASE_DATA_PATH`` (required).
        file: Parquet file relative to ``folder`` (required).
        num_samples: Number of leading rows to load. Defaults to 100 and is
            clamped to the range 1..1000, the limit advertised by the UI.

    Returns:
        JSON with ``success``, ``num_samples``, ``model_name``, ``file_path``
        and ``samples`` on success. On failure, JSON with an ``error`` message
        and status 400 (bad or missing arguments), 404 (file not found) or
        500 (tokenizer/dataset failure, with a ``traceback`` for debugging).
    """
    max_samples = 1000

    folder = request.args.get("folder", "")
    parquet_file = request.args.get("file", "")
    if not folder or not parquet_file:
        return jsonify({"error": "Missing parameters"}), 400

    # Parse num_samples explicitly so a malformed value produces a JSON 400
    # instead of an unhandled exception.
    try:
        num_samples = int(request.args.get("num_samples", 100))
    except ValueError:
        return jsonify({"error": "num_samples must be an integer"}), 400
    # A negative value would turn the split into "all but the last N rows", and
    # an unbounded one could decode an entire dataset into a single response.
    num_samples = max(1, min(num_samples, max_samples))

    # Both path pieces come from the query string: reject absolute paths and
    # ".." components so only files below the data root can be read. The check
    # is lexical so symlinked dataset directories inside the root still work.
    relative = Path(folder) / parquet_file
    if relative.anchor or ".." in relative.parts:
        return jsonify({"error": "Invalid folder or file path"}), 400

    try:
        # Construct full path
        full_path = BASE_DATA_PATH / relative
        if not full_path.is_file():
            return jsonify({"error": f"File not found: {full_path}"}), 404

        # Extract model name for tokenizer
        model_name = extract_model_name_from_folder(folder)
        if not model_name:
            return jsonify({"error": "Could not extract model name from folder"}), 400

        # Load tokenizer
        tokenizer = get_tokenizer(model_name)
        if tokenizer is None:
            return jsonify({"error": f"Could not load tokenizer for {model_name}"}), 500

        # Load only the requested leading slice of the dataset
        ds = load_dataset(
            "parquet", data_files=str(full_path), split=f"train[:{num_samples}]"
        )

        samples = [_decode_sample(tokenizer, i, row) for i, row in enumerate(ds)]

        return jsonify(
            {
                "success": True,
                "num_samples": len(samples),
                "model_name": model_name,
                "file_path": str(parquet_file),
                "samples": samples,
            }
        )
    except Exception as e:
        # Top-level request boundary: log the failure and report it as JSON so
        # the front end can display it.
        # NOTE(review): the traceback is returned to the client for local
        # debugging; drop it if this viewer is ever exposed to untrusted users.
        traceback.print_exc()
        return jsonify({"error": str(e), "traceback": traceback.format_exc()}), 500
if __name__ == "__main__":
    import os

    print(f"Data path: {BASE_DATA_PATH}")
    print(f"Available folders: {discover_folders()}")

    # Bind to loopback by default: the Werkzeug debugger allows arbitrary code
    # execution from the browser, so it must not be reachable from the network.
    # Set SELF_GEN_VIEWER_HOST=0.0.0.0 to serve other machines deliberately.
    host = os.environ.get("SELF_GEN_VIEWER_HOST", "127.0.0.1")
    is_loopback = host in {"127.0.0.1", "localhost", "::1"}
    # Debug mode stays on for local use and is off by default once the server
    # is exposed; SELF_GEN_VIEWER_DEBUG overrides either default explicitly.
    debug_setting = os.environ.get(
        "SELF_GEN_VIEWER_DEBUG", "1" if is_loopback else "0"
    )
    debug = debug_setting.strip().lower() in {"1", "true", "yes", "on"}

    app.run(debug=debug, host=host, port=5001)

View file

@ -0,0 +1,422 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Self-Gen Data Viewer</title>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
min-height: 100vh;
padding: 20px;
}
.container {
max-width: 1400px;
margin: 0 auto;
background: white;
border-radius: 15px;
box-shadow: 0 10px 40px rgba(0, 0, 0, 0.3);
padding: 30px;
}
h1 {
color: #667eea;
margin-bottom: 25px;
text-align: center;
font-size: 2.5em;
}
.controls {
display: flex;
flex-direction: column;
gap: 15px;
margin-bottom: 25px;
}
.control-group {
display: flex;
flex-direction: column;
}
.samples-row {
display: grid;
grid-template-columns: 1fr auto;
gap: 15px;
align-items: end;
}
label {
font-weight: 600;
margin-bottom: 5px;
color: #333;
font-size: 0.9em;
}
select,
input {
padding: 10px;
border: 2px solid #ddd;
border-radius: 8px;
font-size: 1em;
transition: border-color 0.3s;
}
select:focus,
input:focus {
outline: none;
border-color: #667eea;
}
button {
padding: 10px 20px;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
border: none;
border-radius: 8px;
cursor: pointer;
font-size: 1em;
font-weight: 600;
transition: transform 0.2s, box-shadow 0.2s;
margin-top: auto;
}
button:hover {
transform: translateY(-2px);
box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
}
button:active {
transform: translateY(0);
}
button:disabled {
background: #ccc;
cursor: not-allowed;
transform: none;
}
.info-box {
background: #f8f9fa;
padding: 15px;
border-radius: 8px;
margin-bottom: 20px;
border-left: 4px solid #667eea;
}
.info-box p {
margin: 5px 0;
color: #555;
}
.loading {
text-align: center;
padding: 40px;
color: #667eea;
font-size: 1.2em;
}
.error {
background: #fee;
color: #c33;
padding: 15px;
border-radius: 8px;
margin: 20px 0;
border-left: 4px solid #c33;
}
.sample {
background: #f8f9fa;
border: 1px solid #e0e0e0;
border-radius: 10px;
padding: 20px;
margin-bottom: 20px;
transition: box-shadow 0.3s;
}
.sample:hover {
box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1);
}
.sample-header {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 10px 15px;
border-radius: 6px;
margin-bottom: 15px;
font-weight: 600;
}
.section {
margin-bottom: 20px;
}
.section-title {
font-weight: 600;
color: #667eea;
margin-bottom: 10px;
font-size: 1.1em;
border-bottom: 2px solid #667eea;
padding-bottom: 5px;
}
.content {
background: white;
padding: 15px;
border-radius: 6px;
border: 1px solid #e0e0e0;
white-space: pre-wrap;
word-wrap: break-word;
font-family: 'Courier New', monospace;
font-size: 0.9em;
line-height: 1.6;
max-height: 400px;
overflow-y: auto;
}
.question-item {
background: #fff;
padding: 12px;
border-radius: 6px;
border: 1px solid #ddd;
margin-bottom: 10px;
}
.question-number {
background: #667eea;
color: white;
padding: 3px 8px;
border-radius: 4px;
font-size: 0.85em;
font-weight: 600;
display: inline-block;
margin-bottom: 8px;
}
#results {
margin-top: 30px;
}
.load-more {
text-align: center;
margin-top: 20px;
}
.stats {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 15px;
margin-bottom: 20px;
}
.stat-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 15px;
border-radius: 8px;
text-align: center;
}
.stat-value {
font-size: 2em;
font-weight: 700;
}
.stat-label {
font-size: 0.9em;
opacity: 0.9;
}
::-webkit-scrollbar {
width: 8px;
height: 8px;
}
::-webkit-scrollbar-track {
background: #f1f1f1;
border-radius: 4px;
}
::-webkit-scrollbar-thumb {
background: #667eea;
border-radius: 4px;
}
::-webkit-scrollbar-thumb:hover {
background: #764ba2;
}
</style>
</head>
<body>
<div class="container">
<h1>🔍 Self-Gen Data Viewer</h1>
<div class="controls">
<div class="control-group">
<label for="folder-select">Model Folder:</label>
<select id="folder-select">
<option value="">-- Select a folder --</option>
{% for folder in folders %}
<option value="{{ folder }}">{{ folder }}</option>
{% endfor %}
</select>
</div>
<div class="control-group">
<label for="file-select">Parquet File:</label>
<select id="file-select" disabled>
<option value="">-- Select a file --</option>
</select>
</div>
<div class="samples-row">
<div class="control-group">
<label for="num-samples">Number of Samples:</label>
<input type="number" id="num-samples" value="100" min="1" max="1000" step="10">
</div>
<button id="load-btn" disabled>Load Data</button>
</div>
</div>
<div id="info" style="display: none;"></div>
<div id="results"></div>
</div>
<script>
// References to the page's controls and output containers, looked up once by
// element id and shared by the handlers below.
const folderSelect = document.getElementById('folder-select');
const fileSelect = document.getElementById('file-select');
const numSamplesInput = document.getElementById('num-samples');
const loadBtn = document.getElementById('load-btn');
// Summary panel (stats, model, file) and the container for rendered samples.
const infoDiv = document.getElementById('info');
const resultsDiv = document.getElementById('results');
// When a folder is selected, fetch its parquet files and fill the file dropdown.
folderSelect.addEventListener('change', async () => {
    const placeholder = '<option value="">-- Select a file --</option>';
    const folder = folderSelect.value;

    // Reset the dependent controls first so a file from the previous folder
    // can never be submitted together with the new folder.
    fileSelect.innerHTML = placeholder;
    fileSelect.disabled = true;
    loadBtn.disabled = true;
    if (!folder) return;

    try {
        const response = await fetch(`/api/parquet_files?folder=${encodeURIComponent(folder)}`);
        const data = await response.json();

        // The selection may have changed while the request was in flight;
        // ignore replies that no longer match the current folder.
        if (folderSelect.value !== folder) return;

        // Surface server-side errors instead of reporting them as "no files".
        if (data.error) {
            alert('Error loading files: ' + data.error);
            return;
        }

        if (data.files && data.files.length > 0) {
            // Start from a clean list in case an earlier reply for the same
            // folder already populated it.
            fileSelect.innerHTML = placeholder;
            data.files.forEach(file => {
                const option = document.createElement('option');
                option.value = file;
                option.textContent = file;
                fileSelect.appendChild(option);
            });
            fileSelect.disabled = false;
        } else {
            alert('No parquet files found in this folder');
        }
    } catch (error) {
        alert('Error loading files: ' + error.message);
    }
});
// The load button is usable only while a parquet file is chosen.
fileSelect.addEventListener('change', function () {
    const hasFile = Boolean(fileSelect.value);
    loadBtn.disabled = !hasFile;
});
// Clicking the load button fetches and renders the selected file.
loadBtn.addEventListener('click', loadData);
// Fetch decoded samples for the selected folder/file and render them.
// Shows a loading placeholder while the request runs and an error box if the
// request fails or the server reports an error.
async function loadData() {
    const folder = folderSelect.value;
    const file = fileSelect.value;
    const numSamples = numSamplesInput.value;
    if (!folder || !file) return;

    resultsDiv.innerHTML = '<div class="loading">⏳ Loading data...</div>';
    infoDiv.style.display = 'none';

    // Build the error box from DOM nodes so the message is always inserted as
    // text: server errors can echo file names and exception text, which must
    // never be parsed as HTML.
    const showError = (message) => {
        const box = document.createElement('div');
        box.className = 'error';
        const label = document.createElement('strong');
        label.textContent = 'Error:';
        box.appendChild(label);
        box.appendChild(document.createTextNode(' ' + message));
        resultsDiv.innerHTML = '';
        resultsDiv.appendChild(box);
    };

    try {
        // Every query value is encoded, including the free-form number input.
        const response = await fetch(
            `/api/load_data?folder=${encodeURIComponent(folder)}&file=${encodeURIComponent(file)}&num_samples=${encodeURIComponent(numSamples)}`
        );
        const data = await response.json();
        if (data.error) {
            showError(data.error);
            return;
        }
        displayData(data);
    } catch (error) {
        showError(error.message);
    }
}
// Render the summary panel and one card per sample from an /api/load_data reply.
// Every server-provided value is passed through escapeHtml before it is placed
// into markup, because the result is assigned via innerHTML.
function displayData(data) {
    // "Questions per Sample" reflects the first sample only.
    const questionsPerSample = data.samples.length > 0 ? data.samples[0].questions.length : 0;

    // Show info box
    infoDiv.style.display = 'block';
    infoDiv.innerHTML = `
        <div class="info-box">
            <div class="stats">
                <div class="stat-card">
                    <div class="stat-value">${escapeHtml(data.num_samples)}</div>
                    <div class="stat-label">Samples</div>
                </div>
                <div class="stat-card">
                    <div class="stat-value">${escapeHtml(questionsPerSample)}</div>
                    <div class="stat-label">Questions per Sample</div>
                </div>
            </div>
            <p><strong>Model:</strong> ${escapeHtml(data.model_name)}</p>
            <p><strong>File:</strong> ${escapeHtml(data.file_path)}</p>
        </div>
    `;

    // Show samples
    const html = data.samples.map(sample => `
        <div class="sample">
            <div class="sample-header">Sample #${escapeHtml(sample.index)}</div>
            <div class="section">
                <div class="section-title">📝 Context</div>
                <div class="content">${escapeHtml(sample.ctx)}</div>
            </div>
            <div class="section">
                <div class="section-title">❓ Questions & Answers (${sample.questions.length})</div>
                ${sample.questions.map((q, i) => `
                    <div class="question-item">
                        <span class="question-number">Q&A ${i + 1}</span>
                        <div class="content">${escapeHtml(q)}</div>
                    </div>
                `).join('')}
            </div>
        </div>
    `).join('');
    resultsDiv.innerHTML = html;
}
// Escape a value for interpolation into an HTML string.
// The value is assigned to a detached <div> through textContent, so it is
// stored as plain text rather than parsed as markup, and then read back through
// innerHTML, which returns the browser's HTML serialization of that text.
function escapeHtml(text) {
const div = document.createElement('div');
div.textContent = text;
return div.innerHTML;
}
</script>
</body>
</html>