mirror of
https://github.com/SakanaAI/doc-to-lora.git
synced 2026-05-12 00:02:38 +02:00
Doc-to-LoRA release
This commit is contained in:
commit
1abe8ae16d
92 changed files with 22131 additions and 0 deletions
31
webui/SELF_GEN_VIEWER.md
Normal file
31
webui/SELF_GEN_VIEWER.md
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
# Self-Gen Data Viewer
|
||||
|
||||
Thanks Claude.
|
||||
|
||||
## Running the viewer
|
||||
```bash
|
||||
uv run self_gen_viewer.py
|
||||
```
|
||||
|
||||
Then open your browser and go to: **http://localhost:5001**
|
||||
|
||||
## Usage
|
||||
|
||||
1. **Select a Model Folder**: Choose from the dropdown list (e.g., `google/gemma-2-2b-it_temp_0.0_closed_qa_prob_1.0`)
|
||||
2. **Select a Parquet File**: Once a folder is selected, available parquet files will appear
|
||||
3. **Set Number of Samples**: Adjust the sample count (default: 100, max: 1000)
|
||||
4. **Click "Load Data"**: View the visualized data with context and Q&A pairs
|
||||
|
||||
## Data Structure
|
||||
|
||||
The viewer expects data in the following structure:
|
||||
```
|
||||
data/raw_datasets/self_gen/
├── google/
│   └── gemma-2-2b-it_temp_0.0_closed_qa_prob_1.0/
│       └── fw_qa_v2/
│           └── *.parquet
└── mistralai/
    └── Mistral-7B-Instruct-v0.2_temp_0.0_closed_qa_prob_1.0/
        └── *.parquet
|
||||
```
|
||||
170
webui/self_gen_viewer.py
Normal file
170
webui/self_gen_viewer.py
Normal file
|
|
@ -0,0 +1,170 @@
|
|||
import traceback
|
||||
from pathlib import Path
|
||||
|
||||
from datasets import load_dataset
|
||||
from flask import Flask, jsonify, render_template, request
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# Flask application object; the route decorators in this module register on it.
app = Flask(__name__)

# Directory holding the self_gen datasets: data/raw_datasets/self_gen under
# the parent of the directory that contains this file.
BASE_DATA_PATH = Path(__file__).parent.parent.joinpath(
    "data", "raw_datasets", "self_gen"
)

# Loaded tokenizers keyed by model path, so each one is loaded only once.
tokenizer_cache = {}
|
||||
|
||||
|
||||
def get_tokenizer(model_path):
    """Return the tokenizer for ``model_path``, loading it on first use.

    Successfully loaded tokenizers are stored in ``tokenizer_cache`` and
    reused on later calls. If loading raises, the error is printed and
    ``None`` is returned; nothing is cached in that case, so the next call
    tries again.
    """
    if model_path in tokenizer_cache:
        return tokenizer_cache[model_path]

    try:
        loaded = AutoTokenizer.from_pretrained(model_path)
    except Exception as e:
        print(f"Error loading tokenizer for {model_path}: {e}")
        return None

    tokenizer_cache[model_path] = loaded
    return loaded
|
||||
|
||||
|
||||
def discover_folders():
    """Return the sorted model folders found under ``BASE_DATA_PATH``.

    The directory is scanned two levels deep (``<vendor>/<model>``); only
    directories are considered at both levels. Each entry is the path
    relative to ``BASE_DATA_PATH`` written with forward slashes, so the
    identifier has the same ``vendor/model`` form on every platform and can
    be split on "/" by ``extract_model_name_from_folder``.

    Returns an empty list when ``BASE_DATA_PATH`` does not exist.
    """
    if not BASE_DATA_PATH.exists():
        return []

    folders = []
    for vendor_dir in BASE_DATA_PATH.iterdir():
        if not vendor_dir.is_dir():
            continue
        for model_dir in vendor_dir.iterdir():
            if model_dir.is_dir():
                # as_posix() rather than str(): str() would produce
                # backslash-separated ids on Windows.
                folders.append(model_dir.relative_to(BASE_DATA_PATH).as_posix())

    return sorted(folders)
|
||||
|
||||
|
||||
def discover_parquet_files(folder_path):
    """Return the sorted parquet files found beneath a model folder.

    Args:
        folder_path: Folder path relative to ``BASE_DATA_PATH``. It arrives
            from a request parameter, so it is treated as untrusted.

    Returns:
        Paths of every ``*.parquet`` file found recursively under the folder,
        relative to that folder and written with forward slashes. An empty
        list is returned when the folder does not exist or when
        ``folder_path`` is absolute or contains a ``..`` component.
    """
    requested = Path(folder_path)
    # Joining an anchored path would discard BASE_DATA_PATH entirely, and
    # ".." components would climb out of it; refuse both.
    if requested.anchor or ".." in requested.parts:
        return []

    full_path = BASE_DATA_PATH / requested
    if not full_path.exists():
        return []

    return sorted(
        parquet_file.relative_to(full_path).as_posix()
        for parquet_file in full_path.glob("**/*.parquet")
    )
|
||||
|
||||
|
||||
def extract_model_name_from_folder(folder_path):
    """Derive the base model name from a ``vendor/model...`` folder path.

    The first "/"-separated component is taken as the vendor. The second is
    truncated at the first occurrence of ``_temp_``, e.g.
    "google/gemma-2-2b-it_temp_0.0_closed_qa_prob_1.0" -> "google/gemma-2-2b-it".
    Any further components are ignored.

    Returns ``None`` when the path has fewer than two components.
    """
    vendor, *remainder = folder_path.split("/")
    if not remainder:
        return None

    base_model = remainder[0].partition("_temp_")[0]
    return f"{vendor}/{base_model}"
|
||||
|
||||
|
||||
@app.route("/")
def index():
    """Render the viewer page with the currently discovered model folders."""
    return render_template("self_gen_viewer.html", folders=discover_folders())
|
||||
|
||||
|
||||
@app.route("/api/folders")
def api_folders():
    """Return the discovered model folders as JSON under the "folders" key."""
    payload = {"folders": discover_folders()}
    return jsonify(payload)
|
||||
|
||||
|
||||
@app.route("/api/parquet_files")
def api_parquet_files():
    """Return the parquet files of the folder named by the ``folder`` query arg.

    Responds with ``{"files": [...]}`` on success, or a 400 JSON error when
    the ``folder`` argument is missing or empty.
    """
    requested_folder = request.args.get("folder", "")
    if requested_folder:
        return jsonify({"files": discover_parquet_files(requested_folder)})

    return jsonify({"error": "No folder specified"}), 400
|
||||
|
||||
|
||||
# Upper bound on samples returned per request; matches the limit documented
# for the viewer and set on the page's number input.
_MAX_SAMPLES = 1000


def _decode_questions(tokenizer, input_ids):
    """Decode a sample's ``input_ids`` into a list of strings.

    ``input_ids`` is either one sequence of token ids (decoded to a single
    string) or a list of such sequences (one decoded string each). A missing
    or empty value yields an empty list instead of raising on ``[0]``.
    """
    if input_ids is None or len(input_ids) == 0:
        return []
    if isinstance(input_ids[0], list):
        # Multiple Q&A pairs
        return [tokenizer.decode(qa, skip_special_tokens=False) for qa in input_ids]
    # Single item
    return [tokenizer.decode(input_ids, skip_special_tokens=False)]


@app.route("/api/load_data")
def api_load_data():
    """Load samples from a parquet file and return them decoded as JSON.

    Query parameters:
        folder: Model folder relative to ``BASE_DATA_PATH``.
        file: Parquet file path relative to ``folder``.
        num_samples: Number of leading rows to load (default 100). Must be an
            integer >= 1; values above ``_MAX_SAMPLES`` are capped.

    Responses:
        200: ``{"success", "num_samples", "model_name", "file_path", "samples"}``
            where each sample has ``index``, ``ctx`` and ``questions``.
        400: missing/invalid parameters, a path that would leave the data
            directory, or a folder the model name cannot be derived from.
        404: the requested file does not exist.
        500: the tokenizer could not be loaded, or any other failure while
            loading/decoding (the body then also carries the traceback).
    """
    folder = request.args.get("folder", "")
    parquet_file = request.args.get("file", "")

    if not folder or not parquet_file:
        return jsonify({"error": "Missing parameters"}), 400

    # Validate the sample count up front so bad input is a client error
    # rather than an unhandled exception.
    try:
        num_samples = int(request.args.get("num_samples", 100))
    except ValueError:
        return jsonify({"error": "num_samples must be an integer"}), 400
    if num_samples < 1:
        return jsonify({"error": "num_samples must be at least 1"}), 400
    num_samples = min(num_samples, _MAX_SAMPLES)

    # Both values come from the request: refuse anchored paths (which would
    # replace BASE_DATA_PATH when joined) and ".." components.
    relative_path = Path(folder) / parquet_file
    if relative_path.anchor or ".." in relative_path.parts:
        return jsonify({"error": "Invalid folder or file path"}), 400

    try:
        # Construct full path
        full_path = BASE_DATA_PATH / relative_path

        if not full_path.is_file():
            return jsonify({"error": f"File not found: {full_path}"}), 404

        # Extract model name for tokenizer
        model_name = extract_model_name_from_folder(folder)
        if not model_name:
            return jsonify({"error": "Could not extract model name from folder"}), 400

        # Load tokenizer
        tokenizer = get_tokenizer(model_name)
        if tokenizer is None:
            return jsonify({"error": f"Could not load tokenizer for {model_name}"}), 500

        # Load only the first num_samples rows of the file
        ds = load_dataset(
            "parquet", data_files=str(full_path), split=f"train[:{num_samples}]"
        )

        # Process samples
        samples = []
        for i, sample in enumerate(ds):
            if "ctx_ids" in sample:
                ctx = tokenizer.decode(sample["ctx_ids"], skip_special_tokens=False)
            else:
                ctx = "N/A"

            if "input_ids" in sample:
                questions = _decode_questions(tokenizer, sample["input_ids"])
            else:
                questions = []

            samples.append({"index": i, "ctx": ctx, "questions": questions})

        return jsonify(
            {
                "success": True,
                "num_samples": len(samples),
                "model_name": model_name,
                "file_path": str(parquet_file),
                "samples": samples,
            }
        )

    except Exception as e:
        traceback.print_exc()
        return jsonify({"error": str(e), "traceback": traceback.format_exc()}), 500
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import os

    print(f"Data path: {BASE_DATA_PATH}")
    print(f"Available folders: {discover_folders()}")

    # The server listens on all interfaces. Flask's debug mode enables the
    # interactive Werkzeug debugger, which lets a client execute arbitrary
    # Python, so it is off unless explicitly requested with
    # SELF_GEN_VIEWER_DEBUG=1 (or "true"/"yes").
    debug_enabled = os.environ.get("SELF_GEN_VIEWER_DEBUG", "").lower() in {
        "1",
        "true",
        "yes",
    }
    app.run(debug=debug_enabled, host="0.0.0.0", port=5001)
|
||||
422
webui/templates/self_gen_viewer.html
Normal file
422
webui/templates/self_gen_viewer.html
Normal file
|
|
@ -0,0 +1,422 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Self-Gen Data Viewer</title>
|
||||
<style>
|
||||
* {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
body {
|
||||
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
|
||||
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
min-height: 100vh;
|
||||
padding: 20px;
|
||||
}
|
||||
|
||||
.container {
|
||||
max-width: 1400px;
|
||||
margin: 0 auto;
|
||||
background: white;
|
||||
border-radius: 15px;
|
||||
box-shadow: 0 10px 40px rgba(0, 0, 0, 0.3);
|
||||
padding: 30px;
|
||||
}
|
||||
|
||||
h1 {
|
||||
color: #667eea;
|
||||
margin-bottom: 25px;
|
||||
text-align: center;
|
||||
font-size: 2.5em;
|
||||
}
|
||||
|
||||
.controls {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 15px;
|
||||
margin-bottom: 25px;
|
||||
}
|
||||
|
||||
.control-group {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.samples-row {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr auto;
|
||||
gap: 15px;
|
||||
align-items: end;
|
||||
}
|
||||
|
||||
label {
|
||||
font-weight: 600;
|
||||
margin-bottom: 5px;
|
||||
color: #333;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
select,
|
||||
input {
|
||||
padding: 10px;
|
||||
border: 2px solid #ddd;
|
||||
border-radius: 8px;
|
||||
font-size: 1em;
|
||||
transition: border-color 0.3s;
|
||||
}
|
||||
|
||||
select:focus,
|
||||
input:focus {
|
||||
outline: none;
|
||||
border-color: #667eea;
|
||||
}
|
||||
|
||||
button {
|
||||
padding: 10px 20px;
|
||||
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
color: white;
|
||||
border: none;
|
||||
border-radius: 8px;
|
||||
cursor: pointer;
|
||||
font-size: 1em;
|
||||
font-weight: 600;
|
||||
transition: transform 0.2s, box-shadow 0.2s;
|
||||
margin-top: auto;
|
||||
}
|
||||
|
||||
button:hover {
|
||||
transform: translateY(-2px);
|
||||
box-shadow: 0 5px 15px rgba(102, 126, 234, 0.4);
|
||||
}
|
||||
|
||||
button:active {
|
||||
transform: translateY(0);
|
||||
}
|
||||
|
||||
button:disabled {
|
||||
background: #ccc;
|
||||
cursor: not-allowed;
|
||||
transform: none;
|
||||
}
|
||||
|
||||
.info-box {
|
||||
background: #f8f9fa;
|
||||
padding: 15px;
|
||||
border-radius: 8px;
|
||||
margin-bottom: 20px;
|
||||
border-left: 4px solid #667eea;
|
||||
}
|
||||
|
||||
.info-box p {
|
||||
margin: 5px 0;
|
||||
color: #555;
|
||||
}
|
||||
|
||||
.loading {
|
||||
text-align: center;
|
||||
padding: 40px;
|
||||
color: #667eea;
|
||||
font-size: 1.2em;
|
||||
}
|
||||
|
||||
.error {
|
||||
background: #fee;
|
||||
color: #c33;
|
||||
padding: 15px;
|
||||
border-radius: 8px;
|
||||
margin: 20px 0;
|
||||
border-left: 4px solid #c33;
|
||||
}
|
||||
|
||||
.sample {
|
||||
background: #f8f9fa;
|
||||
border: 1px solid #e0e0e0;
|
||||
border-radius: 10px;
|
||||
padding: 20px;
|
||||
margin-bottom: 20px;
|
||||
transition: box-shadow 0.3s;
|
||||
}
|
||||
|
||||
.sample:hover {
|
||||
box-shadow: 0 5px 15px rgba(0, 0, 0, 0.1);
|
||||
}
|
||||
|
||||
.sample-header {
|
||||
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
color: white;
|
||||
padding: 10px 15px;
|
||||
border-radius: 6px;
|
||||
margin-bottom: 15px;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.section {
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
.section-title {
|
||||
font-weight: 600;
|
||||
color: #667eea;
|
||||
margin-bottom: 10px;
|
||||
font-size: 1.1em;
|
||||
border-bottom: 2px solid #667eea;
|
||||
padding-bottom: 5px;
|
||||
}
|
||||
|
||||
.content {
|
||||
background: white;
|
||||
padding: 15px;
|
||||
border-radius: 6px;
|
||||
border: 1px solid #e0e0e0;
|
||||
white-space: pre-wrap;
|
||||
word-wrap: break-word;
|
||||
font-family: 'Courier New', monospace;
|
||||
font-size: 0.9em;
|
||||
line-height: 1.6;
|
||||
max-height: 400px;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
.question-item {
|
||||
background: #fff;
|
||||
padding: 12px;
|
||||
border-radius: 6px;
|
||||
border: 1px solid #ddd;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.question-number {
|
||||
background: #667eea;
|
||||
color: white;
|
||||
padding: 3px 8px;
|
||||
border-radius: 4px;
|
||||
font-size: 0.85em;
|
||||
font-weight: 600;
|
||||
display: inline-block;
|
||||
margin-bottom: 8px;
|
||||
}
|
||||
|
||||
#results {
|
||||
margin-top: 30px;
|
||||
}
|
||||
|
||||
.load-more {
|
||||
text-align: center;
|
||||
margin-top: 20px;
|
||||
}
|
||||
|
||||
.stats {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
|
||||
gap: 15px;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
.stat-card {
|
||||
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
||||
color: white;
|
||||
padding: 15px;
|
||||
border-radius: 8px;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.stat-value {
|
||||
font-size: 2em;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.stat-label {
|
||||
font-size: 0.9em;
|
||||
opacity: 0.9;
|
||||
}
|
||||
|
||||
::-webkit-scrollbar {
|
||||
width: 8px;
|
||||
height: 8px;
|
||||
}
|
||||
|
||||
::-webkit-scrollbar-track {
|
||||
background: #f1f1f1;
|
||||
border-radius: 4px;
|
||||
}
|
||||
|
||||
::-webkit-scrollbar-thumb {
|
||||
background: #667eea;
|
||||
border-radius: 4px;
|
||||
}
|
||||
|
||||
::-webkit-scrollbar-thumb:hover {
|
||||
background: #764ba2;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div class="container">
|
||||
<h1>🔍 Self-Gen Data Viewer</h1>
|
||||
|
||||
<div class="controls">
|
||||
<div class="control-group">
|
||||
<label for="folder-select">Model Folder:</label>
|
||||
<select id="folder-select">
|
||||
<option value="">-- Select a folder --</option>
|
||||
{% for folder in folders %}
|
||||
<option value="{{ folder }}">{{ folder }}</option>
|
||||
{% endfor %}
|
||||
</select>
|
||||
</div>
|
||||
|
||||
<div class="control-group">
|
||||
<label for="file-select">Parquet File:</label>
|
||||
<select id="file-select" disabled>
|
||||
<option value="">-- Select a file --</option>
|
||||
</select>
|
||||
</div>
|
||||
|
||||
<div class="samples-row">
|
||||
<div class="control-group">
|
||||
<label for="num-samples">Number of Samples:</label>
|
||||
<input type="number" id="num-samples" value="100" min="1" max="1000" step="10">
|
||||
</div>
|
||||
<button id="load-btn" disabled>Load Data</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="info" style="display: none;"></div>
|
||||
<div id="results"></div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
// References to the page controls and output containers, looked up once by id.
const byId = (id) => document.getElementById(id);

const folderSelect = byId('folder-select');
const fileSelect = byId('file-select');
const numSamplesInput = byId('num-samples');
const loadBtn = byId('load-btn');
const infoDiv = byId('info');
const resultsDiv = byId('results');
|
||||
|
||||
// When a folder is selected, reset the file list and fetch that folder's
// parquet files from the server.
folderSelect.addEventListener('change', async () => {
    const folder = folderSelect.value;
    fileSelect.innerHTML = '<option value="">-- Select a file --</option>';
    fileSelect.disabled = true;
    loadBtn.disabled = true;

    if (!folder) return;

    try {
        const response = await fetch(`/api/parquet_files?folder=${encodeURIComponent(folder)}`);
        const data = await response.json();

        // The user may have picked another folder while this request was in
        // flight; drop the stale response so it cannot fill the wrong list.
        if (folderSelect.value !== folder) return;

        // Report a server-side error as such instead of as "no files found".
        if (data.error) {
            alert('Error loading files: ' + data.error);
            return;
        }

        const files = data.files || [];
        if (files.length === 0) {
            alert('No parquet files found in this folder');
            return;
        }

        for (const file of files) {
            const option = document.createElement('option');
            option.value = file;
            option.textContent = file;
            fileSelect.appendChild(option);
        }
        fileSelect.disabled = false;
    } catch (error) {
        alert('Error loading files: ' + error.message);
    }
});
|
||||
|
||||
// The load button is usable only while a file is selected.
fileSelect.addEventListener('change', () => {
    const hasFile = Boolean(fileSelect.value);
    loadBtn.disabled = !hasFile;
});

// Clicking the load button fetches and renders the selected file.
loadBtn.addEventListener('click', loadData);
|
||||
|
||||
/**
 * Fetch samples for the selected folder/file from /api/load_data and render
 * them. Does nothing unless both a folder and a file are selected. While the
 * request is pending a loading placeholder is shown; a server-reported error
 * or a failed request is shown in the results area instead of the data.
 */
async function loadData() {
    const folder = folderSelect.value;
    const file = fileSelect.value;
    const numSamples = numSamplesInput.value;

    if (!folder || !file) return;

    resultsDiv.innerHTML = '<div class="loading">⏳ Loading data...</div>';
    infoDiv.style.display = 'none';

    // Error text can contain request-derived values (paths, exception
    // messages), so it is escaped before being placed into innerHTML.
    const showError = (message) => {
        resultsDiv.innerHTML = `<div class="error"><strong>Error:</strong> ${escapeHtml(message)}</div>`;
    };

    try {
        const query =
            `folder=${encodeURIComponent(folder)}` +
            `&file=${encodeURIComponent(file)}` +
            `&num_samples=${encodeURIComponent(numSamples)}`;
        const response = await fetch(`/api/load_data?${query}`);
        const data = await response.json();

        if (data.error) {
            showError(data.error);
            return;
        }

        displayData(data);
    } catch (error) {
        showError(error.message);
    }
}
|
||||
|
||||
/**
 * Render a successful /api/load_data response.
 *
 * Fills the info box with the sample count, the number of questions in the
 * first sample, the model name and the file path, then renders one card per
 * sample with its context and decoded Q&A entries. Every value taken from
 * the response is HTML-escaped before being inserted via innerHTML.
 *
 * @param {Object} data Parsed JSON response (num_samples, model_name,
 *     file_path, samples[] with index, ctx and questions[]).
 */
function displayData(data) {
    const samples = data.samples || [];
    const firstQuestions = samples.length > 0 ? (samples[0].questions || []) : [];

    // Show info box
    infoDiv.style.display = 'block';
    infoDiv.innerHTML = `
        <div class="info-box">
            <div class="stats">
                <div class="stat-card">
                    <div class="stat-value">${escapeHtml(String(data.num_samples))}</div>
                    <div class="stat-label">Samples</div>
                </div>
                <div class="stat-card">
                    <div class="stat-value">${firstQuestions.length}</div>
                    <div class="stat-label">Questions per Sample</div>
                </div>
            </div>
            <p><strong>Model:</strong> ${escapeHtml(data.model_name)}</p>
            <p><strong>File:</strong> ${escapeHtml(data.file_path)}</p>
        </div>
    `;

    // The .content boxes use white-space: pre-wrap, so no whitespace is
    // placed between their tags and the escaped text.
    const renderQuestion = (question, i) => `
        <div class="question-item">
            <span class="question-number">Q&A ${i + 1}</span>
            <div class="content">${escapeHtml(question)}</div>
        </div>
    `;

    const renderSample = (sample) => {
        const questions = sample.questions || [];
        return `
            <div class="sample">
                <div class="sample-header">Sample #${escapeHtml(String(sample.index))}</div>

                <div class="section">
                    <div class="section-title">📝 Context</div>
                    <div class="content">${escapeHtml(sample.ctx)}</div>
                </div>

                <div class="section">
                    <div class="section-title">❓ Questions & Answers (${questions.length})</div>
                    ${questions.map(renderQuestion).join('')}
                </div>
            </div>
        `;
    };

    // Show samples
    resultsDiv.innerHTML = samples.map(renderSample).join('');
}
|
||||
|
||||
/**
 * Escape text for safe insertion into HTML.
 *
 * The text is assigned to a detached element's textContent and read back
 * through innerHTML, so the browser performs the escaping.
 */
function escapeHtml(text) {
    const scratch = document.createElement('div');
    scratch.textContent = text;
    const escaped = scratch.innerHTML;
    return escaped;
}
|
||||
</script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
Loading…
Add table
Add a link
Reference in a new issue