mirror of
https://github.com/trustgraph-ai/trustgraph.git
synced 2026-04-25 16:36:21 +02:00
568 lines
No EOL
13 KiB
Markdown
568 lines
No EOL
13 KiB
Markdown
# tg-load-doc-embeds
|
|
|
|
Loads document embeddings from MessagePack format into TrustGraph processing pipelines.
|
|
|
|
## Synopsis
|
|
|
|
```bash
|
|
tg-load-doc-embeds -i INPUT_FILE [options]
|
|
```
|
|
|
|
## Description
|
|
|
|
The `tg-load-doc-embeds` command loads document embeddings from MessagePack files into a running TrustGraph system. This is typically used to restore previously saved document embeddings or to load embeddings generated by external systems.
|
|
|
|
The command reads document embedding data in MessagePack format and streams it to TrustGraph's document embeddings import API via WebSocket connections.
|
|
|
|
## Options
|
|
|
|
### Required Arguments
|
|
|
|
- `-i, --input-file FILE`: Input MessagePack file containing document embeddings
|
|
|
|
### Optional Arguments
|
|
|
|
- `-u, --url URL`: TrustGraph API URL (default: `$TRUSTGRAPH_API` or `http://localhost:8088/`)
|
|
- `-f, --flow-id ID`: Flow instance ID to use (default: `default`)
|
|
- `--format FORMAT`: Input format - `msgpack` or `json` (default: `msgpack`)
|
|
- `--user USER`: Override user ID from input data
|
|
- `--collection COLLECTION`: Override collection ID from input data
|
|
|
|
## Examples
|
|
|
|
### Basic Loading
|
|
```bash
|
|
tg-load-doc-embeds -i document-embeddings.msgpack
|
|
```
|
|
|
|
### Load with Custom Flow
|
|
```bash
|
|
tg-load-doc-embeds \
|
|
-i embeddings.msgpack \
|
|
-f "document-processing-flow"
|
|
```
|
|
|
|
### Override User and Collection
|
|
```bash
|
|
tg-load-doc-embeds \
|
|
-i embeddings.msgpack \
|
|
--user "research-team" \
|
|
--collection "research-docs"
|
|
```
|
|
|
|
### Load from JSON Format
|
|
```bash
|
|
tg-load-doc-embeds \
|
|
-i embeddings.json \
|
|
--format json
|
|
```
|
|
|
|
### Production Loading
|
|
```bash
|
|
tg-load-doc-embeds \
|
|
-i production-embeddings.msgpack \
|
|
-u https://trustgraph-api.company.com/ \
|
|
-f "production-flow" \
|
|
--user "system" \
|
|
--collection "production-docs"
|
|
```
|
|
|
|
## Input Data Format
|
|
|
|
### MessagePack Structure
|
|
Document embeddings are stored as MessagePack records with this structure:
|
|
|
|
```json
|
|
["de", {
|
|
"m": {
|
|
"i": "document-id",
|
|
"m": [{"metadata": "objects"}],
|
|
"u": "user-id",
|
|
"c": "collection-id"
|
|
},
|
|
"c": [{
|
|
"c": "text chunk content",
|
|
"v": [0.1, 0.2, 0.3, ...]
|
|
}]
|
|
}]
|
|
```
|
|
|
|
### Components
|
|
- **Document Metadata** (`m`):
|
|
- `i`: Document ID
|
|
- `m`: Document metadata objects
|
|
- `u`: User ID
|
|
- `c`: Collection ID
|
|
- **Chunks** (`c`): Array of text chunks with embeddings:
|
|
- `c`: Text content of the chunk
|
|
- `v`: Vector embedding array
|
|
|
|
## Use Cases
|
|
|
|
### Backup Restoration
|
|
```bash
|
|
# Restore document embeddings from a MessagePack backup.
# Arguments:
#   $1 - path to the backup file
#   $2 - collection to restore into (optional; when omitted or empty,
#        no --collection override is passed to the loader)
# Returns:
#   0 on success, 1 if the backup is missing or its summary shows no
#   embeddings, otherwise the exit status of tg-load-doc-embeds.
restore_embeddings() {
    local backup_file="$1"
    local target_collection="${2:-}"
    local -a load_args
    local status

    echo "Restoring document embeddings from: $backup_file"

    if [ ! -f "$backup_file" ]; then
        echo "Backup file not found: $backup_file" >&2
        return 1
    fi

    # Verify backup file: look for the vector dimension line in the summary
    if tg-dump-msgpack -i "$backup_file" --summary | grep -q "Vector dimension:"; then
        echo "✓ Backup file contains embeddings"
    else
        echo "✗ Backup file does not contain valid embeddings" >&2
        return 1
    fi

    # Build the argument list as an array so values with spaces stay intact
    load_args=(-i "$backup_file")
    if [ -n "$target_collection" ]; then
        load_args+=(--collection "$target_collection")
    fi

    # Load embeddings; only report success when the loader succeeded
    tg-load-doc-embeds "${load_args[@]}"
    status=$?
    if [ "$status" -ne 0 ]; then
        echo "✗ Embedding restoration failed (exit $status)" >&2
        return "$status"
    fi

    echo "Embedding restoration complete"
}
|
|
|
|
# Restore from backup
|
|
restore_embeddings "backup-20231215.msgpack" "restored-docs"
|
|
```
|
|
|
|
### Data Migration
|
|
```bash
|
|
# Migrate embeddings between environments.
# Arguments:
#   $1 - MessagePack file holding the embeddings to migrate
#   $2 - target environment host; the API URL is built as
#        https://<host>/api/
#   $3 - user ID to assign to the loaded embeddings
#   $4 - target collection (optional, default: migrated-docs)
# Returns:
#   0 on success, otherwise the exit status of tg-load-doc-embeds.
migrate_embeddings() {
    local source_file="$1"
    local target_env="$2"
    local target_user="$3"
    local target_collection="${4:-migrated-docs}"
    local status

    echo "Migrating embeddings to: $target_env"

    # Load to target environment
    tg-load-doc-embeds \
        -i "$source_file" \
        -u "https://$target_env/api/" \
        --user "$target_user" \
        --collection "$target_collection"
    status=$?

    # Do not claim success when the loader reported an error
    if [ "$status" -ne 0 ]; then
        echo "✗ Migration to $target_env failed (exit $status)" >&2
        return "$status"
    fi

    echo "Migration complete"
}
|
|
|
|
# Migrate to production
|
|
migrate_embeddings "dev-embeddings.msgpack" "prod.company.com" "migration-user"
|
|
```
|
|
|
|
### Batch Processing
|
|
```bash
|
|
# Load every *.msgpack file in a directory into one collection.
# Arguments:
#   $1 - directory containing the MessagePack files
#   $2 - collection to load the embeddings into
# Returns:
#   0 if every file loaded, 1 if at least one file failed. Remaining
#   files are still attempted after a failure.
batch_load_embeddings() {
    local input_dir="$1"
    local collection="$2"
    local file name
    local failed=0

    echo "Batch loading embeddings from: $input_dir"

    for file in "$input_dir"/*.msgpack; do
        # When nothing matches, the glob stays as a literal pattern; skip it
        [ -f "$file" ] || continue

        name=$(basename "$file")
        echo "Loading: $name"

        # Test the command directly instead of inspecting $? afterwards
        if tg-load-doc-embeds -i "$file" --collection "$collection"; then
            echo "✓ Loaded: $name"
        else
            echo "✗ Failed: $name" >&2
            failed=$((failed + 1))
        fi
    done

    echo "Batch loading complete"

    if [ "$failed" -gt 0 ]; then
        echo "$failed file(s) failed to load" >&2
        return 1
    fi
}
|
|
|
|
# Load all embeddings
|
|
batch_load_embeddings "embeddings/" "batch-processed"
|
|
```
|
|
|
|
### Incremental Loading
|
|
```bash
|
|
# Load new embeddings incrementally, skipping files already recorded in
# the processed log.
# Arguments:
#   $1 - directory containing *.msgpack files
# Side effects:
#   Creates/appends to processed_embeddings.log in the current directory;
#   each successful load adds a line of the form "<date>: <file name>".
# Returns:
#   0 if every new file loaded, 1 if at least one load failed.
incremental_load() {
    local embeddings_dir="$1"
    local processed_log="processed_embeddings.log"
    local file name line already_done
    local failed=0

    # Create log if it doesn't exist
    touch "$processed_log"

    for file in "$embeddings_dir"/*.msgpack; do
        # When nothing matches, the glob stays as a literal pattern; skip it
        [ -f "$file" ] || continue

        name=$(basename "$file")

        # Check if already processed. The name is compared literally
        # against the end of each log line: using it as a grep pattern
        # would treat "." as a wildcard and match substrings, so
        # "a.msgpack" would be skipped once "data.msgpack" was logged.
        already_done=0
        while IFS= read -r line; do
            if [[ "$line" == *": $name" ]]; then
                already_done=1
                break
            fi
        done < "$processed_log"

        if [ "$already_done" -eq 1 ]; then
            echo "Skipping already processed: $name"
            continue
        fi

        echo "Processing new file: $name"

        if tg-load-doc-embeds -i "$file"; then
            echo "$(date): $name" >> "$processed_log"
            echo "✓ Processed: $name"
        else
            echo "✗ Failed: $name" >&2
            failed=$((failed + 1))
        fi
    done

    if [ "$failed" -gt 0 ]; then
        return 1
    fi
}
|
|
|
|
# Run incremental loading
|
|
incremental_load "embeddings/"
|
|
```
|
|
|
|
## Advanced Usage
|
|
|
|
### Parallel Loading
|
|
```bash
|
|
# Load multiple files in parallel with a bounded number of concurrent jobs.
# Arguments:
#   $@ - MessagePack files to load
# Environment:
#   MAX_PARALLEL - maximum concurrent loads (optional, default 3; values
#                  that are not a positive integer fall back to 3)
# Returns:
#   0 if every load succeeded, 1 if at least one load failed.
parallel_load_embeddings() {
    local max_parallel="${MAX_PARALLEL:-3}"
    local -a pids=()
    local file pid
    local failed=0

    # Guard against a non-numeric or zero limit
    [ "$max_parallel" -ge 1 ] 2>/dev/null || max_parallel=3

    for file in "$@"; do
        # At capacity: reap the oldest job before starting another.
        # Waiting on an explicit PID yields that job's exit status and
        # does not depend on `wait -n` (bash 4.3+).
        if [ "${#pids[@]}" -ge "$max_parallel" ]; then
            wait "${pids[0]}" || failed=$((failed + 1))
            pids=("${pids[@]:1}")
        fi

        # Start loading in background; the subshell's exit status
        # reflects whether the load succeeded
        (
            echo "Loading: $file"
            if tg-load-doc-embeds -i "$file"; then
                echo "Completed: $file"
            else
                echo "Failed: $file" >&2
                exit 1
            fi
        ) &
        pids+=("$!")
    done

    # Wait for all remaining jobs and collect their statuses
    for pid in "${pids[@]}"; do
        wait "$pid" || failed=$((failed + 1))
    done

    echo "All parallel loading completed"

    if [ "$failed" -gt 0 ]; then
        echo "$failed file(s) failed to load" >&2
        return 1
    fi
}
|
|
|
|
# Load files in parallel
|
|
embedding_files=(embeddings1.msgpack embeddings2.msgpack embeddings3.msgpack)
|
|
parallel_load_embeddings "${embedding_files[@]}"
|
|
```
|
|
|
|
### Validation and Loading
|
|
```bash
|
|
# Validate an embeddings file, then load it into a collection.
# Arguments:
#   $1 - MessagePack file to validate and load
#   $2 - collection to load the embeddings into
# Returns:
#   0 on success, 1 if any validation step fails, otherwise the exit
#   status of tg-load-doc-embeds.
validate_and_load() {
    local file="$1"
    local collection="$2"
    # Keep working variables local so they do not leak into the caller
    local summary vector_dim status

    echo "Validating embedding file: $file"

    # Check file exists and is readable
    if [ ! -r "$file" ]; then
        echo "Error: Cannot read file $file" >&2
        return 1
    fi

    # Validate MessagePack structure. The summary is captured here and
    # reused for the statistics below, so the file is summarised once.
    if ! summary=$(tg-dump-msgpack -i "$file" --summary 2>/dev/null); then
        echo "Error: Invalid MessagePack format" >&2
        return 1
    fi

    # Check for document embeddings (records tagged "de")
    if ! tg-dump-msgpack -i "$file" | grep -q '^\["de"'; then
        echo "Error: No document embeddings found" >&2
        return 1
    fi

    # Get embedding statistics: third field of the "Vector dimension:" line
    vector_dim=$(awk '/Vector dimension:/ {print $3}' <<< "$summary")

    if [ -n "$vector_dim" ]; then
        echo "✓ Found embeddings with dimension: $vector_dim"
    else
        echo "Warning: Could not determine vector dimension" >&2
    fi

    # Load embeddings
    echo "Loading validated embeddings..."
    tg-load-doc-embeds -i "$file" --collection "$collection"
    status=$?
    if [ "$status" -ne 0 ]; then
        echo "Error: Loading failed (exit $status)" >&2
        return "$status"
    fi

    echo "Loading complete"
}
|
|
|
|
# Validate and load
|
|
validate_and_load "embeddings.msgpack" "validated-docs"
|
|
```
|
|
|
|
### Progress Monitoring
|
|
```bash
|
|
# Run a load in the background and report progress until it finishes.
# Arguments:
#   $1 - MessagePack file to load
#   $2 - seconds between progress checks (optional, default 5)
# Returns:
#   The exit status of tg-load-doc-embeds, or 1 if the temporary log
#   file cannot be created.
monitor_loading() {
    local file="$1"
    local poll_interval="${2:-5}"
    local log_file load_pid embeddings_count status

    # Use a unique temporary log so concurrent runs do not clobber each
    # other and nothing is left behind in the working directory
    log_file=$(mktemp) || {
        echo "Error: could not create temporary log file" >&2
        return 1
    }

    # Start loading in background
    tg-load-doc-embeds -i "$file" > "$log_file" 2>&1 &
    load_pid=$!

    echo "Monitoring loading progress (PID: $load_pid)..."

    # Monitor progress while the loader process is still alive
    while kill -0 "$load_pid" 2>/dev/null; do
        # Extract progress from log: third field of the most recent
        # "Document embeddings: N" line.
        # NOTE(review): assumes the loader prints lines in that form - confirm
        embeddings_count=$(grep -o "Document embeddings:.*[0-9]" "$log_file" | tail -1 | awk '{print $3}')
        if [ -n "$embeddings_count" ]; then
            echo "Progress: $embeddings_count embeddings loaded"
        fi
        sleep "$poll_interval"
    done

    # Check final status
    wait "$load_pid"
    status=$?
    if [ "$status" -eq 0 ]; then
        echo "✓ Loading completed successfully"
    else
        echo "✗ Loading failed" >&2
        cat "$log_file" >&2
    fi

    rm -f -- "$log_file"
    return "$status"
}
|
|
|
|
# Monitor loading
|
|
monitor_loading "large-embeddings.msgpack"
|
|
```
|
|
|
|
### Data Transformation
|
|
```bash
|
|
# Load embeddings while overriding their user and collection.
# Arguments:
#   $1 - MessagePack file to load
#   $2 - user ID to assign in place of the one stored in the file
#   $3 - collection ID to assign in place of the one stored in the file
# Returns:
#   0 on success, otherwise the exit status of tg-load-doc-embeds.
transform_and_load() {
    local input_file="$1"
    local new_user="$2"
    local new_collection="$3"
    local status

    echo "Transforming embeddings: user=$new_user, collection=$new_collection"

    # Rewriting the file contents would require a transformation script.
    # For now, we'll show the concept: the override options change the
    # user and collection at load time, so no intermediate file is written.

    # Load with override parameters
    tg-load-doc-embeds \
        -i "$input_file" \
        --user "$new_user" \
        --collection "$new_collection"
    status=$?

    if [ "$status" -ne 0 ]; then
        echo "✗ Transformation and loading failed (exit $status)" >&2
        return "$status"
    fi

    echo "Transformation and loading complete"
}
|
|
|
|
# Transform during loading
|
|
transform_and_load "original.msgpack" "new-user" "new-collection"
|
|
```
|
|
|
|
## Performance Optimization
|
|
|
|
### Memory Management
|
|
```bash
|
|
# Run a load in the background and print its resident memory usage
# until it finishes.
# Arguments:
#   $1 - MessagePack file to load
#   $2 - seconds between memory samples (optional, default 10)
# Returns:
#   The exit status of tg-load-doc-embeds.
monitor_memory_usage() {
    local file="$1"
    local poll_interval="${2:-10}"
    local load_pid memory_usage status

    echo "Starting memory-monitored loading..."

    # Start loading in background
    tg-load-doc-embeds -i "$file" &
    load_pid=$!

    # Monitor memory usage while the loader process is still alive
    while kill -0 "$load_pid" 2>/dev/null; do
        # ps reports RSS in KiB; convert to MiB. The result is empty if
        # the process exits between the liveness check and the ps call.
        memory_usage=$(ps -p "$load_pid" -o rss= 2>/dev/null | awk '{print $1/1024}')
        if [ -n "$memory_usage" ]; then
            echo "Memory usage: ${memory_usage}MB"
        fi
        sleep "$poll_interval"
    done

    # Collect the loader's exit status so callers can detect failure
    wait "$load_pid"
    status=$?
    if [ "$status" -ne 0 ]; then
        echo "Loading failed (exit $status)" >&2
        return "$status"
    fi

    echo "Loading completed"
}
|
|
```
|
|
|
|
### Chunked Loading
|
|
```bash
|
|
# Load a large file that has already been split into chunk files named
# <base>_chunk_*.msgpack, where <base> is $1 without its .msgpack suffix.
# Arguments:
#   $1 - path of the original large MessagePack file
#   $2 - seconds to pause after each chunk (optional, default 2)
# Returns:
#   0 if every chunk loaded, 1 if at least one chunk failed. Remaining
#   chunks are still attempted after a failure.
chunked_load() {
    local large_file="$1"
    local delay="${2:-2}"
    local chunk
    local failed=0

    echo "Loading large file in chunks: $large_file"

    # Splitting the MessagePack file would need special tooling;
    # for demonstration, this assumes pre-split files already exist.

    for chunk in "${large_file%.msgpack}"_chunk_*.msgpack; do
        # When nothing matches, the glob stays as a literal pattern; skip it
        [ -f "$chunk" ] || continue

        echo "Loading chunk: $(basename "$chunk")"
        if ! tg-load-doc-embeds -i "$chunk"; then
            echo "✗ Failed chunk: $(basename "$chunk")" >&2
            failed=$((failed + 1))
        fi

        # Add delay between chunks to reduce system load
        sleep "$delay"
    done

    echo "Chunked loading complete"

    if [ "$failed" -gt 0 ]; then
        echo "$failed chunk(s) failed to load" >&2
        return 1
    fi
}
|
|
```
|
|
|
|
## Error Handling
|
|
|
|
### File Not Found
|
|
```text
|
|
Exception: [Errno 2] No such file or directory
|
|
```
|
|
**Solution**: Verify file path and ensure the MessagePack file exists.
|
|
|
|
### Invalid Format
|
|
```text
|
|
Exception: Unpack failed
|
|
```
|
|
**Solution**: Verify the file is a valid MessagePack file with document embeddings.
|
|
|
|
### WebSocket Connection Issues
|
|
```text
|
|
Exception: Connection failed
|
|
```
|
|
**Solution**: Check API URL and ensure TrustGraph is running with WebSocket support.
|
|
|
|
### Memory Errors
|
|
```text
|
|
MemoryError: Unable to allocate memory
|
|
```
|
|
**Solution**: Process large files in smaller chunks or increase available memory.
|
|
|
|
### Flow Not Found
|
|
```text
|
|
Exception: Flow not found
|
|
```
|
|
**Solution**: Verify the flow ID exists with `tg-show-flows`.
|
|
|
|
## Integration with Other Commands
|
|
|
|
### Complete Workflow
|
|
```bash
|
|
# Complete document processing workflow: load PDFs, wait, save the
# resulting embeddings, then load them into the "processed-docs"
# collection.
# Arguments:
#   $1 - directory containing *.pdf files
#   $2 - seconds to wait for processing before saving
#        (optional, default 30)
# Side effects:
#   Writes embeddings.msgpack in the current directory.
# Returns:
#   0 on success; 1 if saving or loading the embeddings fails, or if any
#   PDF failed to load (remaining PDFs are still attempted).
process_documents_workflow() {
    local pdf_dir="$1"
    local wait_seconds="${2:-30}"
    local embeddings_file="embeddings.msgpack"
    local pdf
    local failed=0

    echo "Starting complete document workflow..."

    # 1. Load PDFs
    for pdf in "$pdf_dir"/*.pdf; do
        # When nothing matches, the glob stays as a literal pattern; skip it
        [ -f "$pdf" ] || continue
        if ! tg-load-pdf "$pdf"; then
            echo "Warning: failed to load $pdf" >&2
            failed=$((failed + 1))
        fi
    done

    # 2. Wait for processing (fixed delay; completion is not checked)
    sleep "$wait_seconds"

    # 3. Save embeddings; stop here on failure so a missing or stale
    #    file is never loaded in the next step
    if ! tg-save-doc-embeds -o "$embeddings_file"; then
        echo "Error: failed to save embeddings to $embeddings_file" >&2
        return 1
    fi

    # 4. Process embeddings (example: load to different collection)
    if ! tg-load-doc-embeds -i "$embeddings_file" --collection "processed-docs"; then
        echo "Error: failed to load embeddings from $embeddings_file" >&2
        return 1
    fi

    echo "Complete workflow finished"

    if [ "$failed" -gt 0 ]; then
        echo "$failed PDF(s) failed to load" >&2
        return 1
    fi
}
|
|
```
|
|
|
|
### Backup and Restore
|
|
```bash
|
|
# Complete backup and restore cycle: save the current document
# embeddings, then load them back into the "restored" collection.
# Side effects:
#   Writes embeddings-backup.msgpack in the current directory.
# Returns:
#   0 on success, 1 if either the backup or the restore step fails.
backup_restore_cycle() {
    local backup_file="embeddings-backup.msgpack"

    echo "Creating embeddings backup..."
    # Stop if the backup fails: restoring from a missing or partial
    # file would hide the problem
    if ! tg-save-doc-embeds -o "$backup_file"; then
        echo "Error: backup failed" >&2
        return 1
    fi

    echo "Simulating data loss..."
    # (In real scenario, this might be system failure)

    echo "Restoring from backup..."
    if ! tg-load-doc-embeds -i "$backup_file" --collection "restored"; then
        echo "Error: restore failed" >&2
        return 1
    fi

    echo "Backup/restore cycle complete"
}
|
|
```
|
|
|
|
## Environment Variables
|
|
|
|
- `TRUSTGRAPH_API`: Default API URL
|
|
|
|
## Related Commands
|
|
|
|
- [`tg-save-doc-embeds`](tg-save-doc-embeds.md) - Save document embeddings to MessagePack
|
|
- [`tg-dump-msgpack`](tg-dump-msgpack.md) - Analyze MessagePack files
|
|
- [`tg-load-pdf`](tg-load-pdf.md) - Load PDF documents for processing
|
|
- [`tg-show-flows`](tg-show-flows.md) - List available flows
|
|
|
|
## API Integration
|
|
|
|
This command uses TrustGraph's WebSocket API for document embeddings import, specifically the `/api/v1/flow/{flow-id}/import/document-embeddings` endpoint.
|
|
|
|
## Best Practices
|
|
|
|
1. **Validation**: Always validate MessagePack files before loading
|
|
2. **Backups**: Keep backups of original embedding files
|
|
3. **Monitoring**: Monitor memory usage and loading progress
|
|
4. **Chunking**: Process large files in manageable chunks
|
|
5. **Error Handling**: Implement robust error handling and retry logic
|
|
6. **Documentation**: Document the source and format of embedding files
|
|
7. **Testing**: Test loading procedures in non-production environments
|
|
|
|
## Troubleshooting
|
|
|
|
### Loading Stalls
|
|
```bash
|
|
# Check WebSocket connection
|
|
netstat -an | grep :8088
|
|
|
|
# Check system resources
|
|
free -h
|
|
df -h
|
|
```
|
|
|
|
### Incomplete Loading
|
|
```bash
|
|
# Compare input vs loaded data.
# grep -c counts the matching "de" (document embedding) records directly,
# without a separate wc process.
input_count=$(tg-dump-msgpack -i input.msgpack | grep -c '^\["de"')
echo "Input embeddings: $input_count"

# Check loaded data (would need query command)
# loaded_count=$(tg-query-embeddings --count)
# echo "Loaded embeddings: $loaded_count"
|
|
```
|
|
|
|
### Performance Issues
|
|
```bash
|
|
# Monitor network usage
|
|
iftop
|
|
|
|
# Check TrustGraph service logs
|
|
docker logs trustgraph-service
|
|
``` |