Backup and Recovery Guide
Comprehensive backup and recovery procedures for Axon OS, covering data protection, disaster recovery, and business continuity.
Overview
The Axon OS backup and recovery strategy follows the 3-2-1 rule:
- 3 copies of important data
- 2 different storage media types
- 1 offsite backup location
Backup Components
- Database: PostgreSQL data and schemas
- Configuration: System and application configurations
- Node Registry: Custom nodes and their dependencies
- File Storage: Uploaded files and workflow artifacts
- Logs: Application and audit logs
- Secrets: Encrypted secrets and certificates
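Throughout this guide, these components are written to subdirectories of /backups; the layout below shows the default paths used by the scripts that follow.
/backups/
├── postgresql/    # full database dumps (full_backup.sh)
├── config/        # encrypted configuration archives (config_backup.sh)
├── nodes/         # node archives and manifests (nodes_backup.sh)
├── artifacts/     # workflow artifact archives and checksums (artifacts_backup.sh)
└── incremental/   # rsync snapshots: current/ plus snapshots/<date>/ (incremental_files_backup.sh)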
Database Backup
PostgreSQL Backup Strategy
Full Database Backup
#!/bin/bash
# full_backup.sh
# Configuration
DB_HOST="localhost"
DB_PORT="5432"
DB_NAME="axonos"
DB_USER="axonos_user"
BACKUP_DIR="/backups/postgresql"
DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="axonos_full_${DATE}.sql"
# Create backup directory
mkdir -p "$BACKUP_DIR"
# Perform full backup (custom-format dump, restored later with pg_restore;
# the .sql suffix is kept so the cleanup and verification scripts below match it)
pg_dump -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" \
--verbose --clean --create --if-exists \
--format=custom --compress=9 \
--file="$BACKUP_DIR/$BACKUP_FILE" \
"$DB_NAME"
# Verify backup
if [ $? -eq 0 ]; then
echo "✅ Full backup completed: $BACKUP_FILE"
# Calculate backup size
BACKUP_SIZE=$(du -h "$BACKUP_DIR/$BACKUP_FILE" | cut -f1)
echo "📊 Backup size: $BACKUP_SIZE"
# Log backup completion
logger "Axon OS: Full database backup completed - $BACKUP_FILE ($BACKUP_SIZE)"
else
echo "❌ Backup failed!"
logger "Axon OS: Database backup FAILED"
exit 1
fi
# Cleanup old backups (keep last 7 days)
find "$BACKUP_DIR" -name "axonos_full_*.sql" -mtime +7 -delete
# Sync to remote storage
rsync -av "$BACKUP_DIR/$BACKUP_FILE" backup-server:/backups/axonos/
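full_backup.sh assumes pg_dump can authenticate without prompting. One standard way to arrange this is a ~/.pgpass entry for the account that runs the backup (a sketch; adjust host, database, and user to your deployment and replace the placeholder password):
# ~/.pgpass for the backup user (must be mode 0600 or libpq ignores it)
# format: hostname:port:database:username:password
echo 'localhost:5432:axonos:axonos_user:CHANGE_ME' >> ~/.pgpass
chmod 600 ~/.pgpass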
Incremental Backup with WAL-E
# Install WAL-E
pip install 'wal-e[aws]'
# Store credentials in an envdir so the PostgreSQL server process (which runs
# archive_command) can read them; the same envdir is used by the scripts below
mkdir -p /etc/wal-e.d/env
echo "your_access_key" > /etc/wal-e.d/env/AWS_ACCESS_KEY_ID
echo "your_secret_key" > /etc/wal-e.d/env/AWS_SECRET_ACCESS_KEY
echo "s3://your-bucket/wal-e" > /etc/wal-e.d/env/WALE_S3_PREFIX
echo "postgres" > /etc/wal-e.d/env/PGUSER
chown -R root:postgres /etc/wal-e.d
chmod 750 /etc/wal-e.d /etc/wal-e.d/env
chmod 640 /etc/wal-e.d/env/*
# Configuration in postgresql.conf
wal_level = replica
archive_mode = on
archive_command = 'envdir /etc/wal-e.d/env wal-e wal-push %p'
archive_timeout = 60
# Base backup
envdir /etc/wal-e.d/env wal-e backup-push /var/lib/postgresql/14/main
# Backup script
#!/bin/bash
# incremental_backup.sh
DATE=$(date +%Y%m%d_%H%M%S)
# Push a new base backup (continuous WAL archiving covers the changes in between)
envdir /etc/wal-e.d/env wal-e backup-push /var/lib/postgresql/14/main
if [ $? -eq 0 ]; then
echo "✅ Incremental backup completed: $DATE"
logger "Axon OS: Incremental database backup completed - $DATE"
else
echo "❌ Incremental backup failed!"
logger "Axon OS: Incremental database backup FAILED"
exit 1
fi
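To confirm that base backups and WAL segments are actually reaching S3, and to prune old base backups, WAL-E can list and delete what it has stored. A minimal check, assuming the envdir setup above:
# List base backups currently stored in S3
envdir /etc/wal-e.d/env wal-e backup-list
# Keep only the 5 most recent base backups (and the WAL needed to use them)
envdir /etc/wal-e.d/env wal-e delete --confirm retain 5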
Logical Backup for Specific Data
-- Backup specific workflows
COPY (
SELECT w.*, wn.* FROM workflows w
LEFT JOIN workflow_nodes wn ON w.id = wn.workflow_id
WHERE w.created_at >= '2024-01-01'
) TO '/backups/workflows_2024.csv' WITH CSV HEADER;
-- Backup user data
COPY (
SELECT id, username, email, created_at, last_login
FROM users
WHERE active = true
) TO '/backups/active_users.csv' WITH CSV HEADER;
-- Backup node registry
COPY (
SELECT * FROM node_registry
WHERE status = 'active'
) TO '/backups/node_registry.csv' WITH CSV HEADER;
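Server-side COPY ... TO writes files on the database server and requires superuser rights or the pg_write_server_files role. If the backup user lacks those privileges, the same exports can be run client-side with psql's \copy, which writes the file on the machine running psql. For example:
# Client-side export of active users (no server file access required)
psql -h localhost -U axonos_user -d axonos -c \
"\copy (SELECT id, username, email, created_at, last_login FROM users WHERE active = true) TO '/backups/active_users.csv' WITH CSV HEADER"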
Configuration Backup
System Configuration Backup
#!/bin/bash
# config_backup.sh
BACKUP_DIR="/backups/config"
DATE=$(date +%Y%m%d_%H%M%S)
CONFIG_BACKUP="axonos_config_${DATE}.tar.gz"
mkdir -p "$BACKUP_DIR"
# Backup configuration files
tar -czf "$BACKUP_DIR/$CONFIG_BACKUP" \
/etc/axonos/ \
/opt/axonos/config/ \
/etc/nginx/sites-available/axonos \
/etc/systemd/system/axonos.service \
/etc/postgresql/*/main/postgresql.conf \
/etc/postgresql/*/main/pg_hba.conf \
/etc/redis/redis.conf \
/etc/ssl/certs/axonos.* \
2>/dev/null
if [ $? -eq 0 ]; then
echo "✅ Configuration backup completed: $CONFIG_BACKUP"
# Encrypt sensitive configuration
gpg --cipher-algo AES256 --compress-algo 1 --symmetric \
--output "$BACKUP_DIR/${CONFIG_BACKUP}.gpg" \
"$BACKUP_DIR/$CONFIG_BACKUP"
# Remove unencrypted backup
rm "$BACKUP_DIR/$CONFIG_BACKUP"
echo "🔒 Configuration backup encrypted: ${CONFIG_BACKUP}.gpg"
else
echo "❌ Configuration backup failed!"
exit 1
fi
# Cleanup old config backups (keep last 30 days)
find "$BACKUP_DIR" -name "axonos_config_*.tar.gz.gpg" -mtime +30 -delete
Application Configuration Export
// config_export.ts
import * as fs from 'fs';
import * as yaml from 'js-yaml';
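// DatabaseConfig, RedisConfig, WorkflowConfig, SecurityConfig and NodeConfig are
// assumed to be defined elsewhere in the Axon OS codebase (not shown here)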
interface ConfigExport {
version: string;
timestamp: string;
database: DatabaseConfig;
redis: RedisConfig;
workflow: WorkflowConfig;
security: SecurityConfig;
nodes: NodeConfig[];
}
class ConfigurationBackup {
async exportConfiguration(): Promise<ConfigExport> {
const config: ConfigExport = {
version: process.env.APP_VERSION || 'unknown',
timestamp: new Date().toISOString(),
database: await this.exportDatabaseConfig(),
redis: await this.exportRedisConfig(),
workflow: await this.exportWorkflowConfig(),
security: await this.exportSecurityConfig(),
nodes: await this.exportNodeConfigs()
};
return config;
}
async saveConfiguration(config: ConfigExport, filePath: string): Promise<void> {
// Remove sensitive data
const sanitized = this.sanitizeConfig(config);
// Save as YAML
const yamlData = yaml.dump(sanitized, { indent: 2 });
await fs.promises.writeFile(filePath, yamlData, 'utf8');
console.log(`✅ Configuration exported to: ${filePath}`);
}
private sanitizeConfig(config: ConfigExport): ConfigExport {
const sanitized = JSON.parse(JSON.stringify(config));
// Remove passwords and secrets
if (sanitized.database.password) {
sanitized.database.password = '***REDACTED***';
}
if (sanitized.redis.password) {
sanitized.redis.password = '***REDACTED***';
}
if (sanitized.security.jwt_secret) {
sanitized.security.jwt_secret = '***REDACTED***';
}
return sanitized;
}
}
// Usage
const configBackup = new ConfigurationBackup();
async function backupConfiguration() {
try {
const config = await configBackup.exportConfiguration();
const fileName = `axonos_config_${Date.now()}.yml`;
await configBackup.saveConfiguration(config, `/backups/config/${fileName}`);
} catch (error) {
console.error('Configuration backup failed:', error);
process.exit(1);
}
}
backupConfiguration();
Node Registry Backup
Custom Nodes Backup
#!/bin/bash
# nodes_backup.sh
NODES_DIR="/opt/axonos/nodes"
BACKUP_DIR="/backups/nodes"
DATE=$(date +%Y%m%d_%H%M%S)
NODES_BACKUP="axonos_nodes_${DATE}.tar.gz"
mkdir -p "$BACKUP_DIR"
# Backup entire nodes directory
tar -czf "$BACKUP_DIR/$NODES_BACKUP" \
-C "$(dirname "$NODES_DIR")" \
--exclude="node_modules" \
--exclude="*.log" \
--exclude="tmp" \
"$(basename "$NODES_DIR")"
if [ $? -eq 0 ]; then
echo "✅ Nodes backup completed: $NODES_BACKUP"
# Create manifest of backed up nodes
find "$NODES_DIR" -name "package.json" -exec grep -l "axonos-node" {} \; | \
while read package_file; do
node_dir=$(dirname "$package_file")
node_name=$(jq -r '.name' "$package_file")
node_version=$(jq -r '.version' "$package_file")
echo "$node_name:$node_version:$node_dir"
done > "$BACKUP_DIR/nodes_manifest_${DATE}.txt"
echo "📋 Nodes manifest created: nodes_manifest_${DATE}.txt"
else
echo "❌ Nodes backup failed!"
exit 1
fi
# Cleanup old nodes backups (keep last 14 days)
find "$BACKUP_DIR" -name "axonos_nodes_*.tar.gz" -mtime +14 -delete
find "$BACKUP_DIR" -name "nodes_manifest_*.txt" -mtime +14 -delete
Node Dependencies Backup
// node_dependencies_backup.ts
import * as fs from 'fs';
import * as path from 'path';
import { exec } from 'child_process';
import { promisify } from 'util';
const execAsync = promisify(exec);
interface NodeDependency {
name: string;
version: string;
path: string;
dependencies: string[];
devDependencies: string[];
}
class NodeDependenciesBackup {
async backupDependencies(nodesDir: string): Promise<NodeDependency[]> {
const nodeDirs = await this.findNodeDirectories(nodesDir);
const dependencies: NodeDependency[] = [];
for (const nodeDir of nodeDirs) {
try {
const dependency = await this.analyzeDependencies(nodeDir);
dependencies.push(dependency);
} catch (error) {
console.error(`Error analyzing ${nodeDir}:`, error);
}
}
return dependencies;
}
private async findNodeDirectories(baseDir: string): Promise<string[]> {
const dirs: string[] = [];
const entries = await fs.promises.readdir(baseDir, { withFileTypes: true });
for (const entry of entries) {
if (entry.isDirectory()) {
const packageJsonPath = path.join(baseDir, entry.name, 'package.json');
try {
await fs.promises.access(packageJsonPath);
dirs.push(path.join(baseDir, entry.name));
} catch {
// No package.json, skip
}
}
}
return dirs;
}
private async analyzeDependencies(nodeDir: string): Promise<NodeDependency> {
const packageJsonPath = path.join(nodeDir, 'package.json');
const packageJson = JSON.parse(
await fs.promises.readFile(packageJsonPath, 'utf8')
);
return {
name: packageJson.name,
version: packageJson.version,
path: nodeDir,
dependencies: Object.keys(packageJson.dependencies || {}),
devDependencies: Object.keys(packageJson.devDependencies || {})
};
}
async saveDependenciesManifest(
dependencies: NodeDependency[],
filePath: string
): Promise<void> {
const manifest = {
timestamp: new Date().toISOString(),
total_nodes: dependencies.length,
dependencies: dependencies
};
await fs.promises.writeFile(
filePath,
JSON.stringify(manifest, null, 2),
'utf8'
);
console.log(`✅ Dependencies manifest saved: ${filePath}`);
}
}
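A minimal usage sketch, mirroring the configuration export above (the manifest path is an assumption consistent with the rest of this guide):
// Usage
const depsBackup = new NodeDependenciesBackup();
async function backupNodeDependencies() {
try {
const deps = await depsBackup.backupDependencies('/opt/axonos/nodes');
await depsBackup.saveDependenciesManifest(deps, `/backups/nodes/dependencies_${Date.now()}.json`);
} catch (error) {
console.error('Node dependencies backup failed:', error);
process.exit(1);
}
}
backupNodeDependencies();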
File Storage Backup
Workflow Artifacts Backup
#!/bin/bash
# artifacts_backup.sh
ARTIFACTS_DIR="/var/lib/axonos/artifacts"
BACKUP_DIR="/backups/artifacts"
DATE=$(date +%Y%m%d_%H%M%S)
ARTIFACTS_BACKUP="axonos_artifacts_${DATE}.tar.gz"
mkdir -p "$BACKUP_DIR"
# Calculate total size before backup
TOTAL_SIZE=$(du -sh "$ARTIFACTS_DIR" | cut -f1)
echo "📊 Total artifacts size: $TOTAL_SIZE"
# Backup artifacts with progress output (a message every 10,000 records)
tar -czf "$BACKUP_DIR/$ARTIFACTS_BACKUP" \
-C "$(dirname "$ARTIFACTS_DIR")" \
--checkpoint=10000 \
--checkpoint-action=echo="Checkpoint #%u" \
"$(basename "$ARTIFACTS_DIR")"
if [ $? -eq 0 ]; then
BACKUP_SIZE=$(du -sh "$BACKUP_DIR/$ARTIFACTS_BACKUP" | cut -f1)
echo "✅ Artifacts backup completed: $ARTIFACTS_BACKUP ($BACKUP_SIZE)"
# Generate checksums
cd "$BACKUP_DIR"
sha256sum "$ARTIFACTS_BACKUP" > "${ARTIFACTS_BACKUP}.sha256"
echo "🔍 Checksum generated: ${ARTIFACTS_BACKUP}.sha256"
else
echo "❌ Artifacts backup failed!"
exit 1
fi
# Sync to cloud storage
aws s3 cp "$BACKUP_DIR/$ARTIFACTS_BACKUP" \
s3://axonos-backups/artifacts/ \
--storage-class GLACIER
# Cleanup old artifacts backups (keep last 30 days locally)
find "$BACKUP_DIR" -name "axonos_artifacts_*.tar.gz" -mtime +30 -delete
find "$BACKUP_DIR" -name "axonos_artifacts_*.sha256" -mtime +30 -delete
Incremental File Backup with rsync
#!/bin/bash
# incremental_files_backup.sh
SOURCE_DIR="/var/lib/axonos"
BACKUP_DIR="/backups/incremental"
REMOTE_BACKUP="backup-server:/backups/axonos/incremental"
DATE=$(date +%Y%m%d_%H%M%S)
LOG_FILE="/var/log/axonos/backup_${DATE}.log"
# Create backup directories
mkdir -p "$BACKUP_DIR/current"
mkdir -p "$BACKUP_DIR/snapshots"
# Incremental backup using rsync
rsync -av \
--delete \
--backup \
--backup-dir="$BACKUP_DIR/snapshots/$DATE" \
--exclude="*.log" \
--exclude="tmp/" \
--exclude="cache/" \
--log-file="$LOG_FILE" \
"$SOURCE_DIR/" \
"$BACKUP_DIR/current/"
if [ $? -eq 0 ]; then
echo "✅ Incremental backup completed"
# Create snapshot link
ln -sfn "$BACKUP_DIR/snapshots/$DATE" "$BACKUP_DIR/latest"
# Sync to remote
rsync -av --delete "$BACKUP_DIR/" "$REMOTE_BACKUP/"
echo "📡 Synced to remote backup server"
else
echo "❌ Incremental backup failed!"
exit 1
fi
# Cleanup old snapshots (keep last 7 days; -mindepth 1 protects the snapshots dir itself)
find "$BACKUP_DIR/snapshots" -mindepth 1 -maxdepth 1 -type d -mtime +7 -exec rm -rf {} \;
Automated Backup Scheduling
Cron-based Backup Schedule
# /etc/cron.d/axonos-backup
# Full database backup daily at 2 AM
0 2 * * * axonos /opt/axonos/scripts/full_backup.sh
# Incremental backup every 4 hours
0 */4 * * * axonos /opt/axonos/scripts/incremental_backup.sh
# Configuration backup weekly on Sunday at 3 AM
0 3 * * 0 axonos /opt/axonos/scripts/config_backup.sh
# Nodes backup weekly on Saturday at 1 AM
0 1 * * 6 axonos /opt/axonos/scripts/nodes_backup.sh
# Artifacts backup daily at 11 PM
0 23 * * * axonos /opt/axonos/scripts/artifacts_backup.sh
# Cleanup old backups daily at 4 AM
0 4 * * * axonos /opt/axonos/scripts/cleanup_backups.sh
Systemd Timer-based Backup
# /etc/systemd/system/axonos-backup.service
[Unit]
Description=Axon OS Full Backup
Wants=axonos-backup.timer
[Service]
Type=oneshot
User=axonos
ExecStart=/opt/axonos/scripts/full_backup.sh
StandardOutput=journal
StandardError=journal
# /etc/systemd/system/axonos-backup.timer
[Unit]
Description=Run Axon OS backup daily
[Timer]
OnCalendar=daily
Persistent=true
RandomizedDelaySec=1800
[Install]
WantedBy=timers.target
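After installing both units, enable the timer and confirm the next scheduled run:
systemctl daemon-reload
systemctl enable --now axonos-backup.timer
systemctl list-timers axonos-backup.timer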
Backup Monitoring Script
#!/bin/bash
# backup_monitor.sh
BACKUP_DIR="/backups"
LOG_FILE="/var/log/axonos/backup_monitor.log"
ALERT_EMAIL="admin@axonos.dev"
# Check if backups are current
check_backup_freshness() {
local backup_type="$1"
local max_age_hours="$2"
latest_backup=$(find "$BACKUP_DIR" -name "*${backup_type}*" -type f -printf '%T@ %p\n' | sort -n | tail -1 | cut -d' ' -f2-)
if [ -z "$latest_backup" ]; then
echo "❌ No $backup_type backup found!"
return 1
fi
backup_age=$(( ($(date +%s) - $(stat -c %Y "$latest_backup")) / 3600 ))
if [ $backup_age -gt $max_age_hours ]; then
echo "⚠️ $backup_type backup is $backup_age hours old (max: $max_age_hours)"
return 1
else
echo "✅ $backup_type backup is current ($backup_age hours old)"
return 0
fi
}
# Main monitoring
{
echo "=== Backup Status Report - $(date) ==="
# Check different backup types
check_backup_freshness "full" 25 # Daily full backup
check_backup_freshness "config" 168 # Weekly config backup
check_backup_freshness "nodes" 168 # Weekly nodes backup
check_backup_freshness "artifacts" 25 # Daily artifacts backup
# Check backup sizes
echo -e "\n=== Backup Sizes ==="
du -sh "$BACKUP_DIR"/* 2>/dev/null | sort -hr
# Check available disk space
echo -e "\n=== Disk Space ==="
df -h "$BACKUP_DIR"
} | tee -a "$LOG_FILE"
# Send an alert e-mail if the daily full backup is stale
if ! check_backup_freshness "full" 25 >/dev/null 2>&1; then
mail -s "❌ Axon OS Backup Alert: Stale backups detected" "$ALERT_EMAIL" < "$LOG_FILE"
fi
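The monitor itself can be scheduled alongside the backups, for example as an extra entry in /etc/cron.d/axonos-backup (the script path mirrors the paths used above):
# Backup freshness report daily at 6 AM
0 6 * * * axonos /opt/axonos/scripts/backup_monitor.sh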
Disaster Recovery
Recovery Planning
Recovery Time Objectives (RTO)
- Database Recovery: 30 minutes
- Application Recovery: 15 minutes
- Full System Recovery: 2 hours
- Node Registry Recovery: 1 hour
Recovery Point Objectives (RPO)
- Database: 15 minutes (WAL archiving)
- Configuration: 1 week
- Artifacts: 24 hours
- Nodes: 1 week
Database Recovery Procedures
Full Database Restore
#!/bin/bash
# restore_database.sh
BACKUP_FILE="$1"
DB_NAME="axonos"
DB_USER="axonos_user"
if [ -z "$BACKUP_FILE" ]; then
echo "Usage: $0 <backup_file>"
exit 1
fi
echo "🔄 Starting database restore from: $BACKUP_FILE"
# Stop Axon OS service
systemctl stop axonos
# Drop and recreate database (terminate any remaining connections first,
# otherwise DROP DATABASE fails if monitoring or pooled sessions are still attached)
sudo -u postgres psql -c "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = '$DB_NAME';"
sudo -u postgres psql -c "DROP DATABASE IF EXISTS $DB_NAME;"
sudo -u postgres psql -c "CREATE DATABASE $DB_NAME OWNER $DB_USER;"
# Restore from backup
pg_restore -h localhost -U "$DB_USER" -d "$DB_NAME" \
--verbose --clean --if-exists \
"$BACKUP_FILE"
if [ $? -eq 0 ]; then
echo "✅ Database restore completed successfully"
# Run post-restore checks
echo "🔍 Running post-restore validation..."
# Check table counts
sudo -u postgres psql -d "$DB_NAME" -c "
SELECT schemaname, tablename, n_tup_ins as inserts, n_tup_upd as updates
FROM pg_stat_user_tables
ORDER BY schemaname, tablename;
"
# Start Axon OS service
systemctl start axonos
# Wait for service to be ready
sleep 30
# Validate application
if curl -f http://localhost:8080/health >/dev/null 2>&1; then
echo "✅ Application is healthy after restore"
else
echo "❌ Application health check failed"
exit 1
fi
else
echo "❌ Database restore failed!"
exit 1
fi
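Example invocation (the file name follows the naming scheme produced by full_backup.sh):
./restore_database.sh /backups/postgresql/axonos_full_20240115_020000.sql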
Point-in-Time Recovery
#!/bin/bash
# point_in_time_recovery.sh
TARGET_TIME="$1" # Format: 2024-01-15 14:30:00
if [ -z "$TARGET_TIME" ]; then
echo "Usage: $0 'YYYY-MM-DD HH:MM:SS'"
exit 1
fi
echo "🕐 Starting point-in-time recovery to: $TARGET_TIME"
# Stop PostgreSQL
systemctl stop postgresql
# Remove existing data directory
rm -rf /var/lib/postgresql/14/main/*
# Restore base backup using WAL-E
envdir /etc/wal-e.d/env wal-e backup-fetch /var/lib/postgresql/14/main LATEST
# Configure recovery (PostgreSQL 12+ no longer reads recovery.conf; recovery
# settings go into postgresql.auto.conf and a recovery.signal file triggers recovery)
cat >> /var/lib/postgresql/14/main/postgresql.auto.conf << EOF
restore_command = 'envdir /etc/wal-e.d/env wal-e wal-fetch "%f" "%p"'
recovery_target_time = '$TARGET_TIME'
recovery_target_timeline = 'latest'
EOF
touch /var/lib/postgresql/14/main/recovery.signal
# Fix permissions
chown -R postgres:postgres /var/lib/postgresql/14/main/
# Start PostgreSQL in recovery mode
systemctl start postgresql
echo "✅ Point-in-time recovery initiated"
echo "Monitor PostgreSQL logs to confirm recovery completion"
Application Recovery
Complete System Restore
#!/bin/bash
# complete_system_restore.sh
BACKUP_DATE="$1"
if [ -z "$BACKUP_DATE" ]; then
echo "Usage: $0 <backup_date_YYYYMMDD>"
exit 1
fi
echo "🔄 Starting complete system restore for date: $BACKUP_DATE"
# Stop all services
systemctl stop axonos
systemctl stop nginx
systemctl stop postgresql
systemctl stop redis
# Restore database
DB_BACKUP="/backups/postgresql/axonos_full_${BACKUP_DATE}_*.sql"
latest_db_backup=$(ls -t $DB_BACKUP 2>/dev/null | head -1)
if [ -n "$latest_db_backup" ]; then
echo "📊 Restoring database from: $latest_db_backup"
./restore_database.sh "$latest_db_backup"
else
echo "❌ No database backup found for $BACKUP_DATE"
exit 1
fi
# Restore configuration
CONFIG_BACKUP="/backups/config/axonos_config_${BACKUP_DATE}_*.tar.gz.gpg"
latest_config_backup=$(ls -t $CONFIG_BACKUP 2>/dev/null | head -1)
if [ -n "$latest_config_backup" ]; then
echo "⚙️ Restoring configuration from: $latest_config_backup"
# Decrypt and extract
gpg --decrypt "$latest_config_backup" | tar -xzf - -C /
# Reload systemd
systemctl daemon-reload
else
echo "⚠️ No configuration backup found for $BACKUP_DATE"
fi
# Restore nodes
NODES_BACKUP="/backups/nodes/axonos_nodes_${BACKUP_DATE}_*.tar.gz"
latest_nodes_backup=$(ls -t $NODES_BACKUP 2>/dev/null | head -1)
if [ -n "$latest_nodes_backup" ]; then
echo "🔧 Restoring nodes from: $latest_nodes_backup"
# Remove existing nodes
rm -rf /opt/axonos/nodes/*
# Extract nodes backup
tar -xzf "$latest_nodes_backup" -C /opt/axonos/
# Reinstall node dependencies
find /opt/axonos/nodes -name "package.json" -execdir npm install \;
else
echo "⚠️ No nodes backup found for $BACKUP_DATE"
fi
# Restore artifacts (if needed)
ARTIFACTS_BACKUP="/backups/artifacts/axonos_artifacts_${BACKUP_DATE}_*.tar.gz"
latest_artifacts_backup=$(ls -t $ARTIFACTS_BACKUP 2>/dev/null | head -1)
if [ -n "$latest_artifacts_backup" ]; then
echo "📁 Restoring artifacts from: $latest_artifacts_backup"
# Remove existing artifacts
rm -rf /var/lib/axonos/artifacts/*
# Extract artifacts backup
tar -xzf "$latest_artifacts_backup" -C /var/lib/axonos/
else
echo "⚠️ No artifacts backup found for $BACKUP_DATE"
fi
# Start services
systemctl start postgresql
systemctl start redis
sleep 10
systemctl start axonos
systemctl start nginx
echo "✅ Complete system restore completed"
echo "🔍 Verifying system health..."
# Health checks
sleep 30
if curl -f http://localhost:8080/health >/dev/null 2>&1; then
echo "✅ Axon OS is healthy"
else
echo "❌ Axon OS health check failed"
fi
if systemctl is-active nginx >/dev/null 2>&1; then
echo "✅ Nginx is running"
else
echo "❌ Nginx is not running"
fi
echo "🎉 System restore process completed!"
Backup Testing and Validation
Automated Backup Testing
#!/bin/bash
# test_backup_restore.sh
TEST_DIR="/tmp/backup_test_$(date +%s)"
TEST_DB="axonos_test_restore"
BACKUP_FILE="$1"
if [ -z "$BACKUP_FILE" ]; then
echo "Usage: $0 <backup_file>"
exit 1
fi
echo "🧪 Testing backup restore: $BACKUP_FILE"
# Create test environment
mkdir -p "$TEST_DIR"
# Create test database owned by the application user so pg_restore can create objects
sudo -u postgres psql -c "CREATE DATABASE $TEST_DB OWNER axonos_user;"
# Restore backup to test database
pg_restore -h localhost -U axonos_user -d "$TEST_DB" \
--verbose --clean --if-exists \
"$BACKUP_FILE"
if [ $? -eq 0 ]; then
echo "✅ Backup restore test successful"
# Validate data integrity
row_count=$(sudo -u postgres psql -d "$TEST_DB" -t -c "
SELECT COUNT(*) FROM workflows;
" | tr -d ' ')
echo "📊 Workflows in restored database: $row_count"
# Check for critical tables
tables=(workflows users nodes executions)
for table in "${tables[@]}"; do
exists=$(sudo -u postgres psql -d "$TEST_DB" -t -c "
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_name = '$table'
);
" | tr -d ' ')
if [ "$exists" = "t" ]; then
echo "✅ Table $table exists"
else
echo "❌ Table $table missing"
fi
done
else
echo "❌ Backup restore test failed!"
fi
# Cleanup
sudo -u postgres psql -c "DROP DATABASE $TEST_DB;"
rm -rf "$TEST_DIR"
echo "🧹 Cleanup completed"
Backup Verification Script
#!/usr/bin/env python3
# verify_backup.py
import os
import sys
import hashlib
import subprocess
import json
from datetime import datetime, timedelta
class BackupVerifier:
def __init__(self, backup_dir):
self.backup_dir = backup_dir
self.results = {
'timestamp': datetime.now().isoformat(),
'checks': [],
'status': 'unknown'
}
def verify_file_integrity(self, file_path):
"""Verify file integrity using checksums"""
checksum_file = f"{file_path}.sha256"
if not os.path.exists(checksum_file):
return {
'file': file_path,
'status': 'warning',
'message': 'No checksum file found'
}
# Calculate actual checksum
sha256_hash = hashlib.sha256()
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
sha256_hash.update(chunk)
actual_checksum = sha256_hash.hexdigest()
# Read expected checksum
with open(checksum_file, 'r') as f:
expected_checksum = f.read().strip().split()[0]
if actual_checksum == expected_checksum:
return {
'file': file_path,
'status': 'pass',
'message': 'Checksum verified'
}
else:
return {
'file': file_path,
'status': 'fail',
'message': f'Checksum mismatch: expected {expected_checksum}, got {actual_checksum}'
}
def verify_backup_freshness(self, max_age_hours=25):
"""Check if backups are recent enough"""
current_time = datetime.now()
threshold = current_time - timedelta(hours=max_age_hours)
backup_files = []
for root, dirs, files in os.walk(self.backup_dir):
for file in files:
if file.endswith(('.sql', '.tar.gz')):
file_path = os.path.join(root, file)
mtime = datetime.fromtimestamp(os.path.getmtime(file_path))
backup_files.append((file_path, mtime))
if not backup_files:
return {
'status': 'fail',
'message': 'No backup files found'
}
latest_backup = max(backup_files, key=lambda x: x[1])
latest_file, latest_time = latest_backup
if latest_time > threshold:
return {
'status': 'pass',
'message': f'Latest backup is recent: {latest_file} ({latest_time})'
}
else:
age_hours = (current_time - latest_time).total_seconds() / 3600
return {
'status': 'fail',
'message': f'Latest backup is too old: {age_hours:.1f} hours'
}
def test_database_backup(self, backup_file):
"""Test database backup by attempting a restore"""
test_db = f"test_restore_{int(datetime.now().timestamp())}"
try:
# Create test database
subprocess.run([
'sudo', '-u', 'postgres', 'psql', '-c',
f'CREATE DATABASE {test_db};'
], check=True, capture_output=True)
# Attempt restore
result = subprocess.run([
'pg_restore', '-h', 'localhost', '-U', 'axonos_user',
'-d', test_db, '--verbose', '--exit-on-error',
backup_file
], capture_output=True, text=True)
if result.returncode == 0:
status = 'pass'
message = 'Database backup restore test successful'
else:
status = 'fail'
message = f'Database restore failed: {result.stderr}'
except subprocess.CalledProcessError as e:
status = 'fail'
message = f'Test database creation failed: {e}'
finally:
# Cleanup test database
try:
subprocess.run([
'sudo', '-u', 'postgres', 'psql', '-c',
f'DROP DATABASE IF EXISTS {test_db};'
], check=True, capture_output=True)
except:
pass
return {
'file': backup_file,
'status': status,
'message': message
}
def run_verification(self):
"""Run all verification checks"""
print("🔍 Starting backup verification...")
# Check backup freshness
freshness_result = self.verify_backup_freshness()
self.results['checks'].append({
'check': 'backup_freshness',
**freshness_result
})
# Find and verify backup files
for root, dirs, files in os.walk(self.backup_dir):
for file in files:
file_path = os.path.join(root, file)
# Verify file integrity
if file.endswith(('.sql', '.tar.gz')):
integrity_result = self.verify_file_integrity(file_path)
self.results['checks'].append({
'check': 'file_integrity',
**integrity_result
})
# Test database backups
if file.endswith('.sql') and 'full' in file:
test_result = self.test_database_backup(file_path)
self.results['checks'].append({
'check': 'database_restore_test',
**test_result
})
# Determine overall status
failed_checks = [c for c in self.results['checks'] if c['status'] == 'fail']
warning_checks = [c for c in self.results['checks'] if c['status'] == 'warning']
if failed_checks:
self.results['status'] = 'fail'
elif warning_checks:
self.results['status'] = 'warning'
else:
self.results['status'] = 'pass'
# Print results
self.print_results()
return self.results
def print_results(self):
"""Print verification results"""
print(f"\n=== Backup Verification Results ===")
print(f"Overall Status: {self.results['status'].upper()}")
print(f"Timestamp: {self.results['timestamp']}")
print()
for check in self.results['checks']:
status_icon = {
'pass': '✅',
'warning': '⚠️',
'fail': '❌'
}.get(check['status'], '❓')
print(f"{status_icon} {check['check']}: {check['message']}")
if 'file' in check:
print(f" File: {check['file']}")
print()
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python3 verify_backup.py <backup_directory>")
sys.exit(1)
backup_dir = sys.argv[1]
verifier = BackupVerifier(backup_dir)
results = verifier.run_verification()
# Exit with appropriate code
if results['status'] == 'fail':
sys.exit(1)
elif results['status'] == 'warning':
sys.exit(2)
else:
sys.exit(0)
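Typical invocation, using the exit code to raise an alert (1 = failure, 2 = warnings, matching the script above; the install path is an assumption consistent with the rest of this guide):
python3 /opt/axonos/scripts/verify_backup.py /backups
status=$?
if [ "$status" -ne 0 ]; then
  logger -p user.warning "Axon OS: backup verification exited with status $status"
fi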
Cloud Backup Integration
AWS S3 Backup
#!/bin/bash
# s3_backup.sh
BACKUP_DIR="/backups"
S3_BUCKET="axonos-backups"
AWS_PROFILE="axonos-backup"
# Sync to S3 with encryption
aws s3 sync "$BACKUP_DIR" "s3://$S3_BUCKET" \
--profile "$AWS_PROFILE" \
--storage-class STANDARD_IA \
--server-side-encryption AES256 \
--exclude "*.tmp" \
--delete
# Create lifecycle policy for cost optimization
aws s3api put-bucket-lifecycle-configuration \
--profile "$AWS_PROFILE" \
--bucket "$S3_BUCKET" \
--lifecycle-configuration file://s3-lifecycle.json
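The lifecycle policy file referenced above is not shown elsewhere in this guide; a sketch of what it might contain (transition to Glacier after 30 days, expire after a year; tune the numbers to your retention policy):
cat > s3-lifecycle.json << 'EOF'
{
  "Rules": [
    {
      "ID": "axonos-backup-retention",
      "Status": "Enabled",
      "Filter": { "Prefix": "" },
      "Transitions": [
        { "Days": 30, "StorageClass": "GLACIER" }
      ],
      "Expiration": { "Days": 365 }
    }
  ]
}
EOF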
Google Cloud Storage Backup
#!/bin/bash
# gcs_backup.sh
BACKUP_DIR="/backups"
GCS_BUCKET="axonos-backups"
# Sync to Google Cloud Storage
gsutil -m rsync -r -d "$BACKUP_DIR" "gs://$GCS_BUCKET"
# Set lifecycle policy
gsutil lifecycle set gcs-lifecycle.json "gs://$GCS_BUCKET"
Backup Retention Policies
Retention Schedule
# backup_retention.yml
retention:
database:
full_backup:
daily: 7 # Keep 7 daily backups
weekly: 4 # Keep 4 weekly backups
monthly: 12 # Keep 12 monthly backups
incremental:
hourly: 24 # Keep 24 hourly backups
configuration:
weekly: 8 # Keep 8 weekly config backups
monthly: 12 # Keep 12 monthly config backups
nodes:
weekly: 4 # Keep 4 weekly node backups
monthly: 6 # Keep 6 monthly node backups
artifacts:
daily: 30 # Keep 30 daily artifact backups
weekly: 12 # Keep 12 weekly artifact backups
logs:
daily: 90 # Keep 90 days of logs
Automated Cleanup Script
#!/usr/bin/env python3
# cleanup_backups.py
import os
import re
import datetime
from pathlib import Path
import yaml
class BackupCleaner:
def __init__(self, config_file):
with open(config_file, 'r') as f:
self.config = yaml.safe_load(f)
def cleanup_backups(self, backup_dir):
"""Clean up old backups based on retention policy"""
backup_path = Path(backup_dir)
for backup_type, retention in self.config['retention'].items():
pattern = f"*{backup_type}*"
files = list(backup_path.glob(pattern))
if backup_type == 'database':
self.cleanup_database_backups(files, retention)
else:
self.cleanup_simple_backups(files, retention)
def cleanup_database_backups(self, files, retention):
"""Clean up database backups with complex retention"""
now = datetime.datetime.now()
# Separate full and incremental backups
full_backups = [f for f in files if 'full' in f.name]
incremental_backups = [f for f in files if 'incremental' in f.name]
# Clean full backups
if 'full_backup' in retention:
self.apply_retention_policy(full_backups, retention['full_backup'], now)
# Clean incremental backups
if 'incremental' in retention:
self.apply_retention_policy(incremental_backups, retention['incremental'], now)
def cleanup_simple_backups(self, files, retention):
"""Clean up simple backups"""
now = datetime.datetime.now()
self.apply_retention_policy(files, retention, now)
def apply_retention_policy(self, files, policy, now):
"""Apply retention policy to file list"""
# Sort files by modification time
files_with_mtime = [(f, datetime.datetime.fromtimestamp(f.stat().st_mtime)) for f in files]
files_with_mtime.sort(key=lambda x: x[1], reverse=True)
keep_files = set()
# Apply each retention rule
for period, count in policy.items():
if period == 'daily':
keep_files.update(self.select_daily_backups(files_with_mtime, count, now))
elif period == 'weekly':
keep_files.update(self.select_weekly_backups(files_with_mtime, count, now))
elif period == 'monthly':
keep_files.update(self.select_monthly_backups(files_with_mtime, count, now))
elif period == 'hourly':
keep_files.update(self.select_hourly_backups(files_with_mtime, count, now))
# Delete files not in keep list
for file_path, mtime in files_with_mtime:
if file_path not in keep_files:
print(f"🗑️ Deleting old backup: {file_path}")
file_path.unlink()
# Also delete checksum file if exists
checksum_file = file_path.with_suffix(file_path.suffix + '.sha256')
if checksum_file.exists():
checksum_file.unlink()
def select_daily_backups(self, files_with_mtime, count, now):
"""Select files for daily retention"""
selected = []
days_seen = set()
for file_path, mtime in files_with_mtime:
day_key = mtime.date()
if day_key not in days_seen and len(days_seen) < count:
selected.append(file_path)
days_seen.add(day_key)
return selected
def select_weekly_backups(self, files_with_mtime, count, now):
"""Select files for weekly retention"""
selected = []
weeks_seen = set()
for file_path, mtime in files_with_mtime:
week_key = mtime.isocalendar()[:2] # (year, week)
if week_key not in weeks_seen and len(weeks_seen) < count:
selected.append(file_path)
weeks_seen.add(week_key)
return selected
def select_monthly_backups(self, files_with_mtime, count, now):
"""Select files for monthly retention"""
selected = []
months_seen = set()
for file_path, mtime in files_with_mtime:
month_key = (mtime.year, mtime.month)
if month_key not in months_seen and len(months_seen) < count:
selected.append(file_path)
months_seen.add(month_key)
return selected
def select_hourly_backups(self, files_with_mtime, count, now):
"""Select files for hourly retention"""
# Simply keep the newest N files
return [f[0] for f in files_with_mtime[:count]]
if __name__ == "__main__":
import sys
if len(sys.argv) != 3:
print("Usage: python3 cleanup_backups.py <config_file> <backup_directory>")
sys.exit(1)
config_file = sys.argv[1]
backup_dir = sys.argv[2]
cleaner = BackupCleaner(config_file)
cleaner.cleanup_backups(backup_dir)
print("✅ Backup cleanup completed")