Backup and Recovery Guide

Comprehensive backup and recovery procedures for Axon OS, covering data protection, disaster recovery, and business continuity.

Overview

The Axon OS backup and recovery strategy follows the 3-2-1 rule:

  • 3 copies of important data
  • 2 different storage media types
  • 1 offsite backup location
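
As a rough sanity check, the sketch below counts what is present on each storage target. It is illustrative only and assumes the local /backups directory, the backup-server rsync target, and the axonos-backups S3 bucket used by the scripts later in this guide.

#!/bin/bash
# check_321.sh - rough 3-2-1 sanity check (illustrative sketch)

# Copy 1: local backup directory (first media type)
echo "Local backups:  $(find /backups -type f | wc -l) files"

# Copy 2: remote rsync server (second media type)
echo "Remote backups: $(ssh backup-server 'find /backups/axonos -type f | wc -l') files"

# Copy 3: offsite object storage
echo "Offsite (S3):   $(aws s3 ls s3://axonos-backups --recursive | wc -l) objects"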

Backup Components

  • Database: PostgreSQL data and schemas
  • Configuration: System and application configurations
  • Node Registry: Custom nodes and their dependencies
  • File Storage: Uploaded files and workflow artifacts
  • Logs: Application and audit logs
  • Secrets: Encrypted secrets and certificates
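
Each of these components is covered by a dedicated script in the sections that follow. A minimal wrapper that runs the main ones in sequence might look like the sketch below; the /opt/axonos/scripts paths match the cron schedule shown later in this guide.

#!/bin/bash
# run_all_backups.sh - illustrative wrapper around the scripts in this guide
set -e  # stop on the first failing step

/opt/axonos/scripts/full_backup.sh        # database
/opt/axonos/scripts/config_backup.sh      # system and application configuration (incl. certificates)
/opt/axonos/scripts/nodes_backup.sh       # node registry and custom nodes
/opt/axonos/scripts/artifacts_backup.sh   # file storage and workflow artifacts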

Database Backup

PostgreSQL Backup Strategy

Full Database Backup

#!/bin/bash
# full_backup.sh

# Configuration
DB_HOST="localhost"
DB_PORT="5432"
DB_NAME="axonos"
DB_USER="axonos_user"
BACKUP_DIR="/backups/postgresql"
DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="axonos_full_${DATE}.sql"

# Create backup directory
mkdir -p "$BACKUP_DIR"

# Perform full backup in custom format (required by pg_restore).
# --clean/--create are applied at restore time for custom-format archives,
# so they are not needed here. Authentication is assumed to come from
# ~/.pgpass or PGPASSWORD. The .sql extension is kept for consistency with
# the restore and verification scripts, even though this is a custom-format archive.
pg_dump -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" \
    --verbose \
    --format=custom --compress=9 \
    --file="$BACKUP_DIR/$BACKUP_FILE" \
    "$DB_NAME"

# Verify backup
if [ $? -eq 0 ]; then
    echo "✅ Full backup completed: $BACKUP_FILE"

    # Calculate backup size
    BACKUP_SIZE=$(du -h "$BACKUP_DIR/$BACKUP_FILE" | cut -f1)
    echo "📊 Backup size: $BACKUP_SIZE"

    # Log backup completion
    logger "Axon OS: Full database backup completed - $BACKUP_FILE ($BACKUP_SIZE)"
else
    echo "❌ Backup failed!"
    logger "Axon OS: Database backup FAILED"
    exit 1
fi

# Cleanup old backups (keep last 7 days)
find "$BACKUP_DIR" -name "axonos_full_*.sql" -mtime +7 -delete

# Sync to remote storage
rsync -av "$BACKUP_DIR/$BACKUP_FILE" backup-server:/backups/axonos/

Incremental Backup with WAL-E

# Install WAL-E
pip install 'wal-e[aws]'

# Configuration in postgresql.conf
wal_level = replica
archive_mode = on
archive_command = 'envdir /etc/wal-e.d/env wal-e wal-push %p'
archive_timeout = 60

# Environment variables (alternatively, store each of these as a file under
# /etc/wal-e.d/env and run WAL-E through envdir, as the scripts below do)
export AWS_ACCESS_KEY_ID="your_access_key"
export AWS_SECRET_ACCESS_KEY="your_secret_key"
export WALE_S3_PREFIX="s3://your-bucket/wal-e"
export PGUSER="postgres"

# Base backup
wal-e backup-push /var/lib/postgresql/14/main

# Backup script
#!/bin/bash
# incremental_backup.sh
# WAL archiving provides the continuous, incremental part; this script pushes
# a fresh base backup for recovery to start from.

DATE=$(date +%Y%m%d_%H%M%S)

# Perform base backup
envdir /etc/wal-e.d/env wal-e backup-push /var/lib/postgresql/14/main

if [ $? -eq 0 ]; then
    echo "✅ Incremental backup completed: $DATE"
    logger "Axon OS: Incremental database backup completed - $DATE"
else
    echo "❌ Incremental backup failed!"
    logger "Axon OS: Incremental database backup FAILED"
    exit 1
fi
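
To confirm that base backups are actually reaching the bucket, WAL-E can list what it has stored:

# List base backups currently stored under $WALE_S3_PREFIX
envdir /etc/wal-e.d/env wal-e backup-list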

Logical Backup for Specific Data

-- NOTE: COPY ... TO a server-side file runs as the PostgreSQL server process
-- and requires superuser or the pg_write_server_files role.

-- Backup specific workflows
COPY (
    SELECT w.*, wn.*
    FROM workflows w
    LEFT JOIN workflow_nodes wn ON w.id = wn.workflow_id
    WHERE w.created_at >= '2024-01-01'
) TO '/backups/workflows_2024.csv' WITH CSV HEADER;

-- Backup user data
COPY (
    SELECT id, username, email, created_at, last_login
    FROM users
    WHERE active = true
) TO '/backups/active_users.csv' WITH CSV HEADER;

-- Backup node registry
COPY (
    SELECT * FROM node_registry
    WHERE status = 'active'
) TO '/backups/node_registry.csv' WITH CSV HEADER;
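
These CSV exports can be loaded back with COPY ... FROM. A hedged example, assuming the target table already exists with matching columns and that you have privileges for server-side COPY (otherwise use psql's \copy, which reads the file client-side):

# Reload the node registry export into an existing node_registry table
psql -U axonos_user -d axonos \
    -c "COPY node_registry FROM '/backups/node_registry.csv' WITH CSV HEADER;"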

Configuration Backup

System Configuration Backup

#!/bin/bash
# config_backup.sh

BACKUP_DIR="/backups/config"
DATE=$(date +%Y%m%d_%H%M%S)
CONFIG_BACKUP="axonos_config_${DATE}.tar.gz"

mkdir -p "$BACKUP_DIR"

# Backup configuration files
# NOTE: tar exits non-zero if any listed path is missing; prune the list to
# match the paths that actually exist on your installation.
tar -czf "$BACKUP_DIR/$CONFIG_BACKUP" \
    /etc/axonos/ \
    /opt/axonos/config/ \
    /etc/nginx/sites-available/axonos \
    /etc/systemd/system/axonos.service \
    /etc/postgresql/*/main/postgresql.conf \
    /etc/postgresql/*/main/pg_hba.conf \
    /etc/redis/redis.conf \
    /etc/ssl/certs/axonos.* \
    2>/dev/null

if [ $? -eq 0 ]; then
    echo "✅ Configuration backup completed: $CONFIG_BACKUP"

    # Encrypt sensitive configuration
    # NOTE: for unattended (cron) runs, supply the passphrase non-interactively,
    # e.g. with --batch --passphrase-file <file>.
    gpg --cipher-algo AES256 --compress-algo 1 --symmetric \
        --output "$BACKUP_DIR/${CONFIG_BACKUP}.gpg" \
        "$BACKUP_DIR/$CONFIG_BACKUP"

    # Remove unencrypted backup
    rm "$BACKUP_DIR/$CONFIG_BACKUP"

    echo "🔒 Configuration backup encrypted: ${CONFIG_BACKUP}.gpg"
else
    echo "❌ Configuration backup failed!"
    exit 1
fi

# Cleanup old config backups (keep last 30 days)
find "$BACKUP_DIR" -name "axonos_config_*.tar.gz.gpg" -mtime +30 -delete

Application Configuration Export

// config_export.ts
import * as fs from 'fs';
import * as yaml from 'js-yaml';

// DatabaseConfig, RedisConfig, WorkflowConfig, SecurityConfig, NodeConfig and
// the export*Config() helpers used below are assumed to be defined elsewhere
// in the application.
interface ConfigExport {
  version: string;
  timestamp: string;
  database: DatabaseConfig;
  redis: RedisConfig;
  workflow: WorkflowConfig;
  security: SecurityConfig;
  nodes: NodeConfig[];
}

class ConfigurationBackup {
  async exportConfiguration(): Promise<ConfigExport> {
    const config: ConfigExport = {
      version: process.env.APP_VERSION || 'unknown',
      timestamp: new Date().toISOString(),
      database: await this.exportDatabaseConfig(),
      redis: await this.exportRedisConfig(),
      workflow: await this.exportWorkflowConfig(),
      security: await this.exportSecurityConfig(),
      nodes: await this.exportNodeConfigs()
    };

    return config;
  }

  async saveConfiguration(config: ConfigExport, filePath: string): Promise<void> {
    // Remove sensitive data
    const sanitized = this.sanitizeConfig(config);

    // Save as YAML
    const yamlData = yaml.dump(sanitized, { indent: 2 });
    await fs.promises.writeFile(filePath, yamlData, 'utf8');

    console.log(`✅ Configuration exported to: ${filePath}`);
  }

  private sanitizeConfig(config: ConfigExport): ConfigExport {
    const sanitized = JSON.parse(JSON.stringify(config));

    // Remove passwords and secrets
    if (sanitized.database.password) {
      sanitized.database.password = '***REDACTED***';
    }
    if (sanitized.redis.password) {
      sanitized.redis.password = '***REDACTED***';
    }
    if (sanitized.security.jwt_secret) {
      sanitized.security.jwt_secret = '***REDACTED***';
    }

    return sanitized;
  }
}

// Usage
const configBackup = new ConfigurationBackup();

async function backupConfiguration() {
  try {
    const config = await configBackup.exportConfiguration();
    const fileName = `axonos_config_${Date.now()}.yml`;
    await configBackup.saveConfiguration(config, `/backups/config/${fileName}`);
  } catch (error) {
    console.error('Configuration backup failed:', error);
    process.exit(1);
  }
}

backupConfiguration();

Node Registry Backup

Custom Nodes Backup

#!/bin/bash
# nodes_backup.sh

NODES_DIR="/opt/axonos/nodes"
BACKUP_DIR="/backups/nodes"
DATE=$(date +%Y%m%d_%H%M%S)
NODES_BACKUP="axonos_nodes_${DATE}.tar.gz"

mkdir -p "$BACKUP_DIR"

# Backup entire nodes directory
tar -czf "$BACKUP_DIR/$NODES_BACKUP" \
    -C "$(dirname "$NODES_DIR")" \
    --exclude="node_modules" \
    --exclude="*.log" \
    --exclude="tmp" \
    "$(basename "$NODES_DIR")"

if [ $? -eq 0 ]; then
    echo "✅ Nodes backup completed: $NODES_BACKUP"

    # Create manifest of backed up nodes (skip dependency packages)
    find "$NODES_DIR" -not -path "*/node_modules/*" -name "package.json" \
        -exec grep -l "axonos-node" {} \; |
        while read -r package_file; do
            node_dir=$(dirname "$package_file")
            node_name=$(jq -r '.name' "$package_file")
            node_version=$(jq -r '.version' "$package_file")
            echo "$node_name:$node_version:$node_dir"
        done > "$BACKUP_DIR/nodes_manifest_${DATE}.txt"

    echo "📋 Nodes manifest created: nodes_manifest_${DATE}.txt"
else
    echo "❌ Nodes backup failed!"
    exit 1
fi

# Cleanup old nodes backups (keep last 14 days)
find "$BACKUP_DIR" -name "axonos_nodes_*.tar.gz" -mtime +14 -delete
find "$BACKUP_DIR" -name "nodes_manifest_*.txt" -mtime +14 -delete

Node Dependencies Backup

// node_dependencies_backup.ts
import * as fs from 'fs';
import * as path from 'path';

interface NodeDependency {
  name: string;
  version: string;
  path: string;
  dependencies: string[];
  devDependencies: string[];
}

class NodeDependenciesBackup {
  async backupDependencies(nodesDir: string): Promise<NodeDependency[]> {
    const nodeDirs = await this.findNodeDirectories(nodesDir);
    const dependencies: NodeDependency[] = [];

    for (const nodeDir of nodeDirs) {
      try {
        const dependency = await this.analyzeDependencies(nodeDir);
        dependencies.push(dependency);
      } catch (error) {
        console.error(`Error analyzing ${nodeDir}:`, error);
      }
    }

    return dependencies;
  }

  private async findNodeDirectories(baseDir: string): Promise<string[]> {
    const dirs: string[] = [];

    const entries = await fs.promises.readdir(baseDir, { withFileTypes: true });

    for (const entry of entries) {
      if (entry.isDirectory()) {
        const packageJsonPath = path.join(baseDir, entry.name, 'package.json');

        try {
          await fs.promises.access(packageJsonPath);
          dirs.push(path.join(baseDir, entry.name));
        } catch {
          // No package.json, skip
        }
      }
    }

    return dirs;
  }

  private async analyzeDependencies(nodeDir: string): Promise<NodeDependency> {
    const packageJsonPath = path.join(nodeDir, 'package.json');
    const packageJson = JSON.parse(
      await fs.promises.readFile(packageJsonPath, 'utf8')
    );

    return {
      name: packageJson.name,
      version: packageJson.version,
      path: nodeDir,
      dependencies: Object.keys(packageJson.dependencies || {}),
      devDependencies: Object.keys(packageJson.devDependencies || {})
    };
  }

  async saveDependenciesManifest(
    dependencies: NodeDependency[],
    filePath: string
  ): Promise<void> {
    const manifest = {
      timestamp: new Date().toISOString(),
      total_nodes: dependencies.length,
      dependencies: dependencies
    };

    await fs.promises.writeFile(
      filePath,
      JSON.stringify(manifest, null, 2),
      'utf8'
    );

    console.log(`✅ Dependencies manifest saved: ${filePath}`);
  }
}

File Storage Backup

Workflow Artifacts Backup

#!/bin/bash
# artifacts_backup.sh

ARTIFACTS_DIR="/var/lib/axonos/artifacts"
BACKUP_DIR="/backups/artifacts"
DATE=$(date +%Y%m%d_%H%M%S)
ARTIFACTS_BACKUP="axonos_artifacts_${DATE}.tar.gz"

mkdir -p "$BACKUP_DIR"

# Calculate total size before backup
TOTAL_SIZE=$(du -sh "$ARTIFACTS_DIR" | cut -f1)
echo "📊 Total artifacts size: $TOTAL_SIZE"

# Backup artifacts with progress (one message every 10000 records)
tar -czf "$BACKUP_DIR/$ARTIFACTS_BACKUP" \
    -C "$(dirname "$ARTIFACTS_DIR")" \
    --checkpoint=10000 \
    --checkpoint-action=echo="Processed checkpoint #%u" \
    "$(basename "$ARTIFACTS_DIR")"

if [ $? -eq 0 ]; then
    BACKUP_SIZE=$(du -sh "$BACKUP_DIR/$ARTIFACTS_BACKUP" | cut -f1)
    echo "✅ Artifacts backup completed: $ARTIFACTS_BACKUP ($BACKUP_SIZE)"

    # Generate checksums
    cd "$BACKUP_DIR" || exit 1
    sha256sum "$ARTIFACTS_BACKUP" > "${ARTIFACTS_BACKUP}.sha256"
    echo "🔍 Checksum generated: ${ARTIFACTS_BACKUP}.sha256"
else
    echo "❌ Artifacts backup failed!"
    exit 1
fi

# Sync to cloud storage
aws s3 cp "$BACKUP_DIR/$ARTIFACTS_BACKUP" \
    s3://axonos-backups/artifacts/ \
    --storage-class GLACIER

# Cleanup old artifacts backups (keep last 30 days locally)
find "$BACKUP_DIR" -name "axonos_artifacts_*.tar.gz" -mtime +30 -delete
find "$BACKUP_DIR" -name "axonos_artifacts_*.sha256" -mtime +30 -delete

Incremental File Backup with rsync

#!/bin/bash
# incremental_files_backup.sh

SOURCE_DIR="/var/lib/axonos"
BACKUP_DIR="/backups/incremental"
REMOTE_BACKUP="backup-server:/backups/axonos/incremental"
DATE=$(date +%Y%m%d_%H%M%S)
LOG_FILE="/var/log/axonos/backup_${DATE}.log"

# Create backup directories
mkdir -p "$BACKUP_DIR/current"
mkdir -p "$BACKUP_DIR/snapshots"

# Incremental backup using rsync: "current" mirrors the source, while files
# that were changed or deleted are moved into a dated snapshot directory.
rsync -av \
    --delete \
    --backup \
    --backup-dir="$BACKUP_DIR/snapshots/$DATE" \
    --exclude="*.log" \
    --exclude="tmp/" \
    --exclude="cache/" \
    --log-file="$LOG_FILE" \
    "$SOURCE_DIR/" \
    "$BACKUP_DIR/current/"

if [ $? -eq 0 ]; then
    echo "✅ Incremental backup completed"

    # Create snapshot link
    ln -sfn "$BACKUP_DIR/snapshots/$DATE" "$BACKUP_DIR/latest"

    # Sync to remote
    rsync -av --delete "$BACKUP_DIR/" "$REMOTE_BACKUP/"

    echo "📡 Synced to remote backup server"
else
    echo "❌ Incremental backup failed!"
    exit 1
fi

# Cleanup old snapshots (keep last 7 days); -mindepth 1 protects the snapshots directory itself
find "$BACKUP_DIR/snapshots" -mindepth 1 -maxdepth 1 -type d -mtime +7 -exec rm -rf {} \;

Automated Backup Scheduling

Cron-based Backup Schedule

# /etc/cron.d/axonos-backup

# Full database backup daily at 2 AM
0 2 * * * axonos /opt/axonos/scripts/full_backup.sh

# Incremental backup every 4 hours
0 */4 * * * axonos /opt/axonos/scripts/incremental_backup.sh

# Configuration backup weekly on Sunday at 3 AM
0 3 * * 0 axonos /opt/axonos/scripts/config_backup.sh

# Nodes backup weekly on Saturday at 1 AM
0 1 * * 6 axonos /opt/axonos/scripts/nodes_backup.sh

# Artifacts backup daily at 11 PM
0 23 * * * axonos /opt/axonos/scripts/artifacts_backup.sh

# Cleanup old backups daily at 4 AM
0 4 * * * axonos /opt/axonos/scripts/cleanup_backups.sh
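
Cron jobs in /etc/cron.d run with a minimal environment, so it can help to set MAILTO and PATH at the top of the file; an illustrative header:

# Optional header for /etc/cron.d/axonos-backup
MAILTO=admin@axonos.dev
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin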

Systemd Timer-based Backup

# /etc/systemd/system/axonos-backup.service
[Unit]
Description=Axon OS Full Backup

[Service]
Type=oneshot
User=axonos
ExecStart=/opt/axonos/scripts/full_backup.sh
StandardOutput=journal
StandardError=journal

# /etc/systemd/system/axonos-backup.timer
# The timer activates the service with the same name; no explicit
# Requires= is needed (it would start the backup as soon as the timer starts).
[Unit]
Description=Run Axon OS backup daily

[Timer]
OnCalendar=daily
Persistent=true
RandomizedDelaySec=1800

[Install]
WantedBy=timers.target
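
Enable the timer (rather than the service) so the schedule takes effect, and confirm it is registered:

systemctl daemon-reload
systemctl enable --now axonos-backup.timer
systemctl list-timers axonos-backup.timer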

Backup Monitoring Script

#!/bin/bash
# backup_monitor.sh

BACKUP_DIR="/backups"
LOG_FILE="/var/log/axonos/backup_monitor.log"
ALERT_EMAIL="admin@axonos.dev"

# Check if backups are current
check_backup_freshness() {
    local backup_type="$1"
    local max_age_hours="$2"

    latest_backup=$(find "$BACKUP_DIR" -name "*${backup_type}*" -type f -printf '%T@ %p\n' | sort -n | tail -1 | cut -d' ' -f2-)

    if [ -z "$latest_backup" ]; then
        echo "❌ No $backup_type backup found!"
        return 1
    fi

    backup_age=$(( ($(date +%s) - $(stat -c %Y "$latest_backup")) / 3600 ))

    if [ $backup_age -gt $max_age_hours ]; then
        echo "⚠️ $backup_type backup is $backup_age hours old (max: $max_age_hours)"
        return 1
    else
        echo "✅ $backup_type backup is current ($backup_age hours old)"
        return 0
    fi
}

# Main monitoring
{
    echo "=== Backup Status Report - $(date) ==="

    # Check different backup types
    check_backup_freshness "full" 25        # Daily full backup
    check_backup_freshness "config" 168     # Weekly config backup
    check_backup_freshness "nodes" 168      # Weekly nodes backup
    check_backup_freshness "artifacts" 25   # Daily artifacts backup

    # Check backup sizes
    echo -e "\n=== Backup Sizes ==="
    du -sh "$BACKUP_DIR"/* 2>/dev/null | sort -hr

    # Check available disk space
    echo -e "\n=== Disk Space ==="
    df -h "$BACKUP_DIR"

} | tee -a "$LOG_FILE"

# Send alert if any backup is stale
if ! check_backup_freshness "full" 25 >/dev/null 2>&1; then
    mail -s "❌ Axon OS Backup Alert: Stale backups detected" "$ALERT_EMAIL" < "$LOG_FILE"
fi

Disaster Recovery

Recovery Planning

Recovery Time Objectives (RTO)

  • Database Recovery: 30 minutes
  • Application Recovery: 15 minutes
  • Full System Recovery: 2 hours
  • Node Registry Recovery: 1 hour

Recovery Point Objectives (RPO)

  • Database: 15 minutes (WAL archiving)
  • Configuration: 1 week
  • Artifacts: 24 hours
  • Nodes: 1 week

Database Recovery Procedures

Full Database Restore

#!/bin/bash
# restore_database.sh

BACKUP_FILE="$1"
DB_NAME="axonos"
DB_USER="axonos_user"

if [ -z "$BACKUP_FILE" ]; then
    echo "Usage: $0 <backup_file>"
    exit 1
fi

echo "🔄 Starting database restore from: $BACKUP_FILE"

# Stop Axon OS service
systemctl stop axonos

# Drop and recreate database
sudo -u postgres psql -c "DROP DATABASE IF EXISTS $DB_NAME;"
sudo -u postgres psql -c "CREATE DATABASE $DB_NAME OWNER $DB_USER;"

# Restore from backup (expects the custom-format dump created by full_backup.sh)
pg_restore -h localhost -U "$DB_USER" -d "$DB_NAME" \
    --verbose --clean --if-exists \
    "$BACKUP_FILE"

if [ $? -eq 0 ]; then
    echo "✅ Database restore completed successfully"

    # Run post-restore checks
    echo "🔍 Running post-restore validation..."

    # Check table counts
    sudo -u postgres psql -d "$DB_NAME" -c "
        SELECT schemaname, tablename, n_tup_ins AS inserts, n_tup_upd AS updates
        FROM pg_stat_user_tables
        ORDER BY schemaname, tablename;
    "

    # Start Axon OS service
    systemctl start axonos

    # Wait for service to be ready
    sleep 30

    # Validate application
    if curl -f http://localhost:8080/health >/dev/null 2>&1; then
        echo "✅ Application is healthy after restore"
    else
        echo "❌ Application health check failed"
        exit 1
    fi

else
    echo "❌ Database restore failed!"
    exit 1
fi

Point-in-Time Recovery

#!/bin/bash
# point_in_time_recovery.sh

TARGET_TIME="$1"   # Format: 2024-01-15 14:30:00

if [ -z "$TARGET_TIME" ]; then
    echo "Usage: $0 'YYYY-MM-DD HH:MM:SS'"
    exit 1
fi

echo "🕐 Starting point-in-time recovery to: $TARGET_TIME"

# Stop PostgreSQL
systemctl stop postgresql

# Remove existing data directory
rm -rf /var/lib/postgresql/14/main/*

# Restore base backup using WAL-E
envdir /etc/wal-e.d/env wal-e backup-fetch /var/lib/postgresql/14/main LATEST

# Configure recovery (PostgreSQL 12+ no longer reads recovery.conf; the
# recovery settings go into postgresql.auto.conf plus a recovery.signal file)
cat >> /var/lib/postgresql/14/main/postgresql.auto.conf << EOF
restore_command = 'envdir /etc/wal-e.d/env wal-e wal-fetch "%f" "%p"'
recovery_target_time = '$TARGET_TIME'
recovery_target_timeline = 'latest'
recovery_target_action = 'promote'
EOF

# Signal targeted recovery on next startup
touch /var/lib/postgresql/14/main/recovery.signal

# Fix permissions
chown -R postgres:postgres /var/lib/postgresql/14/main/

# Start PostgreSQL in recovery mode
systemctl start postgresql

echo "✅ Point-in-time recovery initiated"
echo "Monitor PostgreSQL logs to confirm recovery completion"

Application Recovery

Complete System Restore

#!/bin/bash
# complete_system_restore.sh

BACKUP_DATE="$1"

if [ -z "$BACKUP_DATE" ]; then
    echo "Usage: $0 <backup_date_YYYYMMDD>"
    exit 1
fi

echo "🔄 Starting complete system restore for date: $BACKUP_DATE"

# Stop all services
systemctl stop axonos
systemctl stop nginx
systemctl stop postgresql
systemctl stop redis

# PostgreSQL must be running for the database restore
systemctl start postgresql

# Restore database
# (the glob variables below are intentionally unquoted so the shell expands them)
DB_BACKUP="/backups/postgresql/axonos_full_${BACKUP_DATE}_*.sql"
latest_db_backup=$(ls -t $DB_BACKUP 2>/dev/null | head -1)

if [ -n "$latest_db_backup" ]; then
    echo "📊 Restoring database from: $latest_db_backup"
    ./restore_database.sh "$latest_db_backup"
else
    echo "❌ No database backup found for $BACKUP_DATE"
    exit 1
fi

# Restore configuration
CONFIG_BACKUP="/backups/config/axonos_config_${BACKUP_DATE}_*.tar.gz.gpg"
latest_config_backup=$(ls -t $CONFIG_BACKUP 2>/dev/null | head -1)

if [ -n "$latest_config_backup" ]; then
    echo "⚙️ Restoring configuration from: $latest_config_backup"

    # Decrypt and extract
    gpg --decrypt "$latest_config_backup" | tar -xzf - -C /

    # Reload systemd
    systemctl daemon-reload
else
    echo "⚠️ No configuration backup found for $BACKUP_DATE"
fi

# Restore nodes
NODES_BACKUP="/backups/nodes/axonos_nodes_${BACKUP_DATE}_*.tar.gz"
latest_nodes_backup=$(ls -t $NODES_BACKUP 2>/dev/null | head -1)

if [ -n "$latest_nodes_backup" ]; then
    echo "🔧 Restoring nodes from: $latest_nodes_backup"

    # Remove existing nodes
    rm -rf /opt/axonos/nodes/*

    # Extract nodes backup
    tar -xzf "$latest_nodes_backup" -C /opt/axonos/

    # Reinstall node dependencies
    find /opt/axonos/nodes -name "package.json" -not -path "*/node_modules/*" -execdir npm install \;
else
    echo "⚠️ No nodes backup found for $BACKUP_DATE"
fi

# Restore artifacts (if needed)
ARTIFACTS_BACKUP="/backups/artifacts/axonos_artifacts_${BACKUP_DATE}_*.tar.gz"
latest_artifacts_backup=$(ls -t $ARTIFACTS_BACKUP 2>/dev/null | head -1)

if [ -n "$latest_artifacts_backup" ]; then
    echo "📁 Restoring artifacts from: $latest_artifacts_backup"

    # Remove existing artifacts
    rm -rf /var/lib/axonos/artifacts/*

    # Extract artifacts backup
    tar -xzf "$latest_artifacts_backup" -C /var/lib/axonos/
else
    echo "⚠️ No artifacts backup found for $BACKUP_DATE"
fi

# Start services (restart PostgreSQL so it picks up any restored configuration)
systemctl restart postgresql
systemctl start redis
sleep 10
systemctl start axonos
systemctl start nginx

echo "✅ Complete system restore completed"
echo "🔍 Verifying system health..."

# Health checks
sleep 30

if curl -f http://localhost:8080/health >/dev/null 2>&1; then
    echo "✅ Axon OS is healthy"
else
    echo "❌ Axon OS health check failed"
fi

if systemctl is-active nginx >/dev/null 2>&1; then
    echo "✅ Nginx is running"
else
    echo "❌ Nginx is not running"
fi

echo "🎉 System restore process completed!"

Backup Testing and Validation

Automated Backup Testing

#!/bin/bash
# test_backup_restore.sh

TEST_DIR="/tmp/backup_test_$(date +%s)"
TEST_DB="axonos_test_restore"
BACKUP_FILE="$1"

if [ -z "$BACKUP_FILE" ]; then
    echo "Usage: $0 <backup_file>"
    exit 1
fi

echo "🧪 Testing backup restore: $BACKUP_FILE"

# Create test environment
mkdir -p "$TEST_DIR"

# Create test database
sudo -u postgres psql -c "CREATE DATABASE $TEST_DB;"

# Restore backup to test database
pg_restore -h localhost -U axonos_user -d "$TEST_DB" \
    --verbose --clean --if-exists \
    "$BACKUP_FILE"

if [ $? -eq 0 ]; then
    echo "✅ Backup restore test successful"

    # Validate data integrity
    row_count=$(sudo -u postgres psql -d "$TEST_DB" -t -c "
        SELECT COUNT(*) FROM workflows;
    " | tr -d ' ')

    echo "📊 Workflows in restored database: $row_count"

    # Check for critical tables
    tables=(workflows users nodes executions)
    for table in "${tables[@]}"; do
        exists=$(sudo -u postgres psql -d "$TEST_DB" -t -c "
            SELECT EXISTS (
                SELECT FROM information_schema.tables
                WHERE table_name = '$table'
            );
        " | tr -d ' ')

        if [ "$exists" = "t" ]; then
            echo "✅ Table $table exists"
        else
            echo "❌ Table $table missing"
        fi
    done

else
    echo "❌ Backup restore test failed!"
fi

# Cleanup
sudo -u postgres psql -c "DROP DATABASE $TEST_DB;"
rm -rf "$TEST_DIR"

echo "🧹 Cleanup completed"

Backup Verification Script

#!/usr/bin/env python3
# verify_backup.py

import os
import sys
import hashlib
import subprocess
from datetime import datetime, timedelta


class BackupVerifier:
    def __init__(self, backup_dir):
        self.backup_dir = backup_dir
        self.results = {
            'timestamp': datetime.now().isoformat(),
            'checks': [],
            'status': 'unknown'
        }

    def verify_file_integrity(self, file_path):
        """Verify file integrity using checksums"""
        checksum_file = f"{file_path}.sha256"

        if not os.path.exists(checksum_file):
            return {
                'file': file_path,
                'status': 'warning',
                'message': 'No checksum file found'
            }

        # Calculate actual checksum
        sha256_hash = hashlib.sha256()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                sha256_hash.update(chunk)
        actual_checksum = sha256_hash.hexdigest()

        # Read expected checksum
        with open(checksum_file, 'r') as f:
            expected_checksum = f.read().strip().split()[0]

        if actual_checksum == expected_checksum:
            return {
                'file': file_path,
                'status': 'pass',
                'message': 'Checksum verified'
            }
        else:
            return {
                'file': file_path,
                'status': 'fail',
                'message': f'Checksum mismatch: expected {expected_checksum}, got {actual_checksum}'
            }

    def verify_backup_freshness(self, max_age_hours=25):
        """Check if backups are recent enough"""
        current_time = datetime.now()
        threshold = current_time - timedelta(hours=max_age_hours)

        backup_files = []
        for root, dirs, files in os.walk(self.backup_dir):
            for file in files:
                if file.endswith(('.sql', '.tar.gz')):
                    file_path = os.path.join(root, file)
                    mtime = datetime.fromtimestamp(os.path.getmtime(file_path))
                    backup_files.append((file_path, mtime))

        if not backup_files:
            return {
                'status': 'fail',
                'message': 'No backup files found'
            }

        latest_backup = max(backup_files, key=lambda x: x[1])
        latest_file, latest_time = latest_backup

        if latest_time > threshold:
            return {
                'status': 'pass',
                'message': f'Latest backup is recent: {latest_file} ({latest_time})'
            }
        else:
            age_hours = (current_time - latest_time).total_seconds() / 3600
            return {
                'status': 'fail',
                'message': f'Latest backup is too old: {age_hours:.1f} hours'
            }

    def test_database_backup(self, backup_file):
        """Test database backup by attempting a restore"""
        test_db = f"test_restore_{int(datetime.now().timestamp())}"

        try:
            # Create test database
            subprocess.run([
                'sudo', '-u', 'postgres', 'psql', '-c',
                f'CREATE DATABASE {test_db};'
            ], check=True, capture_output=True)

            # Attempt restore
            result = subprocess.run([
                'pg_restore', '-h', 'localhost', '-U', 'axonos_user',
                '-d', test_db, '--verbose', '--exit-on-error',
                backup_file
            ], capture_output=True, text=True)

            if result.returncode == 0:
                status = 'pass'
                message = 'Database backup restore test successful'
            else:
                status = 'fail'
                message = f'Database restore failed: {result.stderr}'

        except subprocess.CalledProcessError as e:
            status = 'fail'
            message = f'Test database creation failed: {e}'

        finally:
            # Cleanup test database
            try:
                subprocess.run([
                    'sudo', '-u', 'postgres', 'psql', '-c',
                    f'DROP DATABASE IF EXISTS {test_db};'
                ], check=True, capture_output=True)
            except subprocess.CalledProcessError:
                pass

        return {
            'file': backup_file,
            'status': status,
            'message': message
        }

    def run_verification(self):
        """Run all verification checks"""
        print("🔍 Starting backup verification...")

        # Check backup freshness
        freshness_result = self.verify_backup_freshness()
        self.results['checks'].append({
            'check': 'backup_freshness',
            **freshness_result
        })

        # Find and verify backup files
        for root, dirs, files in os.walk(self.backup_dir):
            for file in files:
                file_path = os.path.join(root, file)

                # Verify file integrity
                if file.endswith(('.sql', '.tar.gz')):
                    integrity_result = self.verify_file_integrity(file_path)
                    self.results['checks'].append({
                        'check': 'file_integrity',
                        **integrity_result
                    })

                # Test database backups
                if file.endswith('.sql') and 'full' in file:
                    test_result = self.test_database_backup(file_path)
                    self.results['checks'].append({
                        'check': 'database_restore_test',
                        **test_result
                    })

        # Determine overall status
        failed_checks = [c for c in self.results['checks'] if c['status'] == 'fail']
        warning_checks = [c for c in self.results['checks'] if c['status'] == 'warning']

        if failed_checks:
            self.results['status'] = 'fail'
        elif warning_checks:
            self.results['status'] = 'warning'
        else:
            self.results['status'] = 'pass'

        # Print results
        self.print_results()

        return self.results

    def print_results(self):
        """Print verification results"""
        print("\n=== Backup Verification Results ===")
        print(f"Overall Status: {self.results['status'].upper()}")
        print(f"Timestamp: {self.results['timestamp']}")
        print()

        for check in self.results['checks']:
            status_icon = {
                'pass': '✅',
                'warning': '⚠️',
                'fail': '❌'
            }.get(check['status'], '❓')

            print(f"{status_icon} {check['check']}: {check['message']}")
            if 'file' in check:
                print(f"    File: {check['file']}")

        print()


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python3 verify_backup.py <backup_directory>")
        sys.exit(1)

    backup_dir = sys.argv[1]
    verifier = BackupVerifier(backup_dir)
    results = verifier.run_verification()

    # Exit with appropriate code
    if results['status'] == 'fail':
        sys.exit(1)
    elif results['status'] == 'warning':
        sys.exit(2)
    else:
        sys.exit(0)

Cloud Backup Integration

AWS S3 Backup

#!/bin/bash
# s3_backup.sh

BACKUP_DIR="/backups"
S3_BUCKET="axonos-backups"
AWS_PROFILE="axonos-backup"

# Sync to S3 with server-side encryption
aws s3 sync "$BACKUP_DIR" "s3://$S3_BUCKET" \
    --profile "$AWS_PROFILE" \
    --storage-class STANDARD_IA \
    --sse AES256 \
    --exclude "*.tmp" \
    --delete

# Create lifecycle policy for cost optimization
aws s3api put-bucket-lifecycle-configuration \
    --profile "$AWS_PROFILE" \
    --bucket "$S3_BUCKET" \
    --lifecycle-configuration file://s3-lifecycle.json
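
The referenced s3-lifecycle.json is not shown above; one possible policy, written here as a heredoc so the step stays scriptable, transitions objects to Glacier after 90 days and expires them after a year (illustrative values, adjust to match your retention policy):

# Write a minimal lifecycle policy for the backup bucket
cat > s3-lifecycle.json << 'EOF'
{
  "Rules": [
    {
      "ID": "archive-then-expire-backups",
      "Status": "Enabled",
      "Filter": { "Prefix": "" },
      "Transitions": [
        { "Days": 90, "StorageClass": "GLACIER" }
      ],
      "Expiration": { "Days": 365 }
    }
  ]
}
EOF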

Google Cloud Storage Backup

#!/bin/bash
# gcs_backup.sh

BACKUP_DIR="/backups"
GCS_BUCKET="axonos-backups"

# Sync to Google Cloud Storage
gsutil -m rsync -r -d "$BACKUP_DIR" "gs://$GCS_BUCKET"

# Set lifecycle policy
gsutil lifecycle set gcs-lifecycle.json "gs://$GCS_BUCKET"

Backup Retention Policies

Retention Schedule

# backup_retention.yml
retention:
  database:
    full_backup:
      daily: 7        # Keep 7 daily backups
      weekly: 4       # Keep 4 weekly backups
      monthly: 12     # Keep 12 monthly backups
    incremental:
      hourly: 24      # Keep 24 hourly backups

  configuration:
    weekly: 8         # Keep 8 weekly config backups
    monthly: 12       # Keep 12 monthly config backups

  nodes:
    weekly: 4         # Keep 4 weekly node backups
    monthly: 6        # Keep 6 monthly node backups

  artifacts:
    daily: 30         # Keep 30 daily artifact backups
    weekly: 12        # Keep 12 weekly artifact backups

  logs:
    daily: 90         # Keep 90 days of logs

Automated Cleanup Script

#!/usr/bin/env python3
# cleanup_backups.py

import datetime
from pathlib import Path

import yaml


class BackupCleaner:
    def __init__(self, config_file):
        with open(config_file, 'r') as f:
            self.config = yaml.safe_load(f)

    def cleanup_backups(self, backup_dir):
        """Clean up old backups based on retention policy"""
        backup_path = Path(backup_dir)

        for backup_type, retention in self.config['retention'].items():
            # NOTE: assumes backup file names contain the retention key
            # (e.g. "config", "nodes", "artifacts"); adjust the glob if your
            # naming scheme differs. rglob searches the per-type subdirectories.
            pattern = f"*{backup_type}*"
            files = [f for f in backup_path.rglob(pattern) if f.is_file()]

            if backup_type == 'database':
                self.cleanup_database_backups(files, retention)
            else:
                self.cleanup_simple_backups(files, retention)

    def cleanup_database_backups(self, files, retention):
        """Clean up database backups with complex retention"""
        now = datetime.datetime.now()

        # Separate full and incremental backups
        full_backups = [f for f in files if 'full' in f.name]
        incremental_backups = [f for f in files if 'incremental' in f.name]

        # Clean full backups
        if 'full_backup' in retention:
            self.apply_retention_policy(full_backups, retention['full_backup'], now)

        # Clean incremental backups
        if 'incremental' in retention:
            self.apply_retention_policy(incremental_backups, retention['incremental'], now)

    def cleanup_simple_backups(self, files, retention):
        """Clean up simple backups"""
        now = datetime.datetime.now()
        self.apply_retention_policy(files, retention, now)

    def apply_retention_policy(self, files, policy, now):
        """Apply retention policy to file list"""
        # Sort files by modification time, newest first
        files_with_mtime = [(f, datetime.datetime.fromtimestamp(f.stat().st_mtime)) for f in files]
        files_with_mtime.sort(key=lambda x: x[1], reverse=True)

        keep_files = set()

        # Apply each retention rule
        for period, count in policy.items():
            if period == 'daily':
                keep_files.update(self.select_daily_backups(files_with_mtime, count, now))
            elif period == 'weekly':
                keep_files.update(self.select_weekly_backups(files_with_mtime, count, now))
            elif period == 'monthly':
                keep_files.update(self.select_monthly_backups(files_with_mtime, count, now))
            elif period == 'hourly':
                keep_files.update(self.select_hourly_backups(files_with_mtime, count, now))

        # Delete files not in keep list
        for file_path, mtime in files_with_mtime:
            if file_path not in keep_files:
                print(f"🗑️ Deleting old backup: {file_path}")
                file_path.unlink()

                # Also delete checksum file if exists
                checksum_file = file_path.with_suffix(file_path.suffix + '.sha256')
                if checksum_file.exists():
                    checksum_file.unlink()

    def select_daily_backups(self, files_with_mtime, count, now):
        """Select files for daily retention"""
        selected = []
        days_seen = set()

        for file_path, mtime in files_with_mtime:
            day_key = mtime.date()
            if day_key not in days_seen and len(days_seen) < count:
                selected.append(file_path)
                days_seen.add(day_key)

        return selected

    def select_weekly_backups(self, files_with_mtime, count, now):
        """Select files for weekly retention"""
        selected = []
        weeks_seen = set()

        for file_path, mtime in files_with_mtime:
            week_key = mtime.isocalendar()[:2]  # (year, week)
            if week_key not in weeks_seen and len(weeks_seen) < count:
                selected.append(file_path)
                weeks_seen.add(week_key)

        return selected

    def select_monthly_backups(self, files_with_mtime, count, now):
        """Select files for monthly retention"""
        selected = []
        months_seen = set()

        for file_path, mtime in files_with_mtime:
            month_key = (mtime.year, mtime.month)
            if month_key not in months_seen and len(months_seen) < count:
                selected.append(file_path)
                months_seen.add(month_key)

        return selected

    def select_hourly_backups(self, files_with_mtime, count, now):
        """Select files for hourly retention"""
        # Simply keep the newest N files
        return [f[0] for f in files_with_mtime[:count]]


if __name__ == "__main__":
    import sys

    if len(sys.argv) != 3:
        print("Usage: python3 cleanup_backups.py <config_file> <backup_directory>")
        sys.exit(1)

    config_file = sys.argv[1]
    backup_dir = sys.argv[2]

    cleaner = BackupCleaner(config_file)
    cleaner.cleanup_backups(backup_dir)

    print("✅ Backup cleanup completed")
