#!/bin/bash

# PostgreSQL Backup and Disaster Recovery Configuration for RateRight
# Fly.io Production Database Hardening
# App: rateright-db

set -e

# Color codes for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Configuration
DB_APP="rateright-db"
MAIN_APP="rateright-au"
BACKUP_RETENTION_DAYS=7
BACKUP_SCHEDULE="0 2 * * *"  # Daily at 2 AM Sydney time

echo -e "${BLUE}═══════════════════════════════════════════════════════════════${NC}"
echo -e "${BLUE}    PostgreSQL Backup & Disaster Recovery Configuration${NC}"
echo -e "${BLUE}═══════════════════════════════════════════════════════════════${NC}"

# Function to check command success
check_success() {
    if [ $? -eq 0 ]; then
        echo -e "${GREEN}✅ $1${NC}"
    else
        echo -e "${RED}❌ $1 failed${NC}"
        exit 1
    fi
}

# ============================================
# PART 1: AUTOMATED BACKUP CONFIGURATION
# ============================================

echo -e "\n${YELLOW}[1/6] Configuring Automated Backups${NC}"
echo "----------------------------------------"

# Create backup configuration script
cat > backup_config.sql << 'EOF'
-- Backup configuration settings
ALTER SYSTEM SET archive_mode = 'on';
ALTER SYSTEM SET archive_command = 'test ! -f /mnt/backups/%f && cp %p /mnt/backups/%f';
ALTER SYSTEM SET archive_timeout = '1h';
ALTER SYSTEM SET wal_level = 'replica';
ALTER SYSTEM SET max_wal_senders = 3;
ALTER SYSTEM SET wal_keep_size = '1GB';
ALTER SYSTEM SET max_replication_slots = 3;

-- Enable backup compression
ALTER SYSTEM SET wal_compression = 'on';

-- Checkpoint settings for consistent backups
ALTER SYSTEM SET checkpoint_warning = '30s';
ALTER SYSTEM SET checkpoint_completion_target = 0.9;

SELECT pg_reload_conf();
EOF

echo "Applying backup configuration..."
fly postgres connect -a $DB_APP < backup_config.sql
check_success "Backup configuration applied"

# ============================================
# PART 2: CREATE BACKUP SCHEDULE
# ============================================

echo -e "\n${YELLOW}[2/6] Setting Up Backup Schedule${NC}"
echo "----------------------------------------"

# Create backup script
cat > automated_backup.sh << 'EOF'
#!/bin/bash

# RateRight PostgreSQL Backup Script
# Runs daily at 2 AM Sydney time

DB_APP="rateright-db"
BACKUP_NAME="rateright-backup-$(date +%Y%m%d-%H%M%S)"
RETENTION_DAYS=7

echo "Starting backup: $BACKUP_NAME"

# Create backup
fly postgres backup create --app $DB_APP --name "$BACKUP_NAME"

if [ $? -eq 0 ]; then
    echo "✅ Backup created successfully: $BACKUP_NAME"
    
    # Log backup to audit
    fly postgres connect -a $DB_APP -c "
        INSERT INTO audit_logs (action, details, created_at)
        VALUES ('BACKUP_CREATED', '$BACKUP_NAME', NOW());
    "
else
    echo "❌ Backup failed: $BACKUP_NAME"
    # Send alert (configure your alerting mechanism)
    exit 1
fi

# Clean up old backups
echo "Cleaning up backups older than $RETENTION_DAYS days..."
fly postgres backup list --app $DB_APP --json | \
    jq -r ".[] | select(.created_at < (now - ${RETENTION_DAYS}*86400 | strftime(\"%Y-%m-%dT%H:%M:%S\"))) | .id" | \
    while read backup_id; do
        echo "Deleting old backup: $backup_id"
        fly postgres backup delete --app $DB_APP --backup-id "$backup_id"
    done

echo "Backup process completed"
EOF

chmod +x automated_backup.sh
check_success "Backup script created"

# ============================================
# PART 3: POINT-IN-TIME RECOVERY SETUP
# ============================================

echo -e "\n${YELLOW}[3/6] Configuring Point-in-Time Recovery (PITR)${NC}"
echo "----------------------------------------------"

cat > pitr_setup.sql << 'EOF'
-- Enable PITR capabilities
ALTER SYSTEM SET wal_level = 'replica';
ALTER SYSTEM SET archive_mode = 'on';
ALTER SYSTEM SET archive_command = 'test ! -f /mnt/wal_archive/%f && cp %p /mnt/wal_archive/%f';
ALTER SYSTEM SET restore_command = 'cp /mnt/wal_archive/%f %p';

-- Create recovery configuration template
-- This will be used during recovery operations
SELECT pg_reload_conf();

-- Verify WAL archiving is working
SELECT 
    archived_count,
    last_archived_wal,
    last_archived_time,
    failed_count
FROM pg_stat_archiver;
EOF

fly postgres connect -a $DB_APP < pitr_setup.sql
check_success "PITR configuration applied"

# ============================================
# PART 4: DISASTER RECOVERY PROCEDURES
# ============================================

echo -e "\n${YELLOW}[4/6] Creating Disaster Recovery Procedures${NC}"
echo "--------------------------------------------"

cat > disaster_recovery_procedures.md << 'EOF'
# RateRight Disaster Recovery Procedures

## Recovery Time Objectives
- **RTO (Recovery Time Objective)**: 2 hours
- **RPO (Recovery Point Objective)**: 1 hour

## Scenario 1: Complete Database Failure

### Immediate Actions (0-15 minutes)
1. Verify the failure:
   ```bash
   fly status --app rateright-db
   fly postgres connect -a rateright-db -c "SELECT 1;"
   ```

2. Check logs for root cause:
   ```bash
   fly logs --app rateright-db --region syd
   ```

3. Notify stakeholders via established channels

### Recovery Steps (15-60 minutes)

#### Option A: Restore from Latest Backup
```bash
# List available backups
fly postgres backup list --app rateright-db

# Restore from specific backup
fly postgres backup restore --app rateright-db --backup-id <backup-id>

# Verify restoration
fly postgres connect -a rateright-db -c "
    SELECT COUNT(*) FROM users;
    SELECT MAX(created_at) FROM audit_logs;
"
```

#### Option B: Point-in-Time Recovery
```bash
# Restore to specific point in time
fly postgres backup restore --app rateright-db \
    --backup-id <backup-id> \
    --target-time "2025-01-20 10:00:00"
```

### Post-Recovery Validation (60-120 minutes)
1. Data integrity checks
2. Application connectivity tests
3. Performance validation
4. User acceptance testing

## Scenario 2: Data Corruption

### Detection
```sql
-- Check for corruption
SELECT schemaname, tablename 
FROM pg_tables 
WHERE schemaname = 'public'
ORDER BY tablename;

-- Verify specific table
SELECT COUNT(*), 
       COUNT(DISTINCT id),
       MAX(created_at),
       MIN(created_at)
FROM <table_name>;
```

### Recovery
```bash
# Restore specific tables from backup
pg_restore --app rateright-db \
    --table=users \
    --table=contracts \
    --clean \
    backup_file.dump
```

## Scenario 3: Regional Outage

### Failover Process
```bash
# Check regional status
fly regions list --app rateright-db

# Failover to secondary region
fly regions add mel --app rateright-db
fly scale count 0 --region syd --app rateright-db
fly scale count 1 --region mel --app rateright-db
```

## Emergency Contacts
- Database Admin: [Contact Info]
- Platform Team: [Contact Info]
- Fly.io Support: support@fly.io

## Recovery Verification Checklist
- [ ] Database accessible
- [ ] All tables present
- [ ] Row counts match expectations
- [ ] Application can connect
- [ ] Authentication working
- [ ] Critical queries performing well
- [ ] Backups resuming normally
- [ ] Monitoring alerts cleared
EOF

check_success "Disaster recovery procedures documented"

# ============================================
# PART 5: BACKUP TESTING AUTOMATION
# ============================================

echo -e "\n${YELLOW}[5/6] Creating Backup Test Automation${NC}"
echo "----------------------------------------"

cat > test_backup_restore.sh << 'EOF'
#!/bin/bash

# Automated Backup Testing Script
# Run weekly to verify backup integrity

DB_APP="rateright-db"
TEST_DB="rateright-test-restore"
TIMESTAMP=$(date +%Y%m%d-%H%M%S)

echo "Starting automated backup test at $TIMESTAMP"

# Get latest backup
LATEST_BACKUP=$(fly postgres backup list --app $DB_APP --json | jq -r '.[0].id')

if [ -z "$LATEST_BACKUP" ]; then
    echo "❌ No backups found!"
    exit 1
fi

echo "Testing backup: $LATEST_BACKUP"

# Create test database instance
echo "Creating test restore instance..."
fly postgres create --name $TEST_DB --region syd

# Restore backup to test instance
echo "Restoring backup to test instance..."
fly postgres backup restore \
    --app $TEST_DB \
    --backup-id "$LATEST_BACKUP"

# Run validation queries
echo "Running validation queries..."
fly postgres connect -a $TEST_DB << SQL
    -- Check critical tables exist
    SELECT COUNT(*) as table_count 
    FROM information_schema.tables 
    WHERE table_schema = 'public';
    
    -- Verify data integrity
    SELECT 
        'users' as table_name, COUNT(*) as row_count FROM users
    UNION ALL
    SELECT 
        'jobs' as table_name, COUNT(*) as row_count FROM jobs
    UNION ALL
    SELECT 
        'contracts' as table_name, COUNT(*) as row_count FROM contracts;
    
    -- Check for orphaned records
    SELECT COUNT(*) as orphaned_contracts
    FROM contracts c
    LEFT JOIN users u ON c.contractor_id = u.id
    WHERE u.id IS NULL;
SQL

# Clean up test instance
echo "Cleaning up test instance..."
fly apps destroy $TEST_DB --yes

echo "✅ Backup test completed successfully"

# Log test result
fly postgres connect -a $DB_APP -c "
    INSERT INTO audit_logs (action, details, created_at)
    VALUES ('BACKUP_TEST_SUCCESS', 'Tested backup: $LATEST_BACKUP', NOW());
"
EOF

chmod +x test_backup_restore.sh
check_success "Backup test automation created"

# ============================================
# PART 6: MONITORING & ALERTING
# ============================================

echo -e "\n${YELLOW}[6/6] Setting Up Backup Monitoring${NC}"
echo "------------------------------------"

cat > monitor_backups.sql << 'EOF'
-- Create backup monitoring view
CREATE OR REPLACE VIEW backup_health AS
WITH latest_backup AS (
    SELECT 
        MAX(created_at) as last_backup_time
    FROM audit_logs
    WHERE action = 'BACKUP_CREATED'
),
backup_stats AS (
    SELECT 
        COUNT(*) FILTER (WHERE created_at > NOW() - INTERVAL '7 days') as recent_backups,
        COUNT(*) FILTER (WHERE action = 'BACKUP_TEST_SUCCESS' AND created_at > NOW() - INTERVAL '7 days') as tested_backups
    FROM audit_logs
    WHERE action IN ('BACKUP_CREATED', 'BACKUP_TEST_SUCCESS')
)
SELECT 
    lb.last_backup_time,
    EXTRACT(HOURS FROM (NOW() - lb.last_backup_time)) as hours_since_backup,
    bs.recent_backups,
    bs.tested_backups,
    CASE 
        WHEN EXTRACT(HOURS FROM (NOW() - lb.last_backup_time)) > 25 THEN 'CRITICAL'
        WHEN EXTRACT(HOURS FROM (NOW() - lb.last_backup_time)) > 24 THEN 'WARNING'
        ELSE 'OK'
    END as backup_status,
    CASE
        WHEN bs.tested_backups = 0 THEN 'NO_TESTS'
        WHEN bs.tested_backups < bs.recent_backups / 7 THEN 'INSUFFICIENT_TESTING'
        ELSE 'OK'
    END as test_status
FROM latest_backup lb, backup_stats bs;

-- Create alert for missing backups
CREATE OR REPLACE FUNCTION check_backup_health()
RETURNS TABLE(alert_level TEXT, message TEXT) AS $$
DECLARE
    v_backup_status TEXT;
    v_hours_since INTEGER;
BEGIN
    SELECT backup_status, hours_since_backup 
    INTO v_backup_status, v_hours_since
    FROM backup_health;
    
    IF v_backup_status = 'CRITICAL' THEN
        RETURN QUERY SELECT 'CRITICAL', 
            FORMAT('No backup for %s hours! Immediate action required.', v_hours_since);
    ELSIF v_backup_status = 'WARNING' THEN
        RETURN QUERY SELECT 'WARNING',
            FORMAT('Backup may be delayed. Last backup %s hours ago.', v_hours_since);
    END IF;
    
    RETURN;
END;
$$ LANGUAGE plpgsql;
EOF

fly postgres connect -a $DB_APP < monitor_backups.sql
check_success "Backup monitoring configured"

# ============================================
# SUMMARY
# ============================================

echo -e "\n${GREEN}═══════════════════════════════════════════════════════════════${NC}"
echo -e "${GREEN}    Backup & Disaster Recovery Configuration Complete${NC}"
echo -e "${GREEN}═══════════════════════════════════════════════════════════════${NC}"

echo -e "\n${BLUE}Next Steps:${NC}"
echo "1. Schedule automated_backup.sh with cron:"
echo "   ${YELLOW}0 2 * * * /path/to/automated_backup.sh${NC}"
echo ""
echo "2. Schedule weekly backup tests:"
echo "   ${YELLOW}0 3 * * 0 /path/to/test_backup_restore.sh${NC}"
echo ""
echo "3. Configure alerting for backup failures"
echo "4. Review disaster_recovery_procedures.md with your team"
echo "5. Conduct quarterly DR drills"

echo -e "\n${GREEN}✅ Backup and disaster recovery configuration complete!${NC}"
