#!/bin/bash
# health-check-v2.sh - System health check with AUTO-ROLLBACK
# Runs every 5 minutes via cron
# If Growth Engine fails 3 consecutive checks (15 min), auto-rollback triggers
#
# Cron: */5 * * * * /root/clawd/scripts/health-check-v2.sh

# Abort on the first unhandled command failure. Note that bash ignores -e
# inside `if` conditions and `||` / `&&` lists, including functions called there.
set -e

# Directory holding this script; config.env and the state files live beside it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Settings used below (NOTION_API_KEY, SYSTEM_STATE_DB, DEPLOY_LOG_DB,
# GROWTH_ENGINE_API) are presumably defined in this file - TODO confirm.
source "$SCRIPT_DIR/config.env"

# UTC timestamp shared by every Notion "Last Check" update in this run.
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%S.000Z")
FAIL_COUNT_FILE="$SCRIPT_DIR/.health_fail_count"
LAST_GOOD_DEPLOY_FILE="$SCRIPT_DIR/.last_good_deploy"
ALERTS=""

# Initialize fail count file
touch "$FAIL_COUNT_FILE"
CURRENT_FAIL_COUNT=$({ tr -d '[:space:]' < "$FAIL_COUNT_FILE"; } 2>/dev/null || true)
# The counter later feeds $(( )) and [ -ge ]. Anything that is not a short
# run of digits (empty file, partial write, manual edit) is treated as 0
# instead of crashing the arithmetic.
if [[ ! "$CURRENT_FAIL_COUNT" =~ ^[0-9]{1,9}$ ]]; then
    CURRENT_FAIL_COUNT=0
fi
# Force base 10 so a zero-padded value such as "08" is not parsed as octal.
CURRENT_FAIL_COUNT=$((10#$CURRENT_FAIL_COUNT))

# Function to check a system
#######################################
# Probe a URL, print a one-line status, and mirror that status to the
# matching row of the Notion System State database.
# Globals:
#   SYSTEM_STATE_DB, NOTION_API_KEY, TIMESTAMP (read)
#   HTTP_CODE, RESPONSE_TIME, STATUS, NOTES, FAILED (written)
# Arguments:
#   $1 - system name; also used as the "System" title filter in Notion
#   $2 - URL to request
#   $3 - connect and total timeout in seconds (default 10)
#   $4 - "true" when the system is critical (default "false")
# Outputs:
#   Progress and result lines on stdout.
# Returns:
#   1 when the system is critical and the check failed, 0 otherwise.
#######################################
check_system() {
    local SYSTEM_NAME="$1"
    local CHECK_URL="$2"
    local TIMEOUT="${3:-10}"
    local IS_CRITICAL="${4:-false}"
    local start_ns end_ns curl_rc=0 search_result page_id

    echo "Checking $SYSTEM_NAME..."

    # Make request and measure time (date +%s%N: seconds followed by nanoseconds)
    start_ns=$(date +%s%N)
    HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --connect-timeout "$TIMEOUT" --max-time "$TIMEOUT" "$CHECK_URL" 2>/dev/null) || curl_rc=$?
    end_ns=$(date +%s%N)

    # curl's -w output is still printed when the transfer fails, so a fallback
    # `echo 000` would be appended to it ("000000"). Normalise here instead:
    # any curl error, or anything that is not a 3-digit code, counts as "000".
    if [ "$curl_rc" -ne 0 ] || [[ ! "$HTTP_CODE" =~ ^[0-9]{3}$ ]]; then
        HTTP_CODE="000"
    fi

    # Elapsed seconds with two decimals; empty if bc is unavailable.
    RESPONSE_TIME=$(echo "scale=2; ($end_ns - $start_ns) / 1000000000" | bc) || RESPONSE_TIME=""

    # Determine status
    if [ "$HTTP_CODE" = "000" ]; then
        STATUS="🔴 Down"
        NOTES="Connection failed or timeout"
        FAILED=true
    elif [ "$HTTP_CODE" -ge 200 ] && [ "$HTTP_CODE" -lt 400 ]; then
        # 2xx/3xx is healthy; more than 5 seconds is reported as degraded
        # but does not count as a failure.
        if (( $(echo "$RESPONSE_TIME > 5" | bc -l) )); then
            STATUS="🟡 Degraded"
            NOTES="Slow response: ${RESPONSE_TIME}s"
            FAILED=false
        else
            STATUS="🟢 Online"
            NOTES="Response: ${RESPONSE_TIME}s"
            FAILED=false
        fi
    else
        STATUS="🔴 Down"
        NOTES="HTTP $HTTP_CODE"
        FAILED=true
    fi

    echo "  $SYSTEM_NAME: $STATUS ($NOTES)"

    # Update Notion System State. This is best-effort reporting: a Notion
    # outage must not abort the health check itself under `set -e`.
    search_result=$(curl -s -X POST "https://api.notion.com/v1/databases/${SYSTEM_STATE_DB}/query" \
        -H "Authorization: Bearer $NOTION_API_KEY" \
        -H "Content-Type: application/json" \
        -H "Notion-Version: 2022-06-28" \
        -d "{
            \"filter\": {
                \"property\": \"System\",
                \"title\": { \"equals\": \"$SYSTEM_NAME\" }
            }
        }" 2>/dev/null) || search_result=""

    # First matching page id, or empty when there is no match / no valid JSON.
    page_id=$(printf '%s' "$search_result" | python3 -c "
import sys, json
rows = json.load(sys.stdin).get('results') or [{}]
print(rows[0].get('id', ''))
" 2>/dev/null) || page_id=""

    if [ -n "$page_id" ]; then
        curl -s -X PATCH "https://api.notion.com/v1/pages/$page_id" \
            -H "Authorization: Bearer $NOTION_API_KEY" \
            -H "Content-Type: application/json" \
            -H "Notion-Version: 2022-06-28" \
            -d "{
                \"properties\": {
                    \"Status\": { \"select\": { \"name\": \"$STATUS\" } },
                    \"Last Check\": { \"date\": { \"start\": \"$TIMESTAMP\" } },
                    \"Notes\": { \"rich_text\": [{ \"text\": { \"content\": \"$NOTES\" } }] }
                }
            }" > /dev/null 2>&1 || echo "  ⚠️  Could not update Notion status for $SYSTEM_NAME" >&2
    fi

    # Return failure status for critical systems
    if [ "$IS_CRITICAL" = "true" ] && [ "$FAILED" = "true" ]; then
        return 1
    fi
    return 0
}

# Escape a string for embedding inside a double-quoted JSON string literal.
_json_escape() {
    local s=$1
    s=${s//\\/\\\\}
    s=${s//\"/\\\"}
    s=${s//$'\n'/\\n}
    s=${s//$'\r'/\\r}
    s=${s//$'\t'/\\t}
    printf '%s' "$s"
}

# Function to trigger auto-rollback
#######################################
# Look up the previous successful deploy in the Notion Deploy Log, send
# Telegram alerts, add an "AUTO-ROLLBACK" entry to the Deploy Log and reset
# the failure counter.
#
# NOTE(review): this function does not execute any rollback itself. The
# `git revert` line is only echoed and ROLLBACK_CMD is read but never run,
# although the alert texts say "Rolled back". Confirm whether the actual
# rollback happens elsewhere.
#
# Globals:
#   DEPLOY_LOG_DB, NOTION_API_KEY, SCRIPT_DIR, FAIL_COUNT_FILE,
#   GROWTH_ENGINE_API (read)
#   ROLLBACK_COMMIT, ROLLBACK_NAME, ROLLBACK_CMD (written)
# Outputs:
#   Progress messages on stdout, warnings on stderr.
# Returns:
#   1 when no previous successful deploy (with a commit) is found, else 0.
#######################################
auto_rollback() {
    local last_deploy rollback_info name_json commit_json

    echo ""
    echo "🚨 AUTO-ROLLBACK TRIGGERED"
    echo "=========================="

    # Get last good deploy from Deploy Log. A failed query leaves the result
    # empty so the "cannot rollback" alert below still goes out.
    last_deploy=$(curl -s -X POST "https://api.notion.com/v1/databases/${DEPLOY_LOG_DB}/query" \
        -H "Authorization: Bearer $NOTION_API_KEY" \
        -H "Content-Type: application/json" \
        -H "Notion-Version: 2022-06-28" \
        -d '{
            "filter": {
                "property": "Status",
                "select": { "equals": "✅ Success" }
            },
            "sorts": [{"property": "Date", "direction": "descending"}],
            "page_size": 2
        }' 2>/dev/null) || last_deploy=""

    # Get the SECOND last successful deploy (the one before current).
    # Output is three lines: commit, deploy name, rollback command.
    rollback_info=$(printf '%s' "$last_deploy" | python3 -c "
import sys, json

def first_text(prop, kind):
    # A blank title/rich_text property arrives as an empty list, so
    # indexing [0] directly would raise and discard every field.
    items = prop.get(kind) or [{}]
    content = (items[0].get('text') or {}).get('content') or ''
    return ' '.join(content.splitlines())

results = json.load(sys.stdin).get('results') or []
if len(results) >= 2:
    props = results[1].get('properties') or {}  # Second last (before current)
    print(first_text(props.get('Commit') or {}, 'rich_text'))
    print(first_text(props.get('Deploy') or {}, 'title'))
    print(first_text(props.get('Rollback Command') or {}, 'rich_text'))
" 2>/dev/null) || rollback_info=""

    # One field per line, so values containing '|' are not split.
    ROLLBACK_COMMIT=$(printf '%s\n' "$rollback_info" | sed -n '1p')
    ROLLBACK_NAME=$(printf '%s\n' "$rollback_info" | sed -n '2p')
    ROLLBACK_CMD=$(printf '%s\n' "$rollback_info" | sed -n '3p')

    if [ -z "$ROLLBACK_COMMIT" ]; then
        echo "❌ Cannot rollback: No previous successful deploy found"
        "$SCRIPT_DIR/telegram-alert.sh" "CRITICAL" "AUTO-ROLLBACK FAILED: No previous deploy to rollback to. Manual intervention required!" \
            || echo "⚠️  Telegram alert could not be sent" >&2
        return 1
    fi

    echo "Rolling back to: $ROLLBACK_NAME ($ROLLBACK_COMMIT)"

    # Send alert BEFORE rollback. A failed alert must not stop the remaining
    # steps under `set -e`.
    "$SCRIPT_DIR/telegram-alert.sh" "CRITICAL" "🚨 AUTO-ROLLBACK TRIGGERED

Growth Engine failed 3 consecutive health checks (15 min).

Rolling back to: $ROLLBACK_NAME
Commit: $ROLLBACK_COMMIT

Will notify when complete." || echo "⚠️  Telegram alert could not be sent" >&2

    # Execute rollback (git revert)
    # Note: This assumes the Growth Engine repo is at a known location
    # For Railway, we push a revert commit
    # NOTE(review): nothing is executed here; the command is only printed.
    echo "Executing: git revert --no-commit $ROLLBACK_COMMIT"

    # Log rollback attempt in Deploy Log. Values that came from Notion are
    # JSON-escaped so quotes or backslashes cannot corrupt the payload.
    name_json=$(_json_escape "$ROLLBACK_NAME")
    commit_json=$(_json_escape "$ROLLBACK_COMMIT")
    curl -s -X POST "https://api.notion.com/v1/pages" \
        -H "Authorization: Bearer $NOTION_API_KEY" \
        -H "Content-Type: application/json" \
        -H "Notion-Version: 2022-06-28" \
        -d "{
            \"parent\": { \"database_id\": \"$DEPLOY_LOG_DB\" },
            \"properties\": {
                \"Deploy\": { \"title\": [{ \"text\": { \"content\": \"AUTO-ROLLBACK: $name_json\" } }] },
                \"Status\": { \"select\": { \"name\": \"⚠️ Rolled Back\" } },
                \"Type\": { \"select\": { \"name\": \"Small (no snapshot)\" } },
                \"Date\": { \"date\": { \"start\": \"$(date +%Y-%m-%d)\" } },
                \"Commit\": { \"rich_text\": [{ \"text\": { \"content\": \"Rollback to $commit_json\" } }] },
                \"Notes\": { \"rich_text\": [{ \"text\": { \"content\": \"AUTO-ROLLBACK triggered by health-check.sh after 3 consecutive failures (15 min).\\n\\nTarget: $name_json\\nCommit: $commit_json\\nTrigger time: $(date)\" } }] }
            }
        }" > /dev/null 2>&1 || echo "⚠️  Could not write rollback entry to Deploy Log" >&2

    # Reset fail counter
    echo "0" > "$FAIL_COUNT_FILE"

    echo "✅ Rollback logged. Manual deploy may be needed."
    "$SCRIPT_DIR/telegram-alert.sh" "CRITICAL" "AUTO-ROLLBACK COMPLETE

Rolled back to: $ROLLBACK_NAME
Deploy Log updated with rollback entry.

⚠️ Manual verification recommended.
Check: ${GROWTH_ENGINE_API}/api/health" || echo "⚠️  Telegram alert could not be sent" >&2
}

# ===== MAIN HEALTH CHECKS =====

# Check Growth Engine (CRITICAL - triggers auto-rollback)
# check_system returns non-zero only for a failed critical system. Each
# failure bumps the persisted counter; the third consecutive one hands over
# to auto_rollback, and any success resets the counter.
if ! check_system "Growth Engine" "${GROWTH_ENGINE_API}/api/health" 10 true; then
    # Increment fail count
    CURRENT_FAIL_COUNT=$((CURRENT_FAIL_COUNT + 1))
    echo "$CURRENT_FAIL_COUNT" > "$FAIL_COUNT_FILE"
    echo ""
    echo "⚠️  Growth Engine FAILED - Count: $CURRENT_FAIL_COUNT/3"

    if [ "$CURRENT_FAIL_COUNT" -ge 3 ]; then
        # Under `set -e` a bare non-zero return here would end the script and
        # skip every remaining check, so the failure is reported instead.
        auto_rollback || echo "⚠️  auto_rollback returned non-zero; continuing with remaining checks" >&2
    else
        # Alert but don't rollback yet. A failed alert must not abort the run.
        "$SCRIPT_DIR/telegram-alert.sh" "URGENT" "Growth Engine health check FAILED ($CURRENT_FAIL_COUNT/3)

Will auto-rollback after 3 consecutive failures.
Next check in 5 minutes." || echo "⚠️  Telegram alert could not be sent" >&2
    fi
else
    # Reset fail count on success
    if [ "${CURRENT_FAIL_COUNT:-0}" -gt 0 ]; then
        echo "✅ Growth Engine recovered after $CURRENT_FAIL_COUNT failures"
    fi
    echo "0" > "$FAIL_COUNT_FILE"

    # Save as last known good state (local time of this successful check)
    date +%Y-%m-%d_%H:%M > "$LAST_GOOD_DEPLOY_FILE"
fi

# Check other systems (non-critical)
# Guarded with `|| true` so an internal failure of this informational check
# (e.g. Notion unreachable) cannot terminate the script under `set -e`.
check_system "Main Platform" "https://app.rateright.com.au" 10 false || true

# VPS is up if this script is running
echo "Checking VPS..."
echo "  VPS: 🟢 Online (script running)"

# Update VPS in Notion (best-effort: failures are reported, not fatal)
SEARCH_RESULT=$(curl -s -X POST "https://api.notion.com/v1/databases/${SYSTEM_STATE_DB}/query" \
    -H "Authorization: Bearer $NOTION_API_KEY" \
    -H "Content-Type: application/json" \
    -H "Notion-Version: 2022-06-28" \
    -d '{"filter": {"property": "System", "title": {"equals": "VPS"}}}' 2>/dev/null) || SEARCH_RESULT=""

# First matching page id; empty when there is no match or no valid JSON.
PAGE_ID=$(printf '%s' "$SEARCH_RESULT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('results',[{}])[0].get('id',''))" 2>/dev/null || echo "")

if [ -n "$PAGE_ID" ]; then
    curl -s -X PATCH "https://api.notion.com/v1/pages/$PAGE_ID" \
        -H "Authorization: Bearer $NOTION_API_KEY" \
        -H "Content-Type: application/json" \
        -H "Notion-Version: 2022-06-28" \
        -d "{
            \"properties\": {
                \"Status\": { \"select\": { \"name\": \"🟢 Online\" } },
                \"Last Check\": { \"date\": { \"start\": \"$TIMESTAMP\" } }
            }
        }" > /dev/null 2>&1 || echo "  ⚠️  Could not update Notion status for VPS" >&2
fi

# Update Clawdbot status
"$SCRIPT_DIR/agent-status.sh" "Clawdbot" "Online" "Health check completed" 2>/dev/null || true

echo ""
echo "Health check complete at $(date)"
echo "Fail count: $(cat "$FAIL_COUNT_FILE")"
