#!/bin/bash
# Hermes gateway watchdog — alert-only, never restarts (systemd handles that).
# Run every 15 min via /etc/cron.d/hermes-watchdog.
# Treat any expansion of an unset variable as a fatal error.
set -u

# One UTC ISO-8601 timestamp and one log path, shared by every message
# written during this run.
TS="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
LOG=/var/log/hermes-watchdog.log

# Gateway unit is active: nothing to report, finish silently.
systemctl is-active --quiet hermes-gateway.service && exit 0

# Unit is not active: record the outage before attempting to alert.
printf '%s DOWN: hermes-gateway.service is not active\n' "$TS" >> "$LOG"

# Load Telegram credentials from the Hermes env file. `set -a` marks every
# variable the file assigns for export; `set +a` switches that off again.
# The file may be missing or unreadable: stderr is silenced on purpose and
# the missing-credentials branch below records the problem in the log.
set -a
# shellcheck disable=SC1091
source /root/.hermes/.env 2>/dev/null
set +a

TOKEN="${TELEGRAM_BOT_TOKEN:-}"
# TELEGRAM_ALLOWED_USERS is a comma-separated list; first ID is the alert target.
# Default to empty BEFORE splitting: the script runs under `set -u`, so
# expanding the variable while it is unset would abort here and the ERROR
# line in the else-branch would never be written.
ALLOWED_USERS="${TELEGRAM_ALLOWED_USERS:-}"
CHAT_ID="${ALLOWED_USERS%%,*}"

if [[ -n "$TOKEN" && -n "$CHAT_ID" ]]; then
  # Capture the API response (stdout and stderr) so it can be logged on its
  # own line and inspected. --max-time bounds the call so a stalled
  # connection cannot hang the cron job.
  # NOTE: the bot token is part of the URL and therefore of curl's argv.
  rc=0
  response=$(curl -sS --max-time 20 -X POST "https://api.telegram.org/bot${TOKEN}/sendMessage" \
    --data-urlencode "chat_id=${CHAT_ID}" \
    --data-urlencode "text=ALERT: hermes-gateway DOWN at ${TS} — systemd should restart within 30s. If repeated, check: journalctl -u hermes-gateway -n 50" \
    2>&1) || rc=$?
  if [[ -n "$response" ]]; then
    printf '%s\n' "$response" >> "$LOG"
  fi

  # curl exits 0 even on an HTTP error status, so also require the
  # `"ok":true` marker that the Bot API puts in successful responses.
  if (( rc == 0 )) && [[ "$response" == *'"ok":true'* ]]; then
    echo "${TS} alert sent to ${CHAT_ID}" >> "$LOG"
  else
    echo "${TS} ERROR: alert to ${CHAT_ID} was not delivered (curl exit ${rc})" >> "$LOG"
  fi
else
  echo "${TS} ERROR: TELEGRAM_BOT_TOKEN or chat_id missing — could not send alert" >> "$LOG"
fi
