#!/bin/bash
# Daily Hermes/OpsMan health snapshot. Run via /etc/cron.d/hermes-health.
# Output: /home/ccuser/opsman-work/health-log/YYYY-MM-DD.md
set -u

LOCKFILE="/run/hermes-health-snapshot.lock"
PIDFILE="/run/hermes-health-snapshot.pid"

# flock guard — exit if another instance is already running
exec 200>"$LOCKFILE"
if ! flock -n 200; then
  echo "$(date -u '+%Y-%m-%dT%H:%M:%SZ') already running, exiting" >&2
  exit 0
fi

# Write our pid
echo $$ > "$PIDFILE"

DATE=$(date -u +%Y-%m-%d)
TS=$(date -u +%Y-%m-%dT%H:%M:%SZ)
OUT="/home/ccuser/opsman-work/health-log/${DATE}.md"

# Source .env safely — only extract known vars, don't blanket-export everything
if [ -f /root/.hermes/.env ]; then
  while IFS='=' read -r key val; do
    case "$key" in
      MINIMAX_API_KEY|OPENROUTER_API_KEY|DEEPSEEK_API_KEY|OPENAI_API_KEY)
        export "$key"="$val"
        ;;
    esac
  done < <(grep -vE '^[[:space:]]*#' /root/.hermes/.env)
fi

{
  echo "# Health snapshot ${TS}"
  echo
  echo "## Services"
  for svc in hermes-gateway clawdbot-opsman rivet-voice; do
    state=$(systemctl is-active ${svc}.service 2>&1)
    pid=$(systemctl show -p MainPID --value ${svc}.service)
    nrestarts=$(systemctl show -p NRestarts --value ${svc}.service)
    uptime=""
    if [ "$pid" -gt 0 ]; then
      uptime=$(ps -o etime= -p "$pid" 2>/dev/null | tr -d " ")
    fi
    echo "- ${svc}: state=${state} pid=${pid} uptime=${uptime} restarts=${nrestarts}"
  done

  echo
  echo "## MiniMax health probe"
  if [ -n "${MINIMAX_API_KEY:-}" ]; then
    code=$(curl -sS -o /dev/null -w "%{http_code}" \
      --max-time 10 --fail \
      -H "Authorization: Bearer ${MINIMAX_API_KEY}" \
      "https://api.minimax.io/v1/text/chatcompletion_v2" \
      -X POST -H "Content-Type: application/json" \
      -d "{\"model\":\"MiniMax-M2.7\",\"messages\":[{\"role\":\"user\",\"content\":\"ok\"}],\"max_tokens\":1}" \
      2>&1 || echo "curl_err")
    echo "- MiniMax HTTP code: ${code}"
  else
    echo "- MiniMax: no key in .env"
  fi

  echo
  echo "## System cron jobs"
  echo "### root crontab"
  crontab -l 2>/dev/null | grep -vE "^#|^$" | sed "s/^/    /" || echo "    (none)"
  echo "### /etc/cron.d/"
  for f in /etc/cron.d/*; do
    [ -f "$f" ] || continue
    [ "$(basename "$f")" = ".placeholder" ] && continue
    echo "- ${f}: "
    grep -vE "^#|^$|^SHELL|^PATH" "$f" | head -3 | sed "s/^/    /"
  done

  echo
  echo "## PM2 (root)"
  if timeout 10 pm2 jlist 2>/dev/null | python3 -c "
import sys, json
data = json.load(sys.stdin)
for p in data:
    s = p.get('pm2_env',{}).get('status','')
    if s in ('online','stopped','errored'):
        print(f\"    {p['name']}: {s}\")
" 2>/dev/null; then
    :
  else
    echo "    (pm2 unavailable or timed out)"
  fi

  echo
  echo "## Disk"
  df -h / | tail -1 | sed "s/^/    /"

  echo
  echo "## Memory"
  free -h | head -3 | sed "s/^/    /"

  echo
  echo "## Last successful Telegram message (Hermes)"
  ts_ok=$(journalctl -u hermes-gateway.service --since "24 hours ago" --no-pager 2>/dev/null | grep -iE "telegram.*sent|reply sent|message sent" | tail -1 | awk "{print \$1, \$2, \$3}")
  echo "- ${ts_ok:-no recent send events}"

  echo
  echo "## Watchdog alert log (last 5)"
  if [ -f /var/log/hermes-watchdog.log ]; then
    tail -5 /var/log/hermes-watchdog.log | sed "s/^/    /"
  else
    echo "    (log not yet created)"
  fi
} > "${OUT}.tmp"

mv "${OUT}.tmp" "${OUT}"
chown ccuser:ccuser "$OUT"
rm -f "$PIDFILE"
