test-repo/scripts/security-monitors/disk-monitor.sh

179 lines
5.0 KiB
Bash
Executable File

#!/bin/zsh
#
# Disk Space Monitor
# Warns when disk usage reaches 80% (WARNING) and escalates at 90% (CRITICAL)
# Appends alerts to $STATE_DIR/alerts.queue; delivery (e.g. via Telegram) is handled outside this script
#
# Locations of the monitor's persistent state and its log file.
STATE_DIR="/Users/mattbruce/.openclaw/workspace/scripts/security-monitors/state"
LOG_FILE="/Users/mattbruce/.openclaw/workspace/scripts/security-monitors/logs/disk-monitor.log"
# Holds the most recent alert as "level|usage|timestamp".
ALERT_STATE_FILE="$STATE_DIR/disk-alert-state"
# Thresholds, in percent of capacity used
WARN_THRESHOLD=80
CRITICAL_THRESHOLD=90
# Create the log and state directories if they do not exist yet.
# ${LOG_FILE%/*} strips the file name; it is fully quoted so the path
# survives word splitting when the script is run by a shell that performs it.
mkdir -p "${LOG_FILE%/*}" "$STATE_DIR"
# Timestamp helper.
# Outputs: the current local time as "YYYY-MM-DD HH:MM:SS ZONE" on stdout.
timestamp() {
  # %F and %T are the strftime shorthands for %Y-%m-%d and %H:%M:%S.
  date +'%F %T %Z'
}
# Append one timestamped line to the monitor log.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
log() {
  local entry
  entry="[$(timestamp)] $1"
  echo "$entry" >> "$LOG_FILE"
}
# Queue an alert by appending it to $STATE_DIR/alerts.queue.
# Globals:   STATE_DIR (read)
# Arguments: $1 - alert level
#            $2 - alert message body
# Record layout: "<timestamp> | <level> | DISK | <message>"
# NOTE(review): echo is kept on purpose. main builds messages containing
# literal \n sequences, which presumably rely on the running shell's echo
# expanding escapes - confirm before switching to printf.
send_alert() {
  local level="$1" message="$2"
  local queue_file="$STATE_DIR/alerts.queue"
  local stamp
  stamp=$(timestamp)
  echo "${stamp} | ${level} | DISK | ${message}" >> "$queue_file"
}
# Report how full one filesystem is.
# Arguments: $1 - path or filesystem handed to df
# Outputs:   the fifth column of the last df output line with any "%"
#            removed; an empty line when df produces no output.
check_disk_usage() {
  local target="$1"
  local pct
  # Remember column 5 of every line so that the last line wins, then strip
  # the percent sign in the same awk process.
  pct=$(df -h "$target" 2>/dev/null \
    | awk '{ cap = $5 } END { gsub(/%/, "", cap); print cap }')
  echo "$pct"
}
# List every mounted filesystem with its usage, skipping system/special
# volumes (any df line containing "devfs", "map ", "CoreSimulator" or
# "Cryptex").
# Outputs: one "mount_point,capacity%" line per filesystem on stdout.
#
# df -P selects the POSIX output format, whose columns are fixed at
# "Filesystem 512-blocks Used Available Capacity Mounted on". The default
# macOS "df -h" layout inserts iused/ifree/%iused before the mount point, so
# field 6 is not the mount point there.
# Limitation: a device name containing spaces still shifts the columns; the
# only such entries known here ("map ...") are filtered out.
get_all_filesystems() {
  df -P 2>/dev/null | awk '
    NR == 1 { next }                              # header line
    /devfs|map |CoreSimulator|Cryptex/ { next }   # system/special volumes
    {
      # The mount point is everything from field 6 on, so paths that
      # contain spaces (e.g. "/Volumes/My Disk") are kept intact.
      mount = $6
      for (i = 7; i <= NF; i++) mount = mount " " $i
      print mount "," $5
    }'
}
# Succeeds only when $1 is a non-empty string of decimal digits.
_disk_is_uint() {
  case "$1" in
    '' | *[!0-9]*) return 1 ;;
  esac
  return 0
}

# Main monitoring logic.
#
# Gathers the usage of every listed filesystem, derives an alert level
# (WARNING at >= WARN_THRESHOLD, CRITICAL at >= CRITICAL_THRESHOLD), compares
# it with the state recorded by the previous alert and then either queues an
# alert or, on quiet runs, writes a periodic "normal" log line.
#
# Globals read:  STATE_DIR, ALERT_STATE_FILE, WARN_THRESHOLD, CRITICAL_THRESHOLD
# Calls:         check_disk_usage, get_all_filesystems, send_alert, log,
#                timestamp
# Files written: $ALERT_STATE_FILE ("level|usage|timestamp"),
#                $STATE_DIR/disk-check-counter, the daily security log
# Arguments:     none are used
main() {
  local alert_needed=false
  local alert_level=""
  local max_usage=0
  local critical_fs=""

  # Seed the maximum with the root filesystem. A missing or non-numeric
  # reading is ignored so later arithmetic never sees garbage.
  local root_usage
  root_usage=$(check_disk_usage "/")
  if _disk_is_uint "$root_usage"; then
    max_usage=$root_usage
    critical_fs="/"
  fi

  # Walk every filesystem, tracking the fullest one and the highest level.
  local fs_list
  fs_list=$(get_all_filesystems)
  local details="Disk Usage Report:\n"
  local mount="" usage="" usage_num=""
  while IFS=',' read -r mount usage; do
    [[ -z "$mount" ]] && continue
    usage_num=$(echo "$usage" | tr -d '%')
    details="${details} $mount: $usage\n"
    # Entries without a numeric capacity (e.g. "-") stay in the report but
    # cannot be compared against the thresholds.
    _disk_is_uint "$usage_num" || continue
    if [[ "$usage_num" -gt "$max_usage" ]]; then
      max_usage=$usage_num
      critical_fs="$mount"
    fi
    if [[ "$usage_num" -ge "$CRITICAL_THRESHOLD" ]]; then
      alert_needed=true
      alert_level="CRITICAL"
    elif [[ "$usage_num" -ge "$WARN_THRESHOLD" ]] && [[ "$alert_level" != "CRITICAL" ]]; then
      alert_needed=true
      alert_level="WARNING"
    fi
  done <<< "$fs_list"

  # Load the state recorded by the last alert (prevents alert spam).
  local last_state=""
  local last_usage=0
  local last_stamp=""
  if [[ -f "$ALERT_STATE_FILE" ]]; then
    IFS='|' read -r last_state last_usage last_stamp < "$ALERT_STATE_FILE"
  fi
  # A truncated or hand-edited state file must not break the arithmetic below.
  _disk_is_uint "$last_usage" || last_usage=0

  # Alert logic with hysteresis (alert on rising, clear on falling below
  # threshold-5%).
  local should_alert=false
  if [[ "$alert_needed" == "true" ]]; then
    if [[ "$alert_level" == "CRITICAL" ]]; then
      # Alert on entering CRITICAL; while it persists, re-alert only once
      # usage has climbed at least 5 points above the last alerted value.
      # (There is no time-based rate limit.)
      if [[ "$last_state" != "CRITICAL" ]] || [[ $((max_usage - last_usage)) -ge 5 ]]; then
        should_alert=true
      fi
    elif [[ "$alert_level" == "WARNING" ]]; then
      # Alert only when entering WARNING from a non-alerting state; dropping
      # from CRITICAL to WARNING stays silent.
      if [[ "$last_state" != "WARNING" ]] && [[ "$last_state" != "CRITICAL" ]]; then
        should_alert=true
      fi
    fi
  elif [[ "$last_state" == "CRITICAL" ]] || [[ "$last_state" == "WARNING" ]]; then
    # Disk has recovered below threshold - send all-clear
    if [[ $max_usage -lt $((WARN_THRESHOLD - 5)) ]]; then
      alert_level="RECOVERED"
      should_alert=true
    fi
  fi

  if [[ "$should_alert" == "true" ]]; then
    local host_name
    host_name=$(hostname -s)
    local emoji=""
    local title=""
    case "$alert_level" in
      CRITICAL)
        emoji="🚨"
        title="CRITICAL: Disk Space Exhaustion Imminent"
        ;;
      WARNING)
        emoji="⚠️"
        title="WARNING: Disk Space Running Low"
        ;;
      RECOVERED)
        emoji="✅"
        title="RESOLVED: Disk Space Recovered"
        ;;
    esac
    # The message text starts at column 0 on purpose: indentation inside the
    # quotes would become part of the alert.
    local alert_msg="$emoji **$title** $emoji
**Host:** $host_name
**Time:** $(timestamp)
**Most Critical Mount:** $critical_fs (${max_usage}% used)
**All Filesystems:**
$details
$(if [[ "$alert_level" == "CRITICAL" ]]; then echo "🛑 **ACTION REQUIRED:** Free up disk space immediately!"; fi)
$(if [[ "$alert_level" == "WARNING" ]]; then echo "💡 **Recommendation:** Review and clean up unnecessary files."; fi)
_Detected by OpenClaw Disk Monitor_"
    send_alert "$alert_level" "$alert_msg"
    log "$alert_level alert sent for $critical_fs (${max_usage}% usage)"
    # Update state
    echo "$alert_level|$max_usage|$(timestamp)" > "$ALERT_STATE_FILE"
    # Log to daily security log
    local daily_log="/Users/mattbruce/.openclaw/workspace/memory/$(date '+%Y-%m-%d')-security.log"
    echo "DISK_${alert_level}|$(timestamp)|$critical_fs|${max_usage}%" >> "$daily_log"
  else
    # Normal operation - write a log line on every 6th quiet run
    # (about 30 min if the monitor is scheduled every 5 minutes; the
    # schedule is not visible in this script).
    local counter_file="$STATE_DIR/disk-check-counter"
    local counter=0
    [[ -f "$counter_file" ]] && counter=$(cat "$counter_file")
    # Restart the count if the counter file holds anything but a number.
    _disk_is_uint "$counter" || counter=0
    counter=$((counter + 1))
    if [[ $counter -ge 6 ]]; then
      log "Disk check normal. Max usage: $max_usage% on $critical_fs"
      counter=0
    fi
    echo "$counter" > "$counter_file"
  fi
}
# Entry point: run main, forwarding the script's command-line arguments
# (main does not currently read any of them).
main "$@"