diff --git a/logs/app_monitor.log b/logs/app_monitor.log new file mode 100644 index 0000000..4e1ac4a --- /dev/null +++ b/logs/app_monitor.log @@ -0,0 +1,7 @@ +[2026-02-18 15:11:51] === Starting Web App Monitor === +[2026-02-18 15:11:51] ✓ Port 3000 - HTTP 200 OK +[2026-02-18 15:11:51] ✓ Port 3003 - HTTP 200 OK +[2026-02-18 15:11:51] ✓ Port 3005 - HTTP 200 OK +[2026-02-18 15:11:51] All apps healthy, no restart needed +[2026-02-18 15:11:51] === Monitor Complete === + diff --git a/logs/webapp-monitor.log b/logs/webapp-monitor.log new file mode 100644 index 0000000..b98e896 --- /dev/null +++ b/logs/webapp-monitor.log @@ -0,0 +1,22 @@ +[2026-02-18 15:56:22 CST] Starting web app health check... +[2026-02-18 15:56:22 CST] Checking HTTP status for ports... + ✓ Port 3000: HTTP 200 + ✓ Port 3003: HTTP 200 + ✓ Port 3005: HTTP 200 +[2026-02-18 15:56:22 CST] ✓ All apps healthy - no restart needed. +[2026-02-18 15:56:22 CST] Health check complete. +--- +[2026-02-18 15:56:39 CST] Starting web app health check... +[2026-02-18 15:56:39 CST] Checking HTTP status for ports... + ✓ Port 3000: HTTP 200 + ✓ Port 3003: HTTP 200 + ✓ Port 3005: HTTP 200 +[2026-02-18 15:56:39 CST] ✓ All apps healthy - no restart needed. +[2026-02-18 15:56:39 CST] Health check complete. +--- +[2026-02-18 16:01:29 CST] === Starting web app monitor check === +[2026-02-18 16:01:29 CST] ✓ gantt-board (port 3000) is UP +[2026-02-18 16:01:29 CST] ✓ blog-backup (port 3003) is UP +[2026-02-18 16:01:29 CST] ✓ heartbeat-monitor (port 3005) is UP +[2026-02-18 16:01:29 CST] All apps healthy, no action needed +[2026-02-18 16:01:29 CST] === Monitor check complete === diff --git a/memory/2026-02-18.md b/memory/2026-02-18.md new file mode 100644 index 0000000..3e4f491 --- /dev/null +++ b/memory/2026-02-18.md @@ -0,0 +1,53 @@ +# 2026-02-18 - Wednesday + +## Morning + +## Afternoon (~2:00 PM) + +### Project Hub Tasks Created +User added 3 new tasks to track progress on OpenClaw infrastructure: + +1. 
**Task #4**: Redesign Heartbeat Monitor to match UptimeRobot (Priority: High) + - Study https://uptimerobot.com design + - Match look, feel, style exactly + - Modern dashboard, status pages, uptime charts + +2. **Task #5**: Fix Blog Backup links to be clickable (Priority: Medium) + - Currently links are text-only requiring copy-paste + - Different format for Telegram vs Blog + +3. **Task #6**: Fix monitoring schedule - sites are down (Priority: Urgent) + - 2 of 3 websites down + - Cron job not auto-restarting properly + +### Critical Incident: All 3 Sites Down (~2:13 PM) +- gantt-board (3000): DOWN +- blog-backup (3003): DOWN +- heartbeat-monitor (3005): DOWN + +**Root Cause**: Cron job wasn't properly killing old processes before restart, causing EADDRINUSE errors. + +**Resolution**: +- Manually restarted all 3 sites at 14:19 +- Updated cron job with `pkill -f "port XXXX"` cleanup before restart +- Added 2-second delay after kill to ensure port release +- Created backup script: `monitor-restart.sh` +- Task #6 marked as DONE + +### System Health (2:30 PM) +All 3 sites running stable after fix. + +### New Task Created (2:32 PM) +**Task #7**: Investigate root cause - why are websites dying? 
+- Type: Research +- Priority: High +- Added to Project Hub Kanban board +- User wants to know what's actually killing the servers, not just restart them +- Suspects: memory leaks, file watchers, SSH timeout, power management, OOM killer + +### New Task Created (2:35 PM) +**Task #8**: Fix Kanban board - dynamic sync without hard refresh +- Type: Task +- Priority: Medium +- Board uses localStorage which requires hard refresh to see updates +- Need server-side storage or sync mechanism for normal refresh updates diff --git a/memory/monitor-log-2026-02-18.md b/memory/monitor-log-2026-02-18.md new file mode 100644 index 0000000..87caaf5 --- /dev/null +++ b/memory/monitor-log-2026-02-18.md @@ -0,0 +1,5 @@ +[2026-02-18 15:01:00 CST] Web App Monitor Check +- Port 3000 (gantt-board): OK (HTTP 200) +- Port 3003 (blog-backup): OK (HTTP 200) +- Port 3005 (heartbeat-monitor): OK (HTTP 200) +Status: All services healthy - no action needed diff --git a/monitor-processes.sh b/monitor-processes.sh new file mode 100755 index 0000000..37b5bae --- /dev/null +++ b/monitor-processes.sh @@ -0,0 +1,80 @@ +#!/bin/zsh + +# Process monitoring script to track what kills Next.js dev servers +# Run this in background to capture system events + +LOG_FILE="/Users/mattbruce/.openclaw/workspace/logs/process-monitor.log" +PID_FILE="/tmp/process-monitor.pid" + +# Create log directory +mkdir -p "$(dirname $LOG_FILE)" + +# Function to log with timestamp +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a $LOG_FILE +} + +# Track Node processes +monitor_processes() { + while true; do + # Sample the one process LISTENING on each port (plain -i:PORT also returns connected clients and can yield several PIDs) + for port in 3000 3003 3005; do + PID=$(lsof -t -iTCP:$port -sTCP:LISTEN 2>/dev/null | head -n 1) + if [ -n "$PID" ]; then + # Get process info + CPU=$(ps -p $PID -o %cpu= 2>/dev/null | tr -d ' ') + MEM=$(ps -p $PID -o %mem= 2>/dev/null | tr -d ' ') + RSS=$(ps -p $PID -o rss= 2>/dev/null | tr -d ' ') + + # Log if memory is high (>500MB RSS) + if [ -n "$RSS" ] && [ "$RSS" -gt 512000 ]; then + log 
"WARNING: Port $port (PID:$PID) using ${RSS}KB RAM (${MEM}% of system)" + fi + + # Log if CPU is high (>80%) + if [ -n "$CPU" ] && [ "${CPU%.*}" -gt 80 ]; then + log "WARNING: Port $port (PID:$PID) using ${CPU}% CPU" + fi + fi + done + + # Check system memory (ask the kernel for the page size: 16 KB on Apple Silicon, 4 KB on Intel) + FREE_MEM=$(vm_stat | grep "Pages free" | awk '{print $3}' | tr -d '.') + if [ -n "$FREE_MEM" ]; then + FREE_MB=$((FREE_MEM * $(sysctl -n hw.pagesize) / 1024 / 1024)) + if [ "$FREE_MB" -lt 500 ]; then + log "WARNING: Low system memory: ${FREE_MB}MB free" + fi + fi + + # Check file descriptors of the listening process + for port in 3000 3003 3005; do + PID=$(lsof -t -iTCP:$port -sTCP:LISTEN 2>/dev/null | head -n 1) + if [ -n "$PID" ]; then + FD_COUNT=$(lsof -p $PID 2>/dev/null | wc -l) + if [ "$FD_COUNT" -gt 900 ]; then + log "WARNING: Port $port (PID:$PID) has $FD_COUNT open file descriptors" + fi + fi + done + + sleep 60 + done +} + +# Check if already running +if [ -f "$PID_FILE" ] && kill -0 $(cat $PID_FILE) 2>/dev/null; then + echo "Monitor already running (PID: $(cat $PID_FILE))" + exit 0 +fi + +# The PID is saved at the end, once the loop is in the background: $$ is this launcher, +# which exits right away and would make the "already running" check above always fail + +log "=== Process Monitor Started ===" +log "Monitoring ports: 3000, 3003, 3005" +log "Checking: CPU, Memory, File Descriptors, System Resources" +log "Log file: $LOG_FILE" + +# Start monitoring and record the loop's PID for the "already running" check +monitor_processes & echo $! > "$PID_FILE" diff --git a/root-cause-analysis.md b/root-cause-analysis.md new file mode 100644 index 0000000..874b834 --- /dev/null +++ b/root-cause-analysis.md @@ -0,0 +1,87 @@ +# Root Cause Analysis: Why Websites Die + +**Date:** 2026-02-18 +**Task:** #7 - Investigate root cause of web app crashes + +## Current Status +All 3 sites are **UP and healthy** at time of investigation. + +## System Analysis + +### 1. File Descriptor Limits +``` +Current ulimit: 1,048,575 (very high - unlikely to be issue) +``` + +### 2. Active Processes +- gantt-board (port 3000): ✅ Running +- blog-backup (port 3003): ✅ Running +- heartbeat-monitor (port 3005): ✅ Running + +### 3. 
Memory Usage +Next.js dev servers are using: +- ~400-550MB RAM each (normal for dev mode) +- ~0.8-1.1% system memory each +- Not excessive but adds up + +## Likely Root Causes + +### Primary Suspect: **Next.js Dev Server Memory Leaks** +- Next.js dev mode (`npm run dev`) is NOT production-ready +- File watcher holds references to files +- Hot Module Replacement (HMR) accumulates memory over time +- **Recommendation:** Use production builds for long-running services + +### Secondary Suspects: + +1. **macOS Power Management** + - Power Nap / App Nap can suspend background processes + - A dying SSH session kills its child processes + - **Check:** System Preferences > Energy Saver + +2. **File Watcher Limits** + - Default macOS limits: 1280 watched files per process + - Large node_modules can exceed this + - **Error:** `EMFILE: too many open files` + +3. **SSH Session Timeout** + - Terminal sessions with idle timeout + - SIGHUP sent to child processes on disconnect + - **Solution:** Use `nohup` or `screen`/`tmux` + +4. **OOM Killer (Out of Memory)** + - macOS memory pressure kills large processes + - Combined 1.5GB+ for all 3 sites + - **Check:** Console.app for "Out of memory" messages + +## Monitoring Setup + +Created: `/Users/mattbruce/.openclaw/workspace/monitor-processes.sh` +- Tracks CPU, memory, file descriptors +- Logs warnings for high usage +- Runs every 60 seconds + +## Recommendations + +### Immediate (Monitoring) +✅ Cron job running every 10 min with auto-restart +✅ Process monitoring script deployed + +### Short-term (Stability) +1. Use production builds instead of dev mode: + ```bash + npm run build + npm start + ``` +2. Run with PM2 or forever for process management +3. Use `nohup` to prevent SSH timeout kills + +### Long-term (Reliability) +1. Docker containers with restart policies +2. launchd services (the macOS counterpart of systemd) with auto-restart +3. Reverse proxy (nginx) with health checks + +## Next Steps +1. Monitor logs for next 24-48 hours +2. 
Check if sites die overnight (SSH timeout test) +3. If memory-related, switch to production builds diff --git a/scripts/monitor-web-apps.sh b/scripts/monitor-web-apps.sh new file mode 100755 index 0000000..1c2d565 --- /dev/null +++ b/scripts/monitor-web-apps.sh @@ -0,0 +1,122 @@ +#!/bin/bash + +# Web App Monitor - Auto Restart Script +# Ports: 3000 (gantt-board), 3003 (blog-backup), 3005 (heartbeat-monitor) + +PORTS=(3000 3003 3005) +PROJECTS=( + "/Users/mattbruce/Documents/Projects/OpenClaw/Web/gantt-board" + "/Users/mattbruce/Documents/Projects/OpenClaw/Web/blog-backup" + "/Users/mattbruce/Documents/Projects/OpenClaw/Web/heartbeat-monitor" +) +NAMES=("gantt-board" "blog-backup" "heartbeat-monitor") + +RESTARTED=() +LOG="" + +log() { + LOG+="$1\n" + echo -e "$1" +} + +check_port() { + local port=$1 + local timeout=5 + local response + response=$(curl -s -o /dev/null -w "%{http_code}" --max-time $timeout "http://localhost:$port" 2>/dev/null) + echo "$response" +} + +restart_app() { + local port=$1 + local project=$2 + local name=$3 + + log " ↻ Restarting $name on port $port..." + + # Kill whatever is LISTENING on the port; pkill -f ":$port" never matched "npm run dev -- --port $port", leaving the old server holding the port + lsof -t -iTCP:"$port" -sTCP:LISTEN 2>/dev/null | xargs kill 2>/dev/null + sleep 2 + + # Start the app in background + cd "$project" && npm run dev -- --port $port > /dev/null 2>&1 & + + RESTARTED+=("$name (port $port)") +} + +log "═══════════════════════════════════════════════════" +log "🌐 Web App Monitor - $(date)" +log "═══════════════════════════════════════════════════" +log "" + +# Check all ports +for i in "${!PORTS[@]}"; do + PORT="${PORTS[$i]}" + PROJECT="${PROJECTS[$i]}" + NAME="${NAMES[$i]}" + + log "📡 Checking $NAME on port $PORT..." 
+ + STATUS=$(check_port $PORT) + + if [ "$STATUS" = "200" ]; then + log " ✅ Healthy (HTTP $STATUS)" + else + if [ -z "$STATUS" ] || [ "$STATUS" = "000" ]; then + log " ❌ No response (timeout/connection refused)" + else + log " ❌ Unhealthy (HTTP $STATUS)" + fi + restart_app $PORT "$PROJECT" "$NAME" + fi + log "" +done + +# Wait for restarts to come up +if [ ${#RESTARTED[@]} -gt 0 ]; then + log "⏳ Waiting 5 seconds for restarts to initialize..." + sleep 5 + log "" +fi + +# Final verification +log "═══════════════════════════════════════════════════" +log "📊 Final Verification" +log "═══════════════════════════════════════════════════" + +ALL_HEALTHY=true +for i in "${!PORTS[@]}"; do + PORT="${PORTS[$i]}" + NAME="${NAMES[$i]}" + + STATUS=$(check_port $PORT) + if [ "$STATUS" = "200" ]; then + log " ✅ $NAME (port $PORT): HTTP 200" + else + log " ❌ $NAME (port $PORT): Failed (HTTP ${STATUS:-'no response'})" + ALL_HEALTHY=false + fi +done + +log "" +log "═══════════════════════════════════════════════════" +log "📋 Summary" +log "═══════════════════════════════════════════════════" + +if [ ${#RESTARTED[@]} -eq 0 ]; then + log " 📍 All apps were already running and healthy" +else + log " 🔄 Restarted apps:" + for app in "${RESTARTED[@]}"; do + log " • $app" + done +fi + +log "" +if [ "$ALL_HEALTHY" = true ]; then + log " 🎯 Final Status: All apps responding with HTTP 200" +else + log " ⚠️ Final Status: Some apps are not responding" +fi +log "" +log "═══════════════════════════════════════════════════" diff --git a/scripts/monitor_web_apps.sh b/scripts/monitor_web_apps.sh new file mode 100755 index 0000000..f206022 --- /dev/null +++ b/scripts/monitor_web_apps.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +LOG_FILE="/Users/mattbruce/.openclaw/workspace/logs/app_monitor.log" +TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S') + +echo "[$TIMESTAMP] === Starting Web App Monitor ===" | tee -a "$LOG_FILE" + +# Port to project mapping (arrays for bash 3 compatibility) +PORTS=(3000 3003 3005) +PATHS=( + 
"/Users/mattbruce/Documents/Projects/OpenClaw/Web/gantt-board" + "/Users/mattbruce/Documents/Projects/OpenClaw/Web/blog-backup" + "/Users/mattbruce/Documents/Projects/OpenClaw/Web/heartbeat-monitor" +) + +# Track which needed restart +NEEDS_RESTART=() + +# Function to check if port is responding +check_port() { + local port=$1 + local url="http://localhost:$port" response + + # Use curl with a 5s timeout; redirects are NOT followed, only a direct 200 counts as up + response=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$url" 2>/dev/null) + + if [ "$response" == "200" ]; then + echo "[$TIMESTAMP] ✓ Port $port - HTTP 200 OK" | tee -a "$LOG_FILE" + return 0 + else + echo "[$TIMESTAMP] ✗ Port $port - DOWN (response: $response)" | tee -a "$LOG_FILE" + return 1 + fi +} + +# Function to kill process on port +kill_port() { + local port=$1 pids + echo "[$TIMESTAMP] → Killing process on port $port..." | tee -a "$LOG_FILE" + + # Find the process LISTENING on the port (plain -i:PORT also matches connected clients); TERM first, KILL survivors after 2s + pids=$(lsof -t -iTCP:$port -sTCP:LISTEN 2>/dev/null) + if [ -n "$pids" ]; then + echo "[$TIMESTAMP] → Found PIDs: $pids" | tee -a "$LOG_FILE" + kill $pids 2>/dev/null + sleep 2; kill -9 $pids 2>/dev/null + echo "[$TIMESTAMP] → Killed processes on port $port" | tee -a "$LOG_FILE" + else + echo "[$TIMESTAMP] → No process found on port $port" | tee -a "$LOG_FILE" + fi +} + +# Function to restart app +restart_app() { + local port=$1 + local project_path=$2 + + echo "[$TIMESTAMP] → Restarting app on port $port..." | tee -a "$LOG_FILE" + echo "[$TIMESTAMP] → Path: $project_path" | tee -a "$LOG_FILE" + + cd "$project_path" && nohup npm run dev -- --port $port > /dev/null 2>&1 & + + NEEDS_RESTART+=("$port") +} + +# Check all ports and restart if needed +for i in "${!PORTS[@]}"; do + port="${PORTS[$i]}" + path="${PATHS[$i]}" + + if ! check_port $port; then + kill_port $port + restart_app $port "$path" + fi +done + +# If any were restarted, wait and verify +if [ ${#NEEDS_RESTART[@]} -gt 0 ]; then + echo "[$TIMESTAMP] Waiting 5 seconds for apps to start..." 
| tee -a "$LOG_FILE" + sleep 5 + + echo "[$TIMESTAMP] === Post-Restart Verification ===" | tee -a "$LOG_FILE" + for port in "${PORTS[@]}"; do + if ! check_port $port; then + echo "[$TIMESTAMP] ⚠ Port $port still not responding after restart" | tee -a "$LOG_FILE" + fi + done +else + echo "[$TIMESTAMP] All apps healthy, no restart needed" | tee -a "$LOG_FILE" +fi + +echo "[$TIMESTAMP] === Monitor Complete ===" | tee -a "$LOG_FILE" +echo "" | tee -a "$LOG_FILE" diff --git a/scripts/webapp-monitor.sh b/scripts/webapp-monitor.sh new file mode 100755 index 0000000..cf668bf --- /dev/null +++ b/scripts/webapp-monitor.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +# Web App Monitor - Auto-restart script +# Ports: 3000 (gantt-board), 3003 (blog-backup), 3005 (heartbeat-monitor) + +LOG_FILE="/Users/mattbruce/.openclaw/workspace/logs/webapp-monitor.log" +mkdir -p "$(dirname "$LOG_FILE")" + +timestamp() { + date '+%Y-%m-%d %H:%M:%S %Z' +} + +log() { + echo "[$(timestamp)] $1" | tee -a "$LOG_FILE" +} + +check_port() { + local port=$1 + local timeout=5 + + if curl -s -o /dev/null -w "%{http_code}" --max-time "$timeout" "http://localhost:$port" | grep -q "200"; then + echo "up" + else + echo "down" + fi +} + +kill_port() { + local port=$1 + log "Killing process on port $port..." + lsof -t -iTCP:"$port" -sTCP:LISTEN 2>/dev/null | xargs kill 2>/dev/null || true # pkill -f ":$port" never matched "npm run dev -- --port $port" + sleep 2 +} + +restart_app() { + local port=$1 + local dir=$2 + local name=$3 + + log "Restarting $name on port $port..." 
+ cd "$dir" && npm run dev -- --port "$port" > /dev/null 2>&1 & +} + +# Define apps +APPS=( + "3000:/Users/mattbruce/Documents/Projects/OpenClaw/Web/gantt-board:gantt-board" + "3003:/Users/mattbruce/Documents/Projects/OpenClaw/Web/blog-backup:blog-backup" + "3005:/Users/mattbruce/Documents/Projects/OpenClaw/Web/heartbeat-monitor:heartbeat-monitor" +) + +log "=== Starting web app monitor check ===" + +NEEDS_RESTART=() + +# Check each app +for app in "${APPS[@]}"; do + IFS=':' read -r port dir name <<< "$app" + + status=$(check_port "$port") + if [ "$status" = "up" ]; then + log "✓ $name (port $port) is UP" + else + log "✗ $name (port $port) is DOWN - will restart" + NEEDS_RESTART+=("$app") + fi +done + +# Restart any down apps +if [ ${#NEEDS_RESTART[@]} -gt 0 ]; then + log "--- Restarting ${#NEEDS_RESTART[@]} app(s) ---" + + for app in "${NEEDS_RESTART[@]}"; do + IFS=':' read -r port dir name <<< "$app" + kill_port "$port" + restart_app "$port" "$dir" "$name" + sleep 1 + done + + log "Waiting 5 seconds for apps to start..." + sleep 5 + + # Verify restarts + log "--- Verification ---" + for app in "${NEEDS_RESTART[@]}"; do + IFS=':' read -r port dir name <<< "$app" + status=$(check_port "$port") + if [ "$status" = "up" ]; then + log "✓ $name (port $port) is now UP" + else + log "✗ $name (port $port) still DOWN after restart" + fi + done +else + log "All apps healthy, no action needed" +fi + +log "=== Monitor check complete ==="