diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml
index 835c381..e24a704 100644
--- a/.gitea/workflows/deploy.yml
+++ b/.gitea/workflows/deploy.yml
@@ -10,65 +10,49 @@ jobs:
     runs-on: self-hosted
 
     steps:
-      - name: Free disk space
+      - name: Ensure runner disk space
         run: |
-          set +e
-          echo "=== Disk before cleanup ==="
-          df -h
-          # Identify the directory holding the currently-running act job so we
-          # never touch it. Everything else under ~/.cache/act/ is fair game.
-          CURRENT_ACT_DIR=""
-          if [ -n "${ACT_TOOLCACHE_PATH:-}" ]; then
-            CURRENT_ACT_DIR=$(dirname "${ACT_TOOLCACHE_PATH}" 2>/dev/null)
+          # Grow the root partition/filesystem to fill the underlying volume
+          # (best effort), then fail fast if / still has too little free space.
+          set -e
+          echo "=== Disk before resize ==="
+          df -h /
+
+          ROOT_SOURCE=$(findmnt -n -o SOURCE / 2>/dev/null || true)
+          ROOT_FSTYPE=$(findmnt -n -o FSTYPE / 2>/dev/null || true)
+          DISK_NAME=$(lsblk -no PKNAME "$ROOT_SOURCE" 2>/dev/null | head -n1 || true)
+          PART_NUM=$(lsblk -no PARTN "$ROOT_SOURCE" 2>/dev/null | head -n1 || true)
+          # Not every lsblk build knows the PARTN column; fall back to the
+          # partition number the kernel exposes in sysfs.
+          if [ -z "$PART_NUM" ] && [ -n "$ROOT_SOURCE" ]; then
+            PART_NUM=$(cat "/sys/class/block/${ROOT_SOURCE##*/}/partition" 2>/dev/null || true)
+          fi
+
+          echo "Root source: $ROOT_SOURCE ($ROOT_FSTYPE)"
+          if [ -n "$DISK_NAME" ] && [ -n "$PART_NUM" ]; then
+            if command -v growpart >/dev/null 2>&1; then
+              # Best effort: growpart also exits non-zero when there is nothing to grow.
+              sudo growpart "/dev/$DISK_NAME" "$PART_NUM" || true
+            else
+              echo "growpart not installed; skipping partition grow."
+            fi
+          else
+            echo "Could not determine root disk/partition; skipping partition grow."
           fi
-          if [ -z "$CURRENT_ACT_DIR" ]; then
-            CURRENT_ACT_DIR=$(pwd | sed -n 's|\(.*/.cache/act/[^/]*\).*|\1|p')
-          fi
-          echo "Current act dir (preserved): ${CURRENT_ACT_DIR:-}"
-          # Wipe every other act workspace immediately (no mtime threshold).
-          # The old 10min threshold still left the previous failed job around,
-          # which then ate the disk before `npm ci` could finish.
-          if [ -d "$HOME/.cache/act" ]; then
-            du -sh "$HOME/.cache/act" 2>/dev/null
-            for d in "$HOME/.cache/act"/*/; do
-              [ -d "$d" ] || continue
-              case "$d" in
-                "$CURRENT_ACT_DIR"/*|"$CURRENT_ACT_DIR/") echo "skip current: $d" ;;
-                *) rm -rf "$d" && echo "removed: $d" ;;
-              esac
-            done
-          fi
-          # npm cache + setup-node cache: blow them away entirely. `npm ci`
-          # re-populates what it needs; the cache is a nice-to-have, not a
-          # requirement, and on a tight runner it's the easiest GB to reclaim.
-          rm -rf "$HOME/.npm/_cacache" "$HOME/.npm/_logs" 2>/dev/null
-          rm -rf "$HOME/.cache/setup-node" 2>/dev/null
-          # Stale actions-runner workspaces older than 30min.
-          if [ -d "$HOME/actions-runner/_work" ]; then
-            find "$HOME/actions-runner/_work" -mindepth 1 -maxdepth 2 -mmin +30 -exec rm -rf {} + 2>/dev/null
-          fi
-          # Docker: drop everything reclaimable (no `until` filter).
-          if command -v docker >/dev/null 2>&1; then
-            docker system prune -af --volumes 2>/dev/null
-          fi
-          # Stale /tmp files older than 2h, keep currently-running runner files.
-          find /tmp -mindepth 1 -maxdepth 1 -mmin +120 \
-            -not -name 'runner*' -not -name 'act*' \
-            -exec rm -rf {} + 2>/dev/null
-          echo "=== Disk after cleanup ==="
-          df -h
-          # Hard fail early if there still isn't enough room for `npm ci`,
-          # which needs ~3GB for this codebase's hoisted node_modules.
-          AVAIL_MB=$(df -Pm . | awk 'NR==2 {print $4}')
-          echo "Available on workspace volume: ${AVAIL_MB} MB"
+
+          case "$ROOT_FSTYPE" in
+            ext2|ext3|ext4) sudo resize2fs "$ROOT_SOURCE" || true ;;
+            xfs) sudo xfs_growfs / || true ;;
+          esac
+
+          echo "=== Disk after resize ==="
+          df -h /
+          AVAIL_MB=$(df -Pm / | awk 'NR==2 {print $4}')
+          echo "Available on root volume: ${AVAIL_MB} MB"
           if [ "${AVAIL_MB:-0}" -lt 3500 ]; then
-            echo "::error::Less than 3.5GB free after cleanup (${AVAIL_MB}MB)."
-            echo "The runner's EBS volume is too small for this codebase \
-            — ask devops to expand it. Failing fast so the next steps don't \
-            half-write an unusable build."
+            echo "::error::Less than 3.5GB free on root volume (${AVAIL_MB}MB)."
+            echo "If EBS is already 200GB, grow the EC2 root partition/filesystem on the runner host or install cloud-utils-growpart."
             exit 1
           fi
-          exit 0
 
       - name: Checkout code
         uses: actions/checkout@v4
@@ -91,26 +75,6 @@ jobs:
       - name: Test
         run: npm test
 
-      - name: Ensure free space before build
-        run: |
-          # Second-pass guard right before the heaviest step. `npm ci` plus
-          # tsc/test pull in tons of files since the first cleanup ran, and
-          # vite chunk writes need at least a few GB free — ENOSPC here is what
-          # killed past runs.
-          set +e
-          AVAIL_MB=$(df -Pm . | awk 'NR==2 {print $4}')
-          echo "Available on workspace volume: ${AVAIL_MB} MB"
-          if [ "${AVAIL_MB:-0}" -lt 3072 ]; then
-            echo "<3GB free — running an aggressive cleanup before build."
-            rm -rf "$HOME/.cache/act"/*/hostexecutor/node_modules/.cache 2>/dev/null
-            rm -rf "$HOME/.npm/_cacache" 2>/dev/null
-            if command -v docker >/dev/null 2>&1; then
-              docker system prune -af --volumes 2>/dev/null
-            fi
-            df -h
-          fi
-          exit 0
-
       - name: Build
         run: npm run build
         env:
diff --git a/docs/deploy.md b/docs/deploy.md
index 7de1372..7eaa44a 100644
--- a/docs/deploy.md
+++ b/docs/deploy.md
@@ -49,6 +49,18 @@ The workflow expects these Gitea secrets:
 
 ## Common failures
 
+### Runner disk still shows the old EBS size
+
+If the EC2 runner EBS volume was expanded but CI still reports a small root filesystem (for example `df -h /` still shows 8GB), the partition/filesystem has not grown yet. The deploy workflow runs an early `Ensure runner disk space` step that tries to grow `/` before installing dependencies:
+
+```bash
+sudo growpart /dev/<disk> <partition-number>
+sudo resize2fs <root-device>  # ext filesystems
+# or sudo xfs_growfs /        # xfs filesystems
+```
+
+If that step says `growpart not installed`, install `cloud-utils-growpart` on the runner host and rerun the workflow.
+
 ### Node version is too old
 
 The workflow pins Node.js 22 using `actions/setup-node`. This keeps the self-hosted runner from using an older system Node version during `npm ci`, tests, and build.