ci: simplify runner disk resize check
Some checks failed
Deploy to Frontend Servers / deploy (push) Failing after 2s
Some checks failed
Deploy to Frontend Servers / deploy (push) Failing after 2s
This commit is contained in:
@@ -10,65 +10,40 @@ jobs:
|
||||
runs-on: self-hosted
|
||||
|
||||
steps:
|
||||
- name: Free disk space
|
||||
- name: Ensure runner disk space
|
||||
run: |
|
||||
set +e
|
||||
echo "=== Disk before cleanup ==="
|
||||
df -h
|
||||
# Identify the directory holding the currently-running act job so we
|
||||
# never touch it. Everything else under ~/.cache/act/ is fair game.
|
||||
CURRENT_ACT_DIR=""
|
||||
if [ -n "${ACT_TOOLCACHE_PATH:-}" ]; then
|
||||
CURRENT_ACT_DIR=$(dirname "${ACT_TOOLCACHE_PATH}" 2>/dev/null)
|
||||
set -e
|
||||
echo "=== Disk before resize ==="
|
||||
df -h /
|
||||
|
||||
ROOT_SOURCE=$(findmnt -n -o SOURCE / 2>/dev/null || true)
|
||||
ROOT_FSTYPE=$(findmnt -n -o FSTYPE / 2>/dev/null || true)
|
||||
DISK_NAME=$(lsblk -no PKNAME "$ROOT_SOURCE" 2>/dev/null | head -n1 || true)
|
||||
PART_NUM=$(lsblk -no PARTN "$ROOT_SOURCE" 2>/dev/null | head -n1 || true)
|
||||
|
||||
echo "Root source: $ROOT_SOURCE ($ROOT_FSTYPE)"
|
||||
if [ -n "$DISK_NAME" ] && [ -n "$PART_NUM" ]; then
|
||||
if command -v growpart >/dev/null 2>&1; then
|
||||
sudo growpart "/dev/$DISK_NAME" "$PART_NUM" || true
|
||||
else
|
||||
echo "growpart not installed; skipping partition grow."
|
||||
fi
|
||||
fi
|
||||
if [ -z "$CURRENT_ACT_DIR" ]; then
|
||||
CURRENT_ACT_DIR=$(pwd | sed -n 's|\(.*/.cache/act/[^/]*\).*|\1|p')
|
||||
fi
|
||||
echo "Current act dir (preserved): ${CURRENT_ACT_DIR:-<unknown>}"
|
||||
# Wipe every other act workspace immediately (no mtime threshold).
|
||||
# The old 10min threshold still left the previous failed job around,
|
||||
# which then ate the disk before `npm ci` could finish.
|
||||
if [ -d "$HOME/.cache/act" ]; then
|
||||
du -sh "$HOME/.cache/act" 2>/dev/null
|
||||
for d in "$HOME/.cache/act"/*/; do
|
||||
[ -d "$d" ] || continue
|
||||
case "$d" in
|
||||
"$CURRENT_ACT_DIR"/*|"$CURRENT_ACT_DIR/") echo "skip current: $d" ;;
|
||||
*) rm -rf "$d" && echo "removed: $d" ;;
|
||||
esac
|
||||
done
|
||||
fi
|
||||
# npm cache + setup-node cache: blow them away entirely. `npm ci`
|
||||
# re-populates what it needs; the cache is a nice-to-have, not a
|
||||
# requirement, and on a tight runner it's the easiest GB to reclaim.
|
||||
rm -rf "$HOME/.npm/_cacache" "$HOME/.npm/_logs" 2>/dev/null
|
||||
rm -rf "$HOME/.cache/setup-node" 2>/dev/null
|
||||
# Stale actions-runner workspaces older than 30min.
|
||||
if [ -d "$HOME/actions-runner/_work" ]; then
|
||||
find "$HOME/actions-runner/_work" -mindepth 1 -maxdepth 2 -mmin +30 -exec rm -rf {} + 2>/dev/null
|
||||
fi
|
||||
# Docker: drop everything reclaimable (no `until` filter).
|
||||
if command -v docker >/dev/null 2>&1; then
|
||||
docker system prune -af --volumes 2>/dev/null
|
||||
fi
|
||||
# Stale /tmp files older than 2h, keep currently-running runner files.
|
||||
find /tmp -mindepth 1 -maxdepth 1 -mmin +120 \
|
||||
-not -name 'runner*' -not -name 'act*' \
|
||||
-exec rm -rf {} + 2>/dev/null
|
||||
echo "=== Disk after cleanup ==="
|
||||
df -h
|
||||
# Hard fail early if there still isn't enough room for `npm ci`,
|
||||
# which needs ~3GB for this codebase's hoisted node_modules.
|
||||
AVAIL_MB=$(df -Pm . | awk 'NR==2 {print $4}')
|
||||
echo "Available on workspace volume: ${AVAIL_MB} MB"
|
||||
|
||||
case "$ROOT_FSTYPE" in
|
||||
ext2|ext3|ext4) sudo resize2fs "$ROOT_SOURCE" || true ;;
|
||||
xfs) sudo xfs_growfs / || true ;;
|
||||
esac
|
||||
|
||||
echo "=== Disk after resize ==="
|
||||
df -h /
|
||||
AVAIL_MB=$(df -Pm / | awk 'NR==2 {print $4}')
|
||||
echo "Available on root volume: ${AVAIL_MB} MB"
|
||||
if [ "${AVAIL_MB:-0}" -lt 3500 ]; then
|
||||
echo "::error::Less than 3.5GB free after cleanup (${AVAIL_MB}MB)."
|
||||
echo "The runner's EBS volume is too small for this codebase \
|
||||
— ask devops to expand it. Failing fast so the next steps don't \
|
||||
half-write an unusable build."
|
||||
echo "::error::Less than 3.5GB free on root volume (${AVAIL_MB}MB)."
|
||||
echo "If EBS is already 200GB, grow the EC2 root partition/filesystem on the runner host or install cloud-utils-growpart."
|
||||
exit 1
|
||||
fi
|
||||
exit 0
|
||||
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
@@ -91,26 +66,6 @@ jobs:
|
||||
- name: Test
|
||||
run: npm test
|
||||
|
||||
- name: Ensure free space before build
|
||||
run: |
|
||||
# Second-pass guard right before the heaviest step. `npm ci` plus
|
||||
# tsc/test pull in tons of files since the first cleanup ran, and
|
||||
# vite chunk writes need at least a few GB free — ENOSPC here is what
|
||||
# killed past runs.
|
||||
set +e
|
||||
AVAIL_MB=$(df -Pm . | awk 'NR==2 {print $4}')
|
||||
echo "Available on workspace volume: ${AVAIL_MB} MB"
|
||||
if [ "${AVAIL_MB:-0}" -lt 3072 ]; then
|
||||
echo "<3GB free — running an aggressive cleanup before build."
|
||||
rm -rf "$HOME/.cache/act"/*/hostexecutor/node_modules/.cache 2>/dev/null
|
||||
rm -rf "$HOME/.npm/_cacache" 2>/dev/null
|
||||
if command -v docker >/dev/null 2>&1; then
|
||||
docker system prune -af --volumes 2>/dev/null
|
||||
fi
|
||||
df -h
|
||||
fi
|
||||
exit 0
|
||||
|
||||
- name: Build
|
||||
run: npm run build
|
||||
env:
|
||||
|
||||
@@ -49,6 +49,18 @@ The workflow expects these Gitea secrets:
|
||||
|
||||
## Common failures
|
||||
|
||||
### Runner disk still shows the old EBS size
|
||||
|
||||
If the EC2 runner's EBS volume was expanded but CI still reports a small root filesystem (for example, `df -h /` still shows 8GB), the root partition and filesystem have not yet been grown to use the new space. The deploy workflow runs an early `Ensure runner disk space` step that tries to grow `/` before installing dependencies, using the equivalent of:
|
||||
|
||||
```bash
|
||||
sudo growpart <root-disk> <root-partition>
|
||||
sudo resize2fs <root-partition> # ext filesystems
|
||||
# or sudo xfs_growfs / # xfs filesystems
|
||||
```
|
||||
|
||||
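If that step logs `growpart not installed; skipping partition grow.`, install the package that provides `growpart` on the runner host (`cloud-utils-growpart` on Amazon Linux/RHEL, `cloud-guest-utils` on Debian/Ubuntu) and rerun the workflow.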
|
||||
|
||||
### Node version is too old
|
||||
|
||||
The workflow pins Node.js 22 using `actions/setup-node`. This keeps the self-hosted runner from using an older system Node version during `npm ci`, tests, and build.
|
||||
|
||||
Reference in New Issue
Block a user