From 93901ba50658bdf182d47a7addd1d9622840022a Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 26 Jun 2026 15:59:33 +0200 Subject: [PATCH] feat(ci): introduce merge queue Closes #1946 Signed-off-by: Evan Lezar --- .github/actions/pr-gate/action.yml | 4 +- .github/workflows/branch-checks.yml | 2 + .github/workflows/branch-e2e.yml | 32 ++++++++---- .github/workflows/helm-lint.yml | 2 + .github/workflows/required-ci-gates.yml | 66 +++++++++++++++++++++---- CI.md | 38 +++++++++++--- architecture/build.md | 9 ++-- 7 files changed, 122 insertions(+), 31 deletions(-) diff --git a/.github/actions/pr-gate/action.yml b/.github/actions/pr-gate/action.yml index 0c55bf120..3e10435a3 100644 --- a/.github/actions/pr-gate/action.yml +++ b/.github/actions/pr-gate/action.yml @@ -3,8 +3,8 @@ description: > Resolve PR metadata for a `pull-request/` push from copy-pr-bot and decide whether the workflow should run. Sets `should-run=true` only when the pushed SHA still matches the PR head SHA. If `required_label` is provided, the PR - must also carry that label. For non-`push` events (e.g. `workflow_dispatch`), - always sets `should-run=true`. + must also carry that label. For non-`push` events (e.g. `workflow_dispatch` + and `merge_group`), always sets `should-run=true`. 
inputs: required_label: diff --git a/.github/workflows/branch-checks.yml b/.github/workflows/branch-checks.yml index 7713febb6..b01cdb771 100644 --- a/.github/workflows/branch-checks.yml +++ b/.github/workflows/branch-checks.yml @@ -1,6 +1,8 @@ name: Branch Checks on: + merge_group: + types: [checks_requested] push: branches: - "pull-request/[0-9]+" diff --git a/.github/workflows/branch-e2e.yml b/.github/workflows/branch-e2e.yml index 23d7d1cf6..1fcb73ad4 100644 --- a/.github/workflows/branch-e2e.yml +++ b/.github/workflows/branch-e2e.yml @@ -1,6 +1,8 @@ name: Branch E2E Checks on: + merge_group: + types: [checks_requested] push: branches: - "pull-request/[0-9]+" @@ -37,15 +39,27 @@ jobs: shell: bash run: | set -euo pipefail - if [ "$EVENT_NAME" != "push" ]; then - run_core_e2e=true - run_gpu_e2e=true - run_kubernetes_ha_e2e=true - else - run_core_e2e="$(jq -r 'index("test:e2e") != null' <<< "$LABELS_JSON")" - run_gpu_e2e="$(jq -r 'index("test:e2e-gpu") != null' <<< "$LABELS_JSON")" - run_kubernetes_ha_e2e="$(jq -r 'index("test:e2e-kubernetes") != null' <<< "$LABELS_JSON")" - fi + case "$EVENT_NAME" in + push) + run_core_e2e="$(jq -r 'index("test:e2e") != null' <<< "$LABELS_JSON")" + run_gpu_e2e="$(jq -r 'index("test:e2e-gpu") != null' <<< "$LABELS_JSON")" + run_kubernetes_ha_e2e="$(jq -r 'index("test:e2e-kubernetes") != null' <<< "$LABELS_JSON")" + ;; + merge_group) + run_core_e2e=true + run_gpu_e2e=true + run_kubernetes_ha_e2e=false + ;; + workflow_dispatch) + run_core_e2e=true + run_gpu_e2e=true + run_kubernetes_ha_e2e=true + ;; + *) + echo "::error::Unsupported event '$EVENT_NAME'" >&2 + exit 1 + ;; + esac if [ "$run_core_e2e" = "true" ] || [ "$run_gpu_e2e" = "true" ] || [ "$run_kubernetes_ha_e2e" = "true" ]; then run_any_e2e=true else diff --git a/.github/workflows/helm-lint.yml b/.github/workflows/helm-lint.yml index 8c898011c..b57346101 100644 --- a/.github/workflows/helm-lint.yml +++ b/.github/workflows/helm-lint.yml @@ -4,6 +4,8 @@ name: Helm Lint on: + 
merge_group: + types: [checks_requested] push: branches: - "pull-request/[0-9]+" diff --git a/.github/workflows/required-ci-gates.yml b/.github/workflows/required-ci-gates.yml index ca068cf5c..b5b521a8f 100644 --- a/.github/workflows/required-ci-gates.yml +++ b/.github/workflows/required-ci-gates.yml @@ -1,6 +1,8 @@ name: Required CI Gates on: + merge_group: + types: [checks_requested] pull_request_target: types: [opened, synchronize, reopened, ready_for_review, labeled, unlabeled] workflow_run: @@ -17,7 +19,7 @@ permissions: statuses: write concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.event.workflow_run.head_sha || github.run_id }} + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.event.workflow_run.head_sha || github.sha || github.run_id }} cancel-in-progress: true jobs: @@ -33,9 +35,13 @@ jobs: PR_NUMBER_FROM_EVENT: ${{ github.event.pull_request.number }} PR_HEAD_SHA_FROM_EVENT: ${{ github.event.pull_request.head.sha }} PR_LABELS_FROM_EVENT: ${{ toJSON(github.event.pull_request.labels.*.name) }} + GITHUB_SHA_FROM_CONTEXT: ${{ github.sha }} + GITHUB_REF_NAME_FROM_CONTEXT: ${{ github.ref_name }} + GITHUB_RUN_ID_FROM_CONTEXT: ${{ github.run_id }} WORKFLOW_RUN_HEAD_SHA: ${{ github.event.workflow_run.head_sha }} WORKFLOW_RUN_HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }} WORKFLOW_RUN_EVENT: ${{ github.event.workflow_run.event }} + WORKFLOW_RUN_HTML_URL: ${{ github.event.workflow_run.html_url }} shell: bash run: | set -euo pipefail @@ -67,12 +73,19 @@ jobs: } resolve_pull_request_event() { + CONTEXT_KIND="pull_request" + WORKFLOW_EVENT="push" PR_NUMBER="$PR_NUMBER_FROM_EVENT" HEAD_SHA="$PR_HEAD_SHA_FROM_EVENT" LABELS_JSON=$(jq -c . 
<<< "$PR_LABELS_FROM_EVENT") + MIRROR_REF="pull-request/$PR_NUMBER" + EXPECTED_HEAD_BRANCH="$MIRROR_REF" + STATUS_TARGET_URL="https://github.com/$GH_REPO/pull/$PR_NUMBER" } load_pr_context() { + CONTEXT_KIND="pull_request" + WORKFLOW_EVENT="push" PR_NUMBER="$1" local pr state @@ -85,9 +98,35 @@ jobs: HEAD_SHA=$(jq -r '.head.sha' <<< "$pr") LABELS_JSON=$(gh api "repos/$GH_REPO/issues/$PR_NUMBER" --jq '[.labels[].name]') + MIRROR_REF="pull-request/$PR_NUMBER" + EXPECTED_HEAD_BRANCH="$MIRROR_REF" + STATUS_TARGET_URL="https://github.com/$GH_REPO/pull/$PR_NUMBER" + } + + resolve_merge_group_event() { + CONTEXT_KIND="merge_group" + WORKFLOW_EVENT="merge_group" + HEAD_SHA="$GITHUB_SHA_FROM_CONTEXT" + LABELS_JSON="[]" + EXPECTED_HEAD_BRANCH="$GITHUB_REF_NAME_FROM_CONTEXT" + STATUS_TARGET_URL="https://github.com/$GH_REPO/actions/runs/$GITHUB_RUN_ID_FROM_CONTEXT" + } + + resolve_merge_group_workflow_run_event() { + CONTEXT_KIND="merge_group" + WORKFLOW_EVENT="merge_group" + HEAD_SHA="$WORKFLOW_RUN_HEAD_SHA" + LABELS_JSON="[]" + EXPECTED_HEAD_BRANCH="$WORKFLOW_RUN_HEAD_BRANCH" + STATUS_TARGET_URL="${WORKFLOW_RUN_HTML_URL:-https://github.com/$GH_REPO/actions}" } resolve_workflow_run_event() { + if [ "$WORKFLOW_RUN_EVENT" = "merge_group" ]; then + resolve_merge_group_workflow_run_event + return + fi + if [ "$WORKFLOW_RUN_EVENT" != "push" ]; then echo "Ignoring workflow_run from event '$WORKFLOW_RUN_EVENT'." 
exit 0 @@ -112,6 +151,8 @@ jobs: resolve_context() { if [ "$EVENT_NAME" = "pull_request_target" ]; then resolve_pull_request_event + elif [ "$EVENT_NAME" = "merge_group" ]; then + resolve_merge_group_event elif [ "$EVENT_NAME" = "workflow_run" ]; then resolve_workflow_run_event else @@ -119,22 +160,25 @@ jobs: exit 1 fi - PR_URL="https://github.com/$GH_REPO/pull/$PR_NUMBER" - MIRROR_REF="pull-request/$PR_NUMBER" + STATUS_TARGET_URL="${STATUS_TARGET_URL:-https://github.com/$GH_REPO/actions}" } verify_mirror() { local context="$1" local mirror_sha + if [ "$CONTEXT_KIND" = "merge_group" ]; then + return 0 + fi + mirror_sha=$(gh api "repos/$GH_REPO/branches/$MIRROR_REF" --jq '.commit.sha' 2>/dev/null || true) if [ -z "$mirror_sha" ]; then - post_status "$context" pending "Waiting for /ok to test mirror" "$PR_URL" + post_status "$context" pending "Waiting for /ok to test mirror" "$STATUS_TARGET_URL" return 1 fi if [ "$mirror_sha" != "$HEAD_SHA" ]; then - post_status "$context" pending "Waiting for /ok to test mirror" "$PR_URL" + post_status "$context" pending "Waiting for /ok to test mirror" "$STATUS_TARGET_URL" return 1 fi @@ -149,8 +193,8 @@ jobs: local required_job_name="${5:-}" local workflow_url="https://github.com/$GH_REPO/actions/workflows/$workflow_file" - if [ -n "$required_label" ] && ! has_label "$required_label"; then - post_status "$context" success "$required_label not applied" "$PR_URL" + if [ "$CONTEXT_KIND" = "pull_request" ] && [ -n "$required_label" ] && ! 
has_label "$required_label"; then + post_status "$context" success "$required_label not applied" "$STATUS_TARGET_URL" return 0 fi @@ -159,8 +203,12 @@ jobs: fi local runs latest run_id status conclusion run_url real_success - runs=$(gh api "repos/$GH_REPO/actions/workflows/$workflow_file/runs?head_sha=$HEAD_SHA&event=push" --jq '.workflow_runs') - latest=$(jq -c --arg branch "$MIRROR_REF" '[.[] | select(.head_branch == $branch)] | sort_by(.created_at) | reverse | .[0] // empty' <<< "$runs") + runs=$(gh api "repos/$GH_REPO/actions/workflows/$workflow_file/runs?head_sha=$HEAD_SHA&event=$WORKFLOW_EVENT" --jq '.workflow_runs') + if [ -n "${EXPECTED_HEAD_BRANCH:-}" ]; then + latest=$(jq -c --arg branch "$EXPECTED_HEAD_BRANCH" '[.[] | select(.head_branch == $branch)] | sort_by(.created_at) | reverse | .[0] // empty' <<< "$runs") + else + latest=$(jq -c 'sort_by(.created_at) | reverse | .[0] // empty' <<< "$runs") + fi if [ -z "$latest" ]; then post_status "$context" pending "Waiting for $workflow_name" "$workflow_url" diff --git a/CI.md b/CI.md index d04668aaf..e14d5bdb9 100644 --- a/CI.md +++ b/CI.md @@ -10,6 +10,8 @@ PR CI that runs on NVIDIA self-hosted runners uses NVIDIA's copy-pr-bot. The bot `Branch Checks` run automatically after copy-pr-bot mirrors the PR. `Required CI Gates` posts PR-head statuses that verify the mirror exists, is current, and ran the expected push-based workflows. E2E suites are opt-in because they are more expensive and publish temporary images. +Merge queue validation is a second integration gate for `main`. After a PR has passed the required PR-head statuses, a maintainer adds it to the merge queue. GitHub creates a temporary merge-group branch that combines the latest `main`, the queued PR, and any earlier queued PRs. The same required `OpenShell / ...` status contexts are then published against the merge-group SHA before GitHub merges it. 
+ Three opt-in labels enable the long-running E2E suites: - `test:e2e` runs the standard E2E suite in `Branch E2E Checks` @@ -75,6 +77,7 @@ Flow: 4. The maintainer opens that link and clicks **Re-run all jobs**. This time `pr_metadata` sees the label and the build/E2E jobs run. 5. When the run finishes, the matching `OpenShell / ...` gate status flips to green automatically. 6. New commits push to the mirror automatically and re-trigger `Branch Checks` plus any labeled E2E jobs in `Branch E2E Checks`. +7. When the PR is ready to merge, use **Add to merge queue** instead of merging directly. The queue validates the final integration state before updating `main`. ### Forked PR @@ -88,9 +91,30 @@ Flow: 1. Open the PR. The vouch check confirms you are vouched (otherwise the PR is auto-closed). 2. copy-pr-bot does not mirror forks automatically. A maintainer reviews the diff and comments `/ok to test ` with your latest commit SHA. 3. After `/ok to test`, copy-pr-bot mirrors to `pull-request/`. From here the flow is identical to internal PRs: `Required CI Gates` verifies the mirror and required push workflows, and maintainers apply the E2E label when the extra suites are needed. +4. When the PR is ready to merge, maintainers add it to the merge queue so the queued integration state is tested before it reaches `main`. Important: every new commit you push requires another `/ok to test ` from a maintainer before push-based CI will run on it. If a label is applied while the mirror is stale, `E2E Label Help` will post a comment explaining what's needed. +## Merge queue + +GitHub merge queue is required for `main`. Repository administrators must enable **Require merge queue** in the branch ruleset for `main` and keep these required status contexts aligned with the PR gates: + +- `OpenShell / Branch Checks` +- `OpenShell / E2E` +- `OpenShell / GPU E2E` +- `OpenShell / Helm Lint` + +Do not require the underlying workflow job names directly. 
`Required CI Gates` publishes stable commit statuses for both PR-head mirror commits and merge-group commits. + +Merge-group runs use the `merge_group` event. The event is distinct from `pull_request` and `push`, and GitHub will not report required checks for queued PRs unless the workflows include it. In this repository: + +- `Branch Checks` runs the standard non-E2E gates on the merge-group SHA. +- `Branch E2E Checks` runs core E2E and GPU E2E for merge groups. Kubernetes HA E2E remains optional and label-driven on PRs. +- `Helm Lint` runs for merge groups without the PR diff optimization, because the merge-group branch is the final integration state. +- `Required CI Gates` posts the same `OpenShell / ...` statuses to the merge-group SHA and does not require a `pull-request/` mirror for merge-group events. + +Maintainers should add ready PRs to the queue rather than pressing a direct merge button. GitHub removes a PR from the queue if the merge-group checks fail or time out. + ## copy-pr-bot [copy-pr-bot](https://github.com/apps/copy-pr-bot) is a GitHub App maintained by NVIDIA that solves a specific GitHub Actions security problem: by default, `pull_request`-triggered workflows on a self-hosted runner can run an arbitrary contributor's code on hardware the project owns. For projects that need self-hosted runners (GPU access, ARM hardware, on-prem secrets), GitHub's recommended pattern is to never trigger workflows directly from external `pull_request` events. @@ -109,12 +133,12 @@ The bot's full administrator documentation is internal to NVIDIA. The only comma | File | Role | |---|---| -| `.github/workflows/branch-checks.yml` | Required non-E2E PR checks. Triggers on `push: pull-request/[0-9]+`. | -| `.github/workflows/branch-e2e.yml` | Opt-in standard, GPU, and Kubernetes HA E2E. Triggers on `push: pull-request/[0-9]+` and runs jobs selected by `test:e2e`, `test:e2e-gpu`, or `test:e2e-kubernetes`. | -| `.github/workflows/helm-lint.yml` | Helm chart validation. 
Triggers on `push: pull-request/[0-9]+` and skips lint jobs unless Helm inputs changed. | -| `.github/actions/pr-gate/action.yml` | Composite action that resolves PR metadata and verifies the required label is set. | +| `.github/workflows/branch-checks.yml` | Required non-E2E checks. Triggers on `push: pull-request/[0-9]+` for PR mirrors and `merge_group` for queued merges. | +| `.github/workflows/branch-e2e.yml` | Standard, GPU, and Kubernetes HA E2E. PR mirror pushes use `test:e2e`, `test:e2e-gpu`, and `test:e2e-kubernetes` labels; merge groups run core and GPU E2E. | +| `.github/workflows/helm-lint.yml` | Helm chart validation. PR mirror pushes skip lint jobs unless Helm inputs changed; merge groups always validate Helm because they represent the final integration state. | +| `.github/actions/pr-gate/action.yml` | Composite action that resolves PR metadata and verifies the required label is set for PR mirror pushes. Non-push events are allowed through. | | `.github/actions/pr-merge-base/action.yml` | Composite action that resolves and fetches the merge-base commit for `pull-request/<N>` push workflows. | -| `.github/workflows/required-ci-gates.yml` | Posts required PR-head statuses for push-based CI workflows. This is what branch protection should require. | +| `.github/workflows/required-ci-gates.yml` | Posts required PR-head and merge-group statuses for gated CI workflows. This is what branch protection and merge queue should require. | | `.github/workflows/e2e-label-help.yml` | When a `test:e2e*` label is applied, posts a PR comment telling the maintainer the next manual step (re-run an existing workflow run, or `/ok to test <SHA>` to refresh the mirror). | ## Release workflows @@ -129,11 +153,11 @@ These workflows run after merge to publish dev/tagged artifacts and verify them. 
## Required status contexts -Require these statuses in the branch ruleset for push-based CI: +Require these statuses in the branch ruleset for PR and merge-queue CI: - `OpenShell / Branch Checks` - `OpenShell / E2E` - `OpenShell / GPU E2E` - `OpenShell / Helm Lint` -Do not require the underlying push workflow jobs directly. Those jobs only appear after copy-pr-bot mirrors trusted code, so they cannot independently prove that an untrusted or stale PR head was tested. +Do not require the underlying workflow jobs directly. PR workflow jobs only appear after copy-pr-bot mirrors trusted code, and merge-group workflow jobs run on temporary queue branches. The stable `OpenShell / ...` contexts prove the expected workflow completed for the commit that GitHub is about to merge. diff --git a/architecture/build.md b/architecture/build.md index 6cd7b15d2..dcf4ab3e2 100644 --- a/architecture/build.md +++ b/architecture/build.md @@ -128,17 +128,18 @@ do not infer from kube context. ## CI and E2E -Required checks run on GitHub Actions. Workflows that use NVIDIA self-hosted runners trigger from copy-pr-bot mirror branches, so trusted PRs are mirrored into `pull-request/<N>` branches before those workflows run. +Required checks run on GitHub Actions. Workflows that use NVIDIA self-hosted runners trigger from copy-pr-bot mirror branches, so trusted PRs are mirrored into `pull-request/<N>` branches before those workflows run. `main` also uses GitHub merge queue so the final queued integration commit is validated before it merges. The high-level CI model: 1. PR-context gate jobs publish required statuses for the PR head commit. 2. Standard branch checks run from trusted mirror branches. 3. Label-gated E2E, GPU, and Kubernetes checks run from trusted mirror branches. -4. Gate jobs verify that the mirror branch matches the PR head and that the expected non-gate workflow actually ran. -5. Release workflows rebuild and publish binaries, wheels, images, and docs. -See `CI.md` for the contributor workflow and labels. +4. 
Merge-group checks run against GitHub's temporary queue branch for the final integration state. +5. Gate jobs verify that the expected non-gate workflow actually ran for the gated commit: for PRs, the mirror branch must match the PR head; for merge groups, the workflow must have run for the queued SHA. +6. Release workflows rebuild and publish binaries, wheels, images, and docs. -See `CI.md` for the contributor workflow and labels. +See `CI.md` for the contributor workflow, labels, and maintainer merge-queue workflow. ## Docs Site