From 2ee80dcf804e2782f91e19d4693c9120e0f7054c Mon Sep 17 00:00:00 2001 From: Lucas Jia Date: Mon, 15 Jun 2026 11:45:43 -0700 Subject: [PATCH 1/4] chore: Add unit/slow/localmode and unit-test-v3 jobs to CI Health workflow --- .github/workflows/ci-health.yml | 56 +++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/.github/workflows/ci-health.yml b/.github/workflows/ci-health.yml index db94ce084a..f4cb52cd22 100644 --- a/.github/workflows/ci-health.yml +++ b/.github/workflows/ci-health.yml @@ -36,3 +36,59 @@ jobs: with: project-name: sagemaker-python-sdk-ci-health-canaries-v2 source-version: refs/heads/master-v2 + unit-tests: + runs-on: ubuntu-latest + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + - name: Run Unit Tests + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: sagemaker-python-sdk-ci-health-unit-tests + source-version: refs/heads/master-v2 + slow-tests: + runs-on: ubuntu-latest + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + - name: Run Slow Tests + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: sagemaker-python-sdk-ci-health-slow-tests + source-version: refs/heads/master-v2 + localmode-tests: + runs-on: ubuntu-latest + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + - name: Run Local Mode Tests + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: sagemaker-python-sdk-ci-health-localmode-tests + source-version: refs/heads/master-v2 + unit-test-v3: + runs-on: ubuntu-latest + steps: + - name: Configure 
AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + - name: Run Unit Tests V3 + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: sagemaker-python-sdk-ci-health-unit-test-v3 + source-version: refs/heads/master From 4c3f02e7011c3a479c63f7690027fb87afceef27 Mon Sep 17 00:00:00 2001 From: Lucas Jia Date: Mon, 15 Jun 2026 11:50:59 -0700 Subject: [PATCH 2/4] chore: Add -v2 suffix to the v2 test job IDs --- .github/workflows/ci-health.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-health.yml b/.github/workflows/ci-health.yml index f4cb52cd22..ab3ed1c961 100644 --- a/.github/workflows/ci-health.yml +++ b/.github/workflows/ci-health.yml @@ -36,7 +36,7 @@ jobs: with: project-name: sagemaker-python-sdk-ci-health-canaries-v2 source-version: refs/heads/master-v2 - unit-tests: + unit-tests-v2: runs-on: ubuntu-latest steps: - name: Configure AWS Credentials @@ -50,7 +50,7 @@ jobs: with: project-name: sagemaker-python-sdk-ci-health-unit-tests source-version: refs/heads/master-v2 - slow-tests: + slow-tests-v2: runs-on: ubuntu-latest steps: - name: Configure AWS Credentials @@ -64,7 +64,7 @@ jobs: with: project-name: sagemaker-python-sdk-ci-health-slow-tests source-version: refs/heads/master-v2 - localmode-tests: + localmode-tests-v2: runs-on: ubuntu-latest steps: - name: Configure AWS Credentials From 4ec6a0aadbbbceda12068650b9104d43284a1a6c Mon Sep 17 00:00:00 2001 From: Lucas Jia Date: Mon, 15 Jun 2026 12:49:46 -0700 Subject: [PATCH 3/4] chore: Split unit-test-v3 into per-submodule parallel CI Health projects Run each v3 submodule's unit tests in its own CodeBuild project (one worker each) so they execute in parallel instead of sequentially, keeping wall-clock time within the 3h timeout. 
Project names retain the unit-test-v3 substring so they reuse the existing UNIT_TEST_V3 metric/alarm definitions and fold into CIHealthCompositeAlarm. --- .github/workflows/ci-health.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-health.yml b/.github/workflows/ci-health.yml index ab3ed1c961..b9a2ae10ed 100644 --- a/.github/workflows/ci-health.yml +++ b/.github/workflows/ci-health.yml @@ -80,6 +80,10 @@ jobs: source-version: refs/heads/master-v2 unit-test-v3: runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + submodule: [sagemaker-core, sagemaker-train, sagemaker-serve, sagemaker-mlops] steps: - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@v4 @@ -87,8 +91,8 @@ jobs: role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} aws-region: us-west-2 role-duration-seconds: 10800 - - name: Run Unit Tests V3 + - name: Run Unit Tests V3 for ${{ matrix.submodule }} uses: aws-actions/aws-codebuild-run-build@v1 with: - project-name: sagemaker-python-sdk-ci-health-unit-test-v3 + project-name: sagemaker-python-sdk-ci-health-unit-test-v3-${{ matrix.submodule }} source-version: refs/heads/master From 19bdb5b86af9e344fc14536b7408b1b7fde844b7 Mon Sep 17 00:00:00 2001 From: Lucas Jia Date: Mon, 15 Jun 2026 13:19:29 -0700 Subject: [PATCH 4/4] chore: Drive v3 unit tests via SUBMODULE env var on a single CI Health project Mirror the v2 ci-unit-tests pattern: instead of four per-submodule projects, use one unit-test-v3 project parameterized by the SUBMODULE env var, triggered once per submodule by the workflow matrix so they still run in parallel. Also fix the v2 unit-tests job to pass PY_VERSION via a python-version matrix (matching pr-checks), instead of relying on an unset tox envlist. 
--- .github/workflows/ci-health.yml | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-health.yml b/.github/workflows/ci-health.yml index b9a2ae10ed..f90968ad06 100644 --- a/.github/workflows/ci-health.yml +++ b/.github/workflows/ci-health.yml @@ -38,6 +38,10 @@ jobs: source-version: refs/heads/master-v2 unit-tests-v2: runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["py39", "py310", "py311", "py312"] steps: - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@v4 @@ -45,11 +49,15 @@ jobs: role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} aws-region: us-west-2 role-duration-seconds: 10800 - - name: Run Unit Tests + - name: Run Unit Tests (${{ matrix.python-version }}) uses: aws-actions/aws-codebuild-run-build@v1 with: project-name: sagemaker-python-sdk-ci-health-unit-tests source-version: refs/heads/master-v2 + env-vars-for-codebuild: | + PY_VERSION + env: + PY_VERSION: ${{ matrix.python-version }} slow-tests-v2: runs-on: ubuntu-latest steps: @@ -94,5 +102,9 @@ jobs: - name: Run Unit Tests V3 for ${{ matrix.submodule }} uses: aws-actions/aws-codebuild-run-build@v1 with: - project-name: sagemaker-python-sdk-ci-health-unit-test-v3-${{ matrix.submodule }} + project-name: sagemaker-python-sdk-ci-health-unit-test-v3 source-version: refs/heads/master + env-vars-for-codebuild: | + SUBMODULE + env: + SUBMODULE: ${{ matrix.submodule }}