* Release 0.11.0rc1
* [test] fix test_jobs_launch_and_logs - increase request timeout (#8150)
* [Test] Fix `test_pool_down_single_pool` with Timeout to Check Pool Status (#8148)
Add timeout to check pool status.
* [docker] fix docker on nebius (#8151)
Upstream updated the error messages in moby/moby#50285; Nebius appears to have picked up the new version first.
* [example] fix min-gpt train-rdzv on Nebius (#8152)
* [example] fix min-gpt train-rdzv on Nebius
* update comment
* [Tests] Fix serve failure on gcp (#8165)
* fix
* don't at least allow one service
* default resources
* more memory for jobs
* revert
This fixes two issues preventing GCP B200 (a4-highgpu-8g) spot instances
from being recognized:
1. A4 VM pricing missing: GCP doesn't provide separate CPU/RAM pricing
for A4 instances in their SKUs API. The B200 GPU pricing includes
the full VM cost. Added special handling to set A4 VM price to 0 so
entries aren't dropped.
2. B200 spot pricing bug: Some B200 spot SKUs in GCP's API have
usageType='OnDemand' even though the description says "Spot
Preemptible". Added logic to match on description when usageType
doesn't match for B200 spot queries.
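A rough sketch of fix (2) is shown below (field names follow the GCP Cloud Billing SKU schema mentioned above; the actual helper in the codebase may differ):

```bash
python3 - <<'EOF'
# Hypothetical illustration: treat a B200 SKU as spot if usageType says so,
# or if the description says 'Spot Preemptible' despite usageType='OnDemand'.
def matches_spot(sku: dict) -> bool:
    usage_type = sku['category']['usageType']
    description = sku['description']
    if usage_type == 'Preemptible':
        return True
    # Fallback for the mislabeled B200 spot SKUs described above.
    return 'B200' in description and 'Spot Preemptible' in description

print(matches_spot({
    'category': {'usageType': 'OnDemand'},
    'description': 'Nvidia B200 GPU attached to Spot Preemptible VMs',
}))  # True
EOF
```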
Fixes #8102
In some cases, we were _actually_ closing an fd >100, which seems to
sometimes be used by FileLock. Bump the fake fd values to a much
higher number to avoid the conflict.
* [client] remove client-side cache for request payload env vars
This causes issues with sdk usage, which may invoke the client
multiple times with different env vars within the same process.
Correctness here is worth the minuscule performance hit.
This fixes the smoke test `test_managed_jobs_config_labels_isolation`.
That test technically regressed in #7966, but would only fail when
both jobs were claimed by the same job controller process, which is
rare. After #7332, the test would only have a single job controller
process and started consistently failing.
* fix test
* Fix: Suppress FutureWarning from google.api_core about Python 3.10 support
Suppress the FutureWarning from google.api_core._python_version_support
that appears when GCP modules are imported. This warning is informational
and does not affect functionality.
Fixes #7886
* Fix: Use raw string for regex pattern in warning filter
Use raw string (r'...') for the regex pattern in warnings.filterwarnings
to follow Python best practices for regex patterns.
* Fix: Format warning filter to comply with line length limit
Split the message parameter across lines to fix:
- Line too long error (85/80 characters)
- YAPF formatting requirements
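For illustration, the suppression described in these commits looks roughly like the following (the module and message regexes here are assumptions for the sketch, not the exact strings in the codebase):

```bash
python3 - <<'EOF'
import warnings

# Hypothetical filter mirroring the fix: ignore the informational
# FutureWarning about Python 3.10 support emitted from google.api_core.
warnings.filterwarnings(
    'ignore',
    message=r'.*Python 3\.10.*',  # raw string, per the follow-up regex fix
    category=FutureWarning,
    module=r'google\.api_core.*')
EOF
```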
* ignore restart file on the first run
* avoid crashing the server on inconsistent consolidation mode config
* Revert "avoid crashing the server on inconsistent consolidation mode config"
This reverts commit dfa985e61d.
* only use a warning for inconsistent consolidation mode
# Convert PEP440 version to SemVer if needed for Helm versioning
# Handle cases like:
# 1.0.0.dev20250218 -> 1.0.0-dev.20250218
# 0.11.0rc0 -> 0.11.0-rc.0
# 0.11.0a1 -> 0.11.0-alpha.1
# 0.11.0b2 -> 0.11.0-beta.2
# 0.11.0.post1 -> 0.11.0+post.1
semversion=$(echo "$version" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)\.dev([0-9]+)/\1-dev.\2/')
semversion=$(echo "$semversion" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)rc([0-9]+)/\1-rc.\2/')
semversion=$(echo "$semversion" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)a([0-9]+)/\1-alpha.\2/')
semversion=$(echo "$semversion" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)b([0-9]+)/\1-beta.\2/')
# Post-releases use build metadata (+) since SemVer has no direct equivalent to PEP440's .post
# PEP440 .post means "after release", but SemVer ignores build metadata for precedence, so 0.11.0+post.1 ranks equal to 0.11.0.
# TODO(romilb): If both 0.11.0 and 0.11.0+post.1 exist, Helm's "latest" behavior is undefined - some sources claim the newer one wins. Need to verify this.
semversion=$(echo "$semversion" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)\.post([0-9]+)/\1+post.\2/')
# Update the version and name in the main skypilot chart
sed -i "s/^version:.*$/version: ${semversion}/" src/charts/skypilot/Chart.yaml
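As a quick sanity check, the conversion rules above can be exercised against the sample versions from the comment (a hypothetical standalone snippet; in the workflow, `$version` comes from the package metadata):

```bash
for version in 1.0.0.dev20250218 0.11.0rc0 0.11.0a1 0.11.0b2 0.11.0.post1; do
  semversion=$(echo "$version" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)\.dev([0-9]+)/\1-dev.\2/')
  semversion=$(echo "$semversion" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)rc([0-9]+)/\1-rc.\2/')
  semversion=$(echo "$semversion" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)a([0-9]+)/\1-alpha.\2/')
  semversion=$(echo "$semversion" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)b([0-9]+)/\1-beta.\2/')
  semversion=$(echo "$semversion" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)\.post([0-9]+)/\1+post.\2/')
  echo "$version -> $semversion"  # matches the expected outputs listed above
done
```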
- [Nov 2025] Serve **Kimi K2 Thinking** with reasoning capabilities on your Kubernetes or clouds: [**example**](./llm/kimi-k2-thinking/)
- [Oct 2025] Run **RL training for LLMs** with SkyRL on your Kubernetes or clouds: [**example**](./llm/skyrl/)
- [Oct 2025] Train and serve [Andrej Karpathy's](https://x.com/karpathy/status/1977755427569111362) **nanochat** - the best ChatGPT that $100 can buy: [**example**](./llm/nanochat)
- [Oct 2025] Run large-scale **LLM training with TorchTitan** on any AI infra: [**example**](./examples/training/torchtitan)
- [Jul 2025] Finetune **Llama4** on any distributed cluster/cloud: [**example**](./llm/llama-4-finetuning/)
- [Jul 2025] Two-part blog series, `The Evolution of AI Job Orchestration`: (1) [Running AI jobs on GPU Neoclouds](https://blog.skypilot.co/ai-job-orchestration-pt1-gpu-neoclouds/), (2) [The AI-Native Control Plane & Orchestration that Finally Works for ML](https://blog.skypilot.co/ai-job-orchestration-pt2-ai-control-plane/)
- [Apr 2025] Spin up **Qwen3** on your cluster/cloud: [**example**](./llm/qwen/)
- [Feb 2025] Prepare and serve **Retrieval Augmented Generation (RAG) with DeepSeek-R1**: [**blog post**](https://blog.skypilot.co/deepseek-rag), [**example**](./llm/rag/)
**LLM Finetuning Cookbooks**: Finetuning Llama 2 / Llama 3.1 in your own cloud environment, privately: Llama 2 [**example**](./llm/vicuna-llama-2/) and [**blog**](https://blog.skypilot.co/finetuning-llama2-operational-guide/); Llama 3.1 [**example**](./llm/llama-3_1-finetuning/) and [**blog**](https://blog.skypilot.co/finetune-llama-3_1-on-your-infra/)
| AI apps | [RAG](https://docs.skypilot.co/en/latest/examples/applications/rag.html), [vector databases](https://docs.skypilot.co/en/latest/examples/applications/vector_database.html) (ChromaDB, CLIP) |
| Common frameworks | [Airflow](https://docs.skypilot.co/en/latest/examples/frameworks/airflow.html), [Jupyter](https://docs.skypilot.co/en/latest/examples/frameworks/jupyter.html), [marimo](https://docs.skypilot.co/en/latest/examples/frameworks/marimo.html) |
Source files can be found in [`llm/`](https://github.com/skypilot-org/skypilot/tree/master/llm) and [`examples/`](https://github.com/skypilot-org/skypilot/tree/master/examples).
By default, we maintain two SkyPilot container images for use on Kubernetes clusters:
1. ``us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot``: used for CPU-only clusters (`Dockerfile <https://github.com/skypilot-org/skypilot/blob/master/Dockerfile_k8s>`__).
2. ``us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot-gpu``: used for GPU clusters (`Dockerfile <https://github.com/skypilot-org/skypilot/blob/master/Dockerfile_k8s_gpu>`__).
These images are pre-installed with SkyPilot dependencies for fast startup.
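For example, either image can be pre-pulled to verify registry access (assuming the registry is reachable from your machine):

.. code-block:: bash

   docker pull us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot:latest
   docker pull us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot-gpu:latest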
Step A1 - Can you create pods and services?
As a sanity check, we will now try creating a simple pod running an HTTP server and a service to verify that your cluster and its networking are functional.
We will use the SkyPilot default image :code:`us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot:latest` to verify that the image can be pulled from the registry.
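A minimal version of this check might look like the following (pod, service, and port names are illustrative; ``python3`` is assumed to be on the image's ``PATH``):

.. code-block:: bash

   # Run a simple HTTP server in a pod using the SkyPilot default image.
   kubectl run skypilot-http-test --port=8080 \
     --image=us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot:latest \
     --command -- python3 -m http.server 8080
   # Expose it as a service and probe it from inside the cluster.
   kubectl expose pod skypilot-http-test --port=8080
   kubectl run curl-test --rm -it --image=curlimages/curl --restart=Never -- \
     curl -s http://skypilot-http-test:8080
   # Clean up.
   kubectl delete pod skypilot-http-test
   kubectl delete service skypilot-http-test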
For multi-node clusters, volumes are mounted to all nodes. You must configure ``config.access_mode`` to ``ReadWriteMany`` and use a ``storage_class_name`` that supports the ``ReadWriteMany`` access mode. Otherwise, SkyPilot will fail to launch the cluster.
.. _volumes-on-kubernetes-manage:
Managing volumes
NAME TYPE INFRA SIZE USER WORKSPACE AGE STATUS LAST_USE USED_BY IS_EPHEMERAL
For multi-node clusters, ephemeral volumes are mounted to all nodes. You must configure ``config.access_mode`` to ``ReadWriteMany`` and use a ``storage_class_name`` that supports the ``ReadWriteMany`` access mode. Otherwise, SkyPilot will fail to launch the cluster.
When you terminate the cluster, the ephemeral volumes are automatically deleted:
In this example, create the following repository secrets:
- ``SKYPILOT_API_URL``: URL of the SkyPilot API server, in the format ``http(s)://url-or-ip``.
  If using basic auth, the URL should also include the credentials, in the format ``http(s)://username:password@url-or-ip``.
- ``SKYPILOT_SERVICE_ACCOUNT_TOKEN``: Only required if using OAuth. Service account token for the GitHub Actions user generated above.
- ``SLACK_BOT_TOKEN``: Optional. Create a [Slack App](https://api.slack.com/apps) and get a Slack App-Level Token with the `connections:write` permission to send a summary message. If not provided, no Slack message is sent after a job is queued.
- ``SLACK_CHANNEL_ID``: Optional. Slack channel ID for the summary message. If not provided, no Slack message is sent after a job is queued.
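These secrets can be added in the repository settings UI, or with the GitHub CLI (all values below are placeholders):

```bash
gh secret set SKYPILOT_API_URL --body "https://user:password@skypilot.example.com"
gh secret set SKYPILOT_SERVICE_ACCOUNT_TOKEN --body "<service-account-token>"  # only if using OAuth
gh secret set SLACK_BOT_TOKEN --body "<app-level-token>"  # optional
gh secret set SLACK_CHANNEL_ID --body "C0123456789"       # optional
```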
Run a personal [marimo](https://marimo.io/) server on a SkyPilot cluster.

## Launch with CLI
Launch a marimo cluster with the command:
```bash
sky launch -c marimo-example marimo.yaml
```
Next, run this command to get the endpoint to connect via the browser:
```bash
sky status marimo-example --endpoints
```
## Customization
The `marimo.yaml` file can be customized to change the port, password, and other options. Check the [docs](https://docs.marimo.io/cli/#marimo-edit) for more information.
[TorchTitan](https://github.com/pytorch/torchtitan) is a PyTorch native platform for large-scale LLM training, featuring multi-dimensional parallelisms (FSDP2, Tensor/Pipeline/Context Parallel), distributed checkpointing, torch.compile, and Float8 support.
This example demonstrates how to run [TorchTitan](https://github.com/pytorch/torchtitan) on your Kubernetes clusters, or any hyperscalers, neoclouds using SkyPilot, in addition to the instructions for running on [Slurm](https://github.com/pytorch/torchtitan?tab=readme-ov-file#multi-node-training).
## Quick start
Here is how to finetune Llama 3.1 on 2 nodes with 8 H100 (or 8 H200):
Please edit the YAMLs as you like.
To run disk tests, run `sky launch e2e_disk.yaml -c e2e_disk --env HF_TOKEN="YOUR TOKEN"`
Requirements for the disk benchmark: 2 S3 buckets (one for mount and one for mount cached) and 1 PVC (check out [volumes](https://docs.skypilot.co/en/stable/reference/volumes.html)).
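For instance, the two buckets can be created with the AWS CLI (bucket names are placeholders; the PVC is created via SkyPilot volumes, see the link above):

```bash
aws s3 mb s3://my-e2e-disk-mount
aws s3 mb s3://my-e2e-disk-mount-cached
```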
[Kimi K2 Thinking](https://huggingface.co/moonshotai/Kimi-K2-Thinking) is an advanced large language model created by [Moonshot AI](https://www.moonshot.ai/).
This recipe shows how to run Kimi K2 Thinking with reasoning capabilities on your Kubernetes or any cloud. It includes two modes:
- **Low Latency (TP8)**: Best for interactive applications requiring quick responses
- **High Throughput (TP8+DCP8)**: Best for batch processing and high-volume serving scenarios
## Prerequisites
- Check that you have installed SkyPilot ([docs](https://docs.skypilot.co/en/latest/getting-started/installation.html)).
- Check that `sky check` shows clouds or Kubernetes is enabled.
- **Note**: This model requires 8x H200 or H20 GPUs.
## Run Kimi K2 Thinking (Low Latency Mode)
For low-latency scenarios, use tensor parallelism:
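First, launch the cluster (the YAML filename below matches the one used with SkyServe later in this recipe; any mode-specific settings live in that YAML):

```bash
sky launch kimi-k2-thinking.sky.yaml -c kimi-k2-thinking
```

Once the model is up, query the OpenAI-compatible endpoint: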
```bash
ENDPOINT=$(sky status --endpoint 8081 kimi-k2-thinking)
curl http://$ENDPOINT/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "moonshotai/Kimi-K2-Thinking",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant with deep reasoning capabilities."
},
{
"role": "user",
"content": "Explain how to solve the traveling salesman problem for 10 cities."
}
]
}' | jq .
```
The model will provide its reasoning process in the response, showing its chain-of-thought approach.
## Clean up resources
To shut down all resources:
```bash
sky down kimi-k2-thinking
```
## Serving Kimi-K2-Thinking: scaling up with SkyServe
With no change to the YAML, launch a fully managed service with autoscaling replicas and load-balancing on your infra:
```bash
sky serve up kimi-k2-thinking.sky.yaml -n kimi-k2-thinking
```
Wait until the service is ready:
```bash
watch -n10 sky serve status kimi-k2-thinking
```
Get a single endpoint that load-balances across replicas:
```bash
ENDPOINT=$(sky serve status --endpoint kimi-k2-thinking)
```
> **Tip:** SkyServe fully manages the lifecycle of your replicas. For example, if a spot replica is preempted, the controller will automatically replace it. This significantly reduces the operational burden while saving costs.
To curl the endpoint:
```bash
curl http://$ENDPOINT/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "moonshotai/Kimi-K2-Thinking",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant with deep reasoning capabilities."
},
{
"role": "user",
"content": "Design a distributed system for real-time analytics."
}
]
}' | jq .
```
To shut down all resources:
```bash
sky serve down kimi-k2-thinking
```
See more details in [SkyServe docs](https://docs.skypilot.co/en/latest/serving/sky-serve.html).
image: us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot-gpu:latest # Using this image also serves as a way to "pre-pull" the image onto nodes