44 Commits

Author SHA1 Message Date
Zhanghao Wu b2f3519723 Release 0.11.0 (#8255) 6 days ago
Zhanghao Wu 59ff48fce7 Release 0.11.0rc2 (#8236) 1 week ago
Christopher Cooper 078dad7ada [core] restore cluster_name_on_cloud from cluster yaml (#8233) 1 week ago
Aylei 0207108092 Fixed incorrect user info in handlers (#8199 #8209) (#8234) 1 week ago
Aylei 4546cbc962 [Core] Put Daemonize Call Back to Make Sky Cancel Reliable (#8203) (#8208) 1 week ago
DanielZhangQD 2f0b2720c3 Fix ephemeral volume creation (#8179) (#8194) 1 week ago
Christopher Cooper 83ef36b27a [gh action] support publishing rc versions to pypi (#8188) (#8191) 1 week ago
Zhanghao Wu 25862c5e07 Release 0.11.0rc1 (#8140) 1 week ago
Girish Ramnani daa361a784 Fix typo in README.md for torchtitan (#8137) 2 weeks ago
Christoph Clement dad0e910cc [GCP] Fix B200 spot instance support in catalog fetcher (#8125) 2 weeks ago
Zhanghao Wu 2a5e5fbb5c [k8s] install k8s dependency for gpu labeller (#8126) 2 weeks ago
DanielZhangQD 44bb3c07cd [Doc] Add notes for volume config for multi-node clusters (#8114) 2 weeks ago
zpoint 60a0fac4ff [Test] Reuse existing VPC to fix `test_helm_deploy_eks` failure (#8115) 2 weeks ago
Christopher Cooper 9fa34acba4 remove remaining references to old k8s image (#8112) 2 weeks ago
Christopher Cooper f2b9aa1797 [test] fix flaky test_stdout_stderr_restoration (#8110) 2 weeks ago
lloyd-brown 5f39acff29 [SDK] Change Task Secrets to SecretStr (#8040) 2 weeks ago
Christopher Cooper 450f68d38b [gpu labeler] update container image to new location (#8111) 2 weeks ago
Christopher Cooper d997b43999 [jobs] fix schedule_state grpc enum backcompat (#8105) 2 weeks ago
Christopher Cooper 1d7d66f381 [deprecate] re-add check_stale_runtime_on_remote (#8108) 2 weeks ago
Tian Xia 38480ede75 [Consolidation] Fix env vars and skip the status refresh for controllers (#8106) 2 weeks ago
lloyd-brown a8f511d029 [Test] Up Job Controller Resources for `test_pool_down_all_with_running_jobs` (#8104) 2 weeks ago
Jeahong Hwang a45687f38b [MISC] properly cleanup load map in LeastLoadPolicy (#8103) 2 weeks ago
Aylei 99cc7fc4a6 Make logs gc sync (#8101) 2 weeks ago
Alexander 6fd6428fc0 [Nebius] Handle disk cleanup on VM creation failure due to quota errors (#8004) 2 weeks ago
Aylei 565546d18b Fixed cache no cleaned when status refresh daemon encounters error (#8100) 3 weeks ago
DanielZhangQD 40afb78c9c [Dashboard] Show detailed error for k8s context access failure and other apis (#8099) 3 weeks ago
lloyd-brown 43dd937811 [Core] Fix `test_managed_jobs_logs_gc` by Moving GC into Main Event Loop (#8097) 3 weeks ago
Christopher Cooper 24ce7f2f8a [client] remove client-side cache for request payload env vars (#8095) 3 weeks ago
lloyd-brown 12c1601c99 [Tests] Fix Shared Env Tests (#8094) 3 weeks ago
Romil Bhardwaj 0e434d4845 [Examples] Kimi-K2-Thinking example (#7988) 3 weeks ago
Romil Bhardwaj 5e0a44d90e [Release] Support release candidate publishing and promotion to stable (#8081) 3 weeks ago
Caleb Whitaker 7e89b127ca chore: fix spelling issues (#8091) 3 weeks ago
Daniel Shin fbd28a58b9 [BugFix][SSH Node Pools] SSH Node Pools Context Bug Fix (#8087) 3 weeks ago
Sovit Nayak 8b8bcf0d4e Fix: Suppress FutureWarning from google.api_core about Python 3.10 support (#8086) 3 weeks ago
DanielZhangQD e189f3dcbc [Dashboard] Do not fail the whole k8s infra table if some contexts are not available (#8085) 3 weeks ago
Christopher Cooper 717a13cdf8 ignore restart file on the first run (#8082) 3 weeks ago
lloyd-brown b50e80e47d [Core] Ensure `--retry-until-up` Tries Launch After Checking All Zones (#8079) 3 weeks ago
Daniel Shin 0a36905464 [Deprecate] Deprecate `sky local up` ip-list (#8065) 3 weeks ago
vincent d warmerdam 9ffe03de8e Add marimo example to docs (#8031) 3 weeks ago
Rohan Sonecha 63fde129b9 [Metrics] change grafana data source for prometheus from secret to config map (#8045) 3 weeks ago
zpoint a08a275d0d Add buildkite link to release and nigtly summary (#8068) 3 weeks ago
Aylei cf11cc0cf1 Support global extra envs (#8076) 3 weeks ago
Aylei 64f680dc95 Support AWS_CONFIG_FILE env var (#8050) 3 weeks ago
Kevin Mingtarja 666879f604 [k8s] Dedup pending pod reason spinner updates (#8062) 3 weeks ago
100 changed files with 1776 additions and 924 deletions
  1. +25 -6  .github/workflows/docker-build.yaml
  2. +63 -0  .github/workflows/nightly-build.yml
  3. +5 -2  .github/workflows/publish-and-validate.yml
  4. +13 -1  .github/workflows/publish-helm.yml
  5. +104 -17  .github/workflows/release-build.yml
  6. +4 -3  README.md
  7. +23 -2  charts/skypilot/templates/api-deployment.yaml
  8. +3 -2  charts/skypilot/templates/datasource.yaml
  9. +3 -0  charts/skypilot/templates/oauth2-proxy-deployment.yaml
  10. +4 -0  charts/skypilot/templates/oauth2-proxy-redis.yaml
  11. +19 -0  charts/skypilot/tests/deployment_test.yaml
  12. +26 -0  charts/skypilot/tests/oauth2_test.yaml
  13. +9 -0  charts/skypilot/values.schema.json
  14. +3 -0  charts/skypilot/values.yaml
  15. +1 -0  docs/source/examples/frameworks/index.rst
  16. +1 -0  docs/source/examples/frameworks/marimo.md
  17. +1 -0  docs/source/examples/models/index.rst
  18. +1 -0  docs/source/examples/models/kimi-k2-thinking.md
  19. +17 -0  docs/source/reference/api-server/helm-values-spec.rst
  20. +2 -2  docs/source/reference/kubernetes/kubernetes-getting-started.rst
  21. +1 -1  docs/source/reference/kubernetes/kubernetes-troubleshooting.rst
  22. +8 -0  docs/source/reference/volumes.rst
  23. +8 -0  examples/distributed-pytorch/train-rdzv.yaml
  24. +1 -1  examples/github_actions/README.md
  25. +23 -0  examples/marimo/README.md
  26. +19 -0  examples/marimo/marimo.yaml
  27. +1 -1  examples/training/torchtitan/README.md
  28. +1 -1  examples/training_network_storage_benchmarks/README.md
  29. +149 -0  llm/kimi-k2-thinking/README.md
  30. +41 -0  llm/kimi-k2-thinking/kimi-k2-thinking-high-throughput.sky.yaml
  31. +39 -0  llm/kimi-k2-thinking/kimi-k2-thinking.sky.yaml
  32. +1 -1  sky/__init__.py
  33. +11 -0  sky/adaptors/gcp.py
  34. +3 -1  sky/adaptors/nebius.py
  35. +29 -6  sky/backends/backend_utils.py
  36. +12 -2  sky/backends/cloud_vm_ray_backend.py
  37. +30 -3  sky/catalog/data_fetchers/fetch_gcp.py
  38. +5 -126  sky/client/cli/command.py
  39. +2 -16  sky/client/sdk.py
  40. +3 -9  sky/client/sdk_async.py
  41. +43 -27  sky/clouds/aws.py
  42. +2 -2  sky/clouds/gcp.py
  43. +2 -36  sky/core.py
  44. +17 -4  sky/dashboard/src/components/infra.jsx
  45. +11 -3  sky/dashboard/src/components/workspaces.jsx
  46. +3 -1  sky/dashboard/src/data/connectors/client.js
  47. +35 -41  sky/dashboard/src/data/connectors/infra.jsx
  48. +22 -7  sky/dashboard/src/data/connectors/jobs.jsx
  49. +3 -1  sky/dashboard/src/data/connectors/volumes.js
  50. +32 -0  sky/dashboard/src/data/utils.jsx
  51. +0 -6  sky/exceptions.py
  52. +4 -1  sky/execution.py
  53. +3 -3  sky/global_user_state.py
  54. +37 -45  sky/jobs/log_gc.py
  55. +27 -31  sky/jobs/state.py
  56. +13 -20  sky/jobs/utils.py
  57. +5 -3  sky/provision/docker_utils.py
  58. +9 -7  sky/provision/kubernetes/instance.py
  59. +102 -87  sky/provision/nebius/utils.py
  60. +6 -4  sky/schemas/generated/managed_jobsv1_pb2.py
  61. +2 -0  sky/schemas/generated/managed_jobsv1_pb2.pyi
  62. +5 -0  sky/schemas/proto/managed_jobsv1.proto
  63. +2 -1  sky/serve/constants.py
  64. +2 -0  sky/serve/controller.py
  65. +1 -1  sky/serve/load_balancing_policies.py
  66. +11 -13  sky/serve/serve_utils.py
  67. +1 -1  sky/server/constants.py
  68. +10 -18  sky/server/daemons.py
  69. +7 -0  sky/server/requests/executor.py
  70. +1 -8  sky/server/requests/payloads.py
  71. +16 -7  sky/server/server.py
  72. +30 -0  sky/server/server_utils.py
  73. +2 -1  sky/skylet/subprocess_daemon.py
  74. +35 -13  sky/task.py
  75. +16 -2  sky/utils/common_utils.py
  76. +0 -1  sky/utils/controller_utils.py
  77. +2 -2  sky/utils/kubernetes/create_cluster.sh
  78. +1 -1  sky/utils/kubernetes/deploy_remote_cluster.py
  79. +3 -1  sky/utils/kubernetes/gpu_labeler.py
  80. +2 -1  sky/utils/kubernetes/k8s_gpu_labeler_job.yaml
  81. +16 -16  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml
  82. +0 -88  sky/utils/kubernetes/kubernetes_deploy_utils.py
  83. +0 -102  sky/utils/log_utils.py
  84. +1 -1  tests/kubernetes/README.md
  85. +1 -1  tests/kubernetes/cpu_test_pod.yaml
  86. +1 -1  tests/kubernetes/gpu_test_pod.yaml
  87. +19 -5  tests/kubernetes/scripts/create_cluster.sh
  88. +1 -1  tests/kubernetes/scripts/skypilot_ssh_k8s_deployment.yaml
  89. +8 -5  tests/load_tests/db_scale_tests/create_aws_postgres_db.sh
  90. +5 -5  tests/smoke_tests/smoke_tests_utils.py
  91. +21 -5  tests/smoke_tests/test_api_server.py
  92. +107 -4  tests/smoke_tests/test_basic.py
  93. +123 -53  tests/smoke_tests/test_pools.py
  94. +2 -2  tests/test_yamls/low_resource_sky_config.yaml
  95. +57 -0  tests/unit_tests/kubernetes/test_deploy_remote_cluster.py
  96. +21 -0  tests/unit_tests/test_sky/clouds/test_aws_cloud.py
  97. +17 -10  tests/unit_tests/test_sky/jobs/test_server_core_secrets.py
  98. +12 -19  tests/unit_tests/test_sky/jobs/test_state.py
  99. +125 -1  tests/unit_tests/test_sky/server/requests/test_executor.py
  100. +0 -2  tests/unit_tests/test_sky/server/requests/test_payloads.py

+25 -6  .github/workflows/docker-build.yaml

@@ -248,11 +248,21 @@ jobs:
- name: Create multi-platform manifest
run: |
# Create multi-platform manifest from temporary tags
VERSION="${{ needs.prepare.outputs.version }}"
# Determine if this is a pre-release version (contains rc, alpha, beta, or dev)
if [[ "$VERSION" =~ (rc|alpha|beta|dev) ]]; then
echo "Pre-release version detected ($VERSION), skipping :latest tag"
TAGS="-t ${{ secrets.DOCKER_USERNAME }}/${{ inputs.package_name }}:$VERSION"
else
echo "Stable release version detected ($VERSION), updating :latest tag"
TAGS="-t ${{ secrets.DOCKER_USERNAME }}/${{ inputs.package_name }}:latest -t ${{ secrets.DOCKER_USERNAME }}/${{ inputs.package_name }}:$VERSION"
fi
docker buildx imagetools create \
-t ${{ secrets.DOCKER_USERNAME }}/${{ inputs.package_name }}:latest \
-t ${{ secrets.DOCKER_USERNAME }}/${{ inputs.package_name }}:${{ needs.prepare.outputs.version }} \
${{ secrets.DOCKER_USERNAME }}/${{ inputs.package_name }}:${{ needs.prepare.outputs.version }}-linux-amd64 \
${{ secrets.DOCKER_USERNAME }}/${{ inputs.package_name }}:${{ needs.prepare.outputs.version }}-linux-arm64
$TAGS \
${{ secrets.DOCKER_USERNAME }}/${{ inputs.package_name }}:$VERSION-linux-amd64 \
${{ secrets.DOCKER_USERNAME }}/${{ inputs.package_name }}:$VERSION-linux-arm64

- name: Clean up temporary tags
if: always()
@@ -274,9 +284,18 @@ jobs:

- name: Summary of final tags
run: |
VERSION="${{ needs.prepare.outputs.version }}"
echo "✅ Multi-platform tags created:"
echo " - ${{ secrets.DOCKER_USERNAME }}/${{ inputs.package_name }}:latest"
echo " - ${{ secrets.DOCKER_USERNAME }}/${{ inputs.package_name }}:${{ needs.prepare.outputs.version }}"
echo " - ${{ secrets.DOCKER_USERNAME }}/${{ inputs.package_name }}:$VERSION"
# Check if latest tag was updated
if [[ "$VERSION" =~ (rc|alpha|beta|dev) ]]; then
echo ""
echo "ℹ️ Pre-release version - :latest tag not updated"
else
echo " - ${{ secrets.DOCKER_USERNAME }}/${{ inputs.package_name }}:latest"
fi
echo ""
echo "🚀 Built with native runners for maximum performance!"
echo "🧹 Temporary tags cleaned up automatically!"

+63 -0  .github/workflows/nightly-build.yml

@@ -292,6 +292,69 @@ jobs:
package_name: skypilot-nightly
secrets: inherit

summary:
runs-on: ubuntu-latest
needs: [check-date, nightly-build-pypi, smoke-tests-aws, smoke-tests-kubernetes-resource-heavy, smoke-tests-kubernetes-no-resource-heavy, smoke-tests-kubernetes-no-resource-heavy-limit-deps, smoke-tests-remote-server-kubernetes, smoke-tests-shared-gke-api-server, smoke-tests-lambda-job-queue, smoke-tests-runpod-minimal, backward-compat-test-nightly, backward-compat-test-stable]
if: always()
steps:
- name: Summary
run: |
cat <<EOF >> "$GITHUB_STEP_SUMMARY"
# Nightly Build Summary

## Buildkite Test Links
EOF
if [ "${{ needs.smoke-tests-aws.result }}" != "skipped" ] && [ -n "${{ needs.smoke-tests-aws.outputs.build_number }}" ]; then
cat <<EOF >> "$GITHUB_STEP_SUMMARY"
- [Smoke Tests AWS](https://buildkite.com/skypilot-1/smoke-tests/builds/${{ needs.smoke-tests-aws.outputs.build_number }}) - $([ "${{ needs.smoke-tests-aws.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
EOF
fi
if [ "${{ needs.smoke-tests-kubernetes-resource-heavy.result }}" != "skipped" ] && [ -n "${{ needs.smoke-tests-kubernetes-resource-heavy.outputs.build_number }}" ]; then
cat <<EOF >> "$GITHUB_STEP_SUMMARY"
- [Smoke Tests Kubernetes (Resource Heavy)](https://buildkite.com/skypilot-1/smoke-tests/builds/${{ needs.smoke-tests-kubernetes-resource-heavy.outputs.build_number }}) - $([ "${{ needs.smoke-tests-kubernetes-resource-heavy.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
EOF
fi
if [ "${{ needs.smoke-tests-kubernetes-no-resource-heavy.result }}" != "skipped" ] && [ -n "${{ needs.smoke-tests-kubernetes-no-resource-heavy.outputs.build_number }}" ]; then
cat <<EOF >> "$GITHUB_STEP_SUMMARY"
- [Smoke Tests Kubernetes (No Resource Heavy)](https://buildkite.com/skypilot-1/smoke-tests/builds/${{ needs.smoke-tests-kubernetes-no-resource-heavy.outputs.build_number }}) - $([ "${{ needs.smoke-tests-kubernetes-no-resource-heavy.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
EOF
fi
if [ "${{ needs.smoke-tests-kubernetes-no-resource-heavy-limit-deps.result }}" != "skipped" ] && [ -n "${{ needs.smoke-tests-kubernetes-no-resource-heavy-limit-deps.outputs.build_number }}" ]; then
cat <<EOF >> "$GITHUB_STEP_SUMMARY"
- [Smoke Tests Kubernetes (No Resource Heavy, Limit Deps)](https://buildkite.com/skypilot-1/smoke-tests/builds/${{ needs.smoke-tests-kubernetes-no-resource-heavy-limit-deps.outputs.build_number }}) - $([ "${{ needs.smoke-tests-kubernetes-no-resource-heavy-limit-deps.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
EOF
fi
if [ "${{ needs.smoke-tests-remote-server-kubernetes.result }}" != "skipped" ] && [ -n "${{ needs.smoke-tests-remote-server-kubernetes.outputs.build_number }}" ]; then
cat <<EOF >> "$GITHUB_STEP_SUMMARY"
- [Smoke Tests Remote Server Kubernetes](https://buildkite.com/skypilot-1/smoke-tests/builds/${{ needs.smoke-tests-remote-server-kubernetes.outputs.build_number }}) - $([ "${{ needs.smoke-tests-remote-server-kubernetes.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
EOF
fi
if [ "${{ needs.smoke-tests-shared-gke-api-server.result }}" != "skipped" ] && [ -n "${{ needs.smoke-tests-shared-gke-api-server.outputs.build_number }}" ]; then
cat <<EOF >> "$GITHUB_STEP_SUMMARY"
- [Smoke Tests Shared GKE API Server](https://buildkite.com/skypilot-1/nightly-build-shared-gke-api-server/builds/${{ needs.smoke-tests-shared-gke-api-server.outputs.build_number }}) - $([ "${{ needs.smoke-tests-shared-gke-api-server.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
EOF
fi
if [ "${{ needs.smoke-tests-lambda-job-queue.result }}" != "skipped" ] && [ -n "${{ needs.smoke-tests-lambda-job-queue.outputs.build_number }}" ]; then
cat <<EOF >> "$GITHUB_STEP_SUMMARY"
- [Smoke Tests Lambda Job Queue](https://buildkite.com/skypilot-1/smoke-tests/builds/${{ needs.smoke-tests-lambda-job-queue.outputs.build_number }}) - $([ "${{ needs.smoke-tests-lambda-job-queue.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
EOF
fi
if [ "${{ needs.smoke-tests-runpod-minimal.result }}" != "skipped" ] && [ -n "${{ needs.smoke-tests-runpod-minimal.outputs.build_number }}" ]; then
cat <<EOF >> "$GITHUB_STEP_SUMMARY"
- [Smoke Tests RunPod Minimal](https://buildkite.com/skypilot-1/smoke-tests/builds/${{ needs.smoke-tests-runpod-minimal.outputs.build_number }}) - $([ "${{ needs.smoke-tests-runpod-minimal.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
EOF
fi
if [ "${{ needs.backward-compat-test-nightly.result }}" != "skipped" ] && [ -n "${{ needs.backward-compat-test-nightly.outputs.build_number }}" ]; then
cat <<EOF >> "$GITHUB_STEP_SUMMARY"
- [Backward Compat Test (Nightly)](https://buildkite.com/skypilot-1/quicktest-core/builds/${{ needs.backward-compat-test-nightly.outputs.build_number }}) - $([ "${{ needs.backward-compat-test-nightly.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
EOF
fi
if [ "${{ needs.backward-compat-test-stable.result }}" != "skipped" ] && [ -n "${{ needs.backward-compat-test-stable.outputs.build_number }}" ]; then
cat <<EOF >> "$GITHUB_STEP_SUMMARY"
- [Backward Compat Test (Stable)](https://buildkite.com/skypilot-1/quicktest-core/builds/${{ needs.backward-compat-test-stable.outputs.build_number }}) - $([ "${{ needs.backward-compat-test-stable.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
EOF
fi

notify-slack-failure:
runs-on: ubuntu-latest
needs: [check-date, nightly-build-pypi, smoke-tests-aws, smoke-tests-kubernetes-resource-heavy, smoke-tests-kubernetes-no-resource-heavy, smoke-tests-kubernetes-no-resource-heavy-limit-deps, smoke-tests-remote-server-kubernetes, smoke-tests-shared-gke-api-server, smoke-tests-lambda-job-queue, smoke-tests-runpod-minimal, backward-compat-test-nightly, backward-compat-test-stable, publish-and-validate-both, trigger-helm-release]


+5 -2  .github/workflows/publish-and-validate.yml

@@ -44,6 +44,8 @@ jobs:
- name: Validate published package
run: |
export SKYPILOT_DISABLE_USAGE_COLLECTION=1

# fastapi has some broken package info on test PyPI, so manually install it from real PyPI.
pip install fastapi

# Set up variables for package check
@@ -66,12 +68,13 @@ jobs:
pip uninstall -y ${{ inputs.package_name }} || true

# Install the package with no cache
# Use --pre so that pre-release versions (e.g. rcs) will be selected
if [ "${{ inputs.repository_type }}" == "test-pypi" ]; then
echo "Installing from Test PyPI..."
pip install --no-cache-dir --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple ${{ inputs.package_name }}
pip install --no-cache-dir --pre --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple ${{ inputs.package_name }}[server]
else
echo "Installing from PyPI..."
pip install --no-cache-dir ${{ inputs.package_name }}
pip install --no-cache-dir --pre ${{ inputs.package_name }}[server]
fi

# Check the version


+13 -1  .github/workflows/publish-helm.yml

@@ -92,8 +92,20 @@ jobs:
run: |
version="${{ inputs.version }}"
# Convert PEP440 version to SemVer if needed for Helm versioning
# Handle cases like 1.0.0.dev20250218 -> 1.0.0-dev.20250218
# Handle cases like:
# 1.0.0.dev20250218 -> 1.0.0-dev.20250218
# 0.11.0rc0 -> 0.11.0-rc.0
# 0.11.0a1 -> 0.11.0-alpha.1
# 0.11.0b2 -> 0.11.0-beta.2
# 0.11.0.post1 -> 0.11.0+post.1
semversion=$(echo "$version" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)\.dev([0-9]+)/\1-dev.\2/')
semversion=$(echo "$semversion" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)rc([0-9]+)/\1-rc.\2/')
semversion=$(echo "$semversion" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)a([0-9]+)/\1-alpha.\2/')
semversion=$(echo "$semversion" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)b([0-9]+)/\1-beta.\2/')
# Post-releases use build metadata (+) since SemVer has no direct equivalent to PEP440's .post
# PEP440 .post means "after release", but SemVer build metadata has same precedence.
# TODO(romilb): If both 0.11.0 and 0.11.0+post.1 exist, Helm's "latest" behavior is undefined - some sources claim the newer one wins. Need to verify this.
semversion=$(echo "$semversion" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)\.post([0-9]+)/\1+post.\2/')
# Update the version and name in the main skypilot chart
sed -i "s/^version:.*$/version: ${semversion}/" src/charts/skypilot/Chart.yaml


+104 -17  .github/workflows/release-build.yml

@@ -6,7 +6,7 @@ on:
workflow_dispatch:
inputs:
release_version:
description: 'Release version (e.g., 0.9.0)'
description: 'Release version (e.g., 0.9.0 or 0.9.0rc1)'
required: false
type: string
skip_version_checks:
@@ -14,6 +14,11 @@ on:
required: false
type: boolean
default: false
skip_smoke_tests:
description: 'Skip smoke tests (recommended when promoting tested RC to stable)'
required: false
type: boolean
default: false

jobs:
release-build:
@@ -184,6 +189,10 @@ jobs:

smoke-tests:
needs: release-build
if: |
always() &&
needs.release-build.result == 'success' &&
github.event.inputs.skip_smoke_tests != 'true'
uses: ./.github/workflows/buildkite-trigger-wait.yml
with:
commit: ${{ needs.release-build.outputs.new_commit_sha }}
@@ -198,6 +207,10 @@ jobs:

quicktest-core:
needs: release-build
if: |
always() &&
needs.release-build.result == 'success' &&
github.event.inputs.skip_smoke_tests != 'true'
uses: ./.github/workflows/buildkite-trigger-wait.yml
with:
commit: ${{ needs.release-build.outputs.new_commit_sha }}
@@ -213,6 +226,10 @@ jobs:

quicktest-core-previous-minor:
needs: release-build
if: |
always() &&
needs.release-build.result == 'success' &&
github.event.inputs.skip_smoke_tests != 'true'
uses: ./.github/workflows/buildkite-trigger-wait.yml
with:
commit: ${{ needs.release-build.outputs.new_commit_sha }}
@@ -228,6 +245,10 @@ jobs:

smoke-tests-remote-server-kubernetes:
needs: release-build
if: |
always() &&
needs.release-build.result == 'success' &&
github.event.inputs.skip_smoke_tests != 'true'
uses: ./.github/workflows/buildkite-trigger-wait.yml
with:
commit: ${{ needs.release-build.outputs.new_commit_sha }}
@@ -244,6 +265,10 @@ jobs:

release-tests:
needs: release-build
if: |
always() &&
needs.release-build.result == 'success' &&
github.event.inputs.skip_smoke_tests != 'true'
uses: ./.github/workflows/buildkite-trigger-wait.yml
with:
commit: ${{ needs.release-build.outputs.new_commit_sha }}
@@ -271,34 +296,96 @@ jobs:
TEST_BRANCH: ${{ needs.release-build.outputs.test_branch }}
RELEASE_BRANCH: ${{ needs.release-build.outputs.release_branch }}
RELEASE_VERSION: ${{ needs.release-build.outputs.release_version }}
SMOKE_TEST_BUILD: ${{ needs.smoke-tests.outputs.build_number }}
QUICKTEST_BUILD: ${{ needs.quicktest-core.outputs.build_number }}
QUICKTEST_PREV_MINOR_BUILD: ${{ needs.quicktest-core-previous-minor.outputs.build_number }}
REMOTE_SERVER_K8S_BUILD: ${{ needs.smoke-tests-remote-server-kubernetes.outputs.build_number }}
RELEASE_TEST_BUILD: ${{ needs.release-tests.outputs.build_number }}
SKIP_SMOKE_TESTS: ${{ github.event.inputs.skip_smoke_tests }}
run: |
# Configure git
git config --local user.email "action@github.com"
git config --local user.name "GitHub Action"

# Create PR with buildkite links
PR_BODY="Release ${RELEASE_VERSION}
# Detect if this is an RC promotion
SOURCE_BRANCH="${{ github.ref_name }}"
IS_RC_PROMOTION="false"
if [[ "$SOURCE_BRANCH" =~ ^releases/.*rc[0-9]+$ ]]; then
IS_RC_PROMOTION="true"
RC_VERSION=$(echo "$SOURCE_BRANCH" | sed 's/releases\///')
fi

# Build PR body based on whether tests were skipped
if [ "$SKIP_SMOKE_TESTS" == "true" ]; then
if [ "$IS_RC_PROMOTION" == "true" ]; then
PR_BODY="## Promote RC to Stable Release ${RELEASE_VERSION}
**Source:** \`$SOURCE_BRANCH\` (RC version: $RC_VERSION)
**Target:** Stable release \`${RELEASE_VERSION}\`
⚠️ **Smoke tests were SKIPPED** - This release is being promoted from a tested RC.
### Pre-release Testing
This version was previously tested as release candidate \`$RC_VERSION\` and deemed stable by early adopters.
### Changes in this PR
- Updated \`sky/__init__.py\`: \`$RC_VERSION\` → \`${RELEASE_VERSION}\`
- Updated \`charts/skypilot/values.yaml\`: Docker image tag \`$RC_VERSION\` → \`${RELEASE_VERSION}\`"
else
PR_BODY="Release ${RELEASE_VERSION}
⚠️ **Smoke tests were SKIPPED** - Please ensure manual testing was performed."
fi
else
# Normal release with test results
PR_BODY="Release ${RELEASE_VERSION}

Buildkite Test Links:
- [Full Smoke Tests](https://buildkite.com/skypilot-1/full-smoke-tests-run/builds/${SMOKE_TEST_BUILD}) - $([ "${{ needs.smoke-tests.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
- [Quicktest Core](https://buildkite.com/skypilot-1/quicktest-core/builds/${QUICKTEST_BUILD}) - $([ "${{ needs.quicktest-core.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
- [Smoke Tests Remote Server Kubernetes](https://buildkite.com/skypilot-1/smoke-tests/builds/${REMOTE_SERVER_K8S_BUILD}) - $([ "${{ needs.smoke-tests-remote-server-kubernetes.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")"
if [ "${{ needs.quicktest-core-previous-minor.result }}" == "success" ] || [ "${{ needs.quicktest-core-previous-minor.result }}" == "failure" ]; then
PR_BODY="${PR_BODY}
- [Quicktest Core (vs Previous Minor)](https://buildkite.com/skypilot-1/quicktest-core/builds/${QUICKTEST_PREV_MINOR_BUILD}) - $([ "${{ needs.quicktest-core-previous-minor.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")"
fi
PR_BODY="${PR_BODY}
- [Release Tests](https://buildkite.com/skypilot-1/release/builds/${RELEASE_TEST_BUILD}) - ⏳ (not waiting for completion)
- [Full Smoke Tests](https://buildkite.com/skypilot-1/full-smoke-tests-run/builds/${{ needs.smoke-tests.outputs.build_number }}) - $([ "${{ needs.smoke-tests.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
- [Quicktest Core](https://buildkite.com/skypilot-1/quicktest-core/builds/${{ needs.quicktest-core.outputs.build_number }}) - $([ "${{ needs.quicktest-core.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
- [Quicktest Core (vs Previous Minor)](https://buildkite.com/skypilot-1/quicktest-core/builds/${{ needs.quicktest-core-previous-minor.outputs.build_number }}) - $([ "${{ needs.quicktest-core-previous-minor.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
- [Smoke Tests Remote Server Kubernetes](https://buildkite.com/skypilot-1/smoke-tests/builds/${{ needs.smoke-tests-remote-server-kubernetes.outputs.build_number }}) - $([ "${{ needs.smoke-tests-remote-server-kubernetes.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
- [Release Tests](https://buildkite.com/skypilot-1/release/builds/${{ needs.release-tests.outputs.build_number }}) - ⏳ (not waiting for completion)

*Release Tests may take up to 24 hours to complete and might fail due to resource constraints.*"
fi

echo "Creating PR from ${TEST_BRANCH} to ${RELEASE_BRANCH}"

gh pr create --base ${RELEASE_BRANCH} --head ${TEST_BRANCH} \
--title "Release ${RELEASE_VERSION}" \
--body "${PR_BODY}"

- name: Summary
if: always()
env:
SKIP_SMOKE_TESTS: ${{ github.event.inputs.skip_smoke_tests }}
run: |
if [ "$SKIP_SMOKE_TESTS" == "true" ]; then
SOURCE_BRANCH="${{ github.ref_name }}"
if [[ "$SOURCE_BRANCH" =~ ^releases/.*rc[0-9]+$ ]]; then
RC_VERSION=$(echo "$SOURCE_BRANCH" | sed 's/releases\///')
cat <<EOF >> "$GITHUB_STEP_SUMMARY"
# Release ${{ needs.release-build.outputs.release_version }}

## RC Promotion
Promoting from \`$RC_VERSION\` to stable version \`${{ needs.release-build.outputs.release_version }}\`

⚠️ **Smoke tests were SKIPPED** - This release was promoted from a tested RC.
EOF
else
cat <<EOF >> "$GITHUB_STEP_SUMMARY"
# Release ${{ needs.release-build.outputs.release_version }}

⚠️ **Smoke tests were SKIPPED** - Please ensure manual testing was performed.
EOF
fi
else
cat <<EOF >> "$GITHUB_STEP_SUMMARY"
# Release ${{ needs.release-build.outputs.release_version }}

## Buildkite Test Links
- [Full Smoke Tests](https://buildkite.com/skypilot-1/full-smoke-tests-run/builds/${{ needs.smoke-tests.outputs.build_number }}) - $([ "${{ needs.smoke-tests.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
- [Quicktest Core](https://buildkite.com/skypilot-1/quicktest-core/builds/${{ needs.quicktest-core.outputs.build_number }}) - $([ "${{ needs.quicktest-core.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
- [Quicktest Core (vs Previous Minor)](https://buildkite.com/skypilot-1/quicktest-core/builds/${{ needs.quicktest-core-previous-minor.outputs.build_number }}) - $([ "${{ needs.quicktest-core-previous-minor.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
- [Smoke Tests Remote Server Kubernetes](https://buildkite.com/skypilot-1/smoke-tests/builds/${{ needs.smoke-tests-remote-server-kubernetes.outputs.build_number }}) - $([ "${{ needs.smoke-tests-remote-server-kubernetes.outputs.build_status }}" == "success" ] && echo "✅ Success" || echo "❌ Failed")
- [Release Tests](https://buildkite.com/skypilot-1/release/builds/${{ needs.release-tests.outputs.build_number }}) - ⏳ (not waiting for completion)

*Release Tests may take up to 24 hours to complete and might fail due to resource constraints.*
EOF
fi

+4 -3  README.md

@@ -39,6 +39,7 @@
----

:fire: *News* :fire:
- [Nov 2025] Serve **Kimi K2 Thinking** with reasoning capabilities on your Kubernetes or clouds: [**example**](./llm/kimi-k2-thinking/)
- [Oct 2025] Run **RL training for LLMs** with SkyRL on your Kubernetes or clouds: [**example**](./llm/skyrl/)
- [Oct 2025] Train and serve [Andrej Karpathy's](https://x.com/karpathy/status/1977755427569111362) **nanochat** - the best ChatGPT that $100 can buy: [**example**](./llm/nanochat)
- [Oct 2025] Run large-scale **LLM training with TorchTitan** on any AI infra: [**example**](./examples/training/torchtitan)
@@ -49,7 +50,7 @@
- [Jul 2025] Finetune **Llama4** on any distributed cluster/cloud: [**example**](./llm/llama-4-finetuning/)
- [Jul 2025] Two-part blog series, `The Evolution of AI Job Orchestration`: (1) [Running AI jobs on GPU Neoclouds](https://blog.skypilot.co/ai-job-orchestration-pt1-gpu-neoclouds/), (2) [The AI-Native Control Plane & Orchestration that Finally Works for ML](https://blog.skypilot.co/ai-job-orchestration-pt2-ai-control-plane/)
- [Apr 2025] Spin up **Qwen3** on your cluster/cloud: [**example**](./llm/qwen/)
- [Feb 2025] Prepare and serve **Retrieval Augmented Generation (RAG) with DeepSeek-R1**: [**blog post**](https://blog.skypilot.co/deepseek-rag), [**example**](./llm/rag/)


**LLM Finetuning Cookbooks**: Finetuning Llama 2 / Llama 3.1 in your own cloud environment, privately: Llama 2 [**example**](./llm/vicuna-llama-2/) and [**blog**](https://blog.skypilot.co/finetuning-llama2-operational-guide/); Llama 3.1 [**example**](./llm/llama-3_1-finetuning/) and [**blog**](https://blog.skypilot.co/finetune-llama-3_1-on-your-infra/)
@@ -183,9 +184,9 @@ Latest featured examples:
|----------|----------|
| Training | [Verl](https://docs.skypilot.co/en/latest/examples/training/verl.html), [Finetune Llama 4](https://docs.skypilot.co/en/latest/examples/training/llama-4-finetuning.html), [TorchTitan](https://docs.skypilot.co/en/latest/examples/training/torchtitan.html), [PyTorch](https://docs.skypilot.co/en/latest/getting-started/tutorial.html), [DeepSpeed](https://docs.skypilot.co/en/latest/examples/training/deepspeed.html), [NeMo](https://docs.skypilot.co/en/latest/examples/training/nemo.html), [Ray](https://docs.skypilot.co/en/latest/examples/training/ray.html), [Unsloth](https://docs.skypilot.co/en/latest/examples/training/unsloth.html), [Jax/TPU](https://docs.skypilot.co/en/latest/examples/training/tpu.html) |
| Serving | [vLLM](https://docs.skypilot.co/en/latest/examples/serving/vllm.html), [SGLang](https://docs.skypilot.co/en/latest/examples/serving/sglang.html), [Ollama](https://docs.skypilot.co/en/latest/examples/serving/ollama.html) |
| Models | [DeepSeek-R1](https://docs.skypilot.co/en/latest/examples/models/deepseek-r1.html), [Llama 4](https://docs.skypilot.co/en/latest/examples/models/llama-4.html), [Llama 3](https://docs.skypilot.co/en/latest/examples/models/llama-3.html), [CodeLlama](https://docs.skypilot.co/en/latest/examples/models/codellama.html), [Qwen](https://docs.skypilot.co/en/latest/examples/models/qwen.html), [Kimi-K2](https://docs.skypilot.co/en/latest/examples/models/kimi-k2.html), [Mixtral](https://docs.skypilot.co/en/latest/examples/models/mixtral.html) |
| Models | [DeepSeek-R1](https://docs.skypilot.co/en/latest/examples/models/deepseek-r1.html), [Llama 4](https://docs.skypilot.co/en/latest/examples/models/llama-4.html), [Llama 3](https://docs.skypilot.co/en/latest/examples/models/llama-3.html), [CodeLlama](https://docs.skypilot.co/en/latest/examples/models/codellama.html), [Qwen](https://docs.skypilot.co/en/latest/examples/models/qwen.html), [Kimi-K2](https://docs.skypilot.co/en/latest/examples/models/kimi-k2.html), [Kimi-K2-Thinking](https://docs.skypilot.co/en/latest/examples/models/kimi-k2-thinking.html), [Mixtral](https://docs.skypilot.co/en/latest/examples/models/mixtral.html) |
| AI apps | [RAG](https://docs.skypilot.co/en/latest/examples/applications/rag.html), [vector databases](https://docs.skypilot.co/en/latest/examples/applications/vector_database.html) (ChromaDB, CLIP) |
| Common frameworks | [Airflow](https://docs.skypilot.co/en/latest/examples/frameworks/airflow.html), [Jupyter](https://docs.skypilot.co/en/latest/examples/frameworks/jupyter.html) |
| Common frameworks | [Airflow](https://docs.skypilot.co/en/latest/examples/frameworks/airflow.html), [Jupyter](https://docs.skypilot.co/en/latest/examples/frameworks/jupyter.html), [marimo](https://docs.skypilot.co/en/latest/examples/frameworks/marimo.html) |

Source files can be found in [`llm/`](https://github.com/skypilot-org/skypilot/tree/master/llm) and [`examples/`](https://github.com/skypilot-org/skypilot/tree/master/examples).



+23 -2  charts/skypilot/templates/api-deployment.yaml

@@ -71,8 +71,11 @@ spec:
resources:
{{- toYaml .Values.apiService.resources | nindent 10 }}
env:
{{- if .Values.apiService.extraEnvs }}
{{- toYaml .Values.apiService.extraEnvs | nindent 8 }}
{{- with $.Values.global.extraEnvs }}
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.apiService.extraEnvs }}
{{- toYaml . | nindent 8 }}
{{- end }}
- name: SKYPILOT_DEV
value: {{ .Values.apiService.skypilotDev | quote }}
@@ -388,6 +391,9 @@ spec:
sleep 600
fi
env:
{{- with $.Values.global.extraEnvs }}
{{- toYaml . | nindent 8 }}
{{- end }}
- name: AWS_ACCESS_KEY_ID
valueFrom:
secretKeyRef:
@@ -435,6 +441,9 @@ spec:
echo "Account ID:"
cat /root/.cloudflare/accountid
env:
{{- with $.Values.global.extraEnvs }}
{{- toYaml . | nindent 8 }}
{{- end }}
- name: R2_CREDENTIALS
valueFrom:
secretKeyRef:
@@ -458,6 +467,9 @@ spec:
image: {{ include "common.image" (dict "root" . "image" .Values.gcpCredentials.image) }}
command: ["/bin/sh", "-c"]
env:
{{- with $.Values.global.extraEnvs }}
{{- toYaml . | nindent 8 }}
{{- end }}
- name: GOOGLE_APPLICATION_CREDENTIALS
value: /root/gcp-cred.json
args:
@@ -492,6 +504,9 @@ spec:
sleep 600
fi
env:
{{- with $.Values.global.extraEnvs }}
{{- toYaml . | nindent 8 }}
{{- end }}
- name: RUNPOD_API_KEY
valueFrom:
secretKeyRef:
@@ -521,6 +536,9 @@ spec:
sleep 600
fi
env:
{{- with $.Values.global.extraEnvs }}
{{- toYaml . | nindent 8 }}
{{- end }}
- name: LAMBDA_API_KEY
valueFrom:
secretKeyRef:
@@ -550,6 +568,9 @@ spec:
sleep 600
fi
env:
{{- with $.Values.global.extraEnvs }}
{{- toYaml . | nindent 8 }}
{{- end }}
- name: VAST_API_KEY
valueFrom:
secretKeyRef:


+3 -2  charts/skypilot/templates/datasource.yaml

@@ -1,13 +1,13 @@
{{- if and .Values.grafana.enabled .Values.grafana.sidecar.datasources.enabled }}
apiVersion: v1
kind: Secret
kind: ConfigMap
metadata:
name: {{ .Release.Name }}-prometheus-datasource
namespace: {{ .Release.Namespace }}
labels:
app: {{ .Release.Name }}-api
grafana_datasource: "true"
stringData:
data:
prometheus.yaml: |-
apiVersion: 1
datasources:
@@ -16,4 +16,5 @@ stringData:
url: http://{{ .Release.Name }}-prometheus-server:80
editable: false
uid: prometheus
access: proxy
{{- end }}

+3 -0  charts/skypilot/templates/oauth2-proxy-deployment.yaml

@@ -95,6 +95,9 @@ spec:
{{- end }}
{{- end }}
env:
{{- with .Values.global.extraEnvs }}
{{- toYaml . | nindent 8 }}
{{- end }}
- name: OAUTH2_PROXY_CLIENT_ID
{{- if (index $oauth2 "client-details-from-secret") }}
valueFrom:


+4 -0  charts/skypilot/templates/oauth2-proxy-redis.yaml

@@ -30,6 +30,10 @@ spec:
containers:
- name: redis
image: {{ include "common.image" (dict "root" . "image" (default "redis:7-alpine" (index $oauth2 "redis-image"))) }}
{{- with $.Values.global.extraEnvs }}
env:
{{- toYaml . | nindent 10 }}
{{- end }}
ports:
- containerPort: 6379
resources:


+19 -0  charts/skypilot/tests/deployment_test.yaml

@@ -86,6 +86,25 @@ tests:
path: spec.template.spec.containers[1].image
value: registry.example.com/custom/skypilot-dev/sky:dev

- it: should inject global extra envs into api containers and init containers
set:
global.extraEnvs:
- name: GLOBAL_ENV
value: global
awsCredentials.enabled: true
awsCredentials.useCredentialsFile: false
asserts:
- contains:
path: spec.template.spec.containers[0].env
content:
name: GLOBAL_ENV
value: global
- contains:
path: spec.template.spec.initContainers[0].env
content:
name: GLOBAL_ENV
value: global

- it: should prefix gcp credentials init container image with the global registry override
set:
global.imageRegistry: registry.example.com/custom


+26 -0  charts/skypilot/tests/oauth2_test.yaml

@@ -88,6 +88,32 @@ tests:
path: spec.template.spec.containers[0].env[1].value
value: "test-client-secret"

- it: should apply global extra envs to oauth2-proxy and redis
set:
global.extraEnvs:
- name: GLOBAL_ENV
value: global
auth.oauth.enabled: false
ingress.enabled: true
ingress.oauth2-proxy.enabled: true
ingress.oauth2-proxy.oidc-issuer-url: "https://example.okta.com/oauth2/default"
ingress.oauth2-proxy.client-id: "test-client-id"
ingress.oauth2-proxy.client-secret: "test-client-secret"
asserts:
- contains:
path: spec.template.spec.containers[0].env
content:
name: GLOBAL_ENV
value: global
template: templates/oauth2-proxy-deployment.yaml
- contains:
path: spec.template.spec.containers[0].env
content:
name: GLOBAL_ENV
value: global
template: templates/oauth2-proxy-redis.yaml
documentIndex: 0

- it: should configure oauth2-proxy with client credentials from secret (legacy)
set:
auth.oauth.enabled: false


+9 -0  charts/skypilot/values.schema.json

@@ -292,6 +292,15 @@
"global": {
"type": "object",
"properties": {
"extraEnvs": {
"type": [
"array",
"null"
],
"items": {
"type": "object"
}
},
"imagePullSecrets": {
"type": [
"array",


+3 -0  charts/skypilot/values.yaml

@@ -6,6 +6,9 @@ global:
# Specify imagePullSecrets for all components in the chart.
# @schema type: [array, null]; item: object
imagePullSecrets: null
# Specify extra environment variables to set on all components in the chart.
# @schema type: [array, null]; item: object
extraEnvs: null


apiService:


+1 -0  docs/source/examples/frameworks/index.rst

@@ -8,6 +8,7 @@ Frameworks
DVC <dvc>
GCP DWS <https://docs.skypilot.co/en/latest/reservations/reservations.html#gcp-dynamic-workload-scheduler-dws>
Jupyter <jupyter>
marimo <marimo>
MLFlow <https://nebius.com/blog/posts/orchestrating-llm-fine-tuning-k8s-skypilot-mlflow>
MPI <mpi>
Spyder IDE <spyder>

+1 -0  docs/source/examples/frameworks/marimo.md

@@ -0,0 +1 @@
../../generated-examples/marimo.md

+1 -0  docs/source/examples/models/index.rst

@@ -20,6 +20,7 @@ Models
Mistral 7B <https://docs.mistral.ai/self-deployment/skypilot/>
Qwen 3 <qwen>
Kimi K2 <kimi-k2>
Kimi K2 Thinking <kimi-k2-thinking>
Yi <yi>
Gemma <gemma>
DBRX <dbrx>


+1 -0  docs/source/examples/models/kimi-k2-thinking.md

@@ -0,0 +1 @@
../../generated-examples/kimi-k2-thinking.md

+17 -0  docs/source/reference/api-server/helm-values-spec.rst

@@ -36,6 +36,7 @@ Below is the available helm value keys and the default value of each key:
:ref:`global <helm-values-global>`:
:ref:`imageRegistry <helm-values-global-imageRegistry>`: null
:ref:`imagePullSecrets <helm-values-global-imagePullSecrets>`: null
:ref:`extraEnvs <helm-values-global-extraEnvs>`: null
:ref:`apiService <helm-values-apiService>`:
:ref:`image <helm-values-apiService-image>`: berkeleyskypilot/skypilot-nightly:latest
:ref:`upgradeStrategy <helm-values-apiService-upgradeStrategy>`: Recreate
@@ -306,6 +307,22 @@ Default: ``null``
imagePullSecrets:
- name: my-registry-credentials

.. _helm-values-global-extraEnvs:

``global.extraEnvs``
^^^^^^^^^^^^^^^^^^^^

Specify extra environment variables to set on all components in the chart.

Default: ``null``

.. code-block:: yaml

global:
extraEnvs:
- name: HTTP_PROXY
value: http://proxy.example.com

.. _helm-values-apiService:

``apiService``


+2 -2  docs/source/reference/kubernetes/kubernetes-getting-started.rst

@@ -193,8 +193,8 @@ Using custom images
-------------------
By default, we maintain and use two SkyPilot container images for use on Kubernetes clusters:

1. ``us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot``: used for CPU-only clusters (`Dockerfile <https://github.com/skypilot-org/skypilot/blob/master/Dockerfile_k8s>`__).
2. ``us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot-gpu``: used for GPU clusters (`Dockerfile <https://github.com/skypilot-org/skypilot/blob/master/Dockerfile_k8s_gpu>`__).
1. ``us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot``: used for CPU-only clusters (`Dockerfile <https://github.com/skypilot-org/skypilot/blob/master/Dockerfile_k8s>`__).
2. ``us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot-gpu``: used for GPU clusters (`Dockerfile <https://github.com/skypilot-org/skypilot/blob/master/Dockerfile_k8s_gpu>`__).

These images are pre-installed with SkyPilot dependencies for fast startup.



+1 -1  docs/source/reference/kubernetes/kubernetes-troubleshooting.rst

@@ -36,7 +36,7 @@ Step A1 - Can you create pods and services?

As a sanity check, we will now try creating a simple pod running a HTTP server and a service to verify that your cluster and it's networking is functional.

We will use the SkyPilot default image :code:`us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot:latest` to verify that the image can be pulled from the registry.
We will use the SkyPilot default image :code:`us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot:latest` to verify that the image can be pulled from the registry.

.. code-block:: bash



+8 -0  docs/source/reference/volumes.rst

@@ -110,6 +110,10 @@ Quickstart
run: |
echo "Hello, World!" > /mnt/data/hello.txt

.. note::

For multi-node clusters, volumes are mounted to all nodes. You must configure ``config.access_mode`` to ``ReadWriteMany`` and use a ``storage_class_name`` that supports the ``ReadWriteMany`` access mode. Otherwise, SkyPilot will fail to launch the cluster.

.. _volumes-on-kubernetes-manage:

Managing volumes
@@ -320,6 +324,10 @@ When you launch the cluster with ``sky launch``, the ephemeral volumes will be a
NAME TYPE INFRA SIZE USER WORKSPACE AGE STATUS LAST_USE USED_BY IS_EPHEMERAL
my-cluster-43dbb4ab-2f74bf k8s-pvc Kubernetes/nebius-mk8s-vol 100Gi alice default 58m IN_USE 2025-11-17 14:30:18 my-cluster-43dbb4ab-head True

.. note::

For multi-node clusters, ephemeral volumes are mounted to all nodes. You must configure ``config.access_mode`` to ``ReadWriteMany`` and use a ``storage_class_name`` that supports the ``ReadWriteMany`` access mode. Otherwise, SkyPilot will fail to launch the cluster.

When you terminate the cluster, the ephemeral volumes are automatically deleted:

.. code-block:: console

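To make the multi-node notes above concrete, here is a hedged sketch of a `ReadWriteMany` volume spec. The volume name and storage class are placeholders, and the field names follow the volumes doc above; confirm the exact schema against your SkyPilot version.

```bash
# Sketch only: a volume spec suitable for multi-node clusters.
# "shared-data" and "my-rwx-storage-class" are placeholders.
cat > rwx-volume.yaml <<'EOF'
name: shared-data
type: k8s-pvc
infra: k8s
size: 100Gi
config:
  access_mode: ReadWriteMany                # required for multi-node clusters
  storage_class_name: my-rwx-storage-class  # must support ReadWriteMany
EOF
sky volumes apply rwx-volume.yaml
```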

+8 -0  examples/distributed-pytorch/train-rdzv.yaml

@@ -22,6 +22,14 @@ run: |
export LOGLEVEL=INFO

MASTER_ADDR=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
# torchrun attempts to autodetect if the current node is the head node based
# on the hostname, but this depends on network configuration and doesn't
# work on some clouds, e.g. Nebius VMs. Force to "localhost" to work around.
# This is only needed for c10d rdzv backend, not static rendezvous.
# See https://github.com/pytorch/pytorch/issues/79388.
if [ $SKYPILOT_NODE_RANK -eq 0 ]; then
MASTER_ADDR=localhost
fi
echo "Starting distributed training, head node: $MASTER_ADDR"

torchrun \


+1 -1  examples/github_actions/README.md

@@ -39,7 +39,7 @@ In this example, create the following repository secrets:
- ``SKYPILOT_API_URL``: URL to the SkyPilot API server, in format of ``http(s)://url-or-ip``.
If using basic auth, the URL should also include the credentials in format of ``http(s)://username:password@url-or-ip``.
- ``SKYPILOT_SERVICE_ACCOUNT_TOKEN``: Only required if using OAuth. Service account token for GitHub actions user generated above.
- ``SLACK_BOT_TOKEN``: Optional, create a [Slack App](https://api.slack.com/apps) and get a slack "App-Level Token" with `connections:write` permssion to send a summary message. If not provided, a slack message is not sent after a job is queued.
- ``SLACK_BOT_TOKEN``: Optional, create a [Slack App](https://api.slack.com/apps) and get a slack "App-Level Token" with `connections:write` permission to send a summary message. If not provided, a slack message is not sent after a job is queued.
- ``SLACK_CHANNEL_ID``: Optional, Slack Channel ID to send a summary message. If not provided, a slack message is not sent after a job is queued.

## Repository Structure


+23 -0  examples/marimo/README.md

@@ -0,0 +1,23 @@
# Run marimo on SkyPilot

Run a personal [marimo](https://marimo.io/) server on a SkyPilot cluster.

![marimo Web UI](https://i.imgur.com/iLYbQ6b.png "marimo Web UI")

## Launch with CLI

Launch a marimo cluster with the command:

```bash
sky launch -c marimo-example marimo.yaml
```

Next, run this command to get the endpoint to connect via the browser:

```
sky status marimo-example --endpoints
```

## Customization

The `marimo.yaml` file can be customized to change the port, password, and other options. Check the [docs](https://docs.marimo.io/cli/#marimo-edit) for more information.

+19 -0  examples/marimo/marimo.yaml

@@ -0,0 +1,19 @@
# Example: Launch marimo and auto-expose its port to Internet.
#
# Usage:
# # First, launch the compute node
# $ sky launch -c marimo-example marimo.yaml
# # Next, get the endpoint to connect to over the browser
# $ sky status marimo-example --endpoints
# # This is an alternative to port forwarding.

resources:
ports:
- 29324

workdir: .

setup: pip install uv

# Check the docs for more options: https://docs.marimo.io/cli/#marimo-edit
run: uvx marimo edit --port 29324 --headless --host=0.0.0.0 --token-password='secretpassword'

+1 -1  examples/training/torchtitan/README.md

@@ -2,7 +2,7 @@

[TorchTitan](https://github.com/pytorch/torchtitan) is a PyTorch native platform for large-scale LLM training, featuring multi-dimensional parallelisms (FSDP2, Tensor/Pipeline/Context Parallel), distributed checkpointing, torch.compile, and Float8 support.

This example demonstrates how to run [TorchTitan](https://github.com/pytorch/torchtitan) on your Kubernetes clusters, or any hypersclaers, neoclouds using SkyPilot, in addition to the instructions for runnning on [Slurm](https://github.com/pytorch/torchtitan?tab=readme-ov-file#multi-node-training).
This example demonstrates how to run [TorchTitan](https://github.com/pytorch/torchtitan) on your Kubernetes clusters, or any hyperscalers, neoclouds using SkyPilot, in addition to the instructions for running on [Slurm](https://github.com/pytorch/torchtitan?tab=readme-ov-file#multi-node-training).

## Quick start
Here is how to finetune Llama 3.1 on 2 nodes with 8 H100 (or 8 H200):


+1 -1  examples/training_network_storage_benchmarks/README.md

@@ -8,7 +8,7 @@ Please edit the yamls as you like.

To run disk tests, run `sky launch e2e_disk.yaml -c e2e_disk --env HF_TOKEN="YOUR TOKEN"`

Requirements for disk benchmark, 2 s3 buckets (one for mount and one for mount cached) and 1 pvc (Check out [volumnes](https://docs.skypilot.co/en/stable/reference/volumes.html))
Requirements for disk benchmark, 2 s3 buckets (one for mount and one for mount cached) and 1 pvc (Check out [volumes](https://docs.skypilot.co/en/stable/reference/volumes.html))

Expected output, something like:



+149 -0  llm/kimi-k2-thinking/README.md

@@ -0,0 +1,149 @@

<!-- $REMOVE -->
# Run Kimi K2 Thinking on Kubernetes or Any Cloud
<!-- $END_REMOVE -->
<!-- $UNCOMMENT# Kimi K2 Thinking -->

[Kimi K2 Thinking](https://huggingface.co/moonshotai/Kimi-K2-Thinking) is an advanced large language model created by [Moonshot AI](https://www.moonshot.ai/).

This recipe shows how to run Kimi K2 Thinking with reasoning capabilities on your Kubernetes or any cloud. It includes two modes:

- **Low Latency (TP8)**: Best for interactive applications requiring quick responses
- **High Throughput (TP8+DCP8)**: Best for batch processing and high-volume serving scenarios


## Prerequisites

- Check that you have installed SkyPilot ([docs](https://docs.skypilot.co/en/latest/getting-started/installation.html)).
- Check that `sky check` shows clouds or Kubernetes is enabled.
- **Note**: This model requires 8x H200 or H20 GPUs.

## Run Kimi K2 Thinking (Low Latency Mode)

For low-latency scenarios, use tensor parallelism:

```bash
sky launch kimi-k2-thinking.sky.yaml -c kimi-k2-thinking
```

`kimi-k2-thinking.sky.yaml` uses **tensor parallelism** across 8 GPUs for optimal low-latency performance.

🎉 **Congratulations!** 🎉 You have now launched the Kimi K2 Thinking LLM with reasoning capabilities on your infra.

## Run Kimi K2 Thinking (High Throughput Mode)

For high-throughput scenarios, use Decode Context Parallel (DCP) for **43% faster token generation** and **26% higher throughput**:

```bash
sky launch kimi-k2-thinking-high-throughput.sky.yaml -c kimi-k2-thinking-ht
```

The `kimi-k2-thinking-high-throughput.sky.yaml` adds `--decode-context-parallel-size 8` to enable DCP:

```yaml
run: |
echo 'Starting vLLM API server for Kimi-K2-Thinking (High Throughput Mode with DCP)...'
vllm serve $MODEL_NAME \
--port 8081 \
--tensor-parallel-size 8 \
--decode-context-parallel-size 8 \
--enable-auto-tool-choice \
--tool-call-parser kimi_k2 \
--reasoning-parser kimi_k2 \
--trust-remote-code
```

### DCP Performance Gains

From [vLLM's benchmark](https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2-Think.html):

| Metric | TP8 (Low Latency) | TP8+DCP8 (High Throughput) | Improvement |
|--------|-------------------|----------------------------|-------------|
| Request Throughput (req/s) | 1.25 | 1.57 | **+25.6%** |
| Output Token Throughput (tok/s) | 485.78 | 695.13 | **+43.1%** |
| Mean TTFT (sec) | 271.2 | 227.8 | **+16.0%** |
| KV Cache Size (tokens) | 715,072 | 5,721,088 | **8x** |

## Chat with Kimi K2 Thinking with OpenAI API

To curl `/v1/chat/completions`:

```bash
ENDPOINT=$(sky status --endpoint 8081 kimi-k2-thinking)

curl http://$ENDPOINT/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "moonshotai/Kimi-K2-Thinking",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant with deep reasoning capabilities."
},
{
"role": "user",
"content": "Explain how to solve the traveling salesman problem for 10 cities."
}
]
}' | jq .
```

The model will provide its reasoning process in the response, showing its chain-of-thought approach.

## Clean up resources
To shut down all resources:

```bash
sky down kimi-k2-thinking
```

## Serving Kimi-K2-Thinking: scaling up with SkyServe

With no change to the YAML, launch a fully managed service with autoscaling replicas and load-balancing on your infra:

```bash
sky serve up kimi-k2-thinking.sky.yaml -n kimi-k2-thinking
```

Wait until the service is ready:

```bash
watch -n10 sky serve status kimi-k2-thinking
```

Get a single endpoint that load-balances across replicas:

```bash
ENDPOINT=$(sky serve status --endpoint kimi-k2-thinking)
```

> **Tip:** SkyServe fully manages the lifecycle of your replicas. For example, if a spot replica is preempted, the controller will automatically replace it. This significantly reduces the operational burden while saving costs.

To curl the endpoint:

```bash
curl http://$ENDPOINT/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "moonshotai/Kimi-K2-Thinking",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant with deep reasoning capabilities."
},
{
"role": "user",
"content": "Design a distributed system for real-time analytics."
}
]
}' | jq .
```

To shut down all resources:

```bash
sky serve down kimi-k2-thinking
```

See more details in [SkyServe docs](https://docs.skypilot.co/en/latest/serving/sky-serve.html).

+41 -0  llm/kimi-k2-thinking/kimi-k2-thinking-high-throughput.sky.yaml

@@ -0,0 +1,41 @@
# Serve Kimi-K2-Thinking with SkyPilot and vLLM (High Throughput Mode).
# Uses Decode Context Parallel (DCP) for 43% faster token generation and 26% higher throughput.
#
# Usage:
# sky launch kimi-k2-thinking-high-throughput.sky.yaml -c kimi-k2-thinking-ht
# sky serve up kimi-k2-thinking-high-throughput.sky.yaml -n kimi-k2-thinking-ht
envs:
MODEL_NAME: moonshotai/Kimi-K2-Thinking


resources:
image_id: docker:vllm/vllm-openai:nightly-f849ee739cdb3d82fce1660a6fd91806e8ae9bff
accelerators: H200:8
cpus: 100+
memory: 1000+
ports: 8081

run: |
echo 'Starting vLLM API server for Kimi-K2-Thinking (High Throughput Mode with DCP)...'
vllm serve $MODEL_NAME \
--port 8081 \
--tensor-parallel-size 8 \
--decode-context-parallel-size 8 \
--enable-auto-tool-choice \
--tool-call-parser kimi_k2 \
--reasoning-parser kimi_k2 \
--trust-remote-code

service:
replicas: 1
# An actual request for readiness probe.
readiness_probe:
path: /v1/chat/completions
post_data:
model: $MODEL_NAME
messages:
- role: user
content: What is 2+2?
max_tokens: 10


+39 -0  llm/kimi-k2-thinking/kimi-k2-thinking.sky.yaml

@@ -0,0 +1,39 @@
# Serve Kimi-K2-Thinking with SkyPilot and vLLM (Low Latency Mode).
# This model supports deep thinking & tool orchestration with reasoning capabilities.
#
# Usage:
# sky launch kimi-k2-thinking.sky.yaml -c kimi-k2-thinking
# sky serve up kimi-k2-thinking.sky.yaml -n kimi-k2-thinking
envs:
MODEL_NAME: moonshotai/Kimi-K2-Thinking

resources:
image_id: docker:vllm/vllm-openai:nightly-f849ee739cdb3d82fce1660a6fd91806e8ae9bff
accelerators: H200:8
cpus: 100+
memory: 1000+
ports: 8081

run: |
echo 'Starting vLLM API server for Kimi-K2-Thinking (Low Latency Mode)...'
vllm serve $MODEL_NAME \
--port 8081 \
--tensor-parallel-size 8 \
--enable-auto-tool-choice \
--tool-call-parser kimi_k2 \
--reasoning-parser kimi_k2 \
--trust-remote-code

service:
replicas: 1
# An actual request for readiness probe.
readiness_probe:
path: /v1/chat/completions
post_data:
model: $MODEL_NAME
messages:
- role: user
content: What is 2+2?
max_tokens: 10


+1 -1  sky/__init__.py

@@ -37,7 +37,7 @@ def _get_git_commit():


__commit__ = _get_git_commit()
__version__ = '1.0.0-dev0'
__version__ = '0.11.0'
__root_dir__ = directory_utils.get_sky_dir()




+11 -0  sky/adaptors/gcp.py

@@ -2,9 +2,20 @@

# pylint: disable=import-outside-toplevel
import json
import warnings

from sky.adaptors import common

# Suppress FutureWarning from google.api_core about Python 3.10 support ending.
# This warning is informational and does not affect functionality.
# Reference: https://github.com/skypilot-org/skypilot/issues/7886
warnings.filterwarnings(
'ignore',
category=FutureWarning,
message=
r'.*You are using a Python version.*which Google will stop supporting.*',
)

_IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for GCP. '
'Try pip install "skypilot[gcp]"')
googleapiclient = common.LazyImport('googleapiclient',


+3 -1  sky/adaptors/nebius.py

@@ -136,7 +136,9 @@ SKY_CHECK_NAME = 'Nebius (for Nebius Object Storae)'


def request_error():
return nebius.aio.service_error.RequestError
# pylint: disable=import-outside-toplevel
from nebius.aio import service_error
return service_error.RequestError


def compute():
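
The accessor now imports `nebius.aio.service_error` explicitly at call time rather than resolving it as an attribute chain. A generic sketch of that deferred-import pattern, using a hypothetical optional dependency named `heavy_sdk`:

```python
def request_error():
    # Import only when the accessor is called, so merely importing this
    # module never requires the optional dependency to be installed.
    # pylint: disable=import-outside-toplevel
    from heavy_sdk.aio import service_error  # hypothetical package
    return service_error.RequestError


try:
    print(request_error())
except ImportError:
    print('heavy_sdk is not installed')
```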


+ 29
- 6
sky/backends/backend_utils.py View File

@@ -1064,7 +1064,11 @@ def write_cluster_config(
with open(tmp_yaml_path, 'w', encoding='utf-8') as f:
f.write(restored_yaml_content)

config_dict['cluster_name_on_cloud'] = cluster_name_on_cloud
# Read the cluster_name_on_cloud from the restored yaml. This is a hack to
# make sure that launching on the same cluster across multiple users works
# correctly. See #8232.
yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']

# Make sure to do this before we optimize file mounts. Optimization is
# non-deterministic, but everything else before this point should be
@@ -3150,12 +3154,11 @@ def refresh_cluster_records() -> None:
Raises:
None
"""
exclude_managed_clusters = True
if env_options.Options.SHOW_DEBUG_INFO.get():
exclude_managed_clusters = False
# We always exclude managed clusters to avoid multiple sources
# manipulating them. For example, SkyServe assumes the replica manager
# is the only source of truth for the cluster status.
cluster_names = set(
global_user_state.get_cluster_names(
exclude_managed_clusters=exclude_managed_clusters,))
global_user_state.get_cluster_names(exclude_managed_clusters=True))

# TODO(syang): we should try not to leak
# request info in backend_utils.py.
@@ -3633,6 +3636,26 @@ def check_rsync_installed() -> None:
' $ sudo apt install rsync') from None


def check_stale_runtime_on_remote(returncode: int, stderr: str,
cluster_name: str) -> None:
"""Raises RuntimeError if remote SkyPilot runtime needs to be updated.

We detect this by parsing certain backward-incompatible error messages from
`stderr`. This typically happens when the local client has just been updated
while the remote runtime is still an older version.
"""
if returncode != 0:
if 'SkyPilot runtime is too old' in stderr:
with ux_utils.print_exception_no_traceback():
raise RuntimeError(
f'{colorama.Fore.RED}SkyPilot runtime needs to be updated '
f'on the remote cluster: {cluster_name}. To update, run '
'(existing jobs will not be interrupted): '
f'{colorama.Style.BRIGHT}sky start -f -y '
f'{cluster_name}{colorama.Style.RESET_ALL}'
f'\n--- Details ---\n{stderr.strip()}\n') from None


def get_endpoints(cluster: str,
port: Optional[Union[int, str]] = None,
skip_status_check: bool = False) -> Dict[int, str]:


+ 12
- 2
sky/backends/cloud_vm_ray_backend.py View File

@@ -4154,7 +4154,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
runners = handle.get_command_runners(avoid_ssh_control=True)

def _setup_node(node_id: int) -> None:
setup_envs = task.envs_and_secrets
setup_envs = task_lib.get_plaintext_envs_and_secrets(
task.envs_and_secrets)
setup_envs.update(self._skypilot_predefined_env_vars(handle))
setup_envs['SKYPILOT_SETUP_NODE_IPS'] = '\n'.join(internal_ips)
setup_envs['SKYPILOT_SETUP_NODE_RANK'] = str(node_id)
@@ -4431,6 +4432,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
job_submit_cmd,
stream_logs=False,
require_outputs=True)
# Happens when someone calls `sky exec` but the remote runtime is too old
# to run a job, necessitating a `sky launch` to update it.
backend_utils.check_stale_runtime_on_remote(returncode, stderr,
handle.cluster_name)
output = stdout + stderr
if _is_message_too_long(returncode, output=output):
# If the job submit script is too long, we need to retry it
@@ -4498,6 +4503,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
stream_logs=False,
require_outputs=True,
separate_stderr=True)
# Happens when someone calls `sky exec` but the remote runtime is too old
# to add a job, necessitating a `sky launch` to update it.
backend_utils.check_stale_runtime_on_remote(returncode, stderr,
handle.cluster_name)
# TODO(zhwu): this sometimes will unexpectedly fail, we can add
# retry for this, after we figure out the reason.
subprocess_utils.handle_returncode(returncode, code,
@@ -6364,7 +6373,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
def _get_task_env_vars(self, task: task_lib.Task, job_id: int,
handle: CloudVmRayResourceHandle) -> Dict[str, str]:
"""Returns the environment variables for the task."""
env_vars = task.envs_and_secrets
env_vars = task_lib.get_plaintext_envs_and_secrets(
task.envs_and_secrets)
# If it is a managed job, the TASK_ID_ENV_VAR will have been already set
# by the controller.
if constants.TASK_ID_ENV_VAR not in env_vars:
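
Task secrets are now stored in wrapped form, so both call sites unwrap them before exporting environment variables to the node. A minimal sketch of the unwrapping idea, assuming pydantic's `SecretStr`; the real `task_lib.get_plaintext_envs_and_secrets` may differ in detail:

```python
from typing import Dict, Union

from pydantic import SecretStr


def get_plaintext_envs_and_secrets(
        envs_and_secrets: Dict[str, Union[str, SecretStr]]) -> Dict[str, str]:
    # Unwrap SecretStr values so they can be set as plain env vars remotely.
    return {
        key: value.get_secret_value() if isinstance(value, SecretStr) else value
        for key, value in envs_and_secrets.items()
    }


print(get_plaintext_envs_and_secrets({
    'MODEL_NAME': 'moonshotai/Kimi-K2-Thinking',
    'HF_TOKEN': SecretStr('hf_xxx'),
}))
```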


+ 30
- 3
sky/catalog/data_fetchers/fetch_gcp.py View File

@@ -182,8 +182,9 @@ TPU_V4_HOST_DF = pd.read_csv(
SERIES_TO_DESCRIPTION = {
'a2': 'A2 Instance',
'a3': 'A3 Instance',
# TODO(zhwu): GCP does not have A4 instance in SKUs API yet. We keep it here
# for completeness.
# NOTE: GCP does not provide separate CPU/RAM pricing for A4 instances.
# The B200 GPU pricing includes the full VM cost. See special handling in
# get_vm_price() which sets A4 VM price to 0.
'a4': 'A4 Instance',
'c2': 'Compute optimized',
'c2d': 'C2D AMD Instance',
@@ -394,6 +395,15 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
if series in ['f1', 'g1']:
memory_price = 0.0

# Special case for A4 instances.
# GCP does not provide separate CPU/RAM pricing for A4 instances in the
# SKUs API. The GPU pricing (B200) includes the full VM cost.
# We set the VM price to 0 so the entry is not dropped, and the GPU
# pricing will provide the total cost.
if series == 'a4':
cpu_price = 0.0
memory_price = 0.0

# TODO(tian): (2024/11/10) Some SKUs are missing in the SKUs API. We
# skip them in the catalog for now. We should investigate why they are
# missing and add them back.
@@ -525,7 +535,24 @@ def get_gpu_df(skus: List[Dict[str, Any]],
row_gpu_name = row['AcceleratorName']
if row['Region'] not in sku['serviceRegions']:
continue
if sku['category']['usageType'] != ondemand_or_spot:

# Check usageType matches, with special handling for B200 spot.
# GCP has a bug where some B200 spot SKUs have usageType='OnDemand'
# but the description contains 'Spot Preemptible'.
usage_type = sku['category']['usageType']
description = sku['description']
is_spot_description = 'spot preemptible' in description.lower()

if usage_type != ondemand_or_spot:
# For B200 spot pricing, also accept SKUs where description
# says "Spot Preemptible" even if usageType is wrong.
if not (spot and row_gpu_name == 'B200' and
is_spot_description):
continue

# For B200 on-demand, skip SKUs that are actually spot (description
# says "Spot Preemptible" but usageType is incorrectly 'OnDemand').
if not spot and row_gpu_name == 'B200' and is_spot_description:
continue

gpu_names = [f'{row_gpu_name} GPU']
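
A condensed sketch of the B200 rule above; the `'OnDemand'`/`'Preemptible'` values are assumptions about what `ondemand_or_spot` holds, and the SKU dict is illustrative:

```python
def accept_sku(sku: dict, spot: bool, gpu_name: str,
               ondemand_or_spot: str) -> bool:
    usage_type = sku['category']['usageType']
    is_spot_description = 'spot preemptible' in sku['description'].lower()
    if usage_type != ondemand_or_spot:
        # Accept mislabeled B200 spot SKUs when fetching spot prices.
        return spot and gpu_name == 'B200' and is_spot_description
    # Reject B200 'OnDemand' SKUs whose description says they are spot.
    if not spot and gpu_name == 'B200' and is_spot_description:
        return False
    return True


assert accept_sku(
    {'category': {'usageType': 'OnDemand'},
     'description': 'Nvidia B200 GPU Spot Preemptible ...'},
    spot=True, gpu_name='B200', ondemand_or_spot='Preemptible')
```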


+ 5
- 126
sky/client/cli/command.py View File

@@ -3040,34 +3040,6 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
# there are no in-progress managed jobs.
managed_jobs_ = []
pools_ = []
except exceptions.InconsistentConsolidationModeError:
# If this error is raised, it means the user switched to the
# consolidation mode but the previous controller cluster is still
# running. We should allow the user to tear down the controller
# cluster in this case.
with skypilot_config.override_skypilot_config(
{'jobs': {
'controller': {
'consolidation_mode': False
}
}}):
# Check again with the consolidation mode disabled. This is to
# make sure there is no in-progress managed jobs.
request_id, queue_result_version = (
cli_utils.get_managed_job_queue(
refresh=False,
skip_finished=True,
all_users=True,
fields=fields,
))
result = sdk.stream_and_get(request_id)
if queue_result_version.v2():
managed_jobs_, _, status_counts, _ = result
else:
managed_jobs_ = typing.cast(
List[responses.ManagedJobRecord], result)
request_id_pools = managed_jobs.pool_status(pool_names=None)
pools_ = sdk.stream_and_get(request_id_pools)

msg = (f'{colorama.Fore.YELLOW}WARNING: Tearing down the managed '
'jobs controller. Please be aware of the following:'
@@ -3144,21 +3116,6 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
# controller being STOPPED or being launched for the first time, i.e.,
# there are no in-progress services.
services = []
except exceptions.InconsistentConsolidationModeError:
# If this error is raised, it means the user switched to the
# consolidation mode but the previous controller cluster is still
# running. We should allow the user to tear down the controller
# cluster in this case.
with skypilot_config.override_skypilot_config(
{'serve': {
'controller': {
'consolidation_mode': False
}
}}):
# Check again with the consolidation mode disabled. This is to
# make sure there is no in-progress services.
request_id = serve_lib.status(service_names=None)
services = sdk.stream_and_get(request_id)

if services:
service_names = [service['name'] for service in services]
@@ -6239,33 +6196,6 @@ def local():
is_flag=True,
help='Launch cluster without GPU support even '
'if GPUs are detected on the host.')
@click.option(
'--ips',
type=str,
required=False,
help='Path to the file containing IP addresses of remote machines.')
@click.option('--ssh-user',
type=str,
required=False,
help='SSH username for accessing remote machines.')
@click.option('--ssh-key-path',
type=str,
required=False,
help='Path to the SSH private key.')
@click.option('--cleanup',
is_flag=True,
help='Clean up the remote cluster instead of deploying it.')
@click.option(
'--context-name',
type=str,
required=False,
help='Name to use for the kubeconfig context. Defaults to "default". '
'Used with the ip list.')
@click.option('--password',
type=str,
required=False,
help='Password for the ssh-user to execute sudo commands. '
'Required only if passwordless sudo is not setup.')
@click.option(
'--name',
type=str,
@@ -6282,56 +6212,10 @@ def local():
@flags.config_option(expose_value=False)
@_add_click_options(flags.COMMON_OPTIONS)
@usage_lib.entrypoint
def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
cleanup: bool, context_name: Optional[str],
password: Optional[str], name: Optional[str],
port_start: Optional[int], async_call: bool):
"""Creates a local or remote cluster."""

def _validate_args(ips, ssh_user, ssh_key_path, cleanup):
# If any of --ips, --ssh-user, or --ssh-key-path is specified,
# all must be specified
if bool(ips) or bool(ssh_user) or bool(ssh_key_path):
if not (ips and ssh_user and ssh_key_path):
raise click.BadParameter(
'All --ips, --ssh-user, and --ssh-key-path '
'must be specified together.')

# --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
# are all provided
if cleanup and not (ips and ssh_user and ssh_key_path):
raise click.BadParameter('--cleanup can only be used with '
'--ips, --ssh-user and --ssh-key-path.')

_validate_args(ips, ssh_user, ssh_key_path, cleanup)

# If remote deployment arguments are specified, run remote up script
ip_list = None
ssh_key = None
if ips and ssh_user and ssh_key_path:
# Read and validate IP file
try:
with open(os.path.expanduser(ips), 'r', encoding='utf-8') as f:
ip_list = f.read().strip().splitlines()
if not ip_list:
raise click.BadParameter(f'IP file is empty: {ips}')
except (IOError, OSError) as e:
raise click.BadParameter(f'Failed to read IP file {ips}: {str(e)}')

# Read and validate SSH key file
try:
with open(os.path.expanduser(ssh_key_path), 'r',
encoding='utf-8') as f:
ssh_key = f.read()
if not ssh_key:
raise click.BadParameter(
f'SSH key file is empty: {ssh_key_path}')
except (IOError, OSError) as e:
raise click.BadParameter(
f'Failed to read SSH key file {ssh_key_path}: {str(e)}')

request_id = sdk.local_up(gpus, ip_list, ssh_user, ssh_key, cleanup,
context_name, password, name, port_start)
def local_up(gpus: bool, name: Optional[str], port_start: Optional[int],
async_call: bool):
"""Creates a local cluster."""
request_id = sdk.local_up(gpus, name, port_start)
_async_call_or_wait(request_id, async_call, request_name='local up')


@@ -6344,12 +6228,7 @@ def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
@_add_click_options(flags.COMMON_OPTIONS)
@usage_lib.entrypoint
def local_down(name: Optional[str], async_call: bool):
"""Deletes a local cluster.

This will only delete a local cluster started without the ip list.
To clean up the local cluster started with a ip list, use `sky local up`
with the cleanup flag.
"""
"""Deletes a local cluster."""
request_id = sdk.local_down(name)
_async_call_or_wait(request_id, async_call, request_name='sky.local.down')



+ 2
- 16
sky/client/sdk.py View File

@@ -675,7 +675,7 @@ def _launch(
clusters = get(status_request_id)
cluster_user_hash = common_utils.get_user_hash()
cluster_user_hash_str = ''
current_user = common_utils.get_current_user_name()
current_user = common_utils.get_local_user_name()
cluster_user_name = current_user
if not clusters:
# Show the optimize log before the prompt if the cluster does not
@@ -1712,12 +1712,6 @@ def storage_delete(name: str) -> server_common.RequestId[None]:
@server_common.check_server_healthy_or_start
@annotations.client_api
def local_up(gpus: bool,
ips: Optional[List[str]],
ssh_user: Optional[str],
ssh_key: Optional[str],
cleanup: bool,
context_name: Optional[str] = None,
password: Optional[str] = None,
name: Optional[str] = None,
port_start: Optional[int] = None) -> server_common.RequestId[None]:
"""Launches a Kubernetes cluster on local machines.
@@ -1733,15 +1727,7 @@ def local_up(gpus: bool,
raise ValueError('`sky local up` is only supported when '
'running SkyPilot locally.')

body = payloads.LocalUpBody(gpus=gpus,
ips=ips,
ssh_user=ssh_user,
ssh_key=ssh_key,
cleanup=cleanup,
context_name=context_name,
password=password,
name=name,
port_start=port_start)
body = payloads.LocalUpBody(gpus=gpus, name=name, port_start=port_start)
response = server_common.make_authenticated_request(
'POST', '/local_up', json=json.loads(body.model_dump_json()))
return server_common.get_request_id(response)


+ 3
- 9
sky/client/sdk_async.py View File

@@ -656,19 +656,13 @@ async def storage_delete(
@annotations.client_api
async def local_up(
gpus: bool,
ips: Optional[List[str]],
ssh_user: Optional[str],
ssh_key: Optional[str],
cleanup: bool,
context_name: Optional[str] = None,
name: Optional[str] = None,
password: Optional[str] = None,
port_start: Optional[int] = None,
stream_logs: Optional[StreamConfig] = DEFAULT_STREAM_CONFIG) -> None:
"""Async version of local_up() that launches a Kubernetes cluster on
local machines."""
request_id = await context_utils.to_thread(sdk.local_up, gpus, ips,
ssh_user, ssh_key, cleanup,
context_name, name, password)
request_id = await context_utils.to_thread(sdk.local_up, gpus, name,
port_start)
if stream_logs is not None:
return await _stream_and_get(request_id, stream_logs)
else:


+ 43
- 27
sky/clouds/aws.py View File

@@ -55,26 +55,6 @@ _DEFAULT_GPU_ARM64_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-arm64'
_DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
_DEFAULT_NEURON_IMAGE_ID = 'skypilot:neuron-ubuntu-2204'

# This local file (under ~/.aws/) will be uploaded to remote nodes (any
# cloud), if all of the following conditions hold:
# - the current user identity is not using AWS SSO
# - this file exists
# It has the following purposes:
# - make all nodes (any cloud) able to access private S3 buckets
# - make some remote nodes able to launch new nodes on AWS (i.e., makes
# AWS head node able to launch AWS workers, or any-cloud jobs controller
# able to launch spot clusters on AWS).
#
# If we detect the current user identity is AWS SSO, we will not upload this
# file to any remote nodes (any cloud). Instead, a SkyPilot IAM role is
# assigned to both AWS head and workers.
# TODO(skypilot): This also means we leave open a bug for AWS SSO users that
# use multiple clouds. The non-AWS nodes will have neither the credential
# file nor the ability to understand AWS IAM.
_CREDENTIAL_FILES = [
'credentials',
]

DEFAULT_AMI_GB = 45
DEFAULT_SSH_USER = 'ubuntu'
DEFAULT_ROOT_DEVICE_NAME = '/dev/sda1'
@@ -124,10 +104,25 @@ _EFA_IMAGE_NAME = 'Deep Learning Base OSS Nvidia Driver GPU AMI' \
# For functions that needs caching per AWS profile.
_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE = 5

# Ref: https://docs.aws.amazon.com/cli/v1/userguide/cli-configure-envvars.html
_DEFAULT_AWS_CONFIG_PATH = '~/.aws/credentials'
_AWS_CONFIG_FILE_ENV_VAR = 'AWS_CONFIG_FILE'

T = TypeVar('T')
P = ParamSpec('P')


def _get_credentials_path() -> str:
cred_path = os.getenv(_AWS_CONFIG_FILE_ENV_VAR, None)
if cred_path is not None:
if not os.path.isfile(os.path.expanduser(cred_path)):
raise FileNotFoundError(f'{_AWS_CONFIG_FILE_ENV_VAR}={cred_path},'
' but the file does not exist.')
return cred_path
# Fallback to the default config path.
return _DEFAULT_AWS_CONFIG_PATH


def aws_profile_aware_lru_cache(*lru_cache_args,
scope: Literal['global', 'request'] = 'request',
**lru_cache_kwargs) -> Callable:
@@ -997,8 +992,9 @@ class AWS(clouds.Cloud):
except exceptions.CloudUserIdentityError as e:
return False, None, str(e)

credentials_path = _get_credentials_path()
static_credential_exists = os.path.isfile(
os.path.expanduser('~/.aws/credentials'))
os.path.expanduser(credentials_path))
hints = None
identity_type = cls._current_identity_type()
single_cloud_hint = (
@@ -1049,7 +1045,7 @@ class AWS(clouds.Cloud):
# other clouds to access private s3 buckets and resources like EC2.
# `get_active_user_identity` does not guarantee this file exists.
if not static_credential_exists:
return (False, None, '~/.aws/credentials does not exist. ' +
return (False, None, f'{credentials_path} does not exist. ' +
cls._STATIC_CREDENTIAL_HELP_STR)

return True, identity_str, hints
@@ -1290,11 +1286,31 @@ class AWS(clouds.Cloud):
if self._current_identity_type(
) != AWSIdentityType.SHARED_CREDENTIALS_FILE:
return {}
return {
f'~/.aws/{filename}': f'~/.aws/{filename}'
for filename in _CREDENTIAL_FILES
if os.path.exists(os.path.expanduser(f'~/.aws/{filename}'))
}

# This local credentials file (default to ~/.aws/credentials and can be
# overridden by AWS_CONFIG_FILE environment variable) will be uploaded
# to remote nodes (any cloud), if all of the following conditions hold:
# - the current user identity is not using AWS SSO
# - this file exists
# It has the following purposes:
# - make all nodes (any cloud) able to access private S3 buckets
# - make some remote nodes able to launch new nodes on AWS (i.e., makes
# AWS head node able to launch AWS workers, or any-cloud jobs controller
# able to launch spot clusters on AWS).
#
# If we detect the current user identity is AWS SSO, we will not upload this
# file to any remote nodes (any cloud). Instead, a SkyPilot IAM role is
# assigned to both AWS head and workers.
# TODO(skypilot): This also means we leave open a bug for AWS SSO users that
# use multiple clouds. The non-AWS nodes will have neither the credential
# file nor the ability to understand AWS IAM.
credentials_path = os.path.expanduser(_get_credentials_path())
if os.path.exists(credentials_path):
return {
# Upload to the default config location on remote cluster.
_DEFAULT_AWS_CONFIG_PATH: credentials_path
}
return {}

@aws_profile_aware_lru_cache(scope='request',
maxsize=_AWS_PROFILE_SCOPED_FUNC_CACHE_SIZE)
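
Taken together, these hunks let the uploaded credential file come from a custom `AWS_CONFIG_FILE` location while still landing at the default `~/.aws/credentials` path on the remote cluster. A condensed sketch of the resulting mapping (the strict error for a missing override file is omitted):

```python
import os

_DEFAULT_AWS_CONFIG_PATH = '~/.aws/credentials'
_AWS_CONFIG_FILE_ENV_VAR = 'AWS_CONFIG_FILE'


def credential_file_mounts() -> dict:
    """Map the (possibly overridden) local credentials file to the default
    remote location; upload nothing if the file does not exist."""
    local_path = os.path.expanduser(
        os.getenv(_AWS_CONFIG_FILE_ENV_VAR, _DEFAULT_AWS_CONFIG_PATH))
    if os.path.exists(local_path):
        return {_DEFAULT_AWS_CONFIG_PATH: local_path}
    return {}
```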


+ 2
- 2
sky/clouds/gcp.py View File

@@ -1186,8 +1186,8 @@ class GCP(clouds.Cloud):
# These series don't support pd-standard, use pd-balanced for LOW.
_propagate_disk_type(
lowest=tier2name[resources_utils.DiskTier.MEDIUM])
if instance_type.startswith('a3-ultragpu') or series == 'n4':
# a3-ultragpu instances only support hyperdisk-balanced.
if instance_type.startswith('a3-ultragpu') or series in ('n4', 'a4'):
# a3-ultragpu, n4, and a4 instances only support hyperdisk-balanced.
_propagate_disk_type(all='hyperdisk-balanced')

# Series specific handling


+ 2
- 36
sky/core.py View File

@@ -1293,44 +1293,10 @@ def realtime_kubernetes_gpu_availability(
# =================
@usage_lib.entrypoint
def local_up(gpus: bool,
ips: Optional[List[str]],
ssh_user: Optional[str],
ssh_key: Optional[str],
cleanup: bool,
context_name: Optional[str] = None,
password: Optional[str] = None,
name: Optional[str] = None,
port_start: Optional[int] = None) -> None:
"""Creates a local or remote cluster."""

def _validate_args(ips, ssh_user, ssh_key, cleanup):
# If any of --ips, --ssh-user, or --ssh-key-path is specified,
# all must be specified
if bool(ips) or bool(ssh_user) or bool(ssh_key):
if not (ips and ssh_user and ssh_key):
with ux_utils.print_exception_no_traceback():
raise ValueError(
'All ips, ssh_user, and ssh_key must be specified '
'together.')

# --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
# are all provided
if cleanup and not (ips and ssh_user and ssh_key):
with ux_utils.print_exception_no_traceback():
raise ValueError(
'cleanup can only be used with ips, ssh_user and ssh_key.')

_validate_args(ips, ssh_user, ssh_key, cleanup)

# If remote deployment arguments are specified, run remote up script
if ips:
assert ssh_user is not None and ssh_key is not None
kubernetes_deploy_utils.deploy_remote_cluster(ips, ssh_user, ssh_key,
cleanup, context_name,
password)
else:
# Run local deployment (kind) if no remote args are specified
kubernetes_deploy_utils.deploy_local_cluster(name, port_start, gpus)
"""Creates a local cluster."""
kubernetes_deploy_utils.deploy_local_cluster(name, port_start, gpus)


def local_down(name: Optional[str] = None) -> None:


+ 17
- 4
sky/dashboard/src/components/infra.jsx View File

@@ -84,6 +84,7 @@ export function InfrastructureSection({
isSSH = false, // To differentiate between SSH and Kubernetes
actionButton = null, // Optional action button for the header
contextWorkspaceMap = {}, // Mapping of contexts to workspaces
contextErrors = {}, // Mapping of contexts to error messages
}) {
// Add defensive check for contexts
const safeContexts = contexts || [];
@@ -289,15 +290,19 @@ export function InfrastructureSection({
) : (
<span
className={
nodes.length === 0 ? 'text-gray-400' : ''
nodes.length === 0 && contextErrors[context]
? 'text-gray-400'
: ''
}
title={
nodes.length === 0
? 'Context may be unavailable or timed out'
nodes.length === 0 && contextErrors[context]
? contextErrors[context]
: ''
}
>
{nodes.length === 0 ? '0*' : nodes.length}
{nodes.length === 0 && contextErrors[context]
? '0*'
: nodes.length}
</span>
)}
</td>
@@ -1615,6 +1620,7 @@ export function GPUs() {
const [enabledClouds, setEnabledClouds] = useState(0);
const [contextStats, setContextStats] = useState({});
const [contextWorkspaceMap, setContextWorkspaceMap] = useState({});
const [contextErrors, setContextErrors] = useState({});

// Workspace-aware infrastructure state
const [workspaceInfrastructure, setWorkspaceInfrastructure] = useState({});
@@ -1668,6 +1674,7 @@ export function GPUs() {
setPerNodeGPUs([]);
setContextStats({});
setContextWorkspaceMap({});
setContextErrors({});
setAvailableWorkspaces([]);
setKubeDataLoaded(true);
setKubeLoading(false);
@@ -1714,6 +1721,7 @@ export function GPUs() {
perNodeGPUs: fetchedPerNodeGPUs,
contextStats: fetchedContextStats,
contextWorkspaceMap: fetchedContextWorkspaceMap,
contextErrors: fetchedContextErrors,
} = infraData;

setWorkspaceInfrastructure(fetchedWorkspaceInfrastructure || {});
@@ -1723,6 +1731,7 @@ export function GPUs() {
setPerNodeGPUs(fetchedPerNodeGPUs || []);
setContextStats(fetchedContextStats || {});
setContextWorkspaceMap(fetchedContextWorkspaceMap || {});
setContextErrors(fetchedContextErrors || {});

// Extract available workspaces from the workspace infrastructure data
const workspaceNames = Object.keys(
@@ -1740,6 +1749,7 @@ export function GPUs() {
setPerNodeGPUs([]);
setContextStats({});
setContextWorkspaceMap({});
setContextErrors({});
setAvailableWorkspaces([]);
setKubeDataLoaded(true);
setKubeLoading(false);
@@ -1753,6 +1763,7 @@ export function GPUs() {
setPerNodeGPUs([]);
setContextStats({});
setContextWorkspaceMap({});
setContextErrors({});
setAvailableWorkspaces([]);
setKubeDataLoaded(true);
setKubeLoading(false);
@@ -2306,6 +2317,7 @@ export function GPUs() {
isJobsDataLoading={sshAndKubeJobsDataLoading}
isSSH={true}
contextWorkspaceMap={contextWorkspaceMap}
contextErrors={contextErrors}
actionButton={
// TODO: Add back when SSH Node Pool add operation is more robust
// <button
@@ -2337,6 +2349,7 @@ export function GPUs() {
isJobsDataLoading={sshAndKubeJobsDataLoading}
isSSH={false}
contextWorkspaceMap={contextWorkspaceMap}
contextErrors={contextErrors}
/>
);
};


+ 11
- 3
sky/dashboard/src/components/workspaces.jsx View File

@@ -126,6 +126,7 @@ export async function getWorkspaceManagedJobs(workspaceName) {
throw new Error(msg);
}
const fetchedData = await apiClient.get(`/api/get?request_id=${id}`);
let errorMessage = fetchedData.statusText;
if (fetchedData.status === 500) {
try {
const data = await fetchedData.json();
@@ -135,17 +136,24 @@ export async function getWorkspaceManagedJobs(workspaceName) {
// Handle specific error types
if (error.type && error.type === CLUSTER_NOT_UP_ERROR) {
return { jobs: [] };
} else {
errorMessage = error.message || String(data.detail.error);
}
} catch (jsonError) {
console.error('Error parsing JSON:', jsonError);
console.error(
'Error parsing JSON from data.detail.error:',
jsonError
);
errorMessage = String(data.detail.error);
}
}
} catch (parseError) {
console.error('Error parsing JSON:', parseError);
console.error('Error parsing response JSON:', parseError);
errorMessage = String(parseError);
}
}
if (!fetchedData.ok) {
const msg = `API request to get managed jobs result failed with status ${fetchedData.status} for workspace ${workspaceName}`;
const msg = `API request to get managed jobs result failed with status ${fetchedData.status}, error: ${errorMessage} for workspace ${workspaceName}`;
throw new Error(msg);
}
const data = await fetchedData.json();


+ 3
- 1
sky/dashboard/src/data/connectors/client.js View File

@@ -1,5 +1,6 @@
'use client';

import { getErrorMessageFromResponse } from '@/data/utils';
import { ENDPOINT } from './constants';

export const apiClient = {
@@ -44,7 +45,8 @@ export const apiClient = {

// Handle all error status codes (4xx, 5xx, etc.)
if (!fetchedData.ok) {
const msg = `API request to get ${path} result failed with status ${fetchedData.status}`;
const errorMessage = await getErrorMessageFromResponse(fetchedData);
const msg = `API request to get ${path} result failed with status ${fetchedData.status}, error: ${errorMessage}`;
throw new Error(msg);
}



+ 35
- 41
sky/dashboard/src/data/connectors/infra.jsx View File

@@ -2,6 +2,7 @@ import { CLOUDS_LIST, COMMON_GPUS } from '@/data/connectors/constants';

// Importing from the same directory
import { apiClient } from '@/data/connectors/client';
import { getErrorMessageFromResponse } from '@/data/utils';

export async function getCloudInfrastructure(forceRefresh = false) {
const dashboardCache = (await import('@/lib/cache')).default;
@@ -54,7 +55,7 @@ export async function getCloudInfrastructure(forceRefresh = false) {
`/api/get?request_id=${checkId}`
);
if (!checkResult.ok) {
const msg = `Failed to get sky check result with status ${checkResult.status}`;
const msg = `Failed to get sky check result with status ${checkResult.status}, error: ${checkResult.statusText}`;
throw new Error(msg);
}
const checkData = await checkResult.json();
@@ -206,6 +207,7 @@ export async function getWorkspaceInfrastructure() {
perNodeGPUs: [],
contextStats: {},
contextWorkspaceMap: {},
contextErrors: {},
};
}

@@ -324,6 +326,7 @@ export async function getWorkspaceInfrastructure() {
allGPUs: [],
perContextGPUs: [],
perNodeGPUs: [],
contextErrors: {},
};
try {
gpuData = await getKubernetesGPUsFromContexts(validContexts);
@@ -339,6 +342,7 @@ export async function getWorkspaceInfrastructure() {
perNodeGPUs: gpuData.perNodeGPUs || [],
contextStats: contextStats,
contextWorkspaceMap: contextWorkspaceMap,
contextErrors: gpuData.contextErrors || {},
};

console.log('[DEBUG] Final result:', finalResult);
@@ -361,21 +365,39 @@ async function getKubernetesGPUsFromContexts(contextNames) {
allGPUs: [],
perContextGPUs: [],
perNodeGPUs: [],
contextErrors: {},
};
}

const allGPUsSummary = {};
const perContextGPUsData = {};
const perNodeGPUs_dict = {};
const contextErrors = {};

// Get all of the node info for all contexts in parallel and put them
// in a dictionary keyed by context name.
const contextNodeInfoList = await Promise.all(
// Use Promise.allSettled to handle partial failures gracefully
const contextNodeInfoResults = await Promise.allSettled(
contextNames.map((context) => getKubernetesPerNodeGPUs(context))
);
const contextToNodeInfo = {};
for (let i = 0; i < contextNames.length; i++) {
contextToNodeInfo[contextNames[i]] = contextNodeInfoList[i];
const result = contextNodeInfoResults[i];
if (result.status === 'fulfilled') {
contextToNodeInfo[contextNames[i]] = result.value;
} else {
// Log the error but continue with other contexts
const errorMessage =
result.reason?.message ||
(typeof result.reason === 'string' && result.reason) ||
'Context may be unavailable or timed out';
console.warn(
`Failed to get node info for context ${contextNames[i]}:`,
result.reason
);
contextToNodeInfo[contextNames[i]] = {};
contextErrors[contextNames[i]] = errorMessage;
}
}

// Populate the gpuToData map for each context.
@@ -509,6 +531,7 @@ async function getKubernetesGPUsFromContexts(contextNames) {
a.node_name.localeCompare(b.node_name) ||
a.gpu_name.localeCompare(b.gpu_name)
),
contextErrors: contextErrors,
};
} catch (error) {
console.error('[infra.jsx] Error in getKubernetesGPUsFromContexts:', error);
@@ -522,7 +545,7 @@ async function getKubernetesPerNodeGPUs(context) {
context: context,
});
if (!response.ok) {
const msg = `Failed to get kubernetes node info with status ${response.status}`;
const msg = `Failed to get kubernetes node info for context ${context} with status ${response.status}, error: ${response.statusText}`;
throw new Error(msg);
}
const id =
@@ -533,24 +556,9 @@ async function getKubernetesPerNodeGPUs(context) {
throw new Error(msg);
}
const fetchedData = await apiClient.get(`/api/get?request_id=${id}`);
if (fetchedData.status === 500) {
try {
const data = await fetchedData.json();
if (data.detail && data.detail.error) {
try {
const error = JSON.parse(data.detail.error);
const msg = `Context ${context} unavailable: ${error.message}`;
throw new Error(msg);
} catch (jsonError) {
console.error('Error parsing JSON:', jsonError);
}
}
} catch (parseError) {
console.error('Error parsing JSON:', parseError);
}
}
if (!fetchedData.ok) {
const msg = `Failed to get kubernetes node info result with status ${fetchedData.status}`;
const errorMessage = await getErrorMessageFromResponse(fetchedData);
const msg = `Failed to get kubernetes node info result for context ${context} with status ${fetchedData.status}, error: ${errorMessage}`;
throw new Error(msg);
}
const data = await fetchedData.json();
@@ -661,7 +669,7 @@ export async function getCloudGPUs() {
gpus_only: true,
});
if (!response.ok) {
const msg = `Failed to get cloud GPUs with status ${response.status}`;
const msg = `Failed to get cloud GPUs with status ${response.status}, error: ${response.statusText}`;
throw new Error(msg);
}
const id =
@@ -672,24 +680,9 @@ export async function getCloudGPUs() {
throw new Error(msg);
}
const fetchedData = await apiClient.get(`/api/get?request_id=${id}`);
if (fetchedData.status === 500) {
try {
const data = await fetchedData.json();
if (data.detail && data.detail.error) {
try {
const error = JSON.parse(data.detail.error);
const msg = `Error fetching cloud GPUs: ${error.message}`;
throw new Error(msg);
} catch (jsonError) {
console.error('Error parsing JSON:', jsonError);
}
}
} catch (parseError) {
console.error('Error parsing JSON:', parseError);
}
}
if (!fetchedData.ok) {
const msg = `Failed to get cloud GPUs result with status ${fetchedData.status}`;
const errorMessage = await getErrorMessageFromResponse(fetchedData);
const msg = `Failed to get cloud GPUs result with status ${fetchedData.status}, error: ${errorMessage}`;
throw new Error(msg);
}
const data = await fetchedData.json();
@@ -753,7 +746,7 @@ export async function getDetailedGpuInfo(filter) {
all_regions: true,
});
if (!response.ok) {
const msg = `Failed to get detailed GPU info with status ${response.status}`;
const msg = `Failed to get detailed GPU info with status ${response.status}, error: ${response.statusText}`;
throw new Error(msg);
}
const id =
@@ -765,7 +758,8 @@ export async function getDetailedGpuInfo(filter) {
}
const fetchedData = await apiClient.get(`/api/get?request_id=${id}`);
if (!fetchedData.ok) {
const msg = `Failed to get detailed GPU info result with status ${fetchedData.status}`;
const errorMessage = await getErrorMessageFromResponse(fetchedData);
const msg = `Failed to get detailed GPU info result with status ${fetchedData.status}, error: ${errorMessage}`;
throw new Error(msg);
}



+ 22
- 7
sky/dashboard/src/data/connectors/jobs.jsx View File

@@ -84,6 +84,7 @@ export async function getManagedJobs(options = {}) {
throw new Error(msg);
}
const fetchedData = await apiClient.get(`/api/get?request_id=${id}`);
let errorMessage = fetchedData.statusText;
if (fetchedData.status === 500) {
try {
const data = await fetchedData.json();
@@ -93,18 +94,25 @@ export async function getManagedJobs(options = {}) {
// Handle specific error types
if (error.type && error.type === CLUSTER_NOT_UP_ERROR) {
return { jobs: [], total: 0, controllerStopped: true };
} else {
errorMessage = error.message || String(data.detail.error);
}
} catch (jsonError) {
console.error('Error parsing JSON:', jsonError);
console.error(
'Error parsing JSON from data.detail.error:',
jsonError
);
errorMessage = String(data.detail.error);
}
}
} catch (parseError) {
console.error('Error parsing JSON:', parseError);
console.error('Error parsing response JSON:', parseError);
errorMessage = String(parseError);
}
}
// Handle all error status codes (4xx, 5xx, etc.)
if (!fetchedData.ok) {
const msg = `API request to get managed jobs result failed with status ${fetchedData.status}`;
const msg = `API request to get managed jobs result failed with status ${fetchedData.status}, error: ${errorMessage}`;
throw new Error(msg);
}
// print out the response for debugging
@@ -323,7 +331,7 @@ export async function getPoolStatus() {
throw new Error(msg);
}
const fetchedData = await apiClient.get(`/api/get?request_id=${id}`);
let errorMessage = fetchedData.statusText;
if (fetchedData.status === 500) {
try {
const data = await fetchedData.json();
@@ -332,18 +340,25 @@ export async function getPoolStatus() {
const error = JSON.parse(data.detail.error);
if (error.type && error.type === CLUSTER_NOT_UP_ERROR) {
return { pools: [], controllerStopped: true };
} else {
errorMessage = error.message || String(data.detail.error);
}
} catch (jsonError) {
console.error('Failed to parse error JSON:', jsonError);
console.error(
'Error parsing JSON from data.detail.error:',
jsonError
);
errorMessage = String(data.detail.error);
}
}
} catch (dataError) {
console.error('Failed to parse response JSON:', dataError);
console.error('Error parsing response JSON:', dataError);
errorMessage = String(dataError);
}
}

if (!fetchedData.ok) {
const msg = `API request to get pool status result failed with status ${fetchedData.status}`;
const msg = `API request to get pool status result failed with status ${fetchedData.status}, error: ${errorMessage}`;
throw new Error(msg);
}



+ 3
- 1
sky/dashboard/src/data/connectors/volumes.js View File

@@ -1,4 +1,5 @@
import { apiClient } from '@/data/connectors/client';
import { getErrorMessageFromResponse } from '@/data/utils';

export async function getVolumes() {
try {
@@ -73,7 +74,8 @@ export async function deleteVolume(volumeName) {
}
const fetchedData = await apiClient.get(`/api/get?request_id=${id}`);
if (!fetchedData.ok) {
msg = `Failed to delete volume with status ${fetchedData.status}`;
const errorMessage = await getErrorMessageFromResponse(fetchedData);
msg = `Failed to delete volume with status ${fetchedData.status}, error: ${errorMessage}`;
console.error(msg);
return { success: false, msg: msg };
}


+ 32
- 0
sky/dashboard/src/data/utils.jsx View File

@@ -27,3 +27,35 @@ export function sortData(data, accessor, direction) {
return 0;
});
}

/**
* Extracts error message from API response, handling nested JSON parsing
* @param {Response} fetchedData - The API response object
* @returns {Promise<string>} The extracted error message
*/
export async function getErrorMessageFromResponse(fetchedData) {
let errorMessage = fetchedData.statusText;

if (fetchedData.status === 500) {
try {
const data = await fetchedData.json();
if (data.detail && data.detail.error) {
try {
const error = JSON.parse(data.detail.error);
errorMessage = error.message || String(data.detail.error);
} catch (jsonError) {
console.error(
'Error parsing JSON from data.detail.error:',
jsonError
);
errorMessage = String(data.detail.error);
}
}
} catch (parseError) {
console.error('Error parsing response JSON:', parseError);
errorMessage = String(parseError);
}
}

return errorMessage;
}

+ 0
- 6
sky/exceptions.py View File

@@ -208,12 +208,6 @@ class InconsistentHighAvailabilityError(Exception):
pass


class InconsistentConsolidationModeError(Exception):
"""Raised when the consolidation mode property in the user config
is inconsistent with the actual cluster."""
pass


class ProvisionPrechecksError(Exception):
"""Raised when a managed job fails prechecks before provision.



+ 4
- 1
sky/execution.py View File

@@ -15,6 +15,7 @@ from sky import clouds
from sky import global_user_state
from sky import optimizer
from sky import sky_logging
from sky import task as task_lib
from sky.backends import backend_utils
from sky.server.requests import request_names
from sky.skylet import autostop_lib
@@ -478,7 +479,9 @@ def _execute_dag(
cluster_name, status_lib.ClusterStatus.INIT,
'Syncing files to cluster',
global_user_state.ClusterEventType.STATUS_CHANGE)
backend.sync_workdir(handle, task.workdir, task.envs_and_secrets)
envs_and_secrets = task_lib.get_plaintext_envs_and_secrets(
task.envs_and_secrets)
backend.sync_workdir(handle, task.workdir, envs_and_secrets)

if do_file_mounts:
if cluster_name is not None:


+ 3
- 3
sky/global_user_state.py View File

@@ -2241,7 +2241,7 @@ def get_volumes(is_ephemeral: Optional[bool] = None) -> List[Dict[str, Any]]:
rows = session.query(volume_table).all()
else:
rows = session.query(volume_table).filter_by(
is_ephemeral=is_ephemeral).all()
is_ephemeral=int(is_ephemeral)).all()
records = []
for row in rows:
records.append({
@@ -2253,7 +2253,7 @@ def get_volumes(is_ephemeral: Optional[bool] = None) -> List[Dict[str, Any]]:
'last_attached_at': row.last_attached_at,
'last_use': row.last_use,
'status': status_lib.VolumeStatus[row.status],
'is_ephemeral': row.is_ephemeral,
'is_ephemeral': bool(row.is_ephemeral),
})
return records

@@ -2316,7 +2316,7 @@ def add_volume(
last_attached_at=last_attached_at,
last_use=last_use,
status=status.value,
is_ephemeral=is_ephemeral,
is_ephemeral=int(is_ephemeral),
)
do_update_stmt = insert_stmnt.on_conflict_do_nothing()
session.execute(do_update_stmt)


+ 37
- 45
sky/jobs/log_gc.py View File

@@ -1,13 +1,12 @@
"""Log garbage collection for managed jobs."""

import asyncio
from datetime import datetime
import os
import pathlib
import shutil
import threading
import time

import anyio
import filelock

from sky import sky_logging
@@ -16,7 +15,6 @@ from sky.jobs import constants as managed_job_constants
from sky.jobs import state as managed_job_state
from sky.jobs import utils as managed_job_utils
from sky.utils import context
from sky.utils import context_utils

logger = sky_logging.init_logger(__name__)

@@ -40,7 +38,7 @@ def _next_gc_interval(retention_seconds: int) -> int:
_MOST_FREQUENT_GC_INTERVAL_SECONDS)


async def gc_controller_logs_for_job():
def gc_controller_logs_for_job():
"""Garbage collect job and controller logs."""
while True:
skypilot_config.reload_config()
@@ -54,11 +52,8 @@ async def gc_controller_logs_for_job():
try:
finished = False
while not finished:
finished = await _clean_controller_logs_with_retention(
finished = _clean_controller_logs_with_retention(
controller_logs_retention)
except asyncio.CancelledError:
logger.info('Managed jobs logs GC task cancelled')
break
except Exception as e: # pylint: disable=broad-except
logger.error(f'Error GC controller logs for job: {e}',
exc_info=True)
@@ -68,10 +63,10 @@ async def gc_controller_logs_for_job():
interval = _next_gc_interval(controller_logs_retention)
logger.info('Next controller logs GC is scheduled after '
f'{interval} seconds')
await asyncio.sleep(interval)
time.sleep(interval)


async def gc_task_logs_for_job():
def gc_task_logs_for_job():
"""Garbage collect task logs for job."""
while True:
skypilot_config.reload_config()
@@ -85,11 +80,8 @@ async def gc_task_logs_for_job():
try:
finished = False
while not finished:
finished = await _clean_task_logs_with_retention(
finished = _clean_task_logs_with_retention(
task_logs_retention)
except asyncio.CancelledError:
logger.info('Task logs GC task cancelled')
break
except Exception as e: # pylint: disable=broad-except
logger.error(f'Error GC task logs for job: {e}', exc_info=True)
else:
@@ -97,11 +89,11 @@ async def gc_task_logs_for_job():

interval = _next_gc_interval(task_logs_retention)
logger.info(f'Next task logs GC is scheduled after {interval} seconds')
await asyncio.sleep(_next_gc_interval(task_logs_retention))
time.sleep(_next_gc_interval(task_logs_retention))


async def _clean_controller_logs_with_retention(retention_seconds: int,
batch_size: int = 100):
def _clean_controller_logs_with_retention(retention_seconds: int,
batch_size: int = 100):
"""Clean controller logs with retention.

Returns:
@@ -109,14 +101,14 @@ async def _clean_controller_logs_with_retention(retention_seconds: int,
still be more controller logs to clean.
"""
assert batch_size > 0, 'Batch size must be positive'
jobs = await managed_job_state.get_controller_logs_to_clean_async(
retention_seconds, batch_size=batch_size)
jobs = managed_job_state.get_controller_logs_to_clean(retention_seconds,
batch_size=batch_size)
job_ids_to_update = []
for job in jobs:
job_ids_to_update.append(job['job_id'])
log_file = managed_job_utils.controller_log_file_for_job(job['job_id'])
cleaned_at = time.time()
if await anyio.Path(log_file).exists():
if os.path.exists(log_file):
ts_str = datetime.fromtimestamp(cleaned_at).strftime(
'%Y-%m-%d %H:%M:%S')
msg = f'Controller log has been cleaned at {ts_str}.'
@@ -124,20 +116,19 @@ async def _clean_controller_logs_with_retention(retention_seconds: int,
# keep the file and delete the content.
# TODO(aylei): refactor sync down logs if the inode usage
# becomes an issue.
async with await anyio.open_file(log_file, 'w',
encoding='utf-8') as f:
await f.write(msg + '\n')
with open(log_file, 'w', encoding='utf-8') as f:
f.write(msg + '\n')
# Batch the update, the timestamp will be not accurate but it's okay.
await managed_job_state.set_controller_logs_cleaned_async(
job_ids=job_ids_to_update, logs_cleaned_at=time.time())
managed_job_state.set_controller_logs_cleaned(job_ids=job_ids_to_update,
logs_cleaned_at=time.time())
complete = len(jobs) < batch_size
logger.info(f'Cleaned {len(jobs)} controller logs with retention '
f'{retention_seconds} seconds, complete: {complete}')
return complete


async def _clean_task_logs_with_retention(retention_seconds: int,
batch_size: int = 100):
def _clean_task_logs_with_retention(retention_seconds: int,
batch_size: int = 100):
"""Clean task logs with retention.

Returns:
@@ -145,11 +136,11 @@ async def _clean_task_logs_with_retention(retention_seconds: int,
still be more task logs to clean.
"""
assert batch_size > 0, 'Batch size must be positive'
tasks = await managed_job_state.get_task_logs_to_clean_async(
retention_seconds, batch_size=batch_size)
tasks = managed_job_state.get_task_logs_to_clean(retention_seconds,
batch_size=batch_size)
tasks_to_update = []
for task in tasks:
local_log_file = anyio.Path(task['local_log_file'])
local_log_file = pathlib.Path(task['local_log_file'])
# We assume the log directory has the following layout:
# task-id/
# - run.log
@@ -157,36 +148,37 @@ async def _clean_task_logs_with_retention(retention_seconds: int,
# - run.log
# and also remove the tasks directory on cleanup.
task_log_dir = local_log_file.parent.joinpath('tasks')
await local_log_file.unlink(missing_ok=True)
await context_utils.to_thread(shutil.rmtree,
str(task_log_dir),
ignore_errors=True)
local_log_file.unlink(missing_ok=True)
shutil.rmtree(task_log_dir, ignore_errors=True)
# We have at least once semantic guarantee for the cleanup here.
tasks_to_update.append((task['job_id'], task['task_id']))
await managed_job_state.set_task_logs_cleaned_async(
tasks=list(tasks_to_update), logs_cleaned_at=time.time())
managed_job_state.set_task_logs_cleaned(tasks=list(tasks_to_update),
logs_cleaned_at=time.time())
complete = len(tasks) < batch_size
logger.info(f'Cleaned {len(tasks)} task logs with retention '
f'{retention_seconds} seconds, complete: {complete}')
return complete


@context.contextual_async
async def run_log_gc():
@context.contextual
def run_log_gc():
"""Run the log garbage collector."""
log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
os.makedirs(log_dir, exist_ok=True)
log_path = os.path.join(log_dir, 'garbage_collector.log')
# Remove previous log file
await anyio.Path(log_path).unlink(missing_ok=True)
pathlib.Path(log_path).unlink(missing_ok=True)
ctx = context.get()
assert ctx is not None, 'Context is not initialized'
ctx.redirect_log(pathlib.Path(log_path))
gc_controller_logs_for_job_task = asyncio.create_task(
gc_controller_logs_for_job())
gc_task_logs_for_job_task = asyncio.create_task(gc_task_logs_for_job())
await asyncio.gather(gc_controller_logs_for_job_task,
gc_task_logs_for_job_task)
tasks = []
tasks.append(
threading.Thread(target=gc_controller_logs_for_job, daemon=True))
tasks.append(threading.Thread(target=gc_task_logs_for_job, daemon=True))
for task in tasks:
task.start()
for task in tasks:
task.join()


def elect_for_log_gc():
@@ -198,4 +190,4 @@ def elect_for_log_gc():
on the filelock and bring trivial overhead.
"""
with filelock.FileLock(_JOB_CONTROLLER_GC_LOCK_PATH):
asyncio.run(run_log_gc())
run_log_gc()
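
The GC now runs as plain blocking loops on daemon threads instead of asyncio tasks on the main event loop. A self-contained sketch of that structure; the cleaner callables are stand-ins for the batch cleaners above:

```python
import threading
import time


def _gc_loop(clean_one_batch, interval_seconds: int) -> None:
    """Clean in batches until a round completes, then sleep until the next."""
    while True:
        try:
            finished = False
            while not finished:
                finished = clean_one_batch()
        except Exception as exc:  # pylint: disable=broad-except
            print(f'log GC error: {exc}')
        time.sleep(interval_seconds)


def run_log_gc(clean_controller_logs, clean_task_logs) -> None:
    threads = [
        threading.Thread(target=_gc_loop, args=(clean_controller_logs, 3600),
                         daemon=True),
        threading.Thread(target=_gc_loop, args=(clean_task_logs, 3600),
                         daemon=True),
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        # Keep the caller (which holds the GC file lock) alive.
        thread.join()
```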

+ 27
- 31
sky/jobs/state.py View File

@@ -662,6 +662,9 @@ class ManagedJobScheduleState(enum.Enum):
"""
protobuf_to_enum = {
managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_UNSPECIFIED: None,
# TODO(cooperc): remove this in v0.13.0. See #8105.
managed_jobsv1_pb2.DEPRECATED_MANAGED_JOB_SCHEDULE_STATE_INVALID:
(None),
managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_INACTIVE:
cls.INACTIVE,
managed_jobsv1_pb2.MANAGED_JOB_SCHEDULE_STATE_WAITING: cls.WAITING,
@@ -2410,20 +2413,19 @@ def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
return job_ids


@_init_db_async
async def get_task_logs_to_clean_async(retention_seconds: int,
batch_size) -> List[Dict[str, Any]]:
@_init_db
def get_task_logs_to_clean(retention_seconds: int,
batch_size: int) -> List[Dict[str, Any]]:
"""Get the logs of job tasks to clean.

The logs of a task will only be cleaned when:
- the job schedule state is DONE
- AND the end time of the task is older than the retention period
"""

assert _SQLALCHEMY_ENGINE_ASYNC is not None
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
assert _SQLALCHEMY_ENGINE is not None
with orm.Session(_SQLALCHEMY_ENGINE) as session:
now = time.time()
result = await session.execute(
result = session.execute(
sqlalchemy.select(
spot_table.c.spot_job_id,
spot_table.c.task_id,
@@ -2453,21 +2455,19 @@ async def get_task_logs_to_clean_async(retention_seconds: int,
} for row in rows]


@_init_db_async
async def get_controller_logs_to_clean_async(
retention_seconds: int, batch_size: int) -> List[Dict[str, Any]]:
@_init_db
def get_controller_logs_to_clean(retention_seconds: int,
batch_size: int) -> List[Dict[str, Any]]:
"""Get the controller logs to clean.

The controller logs will only be cleaned when:
- the job schedule state is DONE
- AND the end time of the latest task is older than the retention period
"""

assert _SQLALCHEMY_ENGINE_ASYNC is not None
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
assert _SQLALCHEMY_ENGINE is not None
with orm.Session(_SQLALCHEMY_ENGINE) as session:
now = time.time()

result = await session.execute(
result = session.execute(
sqlalchemy.select(job_info_table.c.spot_job_id,).select_from(
job_info_table.join(
spot_table,
@@ -2490,36 +2490,32 @@ async def get_controller_logs_to_clean_async(
return [{'job_id': row[0]} for row in rows]


@_init_db_async
async def set_task_logs_cleaned_async(tasks: List[Tuple[int, int]],
logs_cleaned_at: float):
@_init_db
def set_task_logs_cleaned(tasks: List[Tuple[int, int]], logs_cleaned_at: float):
"""Set the task logs cleaned at."""
if not tasks:
return
# Deduplicate
task_keys = list(dict.fromkeys(tasks))
assert _SQLALCHEMY_ENGINE_ASYNC is not None
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
await session.execute(
assert _SQLALCHEMY_ENGINE is not None
with orm.Session(_SQLALCHEMY_ENGINE) as session:
session.execute(
sqlalchemy.update(spot_table).where(
sqlalchemy.tuple_(spot_table.c.spot_job_id,
spot_table.c.task_id).in_(task_keys)).values(
logs_cleaned_at=logs_cleaned_at))
await session.commit()
session.commit()


@_init_db_async
async def set_controller_logs_cleaned_async(job_ids: List[int],
logs_cleaned_at: float):
@_init_db
def set_controller_logs_cleaned(job_ids: List[int], logs_cleaned_at: float):
"""Set the controller logs cleaned at."""
if not job_ids:
return
# Deduplicate
job_ids = list(dict.fromkeys(job_ids))
assert _SQLALCHEMY_ENGINE_ASYNC is not None
async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
await session.execute(
assert _SQLALCHEMY_ENGINE is not None
with orm.Session(_SQLALCHEMY_ENGINE) as session:
session.execute(
sqlalchemy.update(job_info_table).where(
job_info_table.c.spot_job_id.in_(job_ids)).values(
controller_logs_cleaned_at=logs_cleaned_at))
await session.commit()
session.commit()
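
The queries themselves are unchanged; only the session handling moves from the async engine to synchronous `orm.Session` usage. A minimal, runnable sketch of that pattern against an illustrative table (not the real schema):

```python
import sqlalchemy
from sqlalchemy import orm

engine = sqlalchemy.create_engine('sqlite:///:memory:')
metadata = sqlalchemy.MetaData()
job_info = sqlalchemy.Table(
    'job_info', metadata,
    sqlalchemy.Column('spot_job_id', sqlalchemy.Integer, primary_key=True),
    sqlalchemy.Column('controller_logs_cleaned_at', sqlalchemy.Float))
metadata.create_all(engine)

with orm.Session(engine) as session:
    session.execute(sqlalchemy.insert(job_info).values(spot_job_id=1))
    # Synchronous equivalents of the former `await session.execute(...)` calls.
    session.execute(
        sqlalchemy.update(job_info).where(
            job_info.c.spot_job_id.in_([1])).values(
                controller_logs_cleaned_at=1234.5))
    session.commit()
    print(session.execute(sqlalchemy.select(job_info)).fetchall())
```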

+ 13
- 20
sky/jobs/utils.py View File

@@ -186,13 +186,11 @@ def _validate_consolidation_mode_config(
controller_cn = (
controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
if global_user_state.cluster_with_name_exists(controller_cn):
with ux_utils.print_exception_no_traceback():
raise exceptions.InconsistentConsolidationModeError(
f'{colorama.Fore.RED}Consolidation mode for jobs is '
f'enabled, but the controller cluster '
f'{controller_cn} is still running. Please '
'terminate the controller cluster first.'
f'{colorama.Style.RESET_ALL}')
logger.warning(
f'{colorama.Fore.RED}Consolidation mode for jobs is enabled, '
f'but the controller cluster {controller_cn} is still running. '
'Please terminate the controller cluster first.'
f'{colorama.Style.RESET_ALL}')
else:
total_jobs = managed_job_state.get_managed_jobs_total()
if total_jobs > 0:
@@ -200,13 +198,11 @@ def _validate_consolidation_mode_config(
managed_job_state.get_nonterminal_job_ids_by_name(
None, None, all_users=True))
if nonterminal_jobs:
with ux_utils.print_exception_no_traceback():
raise exceptions.InconsistentConsolidationModeError(
f'{colorama.Fore.RED}Consolidation mode '
'is disabled, but there are still '
f'{len(nonterminal_jobs)} managed jobs '
'running. Please terminate those jobs '
f'first.{colorama.Style.RESET_ALL}')
logger.warning(
f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
f'but there are still {len(nonterminal_jobs)} managed jobs '
'running. Please terminate those jobs first.'
f'{colorama.Style.RESET_ALL}')
else:
logger.warning(
f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
@@ -233,14 +229,11 @@ def is_consolidation_mode(on_api_restart: bool = False) -> bool:
signal_file = pathlib.Path(
_JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE).expanduser()

restart_signal_file_exists = signal_file.exists()
consolidation_mode = (config_consolidation_mode and
restart_signal_file_exists)

if on_api_restart:
if config_consolidation_mode:
signal_file.touch()
else:
restart_signal_file_exists = signal_file.exists()
if not restart_signal_file_exists:
if config_consolidation_mode:
logger.warning(f'{colorama.Fore.YELLOW}Consolidation mode for '
@@ -259,8 +252,8 @@ def is_consolidation_mode(on_api_restart: bool = False) -> bool:
# have related config and will always seemingly disabled for consolidation
# mode. Check #6611 for more details.
if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
_validate_consolidation_mode_config(consolidation_mode)
return consolidation_mode
_validate_consolidation_mode_config(config_consolidation_mode)
return config_consolidation_mode


def ha_recovery_for_consolidation_mode() -> None:


+ 5
- 3
sky/provision/docker_utils.py View File

@@ -29,10 +29,11 @@ SETUP_ENV_VARS_CMD = (
# Docker daemon may not be ready when the machine is firstly started. The error
# message starts with the following string. We should wait for a while and retry
# the command.
DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
'the Docker daemon socket')
DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to ')

DOCKER_SOCKET_NOT_READY_STR = ('Is the docker daemon running?')
DOCKER_SOCKET_NOT_READY_STR_2 = (
'check if the path is correct and if the daemon is running')

_DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS = 30

@@ -228,7 +229,8 @@ class DockerInitializer:
separate_stderr=separate_stderr,
log_path=self.log_path)
if (DOCKER_PERMISSION_DENIED_STR in stdout + stderr or
DOCKER_SOCKET_NOT_READY_STR in stdout + stderr):
DOCKER_SOCKET_NOT_READY_STR in stdout + stderr or
DOCKER_SOCKET_NOT_READY_STR_2 in stdout + stderr):
if wait_for_docker_daemon:
if time.time(
) - start > _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS:
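
A standalone sketch of the wait-and-retry behavior these marker strings feed into; the runner callable is a stand-in for the real command execution:

```python
import time

_NOT_READY_MARKERS = (
    'permission denied while trying to connect to ',
    'Is the docker daemon running?',
    'check if the path is correct and if the daemon is running',
)
_WAIT_TIMEOUT_SECONDS = 30


def run_when_daemon_ready(run_docker_cmd, poll_seconds: int = 5) -> str:
    """Retry a docker command until no 'daemon not ready' marker appears in
    its output, or the timeout elapses."""
    start = time.time()
    while True:
        output = run_docker_cmd()
        if not any(marker in output for marker in _NOT_READY_MARKERS):
            return output
        if time.time() - start > _WAIT_TIMEOUT_SECONDS:
            raise RuntimeError(
                f'Docker daemon did not become ready:\n{output}')
        time.sleep(poll_seconds)
```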


+ 9
- 7
sky/provision/kubernetes/instance.py View File

@@ -585,6 +585,7 @@ def _wait_for_pods_to_run(namespace, context, cluster_name, new_pods):
return False, reason

missing_pods_retry = 0
last_status_msg: Optional[str] = None
while True:
# Get all pods in a single API call
cluster_name_on_cloud = new_pods[0].metadata.labels[
@@ -645,15 +646,16 @@ def _wait_for_pods_to_run(namespace, context, cluster_name, new_pods):
if pending_reasons_count:
msg = ', '.join([
f'{count} pod(s) pending due to {reason}'
for reason, count in pending_reasons_count.items()
for reason, count in sorted(pending_reasons_count.items())
])
rich_utils.force_update_status(
ux_utils.spinner_message(f'Launching ({msg})',
cluster_name=cluster_name))
status_text = f'Launching ({msg})'
else:
rich_utils.force_update_status(
ux_utils.spinner_message('Launching',
cluster_name=cluster_name))
status_text = 'Launching'
new_status_msg = ux_utils.spinner_message(status_text,
cluster_name=cluster_name)
if new_status_msg != last_status_msg:
rich_utils.force_update_status(new_status_msg)
last_status_msg = new_status_msg
time.sleep(1)
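
The polling loop now sorts the pending-reason counts so the message is stable and only pushes a spinner update when the text actually changes, rather than re-rendering every second. A tiny sketch of that de-duplication idea:

```python
from typing import Callable, Optional


def make_status_updater(
        force_update_status: Callable[[str], None]) -> Callable[[str], None]:
    """Wrap a status sink so identical consecutive messages are dropped."""
    last_msg: Optional[str] = None

    def update(msg: str) -> None:
        nonlocal last_msg
        if msg != last_msg:
            force_update_status(msg)
            last_msg = msg

    return update


update = make_status_updater(print)
update('Launching (2 pod(s) pending due to Unschedulable)')
update('Launching (2 pod(s) pending due to Unschedulable)')  # suppressed
update('Launching')  # printed
```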




+ 102
- 87
sky/provision/nebius/utils.py View File

@@ -282,94 +282,109 @@ def launch(cluster_name_on_cloud: str,

service = nebius.compute().InstanceServiceClient(nebius.sdk())
logger.debug(f'Creating instance {instance_name} in project {project_id}.')
nebius.sync_call(
service.create(nebius.compute().CreateInstanceRequest(
metadata=nebius.nebius_common().ResourceMetadata(
parent_id=project_id,
name=instance_name,
),
spec=nebius.compute().InstanceSpec(
gpu_cluster=nebius.compute().InstanceGpuClusterSpec(
id=cluster_id,) if cluster_id is not None else None,
boot_disk=nebius.compute().AttachedDiskSpec(
attach_mode=nebius.compute(
).AttachedDiskSpec.AttachMode.READ_WRITE,
existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
cloud_init_user_data=user_data,
resources=nebius.compute().ResourcesSpec(platform=platform,
preset=preset),
filesystems=filesystems_spec if filesystems_spec else None,
network_interfaces=[
nebius.compute().NetworkInterfaceSpec(
subnet_id=sub_net.items[0].metadata.id,
ip_address=nebius.compute().IPAddress(),
name='network-interface-0',
public_ip_address=nebius.compute().PublicIPAddress(
static=use_static_ip_address)
if associate_public_ip_address else None,
)
],
recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
if use_spot else None,
preemptible=nebius.compute().PreemptibleSpec(
priority=1,
on_preemption=nebius.compute().PreemptibleSpec.
PreemptionPolicy.STOP) if use_spot else None,
))))
instance_id = ''
retry_count = 0
while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
service = nebius.compute().InstanceServiceClient(nebius.sdk())
instance = nebius.sync_call(
service.get_by_name(nebius.nebius_common().GetByNameRequest(
parent_id=project_id,
name=instance_name,
)))
instance_id = instance.metadata.id
if instance.status.state.name == 'STARTING':
break

# All Instances initially have state=STOPPED and reconciling=True,
# so we need to wait until reconciling is False.
if instance.status.state.name == 'STOPPED' and \
not instance.status.reconciling:
next_token = ''
total_operations = 0
while True:
operations_response = nebius.sync_call(
service.list_operations_by_parent(
nebius.compute().ListOperationsByParentRequest(
parent_id=project_id,
page_size=100,
page_token=next_token,
)))
total_operations += len(operations_response.operations)
for operation in operations_response.operations:
# Find the most recent operation for the instance.
if operation.resource_id == instance_id:
error_msg = operation.description
if operation.status:
error_msg += f' {operation.status.message}'
raise RuntimeError(error_msg)
# If we've fetched too many operations, or there are no more
# operations to fetch, just raise a generic error.
if total_operations > _MAX_OPERATIONS_TO_FETCH or \
not operations_response.next_page_token:
raise RuntimeError(
f'Instance {instance_name} failed to start.')
next_token = operations_response.next_page_token
time.sleep(POLL_INTERVAL)
logger.debug(f'Waiting for instance {instance_name} to start running. '
f'State: {instance.status.state.name}, '
f'Reconciling: {instance.status.reconciling}')
retry_count += 1
try:
nebius.sync_call(
service.create(nebius.compute().CreateInstanceRequest(
metadata=nebius.nebius_common().ResourceMetadata(
parent_id=project_id,
name=instance_name,
),
spec=nebius.compute().InstanceSpec(
gpu_cluster=nebius.compute().InstanceGpuClusterSpec(
id=cluster_id,) if cluster_id is not None else None,
boot_disk=nebius.compute().AttachedDiskSpec(
attach_mode=nebius.compute(
).AttachedDiskSpec.AttachMode.READ_WRITE,
existing_disk=nebius.compute().ExistingDisk(
id=disk_id)),
cloud_init_user_data=user_data,
resources=nebius.compute().ResourcesSpec(platform=platform,
preset=preset),
filesystems=filesystems_spec if filesystems_spec else None,
network_interfaces=[
nebius.compute().NetworkInterfaceSpec(
subnet_id=sub_net.items[0].metadata.id,
ip_address=nebius.compute().IPAddress(),
name='network-interface-0',
public_ip_address=nebius.compute().PublicIPAddress(
static=use_static_ip_address)
if associate_public_ip_address else None,
)
],
recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
if use_spot else None,
preemptible=nebius.compute().PreemptibleSpec(
priority=1,
on_preemption=nebius.compute().PreemptibleSpec.
PreemptionPolicy.STOP) if use_spot else None,
))))
instance_id = ''
retry_count = 0
while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
service = nebius.compute().InstanceServiceClient(nebius.sdk())
instance = nebius.sync_call(
service.get_by_name(nebius.nebius_common().GetByNameRequest(
parent_id=project_id,
name=instance_name,
)))
instance_id = instance.metadata.id
if instance.status.state.name == 'STARTING':
break

# All Instances initially have state=STOPPED and reconciling=True,
# so we need to wait until reconciling is False.
if instance.status.state.name == 'STOPPED' and \
not instance.status.reconciling:
next_token = ''
total_operations = 0
while True:
operations_response = nebius.sync_call(
service.list_operations_by_parent(
nebius.compute().ListOperationsByParentRequest(
parent_id=project_id,
page_size=100,
page_token=next_token,
)))
total_operations += len(operations_response.operations)
for operation in operations_response.operations:
# Find the most recent operation for the instance.
if operation.resource_id == instance_id:
error_msg = operation.description
if operation.status:
error_msg += f' {operation.status.message}'
raise RuntimeError(error_msg)
# If we've fetched too many operations, or there are no more
# operations to fetch, just raise a generic error.
if total_operations > _MAX_OPERATIONS_TO_FETCH or \
not operations_response.next_page_token:
raise RuntimeError(
f'Instance {instance_name} failed to start.')
next_token = operations_response.next_page_token
time.sleep(POLL_INTERVAL)
logger.debug(
f'Waiting for instance {instance_name} to start running. '
f'State: {instance.status.state.name}, '
f'Reconciling: {instance.status.reconciling}')
retry_count += 1

if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
raise TimeoutError(
f'Exceeded maximum retries '
f'({nebius.MAX_RETRIES_TO_INSTANCE_READY * POLL_INTERVAL}'
f' seconds) while waiting for instance {instance_name}'
f' to be ready.')
if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
raise TimeoutError(
f'Exceeded maximum retries '
f'({nebius.MAX_RETRIES_TO_INSTANCE_READY * POLL_INTERVAL}'
f' seconds) while waiting for instance {instance_name}'
f' to be ready.')
except nebius.request_error() as e:
# Handle the ResourceExhausted quota limit error. In this case, we need to
# clean up the disk, as VM creation failed and we cannot proceed.
# The caller's (provisioner's) teardown logic cannot handle this, because
# the disk id cannot be retrieved once the instance creation fails.
logger.warning(f'Failed to launch instance {instance_name}: {e}')
service = nebius.compute().DiskServiceClient(nebius.sdk())
nebius.sync_call(
service.delete(nebius.compute().DeleteDiskRequest(id=disk_id)))
logger.debug(f'Disk {disk_id} deleted.')
raise e
return instance_id
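
The try/except added above deletes the pre-created boot disk when instance creation fails (for example on a quota error), since the caller never learns the disk id in that case. A minimal sketch of this cleanup-on-failure pattern, with placeholder callables instead of the Nebius SDK:

```python
from typing import Callable, TypeVar

T = TypeVar('T')


def launch_with_disk_cleanup(create_instance: Callable[[], T],
                             delete_disk: Callable[[str], None],
                             disk_id: str) -> T:
    """Create an instance; on failure, delete the orphaned disk and re-raise."""
    try:
        return create_instance()
    except Exception:
        # The provisioner's teardown cannot clean this up because the disk id
        # is only known here once instance creation fails.
        delete_disk(disk_id)
        raise
```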




+ 6
- 4
sky/schemas/generated/managed_jobsv1_pb2.py
File diff suppressed because it is too large
View File


+ 2
- 0
sky/schemas/generated/managed_jobsv1_pb2.pyi View File

@@ -26,6 +26,7 @@ class ManagedJobStatus(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
class ManagedJobScheduleState(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
__slots__ = ()
MANAGED_JOB_SCHEDULE_STATE_UNSPECIFIED: _ClassVar[ManagedJobScheduleState]
DEPRECATED_MANAGED_JOB_SCHEDULE_STATE_INVALID: _ClassVar[ManagedJobScheduleState]
MANAGED_JOB_SCHEDULE_STATE_INACTIVE: _ClassVar[ManagedJobScheduleState]
MANAGED_JOB_SCHEDULE_STATE_WAITING: _ClassVar[ManagedJobScheduleState]
MANAGED_JOB_SCHEDULE_STATE_ALIVE_WAITING: _ClassVar[ManagedJobScheduleState]
@@ -48,6 +49,7 @@ MANAGED_JOB_STATUS_FAILED_PRECHECKS: ManagedJobStatus
MANAGED_JOB_STATUS_FAILED_NO_RESOURCE: ManagedJobStatus
MANAGED_JOB_STATUS_FAILED_CONTROLLER: ManagedJobStatus
MANAGED_JOB_SCHEDULE_STATE_UNSPECIFIED: ManagedJobScheduleState
DEPRECATED_MANAGED_JOB_SCHEDULE_STATE_INVALID: ManagedJobScheduleState
MANAGED_JOB_SCHEDULE_STATE_INACTIVE: ManagedJobScheduleState
MANAGED_JOB_SCHEDULE_STATE_WAITING: ManagedJobScheduleState
MANAGED_JOB_SCHEDULE_STATE_ALIVE_WAITING: ManagedJobScheduleState


+ 5
- 0
sky/schemas/proto/managed_jobsv1.proto View File

@@ -34,6 +34,11 @@ enum ManagedJobStatus {

enum ManagedJobScheduleState {
MANAGED_JOB_SCHEDULE_STATE_UNSPECIFIED = 0;
// Old servers mapped `None` to INVALID, instead of omitting the field. Keep
// the enum value so we can deserialize the response properly. On the client,
// this should be mapped to `None` / missing field. See #8105.
// TODO(cooperc): Remove in 0.13.0
DEPRECATED_MANAGED_JOB_SCHEDULE_STATE_INVALID = 1 [deprecated = true];
MANAGED_JOB_SCHEDULE_STATE_INACTIVE = 2;
MANAGED_JOB_SCHEDULE_STATE_WAITING = 3;
MANAGED_JOB_SCHEDULE_STATE_ALIVE_WAITING = 4;
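
Per the comment above, clients receiving the deprecated INVALID value from an old server should treat it as a missing schedule state. A sketch of that client-side mapping, using an illustrative enum rather than the generated protobuf classes:

```python
from enum import IntEnum
from typing import Optional


class ScheduleState(IntEnum):
    """Illustrative subset of ManagedJobScheduleState."""
    UNSPECIFIED = 0
    DEPRECATED_INVALID = 1
    INACTIVE = 2
    WAITING = 3


def schedule_state_from_wire(value: int) -> Optional[ScheduleState]:
    state = ScheduleState(value)
    # Old servers sent INVALID instead of omitting the field; map both
    # UNSPECIFIED and the deprecated value to `None` on the client.
    if state in (ScheduleState.UNSPECIFIED, ScheduleState.DEPRECATED_INVALID):
        return None
    return state


assert schedule_state_from_wire(1) is None
assert schedule_state_from_wire(3) is ScheduleState.WAITING
```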


+ 2
- 1
sky/serve/constants.py View File

@@ -65,7 +65,8 @@ AUTOSCALER_DEFAULT_DOWNSCALE_DELAY_SECONDS = 1200
# TODO(tian): We might need to be careful that service logs can take a lot of
# disk space. Maybe we could use a larger disk size, migrate to cloud storage or
# do some log rotation.
CONTROLLER_RESOURCES = {'cpus': '4+', 'disk_size': 200}
# Set default minimal memory to 8GB to allow at least one service to run.
CONTROLLER_RESOURCES = {'cpus': '4+', 'memory': '8+', 'disk_size': 200}
# Autostop config for the jobs controller. These are the default values for
# serve.controller.autostop in ~/.sky/config.yaml.
CONTROLLER_AUTOSTOP = {


+ 2
- 0
sky/serve/controller.py View File

@@ -21,6 +21,7 @@ from sky.serve import autoscalers
from sky.serve import replica_managers
from sky.serve import serve_state
from sky.serve import serve_utils
from sky.skylet import constants
from sky.utils import common_utils
from sky.utils import context_utils
from sky.utils import ux_utils
@@ -288,6 +289,7 @@ class SkyServeController:
# specific time period.
def run_controller(service_name: str, service_spec: serve.SkyServiceSpec,
version: int, controller_host: str, controller_port: int):
os.environ[constants.OVERRIDE_CONSOLIDATION_MODE] = 'true'
# Hijack sys.stdout/stderr to be context aware.
context_utils.hijack_sys_attrs()
controller = SkyServeController(service_name, service_spec, version,


+ 1
- 1
sky/serve/load_balancing_policies.py View File

@@ -121,7 +121,7 @@ class LeastLoadPolicy(LoadBalancingPolicy, name='least_load', default=True):
return
with self.lock:
self.ready_replicas = ready_replicas
for r in self.ready_replicas:
for r in list(self.load_map.keys()):
if r not in ready_replicas:
del self.load_map[r]
for replica in ready_replicas:
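
The fix above iterates over a snapshot of the load map's keys, so replicas that are no longer ready are cleaned out of `load_map` even if they never appear in `ready_replicas`. A standalone sketch of the same pattern, with illustrative data:

```python
# Illustrative data; in the real policy these are replica URLs and counters.
load_map = {'replica-a': 3, 'replica-b': 1, 'replica-c': 0}
ready_replicas = ['replica-a', 'replica-c']

# Iterate over a copy of the keys so entries can be deleted while iterating.
for replica in list(load_map.keys()):
    if replica not in ready_replicas:
        del load_map[replica]
for replica in ready_replicas:
    load_map.setdefault(replica, 0)

assert load_map == {'replica-a': 3, 'replica-c': 0}
```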


+ 11
- 13
sky/serve/serve_utils.py View File

@@ -218,25 +218,23 @@ def _validate_consolidation_mode_config(current_is_consolidation_mode: bool,
if current_is_consolidation_mode:
controller_cn = controller.cluster_name
if global_user_state.cluster_with_name_exists(controller_cn):
with ux_utils.print_exception_no_traceback():
raise exceptions.InconsistentConsolidationModeError(
f'{colorama.Fore.RED}Consolidation mode for '
f'{controller.controller_type} is enabled, but the '
f'controller cluster {controller_cn} is still running. '
'Please terminate the controller cluster first.'
f'{colorama.Style.RESET_ALL}')
logger.warning(
f'{colorama.Fore.RED}Consolidation mode for '
f'{controller.controller_type} is enabled, but the controller '
f'cluster {controller_cn} is still running. Please terminate '
'the controller cluster first.'
f'{colorama.Style.RESET_ALL}')
else:
noun = 'pool' if pool else 'service'
all_services = [
svc for svc in serve_state.get_services() if svc['pool'] == pool
]
if all_services:
with ux_utils.print_exception_no_traceback():
raise exceptions.InconsistentConsolidationModeError(
f'{colorama.Fore.RED}Consolidation mode for '
f'{controller.controller_type} is disabled, but there are '
f'still {len(all_services)} {noun}s running. Please '
f'terminate those {noun}s first.{colorama.Style.RESET_ALL}')
logger.warning(
f'{colorama.Fore.RED}Consolidation mode for '
f'{controller.controller_type} is disabled, but there are '
f'still {len(all_services)} {noun}s running. Please terminate '
f'those {noun}s first.{colorama.Style.RESET_ALL}')


@annotations.lru_cache(scope='request', maxsize=1)


+ 1
- 1
sky/server/constants.py View File

@@ -10,7 +10,7 @@ from sky.skylet import constants
# based on version info is needed.
# For more details and code guidelines, refer to:
# https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
API_VERSION = 23
API_VERSION = 24

# The minimum peer API version that the code should still work with.
# Notes (dev):


+ 10
- 18
sky/server/daemons.py View File

@@ -83,15 +83,6 @@ class InternalRequestDaemon:
sky_logging.reload_logger()
level = self.refresh_log_level()
self.event_fn()
# Clear request level cache after each run to avoid
# using too much memory.
annotations.clear_request_level_cache()
timeline.save_timeline()
# Kill all children processes related to this request.
# Each executor handles a single request, so we can safely
# kill all children processes related to this request.
subprocess_utils.kill_children_processes()
common_utils.release_memory()
except Exception: # pylint: disable=broad-except
# It is OK to fail to run the event, as the event is not
# critical, but we should log the error.
@@ -101,6 +92,16 @@ class InternalRequestDaemon:
f'{server_constants.DAEMON_RESTART_INTERVAL_SECONDS} '
'seconds...')
time.sleep(server_constants.DAEMON_RESTART_INTERVAL_SECONDS)
finally:
# Clear request level cache after each run to avoid
# using too much memory.
annotations.clear_request_level_cache()
timeline.save_timeline()
# Kill all children processes related to this request.
# Each executor handles a single request, so we can safely
# kill all children processes related to this request.
subprocess_utils.kill_children_processes()
common_utils.release_memory()
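
The restructuring above moves the per-run cleanup into a `finally` block, so caches and child processes are released even when the event raises. A compact sketch of the pattern, with placeholder cleanup callables:

```python
import time
from typing import Callable, Sequence

RESTART_INTERVAL_SECONDS = 5  # stands in for DAEMON_RESTART_INTERVAL_SECONDS


def run_event_once(event_fn: Callable[[], None],
                   cleanup_fns: Sequence[Callable[[], None]]) -> None:
    try:
        event_fn()
    except Exception as exc:  # pylint: disable=broad-except
        # The event is not critical; log and retry after a delay.
        print(f'Daemon event failed: {exc!r}; retrying in '
              f'{RESTART_INTERVAL_SECONDS}s')
        time.sleep(RESTART_INTERVAL_SECONDS)
    finally:
        # Runs on success *and* failure, unlike the previous placement.
        for cleanup in cleanup_fns:
            cleanup()
```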


def refresh_cluster_status_event():
@@ -119,14 +120,6 @@ def refresh_cluster_status_event():
time.sleep(server_constants.CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS)


# After #7332, we start a local API server for pool/serve controller.
# We should skip the status refresh event on the pool/serve controller,
# as they have their own logic to cleanup the cluster records. This refresh
# will break existing workflows.
def should_skip_refresh_cluster_status() -> bool:
return os.environ.get(constants.OVERRIDE_CONSOLIDATION_MODE) is not None


def refresh_volume_status_event():
"""Periodically refresh the volume status."""
# pylint: disable=import-outside-toplevel
@@ -273,7 +266,6 @@ INTERNAL_REQUEST_DAEMONS = [
id='skypilot-status-refresh-daemon',
name=request_names.RequestName.REQUEST_DAEMON_STATUS_REFRESH,
event_fn=refresh_cluster_status_event,
should_skip=should_skip_refresh_cluster_status,
default_log_level='DEBUG'),
# Volume status refresh daemon to update the volume status periodically.
InternalRequestDaemon(


+ 7
- 0
sky/server/requests/executor.py View File

@@ -270,9 +270,16 @@ class RequestWorker:
queue.put(request_element)
except exceptions.ExecutionRetryableError as e:
time.sleep(e.retry_wait_seconds)
# Reset the request status to PENDING so it can be picked up again.
# Assume retryable since the error is ExecutionRetryableError.
request_id, _, _ = request_element
with api_requests.update_request(request_id) as request_task:
assert request_task is not None, request_id
request_task.status = api_requests.RequestStatus.PENDING
# Reschedule the request.
queue = _get_queue(self.schedule_type)
queue.put(request_element)
logger.info(f'Rescheduled request {request_id} for retry')
finally:
# Increment the free executor count when a request finishes
if metrics_utils.METRICS_ENABLED:
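
The added block puts a retryable request back into a schedulable state before re-enqueuing it, so a worker can pick it up again after the wait. A minimal sketch with an in-memory queue and a placeholder status store:

```python
import queue
import time

work_queue: 'queue.Queue' = queue.Queue()
request_status = {}  # request_id -> status string (placeholder store)


def handle_retryable_error(request_element, retry_wait_seconds: float) -> None:
    request_id = request_element[0]
    time.sleep(retry_wait_seconds)
    # Reset to PENDING so the scheduler considers the request again, then
    # push the same element back onto the queue.
    request_status[request_id] = 'PENDING'
    work_queue.put(request_element)


handle_retryable_error(('req-1', None, None), retry_wait_seconds=0)
assert request_status['req-1'] == 'PENDING'
```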


+ 1
- 8
sky/server/requests/payloads.py View File

@@ -72,7 +72,6 @@ EXTERNAL_LOCAL_ENV_VARS = [
]


@annotations.lru_cache(scope='global')
def request_body_env_vars() -> dict:
env_vars = {}
for env_var in os.environ:
@@ -83,7 +82,7 @@ def request_body_env_vars() -> dict:
if common.is_api_server_local() and env_var in EXTERNAL_LOCAL_ENV_VARS:
env_vars[env_var] = os.environ[env_var]
env_vars[constants.USER_ID_ENV_VAR] = common_utils.get_user_hash()
env_vars[constants.USER_ENV_VAR] = common_utils.get_current_user_name()
env_vars[constants.USER_ENV_VAR] = common_utils.get_local_user_name()
env_vars[
usage_constants.USAGE_RUN_ID_ENV_VAR] = usage_lib.messages.usage.run_id
if not common.is_api_server_local():
@@ -695,12 +694,6 @@ class ListAcceleratorCountsBody(RequestBody):
class LocalUpBody(RequestBody):
"""The request body for the local up endpoint."""
gpus: bool = True
ips: Optional[List[str]] = None
ssh_user: Optional[str] = None
ssh_key: Optional[str] = None
cleanup: bool = False
context_name: Optional[str] = None
password: Optional[str] = None
name: Optional[str] = None
port_start: Optional[int] = None



+ 16
- 7
sky/server/server.py View File

@@ -56,6 +56,7 @@ from sky.server import constants as server_constants
from sky.server import daemons
from sky.server import metrics
from sky.server import middleware_utils
from sky.server import server_utils
from sky.server import state
from sky.server import stream_utils
from sky.server import versions
@@ -470,7 +471,8 @@ async def schedule_on_boot_check_async():
await executor.schedule_request_async(
request_id='skypilot-server-on-boot-check',
request_name=request_names.RequestName.CHECK,
request_body=payloads.CheckBody(),
request_body=server_utils.build_body_at_server(
request=None, body_type=payloads.CheckBody),
func=sky_check.check,
schedule_type=requests_lib.ScheduleType.SHORT,
is_skypilot_system=True,
@@ -493,7 +495,8 @@ async def lifespan(app: fastapi.FastAPI): # pylint: disable=redefined-outer-nam
await executor.schedule_request_async(
request_id=event.id,
request_name=event.name,
request_body=payloads.RequestBody(),
request_body=server_utils.build_body_at_server(
request=None, body_type=payloads.RequestBody),
func=event.run_event,
schedule_type=requests_lib.ScheduleType.SHORT,
is_skypilot_system=True,
@@ -747,8 +750,11 @@ async def enabled_clouds(request: fastapi.Request,
await executor.schedule_request_async(
request_id=request.state.request_id,
request_name=request_names.RequestName.ENABLED_CLOUDS,
request_body=payloads.EnabledCloudsBody(workspace=workspace,
expand=expand),
request_body=server_utils.build_body_at_server(
request=request,
body_type=payloads.EnabledCloudsBody,
workspace=workspace,
expand=expand),
func=core.enabled_clouds,
schedule_type=requests_lib.ScheduleType.SHORT,
)
@@ -792,7 +798,8 @@ async def status_kubernetes(request: fastapi.Request) -> None:
await executor.schedule_request_async(
request_id=request.state.request_id,
request_name=request_names.RequestName.STATUS_KUBERNETES,
request_body=payloads.RequestBody(),
request_body=server_utils.build_body_at_server(
request=request, body_type=payloads.RequestBody),
func=core.status_kubernetes,
schedule_type=requests_lib.ScheduleType.SHORT,
)
@@ -1461,7 +1468,8 @@ async def storage_ls(request: fastapi.Request) -> None:
await executor.schedule_request_async(
request_id=request.state.request_id,
request_name=request_names.RequestName.STORAGE_LS,
request_body=payloads.RequestBody(),
request_body=server_utils.build_body_at_server(
request=request, body_type=payloads.RequestBody),
func=core.storage_ls,
schedule_type=requests_lib.ScheduleType.SHORT,
)
@@ -2008,7 +2016,8 @@ async def all_contexts(request: fastapi.Request) -> None:
await executor.schedule_request_async(
request_id=request.state.request_id,
request_name=request_names.RequestName.ALL_CONTEXTS,
request_body=payloads.RequestBody(),
request_body=server_utils.build_body_at_server(
request=request, body_type=payloads.RequestBody),
func=core.get_all_contexts,
schedule_type=requests_lib.ScheduleType.SHORT,
)


+ 30
- 0
sky/server/server_utils.py View File

@@ -0,0 +1,30 @@
"""Utilities for the API server."""

from typing import Optional, Type, TypeVar

import fastapi

from sky.server.requests import payloads
from sky.skylet import constants

_BodyT = TypeVar('_BodyT', bound=payloads.RequestBody)


# TODO(aylei): remove this and disable request body construction at server-side
def build_body_at_server(request: Optional[fastapi.Request],
body_type: Type[_BodyT], **data) -> _BodyT:
"""Builds the request body at the server.

For historical reasons, some handlers construct a client-style request body
on the server side to match the executor's interface. This causes client
info, such as the user identity, to be ignored in those handlers. This
helper builds the request body on the server side with the authenticated
user overridden.
"""
request_body = body_type(**data)
if request is not None:
auth_user = getattr(request.state, 'auth_user', None)
if auth_user:
request_body.env_vars[constants.USER_ID_ENV_VAR] = auth_user.id
request_body.env_vars[constants.USER_ENV_VAR] = auth_user.name
return request_body
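
A hypothetical exercise of the helper above, using a `SimpleNamespace` as a stand-in for `fastapi.Request` carrying `state.auth_user`; the real handlers pass the actual request object, as in the server.py hunks earlier in this diff:

```python
import types

from sky.server import server_utils
from sky.server.requests import payloads

# Fake request object exposing only the attribute the helper reads.
fake_user = types.SimpleNamespace(id='user-123', name='alice')
fake_request = types.SimpleNamespace(
    state=types.SimpleNamespace(auth_user=fake_user))

body = server_utils.build_body_at_server(request=fake_request,
                                         body_type=payloads.RequestBody)
# body.env_vars now carries the authenticated user's id and name instead of
# whatever identity the server process itself happens to run under.
```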

+ 2
- 1
sky/skylet/subprocess_daemon.py View File

@@ -110,7 +110,8 @@ def kill_process_tree(process: psutil.Process,


def main():
# daemonize()
daemonize()

parser = argparse.ArgumentParser()
parser.add_argument('--parent-pid', type=int, required=True)
parser.add_argument('--proc-pid', type=int, required=True)


+ 35
- 13
sky/task.py View File

@@ -7,6 +7,7 @@ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
Union)

import colorama
from pydantic import SecretStr

from sky import clouds
from sky import dag as dag_lib
@@ -112,7 +113,7 @@ def _fill_in_env_vars(


def _check_docker_login_config(task_envs: Dict[str, str],
task_secrets: Dict[str, str]) -> bool:
task_secrets: Dict[str, SecretStr]) -> bool:
"""Validates a valid docker login config in task_envs and task_secrets.

Docker login variables must be specified together either in envs OR secrets,
@@ -173,12 +174,13 @@ def _with_docker_login_config(
resources: Union[Set['resources_lib.Resources'],
List['resources_lib.Resources']],
task_envs: Dict[str, str],
task_secrets: Dict[str, str],
task_secrets: Dict[str, SecretStr],
) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
if not _check_docker_login_config(task_envs, task_secrets):
return resources
envs = task_envs.copy()
envs.update(task_secrets)
for key, value in task_secrets.items():
envs[key] = value.get_secret_value()
docker_login_config = docker_utils.DockerLoginConfig.from_env_vars(envs)

def _add_docker_login_config(resources: 'resources_lib.Resources'):
@@ -207,10 +209,11 @@ def _with_docker_username_for_runpod(
resources: Union[Set['resources_lib.Resources'],
List['resources_lib.Resources']],
task_envs: Dict[str, str],
task_secrets: Dict[str, str],
task_secrets: Dict[str, SecretStr],
) -> Union[Set['resources_lib.Resources'], List['resources_lib.Resources']]:
envs = task_envs.copy()
envs.update(task_secrets)
for key, value in task_secrets.items():
envs[key] = value.get_secret_value()
docker_username_for_runpod = envs.get(
constants.RUNPOD_DOCKER_USERNAME_ENV_VAR)

@@ -223,6 +226,18 @@ def _with_docker_username_for_runpod(
for r in resources))


def get_plaintext_envs_and_secrets(
envs_and_secrets: Dict[str, Union[str, SecretStr]],) -> Dict[str, str]:
return {
k: v.get_secret_value() if isinstance(v, SecretStr) else v
for k, v in envs_and_secrets.items()
}


def get_plaintext_secrets(secrets: Dict[str, SecretStr]) -> Dict[str, str]:
return {k: v.get_secret_value() for k, v in secrets.items()}


class Task:
"""Task: a computation to be run on the cloud."""

@@ -343,7 +358,9 @@ class Task:
self.storage_plans: Dict[storage_lib.Storage,
storage_lib.StoreType] = {}
self._envs = envs or {}
self._secrets = secrets or {}
self._secrets = {}
if secrets is not None:
self._secrets = {k: SecretStr(v) for k, v in secrets.items()}
self._volumes = volumes or {}

# concatenate commands if given as list
@@ -935,7 +952,7 @@ class Task:
return self._envs

@property
def secrets(self) -> Dict[str, str]:
def secrets(self) -> Dict[str, SecretStr]:
return self._secrets

@property
@@ -1042,7 +1059,8 @@ class Task:
raise ValueError(
'secrets must be List[Tuple[str, str]] or Dict[str, str]: '
f'{secrets}')
self._secrets.update(secrets)
for key, value in secrets.items():
self._secrets[key] = SecretStr(value)
# Validate Docker login configuration if needed
if _check_docker_login_config(self._envs, self._secrets):
self.resources = _with_docker_login_config(self.resources,
@@ -1057,7 +1075,7 @@ class Task:
return any(r.use_spot for r in self.resources)

@property
def envs_and_secrets(self) -> Dict[str, str]:
def envs_and_secrets(self) -> Dict[str, Union[str, SecretStr]]:
envs = self.envs.copy()
envs.update(self.secrets)
return envs
@@ -1643,9 +1661,11 @@ class Task:
if clone_info.token is None and clone_info.ssh_key is None:
return self
if clone_info.token is not None:
self.secrets[git.GIT_TOKEN_ENV_VAR] = clone_info.token
self.secrets[git.GIT_TOKEN_ENV_VAR] = SecretStr(
clone_info.token)
if clone_info.ssh_key is not None:
self.secrets[git.GIT_SSH_KEY_ENV_VAR] = clone_info.ssh_key
self.secrets[git.GIT_SSH_KEY_ENV_VAR] = SecretStr(
clone_info.ssh_key)
except exceptions.GitError as e:
with ux_utils.print_exception_no_traceback():
raise ValueError(f'{str(e)}') from None
@@ -1703,8 +1723,10 @@ class Task:
add_if_not_none('envs', self.envs, no_empty=True)

secrets = self.secrets
if secrets and redact_secrets:
secrets = {k: '<redacted>' for k in secrets}
if secrets and not redact_secrets:
secrets = {k: v.get_secret_value() for k, v in secrets.items()}
elif secrets and redact_secrets:
secrets = {k: '<redacted>' for k, v in secrets.items()}
add_if_not_none('secrets', secrets, no_empty=True)

add_if_not_none('file_mounts', {})
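
The changes above store task secrets as pydantic `SecretStr` values, so they are masked in reprs and dumps unless explicitly revealed. A tiny sketch of that behavior (pydantic is already a dependency of this code; the dict names below are illustrative):

```python
from pydantic import SecretStr

token = SecretStr('super-secret-token')
print(token)                     # **********
print(repr(token))               # SecretStr('**********')
print(token.get_secret_value())  # super-secret-token

# Mirrors the redaction logic above:
secrets = {'HF_TOKEN': token}
redacted = {k: '<redacted>' for k in secrets}
plaintext = {k: v.get_secret_value() for k, v in secrets.items()}
```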


+ 16
- 2
sky/utils/common_utils.py View File

@@ -300,6 +300,7 @@ _current_user: Optional['models.User'] = None
_current_request_id: Optional[str] = None


# TODO(aylei,hailong): request context should be contextual
def set_request_context(client_entrypoint: Optional[str],
client_command: Optional[str],
using_remote_api_server: bool,
@@ -341,19 +342,32 @@ def get_current_command() -> str:


def get_current_user() -> 'models.User':
"""Returns the current user."""
"""Returns the user in current server session."""
if _current_user is not None:
return _current_user
return models.User.get_current_user()


def get_current_user_name() -> str:
"""Returns the current user name."""
"""Returns the user name in current server session."""
name = get_current_user().name
assert name is not None
return name


def get_local_user_name() -> str:
"""Returns the user name in local environment.

This is for backward compatibility: when no server-side authentication
method is configured, anonymous access is implicitly allowed and the
username from the client environment variable is used to identify the
user.
"""
name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
assert name is not None
return name


def set_current_user(user: 'models.User'):
"""Sets the current user."""
global _current_user


+ 0
- 1
sky/utils/controller_utils.py View File

@@ -569,7 +569,6 @@ def shared_controller_vars_to_fill(
# with a remote API server.
constants.USING_REMOTE_API_SERVER_ENV_VAR: str(
common_utils.get_using_remote_api_server()),
constants.OVERRIDE_CONSOLIDATION_MODE: 'true',
constants.IS_SKYPILOT_SERVE_CONTROLLER:
('true'
if controller == Controllers.SKY_SERVE_CONTROLLER else 'false'),


+ 2
- 2
sky/utils/kubernetes/create_cluster.sh View File

@@ -4,8 +4,8 @@
set -e

# Images
IMAGE="us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot:latest"
IMAGE_GPU="us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot-gpu:latest"
IMAGE="us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot:latest"
IMAGE_GPU="us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot-gpu:latest"

# Arguments
NAME=$1


+ 1
- 1
sky/utils/kubernetes/deploy_remote_cluster.py View File

@@ -468,7 +468,7 @@ def main():
)
continue

context_name = 'ssh-default'
context_name = f'ssh-{cluster_name}'

# Check cluster history
os.makedirs(NODE_POOLS_INFO_DIR, exist_ok=True)


+ 3
- 1
sky/utils/kubernetes/gpu_labeler.py View File

@@ -40,7 +40,9 @@ def cleanup(context: Optional[str] = None) -> Tuple[bool, str]:
success = True
except subprocess.CalledProcessError as e:
output = e.output.decode('utf-8')
reason = 'Error deleting existing GPU labeler resources: ' + output
stderr = e.stderr.decode('utf-8')
reason = ('Error deleting existing GPU labeler resources: ' +
output + stderr)
return success, reason




+ 2
- 1
sky/utils/kubernetes/k8s_gpu_labeler_job.yaml View File

@@ -13,10 +13,11 @@ spec:
serviceAccountName: gpu-labeler-sa
containers:
- name: gpu-labeler
image: us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot-gpu:latest # Using this image also serves as a way to "pre-pull" the image onto nodes
image: us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot-gpu:latest # Using this image also serves as a way to "pre-pull" the image onto nodes
command: ["/bin/bash", "-i", "-c"]
args:
- |
source ~/skypilot-runtime/bin/activate
python /label_gpus.py
env:
- name: MY_NODE_NAME


+ 16
- 16
sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml View File

@@ -53,51 +53,51 @@ data:
import os
import subprocess
from typing import Optional
from kubernetes import client
from kubernetes import config
canonical_gpu_names = [
'A100-80GB', 'A100', 'A10G', 'H100', 'K80', 'M60', 'T4g', 'T4', 'V100',
'A100-80GB', 'A100', 'A10G', 'H100', 'K80', 'M60', 'T4g', 'T4', 'V100',
'A10', 'P100', 'P40', 'P4', 'L4'
]
def get_gpu_name() -> Optional[str]:
try:
result = subprocess.run(
['nvidia-smi', '--query-gpu=name', '--format=csv,noheader,nounits'],
stdout=subprocess.PIPE)
gpu_name = result.stdout.decode('utf-8').strip()
# In the case of multi-gpu nodes, we assume the node is homogenous and
# In the case of multi-gpu nodes, we assume the node is homogenous and
# just use the first GPU name.
gpu_name = gpu_name.split('\n')[0]
return gpu_name.lower()
except Exception as e:
print(f'Error getting GPU name: {e}')
return None
def label_node(gpu_name: str) -> None:
try:
config.load_incluster_config() # Load in-cluster configuration
v1 = client.CoreV1Api()
# Fetch the current node's name from the environment variable
node_name = os.environ.get('MY_NODE_NAME')
if not node_name:
raise ValueError('Failed to get node name from environment')
# Label the node with the GPU name
body = {'metadata': {'labels': {'skypilot.co/accelerator': gpu_name}}}
v1.patch_node(node_name, body)
print(f'Labeled node {node_name} with GPU {gpu_name}')
except Exception as e:
print(f'Error labeling node: {e}')
def main():
gpu_name = get_gpu_name()
if gpu_name is not None:
@@ -119,7 +119,7 @@ data:
labelled = True
else:
print('No GPU detected. Try running nvidia-smi in the container.')
if __name__ == '__main__':
main()

+ 0
- 88
sky/utils/kubernetes/kubernetes_deploy_utils.py View File

@@ -170,94 +170,6 @@ def deploy_ssh_cluster(cleanup: bool = False,
is_local=True))


def deploy_remote_cluster(ip_list: List[str],
ssh_user: str,
ssh_key: str,
cleanup: bool,
context_name: Optional[str] = None,
password: Optional[str] = None):
success = False
path_to_package = os.path.dirname(__file__)
up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.py')
# Get directory of script and run it from there
cwd = os.path.dirname(os.path.abspath(up_script_path))

# Create temporary files for the IPs and SSH key
with tempfile.NamedTemporaryFile(mode='w') as ip_file, \
tempfile.NamedTemporaryFile(mode='w') as key_file:

# Write IPs and SSH key to temporary files
ip_file.write('\n'.join(ip_list))
ip_file.flush()

key_file.write(ssh_key)
key_file.flush()
os.chmod(key_file.name, 0o600)

# Use the legacy mode command line arguments for backward compatibility
deploy_command = [
sys.executable, up_script_path, '--ips-file', ip_file.name,
'--user', ssh_user, '--ssh-key', key_file.name
]

if context_name is not None:
deploy_command.extend(['--context-name', context_name])
if password is not None:
deploy_command.extend(['--password', password])
if cleanup:
deploy_command.append('--cleanup')

# Setup logging paths
run_timestamp = sky_logging.get_run_timestamp()
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
'local_up.log')

if cleanup:
msg_str = 'Cleaning up remote cluster...'
else:
msg_str = 'Deploying remote cluster...'

# Create environment with PYTHONUNBUFFERED=1 to ensure unbuffered output
env = os.environ.copy()
env['PYTHONUNBUFFERED'] = '1'

with rich_utils.safe_status(
ux_utils.spinner_message(msg_str,
log_path=log_path,
is_local=True)):
returncode, _, stderr = log_lib.run_with_log(
cmd=deploy_command,
log_path=log_path,
require_outputs=True,
stream_logs=False,
line_processor=log_utils.SkyRemoteUpLineProcessor(
log_path=log_path, is_local=True),
cwd=cwd,
env=env)
if returncode == 0:
success = True
else:
with ux_utils.print_exception_no_traceback():
log_hint = ux_utils.log_path_hint(log_path, is_local=True)
raise RuntimeError('Failed to deploy remote cluster. '
f'Full log: {log_hint}'
f'\nError: {stderr}')

if success:
if cleanup:
logger.info(
ux_utils.finishing_message(
'🎉 Remote cluster cleaned up successfully.',
log_path=log_path,
is_local=True))
else:
logger.info(
ux_utils.finishing_message(
'🎉 Remote cluster deployed successfully.',
log_path=log_path,
is_local=True))


def generate_kind_config(port_start: int,
num_nodes: int = 1,
gpus: bool = False) -> str:


+ 0
- 102
sky/utils/log_utils.py View File

@@ -198,108 +198,6 @@ class SkyLocalUpLineProcessor(LineProcessor):
self.status_display.stop()


class SkyRemoteUpLineProcessor(LineProcessor):
"""A processor for deploy_remote_cluster.py log lines."""

def __init__(self, log_path: str, is_local: bool):
self.log_path = log_path
self.is_local = is_local

def __enter__(self) -> None:
# TODO(romilb): Use ux_utils.INDENT_SYMBOL to be consistent with other
# messages.
status = rich_utils.safe_status(
ux_utils.spinner_message('Creating remote cluster',
log_path=self.log_path,
is_local=self.is_local))
self.status_display = status
self.status_display.start()

def process_line(self, log_line: str) -> None:
# Pre-flight checks
if 'SSH connection successful' in log_line:
logger.info(f'{colorama.Fore.GREEN}SSH connection established.'
f'{colorama.Style.RESET_ALL}')

# Kubernetes installation steps
if 'Deploying Kubernetes on head node' in log_line:
self.status_display.update(
ux_utils.spinner_message(
'Creating remote cluster - '
'deploying Kubernetes on head node',
log_path=self.log_path,
is_local=self.is_local))
if 'K3s deployed on head node.' in log_line:
logger.info(f'{colorama.Fore.GREEN}'
'✔ K3s successfully deployed on head node.'
f'{colorama.Style.RESET_ALL}')

# Worker nodes
if 'Deploying Kubernetes on worker node' in log_line:
self.status_display.update(
ux_utils.spinner_message(
'Creating remote cluster - '
'deploying Kubernetes on worker nodes',
log_path=self.log_path,
is_local=self.is_local))
if 'Kubernetes deployed on worker node' in log_line:
logger.info(f'{colorama.Fore.GREEN}'
'✔ K3s successfully deployed on worker node.'
f'{colorama.Style.RESET_ALL}')

# Cluster configuration
if 'Configuring local kubectl to connect to the cluster...' in log_line:
self.status_display.update(
ux_utils.spinner_message(
'Creating remote cluster - '
'configuring local kubectl',
log_path=self.log_path,
is_local=self.is_local))
if 'kubectl configured to connect to the cluster.' in log_line:
logger.info(f'{colorama.Fore.GREEN}'
'✔ kubectl configured for the remote cluster.'
f'{colorama.Style.RESET_ALL}')

# GPU operator installation
if 'Installing Nvidia GPU Operator...' in log_line:
self.status_display.update(
ux_utils.spinner_message(
'Creating remote cluster - '
'installing Nvidia GPU Operator',
log_path=self.log_path,
is_local=self.is_local))
if 'GPU Operator installed.' in log_line:
logger.info(f'{colorama.Fore.GREEN}'
'✔ Nvidia GPU Operator installed successfully.'
f'{colorama.Style.RESET_ALL}')

# Cleanup steps
if 'Cleaning up head node' in log_line:
self.status_display.update(
ux_utils.spinner_message('Cleaning up head node',
log_path=self.log_path,
is_local=self.is_local))
if 'Cleaning up node' in log_line:
self.status_display.update(
ux_utils.spinner_message('Cleaning up worker node',
log_path=self.log_path,
is_local=self.is_local))
if 'cleaned up successfully' in log_line:
logger.info(f'{colorama.Fore.GREEN}'
f'{log_line.strip()}{colorama.Style.RESET_ALL}')

# Final status
if 'Cluster deployment completed.' in log_line:
logger.info(f'{colorama.Fore.GREEN}✔ Remote k3s is running.'
f'{colorama.Style.RESET_ALL}')

def __exit__(self, except_type: Optional[Type[BaseException]],
except_value: Optional[BaseException],
traceback: Optional[types.TracebackType]) -> None:
del except_type, except_value, traceback # unused
self.status_display.stop()


class SkySSHUpLineProcessor(LineProcessor):
"""A processor for deploy_remote_cluster.py log lines for SSH clusters"""



+ 1
- 1
tests/kubernetes/README.md View File

@@ -5,7 +5,7 @@ This directory contains useful scripts and notes for developing SkyPilot on Kube
## Building and pushing SkyPilot image

We maintain a container image that has all basic SkyPilot dependencies installed.
This image is hosted at `us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot:latest`.
This image is hosted at `us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot:latest`.

To build this image locally and optionally push to the SkyPilot registry, run:
```bash


+ 1
- 1
tests/kubernetes/cpu_test_pod.yaml View File

@@ -18,7 +18,7 @@ metadata:
spec:
containers:
- name: skytest
image: us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot:latest
image: us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot:latest
# Run apt update and run a simple HTTP server
command: ["/bin/bash", "-c", "--"]
args: ["sudo apt update && python3 -m http.server 8080"]


+ 1
- 1
tests/kubernetes/gpu_test_pod.yaml View File

@@ -8,7 +8,7 @@ spec:
restartPolicy: Never
containers:
- name: skygputest
image: us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot-gpu:latest
image: us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot-gpu:latest
command: ["nvidia-smi"]
resources:
limits:


+ 19
- 5
tests/kubernetes/scripts/create_cluster.sh View File

@@ -5,7 +5,7 @@ set -e
# create_cluster.sh gcp <CLUSTER_NAME> <PROJECT_ID> <ZONE> <NODE_COUNT> <MACHINE_TYPE>
# create_cluster.sh aws <CLUSTER_NAME> <REGION> <NODE_COUNT> <INSTANCE_TYPE>

# If EKS_VPC_CONFIG_PUBLIC is set, it will be injected verbatim into the eksctl config
# If EKS_VPC_CONFIG_PRIVATE is set, it will be injected verbatim into the eksctl config

PROVIDER=${1:-"gcp"}
shift || true
@@ -52,6 +52,11 @@ case "$PROVIDER" in
echo "Region: $REGION"
echo "Node Count: $NODE_COUNT"
echo "Instance Type: $INSTANCE_TYPE"
if [ -n "$EKS_VPC_CONFIG_PRIVATE" ]; then
echo "Using custom VPC configuration from EKS_VPC_CONFIG_PRIVATE"
else
echo "Using default VPC configuration (EKS_VPC_CONFIG_PRIVATE not set)"
fi

# Check if cluster exists and delete it if present
echo "Checking if EKS cluster '$CLUSTER_NAME' exists..."
@@ -71,6 +76,13 @@ case "$PROVIDER" in
fi

RESOLVED_CONFIG="/tmp/${CLUSTER_NAME}-eks-cluster-config.yaml"
# Convert literal \n to actual newlines if EKS_VPC_CONFIG_PRIVATE is set
if [ -n "$EKS_VPC_CONFIG_PRIVATE" ]; then
# Use printf to interpret escape sequences like \n
VPC_CONFIG=$(printf '%b\n' "$EKS_VPC_CONFIG_PRIVATE")
else
VPC_CONFIG=""
fi
cat > "$RESOLVED_CONFIG" <<EOF
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
@@ -78,7 +90,7 @@ kind: ClusterConfig
metadata:
name: ${CLUSTER_NAME}
region: ${REGION}
${EKS_VPC_CONFIG_PUBLIC}
${VPC_CONFIG}
iam:
withOIDC: true
managedNodeGroups:
@@ -95,12 +107,14 @@ EOF

aws eks --region "$REGION" update-kubeconfig --name "$CLUSTER_NAME"

# If user provided VPC/subnets via EKS_VPC_CONFIG_PUBLIC, tag those subnets so
# If user provided VPC/subnets via EKS_VPC_CONFIG_PRIVATE, tag those subnets so
# Service type LoadBalancer can provision internet-facing ELB/NLB.
if [ -n "$EKS_VPC_CONFIG_PUBLIC" ]; then
if [ -n "$EKS_VPC_CONFIG_PRIVATE" ]; then
echo "Tagging provided public subnets for internet-facing LoadBalancers..."
# Convert literal \n to actual newlines for parsing
VPC_CONFIG_FOR_PARSING=$(printf '%b\n' "$EKS_VPC_CONFIG_PRIVATE")
# Extract all subnet IDs from the config (deduplicated)
mapfile -t SUBNET_IDS < <(echo "$EKS_VPC_CONFIG_PUBLIC" | grep -E 'id:\s*subnet-' | awk '{print $2}' | tr -d '"' | sort -u)
mapfile -t SUBNET_IDS < <(echo "$VPC_CONFIG_FOR_PARSING" | grep -E 'id:\s*subnet-' | awk '{print $2}' | tr -d '"' | sort -u)
for subnet_id in "${SUBNET_IDS[@]}"; do
if [ -n "$subnet_id" ]; then
echo "Tagging subnet $subnet_id"


+ 1
- 1
tests/kubernetes/scripts/skypilot_ssh_k8s_deployment.yaml View File

@@ -20,7 +20,7 @@ spec:
secretName: ssh-key-secret
containers:
- name: skypilot
image: us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot:latest
image: us-docker.pkg.dev/sky-dev-465/skypilotk8s/skypilot:latest
imagePullPolicy: Never
env:
- name: SECRET_THING


+ 8
- 5
tests/load_tests/db_scale_tests/create_aws_postgres_db.sh View File

@@ -81,20 +81,22 @@ fi

# Get VPC and subnets if instance doesn't exist - use EKS VPC if available, otherwise use default VPC
if [ "$INSTANCE_EXISTS" = "false" ]; then
if [ -n "$EKS_VPC_CONFIG" ]; then
echo "Using EKS VPC configuration from EKS_VPC_CONFIG..." >&2
if [ -n "$EKS_VPC_CONFIG_PRIVATE" ]; then
echo "Using custom VPC configuration from EKS_VPC_CONFIG_PRIVATE..." >&2
# Convert literal \n to actual newlines for parsing
VPC_CONFIG_FOR_PARSING=$(printf '%b\n' "$EKS_VPC_CONFIG_PRIVATE")
# Parse VPC ID from YAML format: " id: vpc-xxx" (under vpc:)
# Look for lines with "id:" that contain "vpc-" pattern
VPC_ID=$(echo "$EKS_VPC_CONFIG" | grep -E "^\s+id:\s+vpc-" | awk '{print $2}' | tr -d '"' | tr -d "'" | head -n1)
VPC_ID=$(echo "$VPC_CONFIG_FOR_PARSING" | grep -E "^\s+id:\s+vpc-" | awk '{print $2}' | tr -d '"' | tr -d "'" | head -n1)

if [ -z "$VPC_ID" ]; then
echo "WARNING: Could not parse VPC ID from EKS_VPC_CONFIG, falling back to default VPC" >&2
echo "WARNING: Could not parse VPC ID from EKS_VPC_CONFIG_PRIVATE, falling back to default VPC" >&2
USE_EKS_VPC=false
else
# Parse subnet IDs from YAML format
# Format: " id: subnet-xxx" (nested under subnets/public/)
# Look for lines with "id:" that contain "subnet-" pattern
SUBNET_IDS=$(echo "$EKS_VPC_CONFIG" | grep -E "^\s+id:\s+subnet-" | awk '{print $2}' | tr -d '"' | tr -d "'")
SUBNET_IDS=$(echo "$VPC_CONFIG_FOR_PARSING" | grep -E "^\s+id:\s+subnet-" | awk '{print $2}' | tr -d '"' | tr -d "'")

# Convert to array
SUBNET_ARRAY=($SUBNET_IDS)
@@ -126,6 +128,7 @@ if [ -n "$EKS_VPC_CONFIG" ]; then
fi
else
USE_EKS_VPC=false
echo "Using default VPC configuration (EKS_VPC_CONFIG_PRIVATE not set)..." >&2
fi

if [ "$USE_EKS_VPC" != "true" ]; then


+ 5
- 5
tests/smoke_tests/smoke_tests_utils.py View File

@@ -75,7 +75,7 @@ LOW_CONTROLLER_RESOURCE_OVERRIDE_CONFIG = {
'controller': {
'resources': {
'cpus': '4+',
'memory': '4+'
'memory': '16+'
}
}
},
@@ -83,7 +83,7 @@ LOW_CONTROLLER_RESOURCE_OVERRIDE_CONFIG = {
'controller': {
'resources': {
'cpus': '4+',
'memory': '4+'
'memory': '8+'
}
}
}
@@ -551,7 +551,7 @@ def ensure_iterable_result(func):
def run_one_test(test: Test, check_sky_status: bool = True) -> None:
# Fail fast if `sky` CLI somehow errors out.
if check_sky_status:
test.commands.insert(0, 'sky status')
test.commands.insert(0, 'sky status -u')

log_to_stdout = os.environ.get('LOG_TO_STDOUT', None)
if log_to_stdout:
@@ -1045,7 +1045,7 @@ def get_dashboard_jobs_queue_request_id() -> str:
return server_common.get_request_id(response)


def get_response_from_request_id(request_id: str) -> Any:
def get_response_from_request_id_dashboard(request_id: str) -> Any:
"""Waits for and gets the result of a request.

Args:
@@ -1064,7 +1064,7 @@ def get_response_from_request_id(request_id: str) -> Any:
'GET',
f'/internal/dashboard/api/get?request_id={request_id}',
server_url=get_api_server_url(),
timeout=15)
timeout=25)
request_task = None
if response.status_code == 200:
request_task = requests_lib.Request.decode(


+ 21
- 5
tests/smoke_tests/test_api_server.py View File

@@ -35,6 +35,8 @@ def set_user(user_id: str, user_name: str, commands: List[str]) -> List[str]:
@pytest.mark.no_hyperbolic # Hyperbolic does not support multi-tenant jobs
@pytest.mark.no_shadeform # Shadeform does not support multi-tenant jobs
@pytest.mark.no_seeweb # Seeweb does not support multi-tenant jobs
# Note: we should skip or fix this on shared remote clusters, because two
# copies of this test may down each other's clusters (sky down -a with a
# hardcoded user id).
def test_multi_tenant(generic_cloud: str):
if smoke_tests_utils.services_account_token_configured_in_env_file():
pytest.skip(
@@ -63,8 +65,14 @@ def test_multi_tenant(generic_cloud: str):
# Stopping cluster should not change the ownership of the cluster.
f's=$(sky status) && echo "$s" && echo "$s" | grep {name}-1 && exit 1 || true',
f'sky status {name}-1 | grep STOPPED',
# Both clusters should be stopped.
f'sky status -u | grep {name}-1 | grep STOPPED',
# Restarting other user's cluster should work.
f'sky start -y {name}-1',
# Cluster should still have the same disk.
f'sky exec {name}-1 \'ls file || exit 1\'',
# Restarting cluster should not change the ownership of the cluster.
f's=$(sky status) && echo "$s" && echo "$s" | grep {name}-1 && exit 1 || true',
# Cluster 1 should be UP now, but cluster 2 should be STOPPED.
f'sky status -u | grep {name}-1 | grep UP',
f'sky status -u | grep {name}-2 | grep STOPPED',
]),
]
@@ -78,14 +86,17 @@ def test_multi_tenant(generic_cloud: str):
'echo "==== Test multi-tenant job on single cluster ===="',
*set_user(user_1, user_1_name, [
f'sky launch -y -c {name}-1 --cloud {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} -n job-1 tests/test_yamls/minimal.yaml',
f'sky exec {name}-1 -n job-2 \'touch file\'',
f's=$(sky queue {name}-1) && echo "$s" && echo "$s" | grep job-1 | grep SUCCEEDED | awk \'{{print $1}}\' | grep 1',
f's=$(sky queue -u {name}-1) && echo "$s" && echo "$s" | grep {user_1_name} | grep job-1 | grep SUCCEEDED',
]),
*set_user(user_2, user_2_name, [
f'sky exec {name}-1 -n job-2 \'echo "hello" && exit 1\' || [ $? -eq 100 ]',
f's=$(sky queue {name}-1) && echo "$s" && echo "$s" | grep job-2 | grep FAILED | awk \'{{print $1}}\' | grep 2',
f'sky exec {name}-1 -n job-3 \'echo "hello" && exit 1\' || [ $? -eq 100 ]',
f'sky launch -y -c {name}-1 -n job-4 \'ls file || exit 1\'',
f's=$(sky queue {name}-1) && echo "$s" && echo "$s" | grep job-3 | grep FAILED | awk \'{{print $1}}\' | grep 3',
f's=$(sky queue {name}-1) && echo "$s" && echo "$s" | grep job-4 | grep SUCCEEDED | awk \'{{print $1}}\' | grep 4',
f's=$(sky queue {name}-1) && echo "$s" && echo "$s" | grep job-1 && exit 1 || true',
f's=$(sky queue {name}-1 -u) && echo "$s" && echo "$s" | grep {user_2_name} | grep job-2 | grep FAILED',
f's=$(sky queue {name}-1 -u) && echo "$s" && echo "$s" | grep {user_2_name} | grep job-3 | grep FAILED',
f's=$(sky queue {name}-1 -u) && echo "$s" && echo "$s" | grep {user_1_name} | grep job-1 | grep SUCCEEDED',
]),
'echo "==== Test clusters from different users ===="',
@@ -284,6 +295,11 @@ def test_requests_scheduling(generic_cloud: str):


# ---- Test recent request tracking -----
# We mark this test as no_remote_server since it requires a dedicated API
# server; otherwise we cannot make any guarantees about the most recent
# request. Replace this with another option for skipping shared-server tests
# once we have one.
@pytest.mark.no_remote_server
def test_recent_request_tracking(generic_cloud: str):
with smoke_tests_utils.override_sky_config():
# We need to override the sky api endpoint env if --remote-server is


+ 107
- 4
tests/smoke_tests/test_basic.py View File

@@ -637,8 +637,9 @@ def test_core_api_sky_launch_exec(generic_cloud: str):
cluster_exist = False
status_request_id = (
smoke_tests_utils.get_dashboard_cluster_status_request_id())
status_response = (smoke_tests_utils.get_response_from_request_id(
status_request_id))
status_response = (
smoke_tests_utils.get_response_from_request_id_dashboard(
status_request_id))
for cluster in status_response:
if cluster['name'] == name:
cluster_exist = True
@@ -700,8 +701,9 @@ def test_jobs_launch_and_logs(generic_cloud: str):
# Check the job status from the dashboard
queue_request_id = (
smoke_tests_utils.get_dashboard_jobs_queue_request_id())
queue_response = (smoke_tests_utils.get_response_from_request_id(
queue_request_id))
queue_response = (
smoke_tests_utils.get_response_from_request_id_dashboard(
queue_request_id))
job_exist = False
for job in queue_response:
if job['job_id'] == job_id:
@@ -1775,3 +1777,104 @@ def test_cluster_setup_num_gpus():
teardown=f'sky down -y {name}',
)
smoke_tests_utils.run_one_test(test)


@pytest.mark.aws
def test_launch_retry_until_up():
"""Test that retry until up considers more resources after trying all zones."""
cluster_name = smoke_tests_utils.get_cluster_name()
timeout = 180
test = smoke_tests_utils.Test(
'launch-retry-until-up',
[
# Launch something we'll never get.
f's=$(timeout {timeout} sky launch -c {cluster_name} --gpus B200:8 --infra aws echo hi -y -d --retry-until-up --use-spot 2>&1 || true) && '
# Check that "Retry after" appears in the output
'echo "$s" | grep -q "Retry after" && '
# Find the first occurrence of "Retry after" and get its line number
'RETRY_LINE=$(echo "$s" | grep -n "Retry after" | head -1 | cut -d: -f1) && '
# Check that "Considered resources" appears after the first "Retry after"
# We do this by extracting all lines after RETRY_LINE and checking if "Considered resources" appears
'echo "$s" | tail -n +$((RETRY_LINE + 1)) | grep -q "Considered resources"'
],
timeout=200, # Slightly more than 180 to account for test overhead
teardown=f'sky down -y {cluster_name}',
)
smoke_tests_utils.run_one_test(test)


def test_cancel_job_reliability(generic_cloud: str):
"""Test that sky cancel properly terminates running jobs."""
name = smoke_tests_utils.get_cluster_name()

# Create a temporary YAML file with a long-running sleep command
cancel_test_yaml = textwrap.dedent("""
run: |
sleep 10000
""")

# Helper function to check process count with timeout
def check_process_count(expected_lines: int, timeout: int = 30) -> str:
"""Check that ps aux | grep 'sleep 10000' shows expected number of lines.
Note: ps aux | grep includes the grep process itself, so:
- 3 lines = sleep process + grep process + ssh process to check the process count
- 2 lines = grep process (sleep is gone) + ssh process to check the process count
Returns a command that will check the process count with retries.
"""
return (
f'for i in $(seq 1 {timeout}); do '
f' s=$(ssh {name} "ps aux | grep \'sleep 10000\'" 2>/dev/null); '
f' count=$(echo "$s" | wc -l || echo 0); '
f' if [ "$count" -eq {expected_lines} ]; then '
f' echo "Found {expected_lines} line(s) as expected"; '
f' exit 0; '
f' fi; '
f' echo "Waiting for {expected_lines} line(s), found $count, attempt $i/{timeout}"; '
f' echo "Output was: $s"; '
f' sleep 1; '
f'done; '
f'echo "ERROR: Expected {expected_lines} line(s) but found $count"; '
f'exit 1')

with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml') as f:
f.write(cancel_test_yaml)
f.flush()

disk_size_param, _ = smoke_tests_utils.get_disk_size_and_validate_launch_output(
generic_cloud)

# Build commands for the test
commands = [
# Launch the cluster
f'sky launch -y -c {name} --infra {generic_cloud} {smoke_tests_utils.LOW_RESOURCE_ARG} {disk_size_param} {f.name} -d',
check_process_count(3, timeout=30),
f'sky cancel {name} 1 -y',
check_process_count(2, timeout=30),
]

num_iterations = 10
# Run the cancel test num_iterations times
# Note: Job 1 is from the cluster launch, so exec jobs start at job 2
for iteration in range(1, num_iterations):
job_num = iteration + 1 # Job 1 is from cluster launch
commands.extend([
# Launch a new job with the sleep command
f'sky exec {name} --infra {generic_cloud} {f.name} -d',
# Check that we see 3 lines (sleep process + grep process itself + ssh process to check the process count)
check_process_count(3, timeout=30),
# Cancel the job
f'sky cancel {name} {job_num} -y',
# Check that we now see only 2 lines (grep process + ssh process to check the process count)
check_process_count(2, timeout=30),
])

test = smoke_tests_utils.Test(
'test_cancel_job_reliability',
commands,
f'sky down -y {name}',
timeout=smoke_tests_utils.get_timeout(generic_cloud) *
2, # Longer timeout for 10 iterations
)
smoke_tests_utils.run_one_test(test)

+ 123
- 53
tests/smoke_tests/test_pools.py View File

@@ -12,6 +12,12 @@ from sky.skylet import events
from sky.utils import common_utils
from sky.utils import yaml_utils

# 1. TODO(lloyd): The tests below are marked no_remote_server since PR#7332
# changed the resource management logic for pools, reducing the number of
# concurrent pools that can run. This leads to build failures on the shared
# GKE test cluster. Remove this when consolidation mode is enabled by default
# or we have an option to disallow shared-env tests.

_LAUNCH_POOL_AND_CHECK_SUCCESS = (
's=$(sky jobs pool apply -p {pool_name} {pool_yaml} -y); '
'echo "$s"; '
@@ -306,6 +312,7 @@ def get_worker_cluster_name(pool_name: str, worker_id: int):
@pytest.mark.parametrize('accelerator', [{'do': 'H100', 'nebius': 'L40S'}])
@pytest.mark.skip(
'Skipping vllm pool test until more remote server testing is done.')
@pytest.mark.no_remote_server # see note 1 above
def test_vllm_pool(generic_cloud: str, accelerator: Dict[str, str]):
if generic_cloud == 'kubernetes':
accelerator = smoke_tests_utils.get_avaliabe_gpus_for_k8s_tests()
@@ -436,6 +443,7 @@ def test_vllm_pool(generic_cloud: str, accelerator: Dict[str, str]):
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_setup_logs_in_starting_pool(generic_cloud: str):
"""Test that setup logs are streamed in starting state."""
# Do a very long setup so we know the setup logs are streamed in
@@ -461,6 +469,7 @@ def test_setup_logs_in_starting_pool(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_setup_logs_in_pool_exits(generic_cloud: str):
"""Test that setup logs are streamed and exit once the setup is complete.

We omit --no-follow to test that we exit."""
@@ -482,6 +491,7 @@ def test_setup_logs_in_pool_exits(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_update_workers(generic_cloud: str):
"""Test that we can update the number of workers in a pool, both
up and down.
@@ -511,6 +521,7 @@ def test_update_workers(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_update_workers_and_yaml(generic_cloud: str):
"""Test that we error if the user specifies a yaml and --workers.
"""
@@ -530,6 +541,7 @@ def test_update_workers_and_yaml(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_update_workers_no_pool(generic_cloud: str):
"""Test that we error if the user specifies a yaml and --workers.
"""
@@ -549,6 +561,7 @@ def test_update_workers_no_pool(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_pool_queueing(generic_cloud: str):
timeout = smoke_tests_utils.get_timeout(generic_cloud)
pool_config = basic_pool_conf(num_workers=1,
@@ -588,6 +601,7 @@ def test_pool_queueing(generic_cloud: str):


@pytest.mark.aws
@pytest.mark.no_remote_server # see note 1 above
def test_pool_preemption(generic_cloud: str):
region = 'us-east-2'
name = smoke_tests_utils.get_cluster_name()
@@ -642,6 +656,7 @@ def test_pool_preemption(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_pool_job_cancel_running(generic_cloud: str):
timeout = smoke_tests_utils.get_timeout(generic_cloud)
pool_config = basic_pool_conf(num_workers=1, infra=generic_cloud)
@@ -683,6 +698,7 @@ def test_pool_job_cancel_running(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_pool_job_cancel_instant(generic_cloud: str):
timeout = smoke_tests_utils.get_timeout(generic_cloud)
pool_config = basic_pool_conf(num_workers=1, infra=generic_cloud)
@@ -722,6 +738,7 @@ def test_pool_job_cancel_instant(generic_cloud: str):


@pytest.mark.aws
@pytest.mark.no_remote_server # see note 1 above
def test_pool_job_cancel_recovery(generic_cloud: str):
region = 'us-east-2'
name = smoke_tests_utils.get_cluster_name()
@@ -781,6 +798,7 @@ def test_pool_job_cancel_recovery(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_pool_job_cancel_running_multiple(generic_cloud: str):
num_jobs = 4
timeout = smoke_tests_utils.get_timeout(generic_cloud)
@@ -837,6 +855,7 @@ def test_pool_job_cancel_running_multiple(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_pool_job_cancel_running_multiple_simultaneous(generic_cloud: str):
timeout = smoke_tests_utils.get_timeout(generic_cloud)
num_jobs = 4
@@ -891,6 +910,7 @@ def test_pool_job_cancel_running_multiple_simultaneous(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_pool_job_cancel_instant_multiple(generic_cloud: str):
timeout = smoke_tests_utils.get_timeout(generic_cloud)
num_jobs = 4
@@ -941,6 +961,7 @@ def test_pool_job_cancel_instant_multiple(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_pool_job_cancel_instant_multiple_simultaneous(generic_cloud: str):
timeout = smoke_tests_utils.get_timeout(generic_cloud)
num_jobs = 4
@@ -988,6 +1009,7 @@ def test_pool_job_cancel_instant_multiple_simultaneous(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_pools_job_cancel_no_jobs(generic_cloud: str):
timeout = smoke_tests_utils.get_timeout(generic_cloud)
pool_config = basic_pool_conf(num_workers=1, infra=generic_cloud)
@@ -1013,6 +1035,7 @@ def test_pools_job_cancel_no_jobs(generic_cloud: str):


# TODO(Lloyd): Remove once heterogeneous pools are supported.
@pytest.mark.no_remote_server # see note 1 above
def test_heterogeneous_pool(generic_cloud: str):
name = smoke_tests_utils.get_cluster_name()
pool_name = f'{name}-pool'
@@ -1034,6 +1057,7 @@ def test_heterogeneous_pool(generic_cloud: str):


# TODO: Remove once heterogeneous pools are supported.
@pytest.mark.no_remote_server # see note 1 above
def test_heterogeneous_pool_counts(generic_cloud: str):
name = smoke_tests_utils.get_cluster_name()
pool_name = f'{name}-pool'
@@ -1054,8 +1078,7 @@ def test_heterogeneous_pool_counts(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


# This test is failing on the shared GKE Postgres test cluster; remove this marker once the failure is fixed.
@pytest.mark.no_remote_server
@pytest.mark.no_remote_server # see note 1 above
def test_pools_num_jobs_basic(generic_cloud: str):
name = smoke_tests_utils.get_cluster_name()
pool_name = f'{name}-pool'
@@ -1096,6 +1119,7 @@ def test_pools_num_jobs_basic(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_pool_worker_assignment_in_queue(generic_cloud: str):
"""Test that sky jobs queue shows the worker assignment for running jobs."""
timeout = smoke_tests_utils.get_timeout(generic_cloud)
@@ -1133,6 +1157,7 @@ def test_pool_worker_assignment_in_queue(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_pools_num_jobs_option(generic_cloud: str):
name = smoke_tests_utils.get_cluster_name()
pool_name = f'{name}-pool'
@@ -1170,6 +1195,7 @@ def test_pools_num_jobs_option(generic_cloud: str):


@pytest.mark.gcp
@pytest.mark.no_remote_server # see note 1 above
def test_pools_setup_num_gpus(generic_cloud: str):
"""Test that the number of GPUs is set correctly in the setup script."""
timeout = smoke_tests_utils.get_timeout(generic_cloud)
@@ -1196,6 +1222,7 @@ def test_pools_setup_num_gpus(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_pools_single_yaml(generic_cloud: str):
name = smoke_tests_utils.get_cluster_name()
pool_name = f'{name}-pool'
@@ -1223,6 +1250,7 @@ def test_pools_single_yaml(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_pools_double_launch(generic_cloud: str):
"""Test that we can launch a pool with the same name twice.
"""
@@ -1249,17 +1277,42 @@ def test_pools_double_launch(generic_cloud: str):
smoke_tests_utils.run_one_test(test)


def check_pool_not_in_status(pool_name: str):
"""Check that a pool does not appear in `sky jobs pool status`."""
return (f's=$(sky jobs pool status); '
f'echo "$s"; '
f'if echo "$s" | grep "{pool_name}"; then '
f' echo "ERROR: Pool {pool_name} still exists in pool status"; '
f' exit 1; '
f'fi; '
f'echo "Pool {pool_name} correctly removed from pool status"')
def check_pool_not_in_status(pool_name: str,
timeout: int = 30,
time_between_checks: int = 5):
"""Check that a pool does not appear in `sky jobs pool status`.
Args:
pool_name: The name of the pool to check for.
timeout: Maximum time in seconds to wait for the pool to be removed.
time_between_checks: Time in seconds to wait between checks.
"""
return (
'start_time=$SECONDS; '
'while true; do '
f'if (( $SECONDS - $start_time > {timeout} )); then '
f' echo "Timeout after {timeout} seconds waiting for pool {pool_name} to be removed"; '
f' s=$(sky jobs pool status); '
f' echo "$s"; '
f' if echo "$s" | grep "{pool_name}"; then '
f' echo "ERROR: Pool {pool_name} still exists in pool status"; '
f' exit 1; '
f' fi; '
f' exit 0; '
'fi; '
f's=$(sky jobs pool status); '
'echo "$s"; '
f'if ! echo "$s" | grep "{pool_name}"; then '
f' echo "Pool {pool_name} correctly removed from pool status"; '
' break; '
'fi; '
f'echo "Waiting for pool {pool_name} to be removed..."; '
f'sleep {time_between_checks}; '
'done')
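# Illustration only (hypothetical snippet, not part of the change): previewing the
# bash command that check_pool_not_in_status builds. 'demo-pool' and the timeout
# values are made up; the smoke tests pass this string to the shell as a test step.
preview_cmd = check_pool_not_in_status('demo-pool',
                                       timeout=30,
                                       time_between_checks=5)
print(preview_cmd)  # a bash while-loop polling `sky jobs pool status` every 5s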


@pytest.mark.resource_heavy
@pytest.mark.no_remote_server # see note 1 above
def test_pool_down_all_with_running_jobs(generic_cloud: str):
"""Test that `sky jobs pool down -a -y` cancels running jobs and removes pools.
@@ -1286,49 +1339,66 @@ def test_pool_down_all_with_running_jobs(generic_cloud: str):
run_cmd='sleep infinity',
)

with tempfile.NamedTemporaryFile(delete=True) as pool_yaml:
with tempfile.NamedTemporaryFile(delete=True) as job_yaml:
write_yaml(pool_yaml, pool_config)
write_yaml(job_yaml, job_config)

test = smoke_tests_utils.Test(
'test_pool_down_all_with_running_jobs',
[
_LAUNCH_POOL_AND_CHECK_SUCCESS.format(
pool_name=pool_name_1, pool_yaml=pool_yaml.name),
wait_until_pool_ready(pool_name_1, timeout=timeout),
_LAUNCH_POOL_AND_CHECK_SUCCESS.format(
pool_name=pool_name_2, pool_yaml=pool_yaml.name),
wait_until_pool_ready(pool_name_2, timeout=timeout),
_LAUNCH_JOB_AND_CHECK_SUCCESS_WITH_NAME.format(
pool_name=pool_name_1,
job_yaml=job_yaml.name,
job_name=job_name_1),
_LAUNCH_JOB_AND_CHECK_SUCCESS_WITH_NAME.format(
pool_name=pool_name_2,
job_yaml=job_yaml.name,
job_name=job_name_2),
wait_until_job_status(job_name_1, ['RUNNING'],
timeout=timeout),
wait_until_job_status(job_name_2, ['RUNNING'],
timeout=timeout),
'sky jobs pool down -a -y',
# Wait a bit for cancellation to propagate
'sleep 10',
wait_until_job_status(
job_name_1, ['CANCELLED'], bad_statuses=[], timeout=30),
wait_until_job_status(
job_name_2, ['CANCELLED'], bad_statuses=[], timeout=30),
check_pool_not_in_status(pool_name_1),
check_pool_not_in_status(pool_name_2),
],
timeout=timeout,
teardown=cancel_jobs_and_teardown_pool(pool_name_1, timeout=5) +
cancel_jobs_and_teardown_pool(pool_name_2, timeout=5),
)
smoke_tests_utils.run_one_test(test)
    # Configure jobs controller resources: 4+ CPUs and 32+ GB memory
controller_config = {
'jobs': {
'controller': {
'resources': {
'cpus': '4+',
'memory': '32+',
}
}
}
}

with smoke_tests_utils.override_sky_config(config_dict=controller_config):
with tempfile.NamedTemporaryFile(delete=True) as pool_yaml:
with tempfile.NamedTemporaryFile(delete=True) as job_yaml:
write_yaml(pool_yaml, pool_config)
write_yaml(job_yaml, job_config)

test = smoke_tests_utils.Test(
'test_pool_down_all_with_running_jobs',
[
_LAUNCH_POOL_AND_CHECK_SUCCESS.format(
pool_name=pool_name_1, pool_yaml=pool_yaml.name),
wait_until_pool_ready(pool_name_1, timeout=timeout),
_LAUNCH_POOL_AND_CHECK_SUCCESS.format(
pool_name=pool_name_2, pool_yaml=pool_yaml.name),
wait_until_pool_ready(pool_name_2, timeout=timeout),
_LAUNCH_JOB_AND_CHECK_SUCCESS_WITH_NAME.format(
pool_name=pool_name_1,
job_yaml=job_yaml.name,
job_name=job_name_1),
_LAUNCH_JOB_AND_CHECK_SUCCESS_WITH_NAME.format(
pool_name=pool_name_2,
job_yaml=job_yaml.name,
job_name=job_name_2),
wait_until_job_status(job_name_1, ['RUNNING'],
timeout=timeout),
wait_until_job_status(job_name_2, ['RUNNING'],
timeout=timeout),
'sky jobs pool down -a -y',
# Wait a bit for cancellation to propagate
'sleep 10',
wait_until_job_status(job_name_1, ['CANCELLED'],
bad_statuses=[],
timeout=30),
wait_until_job_status(job_name_2, ['CANCELLED'],
bad_statuses=[],
timeout=30),
check_pool_not_in_status(pool_name_1),
check_pool_not_in_status(pool_name_2),
],
timeout=timeout,
teardown=cancel_jobs_and_teardown_pool(pool_name_1,
timeout=5) +
cancel_jobs_and_teardown_pool(pool_name_2, timeout=5),
)
smoke_tests_utils.run_one_test(test)


@pytest.mark.no_remote_server # see note 1 above
def test_pool_down_single_pool(generic_cloud: str):
"""Test that `sky jobs pool down <pool_name> -y` downs a single pool.
@@ -1375,7 +1445,7 @@ def test_pool_down_single_pool(generic_cloud: str):
'sleep 10',
wait_until_job_status(
job_name, ['CANCELLED'], bad_statuses=[], timeout=30),
check_pool_not_in_status(pool_name),
check_pool_not_in_status(pool_name, timeout=30),
],
timeout=timeout,
teardown=cancel_jobs_and_teardown_pool(pool_name, timeout=5),


+ 2
- 2
tests/test_yamls/low_resource_sky_config.yaml View File

@@ -2,9 +2,9 @@ jobs:
controller:
resources:
cpus: 4+
memory: 4+
memory: 16+
serve:
controller:
resources:
cpus: 4+
memory: 4+
memory: 8+
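
For illustration, a minimal sketch of how this low-resource config could be fed to the same override mechanism used by test_pool_down_all_with_running_jobs above. Whether the harness actually consumes the YAML this way is an assumption, and PyYAML is assumed to be available.

# Hypothetical sketch: load the YAML and pass it as a dict, mirroring the inline
# controller_config override used in the pool smoke test above.
import yaml

with open('tests/test_yamls/low_resource_sky_config.yaml', encoding='utf-8') as f:
    low_resource_config = yaml.safe_load(f)

# with smoke_tests_utils.override_sky_config(config_dict=low_resource_config):
#     smoke_tests_utils.run_one_test(test)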

+ 57
- 0
tests/unit_tests/kubernetes/test_deploy_remote_cluster.py View File

@@ -0,0 +1,57 @@
"""Tests for Kubernetes remote cluster deployment."""

import argparse
from unittest import mock

from sky.utils.kubernetes import deploy_remote_cluster


def test_deploy_remote_cluster():
"""Test to check if the remote cluster is deployed successfully."""
mock_args = argparse.Namespace(
cleanup=False,
infra='test-infra',
kubeconfig_path='~/.kube/config',
use_ssh_config=False,
ssh_node_pools_file='~/.sky/ssh_node_pools.yaml')

mock_hosts_info = [{
'name': 'test-host',
'ip': '192.168.1.1',
'user': 'test-user',
'identity_file': '~/.ssh/id_rsa',
'use_ssh_config': False,
'password': 'test-password'
}]

mock_context_name = 'test-infra'

mock_cluster_config = {mock_context_name: {'hosts': ['test-host']}}

mock_ssh_targets = [{'name': mock_context_name, 'hosts': ['test-host']}]

with mock.patch('sky.utils.kubernetes.deploy_remote_cluster.parse_args') as mock_parse_args, \
mock.patch('sky.utils.kubernetes.deploy_remote_cluster.ssh_utils.load_ssh_targets') as mock_load_ssh_targets, \
mock.patch('sky.utils.kubernetes.deploy_remote_cluster.ssh_utils.get_cluster_config') as mock_get_cluster_config, \
mock.patch('sky.utils.kubernetes.deploy_remote_cluster.ssh_utils.prepare_hosts_info') as mock_prepare_hosts_info, \
mock.patch('sky.utils.kubernetes.deploy_remote_cluster.deploy_cluster') as mock_deploy_cluster:
mock_parse_args.return_value = mock_args
mock_load_ssh_targets.return_value = mock_ssh_targets
mock_get_cluster_config.return_value = mock_cluster_config
mock_prepare_hosts_info.return_value = mock_hosts_info
mock_deploy_cluster.return_value = [mock_context_name]
deploy_remote_cluster.main()
mock_deploy_cluster.assert_called_once()
mock_load_ssh_targets.assert_called_once()
mock_get_cluster_config.assert_called_once()
# Check that mock_deploy_cluster was called with context_name='ssh-test-infra'
context_name = None
expected_context_name = 'ssh-test-infra'
for call in mock_deploy_cluster.call_args_list:
# context_name is the 5th positional argument
# deploy_cluster(head_node, worker_nodes, ssh_user, ssh_key, context_name, ...)
if len(call.args) >= 5:
context_name = call.args[4]
assert context_name == expected_context_name, (
f"mock_deploy_cluster was not called with context_name='{expected_context_name}', "
f"but was called with context_name={context_name}")

+ 21
- 0
tests/unit_tests/test_sky/clouds/test_aws_cloud.py View File

@@ -752,3 +752,24 @@ class TestAwsProfileAwareLruCache:
else:
os.environ.pop(skypilot_config.ENV_VAR_SKYPILOT_CONFIG, None)
skypilot_config.reload_config()


class TestAwsConfigFileEnvVar:
"""Tests for AWS_CONFIG_FILE credential override."""

def test_get_credential_file_mounts_respects_env_override(
self, tmp_path, monkeypatch):
credential_file = tmp_path / 'aws_credentials'
credential_file.write_text('dummy')
monkeypatch.setenv('AWS_CONFIG_FILE', str(credential_file))

aws = aws_mod.AWS()
with mock.patch.object(
aws_mod.AWS,
'_current_identity_type',
return_value=aws_mod.AWSIdentityType.SHARED_CREDENTIALS_FILE):
mounts = aws.get_credential_file_mounts()

assert mounts == {
aws_mod._DEFAULT_AWS_CONFIG_PATH: str(credential_file)
}

+ 17
- 10
tests/unit_tests/test_sky/jobs/test_server_core_secrets.py View File

@@ -9,6 +9,7 @@ This test ensures that:
import os
import tempfile

from pydantic import SecretStr
import pytest

from sky import dag as dag_lib
@@ -267,11 +268,14 @@ class TestManagedJobSecrets:
loaded_task = loaded_dag.tasks[0]

# The loaded task must have real secrets for execution
assert loaded_task.secrets['API_KEY'] == 'sk-prod-api-key-12345'
assert loaded_task.secrets[
'DB_PASSWORD'] == 'prod-database-secret-password'
assert loaded_task.secrets[
'WANDB_API_KEY'] == 'wandb-secret-key-67890'
assert task_lib.get_plaintext_secrets(
loaded_task.secrets)['API_KEY'] == 'sk-prod-api-key-12345'
assert task_lib.get_plaintext_secrets(
loaded_task.secrets
)['DB_PASSWORD'] == 'prod-database-secret-password'
assert task_lib.get_plaintext_secrets(
loaded_task.secrets
)['WANDB_API_KEY'] == 'wandb-secret-key-67890'

# Environment variables should be preserved
assert loaded_task.envs['MODEL_NAME'] == 'my-model'
@@ -334,11 +338,14 @@ class TestManagedJobSecrets:
loaded_tasks = loaded_dag.tasks

assert len(loaded_tasks) == 2
assert loaded_tasks[0].secrets['DATA_API_KEY'] == 'data-api-secret-key'
assert loaded_tasks[0].secrets['S3_SECRET'] == 's3-access-secret'
assert loaded_tasks[1].secrets[
'MODEL_API_KEY'] == 'model-api-secret-key'
assert loaded_tasks[1].secrets['WANDB_KEY'] == 'wandb-logging-secret'
assert task_lib.get_plaintext_secrets(
loaded_tasks[0].secrets)['DATA_API_KEY'] == 'data-api-secret-key'
assert task_lib.get_plaintext_secrets(
loaded_tasks[0].secrets)['S3_SECRET'] == 's3-access-secret'
assert task_lib.get_plaintext_secrets(
loaded_tasks[1].secrets)['MODEL_API_KEY'] == 'model-api-secret-key'
assert task_lib.get_plaintext_secrets(
loaded_tasks[1].secrets)['WANDB_KEY'] == 'wandb-logging-secret'

def test_mixed_envs_and_secrets_job_execution(self):
"""Test that envs and secrets are handled correctly for job execution.

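Background for the assertion changes above: once secrets are stored as SecretStr, comparing a stored value directly against a plain string no longer passes, which is why the plaintext is extracted first. A standalone pydantic sketch, not SkyPilot code; the literal is borrowed from the test above.

from pydantic import SecretStr

secret = SecretStr('sk-prod-api-key-12345')
print(secret)  # masked as **********
assert secret != 'sk-prod-api-key-12345'  # direct comparison with a str fails
assert secret.get_secret_value() == 'sk-prod-api-key-12345'  # explicit unwrap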

+ 12
- 19
tests/unit_tests/test_sky/jobs/test_state.py View File

@@ -88,8 +88,7 @@ def _insert_job_info(engine,
return job_id


@pytest.mark.asyncio
async def test_get_task_logs_to_clean_async_basic(_mock_managed_jobs_db_conn):
def test_get_task_logs_to_clean_basic(_mock_managed_jobs_db_conn):
now = time.time()
retention = 60

@@ -156,7 +155,7 @@ async def test_get_task_logs_to_clean_async_basic(_mock_managed_jobs_db_conn):

state.scheduler_set_done(job_id)

res = await state.get_task_logs_to_clean_async(retention, batch_size=10)
res = state.get_task_logs_to_clean(retention, batch_size=10)
# Only task 0 should be returned
assert len(res) == 1
assert res[0]['job_id'] == job_id
@@ -183,12 +182,11 @@ async def test_get_task_logs_to_clean_async_basic(_mock_managed_jobs_db_conn):
logs_cleaned_at=None,
)

res2 = await state.get_task_logs_to_clean_async(retention, batch_size=2)
res2 = state.get_task_logs_to_clean(retention, batch_size=2)
assert len(res2) == 2 # limited by batch size


@pytest.mark.asyncio
async def test_set_task_logs_cleaned_async(_mock_managed_jobs_db_conn):
def test_set_task_logs_cleaned(_mock_managed_jobs_db_conn):
now = time.time()
retention = 60

@@ -212,11 +210,11 @@ async def test_set_task_logs_cleaned_async(_mock_managed_jobs_db_conn):

state.scheduler_set_done(job_id)

res = await state.get_task_logs_to_clean_async(retention, batch_size=10)
res = state.get_task_logs_to_clean(retention, batch_size=10)
assert len(res) == 1

ts = now
await state.set_task_logs_cleaned_async([(job_id, 0)], ts)
state.set_task_logs_cleaned([(job_id, 0)], ts)

# Verify updated
with orm.Session(state._SQLALCHEMY_ENGINE) as session:
@@ -229,13 +227,11 @@ async def test_set_task_logs_cleaned_async(_mock_managed_jobs_db_conn):
assert row[0] == ts

# Should no longer be returned
res2 = await state.get_task_logs_to_clean_async(retention, batch_size=10)
res2 = state.get_task_logs_to_clean(retention, batch_size=10)
assert res2 == []


@pytest.mark.asyncio
async def test_get_controller_logs_to_clean_async_basic(
_mock_managed_jobs_db_conn):
def test_get_controller_logs_to_clean_basic(_mock_managed_jobs_db_conn):
now = time.time()
retention = 60

@@ -304,8 +300,7 @@ async def test_get_controller_logs_to_clean_async_basic(
)
state.scheduler_set_done(job_d)

res = await state.get_controller_logs_to_clean_async(retention,
batch_size=10)
res = state.get_controller_logs_to_clean(retention, batch_size=10)
job_ids = {r['job_id'] for r in res}
assert job_ids == {job_a}

@@ -335,19 +330,17 @@ async def test_get_controller_logs_to_clean_async_basic(
)
state.scheduler_set_done(job_f)

res2 = await state.get_controller_logs_to_clean_async(retention,
batch_size=2)
res2 = state.get_controller_logs_to_clean(retention, batch_size=2)
assert len(res2) == 2


@pytest.mark.asyncio
async def test_set_controller_logs_cleaned_async(_mock_managed_jobs_db_conn):
def test_set_controller_logs_cleaned(_mock_managed_jobs_db_conn):
now = time.time()

job_id = _insert_job_info(state._SQLALCHEMY_ENGINE,
controller_logs_cleaned_at=None)

await state.set_controller_logs_cleaned_async([job_id], now)
state.set_controller_logs_cleaned([job_id], now)

with orm.Session(state._SQLALCHEMY_ENGINE) as session:
row = session.execute(


+ 125
- 1
tests/unit_tests/test_sky/server/requests/test_executor.py View File

@@ -1,7 +1,9 @@
"""Unit tests for sky.server.requests.executor module."""
import asyncio
import concurrent.futures
import functools
import os
import queue as queue_lib
import time
from typing import List
from unittest import mock
@@ -10,6 +12,7 @@ import pytest

from sky import exceptions
from sky import skypilot_config
from sky.server import config as server_config
from sky.server import constants as server_constants
from sky.server.requests import executor
from sky.server.requests import payloads
@@ -336,7 +339,7 @@ async def test_execute_with_isolated_env_and_config(isolated_database,
os.environ.pop('TEST_VAR_A', None)


FAKE_FD_START = 100
FAKE_FD_START = 100000


def _get_saved_fd_close_count(close_calls: List[int], created_fds: set) -> int:
@@ -452,6 +455,11 @@ def _keyboard_interrupt_entrypoint():
raise KeyboardInterrupt()


def _dummy_entrypoint_for_retry_test():
"""Dummy entrypoint for retry test that can be pickled."""
return None
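
# Note (illustration, not part of the change): the entrypoint is defined at module
# level because it must be picklable, presumably so it can be dispatched to a
# worker process.
import pickle

pickle.dumps(_dummy_entrypoint_for_retry_test)  # works: importable top-level name
# pickle.dumps(lambda: None)  # would fail with pickle.PicklingError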


@pytest.mark.asyncio
@pytest.mark.parametrize('test_case', [
pytest.param(
@@ -518,3 +526,119 @@ async def test_stdout_stderr_restoration(mock_fd_operations, test_case):
# Verify no double-close
_assert_no_double_close(mock_fd_operations['close_calls'],
mock_fd_operations['created_fds'])


@pytest.mark.asyncio
async def test_request_worker_retry_execution_retryable_error(
isolated_database, monkeypatch):
"""Test that RequestWorker retries requests when ExecutionRetryableError is raised."""
# Create a request in the database
request_id = 'test-retry-request'
request = requests_lib.Request(
request_id=request_id,
name='test-request',
entrypoint=
_dummy_entrypoint_for_retry_test, # Won't be called in this test
request_body=payloads.RequestBody(),
status=requests_lib.RequestStatus.RUNNING,
created_at=time.time(),
user_id='test-user',
)
await requests_lib.create_if_not_exists_async(request)

# Create a mock queue that tracks puts
queue_items = []
mock_queue = queue_lib.Queue()

class MockRequestQueue:

def __init__(self, queue):
self.queue = queue

def get(self):
try:
return self.queue.get(block=False)
except queue_lib.Empty:
return None

def put(self, item):
queue_items.append(item)
self.queue.put(item)

request_queue = MockRequestQueue(mock_queue)

# Mock _get_queue to return our mock queue
def mock_get_queue(schedule_type):
return request_queue

monkeypatch.setattr(executor, '_get_queue', mock_get_queue)

    # Mock time.sleep to record the requested wait times without actually sleeping
sleep_calls = []

def mock_sleep(seconds):
sleep_calls.append(seconds)

monkeypatch.setattr('time.sleep', mock_sleep)

# Create a mock executor that tracks submit_until_success calls
submit_calls = []

class MockExecutor:

def submit_until_success(self, fn, *args, **kwargs):
submit_calls.append((fn, args, kwargs))
# Return a future that immediately completes (does nothing)
fut = concurrent.futures.Future()
fut.set_result(None)
return fut

mock_executor = MockExecutor()

# Create a RequestWorker
worker = executor.RequestWorker(
schedule_type=requests_lib.ScheduleType.LONG,
config=server_config.WorkerConfig(garanteed_parallelism=1,
burstable_parallelism=0,
num_db_connections_per_worker=0))

# Create a future that raises ExecutionRetryableError
retryable_error = exceptions.ExecutionRetryableError(
'Failed to provision all possible launchable resources.',
hint='Retry after 30s',
retry_wait_seconds=30)
fut = concurrent.futures.Future()
fut.set_exception(retryable_error)

# Create request_element tuple
request_element = (request_id, False, True
) # (request_id, ignore_return_value, retryable)

# Call handle_task_result - this should catch the exception and reschedule
worker.handle_task_result(fut, request_element)

# Verify the request was put back on the queue
    assert queue_items == [
        request_element
    ], (f'Expected {request_element} to be put on queue, got {queue_items}')

    # Verify time.sleep was called exactly once with the retry wait time (30s)
    assert sleep_calls == [
        30
    ], (f'Expected a single time.sleep(30) call, got {sleep_calls}')

# Verify the request status was reset to PENDING
updated_request = requests_lib.get_request(request_id, fields=['status'])
assert updated_request is not None
assert updated_request.status == requests_lib.RequestStatus.PENDING, (
f'Expected request status to be PENDING, got {updated_request.status}')

# Call process_request - it should pick up the request from the queue
# and call submit_until_success
worker.process_request(mock_executor, request_queue)

# Verify submit_until_success was called
assert len(submit_calls) == 1, (
f'Expected submit_until_success to be called once, got {len(submit_calls)} calls'
)

+ 0
- 2
tests/unit_tests/test_sky/server/requests/test_payloads.py View File

@@ -17,7 +17,6 @@ def test_request_body_env_vars_includes_expected_keys(monkeypatch):
'/tmp/project.yaml')
monkeypatch.setenv(constants.ENV_VAR_DB_CONNECTION_URI, 'db-uri')

payloads.request_body_env_vars.cache_clear()
monkeypatch.setattr(payloads.common, 'is_api_server_local', lambda: True)
local_env = payloads.request_body_env_vars()
assert server_env not in local_env
@@ -27,7 +26,6 @@ def test_request_body_env_vars_includes_expected_keys(monkeypatch):
assert skypilot_config.ENV_VAR_GLOBAL_CONFIG not in local_env
assert skypilot_config.ENV_VAR_PROJECT_CONFIG not in local_env

payloads.request_body_env_vars.cache_clear()
monkeypatch.setattr(payloads.common, 'is_api_server_local', lambda: False)
remote_env = payloads.request_body_env_vars()
assert 'AWS_PROFILE' not in remote_env


Some files were not shown because too many files changed in this diff
