22 Commits

Author SHA1 Message Date
  Kevin Mingtarja 8a2a4e0a86 Merge branch 'master' into fix-job-submission-sys-path 5 days ago
  Christopher Cooper afb7768566
[release] fix the helm upgrade test to work with rc versions (#8264) 5 days ago
  zpoint dd21824060
Fix aiohttp version failure in buildkite (#8267) 6 days ago
  Seung Jin e4e2d27653
[UX] Introduce pending state (#8262) 6 days ago
  zpoint 166fd63617
[Dashboard] Add some tests for the dashboard performance (#8227) 6 days ago
  Seung Jin 791356230a
[tests] misc test fixes (#8260) 6 days ago
  Christopher Cooper b5ff2fdb54
[deps] pin pycares<5 to work around aiodns issue (#8259) 6 days ago
  lloyd-brown 9f7bdf7d28
[Tests] Add Cloud Selection to CLI Smoke Tests (#8256) 6 days ago
  DanielZhangQD 6f12d52a0f
[Helm] Support updating ssh node pool config with helm chart (#8249) 1 week ago
  Aylei 3bdad29488
Fixed API server mem bench on k8s (#8254) 1 week ago
  zpoint c28d94abd9
Fix `test_nemorl` failure (#8253) 1 week ago
  zpoint 6259aa05fd
Fix smoke test failure `test_container_logs_two_jobs_kubernetes` (#8250) 1 week ago
  Kevin Mingtarja 8cc5193918
[Slurm] Unify ssh proxycommand config for run and rsync in SlurmCommandRunner (#8248) 1 week ago
  Seung Jin 343123b289
[Slurm] configurable Slurm provision timeout, set default to 10s (#8244) 1 week ago
  Kevin Mingtarja c13dfb9c04
[Slurm] Fix UV_CACHE_DIR permission issues with multiple users (#8245) 1 week ago
  Seung Jin ff928a2757
[Slurm] show Slurm infra at cluster level (as opposed to partition level) (#8246) 1 week ago
  Kevin Mingtarja 6152d193a8
[Slurm] Remove unnecessary setup commands (#8247) 1 week ago
  Kevin Mingtarja fb234495ff
[Test] Skip test_job_queue if k8s cluster has no GPUs (#8242) 1 week ago
  Seung Jin f723ff528f
[k8s] Disable ray memory monitor on k8s (#8231) 1 week ago
  Seung Jin 81892e7f86
Display default Slurm partitions first (#8239) 1 week ago
  Kevin Mingtarja 2ef047a7a4
[Test] Fix capitalization in test_kubernetes_slurm_show_gpus (#8238) 1 week ago
  mk0walsk 4deb1be77f
Update torchtune documentation links (#8237) 1 week ago
27 changed files with 876 additions and 162 deletions
Split View
  1. +5
    -9
      charts/skypilot/templates/api-deployment.yaml
  2. +459
    -1
      docs/source/reference/api-server/api-server-admin-deploy.rst
  3. +2
    -2
      llm/llama-3_1-finetuning/readme.md
  4. +0
    -3
      requirements-dev.txt
  5. +57
    -45
      sky/adaptors/slurm.py
  6. +3
    -12
      sky/clouds/slurm.py
  7. +8
    -1
      sky/core.py
  8. +16
    -2
      sky/provision/slurm/instance.py
  9. +12
    -3
      sky/provision/slurm/utils.py
  10. +15
    -6
      sky/setup_files/dependencies.py
  11. +8
    -0
      sky/templates/kubernetes-ray.yml.j2
  12. +0
    -14
      sky/templates/slurm-ray.yml.j2
  13. +38
    -26
      sky/utils/command_runner.py
  14. +3
    -0
      sky/utils/schemas.py
  15. +7
    -0
      sky/utils/status_lib.py
  16. +4
    -2
      tests/kubernetes/scripts/helm_deploy_and_verify.sh
  17. +14
    -3
      tests/kubernetes/scripts/helm_upgrade.sh
  18. +26
    -6
      tests/load_tests/db_scale_tests/test_large_production_performance.sh
  19. +6
    -2
      tests/smoke_tests/test_api_server_benchmark.py
  20. +1
    -1
      tests/smoke_tests/test_basic.py
  21. +5
    -5
      tests/smoke_tests/test_cli.py
  22. +6
    -2
      tests/smoke_tests/test_cluster_job.py
  23. +1
    -1
      tests/smoke_tests/test_examples.py
  24. +153
    -0
      tests/unit_tests/test_sky/adaptors/test_slurm_adaptor.py
  25. +19
    -13
      tests/unit_tests/test_sky/clouds/test_kubernetes.py
  26. +7
    -3
      tests/unit_tests/test_sky/clouds/test_ssh.py
  27. +1
    -0
      tests/unit_tests/test_sky/test_task.py

+ 5
- 9
charts/skypilot/templates/api-deployment.yaml View File

@@ -188,13 +188,9 @@ spec:
fi
{{- if .Values.apiService.sshNodePools }}
mkdir -p /root/.sky
# The PVC serves as the ground truth for the ssh_node_pools.yaml file, if it already exists we don't overwrite it
if [ ! -s /root/.sky/ssh_node_pools.yaml ]; then
echo "ssh_node_pools.yaml not found in /root/.sky, copying from ConfigMap \`skypilot-ssh-node-pools\`"
cp /var/skypilot/ssh_node_pool/ssh_node_pools.yaml /root/.sky/ssh_node_pools.yaml
else
echo "ssh_node_pools.yaml already exists in /root/.sky, skipping copy"
fi
echo "Linking ssh_node_pools.yaml from secret to /root/.sky/ssh_node_pools.yaml"
# The secret serves as the ground truth for the ssh_node_pools.yaml file, read-only
ln -sf /var/skypilot/ssh_node_pool/ssh_node_pools.yaml /root/.sky/ssh_node_pools.yaml
# ~/.kube/config is required to be persistent when sshNodePools is enabled, init it if it is empty to avoid parsing error.
if [ ! -s /root/.kube/config ]; then
echo "{}" > /root/.kube/config
@@ -329,7 +325,7 @@ spec:
- name: digitalocean-config
mountPath: /root/.config/doctl
readOnly: true
{{- end }}
{{- end }}
{{- if .Values.lambdaCredentials.enabled }}
- name: lambda-config
mountPath: /root/.lambda_cloud
@@ -746,4 +742,4 @@ spec:
{{- with .Values.apiService.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- end }}

+ 459
- 1
docs/source/reference/api-server/api-server-admin-deploy.rst View File

@@ -224,6 +224,30 @@ Following tabs describe how to configure credentials for different clouds on the

The specific cloud's credential for the exec-based authentication also needs to be configured. For example, to enable exec-based authentication for GKE, you also need to setup GCP credentials (see the GCP tab above).

.. dropdown:: Update Kubernetes credentials

After Kubernetes credentials are enabled, you can update the kubeconfig file in ``kube-credentials`` by:

1. Replace the existing secret in place:

.. code-block:: bash

kubectl delete secret kube-credentials --namespace $NAMESPACE
kubectl create secret generic kube-credentials \
--namespace $NAMESPACE \
--from-file=config=$HOME/.kube/config

2. It will then take tens of seconds for the change to take effect on the API server. You can verify the updated credentials in the API server pod:

.. code-block:: bash

# The NAMESPACE and RELEASE_NAME should be consistent with the API server deployment
API_SERVER_POD_NAME=$(kubectl get pods -n $NAMESPACE -l app=${RELEASE_NAME}-api -o jsonpath='{.items[0].metadata.name}')
# If `SSH Node Pools` is not enabled
kubectl exec $API_SERVER_POD_NAME -n $NAMESPACE -- cat /root/.kube/config
# If `SSH Node Pools` is enabled
kubectl exec $API_SERVER_POD_NAME -n $NAMESPACE -- cat /var/skypilot/kubeconfig/config

To use multiple Kubernetes clusters, you will need to add the context names to ``allowed_contexts`` in the SkyPilot config. An example config file that allows using the hosting Kubernetes cluster and two additional Kubernetes clusters is shown below:

.. code-block:: yaml
@@ -267,6 +291,52 @@ Following tabs describe how to configure credentials for different clouds on the
--reuse-values \
--set awsCredentials.enabled=true

.. dropdown:: Update AWS credentials (single profile)

After AWS credentials are enabled, update the access or secret key in ``aws-credentials`` using either approach:

1. Create a new secret with a new name:

.. code-block:: bash

kubectl create secret generic aws-credentials-new \
--namespace $NAMESPACE \
--from-literal=aws_access_key_id=YOUR_ACCESS_KEY_ID \
--from-literal=aws_secret_access_key=YOUR_SECRET_ACCESS_KEY

Then point Helm to the new secret name:

.. code-block:: bash

helm upgrade --install skypilot skypilot/skypilot-nightly --devel \
--namespace $NAMESPACE \
--reuse-values \
--set awsCredentials.awsSecretName=aws-credentials-new

2. Replace the existing secret in place, then restart the API server:

.. code-block:: bash

kubectl delete secret aws-credentials --namespace $NAMESPACE
kubectl create secret generic aws-credentials \
--namespace $NAMESPACE \
--from-literal=aws_access_key_id=YOUR_ACCESS_KEY_ID \
--from-literal=aws_secret_access_key=YOUR_SECRET_ACCESS_KEY

Restart the API server:

.. code-block:: bash

kubectl rollout restart deployment/$RELEASE_NAME-api-server -n $NAMESPACE

Verify the updated credentials in the API server pod:

.. code-block:: bash

# The NAMESPACE and RELEASE_NAME should be consistent with the API server deployment
API_SERVER_POD_NAME=$(kubectl get pods -n $NAMESPACE -l app=${RELEASE_NAME}-api -o jsonpath='{.items[0].metadata.name}')
kubectl exec $API_SERVER_POD_NAME -n $NAMESPACE -- cat /root/.aws/credentials

**Option 2: Multiple profiles (for multiple workspaces)**

Use this if you need different AWS profiles for different workspaces. Create a Kubernetes secret from your AWS credentials file:
@@ -288,6 +358,27 @@ Following tabs describe how to configure credentials for different clouds on the
--set awsCredentials.enabled=true \
--set awsCredentials.useCredentialsFile=true

.. dropdown:: Update AWS credentials (multiple profiles)

After AWS credentials are enabled, you can update the credentials file in ``aws-credentials`` by:

1. Replace the existing secret in place:

.. code-block:: bash

kubectl delete secret aws-credentials --namespace $NAMESPACE
kubectl create secret generic aws-credentials \
--namespace $NAMESPACE \
--from-file=credentials=$HOME/.aws/credentials

2. It will then take tens of seconds for the change to take effect on the API server. You can verify the updated credentials in the API server pod:

.. code-block:: bash

# The NAMESPACE and RELEASE_NAME should be consistent with the API server deployment
API_SERVER_POD_NAME=$(kubectl get pods -n $NAMESPACE -l app=${RELEASE_NAME}-api -o jsonpath='{.items[0].metadata.name}')
kubectl exec $API_SERVER_POD_NAME -n $NAMESPACE -- cat /root/.aws/credentials

.. dropdown:: Use existing AWS credentials

You can also set the following values to use a secret that already contains your AWS credentials:
@@ -352,6 +443,50 @@ Following tabs describe how to configure credentials for different clouds on the
--set gcpCredentials.enabled=true \
--set gcpCredentials.gcpSecretName=your_secret_name

.. dropdown:: Update GCP credentials

After GCP credentials are enabled, you can update the credentials file in ``gcp-credentials`` using either approach:

1. Create a new secret with a new name:

.. code-block:: bash

kubectl create secret generic gcp-credentials-new \
--namespace $NAMESPACE \
--from-file=gcp-cred.json=YOUR_SERVICE_ACCOUNT_JSON_KEY_NEW.json

Then point Helm to the new secret name:

.. code-block:: bash

helm upgrade --install skypilot skypilot/skypilot-nightly --devel \
--namespace $NAMESPACE \
--reuse-values \
--set gcpCredentials.gcpSecretName=gcp-credentials-new

2. Replace the existing secret in place, then restart the API server:

.. code-block:: bash

kubectl delete secret gcp-credentials --namespace $NAMESPACE
kubectl create secret generic gcp-credentials \
--namespace $NAMESPACE \
--from-file=gcp-cred.json=YOUR_SERVICE_ACCOUNT_JSON_KEY.json

Restart the API server:

.. code-block:: bash

kubectl rollout restart deployment/$RELEASE_NAME-api-server -n $NAMESPACE

Verify the updated credentials in the API server pod:

.. code-block:: bash

# The NAMESPACE and RELEASE_NAME should be consistent with the API server deployment
API_SERVER_POD_NAME=$(kubectl get pods -n $NAMESPACE -l app=${RELEASE_NAME}-api -o jsonpath='{.items[0].metadata.name}')
kubectl exec $API_SERVER_POD_NAME -n $NAMESPACE -- ls -lart /root/.config/gcloud

.. tab-item:: RunPod
:sync: runpod-creds-tab

@@ -380,6 +515,50 @@ Following tabs describe how to configure credentials for different clouds on the
--set runpodCredentials.enabled=true \
--set runpodCredentials.runpodSecretName=your_secret_name

.. dropdown:: Update RunPod credentials

After RunPod credentials are enabled, you can update the API key in ``runpod-credentials`` using either approach:

1. Create a new secret with a new name:

.. code-block:: bash

kubectl create secret generic runpod-credentials-new \
--namespace $NAMESPACE \
--from-literal api_key=YOUR_API_KEY_NEW

Then point Helm to the new secret name:

.. code-block:: bash

helm upgrade --install skypilot skypilot/skypilot-nightly --devel \
--namespace $NAMESPACE \
--reuse-values \
--set runpodCredentials.runpodSecretName=runpod-credentials-new

2. Replace the existing secret in place, then restart the API server:

.. code-block:: bash

kubectl delete secret runpod-credentials --namespace $NAMESPACE
kubectl create secret generic runpod-credentials \
--namespace $NAMESPACE \
--from-literal api_key=YOUR_API_KEY

Restart the API server:

.. code-block:: bash

kubectl rollout restart deployment/$RELEASE_NAME-api-server -n $NAMESPACE

Verify the updated credentials in the API server pod:

.. code-block:: bash

# The NAMESPACE and RELEASE_NAME should be consistent with the API server deployment
API_SERVER_POD_NAME=$(kubectl get pods -n $NAMESPACE -l app=${RELEASE_NAME}-api -o jsonpath='{.items[0].metadata.name}')
kubectl exec $API_SERVER_POD_NAME -n $NAMESPACE -- cat /root/.runpod/config.toml

.. tab-item:: Lambda
:sync: lambda-creds-tab

@@ -416,6 +595,50 @@ Following tabs describe how to configure credentials for different clouds on the
--set lambdaCredentials.enabled=true \
--set lambdaCredentials.lambdaSecretName=your_secret_name

.. dropdown:: Update Lambda credentials

After Lambda credentials are enabled, you can update the API key in ``lambda-credentials`` using either approach:

1. Create a new secret with a new name:

.. code-block:: bash

kubectl create secret generic lambda-credentials-new \
--namespace $NAMESPACE \
--from-literal api_key=YOUR_API_KEY_NEW

Then point Helm to the new secret name:

.. code-block:: bash

helm upgrade --install skypilot skypilot/skypilot-nightly --devel \
--namespace $NAMESPACE \
--reuse-values \
--set lambdaCredentials.lambdaSecretName=lambda-credentials-new

2. Replace the existing secret in place, then restart the API server:

.. code-block:: bash

kubectl delete secret lambda-credentials --namespace $NAMESPACE
kubectl create secret generic lambda-credentials \
--namespace $NAMESPACE \
--from-literal api_key=YOUR_API_KEY

Restart the API server:

.. code-block:: bash

kubectl rollout restart deployment/$RELEASE_NAME-api-server -n $NAMESPACE

Verify the updated credentials in the API server pod:

.. code-block:: bash

# The NAMESPACE and RELEASE_NAME should be consistent with the API server deployment
API_SERVER_POD_NAME=$(kubectl get pods -n $NAMESPACE -l app=${RELEASE_NAME}-api -o jsonpath='{.items[0].metadata.name}')
kubectl exec $API_SERVER_POD_NAME -n $NAMESPACE -- cat /root/.lambda_cloud/lambda_keys

.. tab-item:: Nebius
:sync: nebius-creds-tab

@@ -480,6 +703,27 @@ Following tabs describe how to configure credentials for different clouds on the
--set nebiusCredentials.enabled=true \
--set nebiusCredentials.nebiusSecretName=your_secret_name

.. dropdown:: Update Nebius credentials

After Nebius credentials are enabled, you can update the credentials file in ``nebius-credentials`` by:

1. Replace the existing secret in place:

.. code-block:: bash

kubectl delete secret nebius-credentials --namespace $NAMESPACE
kubectl create secret generic nebius-credentials \
--namespace $NAMESPACE \
--from-file=credentials=$HOME/.nebius/credentials.json

2. It will then take tens of seconds for the change to take effect on the API server. You can verify the updated credentials in the API server pod:

.. code-block:: bash

# The NAMESPACE and RELEASE_NAME should be consistent with the API server deployment
API_SERVER_POD_NAME=$(kubectl get pods -n $NAMESPACE -l app=${RELEASE_NAME}-api -o jsonpath='{.items[0].metadata.name}')
kubectl exec $API_SERVER_POD_NAME -n $NAMESPACE -- cat /root/.nebius/credentials.json

.. tab-item:: Vast
:sync: vast-creds-tab

@@ -516,6 +760,49 @@ Following tabs describe how to configure credentials for different clouds on the
--set vastCredentials.enabled=true \
--set vastCredentials.vastSecretName=your_secret_name

.. dropdown:: Update Vast credentials

After Vast credentials are enabled, you can update the API key in ``vast-credentials`` using either approach:

1. Create a new secret with a new name:

.. code-block:: bash

kubectl create secret generic vast-credentials-new \
--namespace $NAMESPACE \
--from-literal api_key=YOUR_API_KEY_NEW

Then point Helm to the new secret name:

.. code-block:: bash

helm upgrade --install skypilot skypilot/skypilot-nightly --devel \
--namespace $NAMESPACE \
--reuse-values \
--set vastCredentials.vastSecretName=vast-credentials-new

2. Replace the existing secret in place, then restart the API server:

.. code-block:: bash

kubectl delete secret vast-credentials --namespace $NAMESPACE
kubectl create secret generic vast-credentials \
--namespace $NAMESPACE \
--from-literal api_key=YOUR_API_KEY

Restart the API server:

.. code-block:: bash

kubectl rollout restart deployment/$RELEASE_NAME-api-server -n $NAMESPACE

Verify the updated credentials in the API server pod:

.. code-block:: bash

# The NAMESPACE and RELEASE_NAME should be consistent with the API server deployment
API_SERVER_POD_NAME=$(kubectl get pods -n $NAMESPACE -l app=${RELEASE_NAME}-api -o jsonpath='{.items[0].metadata.name}')
kubectl exec $API_SERVER_POD_NAME -n $NAMESPACE -- cat /root/.config/vastai/vast_api_key

.. tab-item:: SSH Node Pools
:sync: ssh-node-pools-tab
@@ -532,6 +819,17 @@ Following tabs describe how to configure credentials for different clouds on the
--reuse-values \
--set-file apiService.sshNodePools=/your/path/to/ssh_node_pools.yaml

.. note::

Updating the value of ``apiService.sshNodePools`` does not restart the API server, but the change takes tens of seconds to propagate to it.
You can verify the config updates on the API server by running the following command:

.. code-block:: bash

# The NAMESPACE and RELEASE_NAME should be consistent with the API server deployment
API_SERVER_POD_NAME=$(kubectl get pods -n $NAMESPACE -l app=${RELEASE_NAME}-api -o jsonpath='{.items[0].metadata.name}')
kubectl exec $API_SERVER_POD_NAME -n $NAMESPACE -- cat /root/.sky/ssh_node_pools.yaml

If your ``ssh_node_pools.yaml`` requires SSH keys, create a secret that contains the keys and set the :ref:`apiService.sshKeySecret <helm-values-apiService-sshKeySecret>` to the secret name:

.. code-block:: bash
@@ -551,6 +849,28 @@ Following tabs describe how to configure credentials for different clouds on the
--reuse-values \
--set apiService.sshKeySecret=$SECRET_NAME

.. dropdown:: Update SSH key credentials

After SSH key credentials are enabled, you can update the credentials file in ``$SECRET_NAME`` by:

1. Replace the existing secret in place:

.. code-block:: bash

kubectl delete secret $SECRET_NAME --namespace $NAMESPACE
kubectl create secret generic $SECRET_NAME \
--namespace $NAMESPACE \
--from-file=id_rsa=/path/to/id_rsa \
--from-file=other_id_rsa=/path/to/other_id_rsa

2. It will then take tens of seconds for the change to take effect on the API server. You can verify the updated credentials in the API server pod:

.. code-block:: bash

# The NAMESPACE and RELEASE_NAME should be consistent with the API server deployment
API_SERVER_POD_NAME=$(kubectl get pods -n $NAMESPACE -l app=${RELEASE_NAME}-api -o jsonpath='{.items[0].metadata.name}')
kubectl exec $API_SERVER_POD_NAME -n $NAMESPACE -- ls -lart /root/.ssh/

After the API server is deployed, use the ``sky ssh up`` command to set up the SSH Node Pools. Refer to :ref:`existing-machines` for more details.

.. note::
@@ -582,6 +902,53 @@ Following tabs describe how to configure credentials for different clouds on the
--set r2Credentials.enabled=true \
--set r2Credentials.r2SecretName=r2-credentials

.. dropdown:: Update Cloudflare R2 credentials

After Cloudflare R2 credentials are enabled, you can update the credentials file in ``r2-credentials`` using either approach:

1. Create a new secret with a new name:

.. code-block:: bash

kubectl create secret generic r2-credentials-new \
--namespace $NAMESPACE \
--from-file=r2.credentials=$HOME/.cloudflare/r2.credentials \
--from-file=accountid=$HOME/.cloudflare/accountid

Then point Helm to the new secret name:

.. code-block:: bash

helm upgrade --install skypilot skypilot/skypilot-nightly --devel \
--namespace $NAMESPACE \
--reuse-values \
--set r2Credentials.r2SecretName=r2-credentials-new

2. Replace the existing secret in place, then restart the API server:

.. code-block:: bash

kubectl delete secret r2-credentials --namespace $NAMESPACE
kubectl create secret generic r2-credentials \
--namespace $NAMESPACE \
--from-file=r2.credentials=$HOME/.cloudflare/r2.credentials \
--from-file=accountid=$HOME/.cloudflare/accountid

Restart the API server:

.. code-block:: bash

kubectl rollout restart deployment/$RELEASE_NAME-api-server -n $NAMESPACE

Verify the updated credentials in the API server pod:

.. code-block:: bash

# The NAMESPACE and RELEASE_NAME should be consistent with the API server deployment
API_SERVER_POD_NAME=$(kubectl get pods -n $NAMESPACE -l app=${RELEASE_NAME}-api -o jsonpath='{.items[0].metadata.name}')
kubectl exec $API_SERVER_POD_NAME -n $NAMESPACE -- cat /root/.cloudflare/r2.credentials
kubectl exec $API_SERVER_POD_NAME -n $NAMESPACE -- cat /root/.cloudflare/accountid

.. tab-item:: CoreWeave
:sync: coreweave-creds-tab

@@ -619,6 +986,53 @@ Following tabs describe how to configure credentials for different clouds on the
--set coreweaveCredentials.enabled=true \
--set coreweaveCredentials.coreweaveSecretName=your_secret_name

.. dropdown:: Update CoreWeave CAIOS credentials

After CoreWeave CAIOS credentials are enabled, you can update the credentials file in ``coreweave-credentials`` using either approach:

1. Create a new secret with a new name:

.. code-block:: bash

kubectl create secret generic coreweave-credentials-new \
--namespace $NAMESPACE \
--from-file=cw.config=$HOME/.coreweave/cw.config \
--from-file=cw.credentials=$HOME/.coreweave/cw.credentials

Then point Helm to the new secret name:

.. code-block:: bash

helm upgrade --install skypilot skypilot/skypilot-nightly --devel \
--namespace $NAMESPACE \
--reuse-values \
--set coreweaveCredentials.coreweaveSecretName=coreweave-credentials-new

2. Replace the existing secret in place, then restart the API server:

.. code-block:: bash

kubectl delete secret coreweave-credentials --namespace $NAMESPACE
kubectl create secret generic coreweave-credentials \
--namespace $NAMESPACE \
--from-file=cw.config=$HOME/.coreweave/cw.config \
--from-file=cw.credentials=$HOME/.coreweave/cw.credentials

Restart the API server:

.. code-block:: bash

kubectl rollout restart deployment/$RELEASE_NAME-api-server -n $NAMESPACE

Verify the updated credentials in the API server pod:

.. code-block:: bash

# The NAMESPACE and RELEASE_NAME should be consistent with the API server deployment
API_SERVER_POD_NAME=$(kubectl get pods -n $NAMESPACE -l app=${RELEASE_NAME}-api -o jsonpath='{.items[0].metadata.name}')
kubectl exec $API_SERVER_POD_NAME -n $NAMESPACE -- cat /root/.coreweave/cw.config
kubectl exec $API_SERVER_POD_NAME -n $NAMESPACE -- cat /root/.coreweave/cw.credentials

.. tab-item:: DigitalOcean
:sync: digitalocean-creds-tab

@@ -642,7 +1056,7 @@ Following tabs describe how to configure credentials for different clouds on the
You can also set the following values to use a secret that already contains your DigitalOcean API key:

.. code-block:: bash
# TODO: replace with your secret name
# if secret name is not provided, secret name defaults to `digitalocean-credentials`
helm upgrade --install skypilot skypilot/skypilot-nightly --devel \
@@ -651,6 +1065,50 @@ Following tabs describe how to configure credentials for different clouds on the
--set digitaloceanCredentials.enabled=true \
--set digitaloceanCredentials.digitaloceanSecretName=your_secret_name

.. dropdown:: Update DigitalOcean credentials

After DigitalOcean credentials are enabled, you can update the API key in ``digitalocean-credentials`` using either approach:

1. Create a new secret with a new name:

.. code-block:: bash

kubectl create secret generic digitalocean-credentials-new \
--namespace $NAMESPACE \
--from-literal api_key=YOUR_API_KEY_NEW

Then point Helm to the new secret name:

.. code-block:: bash

helm upgrade --install skypilot skypilot/skypilot-nightly --devel \
--namespace $NAMESPACE \
--reuse-values \
--set digitaloceanCredentials.digitaloceanSecretName=digitalocean-credentials-new

2. Replace the existing secret in place, then restart the API server:

.. code-block:: bash

kubectl delete secret digitalocean-credentials --namespace $NAMESPACE
kubectl create secret generic digitalocean-credentials \
--namespace $NAMESPACE \
--from-literal api_key=YOUR_API_KEY

Restart the API server:

.. code-block:: bash

kubectl rollout restart deployment/$RELEASE_NAME-api-server -n $NAMESPACE

Verify the updated credentials in the API server pod:

.. code-block:: bash

# The NAMESPACE and RELEASE_NAME should be consistent with the API server deployment
API_SERVER_POD_NAME=$(kubectl get pods -n $NAMESPACE -l app=${RELEASE_NAME}-api -o jsonpath='{.items[0].metadata.name}')
kubectl exec $API_SERVER_POD_NAME -n $NAMESPACE -- cat /root/.config/doctl/config.yaml

.. tab-item:: Other clouds
:sync: other-clouds-tab



+ 2
- 2
llm/llama-3_1-finetuning/readme.md View File

@@ -7,7 +7,7 @@

On July 23, 2024, Meta released the [Llama 3.1 model family](https://ai.meta.com/blog/meta-llama-3-1/), including a 405B parameter model in both base model and instruction-tuned forms. Llama 3.1 405B became _the first open LLM that closely rivals top proprietary models_ like GPT-4o and Claude 3.5 Sonnet.

This guide shows how to use [SkyPilot](https://github.com/skypilot-org/skypilot) and [torchtune](https://pytorch.org/torchtune/stable/index.html) to **finetune Llama 3.1 on your own data and infra**. Everything is packaged in a simple [SkyPilot YAML](https://docs.skypilot.co/en/latest/getting-started/quickstart.html), that can be launched with one command on your infra:
This guide shows how to use [SkyPilot](https://github.com/skypilot-org/skypilot) and [torchtune](https://meta-pytorch.org/torchtune/stable/index.html) to **finetune Llama 3.1 on your own data and infra**. Everything is packaged in a simple [SkyPilot YAML](https://docs.skypilot.co/en/latest/getting-started/quickstart.html), that can be launched with one command on your infra:
- Local GPU workstation
- Kubernetes cluster
- Cloud accounts ([12 clouds supported](https://docs.skypilot.co/en/latest/getting-started/installation.html))
@@ -20,7 +20,7 @@ This guide shows how to use [SkyPilot](https://github.com/skypilot-org/skypilot)


## Let's finetune Llama 3.1
We will use [torchtune](https://pytorch.org/torchtune/stable/index.html) to finetune Llama 3.1. The example below uses the [`yahma/alpaca-cleaned`](https://huggingface.co/datasets/yahma/alpaca-cleaned) dataset, which you can replace with your own dataset later.
We will use [torchtune](https://meta-pytorch.org/torchtune/stable/index.html) to finetune Llama 3.1. The example below uses the [`yahma/alpaca-cleaned`](https://huggingface.co/datasets/yahma/alpaca-cleaned) dataset, which you can replace with your own dataset later.

To set up the environment for launching the finetuning job, finish the [Appendix: Preparation](#appendix-preparation) section first.



+ 0
- 3
requirements-dev.txt View File

@@ -40,9 +40,6 @@ buildkite-test-collector
# memory profiler
memory_profiler==0.61.0

# For testing SkyServe
aiohttp==3.9.3

# For mocking AWS
moto==5.1.2



+ 57
- 45
sky/adaptors/slurm.py View File

@@ -19,6 +19,15 @@ SEP = r'\x1f'
# Matches PartitionName=<name> and captures until the next field
_PARTITION_NAME_REGEX = re.compile(r'PartitionName=(.+?)(?:\s+\w+=|$)')

# Default timeout for waiting for job nodes to be allocated, in seconds.
_SLURM_DEFAULT_PROVISION_TIMEOUT = 10


class SlurmPartition(NamedTuple):
"""Information about the Slurm partitions."""
name: str
is_default: bool


# TODO(kevin): Add more API types for other client functions.
class NodeInfo(NamedTuple):
@@ -274,12 +283,12 @@ class SlurmClient:
return output if output != 'None' else None

@timeline.event
def wait_for_job_nodes(self, job_id: str, timeout: int = 300) -> None:
def wait_for_job_nodes(self, job_id: str, timeout: int) -> None:
"""Wait for a Slurm job to have nodes allocated.

Args:
job_id: The Slurm job ID.
timeout: Maximum time to wait in seconds (default: 300).
timeout: Maximum time to wait in seconds.
"""
start_time = time.time()
last_state = None
@@ -322,9 +331,11 @@ class SlurmClient:
f'{timeout} seconds. Last state: {last_state}')

@timeline.event
def get_job_nodes(self,
job_id: str,
wait: bool = True) -> Tuple[List[str], List[str]]:
def get_job_nodes(
self,
job_id: str,
wait: bool = True,
timeout: Optional[int] = None) -> Tuple[List[str], List[str]]:
"""Get the list of nodes and their IPs for a given job ID.

The ordering is guaranteed to be stable for the lifetime of the job.
@@ -332,6 +343,7 @@ class SlurmClient:
Args:
job_id: The Slurm job ID.
wait: If True, wait for nodes to be allocated before returning.
timeout: Maximum time to wait in seconds. Only used when wait=True.

Returns:
A tuple of (nodes, node_ips) where nodes is a list of node names
@@ -339,7 +351,9 @@ class SlurmClient:
"""
# Wait for nodes to be allocated if requested
if wait:
self.wait_for_job_nodes(job_id)
if timeout is None:
timeout = _SLURM_DEFAULT_PROVISION_TIMEOUT
self.wait_for_job_nodes(job_id, timeout=timeout)

cmd = (
f'squeue -h --jobs {job_id} -o "%N" | tr \',\' \'\\n\' | '
@@ -377,32 +391,6 @@ class SlurmClient:

return nodes, node_ips

def get_partitions(self) -> List[str]:
"""Get unique partition names in the Slurm cluster.

Returns:
List of partition names. The default partition will not have a '*'
at the end of the name.
"""
cmd = 'scontrol show partitions -o'
rc, stdout, stderr = self._runner.run(cmd,
require_outputs=True,
stream_logs=False)
subprocess_utils.handle_returncode(rc,
cmd,
'Failed to get Slurm partitions.',
stderr=stderr)

# Extract partition names from PartitionName= fields
partitions = []
for line in stdout.strip().splitlines():
match = _PARTITION_NAME_REGEX.search(line)
if match:
partition = match.group(1).strip()
if partition:
partitions.append(partition)
return partitions

def submit_job(
self,
partition: str,
@@ -440,27 +428,51 @@ class SlurmClient:

return job_id

def get_default_partition(self) -> Optional[str]:
"""Get the default partition for the Slurm cluster.
def get_partitions_info(self) -> List[SlurmPartition]:
"""Get the partitions information for the Slurm cluster.

Returns:
The default partition name, or None if it cannot be determined.
List of SlurmPartition objects.
"""
cmd = 'scontrol show partition -o'
cmd = 'scontrol show partitions -o'
rc, stdout, stderr = self._runner.run(cmd,
require_outputs=True,
stream_logs=False)
if rc != 0:
logger.debug(f'Failed to get default partition: {stderr}')
return None
subprocess_utils.handle_returncode(rc,
cmd,
'Failed to get Slurm partitions.',
stderr=stderr)

partitions = []
for line in stdout.strip().splitlines():
is_default = False
match = _PARTITION_NAME_REGEX.search(line)
if 'Default=YES' in line:
match = _PARTITION_NAME_REGEX.search(line)
if match:
partition = match.group(1).strip()
if partition:
return partition
is_default = True
if match:
partition = match.group(1).strip()
if partition:
partitions.append(
SlurmPartition(name=partition, is_default=is_default))
return partitions

logger.debug('No default partition found')
def get_default_partition(self) -> Optional[str]:
"""Get the default partition name for the Slurm cluster.

Returns:
The default partition name, or None if it cannot be determined.
"""
partitions = self.get_partitions_info()
for partition in partitions:
if partition.is_default:
return partition.name
return None

def get_partitions(self) -> List[str]:
"""Get unique partition names in the Slurm cluster.

Returns:
List of partition names. The default partition will not have a '*'
at the end of the name.
"""
return [partition.name for partition in self.get_partitions_info()]

+ 3
- 12
sky/clouds/slurm.py View File

@@ -568,20 +568,11 @@ class Slurm(clouds.Cloud):

@classmethod
def expand_infras(cls) -> List[str]:
"""Returns a list of enabled Slurm cluster/partition combinations.
"""Returns a list of enabled Slurm clusters.

Each is returned as 'Slurm/cluster-name/partition' to be displayed
as a separate option in the optimizer.
Each is returned as 'Slurm/cluster-name'.
"""
infras = []
for cluster in cls.existing_allowed_clusters(silent=True):
try:
partitions = slurm_utils.get_partitions(cluster)
for partition in partitions:
infras.append(
f'{cls.canonical_name()}/{cluster}/{partition}')
except Exception as e: # pylint: disable=broad-except
# Fall back to cluster-only if partition fetch fails
logger.debug(f'Failed to get partitions for {cluster}: {e}')
infras.append(f'{cls.canonical_name()}/{cluster}')
infras.append(f'{cls.canonical_name()}/{cluster}')
return infras

+ 8
- 1
sky/core.py View File

@@ -1211,6 +1211,7 @@ def enabled_clouds(workspace: Optional[str] = None,
return [cloud.canonical_name() for cloud in cached_clouds]
enabled_ssh_infras = []
enabled_k8s_infras = []
enabled_slurm_infras = []
enabled_cloud_infras = []
for cloud in cached_clouds:
cloud_infra = cloud.expand_infras()
@@ -1218,10 +1219,16 @@ def enabled_clouds(workspace: Optional[str] = None,
enabled_ssh_infras.extend(cloud_infra)
elif isinstance(cloud, clouds.Kubernetes):
enabled_k8s_infras.extend(cloud_infra)
elif isinstance(cloud, clouds.Slurm):
enabled_slurm_infras.extend(cloud_infra)
else:
enabled_cloud_infras.extend(cloud_infra)
# We do not sort slurm infras alphabetically because the
# default partition should appear first.
# Ordering of slurm infras is enforced in Slurm implementation.
all_infras = sorted(enabled_ssh_infras) + sorted(
enabled_k8s_infras) + sorted(enabled_cloud_infras)
enabled_k8s_infras) + enabled_slurm_infras + sorted(
enabled_cloud_infras)
return all_infras




+ 16
- 2
sky/provision/slurm/instance.py View File

@@ -6,6 +6,7 @@ import time
from typing import Any, cast, Dict, List, Optional, Tuple

from sky import sky_logging
from sky import skypilot_config
from sky.adaptors import slurm
from sky.provision import common
from sky.provision import constants
@@ -116,6 +117,15 @@ def _create_virtual_instance(
cluster_name_on_cloud,
['pending', 'running'],
)

# Get provision_timeout from config. If not specified, use None,
# which will use the default timeout specified in the Slurm adaptor.
provision_timeout = skypilot_config.get_effective_region_config(
cloud='slurm',
region=region,
keys=('provision_timeout',),
default_value=None)

if existing_jobs:
assert len(existing_jobs) == 1, (
f'Multiple jobs found with name {cluster_name_on_cloud}: '
@@ -126,7 +136,9 @@ def _create_virtual_instance(
f'(JOBID: {job_id})')

# Wait for nodes to be allocated (job might be in PENDING state)
nodes, _ = client.get_job_nodes(job_id, wait=True)
nodes, _ = client.get_job_nodes(job_id,
wait=True,
timeout=provision_timeout)
return common.ProvisionRecord(provider_name='slurm',
region=region,
zone=partition,
@@ -241,7 +253,9 @@ def _create_virtual_instance(
f'{partition} for cluster {cluster_name_on_cloud} '
f'with {num_nodes} nodes')

nodes, _ = client.get_job_nodes(job_id, wait=True)
nodes, _ = client.get_job_nodes(job_id,
wait=True,
timeout=provision_timeout)
created_instance_ids = [
slurm_utils.instance_id(job_id, node) for node in nodes
]


+ 12
- 3
sky/provision/slurm/utils.py View File

@@ -550,7 +550,9 @@ def get_partitions(cluster_name: str) -> List[str]:
cluster_name: Name of the Slurm cluster.

Returns:
Sorted list of unique partition names available in the cluster.
List of unique partition names available in the cluster.
The default partition appears first,
and the rest are sorted alphabetically.
"""
try:
slurm_config = SSHConfig.from_path(
@@ -565,8 +567,15 @@ def get_partitions(cluster_name: str) -> List[str]:
ssh_proxy_command=slurm_config_dict.get('proxycommand', None),
)

partitions = client.get_partitions()
return sorted(partitions)
partitions_info = client.get_partitions_info()
default_partitions = []
other_partitions = []
for partition in partitions_info:
if partition.is_default:
default_partitions.append(partition.name)
else:
other_partitions.append(partition.name)
return default_partitions + sorted(other_partitions)
except Exception as e: # pylint: disable=broad-except
logger.warning(
f'Failed to get partitions for cluster {cluster_name}: {e}')


+ 15
- 6
sky/setup_files/dependencies.py View File

@@ -204,12 +204,21 @@ cloud_dependencies: Dict[str, List[str]] = {
'ssh': kubernetes_dependencies,
# For the container registry auth api. Reference:
# https://github.com/runpod/runpod-python/releases/tag/1.6.1
# RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python 3.11+
# stdlib provides tomllib; on lower versions we depend on tomli explicitly.
# Instead of installing tomli conditionally, we install it explicitly.
# This is because the conditional installation of tomli does not work
# with controller package installation code.
'runpod': ['runpod>=1.6.1', 'tomli'],
'runpod': [
# For the container registry auth api. Reference:
# https://github.com/runpod/runpod-python/releases/tag/1.6.1
'runpod>=1.6.1',
# RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python
# 3.11+ stdlib provides tomllib; on lower versions we depend on tomli
# explicitly. Instead of installing tomli conditionally, we install it
# explicitly. This is because the conditional installation of tomli does
# not work with controller package installation code.
'tomli',
# runpod installs aiodns (via aiohttp[speedups]), which is incompatible
# with pycares 5.0.0 due to deprecations.
# See https://github.com/aio-libs/aiodns/issues/214
'pycares<5',
],
'fluidstack': [], # No dependencies needed for fluidstack
'cudo': ['cudo-compute>=0.1.10'],
'paperspace': [], # No dependencies needed for paperspace


+ 8
- 0
sky/templates/kubernetes-ray.yml.j2 View File

@@ -523,6 +523,14 @@ available_node_types:
resourceFieldRef:
containerName: ray-node
resource: requests.memory
# Disable Ray memory monitor to prevent Ray's memory manager
# from interfering with kubernetes resource manager.
# If ray memory monitor is enabled, the ray memory monitor kills
# the running job if the job uses more than 95% of allocated memory,
# even if the job is not misbehaving or using its full allocated memory.
# This behavior does not give a chance for k8s scheduler to evict the pod.
- name: RAY_memory_monitor_refresh_ms
value: "0"
{% for key, value in k8s_env_vars.items() if k8s_env_vars is not none %}
- name: {{ key }}
value: {{ value }}


+ 0
- 14
sky/templates/slurm-ray.yml.j2 View File

@@ -67,23 +67,9 @@ initialization_commands: []
# Increment the following for catching performance bugs easier:
# current num items (num SSH connections): 1
setup_commands:
# Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
# Create ~/.ssh/config file in case the file does not exist in the image.
# Line 'rm ..': there is another installation of pip.
# Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase.
# Line 'mkdir -p ..': disable host key check
- {%- for initial_setup_command in initial_setup_commands %}
{{ initial_setup_command }}
{%- endfor %}
sudo systemctl stop unattended-upgrades || true;
sudo systemctl disable unattended-upgrades || true;
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
sudo kill -9 `sudo lsof /var/lib/dpkg/lock-frontend | awk '{print $2}' | tail -n 1` || true;
sudo pkill -9 apt-get;
sudo pkill -9 dpkg;
sudo dpkg --configure -a;
mkdir -p ~/.ssh; touch ~/.ssh/config;
{{ setup_sky_dirs_commands }}
{{ conda_installation_commands }}
{{ skypilot_wheel_installation_commands }}


+ 38
- 26
sky/utils/command_runner.py View File

@@ -1312,7 +1312,8 @@ class SlurmCommandRunner(SSHCommandRunner):
runner.rsync(source, target, up=True)

Args:
node: (ip, port) The IP address and port of the remote machine.
node: (ip, port) The IP address and port of the remote machine
(login node).
ssh_user: SSH username.
ssh_private_key: Path to SSH private key.
sky_dir: The private directory for the SkyPilot cluster on the
@@ -1320,7 +1321,8 @@ class SlurmCommandRunner(SSHCommandRunner):
skypilot_runtime_dir: The directory for the SkyPilot runtime
on the Slurm cluster.
job_id: The Slurm job ID for this instance.
slurm_node: The Slurm node hostname for this instance.
slurm_node: The Slurm node hostname for this instance
(compute node).
**kwargs: Additional arguments forwarded to SSHCommandRunner
(e.g., ssh_proxy_command).
"""
@@ -1330,6 +1332,29 @@ class SlurmCommandRunner(SSHCommandRunner):
self.job_id = job_id
self.slurm_node = slurm_node

# Build a chained ProxyCommand that goes through the login node to reach
# the compute node where the job is running.

# First, build SSH options to reach the login node, using the user's
# existing proxy command if provided.
proxy_ssh_options = ' '.join(
ssh_options_list(self.ssh_private_key,
None,
ssh_proxy_command=self._ssh_proxy_command,
port=self.port,
disable_control_master=True))
login_node_proxy_command = (f'ssh {proxy_ssh_options} '
f'-W %h:%p {self.ssh_user}@{self.ip}')

# Update the proxy command to be the login node proxy, which will
# be used by super().run() to reach the compute node.
self._ssh_proxy_command = login_node_proxy_command
# Update self.ip to target the compute node.
self.ip = slurm_node
# Assume the compute node's SSH port is 22.
# TODO(kevin): Make this configurable if needed.
self.port = 22

def rsync(
self,
source: str,
@@ -1351,24 +1376,15 @@ class SlurmCommandRunner(SSHCommandRunner):
# if the target dir is in a shared filesystem, since it will
# be accessible by the compute node.

# Build ProxyCommand to proxy through the Slurm login node to
# the compute node where the job is running.
proxy_ssh_options = ' '.join(
ssh_options_list(self.ssh_private_key,
None,
ssh_proxy_command=self._ssh_proxy_command,
port=self.port,
disable_control_master=True))
login_node_proxy_command = (f'ssh {proxy_ssh_options} '
f'-W %h:%p {self.ssh_user}@{self.ip}')

# Build the complete SSH option to pass in to rsync -e 'ssh ...',
# utilizing the login node proxy command we have above.
# Build SSH options for rsync using the ProxyCommand set up in __init__
# to reach the compute node through the login node.
ssh_options = ' '.join(
ssh_options_list(
None, # Assume no key needed to ssh from login to compute node
# Assume nothing and rely on default SSH behavior when -i is
# not specified.
None,
None,
ssh_proxy_command=login_node_proxy_command,
ssh_proxy_command=self._ssh_proxy_command,
disable_control_master=True))
rsh_option = f'ssh {ssh_options}'

@@ -1415,15 +1431,11 @@ class SlurmCommandRunner(SSHCommandRunner):
cmd = (
f'export {constants.SKY_RUNTIME_DIR_ENV_VAR_KEY}='
f'"{self.skypilot_runtime_dir}" && '
# Set the uv cache directory to /tmp/uv_cache to speed up
# package installation. Otherwise it defaults to ~/.cache/uv.
# This also means we can share the uv cache between multiple
# SkyPilot clusters.
f'export UV_CACHE_DIR=/tmp/uv_cache && '
# Set the uv cache directory to /tmp/uv_cache_$(id -u) to speed up
# package installation while avoiding permission conflicts when
# multiple users share the same host. Otherwise it defaults to
# ~/.cache/uv.
f'export UV_CACHE_DIR=/tmp/uv_cache_$(id -u) && '
f'cd {self.sky_dir} && export HOME=$(pwd) && {cmd}')
ssh_options = ('-o StrictHostKeyChecking=no '
'-o UserKnownHostsFile=/dev/null '
'-o LogLevel=ERROR')
cmd = f'ssh {ssh_options} {self.slurm_node} {shlex.quote(cmd)}'

return super().run(cmd, **kwargs)

+ 3
- 0
sky/utils/schemas.py View File

@@ -1417,6 +1417,9 @@ def get_config_schema():
'pattern': '^all$'
}]
},
'provision_timeout': {
'type': 'integer',
},
}
},
'oci': {


+ 7
- 0
sky/utils/status_lib.py View File

@@ -27,6 +27,12 @@ class ClusterStatus(enum.Enum):

STOPPED = 'STOPPED'
"""The cluster is stopped."""
PENDING = 'PENDING'
"""The cluster is pending scheduling.

NOTE: This state is for display only and should not be used in state
machine logic without necessary considerations.
"""

def colored_str(self):
color = _STATUS_TO_COLOR[self]
@@ -37,6 +43,7 @@ _STATUS_TO_COLOR = {
ClusterStatus.INIT: colorama.Fore.BLUE,
ClusterStatus.UP: colorama.Fore.GREEN,
ClusterStatus.STOPPED: colorama.Fore.YELLOW,
ClusterStatus.PENDING: colorama.Fore.CYAN,
}




+ 4
- 2
tests/kubernetes/scripts/helm_deploy_and_verify.sh View File

@@ -68,8 +68,10 @@ echo "Deploying SkyPilot API server..."
if [ "$HELM_VERSION" = "latest" ]; then
extra_flag="--devel"
else
# Convert PEP440 version to SemVer if needed (e.g., 1.0.0.dev20250609 -> 1.0.0-dev.20250609)
SEMVER_VERSION=$(echo "$HELM_VERSION" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)\.dev([0-9]+)/\1-dev.\2/')
# Convert PEP440 version to SemVer if needed
# 0.11.0rc1 -> 0.11.0-rc.1
# 1.0.0.dev20250609 -> 1.0.0-dev.20250609
SEMVER_VERSION=$(echo "$HELM_VERSION" | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)rc([0-9]+)/\1-rc.\2/' | sed -E 's/([0-9]+\.[0-9]+\.[0-9]+)\.dev([0-9]+)/\1-dev.\2/')
extra_flag="--version $SEMVER_VERSION"
fi



+ 14
- 3
tests/kubernetes/scripts/helm_upgrade.sh View File

@@ -131,10 +131,15 @@ get_previous_version() {

# For skypilot-nightly, we need to include --devel flag to get dev versions
if [ "$package_name" = "skypilot-nightly" ]; then
local versions=$(helm search repo skypilot/$package_name --versions --devel --output json | jq -r '.[].version' | sort -V)
local versions=$(helm search repo skypilot/$package_name --versions --devel --output json | jq -r '.[] | select(.name == "skypilot/'"$package_name"'").version' | sort -V)
else
# For skypilot (stable), we don't use --devel flag
local versions=$(helm search repo skypilot/$package_name --versions --output json | jq -r '.[].version' | sort -V)
# If the current_ver is an rc, we still don't want to use --devel,
# because we should compare against the latest stable (non-rc) version
# `helm search skypilot/skypilot` may also return charts for
# skypilot/skypilot-prometheus-server, so filter the package name in jq as
# well.
local versions=$(helm search repo skypilot/$package_name --versions --output json | jq -r '.[] | select(.name == "skypilot/'"$package_name"'").version' | sort -V)
fi

if [ -z "$versions" ]; then
@@ -162,7 +167,13 @@ get_previous_version() {
last_version="$version"
done <<< "$versions"

if [ -z "$previous_version" ]; then
if [[ "$current_ver" =~ [0-9]+\.[0-9]+\.[0-9]+rc[0-9]+ ]]; then
# When current version is an rc, it won't be in the available versions,
# since --devel is not used. We should just compare against the latest
# available version.
echo "Using the latest version $last_version since the current version $current_ver is an rc"
previous_version="$last_version"
elif [ -z "$previous_version" ]; then
echo "Error: Could not find a previous version for $current_ver (looking for $helm_format_ver)"
echo "available versions:"
echo "$versions"


+ 26
- 6
tests/load_tests/db_scale_tests/test_large_production_performance.sh View File

@@ -302,9 +302,30 @@ echo "✓ Minimal sky launch verified - found expected echo content in logs"
# Clean up immediately
sky down "$MINIMAL_CLUSTER_NAME" -y || true

# Step 9: Verify dashboard pages in browser
echo "Step 9: Verifying dashboard pages in browser..."
# Step 9: Build dashboard before verification
echo "Step 9: Building dashboard..."
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SKYPILOT_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
DASHBOARD_DIR="${SKYPILOT_ROOT}/sky/dashboard"

if [ -d "$DASHBOARD_DIR" ]; then
if command -v npm &> /dev/null; then
echo "Installing dashboard dependencies..."
npm --prefix "$DASHBOARD_DIR" install
echo "Building dashboard..."
npm --prefix "$DASHBOARD_DIR" run build
echo "✓ Dashboard built successfully"
else
echo "ERROR: npm not found, cannot build dashboard. Please install Node.js and npm."
exit 1
fi
else
echo "ERROR: Dashboard directory not found at $DASHBOARD_DIR"
exit 1
fi

# Step 10: Verify dashboard pages in browser
echo "Step 10: Verifying dashboard pages in browser..."
VERIFY_SCRIPT="${SCRIPT_DIR}/verify_dashboard_browser.py"

# Get API server endpoint
@@ -325,11 +346,10 @@ fi
echo "Using API endpoint: $API_ENDPOINT"

if [ -f "$VERIFY_SCRIPT" ]; then
python3 "$VERIFY_SCRIPT" --endpoint "$API_ENDPOINT" || {
echo "WARNING: Dashboard verification failed, but continuing..."
}
python3 "$VERIFY_SCRIPT" --endpoint "$API_ENDPOINT"
else
echo "WARNING: Dashboard verification script not found at $VERIFY_SCRIPT, skipping browser verification"
echo "ERROR: Dashboard verification script not found at $VERIFY_SCRIPT"
exit 1
fi

echo ""


+ 6
- 2
tests/smoke_tests/test_api_server_benchmark.py View File

@@ -59,12 +59,16 @@ def test_api_server_memory(generic_cloud: str):
stop_event = threading.Event()
metrics_thread = threading.Thread(target=_collect_metrics)
metrics_thread.start()
parallelism = 8
if generic_cloud == 'kubernetes':
# Kubernetes has limited resources, lower the concurrency
parallelism = 4
test = smoke_tests_utils.Test(
'test_api_server_memory',
[
f'python tests/load_tests/workload_benchmark.py -t 8 -r 5 --detail -s workloads/basic.sh --cloud {generic_cloud}'
f'python tests/load_tests/workload_benchmark.py -t {parallelism} -r 5 --detail -s workloads/basic.sh --cloud {generic_cloud}'
],
teardown='sky down -y "load-test-*"; sky jobs cancel -a -y',
teardown='sky down -y "load-test-*"; sky jobs cancel -a -y || true',
# Long timeout for benchmark to complete
timeout=3600,
)


+ 1
- 1
tests/smoke_tests/test_basic.py View File

@@ -1147,7 +1147,7 @@ def test_kubernetes_slurm_show_gpus(generic_cloud: str):
# 2. The cluster has no GPUs, and the expected message is shown
'(echo "$s" | grep -A 1 "REQUESTABLE_QTY_PER_NODE" | '
'grep -E "^[A-Z0-9]+[[:space:]]+[0-9, ]+[[:space:]]+[0-9]+ of [0-9]+ free" || '
f'echo "$s" | grep "No GPUs found in any {generic_cloud} clusters")'
f'echo "$s" | grep "No GPUs found in any {generic_cloud.capitalize()} clusters")'
)],
)
smoke_tests_utils.run_one_test(test)


+ 5
- 5
tests/smoke_tests/test_cli.py View File

@@ -32,7 +32,7 @@ def test_endpoint_output_basic(generic_cloud: str):
"""Test that sky api info endpoint output is correct."""
name = smoke_tests_utils.get_cluster_name()
test = smoke_tests_utils.Test('endpoint_output_basic', [
f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
f's=$(SKYPILOT_DEBUG=0 sky api info | tee /dev/stderr) && echo "\n===Validating endpoint output===" && echo "$s" | grep "Endpoint set to default local API server."',
],
timeout=smoke_tests_utils.get_timeout(
@@ -47,7 +47,7 @@ def test_endpoint_output_basic_no_pg_conn_closed_errors(generic_cloud: str):
name = smoke_tests_utils.get_cluster_name()
test = smoke_tests_utils.Test(
'endpoint_output_basic_no_pg_conn_closed_errors', [
f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT_NO_PG_CONN_CLOSED_ERROR}',
f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT_NO_PG_CONN_CLOSED_ERROR}',
],
timeout=smoke_tests_utils.get_timeout(generic_cloud),
teardown=f'sky down -y {name}')
@@ -71,7 +71,7 @@ def test_endpoint_output_config(generic_cloud: str):

name = smoke_tests_utils.get_cluster_name()
test = smoke_tests_utils.Test('endpoint_output_config', [
f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
f's=$(SKYPILOT_DEBUG=0 sky api info | tee /dev/stderr) && echo "\n===Validating endpoint output===" && echo "$s" | grep "Endpoint set via {f.name}"',
],
timeout=smoke_tests_utils.get_timeout(
@@ -91,7 +91,7 @@ def test_endpoint_output_env(generic_cloud: str):
name = smoke_tests_utils.get_cluster_name()
expected_string = f"Endpoint set via the environment variable {constants.SKY_API_SERVER_URL_ENV_VAR}"
test = smoke_tests_utils.Test('endpoint_output_env', [
f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
f's=$(SKYPILOT_DEBUG=0 sky api info | tee /dev/stderr) && echo "\n===Validating endpoint output===" && echo "Expecting to see: {expected_string}\n" && echo "$s" | grep "{expected_string}"',
],
timeout=smoke_tests_utils.get_timeout(
@@ -167,7 +167,7 @@ def test_cli_auto_retry(generic_cloud: str):
# Chaos proxy will kill TCP connections every 30 seconds.
f'python tests/chaos/chaos_proxy.py --port {port} --interval 30 & echo $! > /tmp/{name}-chaos.pid',
# Both launch streaming and logs streaming should survive the chaos.
f'SKYPILOT_API_SERVER_ENDPOINT={api_proxy_url} sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} \'{run_command}\'',
f'SKYPILOT_API_SERVER_ENDPOINT={api_proxy_url} sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} \'{run_command}\'',
f'kill $(cat /tmp/{name}-chaos.pid)',
],
timeout=smoke_tests_utils.get_timeout(generic_cloud),


+ 6
- 2
tests/smoke_tests/test_cluster_job.py View File

@@ -60,6 +60,8 @@ def test_job_queue(generic_cloud: str, accelerator: Dict[str, str]):
accelerator = smoke_tests_utils.get_available_gpus(infra=generic_cloud)
else:
accelerator = accelerator.get(generic_cloud, 'T4')
if not accelerator:
pytest.skip(f'No available GPUs for {generic_cloud}')
name = smoke_tests_utils.get_cluster_name()
test = smoke_tests_utils.Test(
'job_queue',
@@ -255,6 +257,8 @@ def test_job_queue_multinode(generic_cloud: str, accelerator: Dict[str, str]):
accelerator = smoke_tests_utils.get_available_gpus(infra=generic_cloud)
else:
accelerator = accelerator.get(generic_cloud, 'T4')
if not accelerator:
pytest.skip(f'No available GPUs for {generic_cloud}')
name = smoke_tests_utils.get_cluster_name()
total_timeout_minutes = 30 if generic_cloud == 'azure' else 15
test = smoke_tests_utils.Test(
@@ -1299,8 +1303,8 @@ def test_container_logs_two_jobs_kubernetes():
[
smoke_tests_utils.launch_cluster_for_cloud_cmd(
'kubernetes', name),
f'sky launch -y -c {name} {task_yaml}',
f'sky launch -y -c {name} {task_yaml}',
f'sky launch -y -c {name} --infra kubernetes {task_yaml}',
f'sky launch -y -c {name} --infra kubernetes {task_yaml}',
_check_container_logs(name, pod_logs, 9, 2),
],
f'sky down -y {name} && '


+ 1
- 1
tests/smoke_tests/test_examples.py View File

@@ -201,7 +201,7 @@ def test_nemorl(generic_cloud: str, accelerator: Dict[str, str]) -> None:

infra = generic_cloud
if generic_cloud == 'aws':
infra = 'aws/ap-northeast-1'
infra = 'aws'

name = smoke_tests_utils.get_cluster_name()
original_yaml_path = 'llm/nemorl/nemorl.sky.yaml'


+ 153
- 0
tests/unit_tests/test_sky/adaptors/test_slurm_adaptor.py View File

@@ -1,5 +1,6 @@
"""Unit tests for Slurm adaptor."""

import time
import unittest.mock as mock

import pytest
@@ -84,3 +85,155 @@ class TestInfoNodes:
assert result[2].cpus == 4
assert result[2].memory_gb == 32
assert result[2].partition == 'tpu nodes'


class TestWaitForJobNodes:
    """Test SlurmClient.wait_for_job_nodes()."""

    def test_wait_for_job_nodes_uses_default_timeout(self):
        """Test that wait_for_job_nodes uses default timeout of 10 seconds."""
        # ssh_key=None: no real connection is made; all remote calls on the
        # client are mocked out below.
        client = slurm.SlurmClient(
            ssh_host='localhost',
            ssh_port=22,
            ssh_user='root',
            ssh_key=None,
        )

        job_id = '12345'
        start_time = time.time()

        # Mock get_job_state to return PENDING, then RUNNING.
        # Mock the runner (which executes squeue) to return empty
        # initially, then nodes.
        with mock.patch.object(client, 'get_job_state') as mock_get_state, \
                mock.patch.object(client._runner, 'run') as mock_run:
            mock_get_state.side_effect = ['PENDING', 'PENDING', 'RUNNING']
            # First two calls return empty (no nodes), third returns nodes.
            # Each tuple is (returncode, stdout, stderr).
            mock_run.side_effect = [
                (0, '', ''),  # No nodes allocated yet
                (0, '', ''),  # Still no nodes
                (0, 'node1,node2', ''),  # Nodes allocated
            ]

            # Should succeed quickly since nodes are allocated
            client.wait_for_job_nodes(
                job_id, timeout=slurm._SLURM_DEFAULT_PROVISION_TIMEOUT)

        # Verify it didn't wait the full default timeout
        elapsed = time.time() - start_time
        assert elapsed < 5, 'Should complete quickly when nodes are allocated'

    def test_wait_for_job_nodes_uses_custom_timeout(self):
        """Test that wait_for_job_nodes uses custom timeout when provided."""
        client = slurm.SlurmClient(
            ssh_host='localhost',
            ssh_port=22,
            ssh_user='root',
            ssh_key=None,
        )

        job_id = '12345'
        custom_timeout = 2

        # Mock get_job_state to always return PENDING and the runner to
        # always report no allocated nodes, so the wait can only end by
        # hitting the timeout. time.sleep is mocked so the loop spins
        # without actually sleeping.
        with mock.patch.object(client, 'get_job_state') as mock_get_state, \
                mock.patch.object(client._runner, 'run') as mock_run, \
                mock.patch('time.sleep') as mock_sleep:
            mock_get_state.return_value = 'PENDING'
            mock_run.return_value = (0, '', '')  # No nodes allocated

            start_time = time.time()
            try:
                client.wait_for_job_nodes(job_id, timeout=custom_timeout)
                assert False, 'Should raise TimeoutError'
            except TimeoutError as e:
                # The timeout value should be surfaced in the error message.
                assert f'{custom_timeout} seconds' in str(e)
                # Verify it waited approximately the custom timeout
                elapsed = time.time() - start_time
                # Allow some margin for test execution time
                assert custom_timeout <= elapsed < (custom_timeout * 1.5)

    def test_wait_for_job_nodes_raises_on_job_termination(self):
        """Test that wait_for_job_nodes raises when job terminates."""
        client = slurm.SlurmClient(
            ssh_host='localhost',
            ssh_port=22,
            ssh_user='root',
            ssh_key=None,
        )

        job_id = '12345'

        # A terminal job state (FAILED) should abort the wait immediately
        # with a RuntimeError rather than waiting for the timeout.
        with mock.patch.object(client, 'get_job_state') as mock_get_state:
            mock_get_state.return_value = 'FAILED'

            with pytest.raises(RuntimeError,
                               match='terminated with state FAILED'):
                client.wait_for_job_nodes(
                    job_id, timeout=slurm._SLURM_DEFAULT_PROVISION_TIMEOUT)


class TestGetJobNodes:
    """Test SlurmClient.get_job_nodes()."""

    def test_get_job_nodes_passes_timeout_to_wait_for_job_nodes(self):
        """Test that get_job_nodes passes timeout to wait_for_job_nodes."""
        # ssh_key=None: no real connection is made; the runner and the
        # wait helper are mocked out below.
        client = slurm.SlurmClient(
            ssh_host='localhost',
            ssh_port=22,
            ssh_user='root',
            ssh_key=None,
        )

        job_id = '12345'
        custom_timeout = 20

        with mock.patch.object(client, 'wait_for_job_nodes') as mock_wait, \
                mock.patch.object(client._runner, 'run') as mock_run:
            # Runner output: (returncode, stdout, stderr) with two nodes.
            mock_run.return_value = (0, 'node1 10.0.0.1\nnode2 10.0.0.2', '')

            client.get_job_nodes(job_id, wait=True, timeout=custom_timeout)

            # Verify wait_for_job_nodes was called with the custom timeout
            mock_wait.assert_called_once_with(job_id, timeout=custom_timeout)

    def test_get_job_nodes_uses_default_timeout_when_not_provided(self):
        """Test that get_job_nodes uses default timeout when not provided."""
        client = slurm.SlurmClient(
            ssh_host='localhost',
            ssh_port=22,
            ssh_user='root',
            ssh_key=None,
        )

        job_id = '12345'

        with mock.patch.object(client, 'wait_for_job_nodes') as mock_wait, \
                mock.patch.object(client._runner, 'run') as mock_run:
            mock_run.return_value = (0, 'node1 10.0.0.1\nnode2 10.0.0.2', '')

            client.get_job_nodes(job_id, wait=True)

            # Verify wait_for_job_nodes was called with the default timeout:
            # the caller passed no timeout, so get_job_nodes resolves it to
            # _SLURM_DEFAULT_PROVISION_TIMEOUT before delegating.
            mock_wait.assert_called_once_with(
                job_id, timeout=slurm._SLURM_DEFAULT_PROVISION_TIMEOUT)

    def test_get_job_nodes_skips_wait_when_wait_false(self):
        """Test that get_job_nodes skips waiting when wait=False."""
        client = slurm.SlurmClient(
            ssh_host='localhost',
            ssh_port=22,
            ssh_user='root',
            ssh_key=None,
        )

        job_id = '12345'

        with mock.patch.object(client, 'wait_for_job_nodes') as mock_wait, \
                mock.patch.object(client._runner, 'run') as mock_run:
            mock_run.return_value = (0, 'node1 10.0.0.1\nnode2 10.0.0.2', '')

            client.get_job_nodes(job_id, wait=False)

            # Verify wait_for_job_nodes was not called
            mock_wait.assert_not_called()

+ 19
- 13
tests/unit_tests/test_sky/clouds/test_kubernetes.py View File

@@ -11,6 +11,7 @@ import pytest
from sky.clouds import kubernetes
from sky.clouds.utils import gcp_utils
from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import resources_utils


class TestKubernetesExistingAllowedContexts(unittest.TestCase):
@@ -379,7 +380,6 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
setattr(self.resources, 'assert_launchable', lambda: self.resources)

# Import NetworkTier for setting network_tier
from sky.utils import resources_utils
self.resources.network_tier = resources_utils.NetworkTier.BEST

self.cluster_name = "test-cluster"
@@ -439,7 +439,9 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
# Call make_deploy_resources_variables
deploy_vars = k8s_cloud.make_deploy_resources_variables(
resources=self.resources,
cluster_name=self.cluster_name,
cluster_name=resources_utils.ClusterName(
display_name=self.cluster_name,
name_on_cloud=self.cluster_name),
region=self.region,
zones=None,
num_nodes=1,
@@ -506,7 +508,9 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
# Call make_deploy_resources_variables
deploy_vars = k8s_cloud.make_deploy_resources_variables(
resources=self.resources,
cluster_name=self.cluster_name,
cluster_name=resources_utils.ClusterName(
display_name=self.cluster_name,
name_on_cloud=self.cluster_name),
region=self.region,
zones=None,
num_nodes=1,
@@ -539,7 +543,6 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
"""Test that IPC_LOCK capability is disabled when network tier is not BEST."""

# Modify resources to not use BEST network tier
from sky.utils import resources_utils
self.resources.network_tier = resources_utils.NetworkTier.STANDARD

# Setup mocks - when network tier is not BEST, _detect_network_type returns NONE
@@ -575,7 +578,9 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
# Call make_deploy_resources_variables
deploy_vars = k8s_cloud.make_deploy_resources_variables(
resources=self.resources,
cluster_name=self.cluster_name,
cluster_name=resources_utils.ClusterName(
display_name=self.cluster_name,
name_on_cloud=self.cluster_name),
region=self.region,
zones=None,
num_nodes=1,
@@ -620,7 +625,6 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
setattr(gpu_resources, 'assert_launchable', lambda: gpu_resources)

# Set network tier to BEST
from sky.utils import resources_utils
gpu_resources.network_tier = resources_utils.NetworkTier.BEST

# Setup mocks - cluster supports high performance networking (Nebius)
@@ -663,7 +667,9 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
# Call make_deploy_resources_variables
deploy_vars = k8s_cloud.make_deploy_resources_variables(
resources=gpu_resources,
cluster_name="test-nebius-gpu-cluster",
cluster_name=resources_utils.ClusterName(
display_name="test-nebius-gpu-cluster",
name_on_cloud="test-nebius-gpu-cluster"),
region=mock.MagicMock(name="nebius-context"),
zones=None,
num_nodes=1,
@@ -711,7 +717,6 @@ class TestKubernetesMakeDeployResourcesVariables(unittest.TestCase):
setattr(self.resources, 'assert_launchable', lambda: self.resources)

# Import NetworkTier for setting network_tier
from sky.utils import resources_utils
self.resources.network_tier = resources_utils.NetworkTier.BEST

self.cluster_name = "test-k8s-cluster"
@@ -791,7 +796,9 @@ class TestKubernetesMakeDeployResourcesVariables(unittest.TestCase):
# Call make_deploy_resources_variables
deploy_vars = k8s_cloud.make_deploy_resources_variables(
resources=self.resources,
cluster_name=self.cluster_name,
cluster_name=resources_utils.ClusterName(
display_name=self.cluster_name,
name_on_cloud=self.cluster_name),
region=self.region,
zones=None,
num_nodes=1,
@@ -880,7 +887,6 @@ class TestKubernetesMakeDeployResourcesVariables(unittest.TestCase):
prod_resources.image_id = None
setattr(prod_resources, 'assert_launchable', lambda: prod_resources)

from sky.utils import resources_utils
prod_resources.network_tier = resources_utils.NetworkTier.BEST

prod_region = mock.MagicMock()
@@ -889,7 +895,9 @@ class TestKubernetesMakeDeployResourcesVariables(unittest.TestCase):
# Call make_deploy_resources_variables
deploy_vars = k8s_cloud.make_deploy_resources_variables(
resources=prod_resources,
cluster_name="test-prod-cluster",
cluster_name=resources_utils.ClusterName(
display_name="test-prod-cluster",
name_on_cloud="test-prod-cluster"),
region=prod_region,
zones=None,
num_nodes=1,
@@ -1569,7 +1577,6 @@ class TestKubernetesUnsupportedFeaturesForResources(unittest.TestCase):

resources = mock.MagicMock()
resources.region = 'test-context'
from sky.utils import resources_utils
resources.network_tier = resources_utils.NetworkTier.BEST

from sky import clouds
@@ -1600,7 +1607,6 @@ class TestKubernetesUnsupportedFeaturesForResources(unittest.TestCase):

resources = mock.MagicMock()
resources.region = 'test-context'
from sky.utils import resources_utils
resources.network_tier = resources_utils.NetworkTier.BEST

from sky import clouds


+ 7
- 3
tests/unit_tests/test_sky/clouds/test_ssh.py View File

@@ -11,7 +11,7 @@ import pytest
import yaml

from sky.clouds import ssh
from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import resources_utils


class TestSSHExistingAllowedContexts(unittest.TestCase):
@@ -402,7 +402,9 @@ class TestSSHMakeDeployResourcesVariables(unittest.TestCase):
# Call make_deploy_resources_variables
deploy_vars = ssh_cloud.make_deploy_resources_variables(
resources=self.resources,
cluster_name=self.cluster_name,
cluster_name=resources_utils.ClusterName(
display_name=self.cluster_name,
name_on_cloud=self.cluster_name),
region=self.region,
zones=None,
num_nodes=1,
@@ -500,7 +502,9 @@ class TestSSHMakeDeployResourcesVariables(unittest.TestCase):
# Call make_deploy_resources_variables
deploy_vars = ssh_cloud.make_deploy_resources_variables(
resources=prod_resources,
cluster_name="test-prod-cluster",
cluster_name=resources_utils.ClusterName(
display_name="test-prod-cluster",
name_on_cloud="test-prod-cluster"),
region=prod_region,
zones=None,
num_nodes=1,


+ 1
- 0
tests/unit_tests/test_sky/test_task.py View File

@@ -710,6 +710,7 @@ def make_mock_resource(cloud=None, region=None, zone=None):
self.cloud = cloud
self.region = region
self.zone = zone
self.priority = 0

def copy(self, **override):
# Return a new instance with overridden attributes


Loading…
Cancel
Save
Baidu
map