6 Commits

| Author | SHA1 | Message | Date |
|---|---|---|---|
| zpoint | dd21824060 | Fix aiohttp version failure in buildkite (#8267) | 6 days ago |
| Seung Jin | e4e2d27653 | [UX] Introduce pending state (#8262) | 6 days ago |
| zpoint | 166fd63617 | [Dashboard] Add some tests for the dashboard performance (#8227) | 6 days ago |
| Seung Jin | 791356230a | [tests] misc test fixes (#8260) | 6 days ago |
| Christopher Cooper | b5ff2fdb54 | [deps] pin pycares<5 to work around aiodns issue (#8259) | 6 days ago |
| lloyd-brown | 9f7bdf7d28 | [Tests] Add Cloud Selection to CLI Smoke Tests (#8256) | 6 days ago |
8 changed files with 80 additions and 36 deletions
Split View
  1. requirements-dev.txt (+0, -3)
  2. sky/setup_files/dependencies.py (+15, -6)
  3. sky/utils/status_lib.py (+7, -0)
  4. tests/load_tests/db_scale_tests/test_large_production_performance.sh (+26, -6)
  5. tests/smoke_tests/test_cli.py (+5, -5)
  6. tests/unit_tests/test_sky/clouds/test_kubernetes.py (+19, -13)
  7. tests/unit_tests/test_sky/clouds/test_ssh.py (+7, -3)
  8. tests/unit_tests/test_sky/test_task.py (+1, -0)

+ 0
- 3
requirements-dev.txt View File

@@ -40,9 +40,6 @@ buildkite-test-collector
# memory profiler
memory_profiler==0.61.0

# For testing SkyServe
aiohttp==3.9.3

# For mocking AWS
moto==5.1.2



+ 15
- 6
sky/setup_files/dependencies.py View File

@@ -204,12 +204,21 @@ cloud_dependencies: Dict[str, List[str]] = {
'ssh': kubernetes_dependencies,
# For the container registry auth api. Reference:
# https://github.com/runpod/runpod-python/releases/tag/1.6.1
# RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python 3.11+
# stdlib provides tomllib; on lower versions we depend on tomli explicitly.
# Instead of installing tomli conditionally, we install it explicitly.
# This is because the conditional installation of tomli does not work
# with controller package installation code.
'runpod': ['runpod>=1.6.1', 'tomli'],
'runpod': [
# For the container registry auth api. Reference:
# https://github.com/runpod/runpod-python/releases/tag/1.6.1
'runpod>=1.6.1',
# RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python
# 3.11+ stdlib provides tomllib; on lower versions we depend on tomli
# explicitly. Instead of installing tomli conditionally, we install it
# explicitly. This is because the conditional installation of tomli does
# not work with controller package installation code.
'tomli',
# runpod installs aiodns (via aiohttp[speedups]), which is incompatible
# with pycares 5.0.0 due to deprecations.
# See https://github.com/aio-libs/aiodns/issues/214
'pycares<5',
],
'fluidstack': [], # No dependencies needed for fluidstack
'cudo': ['cudo-compute>=0.1.10'],
'paperspace': [], # No dependencies needed for paperspace


+ 7
- 0
sky/utils/status_lib.py View File

@@ -27,6 +27,12 @@ class ClusterStatus(enum.Enum):

STOPPED = 'STOPPED'
"""The cluster is stopped."""
PENDING = 'PENDING'
"""The cluster is pending scheduling.

NOTE: This state is for display only and should not be used in state
machine logic without necessary considerations.
"""

def colored_str(self):
color = _STATUS_TO_COLOR[self]
@@ -37,6 +43,7 @@ _STATUS_TO_COLOR = {
ClusterStatus.INIT: colorama.Fore.BLUE,
ClusterStatus.UP: colorama.Fore.GREEN,
ClusterStatus.STOPPED: colorama.Fore.YELLOW,
ClusterStatus.PENDING: colorama.Fore.CYAN,
}




+ 26
- 6
tests/load_tests/db_scale_tests/test_large_production_performance.sh View File

@@ -302,9 +302,30 @@ echo "✓ Minimal sky launch verified - found expected echo content in logs"
# Clean up immediately
sky down "$MINIMAL_CLUSTER_NAME" -y || true

# Step 9: Verify dashboard pages in browser
echo "Step 9: Verifying dashboard pages in browser..."
# Step 9: Build dashboard before verification
echo "Step 9: Building dashboard..."
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SKYPILOT_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
DASHBOARD_DIR="${SKYPILOT_ROOT}/sky/dashboard"

if [ -d "$DASHBOARD_DIR" ]; then
if command -v npm &> /dev/null; then
echo "Installing dashboard dependencies..."
npm --prefix "$DASHBOARD_DIR" install
echo "Building dashboard..."
npm --prefix "$DASHBOARD_DIR" run build
echo "✓ Dashboard built successfully"
else
echo "ERROR: npm not found, cannot build dashboard. Please install Node.js and npm."
exit 1
fi
else
echo "ERROR: Dashboard directory not found at $DASHBOARD_DIR"
exit 1
fi

# Step 10: Verify dashboard pages in browser
echo "Step 10: Verifying dashboard pages in browser..."
VERIFY_SCRIPT="${SCRIPT_DIR}/verify_dashboard_browser.py"

# Get API server endpoint
@@ -325,11 +346,10 @@ fi
echo "Using API endpoint: $API_ENDPOINT"

if [ -f "$VERIFY_SCRIPT" ]; then
python3 "$VERIFY_SCRIPT" --endpoint "$API_ENDPOINT" || {
echo "WARNING: Dashboard verification failed, but continuing..."
}
python3 "$VERIFY_SCRIPT" --endpoint "$API_ENDPOINT"
else
echo "WARNING: Dashboard verification script not found at $VERIFY_SCRIPT, skipping browser verification"
echo "ERROR: Dashboard verification script not found at $VERIFY_SCRIPT"
exit 1
fi

echo ""


+ 5
- 5
tests/smoke_tests/test_cli.py View File

@@ -32,7 +32,7 @@ def test_endpoint_output_basic(generic_cloud: str):
"""Test that sky api info endpoint output is correct."""
name = smoke_tests_utils.get_cluster_name()
test = smoke_tests_utils.Test('endpoint_output_basic', [
f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
f's=$(SKYPILOT_DEBUG=0 sky api info | tee /dev/stderr) && echo "\n===Validating endpoint output===" && echo "$s" | grep "Endpoint set to default local API server."',
],
timeout=smoke_tests_utils.get_timeout(
@@ -47,7 +47,7 @@ def test_endpoint_output_basic_no_pg_conn_closed_errors(generic_cloud: str):
name = smoke_tests_utils.get_cluster_name()
test = smoke_tests_utils.Test(
'endpoint_output_basic_no_pg_conn_closed_errors', [
f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT_NO_PG_CONN_CLOSED_ERROR}',
f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT_NO_PG_CONN_CLOSED_ERROR}',
],
timeout=smoke_tests_utils.get_timeout(generic_cloud),
teardown=f'sky down -y {name}')
@@ -71,7 +71,7 @@ def test_endpoint_output_config(generic_cloud: str):

name = smoke_tests_utils.get_cluster_name()
test = smoke_tests_utils.Test('endpoint_output_config', [
f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
f's=$(SKYPILOT_DEBUG=0 sky api info | tee /dev/stderr) && echo "\n===Validating endpoint output===" && echo "$s" | grep "Endpoint set via {f.name}"',
],
timeout=smoke_tests_utils.get_timeout(
@@ -91,7 +91,7 @@ def test_endpoint_output_env(generic_cloud: str):
name = smoke_tests_utils.get_cluster_name()
expected_string = f"Endpoint set via the environment variable {constants.SKY_API_SERVER_URL_ENV_VAR}"
test = smoke_tests_utils.Test('endpoint_output_env', [
f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
f's=$(SKYPILOT_DEBUG=0 sky api info | tee /dev/stderr) && echo "\n===Validating endpoint output===" && echo "Expecting to see: {expected_string}\n" && echo "$s" | grep "{expected_string}"',
],
timeout=smoke_tests_utils.get_timeout(
@@ -167,7 +167,7 @@ def test_cli_auto_retry(generic_cloud: str):
# Chaos proxy will kill TCP connections every 30 seconds.
f'python tests/chaos/chaos_proxy.py --port {port} --interval 30 & echo $! > /tmp/{name}-chaos.pid',
# Both launch streaming and logs streaming should survive the chaos.
f'SKYPILOT_API_SERVER_ENDPOINT={api_proxy_url} sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} \'{run_command}\'',
f'SKYPILOT_API_SERVER_ENDPOINT={api_proxy_url} sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} \'{run_command}\'',
f'kill $(cat /tmp/{name}-chaos.pid)',
],
timeout=smoke_tests_utils.get_timeout(generic_cloud),


+ 19
- 13
tests/unit_tests/test_sky/clouds/test_kubernetes.py View File

@@ -11,6 +11,7 @@ import pytest
from sky.clouds import kubernetes
from sky.clouds.utils import gcp_utils
from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import resources_utils


class TestKubernetesExistingAllowedContexts(unittest.TestCase):
@@ -379,7 +380,6 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
setattr(self.resources, 'assert_launchable', lambda: self.resources)

# Import NetworkTier for setting network_tier
from sky.utils import resources_utils
self.resources.network_tier = resources_utils.NetworkTier.BEST

self.cluster_name = "test-cluster"
@@ -439,7 +439,9 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
# Call make_deploy_resources_variables
deploy_vars = k8s_cloud.make_deploy_resources_variables(
resources=self.resources,
cluster_name=self.cluster_name,
cluster_name=resources_utils.ClusterName(
display_name=self.cluster_name,
name_on_cloud=self.cluster_name),
region=self.region,
zones=None,
num_nodes=1,
@@ -506,7 +508,9 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
# Call make_deploy_resources_variables
deploy_vars = k8s_cloud.make_deploy_resources_variables(
resources=self.resources,
cluster_name=self.cluster_name,
cluster_name=resources_utils.ClusterName(
display_name=self.cluster_name,
name_on_cloud=self.cluster_name),
region=self.region,
zones=None,
num_nodes=1,
@@ -539,7 +543,6 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
"""Test that IPC_LOCK capability is disabled when network tier is not BEST."""

# Modify resources to not use BEST network tier
from sky.utils import resources_utils
self.resources.network_tier = resources_utils.NetworkTier.STANDARD

# Setup mocks - when network tier is not BEST, _detect_network_type returns NONE
@@ -575,7 +578,9 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
# Call make_deploy_resources_variables
deploy_vars = k8s_cloud.make_deploy_resources_variables(
resources=self.resources,
cluster_name=self.cluster_name,
cluster_name=resources_utils.ClusterName(
display_name=self.cluster_name,
name_on_cloud=self.cluster_name),
region=self.region,
zones=None,
num_nodes=1,
@@ -620,7 +625,6 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
setattr(gpu_resources, 'assert_launchable', lambda: gpu_resources)

# Set network tier to BEST
from sky.utils import resources_utils
gpu_resources.network_tier = resources_utils.NetworkTier.BEST

# Setup mocks - cluster supports high performance networking (Nebius)
@@ -663,7 +667,9 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
# Call make_deploy_resources_variables
deploy_vars = k8s_cloud.make_deploy_resources_variables(
resources=gpu_resources,
cluster_name="test-nebius-gpu-cluster",
cluster_name=resources_utils.ClusterName(
display_name="test-nebius-gpu-cluster",
name_on_cloud="test-nebius-gpu-cluster"),
region=mock.MagicMock(name="nebius-context"),
zones=None,
num_nodes=1,
@@ -711,7 +717,6 @@ class TestKubernetesMakeDeployResourcesVariables(unittest.TestCase):
setattr(self.resources, 'assert_launchable', lambda: self.resources)

# Import NetworkTier for setting network_tier
from sky.utils import resources_utils
self.resources.network_tier = resources_utils.NetworkTier.BEST

self.cluster_name = "test-k8s-cluster"
@@ -791,7 +796,9 @@ class TestKubernetesMakeDeployResourcesVariables(unittest.TestCase):
# Call make_deploy_resources_variables
deploy_vars = k8s_cloud.make_deploy_resources_variables(
resources=self.resources,
cluster_name=self.cluster_name,
cluster_name=resources_utils.ClusterName(
display_name=self.cluster_name,
name_on_cloud=self.cluster_name),
region=self.region,
zones=None,
num_nodes=1,
@@ -880,7 +887,6 @@ class TestKubernetesMakeDeployResourcesVariables(unittest.TestCase):
prod_resources.image_id = None
setattr(prod_resources, 'assert_launchable', lambda: prod_resources)

from sky.utils import resources_utils
prod_resources.network_tier = resources_utils.NetworkTier.BEST

prod_region = mock.MagicMock()
@@ -889,7 +895,9 @@ class TestKubernetesMakeDeployResourcesVariables(unittest.TestCase):
# Call make_deploy_resources_variables
deploy_vars = k8s_cloud.make_deploy_resources_variables(
resources=prod_resources,
cluster_name="test-prod-cluster",
cluster_name=resources_utils.ClusterName(
display_name="test-prod-cluster",
name_on_cloud="test-prod-cluster"),
region=prod_region,
zones=None,
num_nodes=1,
@@ -1569,7 +1577,6 @@ class TestKubernetesUnsupportedFeaturesForResources(unittest.TestCase):

resources = mock.MagicMock()
resources.region = 'test-context'
from sky.utils import resources_utils
resources.network_tier = resources_utils.NetworkTier.BEST

from sky import clouds
@@ -1600,7 +1607,6 @@ class TestKubernetesUnsupportedFeaturesForResources(unittest.TestCase):

resources = mock.MagicMock()
resources.region = 'test-context'
from sky.utils import resources_utils
resources.network_tier = resources_utils.NetworkTier.BEST

from sky import clouds


+ 7
- 3
tests/unit_tests/test_sky/clouds/test_ssh.py View File

@@ -11,7 +11,7 @@ import pytest
import yaml

from sky.clouds import ssh
from sky.provision.kubernetes import utils as kubernetes_utils
from sky.utils import resources_utils


class TestSSHExistingAllowedContexts(unittest.TestCase):
@@ -402,7 +402,9 @@ class TestSSHMakeDeployResourcesVariables(unittest.TestCase):
# Call make_deploy_resources_variables
deploy_vars = ssh_cloud.make_deploy_resources_variables(
resources=self.resources,
cluster_name=self.cluster_name,
cluster_name=resources_utils.ClusterName(
display_name=self.cluster_name,
name_on_cloud=self.cluster_name),
region=self.region,
zones=None,
num_nodes=1,
@@ -500,7 +502,9 @@ class TestSSHMakeDeployResourcesVariables(unittest.TestCase):
# Call make_deploy_resources_variables
deploy_vars = ssh_cloud.make_deploy_resources_variables(
resources=prod_resources,
cluster_name="test-prod-cluster",
cluster_name=resources_utils.ClusterName(
display_name="test-prod-cluster",
name_on_cloud="test-prod-cluster"),
region=prod_region,
zones=None,
num_nodes=1,


+ 1
- 0
tests/unit_tests/test_sky/test_task.py View File

@@ -710,6 +710,7 @@ def make_mock_resource(cloud=None, region=None, zone=None):
self.cloud = cloud
self.region = region
self.zone = zone
self.priority = 0

def copy(self, **override):
# Return a new instance with overridden attributes


Loading…
Cancel
Save
Baidu
map