Fix aiohttp version failure in buildkite (#8267)

fix
[UX] Introduce pending state (#8262)
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -40,9 +40,6 @@ buildkite-test-collector
 # memory profiler
 memory_profiler==0.61.0

 # For testing SkyServe
 aiohttp==3.9.3

 # For mocking AWS
 moto==5.1.2

--- a/sky/setup_files/dependencies.py
+++ b/sky/setup_files/dependencies.py
@@ -204,12 +204,21 @@ cloud_dependencies: Dict[str, List[str]] = {
    'ssh': kubernetes_dependencies,
    # For the container registry auth api. Reference:
    # https://github.com/runpod/runpod-python/releases/tag/1.6.1
    # RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python 3.11+
    # stdlib provides tomllib; on lower versions we depend on tomli explicitly.
    # Instead of installing tomli conditionally, we install it explicitly.
    # This is because the conditional installation of tomli does not work
    # with controller package installation code.
    'runpod': ['runpod>=1.6.1', 'tomli'],
    'runpod': [
        # For the container registry auth api. Reference:
        # https://github.com/runpod/runpod-python/releases/tag/1.6.1
        'runpod>=1.6.1',
        # RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python
        # 3.11+ the stdlib provides tomllib; on lower versions we depend on
        # tomli. Rather than installing tomli conditionally (only on Python
        # < 3.11), we install it unconditionally, because conditional
        # installation does not work with controller package installation code.
        'tomli',
        # runpod installs aiodns (via aiohttp[speedups]), which is incompatible
        # with pycares 5.0.0 due to deprecations.
        # See https://github.com/aio-libs/aiodns/issues/214
        'pycares<5',
    ],
    'fluidstack': [],  # No dependencies needed for fluidstack
    'cudo': ['cudo-compute>=0.1.10'],
    'paperspace': [],  # No dependencies needed for paperspace
--- a/sky/utils/status_lib.py
+++ b/sky/utils/status_lib.py
@@ -27,6 +27,12 @@ class ClusterStatus(enum.Enum):

    STOPPED = 'STOPPED'
    """The cluster is stopped."""
    PENDING = 'PENDING'
    """The cluster is pending scheduling.

    NOTE: This state is for display only and should not be used in state
    machine logic without necessary considerations.
    """

    def colored_str(self):
        color = _STATUS_TO_COLOR[self]
@@ -37,6 +43,7 @@ _STATUS_TO_COLOR = {
    ClusterStatus.INIT: colorama.Fore.BLUE,
    ClusterStatus.UP: colorama.Fore.GREEN,
    ClusterStatus.STOPPED: colorama.Fore.YELLOW,
    ClusterStatus.PENDING: colorama.Fore.CYAN,
 }


--- a/tests/load_tests/db_scale_tests/test_large_production_performance.sh
+++ b/tests/load_tests/db_scale_tests/test_large_production_performance.sh
@@ -302,9 +302,30 @@ echo "✓ Minimal sky launch verified - found expected echo content in logs"
 # Clean up immediately
 sky down "$MINIMAL_CLUSTER_NAME" -y || true

 # Step 9: Verify dashboard pages in browser
 echo "Step 9: Verifying dashboard pages in browser..."
 # Step 9: Build dashboard before verification
 echo "Step 9: Building dashboard..."
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 SKYPILOT_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
 DASHBOARD_DIR="${SKYPILOT_ROOT}/sky/dashboard"

 if [ -d "$DASHBOARD_DIR" ]; then
    if command -v npm &> /dev/null; then
        echo "Installing dashboard dependencies..."
        npm --prefix "$DASHBOARD_DIR" install
        echo "Building dashboard..."
        npm --prefix "$DASHBOARD_DIR" run build
        echo "✓ Dashboard built successfully"
    else
        echo "ERROR: npm not found, cannot build dashboard. Please install Node.js and npm."
        exit 1
    fi
 else
    echo "ERROR: Dashboard directory not found at $DASHBOARD_DIR"
    exit 1
 fi

 # Step 10: Verify dashboard pages in browser
 echo "Step 10: Verifying dashboard pages in browser..."
 VERIFY_SCRIPT="${SCRIPT_DIR}/verify_dashboard_browser.py"

 # Get API server endpoint
@@ -325,11 +346,10 @@ fi
 echo "Using API endpoint: $API_ENDPOINT"

 if [ -f "$VERIFY_SCRIPT" ]; then
    python3 "$VERIFY_SCRIPT" --endpoint "$API_ENDPOINT" || {
        echo "WARNING: Dashboard verification failed, but continuing..."
    }
    python3 "$VERIFY_SCRIPT" --endpoint "$API_ENDPOINT"
 else
    echo "WARNING: Dashboard verification script not found at $VERIFY_SCRIPT, skipping browser verification"
    echo "ERROR: Dashboard verification script not found at $VERIFY_SCRIPT"
    exit 1
 fi

 echo ""
--- a/tests/smoke_tests/test_cli.py
+++ b/tests/smoke_tests/test_cli.py
@@ -32,7 +32,7 @@ def test_endpoint_output_basic(generic_cloud: str):
    """Test that sky api info endpoint output is correct."""
    name = smoke_tests_utils.get_cluster_name()
    test = smoke_tests_utils.Test('endpoint_output_basic', [
        f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
        f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
        f's=$(SKYPILOT_DEBUG=0 sky api info | tee /dev/stderr) && echo "\n===Validating endpoint output===" && echo "$s" | grep "Endpoint set to default local API server."',
    ],
                                  timeout=smoke_tests_utils.get_timeout(
@@ -47,7 +47,7 @@ def test_endpoint_output_basic_no_pg_conn_closed_errors(generic_cloud: str):
    name = smoke_tests_utils.get_cluster_name()
    test = smoke_tests_utils.Test(
        'endpoint_output_basic_no_pg_conn_closed_errors', [
            f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT_NO_PG_CONN_CLOSED_ERROR}',
            f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT_NO_PG_CONN_CLOSED_ERROR}',
        ],
        timeout=smoke_tests_utils.get_timeout(generic_cloud),
        teardown=f'sky down -y {name}')
@@ -71,7 +71,7 @@ def test_endpoint_output_config(generic_cloud: str):

        name = smoke_tests_utils.get_cluster_name()
        test = smoke_tests_utils.Test('endpoint_output_config', [
            f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
            f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
            f's=$(SKYPILOT_DEBUG=0 sky api info | tee /dev/stderr) && echo "\n===Validating endpoint output===" && echo "$s" | grep "Endpoint set via {f.name}"',
        ],
                                      timeout=smoke_tests_utils.get_timeout(
@@ -91,7 +91,7 @@ def test_endpoint_output_env(generic_cloud: str):
    name = smoke_tests_utils.get_cluster_name()
    expected_string = f"Endpoint set via the environment variable {constants.SKY_API_SERVER_URL_ENV_VAR}"
    test = smoke_tests_utils.Test('endpoint_output_env', [
        f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
        f's=$(SKYPILOT_DEBUG=0 sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} tests/test_yamls/minimal.yaml) && {smoke_tests_utils.VALIDATE_LAUNCH_OUTPUT}',
        f's=$(SKYPILOT_DEBUG=0 sky api info | tee /dev/stderr) && echo "\n===Validating endpoint output===" && echo "Expecting to see: {expected_string}\n" && echo "$s" | grep "{expected_string}"',
    ],
                                  timeout=smoke_tests_utils.get_timeout(
@@ -167,7 +167,7 @@ def test_cli_auto_retry(generic_cloud: str):
            # Chaos proxy will kill TCP connections every 30 seconds.
            f'python tests/chaos/chaos_proxy.py --port {port} --interval 30 & echo $! > /tmp/{name}-chaos.pid',
            # Both launch streaming and logs streaming should survive the chaos.
            f'SKYPILOT_API_SERVER_ENDPOINT={api_proxy_url} sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} \'{run_command}\'',
            f'SKYPILOT_API_SERVER_ENDPOINT={api_proxy_url} sky launch -y -c {name} {smoke_tests_utils.LOW_RESOURCE_ARG} --infra {generic_cloud} \'{run_command}\'',
            f'kill $(cat /tmp/{name}-chaos.pid)',
        ],
        timeout=smoke_tests_utils.get_timeout(generic_cloud),
--- a/tests/unit_tests/test_sky/clouds/test_kubernetes.py
+++ b/tests/unit_tests/test_sky/clouds/test_kubernetes.py
@@ -11,6 +11,7 @@ import pytest
 from sky.clouds import kubernetes
 from sky.clouds.utils import gcp_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.utils import resources_utils


 class TestKubernetesExistingAllowedContexts(unittest.TestCase):
@@ -379,7 +380,6 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
        setattr(self.resources, 'assert_launchable', lambda: self.resources)

        # Import NetworkTier for setting network_tier
        from sky.utils import resources_utils
        self.resources.network_tier = resources_utils.NetworkTier.BEST

        self.cluster_name = "test-cluster"
@@ -439,7 +439,9 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
        # Call make_deploy_resources_variables
        deploy_vars = k8s_cloud.make_deploy_resources_variables(
            resources=self.resources,
            cluster_name=self.cluster_name,
            cluster_name=resources_utils.ClusterName(
                display_name=self.cluster_name,
                name_on_cloud=self.cluster_name),
            region=self.region,
            zones=None,
            num_nodes=1,
@@ -506,7 +508,9 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
        # Call make_deploy_resources_variables
        deploy_vars = k8s_cloud.make_deploy_resources_variables(
            resources=self.resources,
            cluster_name=self.cluster_name,
            cluster_name=resources_utils.ClusterName(
                display_name=self.cluster_name,
                name_on_cloud=self.cluster_name),
            region=self.region,
            zones=None,
            num_nodes=1,
@@ -539,7 +543,6 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
        """Test that IPC_LOCK capability is disabled when network tier is not BEST."""

        # Modify resources to not use BEST network tier
        from sky.utils import resources_utils
        self.resources.network_tier = resources_utils.NetworkTier.STANDARD

        # Setup mocks - when network tier is not BEST, _detect_network_type returns NONE
@@ -575,7 +578,9 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
        # Call make_deploy_resources_variables
        deploy_vars = k8s_cloud.make_deploy_resources_variables(
            resources=self.resources,
            cluster_name=self.cluster_name,
            cluster_name=resources_utils.ClusterName(
                display_name=self.cluster_name,
                name_on_cloud=self.cluster_name),
            region=self.region,
            zones=None,
            num_nodes=1,
@@ -620,7 +625,6 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
        setattr(gpu_resources, 'assert_launchable', lambda: gpu_resources)

        # Set network tier to BEST
        from sky.utils import resources_utils
        gpu_resources.network_tier = resources_utils.NetworkTier.BEST

        # Setup mocks - cluster supports high performance networking (Nebius)
@@ -663,7 +667,9 @@ class TestKubernetesSecurityContextMerging(unittest.TestCase):
        # Call make_deploy_resources_variables
        deploy_vars = k8s_cloud.make_deploy_resources_variables(
            resources=gpu_resources,
            cluster_name="test-nebius-gpu-cluster",
            cluster_name=resources_utils.ClusterName(
                display_name="test-nebius-gpu-cluster",
                name_on_cloud="test-nebius-gpu-cluster"),
            region=mock.MagicMock(name="nebius-context"),
            zones=None,
            num_nodes=1,
@@ -711,7 +717,6 @@ class TestKubernetesMakeDeployResourcesVariables(unittest.TestCase):
        setattr(self.resources, 'assert_launchable', lambda: self.resources)

        # Import NetworkTier for setting network_tier
        from sky.utils import resources_utils
        self.resources.network_tier = resources_utils.NetworkTier.BEST

        self.cluster_name = "test-k8s-cluster"
@@ -791,7 +796,9 @@ class TestKubernetesMakeDeployResourcesVariables(unittest.TestCase):
        # Call make_deploy_resources_variables
        deploy_vars = k8s_cloud.make_deploy_resources_variables(
            resources=self.resources,
            cluster_name=self.cluster_name,
            cluster_name=resources_utils.ClusterName(
                display_name=self.cluster_name,
                name_on_cloud=self.cluster_name),
            region=self.region,
            zones=None,
            num_nodes=1,
@@ -880,7 +887,6 @@ class TestKubernetesMakeDeployResourcesVariables(unittest.TestCase):
        prod_resources.image_id = None
        setattr(prod_resources, 'assert_launchable', lambda: prod_resources)

        from sky.utils import resources_utils
        prod_resources.network_tier = resources_utils.NetworkTier.BEST

        prod_region = mock.MagicMock()
@@ -889,7 +895,9 @@ class TestKubernetesMakeDeployResourcesVariables(unittest.TestCase):
        # Call make_deploy_resources_variables
        deploy_vars = k8s_cloud.make_deploy_resources_variables(
            resources=prod_resources,
            cluster_name="test-prod-cluster",
            cluster_name=resources_utils.ClusterName(
                display_name="test-prod-cluster",
                name_on_cloud="test-prod-cluster"),
            region=prod_region,
            zones=None,
            num_nodes=1,
@@ -1569,7 +1577,6 @@ class TestKubernetesUnsupportedFeaturesForResources(unittest.TestCase):

        resources = mock.MagicMock()
        resources.region = 'test-context'
        from sky.utils import resources_utils
        resources.network_tier = resources_utils.NetworkTier.BEST

        from sky import clouds
@@ -1600,7 +1607,6 @@ class TestKubernetesUnsupportedFeaturesForResources(unittest.TestCase):

        resources = mock.MagicMock()
        resources.region = 'test-context'
        from sky.utils import resources_utils
        resources.network_tier = resources_utils.NetworkTier.BEST

        from sky import clouds
--- a/tests/unit_tests/test_sky/clouds/test_ssh.py
+++ b/tests/unit_tests/test_sky/clouds/test_ssh.py
@@ -11,7 +11,7 @@ import pytest
 import yaml

 from sky.clouds import ssh
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.utils import resources_utils


 class TestSSHExistingAllowedContexts(unittest.TestCase):
@@ -402,7 +402,9 @@ class TestSSHMakeDeployResourcesVariables(unittest.TestCase):
        # Call make_deploy_resources_variables
        deploy_vars = ssh_cloud.make_deploy_resources_variables(
            resources=self.resources,
            cluster_name=self.cluster_name,
            cluster_name=resources_utils.ClusterName(
                display_name=self.cluster_name,
                name_on_cloud=self.cluster_name),
            region=self.region,
            zones=None,
            num_nodes=1,
@@ -500,7 +502,9 @@ class TestSSHMakeDeployResourcesVariables(unittest.TestCase):
        # Call make_deploy_resources_variables
        deploy_vars = ssh_cloud.make_deploy_resources_variables(
            resources=prod_resources,
            cluster_name="test-prod-cluster",
            cluster_name=resources_utils.ClusterName(
                display_name="test-prod-cluster",
                name_on_cloud="test-prod-cluster"),
            region=prod_region,
            zones=None,
            num_nodes=1,
--- a/tests/unit_tests/test_sky/test_task.py
+++ b/tests/unit_tests/test_sky/test_task.py
@@ -710,6 +710,7 @@ def make_mock_resource(cloud=None, region=None, zone=None):
            self.cloud = cloud
            self.region = region
            self.zone = zone
            self.priority = 0

        def copy(self, **override):
            # Return a new instance with overridden attributes
Author	SHA1	Message	Date
zpoint	dd21824060	Fix aiohttp version failure in buildkite (#8267) fix	6 days ago
Seung Jin	e4e2d27653	[UX] Introduce pending state (#8262) * introduce pending state * add info comment * Update sky/utils/status_lib.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --------- Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>	6 days ago
zpoint	166fd63617	[Dashboard] Add some tests for the dashboard performance (#8227) npm install before verify	6 days ago
Seung Jin	791356230a	[tests] misc test fixes (#8260) misc test fixes	6 days ago
Christopher Cooper	b5ff2fdb54	[deps] pin pycares<5 to work around aiodns issue (#8259)	6 days ago
lloyd-brown	9f7bdf7d28	[Tests] Add Cloud Selection to CLI Smoke Tests (#8256) * Add infra. * Update tests/smoke_tests/test_cli.py	6 days ago