@staticmethod
def _gpu_mode_metadata(
    gpu_node_pool: bool,
    enable_managed_gpu: bool,
    gpu_instance_profile: Optional[str] = None,
    gpu_mig_strategy: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Build normalized GPU-mode metadata for a node pool operation.

    The metadata is derived from the operation INPUT flags rather than the AKS
    read-back: the stable SDK does not model gpuProfile.nvidia.managementMode,
    so a fully-managed pool's mode is dropped from nodepool_info.

    Normalization rules:
      * enable_managed_gpu only counts when gpu_node_pool is truthy.
      * MIG inputs (profile / strategy) are kept only for fully-managed pools
        and are reported as None otherwise.
      * Empty values ("" or None) for either MIG input mean "not provided" and
        are reported as None.
      * gpu_mig_strategy is compared case-insensitively and stored lowercase.

    Args:
        gpu_node_pool: Whether the pool is a GPU pool.
        enable_managed_gpu: Whether fully-managed GPU was requested.
        gpu_instance_profile: Optional MIG instance profile (e.g. "MIG1g").
        gpu_mig_strategy: Optional MIG strategy, "single" or "mixed".

    Returns:
        A dict with the keys gpu_mode ("none" | "managed" | "fully_managed"),
        enable_managed_gpu, mig_enabled, gpu_instance_profile and
        gpu_mig_strategy. mig_enabled is True when a profile or a strategy
        survives normalization.

    Raises:
        ValueError: If gpu_mig_strategy is non-empty and is not "single" or
            "mixed" (checked regardless of the other flags).
    """
    # Validate first so a bad strategy is reported even when it would be dropped.
    strategy = str(gpu_mig_strategy).lower() if gpu_mig_strategy else None
    if strategy not in (None, "single", "mixed"):
        raise ValueError(
            f"invalid gpu_mig_strategy {gpu_mig_strategy!r} (want single/mixed/None)"
        )

    is_gpu = bool(gpu_node_pool)
    fully_managed = is_gpu and bool(enable_managed_gpu)

    if not is_gpu:
        gpu_mode = "none"
    elif fully_managed:
        gpu_mode = "fully_managed"
    else:
        gpu_mode = "managed"

    # MIG only applies to fully-managed pools; drop it otherwise. An empty
    # profile is collapsed to None so it matches how an empty strategy is
    # reported instead of leaking "" into the recorded metadata.
    profile = (gpu_instance_profile or None) if fully_managed else None
    strategy = strategy if fully_managed else None

    return {
        "gpu_mode": gpu_mode,
        "enable_managed_gpu": fully_managed,
        "mig_enabled": bool(profile or strategy),
        "gpu_instance_profile": profile,
        "gpu_mig_strategy": strategy,
    }


@staticmethod
def _log_gpu_mode(metadata: Dict[str, Any]) -> None:
    """
    Echo the normalized GPU-mode metadata to the console for traceability.

    Nothing is logged when gpu_mode is missing or "none", so non-GPU
    operations do not emit the GPU metadata line.

    Args:
        metadata: Operation metadata; the gpu_mode, enable_managed_gpu,
            mig_enabled, gpu_instance_profile and gpu_mig_strategy keys are
            read with .get(), so absent keys are logged as None.
    """
    if metadata.get("gpu_mode") in (None, "none"):
        return
    # Lazy %-style arguments: the message is only formatted if INFO is enabled.
    logger.info(
        "GPU pool metadata: gpu_mode=%s enable_managed_gpu=%s mig_enabled=%s "
        "gpu_instance_profile=%s gpu_mig_strategy=%s",
        metadata.get("gpu_mode"),
        metadata.get("enable_managed_gpu"),
        metadata.get("mig_enabled"),
        metadata.get("gpu_instance_profile"),
        metadata.get("gpu_mig_strategy"),
    )
specified node count. @@ -555,7 +627,14 @@ def scale_node_pool( "gpu_node_pool": gpu_node_pool, "progressive_scaling": progressive, "scale_step_size": scale_step_size, + **self._gpu_mode_metadata( + gpu_node_pool, + enable_managed_gpu, + gpu_instance_profile, + gpu_mig_strategy, + ), } + self._log_gpu_mode(metadata) node_pool = self.get_node_pool(node_pool_name, cluster_name) current_count = node_pool.count @@ -583,6 +662,7 @@ def scale_node_pool( enable_managed_gpu=enable_managed_gpu, node_pool=node_pool, gpu_instance_profile=gpu_instance_profile, + gpu_mig_strategy=gpu_mig_strategy, ) # Create operation context to track the operation @@ -751,6 +831,7 @@ def _progressive_scale( enable_managed_gpu: bool = False, node_pool: Optional[Any] = None, gpu_instance_profile: Optional[str] = None, + gpu_mig_strategy: Optional[str] = None, ) -> Any: """ Scale a node pool progressively with specified step size @@ -816,7 +897,14 @@ def _progressive_scale( "scale_step_size": scale_step_size, "cluster_name": cluster_name or self.get_cluster_name(), "gpu_node_pool": gpu_node_pool, + **self._gpu_mode_metadata( + gpu_node_pool, + enable_managed_gpu, + gpu_instance_profile, + gpu_mig_strategy, + ), } + self._log_gpu_mode(step_metadata) # Create operation context for this specific step with self._get_operation_context()( diff --git a/modules/python/clients/kubernetes_client.py b/modules/python/clients/kubernetes_client.py index 75caa81bb0..9d7b171353 100644 --- a/modules/python/clients/kubernetes_client.py +++ b/modules/python/clients/kubernetes_client.py @@ -683,18 +683,29 @@ def verify_nvidia_smi_on_node(self, nodes, namespace="default"): logger.info(f"Verifying NVIDIA drivers on node {node_name}") node = self.describe_node(node_name) - # Check if the node has GPUs allocated values (whole GPU or MIG slices) + # Wait for the node to advertise a POSITIVE GPU/MIG count. 
The device + # plugin can register nvidia.com/gpu with value "0" before MIG instances + # are published, so a MIG-single node briefly looks GPU-less. Waiting on + # key presence (rather than a positive count) would race in during that + # window and skip the node; wait on the count instead. start_time = time.time() + gpu_count = 0 while time.time() < start_time + 600: allocatable = node.status.allocatable or {} - if "nvidia.com/gpu" in allocatable or any(k.startswith("nvidia.com/mig-") for k in allocatable): + gpu_count = int(allocatable.get("nvidia.com/gpu", "0")) + mig_count = sum( + int(v) for k, v in allocatable.items() + if k.startswith("nvidia.com/mig-") + ) + if gpu_count > 0 or mig_count > 0: break - node = self.describe_node(node_name) - logger.info(f"Node allocatable resources: {node.status.allocatable}") - logger.info(f"Waiting for GPUs to be allocated on node {node_name}...") + logger.info( + f"Waiting for GPUs to be allocated on node {node_name}... " + f"(allocatable: {allocatable})" + ) time.sleep(1) - gpu_count = int(node.status.allocatable.get("nvidia.com/gpu", "0")) - has_mig = any(k.startswith("nvidia.com/mig-") for k in node.status.allocatable) + node = self.describe_node(node_name) + has_mig = any(k.startswith("nvidia.com/mig-") for k in (node.status.allocatable or {})) logger.info(f"Node {node_name} has {gpu_count} GPUs, requesting all for validation") diff --git a/modules/python/crud/azure/node_pool_crud.py b/modules/python/crud/azure/node_pool_crud.py index 6f52e69cfd..998299f0cb 100644 --- a/modules/python/crud/azure/node_pool_crud.py +++ b/modules/python/crud/azure/node_pool_crud.py @@ -118,6 +118,7 @@ def scale_node_pool( gpu_node_pool=False, enable_managed_gpu=False, gpu_instance_profile=None, + gpu_mig_strategy=None, ): """ Scale a node pool to specified count @@ -145,6 +146,7 @@ def scale_node_pool( progressive=progressive, scale_step_size=scale_step_size, gpu_instance_profile=gpu_instance_profile, + gpu_mig_strategy=gpu_mig_strategy, ) 
if result is not None: @@ -253,6 +255,7 @@ def all( gpu_node_pool=gpu_node_pool, enable_managed_gpu=enable_managed_gpu, gpu_instance_profile=gpu_instance_profile, + gpu_mig_strategy=gpu_mig_strategy, ) results["scale_up"] = scale_up_result @@ -276,6 +279,8 @@ def all( scale_step_size=scale_step_size, gpu_node_pool=gpu_node_pool, enable_managed_gpu=enable_managed_gpu, + gpu_instance_profile=gpu_instance_profile, + gpu_mig_strategy=gpu_mig_strategy, ) results["scale_down"] = scale_down_result diff --git a/modules/python/crud/main.py b/modules/python/crud/main.py index 60b605c1d7..1fad21a6a3 100644 --- a/modules/python/crud/main.py +++ b/modules/python/crud/main.py @@ -107,6 +107,16 @@ def handle_node_pool_operation(node_pool_crud, args): command = args.command result = None + # gpu_instance_profile / gpu_mig_strategy are Azure-only MIG inputs. The AWS + # CRUD does not accept these kwargs (and has no **kwargs), so passing them for + # --cloud aws would raise TypeError. Only forward them on Azure. 
+ azure_gpu_kwargs = {} + if args.cloud == "azure": + azure_gpu_kwargs = { + "gpu_instance_profile": args.gpu_instance_profile, + "gpu_mig_strategy": args.gpu_mig_strategy, + } + try: if command == "create": # Prepare create arguments @@ -116,8 +126,7 @@ def handle_node_pool_operation(node_pool_crud, args): "node_count": args.node_count, "gpu_node_pool": args.gpu_node_pool, "enable_managed_gpu": args.enable_managed_gpu, - "gpu_instance_profile": args.gpu_instance_profile, - "gpu_mig_strategy": args.gpu_mig_strategy, + **azure_gpu_kwargs, } result = node_pool_crud.create_node_pool(**create_kwargs) @@ -131,7 +140,7 @@ def handle_node_pool_operation(node_pool_crud, args): "scale_step_size": args.scale_step_size, "gpu_node_pool": args.gpu_node_pool, "enable_managed_gpu": args.enable_managed_gpu, - "gpu_instance_profile": args.gpu_instance_profile, + **azure_gpu_kwargs, } result = node_pool_crud.scale_node_pool(**scale_kwargs) @@ -151,6 +160,7 @@ def handle_node_pool_operation(node_pool_crud, args): "gpu_node_pool": args.gpu_node_pool, "enable_managed_gpu": args.enable_managed_gpu, "step_wait_time": args.step_wait_time, + **azure_gpu_kwargs, } result = node_pool_crud.all(**all_kwargs) diff --git a/modules/python/tests/clients/test_aks_client.py b/modules/python/tests/clients/test_aks_client.py index 1070e3fc32..d9f6a6f11f 100644 --- a/modules/python/tests/clients/test_aks_client.py +++ b/modules/python/tests/clients/test_aks_client.py @@ -629,6 +629,109 @@ def test_scale_gpu_node_pool_down_no_verification(self, mock_time): # Check that NVIDIA verification was NOT performed for scale-down self.mock_k8s.verify_nvidia_smi_on_node.assert_not_called() + def test_gpu_mode_metadata_variants(self): + """_gpu_mode_metadata normalizes managed/fully-managed and MIG single/mixed.""" + gpu_meta = AKSClient._gpu_mode_metadata # pylint: disable=protected-access + # Non-GPU pool + self.assertEqual( + gpu_meta(False, False), + { + "gpu_mode": "none", + "enable_managed_gpu": False, + 
"mig_enabled": False, + "gpu_instance_profile": None, + "gpu_mig_strategy": None, + }, + ) + # Managed (driver bootstrap only) + managed = gpu_meta(True, False) + self.assertEqual(managed["gpu_mode"], "managed") + self.assertFalse(managed["enable_managed_gpu"]) + self.assertFalse(managed["mig_enabled"]) + # Fully managed + fully = gpu_meta(True, True) + self.assertEqual(fully["gpu_mode"], "fully_managed") + self.assertTrue(fully["enable_managed_gpu"]) + # Fully managed + MIG mixed + mixed = gpu_meta(True, True, "MIG1g", "mixed") + self.assertEqual(mixed["gpu_mode"], "fully_managed") + self.assertTrue(mixed["mig_enabled"]) + self.assertEqual(mixed["gpu_instance_profile"], "MIG1g") + self.assertEqual(mixed["gpu_mig_strategy"], "mixed") + # Fully managed + MIG single + single = gpu_meta(True, True, "MIG1g", "single") + self.assertEqual(single["gpu_mig_strategy"], "single") + self.assertTrue(single["mig_enabled"]) + # Normalization: MIG inputs are dropped for non-fully-managed pools + managed_with_mig = gpu_meta(True, False, "MIG1g", "single") + self.assertEqual(managed_with_mig["gpu_mode"], "managed") + self.assertFalse(managed_with_mig["mig_enabled"]) + self.assertIsNone(managed_with_mig["gpu_instance_profile"]) + self.assertIsNone(managed_with_mig["gpu_mig_strategy"]) + # Normalization: managed flag is meaningless without a GPU pool + not_gpu = gpu_meta(False, True) + self.assertEqual(not_gpu["gpu_mode"], "none") + self.assertFalse(not_gpu["enable_managed_gpu"]) + # Invalid MIG strategy is rejected + with self.assertRaises(ValueError): + gpu_meta(True, True, "MIG1g", "bogus") + + def test_log_gpu_mode_console_echo(self): + """_log_gpu_mode echoes GPU metadata to the console for GPU pools only.""" + log_gpu_mode = AKSClient._log_gpu_mode # pylint: disable=protected-access + with self.assertLogs("clients.aks_client", level="INFO") as cm: + log_gpu_mode( + { + "gpu_mode": "fully_managed", + "enable_managed_gpu": True, + "mig_enabled": True, + "gpu_instance_profile": 
"MIG1g", + "gpu_mig_strategy": "mixed", + } + ) + self.assertTrue(any("gpu_mode=fully_managed" in m for m in cm.output)) + self.assertTrue(any("gpu_mig_strategy=mixed" in m for m in cm.output)) + # Non-GPU operations must not emit the GPU metadata line. + with self.assertNoLogs("clients.aks_client", level="INFO"): + log_gpu_mode({"gpu_mode": "none"}) + + @mock.patch("clients.aks_client.time") + def test_scale_node_pool_records_gpu_mode_metadata(self, mock_time): + """Scale ops persist gpu_mode + MIG fields even though the SDK read-back drops them.""" + node_pool_name = "h100fullmgd" + node_count = 3 + + mock_time.time.side_effect = [100, 150] + + mock_node_pool = mock.MagicMock() + mock_node_pool.count = 1 + mock_node_pool.vm_size = "Standard_NC40ads_H100_v5" + mock_node_pool.as_dict.return_value = {"count": 1} + self.mock_agent_pools.get.return_value = mock_node_pool + self.aks_client.get_node_pool = mock.MagicMock(return_value=mock_node_pool) + self.mock_k8s.wait_for_nodes_ready.return_value = [mock.MagicMock()] * node_count + self.mock_k8s.verify_managed_gpu_systemd_services = mock.MagicMock(return_value={}) + self.mock_k8s.verify_nvidia_smi_on_node = mock.MagicMock() + self.mock_k8s.verify_mig_allocatable = mock.MagicMock(return_value={}) + + result = self.aks_client.scale_node_pool( + node_pool_name=node_pool_name, + node_count=node_count, + gpu_node_pool=True, + enable_managed_gpu=True, + gpu_instance_profile="MIG1g", + gpu_mig_strategy="mixed", + ) + + self.assertTrue(result) + # The metadata dict is the 3rd positional arg to OperationContext(...) 
+ metadata = self.mock_operation_context.call_args[0][2] + self.assertEqual(metadata["gpu_mode"], "fully_managed") + self.assertTrue(metadata["enable_managed_gpu"]) + self.assertTrue(metadata["mig_enabled"]) + self.assertEqual(metadata["gpu_instance_profile"], "MIG1g") + self.assertEqual(metadata["gpu_mig_strategy"], "mixed") + if __name__ == "__main__": unittest.main() diff --git a/modules/python/tests/clients/test_kubernetes_client.py b/modules/python/tests/clients/test_kubernetes_client.py index a990fa06e1..0ab752312e 100644 --- a/modules/python/tests/clients/test_kubernetes_client.py +++ b/modules/python/tests/clients/test_kubernetes_client.py @@ -2,6 +2,7 @@ """ Unit tests for KubernetesClient class """ +import itertools import unittest from unittest import mock from unittest.mock import patch, mock_open, MagicMock @@ -2113,19 +2114,64 @@ def test_verify_nvidia_smi_general_exception(self, mock_describe_node, _mock_cre result = self.client.verify_nvidia_smi_on_node([node]) self.assertFalse(result) + @patch("time.time", side_effect=itertools.count(0, 1000)) + @patch("time.sleep", return_value=None) @patch("clients.kubernetes_client.KubernetesClient.describe_node") - def test_verify_nvidia_smi_no_gpu_nodes(self, mock_describe_node): - """Test nvidia-smi verification skips nodes with no GPUs.""" + def test_verify_nvidia_smi_no_gpu_nodes(self, mock_describe_node, _mock_sleep, _mock_time): + """Nodes that never advertise a positive GPU/MIG count are skipped (after the wait).""" node = MagicMock() node.metadata.name = "cpu-only-node" node.status.allocatable = {"nvidia.com/gpu": "0"} - # Mock describe_node to return the node with no GPUs + # describe_node keeps reporting 0 GPUs; the wait times out and the node is skipped. 
mock_describe_node.return_value = node result = self.client.verify_nvidia_smi_on_node([node]) self.assertEqual(result, {}) # Should return empty dict as no nodes processed + @patch("time.time", side_effect=itertools.count(0, 1)) + @patch("time.sleep", return_value=None) + @patch("clients.kubernetes_client.KubernetesClient.get_pod_logs") + @patch("kubernetes.client.CoreV1Api.delete_namespaced_pod") + @patch("kubernetes.client.CoreV1Api.read_namespaced_pod") + @patch("kubernetes.client.CoreV1Api.create_namespaced_pod") + @patch("clients.kubernetes_client.KubernetesClient.describe_node") + def test_verify_nvidia_smi_waits_for_mig_registration( + self, + mock_describe_node, + mock_create_pod, + mock_read_pod, + _mock_delete_pod, + mock_get_logs, + _mock_sleep, + _mock_time, + ): + """A node reporting 0 GPUs mid-registration is NOT skipped — wait for a positive count.""" + node_name = "mig-single-node" + zero_node = MagicMock() + zero_node.metadata.name = node_name + zero_node.status.allocatable = {"nvidia.com/gpu": "0"} # device plugin registered, not yet populated + ready_node = MagicMock() + ready_node.metadata.name = node_name + ready_node.status.allocatable = {"nvidia.com/gpu": "56"} # MIG-single instances published + + # First describe (pre-loop) sees 0; after one wait iteration it sees 56. + mock_describe_node.side_effect = [zero_node, ready_node] + mock_read_pod.side_effect = [ + MagicMock(status=MagicMock(phase="Pending")), + MagicMock(status=MagicMock(phase="Succeeded")), + ] + mock_get_logs.return_value = "NVIDIA-SMI GPU driver info" + + result = self.client.verify_nvidia_smi_on_node([zero_node]) + + # The node was verified, not skipped, and a whole GPU (MIG-single) was requested. 
+ self.assertIn(node_name, result) + self.assertTrue(result[node_name]["device_status"]) + self.assertGreaterEqual(mock_describe_node.call_count, 2) + pod_spec = mock_create_pod.call_args[1]["body"] + self.assertEqual(pod_spec.spec.containers[0].resources.limits["nvidia.com/gpu"], "1") + @patch("kubernetes.client.AppsV1Api.create_namespaced_daemon_set") @patch("requests.get") def test_install_gpu_device_plugin_success(self, mock_requests_get, mock_create_ds): diff --git a/modules/python/tests/crud/test_azure_node_pool_crud.py b/modules/python/tests/crud/test_azure_node_pool_crud.py index 4fd41a0310..0889877c79 100644 --- a/modules/python/tests/crud/test_azure_node_pool_crud.py +++ b/modules/python/tests/crud/test_azure_node_pool_crud.py @@ -114,6 +114,7 @@ def test_scale_node_pool_up(self): progressive=False, scale_step_size=1, gpu_instance_profile=None, + gpu_mig_strategy=None, ) def test_scale_node_pool_down(self): @@ -143,6 +144,7 @@ def test_scale_node_pool_down(self): progressive=False, scale_step_size=1, gpu_instance_profile=None, + gpu_mig_strategy=None, ) def test_delete_node_pool(self): diff --git a/modules/python/tests/crud/test_main.py b/modules/python/tests/crud/test_main.py index f215e21f62..ccbb189ba4 100644 --- a/modules/python/tests/crud/test_main.py +++ b/modules/python/tests/crud/test_main.py @@ -68,6 +68,7 @@ def test_handle_node_pool_operation_create(self, mock_azure_crud): # Setup mock_args = mock.MagicMock() mock_args.command = "create" + mock_args.cloud = "azure" mock_args.node_pool_name = "test-np" mock_args.vm_size = "Standard_D2s_v3" mock_args.node_count = 3 @@ -98,6 +99,7 @@ def test_handle_node_pool_operation_scale(self, mock_azure_crud): # Setup mock_args = mock.MagicMock() mock_args.command = "scale" + mock_args.cloud = "azure" mock_args.node_pool_name = "test-np" mock_args.target_count = 5 mock_args.scale_step_size = ( @@ -122,6 +124,7 @@ def test_handle_node_pool_operation_scale(self, mock_azure_crud): gpu_node_pool=False, 
enable_managed_gpu=False, gpu_instance_profile=mock_args.gpu_instance_profile, + gpu_mig_strategy=mock_args.gpu_mig_strategy, ) @mock.patch("crud.main.AzureNodePoolCRUD") @@ -130,6 +133,7 @@ def test_handle_node_pool_operation_scale_non_progressive(self, mock_azure_crud) # Setup - when scale_step_size equals target_count, progressive should be False mock_args = mock.MagicMock() mock_args.command = "scale" + mock_args.cloud = "azure" mock_args.node_pool_name = "test-np" mock_args.target_count = 3 mock_args.scale_step_size = ( @@ -154,6 +158,7 @@ def test_handle_node_pool_operation_scale_non_progressive(self, mock_azure_crud) gpu_node_pool=False, enable_managed_gpu=False, gpu_instance_profile=mock_args.gpu_instance_profile, + gpu_mig_strategy=mock_args.gpu_mig_strategy, ) @mock.patch("crud.main.logger") @@ -170,6 +175,7 @@ def test_handle_node_pool_operation_scale_fails_returns_error( # Setup - progressive scaling where operation fails mock_args = mock.MagicMock() mock_args.command = "scale" + mock_args.cloud = "azure" mock_args.node_pool_name = "test-np" mock_args.target_count = 10 mock_args.scale_step_size = 2 # Progressive scaling @@ -192,6 +198,7 @@ def test_handle_node_pool_operation_scale_fails_returns_error( gpu_node_pool=False, enable_managed_gpu=False, gpu_instance_profile=mock_args.gpu_instance_profile, + gpu_mig_strategy=mock_args.gpu_mig_strategy, ) mock_logger.error.assert_called_with("Operation 'scale' failed") @@ -257,6 +264,7 @@ def test_handle_node_pool_operation_all(self, mock_azure_crud): # Setup mock_args = mock.MagicMock() mock_args.command = "all" + mock_args.cloud = "azure" mock_args.node_pool_name = "test-np" mock_args.vm_size = "Standard_D2s_v3" mock_args.node_count = 1 @@ -284,14 +292,38 @@ def test_handle_node_pool_operation_all(self, mock_azure_crud): gpu_node_pool=True, enable_managed_gpu=False, step_wait_time=30, + gpu_instance_profile=mock_args.gpu_instance_profile, + gpu_mig_strategy=mock_args.gpu_mig_strategy, ) + def 
test_handle_node_pool_operation_scale_aws_omits_mig_kwargs(self): + """AWS scale must not receive Azure-only MIG kwargs (the AWS CRUD rejects them).""" + mock_args = mock.MagicMock() + mock_args.command = "scale" + mock_args.cloud = "aws" + mock_args.node_pool_name = "test-np" + mock_args.target_count = 5 + mock_args.scale_step_size = 1 + mock_args.gpu_node_pool = False + mock_args.enable_managed_gpu = False + + mock_crud = mock.MagicMock() + mock_crud.scale_node_pool.return_value = True + + result = handle_node_pool_operation(mock_crud, mock_args) + + self.assertEqual(result, 0) + call_kwargs = mock_crud.scale_node_pool.call_args.kwargs + self.assertNotIn("gpu_instance_profile", call_kwargs) + self.assertNotIn("gpu_mig_strategy", call_kwargs) + @mock.patch("crud.main.AzureNodePoolCRUD") def test_handle_node_pool_operation_failure(self, mock_azure_crud): """Test handle_node_pool_operation when operation fails""" # Setup mock_args = mock.MagicMock() mock_args.command = "create" + mock_args.cloud = "azure" mock_args.node_pool_name = "test-np" mock_args.vm_size = "Standard_D2s_v3" mock_args.node_count = 1 @@ -353,6 +385,7 @@ def test_handle_node_pool_operation_exception(self, mock_azure_crud, mock_logger # Setup mock_args = mock.MagicMock() mock_args.command = "create" + mock_args.cloud = "azure" mock_args.node_pool_name = "test-np" mock_args.vm_size = "Standard_D2s_v3" mock_args.node_count = 1 diff --git a/steps/engine/crud/k8s/execute.yml b/steps/engine/crud/k8s/execute.yml index fa666a0023..5ee588cf0c 100644 --- a/steps/engine/crud/k8s/execute.yml +++ b/steps/engine/crud/k8s/execute.yml @@ -16,8 +16,8 @@ steps: --step-timeout "$STEP_TIME_OUT" \ ${GPU_NODE_POOL:+--gpu-node-pool} \ $([[ "${ENABLE_MANAGED_GPU,,}" == "true" ]] && echo "--enable-managed-gpu" || true) \ - $([[ "${GPU_INSTANCE_PROFILE}" =~ ^MIG ]] && echo "--gpu-instance-profile ${GPU_INSTANCE_PROFILE}" || true) \ - $([[ "${GPU_MIG_STRATEGY}" =~ ^(mixed|single)$ ]] && echo "--gpu-mig-strategy 
${GPU_MIG_STRATEGY}" || true) \ + $([[ "${ENABLE_MANAGED_GPU,,}" == "true" && "${GPU_INSTANCE_PROFILE}" =~ ^MIG ]] && echo "--gpu-instance-profile ${GPU_INSTANCE_PROFILE}" || true) \ + $([[ "${ENABLE_MANAGED_GPU,,}" == "true" && "${GPU_MIG_STRATEGY}" =~ ^(mixed|single)$ ]] && echo "--gpu-mig-strategy ${GPU_MIG_STRATEGY}" || true) \ --capacity-type "${CAPACITY_TYPE:-ON_DEMAND}" # Scale Up Node Pool @@ -32,7 +32,8 @@ steps: --step-timeout "$STEP_TIME_OUT" \ ${GPU_NODE_POOL:+--gpu-node-pool} \ $([[ "${ENABLE_MANAGED_GPU,,}" == "true" ]] && echo "--enable-managed-gpu" || true) \ - $([[ "${GPU_INSTANCE_PROFILE}" =~ ^MIG ]] && echo "--gpu-instance-profile ${GPU_INSTANCE_PROFILE}" || true) + $([[ "${ENABLE_MANAGED_GPU,,}" == "true" && "${GPU_INSTANCE_PROFILE}" =~ ^MIG ]] && echo "--gpu-instance-profile ${GPU_INSTANCE_PROFILE}" || true) \ + $([[ "${ENABLE_MANAGED_GPU,,}" == "true" && "${GPU_MIG_STRATEGY}" =~ ^(mixed|single)$ ]] && echo "--gpu-mig-strategy ${GPU_MIG_STRATEGY}" || true) displayName: 'Execute K8s Create & Scale Up Operations for ${{ parameters.cloud }}' workingDirectory: modules/python @@ -155,7 +156,10 @@ steps: --scale-step-size "$SCALE_STEP_SIZE" \ --step-wait-time "$STEP_WAIT_TIME" \ --step-timeout "$STEP_TIME_OUT" \ - ${GPU_NODE_POOL:+--gpu-node-pool} + ${GPU_NODE_POOL:+--gpu-node-pool} \ + $([[ "${ENABLE_MANAGED_GPU,,}" == "true" ]] && echo "--enable-managed-gpu" || true) \ + $([[ "${ENABLE_MANAGED_GPU,,}" == "true" && "${GPU_INSTANCE_PROFILE}" =~ ^MIG ]] && echo "--gpu-instance-profile ${GPU_INSTANCE_PROFILE}" || true) \ + $([[ "${ENABLE_MANAGED_GPU,,}" == "true" && "${GPU_MIG_STRATEGY}" =~ ^(mixed|single)$ ]] && echo "--gpu-mig-strategy ${GPU_MIG_STRATEGY}" || true) # Delete Node Pool PYTHONPATH=$PYTHONPATH:$(pwd) python3 "$PYTHON_SCRIPT_FILE" delete \ @@ -178,6 +182,9 @@ steps: STEP_TIME_OUT: $(STEP_TIME_OUT) RESULT_DIR: $(System.DefaultWorkingDirectory)/$(RUN_ID) GPU_NODE_POOL: $(GPU_NODE_POOL) + ENABLE_MANAGED_GPU: $(ENABLE_MANAGED_GPU) + 
GPU_INSTANCE_PROFILE: $(GPU_INSTANCE_PROFILE) + GPU_MIG_STRATEGY: $(GPU_MIG_STRATEGY) STEP_WAIT_TIME: $(STEP_WAIT_TIME) ${{ if eq(parameters.cloud, 'aws') }}: CAPACITY_TYPE: $(CAPACITY_TYPE)