Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 61 additions & 1 deletion modules/python/clients/aks_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,46 @@ def add_managed_gpu_node_pool(
)
logger.info(f"az aks nodepool add succeeded for '{node_pool_name}'")

@staticmethod
def _gpu_mode_metadata(
    gpu_node_pool: bool,
    enable_managed_gpu: bool,
    gpu_instance_profile: Optional[str] = None,
    gpu_mig_strategy: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Summarize the GPU configuration requested for a node pool operation.

    The result is computed purely from the caller-supplied flags; nothing is
    read back from AKS. The rationale recorded for this helper is that the
    stable Python SDK (azure-mgmt-containerservice) does not model
    gpuProfile.nvidia.managementMode, so the fully-managed mode would
    otherwise be missing from nodepool_info. Storing the inputs keeps the
    managed vs fully-managed and MIG single vs mixed distinctions available
    to downstream consumers.

    Args:
        gpu_node_pool: Whether the pool is a GPU pool at all.
        enable_managed_gpu: Whether fully-managed GPU was requested.
        gpu_instance_profile: MIG instance profile (e.g. "MIG1g"), if any.
        gpu_mig_strategy: MIG strategy ("single" or "mixed"), if any.

    Returns:
        Dict with the keys:
            - gpu_mode: "none" | "managed" | "fully_managed"
            - enable_managed_gpu: the flag exactly as passed in
            - mig_enabled: True when a MIG profile or strategy was given
            - gpu_instance_profile: the profile exactly as passed in
            - gpu_mig_strategy: the strategy exactly as passed in
    """
    if gpu_node_pool:
        # A GPU pool is either fully managed or plain managed.
        mode = "fully_managed" if enable_managed_gpu else "managed"
    else:
        # Non-GPU pools report "none" regardless of the managed flag.
        mode = "none"

    # Either MIG input on its own is enough to count as a MIG request.
    mig_requested = bool(gpu_instance_profile or gpu_mig_strategy)

    metadata = {
        "gpu_mode": mode,
        "enable_managed_gpu": enable_managed_gpu,
        "mig_enabled": mig_requested,
        "gpu_instance_profile": gpu_instance_profile,
        "gpu_mig_strategy": gpu_mig_strategy,
    }
    return metadata

def create_node_pool(
self,
node_pool_name: str,
Expand Down Expand Up @@ -409,7 +449,12 @@ def create_node_pool(
"vm_size": vm_size,
"node_count": node_count,
"gpu_node_pool": gpu_node_pool,
"enable_managed_gpu": enable_managed_gpu,
**self._gpu_mode_metadata(
gpu_node_pool,
enable_managed_gpu,
gpu_instance_profile,
gpu_mig_strategy,
),
}

# Create operation context to track the operation
Expand Down Expand Up @@ -522,6 +567,7 @@ def scale_node_pool(
progressive: bool = False,
scale_step_size: int = 1,
gpu_instance_profile: Optional[str] = None,
gpu_mig_strategy: Optional[str] = None,
) -> Any:
"""
Scale a node pool to the specified node count.
Expand Down Expand Up @@ -555,6 +601,12 @@ def scale_node_pool(
"gpu_node_pool": gpu_node_pool,
"progressive_scaling": progressive,
"scale_step_size": scale_step_size,
**self._gpu_mode_metadata(
gpu_node_pool,
enable_managed_gpu,
gpu_instance_profile,
gpu_mig_strategy,
),
}
node_pool = self.get_node_pool(node_pool_name, cluster_name)

Expand Down Expand Up @@ -583,6 +635,7 @@ def scale_node_pool(
enable_managed_gpu=enable_managed_gpu,
node_pool=node_pool,
gpu_instance_profile=gpu_instance_profile,
gpu_mig_strategy=gpu_mig_strategy,
)

# Create operation context to track the operation
Expand Down Expand Up @@ -751,6 +804,7 @@ def _progressive_scale(
enable_managed_gpu: bool = False,
node_pool: Optional[Any] = None,
gpu_instance_profile: Optional[str] = None,
gpu_mig_strategy: Optional[str] = None,
) -> Any:
"""
Scale a node pool progressively with specified step size
Expand Down Expand Up @@ -816,6 +870,12 @@ def _progressive_scale(
"scale_step_size": scale_step_size,
"cluster_name": cluster_name or self.get_cluster_name(),
"gpu_node_pool": gpu_node_pool,
**self._gpu_mode_metadata(
gpu_node_pool,
enable_managed_gpu,
gpu_instance_profile,
gpu_mig_strategy,
),
}

# Create operation context for this specific step
Expand Down
5 changes: 5 additions & 0 deletions modules/python/crud/azure/node_pool_crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def scale_node_pool(
gpu_node_pool=False,
enable_managed_gpu=False,
gpu_instance_profile=None,
gpu_mig_strategy=None,
):
"""
Scale a node pool to specified count
Expand Down Expand Up @@ -145,6 +146,7 @@ def scale_node_pool(
progressive=progressive,
scale_step_size=scale_step_size,
gpu_instance_profile=gpu_instance_profile,
gpu_mig_strategy=gpu_mig_strategy,
)

if result is not None:
Expand Down Expand Up @@ -253,6 +255,7 @@ def all(
gpu_node_pool=gpu_node_pool,
enable_managed_gpu=enable_managed_gpu,
gpu_instance_profile=gpu_instance_profile,
gpu_mig_strategy=gpu_mig_strategy,
)
results["scale_up"] = scale_up_result

Expand All @@ -276,6 +279,8 @@ def all(
scale_step_size=scale_step_size,
gpu_node_pool=gpu_node_pool,
enable_managed_gpu=enable_managed_gpu,
gpu_instance_profile=gpu_instance_profile,
gpu_mig_strategy=gpu_mig_strategy,
)
results["scale_down"] = scale_down_result

Expand Down
3 changes: 3 additions & 0 deletions modules/python/crud/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ def handle_node_pool_operation(node_pool_crud, args):
"gpu_node_pool": args.gpu_node_pool,
"enable_managed_gpu": args.enable_managed_gpu,
"gpu_instance_profile": args.gpu_instance_profile,
"gpu_mig_strategy": args.gpu_mig_strategy,
}
Comment on lines 141 to 144

result = node_pool_crud.scale_node_pool(**scale_kwargs)
Expand All @@ -151,6 +152,8 @@ def handle_node_pool_operation(node_pool_crud, args):
"gpu_node_pool": args.gpu_node_pool,
"enable_managed_gpu": args.enable_managed_gpu,
"step_wait_time": args.step_wait_time,
"gpu_instance_profile": args.gpu_instance_profile,
"gpu_mig_strategy": args.gpu_mig_strategy,
}

result = node_pool_crud.all(**all_kwargs)
Expand Down
70 changes: 70 additions & 0 deletions modules/python/tests/clients/test_aks_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,76 @@ def test_scale_gpu_node_pool_down_no_verification(self, mock_time):
# Check that NVIDIA verification was NOT performed for scale-down
self.mock_k8s.verify_nvidia_smi_on_node.assert_not_called()

def test_gpu_mode_metadata_variants(self):
    """Check _gpu_mode_metadata output for non-GPU, managed, fully-managed and MIG inputs."""
    build = AKSClient._gpu_mode_metadata

    # Non-GPU pool: the whole dict is pinned.
    expected_non_gpu = {
        "gpu_mode": "none",
        "enable_managed_gpu": False,
        "mig_enabled": False,
        "gpu_instance_profile": None,
        "gpu_mig_strategy": None,
    }
    self.assertEqual(build(False, False), expected_non_gpu)

    # GPU pool without the fully-managed flag.
    managed_meta = build(True, False)
    self.assertEqual(managed_meta["gpu_mode"], "managed")
    self.assertFalse(managed_meta["enable_managed_gpu"])
    self.assertFalse(managed_meta["mig_enabled"])

    # GPU pool with the fully-managed flag.
    fully_meta = build(True, True)
    self.assertEqual(fully_meta["gpu_mode"], "fully_managed")
    self.assertTrue(fully_meta["enable_managed_gpu"])

    # Fully managed with a MIG profile and the "mixed" strategy.
    mixed_meta = build(True, True, "MIG1g", "mixed")
    self.assertEqual(mixed_meta["gpu_mode"], "fully_managed")
    self.assertTrue(mixed_meta["mig_enabled"])
    self.assertEqual(mixed_meta["gpu_instance_profile"], "MIG1g")
    self.assertEqual(mixed_meta["gpu_mig_strategy"], "mixed")

    # Fully managed with a MIG profile and the "single" strategy.
    single_meta = build(True, True, "MIG1g", "single")
    self.assertEqual(single_meta["gpu_mig_strategy"], "single")
    self.assertTrue(single_meta["mig_enabled"])

@mock.patch("clients.aks_client.time")
def test_scale_node_pool_records_gpu_mode_metadata(self, mock_time):
    """Scaling passes gpu_mode and the MIG inputs in the metadata given to OperationContext."""
    pool_name = "h100fullmgd"
    target_count = 3

    # Two timestamps are queued for the patched time module.
    mock_time.time.side_effect = [100, 150]

    # Existing pool returned by both the SDK mock and the client's own getter.
    existing_pool = mock.MagicMock(count=1, vm_size="Standard_NC40ads_H100_v5")
    existing_pool.as_dict.return_value = {"count": 1}
    self.mock_agent_pools.get.return_value = existing_pool
    self.aks_client.get_node_pool = mock.MagicMock(return_value=existing_pool)

    # Kubernetes-side checks are stubbed out.
    self.mock_k8s.wait_for_nodes_ready.return_value = [mock.MagicMock()] * target_count
    self.mock_k8s.verify_managed_gpu_systemd_services = mock.MagicMock(return_value={})
    self.mock_k8s.verify_nvidia_smi_on_node = mock.MagicMock()
    self.mock_k8s.verify_mig_allocatable = mock.MagicMock(return_value={})

    scale_kwargs = {
        "node_pool_name": pool_name,
        "node_count": target_count,
        "gpu_node_pool": True,
        "enable_managed_gpu": True,
        "gpu_instance_profile": "MIG1g",
        "gpu_mig_strategy": "mixed",
    }
    result = self.aks_client.scale_node_pool(**scale_kwargs)

    self.assertTrue(result)
    # The metadata dict is the 3rd positional arg to OperationContext(...)
    positional_args = self.mock_operation_context.call_args[0]
    metadata = positional_args[2]
    self.assertEqual(metadata["gpu_mode"], "fully_managed")
    self.assertTrue(metadata["enable_managed_gpu"])
    self.assertTrue(metadata["mig_enabled"])
    self.assertEqual(metadata["gpu_instance_profile"], "MIG1g")
    self.assertEqual(metadata["gpu_mig_strategy"], "mixed")


if __name__ == "__main__":
unittest.main()
11 changes: 9 additions & 2 deletions steps/engine/crud/k8s/execute.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ steps:
--step-timeout "$STEP_TIME_OUT" \
${GPU_NODE_POOL:+--gpu-node-pool} \
$([[ "${ENABLE_MANAGED_GPU,,}" == "true" ]] && echo "--enable-managed-gpu" || true) \
$([[ "${GPU_INSTANCE_PROFILE}" =~ ^MIG ]] && echo "--gpu-instance-profile ${GPU_INSTANCE_PROFILE}" || true)
$([[ "${GPU_INSTANCE_PROFILE}" =~ ^MIG ]] && echo "--gpu-instance-profile ${GPU_INSTANCE_PROFILE}" || true) \
$([[ "${GPU_MIG_STRATEGY}" =~ ^(mixed|single)$ ]] && echo "--gpu-mig-strategy ${GPU_MIG_STRATEGY}" || true)

displayName: 'Execute K8s Create & Scale Up Operations for ${{ parameters.cloud }}'
workingDirectory: modules/python
Expand Down Expand Up @@ -155,7 +156,10 @@ steps:
--scale-step-size "$SCALE_STEP_SIZE" \
--step-wait-time "$STEP_WAIT_TIME" \
--step-timeout "$STEP_TIME_OUT" \
${GPU_NODE_POOL:+--gpu-node-pool}
${GPU_NODE_POOL:+--gpu-node-pool} \
$([[ "${ENABLE_MANAGED_GPU,,}" == "true" ]] && echo "--enable-managed-gpu" || true) \
$([[ "${GPU_INSTANCE_PROFILE}" =~ ^MIG ]] && echo "--gpu-instance-profile ${GPU_INSTANCE_PROFILE}" || true) \
$([[ "${GPU_MIG_STRATEGY}" =~ ^(mixed|single)$ ]] && echo "--gpu-mig-strategy ${GPU_MIG_STRATEGY}" || true)
Comment thread
Copilot marked this conversation as resolved.
Outdated

# Delete Node Pool
PYTHONPATH=$PYTHONPATH:$(pwd) python3 "$PYTHON_SCRIPT_FILE" delete \
Expand All @@ -178,6 +182,9 @@ steps:
STEP_TIME_OUT: $(STEP_TIME_OUT)
RESULT_DIR: $(System.DefaultWorkingDirectory)/$(RUN_ID)
GPU_NODE_POOL: $(GPU_NODE_POOL)
ENABLE_MANAGED_GPU: $(ENABLE_MANAGED_GPU)
GPU_INSTANCE_PROFILE: $(GPU_INSTANCE_PROFILE)
GPU_MIG_STRATEGY: $(GPU_MIG_STRATEGY)
STEP_WAIT_TIME: $(STEP_WAIT_TIME)
${{ if eq(parameters.cloud, 'aws') }}:
CAPACITY_TYPE: $(CAPACITY_TYPE)
Loading