Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cmd/manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -397,8 +397,10 @@ func main() {
if slices.Contains(mainConfig.EnabledControllers, "nova-pipeline-controllers") {
// Filter-weigher pipeline controller setup.
filterWeigherController := &nova.FilterWeigherPipelineController{
Monitor: filterWeigherPipelineMonitor,
Monitor: filterWeigherPipelineMonitor,
NoHostFoundCounter: nova.NewNoHostFoundCounter(),
}
metrics.Registry.MustRegister(filterWeigherController.NoHostFoundCounter)
// Inferred through the base controller.
filterWeigherController.Client = multiclusterClient
if err := filterWeigherController.SetupWithManager(mgr, multiclusterClient); err != nil {
Expand Down
199 changes: 199 additions & 0 deletions internal/scheduling/nova/cr_allocation.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
// Copyright SAP SE
// SPDX-License-Identifier: Apache-2.0

package nova

import (
"context"
"errors"
"fmt"

api "github.com/cobaltcore-dev/cortex/api/external/nova"
"github.com/cobaltcore-dev/cortex/api/v1alpha1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/util/retry"

"github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute"
"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations"
hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// recordCRAllocation writes the placed VM UUID into the matching Reservation
// Spec.CommittedResourceReservation.Allocations after a successful Nova placement.
// Best-effort: any failure is logged but never propagated to the caller.
func (c *FilterWeigherPipelineController) recordCRAllocation(ctx context.Context, decision *v1alpha1.Decision, request api.ExternalSchedulerRequest) {
log := ctrl.LoggerFrom(ctx)

instanceUUID := request.Spec.Data.InstanceUUID
projectID := request.Context.ProjectID
flavorName := request.Spec.Data.Flavor.Data.Name
selectedHost := *decision.Status.Result.TargetHost

flavorGroupName, flavorInGroup, err := c.resolveFlavorGroup(ctx, flavorName)
if err != nil {
if errors.Is(err, errFlavorNotInGroup) {
log.V(1).Info("CR allocation: flavor not in any group, PAYG placement", "flavor", flavorName)
} else {
log.Error(err, "CR allocation: failed to resolve flavor group",
"flavor", flavorName, "instanceUUID", instanceUUID)
}
return
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

// List all CR reservations and filter to candidates matching this placement.
var reservationList v1alpha1.ReservationList
if err := c.List(ctx, &reservationList,
client.MatchingLabels{v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource},
); err != nil {
log.Error(err, "CR allocation: failed to list reservations", "instanceUUID", instanceUUID)
return
}

var candidates []v1alpha1.Reservation
for _, res := range reservationList.Items {
cr := res.Spec.CommittedResourceReservation
if cr == nil {
continue
}
if res.Spec.TargetHost != selectedHost || cr.ProjectID != projectID || cr.ResourceGroup != flavorGroupName {
continue
}
// Idempotency: if this VM UUID is already recorded, the work is done.
if _, exists := cr.Allocations[instanceUUID]; exists {
log.Info("CR allocation: VM UUID already in reservation, skipping",
"instanceUUID", instanceUUID, "reservation", res.Name)
return
}
candidates = append(candidates, res)
}

if len(candidates) == 0 {
log.V(1).Info("CR allocation: no matching reservation slot, PAYG placement",
"instanceUUID", instanceUUID, "host", selectedHost,
"projectID", projectID, "flavorGroup", flavorGroupName)
return
}

vmMemoryBytes := int64(flavorInGroup.MemoryMB) * 1024 * 1024 //nolint:gosec // flavor memory bounded by specs
vmCPUs := int64(flavorInGroup.VCPUs) //nolint:gosec // VCPUs bounded by specs

slotName := pickReservationSlot(candidates, vmMemoryBytes, vmCPUs)
if slotName == "" {
log.Error(nil, "CR allocation: no reservation slot has sufficient remaining capacity",
"instanceUUID", instanceUUID, "vmMemoryBytes", vmMemoryBytes,
"host", selectedHost, "candidates", len(candidates))
return
}

log.Info("CR allocation: writing VM UUID into reservation",
"instanceUUID", instanceUUID, "reservation", slotName,
"projectID", projectID, "flavorGroup", flavorGroupName, "host", selectedHost)

vmResources := map[hv1.ResourceName]resource.Quantity{
hv1.ResourceMemory: *resource.NewQuantity(vmMemoryBytes, resource.BinarySI),
hv1.ResourceCPU: *resource.NewQuantity(vmCPUs, resource.DecimalSI),
}
if retryErr := retry.RetryOnConflict(retry.DefaultRetry, func() error {
latest := &v1alpha1.Reservation{}
if err := c.Get(ctx, client.ObjectKey{Name: slotName}, latest); err != nil {
return err
}
if latest.Spec.CommittedResourceReservation == nil {
return fmt.Errorf("reservation %s lost CommittedResourceReservation spec", slotName)
}
if latest.Spec.CommittedResourceReservation.Allocations == nil {
latest.Spec.CommittedResourceReservation.Allocations = make(map[string]v1alpha1.CommittedResourceAllocation)
}
latest.Spec.CommittedResourceReservation.Allocations[instanceUUID] = v1alpha1.CommittedResourceAllocation{
CreationTimestamp: metav1.Now(),
Resources: vmResources,
}
return c.Update(ctx, latest)
}); retryErr != nil {
log.Error(retryErr, "CR allocation: failed to patch reservation",
"reservation", slotName, "instanceUUID", instanceUUID)
return
}

log.Info("CR allocation: done", "instanceUUID", instanceUUID, "reservation", slotName)
}

// pickReservationSlot selects the reservation slot with the least remaining
// memory that can still fully fit vmMemoryBytes and vmCPUs (best fit).
// Tiebreaks: least remaining CPU, then reservation name (lexicographic).
//
// Candidates without a CommittedResourceReservation spec are skipped, in line
// with reservationRemainingMemory which treats such reservations as having no
// remaining capacity.
//
// Returns the slot name, or "" if no slot fits.
func pickReservationSlot(candidates []v1alpha1.Reservation, vmMemoryBytes, vmCPUs int64) string {
	bestName := ""
	var bestRemMem, bestRemCPU int64

	// Index into the slice to avoid copying every Reservation per iteration.
	for i := range candidates {
		res := &candidates[i]
		cr := res.Spec.CommittedResourceReservation
		if cr == nil {
			// Not a committed-resource slot; dereferencing cr would panic.
			continue
		}

		totalCPUQ := res.Spec.Resources[hv1.ResourceCPU]
		totalCPU := totalCPUQ.Value()

		var usedCPU int64
		for _, alloc := range cr.Allocations {
			cpuQ := alloc.Resources[hv1.ResourceCPU]
			usedCPU += cpuQ.Value()
		}

		remMem := reservationRemainingMemory(*res)
		// Clamp at zero so an over-allocated slot never reports negative capacity.
		remCPU := max(totalCPU-usedCPU, 0)

		if remMem < vmMemoryBytes || remCPU < vmCPUs {
			continue // Slot doesn't have enough remaining capacity.
		}

		if bestName == "" ||
			remMem < bestRemMem ||
			(remMem == bestRemMem && remCPU < bestRemCPU) ||
			(remMem == bestRemMem && remCPU == bestRemCPU && res.Name < bestName) {
			bestName = res.Name
			bestRemMem = remMem
			bestRemCPU = remCPU
		}
	}

	return bestName
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

// reservationRemainingMemory reports the number of memory bytes in a
// reservation slot that are not yet claimed by any allocation. The result is
// never negative: it is 0 when the slot is fully (or over-) allocated, and
// also 0 when the reservation carries no CommittedResourceReservation spec.
func reservationRemainingMemory(res v1alpha1.Reservation) int64 {
	if res.Spec.CommittedResourceReservation == nil {
		return 0
	}

	// Sum the memory of every allocation recorded in the slot.
	var allocated int64
	for _, entry := range res.Spec.CommittedResourceReservation.Allocations {
		entryMem := entry.Resources[hv1.ResourceMemory]
		allocated += entryMem.Value()
	}

	slotMem := res.Spec.Resources[hv1.ResourceMemory]
	free := slotMem.Value() - allocated
	if free < 0 {
		return 0
	}
	return free
}

// errFlavorNotInGroup is the sentinel error returned by resolveFlavorGroup when
// the flavor is not part of any configured flavor group (PAYG placement).
// Callers should match it with errors.Is to distinguish this expected outcome
// from real lookup errors.
var errFlavorNotInGroup = errors.New("flavor not in any group")

// resolveFlavorGroup determines the flavor group that contains flavorName.
//
// On success it returns the group name together with the flavor entry found
// in that group. If loading the flavor groups fails, that error is returned
// unchanged. If the flavor cannot be found in the loaded groups, the sentinel
// errFlavorNotInGroup is returned (PAYG placement).
func (c *FilterWeigherPipelineController) resolveFlavorGroup(ctx context.Context, flavorName string) (string, *compute.FlavorInGroup, error) {
	knowledge := reservations.FlavorGroupKnowledgeClient{Client: c.Client}

	groups, loadErr := knowledge.GetAllFlavorGroups(ctx, nil)
	if loadErr != nil {
		return "", nil, loadErr
	}

	name, match, findErr := reservations.FindFlavorInGroups(flavorName, groups)
	if findErr == nil {
		return name, match, nil
	}
	// Any lookup failure is reported as "not in a group".
	return "", nil, errFlavorNotInGroup
}
126 changes: 126 additions & 0 deletions internal/scheduling/nova/cr_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
// Copyright SAP SE
// SPDX-License-Identifier: Apache-2.0

package nova

import (
"context"
"errors"

api "github.com/cobaltcore-dev/cortex/api/external/nova"
"github.com/cobaltcore-dev/cortex/api/v1alpha1"
"github.com/prometheus/client_golang/prometheus"
"k8s.io/apimachinery/pkg/api/resource"

ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// NewNoHostFoundCounter builds the Prometheus counter vector used for the
// no-host-found classification, labelled by case, flavor group and intent.
// The counter is returned unregistered: register it with the metrics registry
// before assigning it to the controller.
func NewNoHostFoundCounter() *prometheus.CounterVec {
	opts := prometheus.CounterOpts{
		Name: "cortex_nova_no_host_found_total",
		Help: "Nova no-host-found results classified by committed resource coverage (cases A/B/C/D).",
	}
	labelNames := []string{"case", "flavor_group", "intent"}
	return prometheus.NewCounterVec(opts, labelNames)
}

// classifyNoHostFound explains a failed nova placement in terms of committed
// resource coverage and returns one of the case letters below. The checks run
// in this order and the first match wins:
//
//   - D: activeCRs is empty, i.e. the project has no active CommittedResources
//     for the flavor group
//   - A: the summed "memory" usage of activeCRs has reached the summed amount
//     (used >= capacity)
//   - C: at least one reservation of this project and flavor group still has
//     unallocated memory, so placement constraints excluded all candidates
//   - B: capacity remains on the CommittedResources but no reservation slot
//     has free memory
func classifyNoHostFound(
	activeCRs []v1alpha1.CommittedResource,
	reservations []v1alpha1.Reservation,
	projectID, flavorGroupName string,
) string {
	if len(activeCRs) == 0 {
		return "D"
	}

	// Aggregate committed amount and reported memory usage across all CRs.
	var committed, consumed resource.Quantity
	for i := range activeCRs {
		committed.Add(activeCRs[i].Spec.Amount)
		used, found := activeCRs[i].Status.UsedResources["memory"]
		if !found {
			continue
		}
		consumed.Add(used)
	}
	if consumed.Cmp(committed) >= 0 {
		return "A"
	}

	// Look for any slot of this project/group that still has free memory.
	for i := range reservations {
		slot := reservations[i].Spec.CommittedResourceReservation
		if slot == nil {
			continue
		}
		if slot.ProjectID != projectID || slot.ResourceGroup != flavorGroupName {
			continue
		}
		if reservationRemainingMemory(reservations[i]) > 0 {
			return "C"
		}
	}
	return "B"
}

// logNoHostFound classifies a no-host-found result and emits a log line and metric.
func (c *FilterWeigherPipelineController) logNoHostFound(ctx context.Context, decision *v1alpha1.Decision, request api.ExternalSchedulerRequest) {
log := ctrl.LoggerFrom(ctx)

projectID := request.Context.ProjectID
flavorName := request.Spec.Data.Flavor.Data.Name
instanceUUID := request.Spec.Data.InstanceUUID
intent := decision.Spec.Intent

flavorGroupName, _, err := c.resolveFlavorGroup(ctx, flavorName)
if err != nil {
if errors.Is(err, errFlavorNotInGroup) {
log.V(1).Info("no-host-found: PAYG flavor, not CR-relevant",
"instanceUUID", instanceUUID, "flavor", flavorName, "intent", intent)
} else {
log.Error(err, "no-host-found: failed to resolve flavor group",
"instanceUUID", instanceUUID, "flavor", flavorName)
}
return
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

var crList v1alpha1.CommittedResourceList
if err := c.List(ctx, &crList); err != nil {
log.Error(err, "no-host-found: failed to list committed resources", "instanceUUID", instanceUUID)
return
}
var activeCRs []v1alpha1.CommittedResource
for _, cr := range crList.Items {
if cr.Spec.ProjectID != projectID || cr.Spec.FlavorGroupName != flavorGroupName {
continue
}
if cr.Spec.State != v1alpha1.CommitmentStatusConfirmed && cr.Spec.State != v1alpha1.CommitmentStatusGuaranteed {
continue
}
activeCRs = append(activeCRs, cr)
}

var reservationList v1alpha1.ReservationList
if err := c.List(ctx, &reservationList,
client.MatchingLabels{v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource},
); err != nil {
log.Error(err, "no-host-found: failed to list reservations", "instanceUUID", instanceUUID)
return
}

noHostFoundCase := classifyNoHostFound(activeCRs, reservationList.Items, projectID, flavorGroupName)

log.Info("no-host-found classified",
"case", noHostFoundCase,
"instanceUUID", instanceUUID,
"projectID", projectID,
"flavorGroup", flavorGroupName,
"intent", intent,
)
if c.NoHostFoundCounter != nil {
c.NoHostFoundCounter.WithLabelValues(noHostFoundCase, flavorGroupName, string(intent)).Inc()
}
}
Loading
Loading