diff --git a/src/_util/backup_config.py b/src/_util/backup_config.py new file mode 100644 index 000000000..fe32512d7 --- /dev/null +++ b/src/_util/backup_config.py @@ -0,0 +1,5 @@ +import os + +SNAPSHOT_TIMEOUT_SEC = int(os.environ.get("SNAPSHOT_TIMEOUT_SEC", "120")) +SNAPSHOT_POLL_INTERVAL_SEC = int(os.environ.get("SNAPSHOT_POLL_INTERVAL_SEC", "5")) +VOLUME_SNAPSHOT_CLASS = os.environ.get("VOLUME_SNAPSHOT_CLASS", "simplyblock-csi-snapshotclass") diff --git a/src/api/_util/backups.py b/src/api/_util/backups.py index 7c1803748..aa3def9a4 100644 --- a/src/api/_util/backups.py +++ b/src/api/_util/backups.py @@ -9,9 +9,7 @@ from ...models.backups import BackupEntry, BackupSchedule, BackupScheduleRow, NextBackup from ...models.branch import Branch -from ..backup_snapshots import ( - delete_branch_snapshot, -) +from ..backup_snapshots import build_snapshot_metadata, delete_snapshot logger = logging.getLogger(__name__) @@ -114,16 +112,11 @@ async def delete_branch_backups(session: SessionDep, branch_id: Identifier) -> N return for backup in backups: + snapshot = build_snapshot_metadata(backup) + if snapshot is None: + logger.warning("Skipping snapshot deletion for backup %s because metadata was incomplete", backup.id) + continue try: - await delete_branch_snapshot( - name=backup.snapshot_name, - namespace=backup.snapshot_namespace, - content_name=backup.snapshot_content_name, - ) + await delete_snapshot(snapshot) except Exception: - logger.exception( - "Failed to delete snapshot %s/%s for backup %s", - backup.snapshot_namespace, - backup.snapshot_name, - backup.id, - ) + logger.exception("Failed to delete snapshot %s/%s for backup %s", snapshot.namespace, snapshot.name, backup.id) diff --git a/src/api/backup.py b/src/api/backup.py index 83a1222e0..ed9ebdb11 100644 --- a/src/api/backup.py +++ b/src/api/backup.py @@ -33,11 +33,7 @@ from ..models.project import Project from ._util.backups import _remove_existing_schedule, _validate_project_retention_budget from .auth import authenticated_user -from 
.backup_snapshots import ( - SNAPSHOT_POLL_INTERVAL_SEC, - create_branch_snapshot, - delete_branch_snapshot, -) +from .backup_snapshots import build_snapshot_metadata, create_branch_db_snapshot, delete_snapshot from .db import SessionDep from .dependencies import OrganizationDep @@ -46,7 +42,6 @@ # --------------------------- # Constants # --------------------------- -VOLUME_SNAPSHOT_CLASS = os.environ.get("VOLUME_SNAPSHOT_CLASS", "simplyblock-csi-snapshotclass") MANUAL_BACKUP_TIMEOUT_SEC = int(os.environ.get("MANUAL_BACKUP_TIMEOUT_SEC", "10")) UNIT_MULTIPLIER = { @@ -539,13 +534,10 @@ async def manual_backup(session: SessionDep, branch_id: Identifier) -> BackupCre recorded_at = datetime.now(UTC) try: - snapshot = await create_branch_snapshot( + snapshot = await create_branch_db_snapshot( branch.id, backup_id=backup_id, - snapshot_class=VOLUME_SNAPSHOT_CLASS, - poll_interval=SNAPSHOT_POLL_INTERVAL_SEC, label="manual", - time_limit=MANUAL_BACKUP_TIMEOUT_SEC, ) except Exception as exc: logger.exception("Manual backup failed for branch %s within timeout", branch.id) @@ -586,15 +578,13 @@ async def delete_backup(session: SessionDep, backup_id: Identifier) -> BackupDel if not backup: raise HTTPException(status_code=404, detail="Backup not found") - try: - await delete_branch_snapshot( - name=backup.snapshot_name, - namespace=backup.snapshot_namespace, - content_name=backup.snapshot_content_name, - ) - except Exception as exc: - logger.exception("Failed to delete snapshot for backup %s", backup_id) - raise HTTPException(status_code=500, detail="Failed to delete backup snapshot") from exc + metadata = build_snapshot_metadata(backup) + if metadata is not None: + try: + await delete_snapshot(metadata) + except Exception as exc: + logger.exception("Failed to delete snapshot for backup %s", backup_id) + raise HTTPException(status_code=500, detail="Failed to delete backup snapshot") from exc await session.delete(backup) diff --git a/src/api/backup_snapshots.py 
b/src/api/backup_snapshots.py index 2680d79ff..e938458f3 100644 --- a/src/api/backup_snapshots.py +++ b/src/api/backup_snapshots.py @@ -2,13 +2,22 @@ import asyncio import logging -import os import re from dataclasses import dataclass from typing import TYPE_CHECKING +from pydantic import BaseModel, Field + from .._util import Identifier, quantity_to_bytes -from ..deployment import AUTOSCALER_PVC_SUFFIX, get_autoscaler_vm_identity +from .._util.backup_config import ( + SNAPSHOT_POLL_INTERVAL_SEC, + SNAPSHOT_TIMEOUT_SEC, + VOLUME_SNAPSHOT_CLASS, +) +from ..deployment import ( + AUTOSCALER_PVC_SUFFIX, + get_autoscaler_vm_identity, +) from ..deployment.kubernetes.snapshot import ( create_snapshot_from_pvc, ensure_snapshot_absent, @@ -21,12 +30,38 @@ if TYPE_CHECKING: from ulid import ULID -logger = logging.getLogger(__name__) + from ..models.backups import BackupEntry -SNAPSHOT_TIMEOUT_SEC = int(os.environ.get("SNAPSHOT_TIMEOUT_SEC", "120")) -SNAPSHOT_POLL_INTERVAL_SEC = int(os.environ.get("SNAPSHOT_POLL_INTERVAL_SEC", "5")) +logger = logging.getLogger(__name__) _K8S_NAME_MAX_LENGTH = 63 +DEFAULT_SNAPSHOT_TIMEOUT_SEC = float(SNAPSHOT_TIMEOUT_SEC) +DEFAULT_SNAPSHOT_POLL_INTERVAL_SEC = float(SNAPSHOT_POLL_INTERVAL_SEC) + + +class SnapshotMetadata(BaseModel): + name: str = Field(..., min_length=1) + namespace: str = Field(..., min_length=1) + # content_name stays optional because there are runtime scenarios where the + # VolumeSnapshotContent hasn’t been bound yet + content_name: str | None + + +def build_snapshot_metadata(backup: BackupEntry) -> SnapshotMetadata | None: + name = backup.snapshot_name + namespace = backup.snapshot_namespace + if not name or not namespace: + logger.debug( + "Skipping metadata for missing snapshot identifiers (name=%r namespace=%r)", + name, + namespace, + ) + return None + return SnapshotMetadata( + name=name, + namespace=namespace, + content_name=backup.snapshot_content_name, + ) @dataclass(frozen=True) @@ -59,20 +94,18 @@ def 
_build_snapshot_name(*, label: str, backup_id: ULID) -> str: return f"{label_component}{separator}{backup_component}" -async def create_branch_snapshot( - branch_id: Identifier, +async def _create_snapshot_from_pvc( *, + namespace: str, + pvc_name: str, backup_id: ULID, snapshot_class: str, - poll_interval: float, label: str, + poll_interval: float, time_limit: float, ) -> SnapshotDetails: - namespace, autoscaler_vm_name = get_autoscaler_vm_identity(branch_id) - pvc_name = f"{autoscaler_vm_name}{AUTOSCALER_PVC_SUFFIX}" snapshot_name = _build_snapshot_name(label=label, backup_id=backup_id) - - logger.info("Creating VolumeSnapshot %s/%s for branch %s", namespace, snapshot_name, branch_id) + logger.info("Creating VolumeSnapshot %s/%s for branch PVC %s", namespace, snapshot_name, pvc_name) try: async with asyncio.timeout(time_limit): await create_snapshot_from_pvc( @@ -89,14 +122,14 @@ async def create_branch_snapshot( ) except TimeoutError as exc: logger.exception( - "Timed out creating VolumeSnapshot %s/%s for branch %s within %s seconds", + "Timed out creating VolumeSnapshot %s/%s for PVC %s within %s seconds", namespace, snapshot_name, - branch_id, + pvc_name, time_limit, ) raise VelaSnapshotTimeoutError( - f"Timed out creating VolumeSnapshot {namespace}/{snapshot_name} for branch {branch_id}" + f"Timed out creating VolumeSnapshot {namespace}/{snapshot_name} for PVC {pvc_name}" ) from exc status = snapshot.get("status") or {} @@ -118,29 +151,43 @@ async def create_branch_snapshot( ) -async def delete_branch_snapshot( +async def create_branch_db_snapshot( + branch_id: Identifier, *, - name: str | None, - namespace: str | None, - content_name: str | None, - time_limit: float = SNAPSHOT_TIMEOUT_SEC, - poll_interval: float = SNAPSHOT_POLL_INTERVAL_SEC, -) -> None: - if not name or not namespace: - logger.debug( - "Skipping deletion for VolumeSnapshot with missing metadata (name=%s namespace=%s)", - name, - namespace, - ) - return + backup_id: ULID, + 
snapshot_class: str = VOLUME_SNAPSHOT_CLASS, + poll_interval: float = DEFAULT_SNAPSHOT_POLL_INTERVAL_SEC, + label: str, + time_limit: float = DEFAULT_SNAPSHOT_TIMEOUT_SEC, +) -> SnapshotDetails: + namespace, autoscaler_vm_name = get_autoscaler_vm_identity(branch_id) + pvc_name = f"{autoscaler_vm_name}{AUTOSCALER_PVC_SUFFIX}" + return await _create_snapshot_from_pvc( + namespace=namespace, + pvc_name=pvc_name, + backup_id=backup_id, + snapshot_class=snapshot_class, + poll_interval=poll_interval, + label=label, + time_limit=time_limit, + ) + - derived_content_name = content_name +async def delete_snapshot( + metadata: SnapshotMetadata, + *, + time_limit: float = DEFAULT_SNAPSHOT_TIMEOUT_SEC, + poll_interval: float = DEFAULT_SNAPSHOT_POLL_INTERVAL_SEC, +) -> None: + name = metadata.name + namespace = metadata.namespace + content_name = metadata.content_name try: async with asyncio.timeout(time_limit): snapshot = await read_snapshot(namespace, name) if snapshot is not None: status = snapshot.get("status") or {} - derived_content_name = derived_content_name or status.get("boundVolumeSnapshotContentName") + content_name = content_name or status.get("boundVolumeSnapshotContentName") logger.info("Deleting VolumeSnapshot %s/%s", namespace, name) await ensure_snapshot_absent( namespace, @@ -151,10 +198,10 @@ async def delete_branch_snapshot( else: logger.info("VolumeSnapshot %s/%s already absent", namespace, name) - if derived_content_name: - logger.info("Ensuring VolumeSnapshotContent %s is absent", derived_content_name) + if content_name: + logger.info("Ensuring VolumeSnapshotContent %s is absent", content_name) await ensure_snapshot_content_absent( - derived_content_name, + content_name, timeout=time_limit, poll_interval=poll_interval, ) diff --git a/src/api/backupmonitor.py b/src/api/backupmonitor.py index 8cebef383..ace181348 100644 --- a/src/api/backupmonitor.py +++ b/src/api/backupmonitor.py @@ -10,6 +10,7 @@ from sqlmodel import SQLModel, asc, delete, select from 
ulid import ULID +from .._util.backup_config import SNAPSHOT_POLL_INTERVAL_SEC, SNAPSHOT_TIMEOUT_SEC, VOLUME_SNAPSHOT_CLASS from ..models.backups import ( BackupEntry, BackupLog, @@ -21,10 +22,9 @@ from ..models.organization import Organization from ..models.project import Project from .backup_snapshots import ( - SNAPSHOT_POLL_INTERVAL_SEC, - SNAPSHOT_TIMEOUT_SEC, - create_branch_snapshot, - delete_branch_snapshot, + build_snapshot_metadata, + create_branch_db_snapshot, + delete_snapshot, ) from .organization.project.branch import refresh_branch_status from .settings import get_settings @@ -32,7 +32,6 @@ # --------------------------- # Config # --------------------------- -VOLUME_SNAPSHOT_CLASS = os.environ.get("VOLUME_SNAPSHOT_CLASS", "simplyblock-csi-snapshotclass") POLL_INTERVAL = int(os.environ.get("POLL_INTERVAL", "60")) logger = logging.getLogger(__name__) @@ -204,11 +203,15 @@ async def _delete_many( deleted_ids: list[ULID] = [] for backup in backups: + metadata = build_snapshot_metadata(backup) + if metadata is None: + logger.warning("Skipping snapshot deletion for backup %s because metadata was incomplete", backup.id) + continue try: - await delete_branch_snapshot( - name=backup.snapshot_name, - namespace=backup.snapshot_namespace, - content_name=backup.snapshot_content_name, + await delete_snapshot( + metadata, + time_limit=SNAPSHOT_TIMEOUT_SEC, + poll_interval=SNAPSHOT_POLL_INTERVAL_SEC, ) except Exception: context = { @@ -303,13 +306,11 @@ async def execute_backup(self, db: AsyncSession, branch: Branch, row: BackupSche backup_id = ULID() try: - snapshot = await create_branch_snapshot( + snapshot = await create_branch_db_snapshot( branch.id, backup_id=backup_id, snapshot_class=VOLUME_SNAPSHOT_CLASS, - poll_interval=SNAPSHOT_POLL_INTERVAL_SEC, label=f"row-{row.row_index}", - time_limit=SNAPSHOT_TIMEOUT_SEC, ) except Exception: nb.next_at = next_due diff --git a/src/deployment/__init__.py b/src/deployment/__init__.py index 0dc3a21d1..140349087 100644 --- 
a/src/deployment/__init__.py +++ b/src/deployment/__init__.py @@ -74,6 +74,7 @@ DATABASE_PVC_SUFFIX = "-db-pvc" AUTOSCALER_PVC_SUFFIX = "-block-data" AUTOSCALER_WAL_PVC_SUFFIX = "-pg-wal" +AUTOSCALER_PVC_SUFFIX = "-block-data" _LOAD_BALANCER_TIMEOUT_SECONDS = float(600) _LOAD_BALANCER_POLL_INTERVAL_SECONDS = float(2) _OVERLAY_IP_TIMEOUT_SECONDS = float(300)