diff --git a/.github/workflows/py-cli-e2e-tests-v2.yml b/.github/workflows/py-cli-e2e-tests-v2.yml
new file mode 100644
index 000000000000..59d2c83aaa26
--- /dev/null
+++ b/.github/workflows/py-cli-e2e-tests-v2.yml
@@ -0,0 +1,99 @@
+# Copyright 2026 Collate
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# CLI E2E v2 — strangler-fig replacement for py-cli-e2e-tests.yml.
+#
+# Each connector lives under ingestion/tests/cli_e2e_v2/<connector>/ and
+# is a self-contained pytest module (no inheritance). The matrix below
+# grows by one entry per connector migration PR; the connector's
+# corresponding entry is removed from py-cli-e2e-tests.yml in the same PR.
+#
+# Triggers: workflow_dispatch only during the stabilization window for
+# the MySQL pilot. The schedule cron will be added once the pilot is
+# consistently green (see spec §7.1).
+
+name: py-cli-e2e-tests-v2
+on:
+  workflow_dispatch:
+    inputs:
+      connectors:
+        description: "Connectors to run (JSON array)"
+        required: true
+        default: '["mysql"]'
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  py-cli-e2e-tests-v2:
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+    strategy:
+      fail-fast: false
+      matrix:
+        connector: ${{ fromJSON(inputs.connectors || '["mysql"]') }}
+    environment: test
+
+    steps:
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          tool-cache: false
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: false
+          swap-storage: true
+          docker-images: false
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Openmetadata Test Environment
+        uses: ./.github/actions/setup-openmetadata-test-environment
+        with:
+          python-version: '3.10'
+
+      - name: Run CLI E2E v2 tests
+        id: e2e-v2-test
+        env:
+          # MySQL test data lives in a dedicated MySQL container that the
+          # session-scoped `mysql_container` pytest fixture (testcontainers)
+          # boots, bootstraps with the OM-doc minimum grants, and tears
+          # down. Teammates run the same way locally — no env plumbing.
+          # Only OM-server admin creds (used to mint the ingestion-bot
+          # JWT) need to be exported here; they come from the bundled
+          # docker-compose and are not secrets.
+          OM_ADMIN_EMAIL: admin@open-metadata.org
+          OM_ADMIN_PASSWORD: admin
+        run: |
+          source env/bin/activate
+          cd ingestion
+          mkdir -p junit
+          pytest -v \
+            --junitxml=junit/test-results-v2-${{ matrix.connector }}.xml \
+            tests/cli_e2e_v2/${{ matrix.connector }}
+        shell: bash
+
+      - name: Upload tests artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: tests-v2-${{ matrix.connector }}
+          path: ingestion/junit/test-results-v2-*.xml
+
+      - name: Clean Up
+        if: always()
+        run: |
+          cd ./docker/development
+          docker compose down --remove-orphans
+          sudo rm -rf ${PWD}/docker-volume
diff --git a/ingestion/pyproject.toml b/ingestion/pyproject.toml
index 4992240ae06a..519be8a055a6 100644
--- a/ingestion/pyproject.toml
+++ b/ingestion/pyproject.toml
@@ -222,6 +222,12 @@ ignore = [
 # lands in a later stage tests don't immediately error out.
"tests/**/*.py" = ["S101", "PLR2004", "PLC0415"] "ingestion/tests/**/*.py" = ["S101", "PLR2004", "PLC0415"] +# v2 CLI E2E framework uses relative imports by design (connector-centric +# layout — connectors live in subdirs and import from `..core.*` / `.connector`). +# `T201` (print) is allowed in the top-level conftest for the session-start +# posture banner. Path listed twice for the dual-cwd pattern above. +"tests/cli_e2e_v2/**/*.py" = ["S101", "PLR2004", "PLC0415", "TID252", "T201"] +"ingestion/tests/cli_e2e_v2/**/*.py" = ["S101", "PLR2004", "PLC0415", "TID252", "T201"] # Auto-generated from JSON Schema — never edit, never lint. "src/metadata/generated/**" = ["ALL"] "ingestion/src/metadata/generated/**" = ["ALL"] diff --git a/ingestion/src/metadata/cli/app.py b/ingestion/src/metadata/cli/app.py index f101e9c1f956..d4192b1e652f 100644 --- a/ingestion/src/metadata/cli/app.py +++ b/ingestion/src/metadata/cli/app.py @@ -17,6 +17,7 @@ import traceback from pathlib import Path +from metadata.cli.common import execute_workflow from metadata.config.common import load_config_file from metadata.utils.logger import cli_logger from metadata.workflow.application import ApplicationWorkflow @@ -24,13 +25,14 @@ logger = cli_logger() -def run_app(config_path: Path) -> None: +def run_app(config_path: Path, status_file: Path | None = None) -> None: """ Run the application workflow from a config path to a JSON or YAML file :param config_path: Path to load JSON config """ + config_dict = None try: config_dict = load_config_file(config_path) # no logging for config because apps might have custom secrets @@ -40,7 +42,4 @@ def run_app(config_path: Path) -> None: logger.debug(traceback.format_exc()) sys.exit(1) - workflow.execute() - workflow.stop() - workflow.print_status() - workflow.raise_from_status() + execute_workflow(workflow=workflow, config_dict=config_dict, status_file=status_file) diff --git a/ingestion/src/metadata/cli/classify.py b/ingestion/src/metadata/cli/classify.py index 5ae035ea28b4..a98993e5ad12 100644 --- a/ingestion/src/metadata/cli/classify.py +++ b/ingestion/src/metadata/cli/classify.py @@ -28,7 +28,7 @@ logger = cli_logger() -def run_classification(config_path: Path) -> None: +def run_classification(config_path: Path, status_file: Path | None = None) -> None: """ Run the sampler workflow from a config path to a JSON or YAML file @@ -48,4 +48,4 @@ def run_classification(config_path: Path) -> None: WorkflowInitErrorHandler.print_init_error(exc, config_dict, PipelineType.metadata) sys.exit(1) - execute_workflow(workflow=workflow, config_dict=config_dict) + execute_workflow(workflow=workflow, config_dict=config_dict, status_file=status_file) diff --git a/ingestion/src/metadata/cli/common.py b/ingestion/src/metadata/cli/common.py index 38a8a82b840c..522bd97ed9b0 100644 --- a/ingestion/src/metadata/cli/common.py +++ b/ingestion/src/metadata/cli/common.py @@ -13,14 +13,23 @@ Handle workflow execution """ +from pathlib import Path from typing import Any, Dict # noqa: UP035 from metadata.workflow.base import BaseWorkflow -def execute_workflow(workflow: BaseWorkflow, config_dict: Dict[str, Any]) -> None: # noqa: UP006 - """Execute the workflow and raise if needed""" - workflow.execute() - workflow.stop() +def execute_workflow( + workflow: BaseWorkflow, + config_dict: Dict[str, Any], # noqa: UP006 + status_file: Path | None = None, +) -> None: + """Execute the workflow, write status file if requested, raise on failure if configured.""" + try: + workflow.execute() + finally: + workflow.stop() + if 
status_file is not None: + workflow.write_status_file(status_file) if config_dict.get("workflowConfig", {}).get("raiseOnError", True): workflow.raise_from_status() diff --git a/ingestion/src/metadata/cli/dataquality.py b/ingestion/src/metadata/cli/dataquality.py index 65d449d98675..e71781732ab2 100644 --- a/ingestion/src/metadata/cli/dataquality.py +++ b/ingestion/src/metadata/cli/dataquality.py @@ -28,7 +28,7 @@ logger = cli_logger() -def run_test(config_path: Path) -> None: +def run_test(config_path: Path, status_file: Path | None = None) -> None: """ Run the Data Quality Test Suites workflow from a config path to a JSON or YAML file @@ -48,4 +48,4 @@ def run_test(config_path: Path) -> None: WorkflowInitErrorHandler.print_init_error(exc, workflow_config_dict, PipelineType.TestSuite) sys.exit(1) - execute_workflow(workflow=workflow, config_dict=workflow_config_dict) + execute_workflow(workflow=workflow, config_dict=workflow_config_dict, status_file=status_file) diff --git a/ingestion/src/metadata/cli/ingest.py b/ingestion/src/metadata/cli/ingest.py index 4b6223773f23..fa83fe24fe2d 100644 --- a/ingestion/src/metadata/cli/ingest.py +++ b/ingestion/src/metadata/cli/ingest.py @@ -29,7 +29,7 @@ logger = cli_logger() -def run_ingest(config_path: Path) -> None: +def run_ingest(config_path: Path, status_file: Path | None = None) -> None: """ Run the ingestion workflow from a config path to a JSON or YAML file @@ -46,4 +46,4 @@ def run_ingest(config_path: Path) -> None: WorkflowInitErrorHandler.print_init_error(exc, config_dict, PipelineType.metadata) sys.exit(1) - execute_workflow(workflow=workflow, config_dict=config_dict) + execute_workflow(workflow=workflow, config_dict=config_dict, status_file=status_file) diff --git a/ingestion/src/metadata/cli/ingest_dbt.py b/ingestion/src/metadata/cli/ingest_dbt.py index 54a4aaaaa607..1c055e9756d9 100644 --- a/ingestion/src/metadata/cli/ingest_dbt.py +++ b/ingestion/src/metadata/cli/ingest_dbt.py @@ -25,6 +25,7 @@ from dotenv import load_dotenv from pydantic import BaseModel, Field, field_validator +from metadata.cli.common import execute_workflow from metadata.ingestion.ometa.credentials import URL from metadata.utils.logger import cli_logger from metadata.workflow.metadata import MetadataWorkflow @@ -291,7 +292,7 @@ def create_dbt_workflow_config(dbt_project_path: Path, om_config: OpenMetadataDB return config # noqa: RET504 -def run_ingest_dbt(dbt_project_path: Path) -> None: +def run_ingest_dbt(dbt_project_path: Path, status_file: Path | None = None) -> None: """ Run the dbt artifacts ingestion workflow from a dbt project path @@ -321,13 +322,13 @@ def run_ingest_dbt(dbt_project_path: Path) -> None: logger.info("Creating workflow configuration...") workflow_config = create_dbt_workflow_config(dbt_project_path, om_config) - # Create and execute the MetadataWorkflow (reusing existing infrastructure) logger.info("Starting OpenMetadata ingestion workflow...") workflow = MetadataWorkflow.create(workflow_config) - workflow.execute() - workflow.raise_from_status() - workflow.print_status() - workflow.stop() + execute_workflow( + workflow=workflow, + config_dict=workflow_config, + status_file=status_file, + ) logger.info("DBT artifacts ingestion completed successfully") diff --git a/ingestion/src/metadata/cli/lineage.py b/ingestion/src/metadata/cli/lineage.py index 9deaa7fc8cf1..b069dc8567d7 100644 --- a/ingestion/src/metadata/cli/lineage.py +++ b/ingestion/src/metadata/cli/lineage.py @@ -47,7 +47,7 @@ class LineageWorkflow(BaseModel): parserType: 
Optional[QueryParserType] = QueryParserType.Auto # noqa: N815, UP045 -def run_lineage(config_path: Path) -> None: +def run_lineage(config_path: Path, status_file: Path | None = None) -> None: """ Run the ingestion workflow from a config path to a JSON or YAML file diff --git a/ingestion/src/metadata/cli/profile.py b/ingestion/src/metadata/cli/profile.py index 93977318c870..403fd3d20c30 100644 --- a/ingestion/src/metadata/cli/profile.py +++ b/ingestion/src/metadata/cli/profile.py @@ -29,7 +29,7 @@ logger = cli_logger() -def run_profiler(config_path: Path) -> None: +def run_profiler(config_path: Path, status_file: Path | None = None) -> None: """ Run the Profiler workflow from a config path to a JSON or YAML file @@ -46,4 +46,4 @@ def run_profiler(config_path: Path) -> None: WorkflowInitErrorHandler.print_init_error(exc, workflow_config_dict, PipelineType.profiler) sys.exit(1) - execute_workflow(workflow=workflow, config_dict=workflow_config_dict) + execute_workflow(workflow=workflow, config_dict=workflow_config_dict, status_file=status_file) diff --git a/ingestion/src/metadata/cli/usage.py b/ingestion/src/metadata/cli/usage.py index 420a8542a8e3..c23a39bee645 100644 --- a/ingestion/src/metadata/cli/usage.py +++ b/ingestion/src/metadata/cli/usage.py @@ -29,7 +29,7 @@ logger = cli_logger() -def run_usage(config_path: Path) -> None: +def run_usage(config_path: Path, status_file: Path | None = None) -> None: """ Run the usage workflow from a config path to a JSON or YAML file @@ -46,4 +46,4 @@ def run_usage(config_path: Path) -> None: WorkflowInitErrorHandler.print_init_error(exc, config_dict, PipelineType.usage) sys.exit(1) - execute_workflow(workflow=workflow, config_dict=config_dict) + execute_workflow(workflow=workflow, config_dict=config_dict, status_file=status_file) diff --git a/ingestion/src/metadata/cmd.py b/ingestion/src/metadata/cmd.py index bf56329ad6f3..7055bbbece8c 100644 --- a/ingestion/src/metadata/cmd.py +++ b/ingestion/src/metadata/cmd.py @@ -77,6 +77,13 @@ def create_common_config_parser_args(parser: argparse.ArgumentParser): type=Path, required=True, ) + parser.add_argument( + "--status-file", + help="path to write structured JSON status output (optional)", + type=Path, + required=False, + default=None, + ) def create_dbt_parser_args(parser: argparse.ArgumentParser): @@ -220,6 +227,7 @@ def metadata(args: Optional[List[str]] = None): # noqa: UP006, UP045 metadata_workflow = contains_args.get("command") config_file: Optional[Path] = contains_args.get("config") # noqa: UP045 dbt_project_path: Optional[Path] = contains_args.get("dbt_project_path") # noqa: UP045 + status_file: Optional[Path] = contains_args.get("status_file") # noqa: UP045 path = None if config_file: @@ -234,7 +242,7 @@ def metadata(args: Optional[List[str]] = None): # noqa: UP006, UP045 set_loggers_level(log_level) if path and metadata_workflow and metadata_workflow in RUN_PATH_METHODS: - RUN_PATH_METHODS[metadata_workflow](path) + RUN_PATH_METHODS[metadata_workflow](path, status_file) if metadata_workflow == MetadataCommands.SCAFFOLD_CONNECTOR.value: has_name = contains_args.get("name") diff --git a/ingestion/src/metadata/workflow/base.py b/ingestion/src/metadata/workflow/base.py index 2bbf93a0c011..b4cf76ac372c 100644 --- a/ingestion/src/metadata/workflow/base.py +++ b/ingestion/src/metadata/workflow/base.py @@ -12,10 +12,12 @@ Base workflow definition. 
""" +import json import traceback import uuid from abc import ABC, abstractmethod from datetime import datetime +from pathlib import Path from statistics import mean from typing import Any, Dict, List, Optional, TypeVar, Union # noqa: UP035 @@ -212,13 +214,21 @@ def get_failures(self) -> List[StackTraceError]: # noqa: UP006 def workflow_steps(self) -> List[Step]: # noqa: UP006 """Steps to report status from""" + def _step_meets_success_threshold(self, step: Step) -> bool: + """True iff the step has no failures, or its success ratio meets the workflow's threshold. + + Shared by `raise_from_status_internal` (which raises on failure) and + `write_status_file` (which reports the CLI's observable success/failure state). + """ + status = step.get_status() + if not status.failures: + return True + return status.calculate_success() >= self.workflow_config.successThreshold # pyright: ignore[reportOperatorIssue] + def raise_from_status_internal(self, raise_warnings=False) -> None: """Based on the internal workflow status, raise a WorkflowExecutionError""" for step in self.workflow_steps(): - if ( - step.get_status().failures - and step.get_status().calculate_success() < self.workflow_config.successThreshold - ): + if not self._step_meets_success_threshold(step): raise WorkflowExecutionError(f"{step.name} reported errors: {Summary.from_step(step)}") if raise_warnings and step.status.warnings: @@ -400,3 +410,28 @@ def print_status(self): start_time, self._is_debug_enabled(), ) + + def write_status_file(self, path: Path) -> None: + """Serialize per-step status to JSON at the given path. + + The `success` field mirrors the CLI's exit-code semantic: True iff every + step meets its success threshold (the same condition under which + `raise_from_status_internal` does NOT raise). + + Shape: + { + "pipeline_type": str, + "ingestion_pipeline_fqn": str | None, + "success": bool, + "steps": [] + } + """ + ingestion_status = self.build_ingestion_status() + success = all(self._step_meets_success_threshold(step) for step in self.workflow_steps()) + payload = { + "pipeline_type": self.config.source.type, # pyright: ignore[reportAttributeAccessIssue] + "ingestion_pipeline_fqn": self.config.ingestionPipelineFQN, # pyright: ignore[reportAttributeAccessIssue] + "success": success, + "steps": ingestion_status.model_dump(), + } + path.write_text(json.dumps(payload, indent=2, default=str)) diff --git a/ingestion/src/metadata/workflow/workflow_status_mixin.py b/ingestion/src/metadata/workflow/workflow_status_mixin.py index a59e55a80e09..66d19b3dcdd3 100644 --- a/ingestion/src/metadata/workflow/workflow_status_mixin.py +++ b/ingestion/src/metadata/workflow/workflow_status_mixin.py @@ -159,7 +159,7 @@ def result_status(self) -> WorkflowResultStatus: return WorkflowResultStatus.FAILURE return WorkflowResultStatus.SUCCESS - def build_ingestion_status(self) -> Optional[IngestionStatus]: # noqa: UP045 + def build_ingestion_status(self) -> IngestionStatus: """ Get the results from the steps and prep the payload we'll send to the API diff --git a/ingestion/tests/cli_e2e_v2/CONNECTORS.md b/ingestion/tests/cli_e2e_v2/CONNECTORS.md new file mode 100644 index 000000000000..52bf85576969 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/CONNECTORS.md @@ -0,0 +1,34 @@ +# Adding a new connector + +`mysql/` is the reference. Mirror its file layout. 
+
+## Scaffold
+
+```
+<connector>/
+  __init__.py          # empty
+  baseline.py          # SQLAlchemy MetaData + seeds + views + SPs + get_policy()
+  connector.py         # service_name() + build_<connector>_config()
+  enforcer.py          # SqlBaselineEnforcer subclass
+  expected.py          # TYPE_MAP extension + _expected() helper
+  conftest.py          # <connector>_container fixture + thin wiring
+  test_<connector>.py  # tests
+```
+
+## Per file
+
+1. **`baseline.py`** — declare schema with SQLAlchemy Core. Reuse `core/source/common_baseline.py` for portable tables (customers, transactions). Put dialect-specific types on a wide `all_types` table keyed on `BigInteger id`.
+2. **`enforcer.py`** — subclass `SqlBaselineEnforcer`. Usually only override `_stored_procedure_query_sql` (returns `(schema, name)` rows). Other overrides are rare; see `mysql/enforcer.py`.
+3. **`expected.py`** — extend `CORE_TYPE_MAP` with dialect types. Export `_expected(service_name, tables=None)` calling `derive_expected_service(...)`.
+4. **`connector.py`** — `<connector>_service_name(session_uuid, variant="")` and `build_<connector>_config(service_name, server)`. The config emits `${E2E_<CONNECTOR>_*}` refs — never embed raw secrets.
+5. **`conftest.py`** — session-scoped `<connector>_container` boots the source via testcontainers, creates the scoped ingest user with OM-doc-minimum GRANTs, and populates `E2E_<CONNECTOR>_*` env vars (so `Env(key).ref()` in `connector.py` resolves). Then the thin wiring fixtures (`_source_ready`, `_service`, `_cfg`, `_expected_factory`, `_metadata_ingested`). Mirror `mysql/conftest.py`.
+6. **`test_<connector>.py`** — one `test_vanilla_ingest_structural`, one test per pipeline you ship (profiler / lineage / classification), and a parametrized filter matrix using `COMMON_FILTER_SCENARIOS` + a per-connector `_EXPECTED_TABLES_BY_VARIANT` dict. Mirror `mysql/test_mysql.py`.
+
+## Validate
+
+```bash
+docker compose -f docker/development/docker-compose.yml up -d
+pytest tests/cli_e2e_v2/<connector> -v
+```
+
+Failures: see `README.md`.
diff --git a/ingestion/tests/cli_e2e_v2/README.md b/ingestion/tests/cli_e2e_v2/README.md
new file mode 100644
index 000000000000..e784b2fe27cd
--- /dev/null
+++ b/ingestion/tests/cli_e2e_v2/README.md
@@ -0,0 +1,53 @@
+# CLI E2E v2
+
+End-to-end tests for the `metadata` CLI against a real OpenMetadata server and a real data source. Each test brings its source into a declared shape, runs CLI pipelines, asserts on what landed in OM.
+
+## Run
+
+```bash
+docker compose -f docker/development/docker-compose.yml up -d
+source env/bin/activate
+cd ingestion
+pytest tests/cli_e2e_v2/mysql -v
+```
+
+Each connector boots its own source via testcontainers. Docker is the only prerequisite — no DB ports, credentials, or grants to manage.
+
+## Layout
+
+```
+tests/cli_e2e_v2/
+  conftest.py    # session fixtures (uuid, server config)
+  core/          # framework internals
+  mysql/         # reference connector
+  CONNECTORS.md  # how to add a new connector
+```
+
+## Debugging
+
+Every CLI run writes three files to pytest's tmp_path; the runner logs the paths at INFO:
+
+```
+cfg_<pipeline>_<n>.yaml     # rendered config (secrets are ${refs})
+status_<pipeline>_<n>.json  # status report — failures[] lives here
+stdout_<pipeline>_<n>.log   # full stdout
+```
+
+First place to look on failure: the exception body, then the status JSON.
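+
+The status file's top-level shape, per `write_status_file` (values here are illustrative; the per-step entries are elided):
+
+```
+{
+  "pipeline_type": "mysql",
+  "ingestion_pipeline_fqn": null,
+  "success": false,
+  "steps": [ ... ]
+}
+```
+
+`success` mirrors the CLI's exit-code semantic: true iff every step met its success threshold.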
+ +| Error | Fix | +|---|---| +| `CliExecutionError` | Inspect the embedded failures, stderr, status path | +| `StructuralMismatch` | Jump to the first diff; rest cascade | +| `401` / `Invalid token` | `unset OM_JWT_TOKEN` and rerun | +| `permission denied` from CLI | Add the missing GRANT to the connector's `conftest.py` | +| `Eventually timed out` | Raise `.eventually(120)` or set `E2E_POLL_VERBOSE=1` | + +## Env toggles (rarely needed) + +| Var | Default | Effect | +|---|---|---| +| `OM_SERVER_URL` | `http://localhost:8585/api` | OM server URL | +| `OM_JWT_TOKEN` | minted | Pre-minted token; bypasses admin login | +| `OM_ADMIN_EMAIL` / `OM_ADMIN_PASSWORD` | `admin@open-metadata.org` / `admin` | Admin for token minting | +| `E2E_POLL_VERBOSE` | unset | `=1` logs every poll attempt | diff --git a/ingestion/tests/cli_e2e_v2/__init__.py b/ingestion/tests/cli_e2e_v2/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/ingestion/tests/cli_e2e_v2/conftest.py b/ingestion/tests/cli_e2e_v2/conftest.py new file mode 100644 index 000000000000..ee3285b9a790 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/conftest.py @@ -0,0 +1,205 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Top-level fixtures for the CLI E2E v2 test package. + +Pytest auto-discovers this conftest for all tests under tests/cli_e2e_v2/. +Per-connector conftests (e.g., mysql/conftest.py) compose on top of these +session-level primitives. + +Fixture graph: + + session_uuid ─────────────┐ + ├─→ (consumed by per-connector service names) + om_server_config ────┬────┘ + │ + ├─→ om_http_client ─┬─→ om_client (per-test) + │ └─→ registered_services (cleanup) + │ + └─→ _posture_log (autouse) + + tmp_path (pytest builtin) ─→ cli_runner (per-test) +""" + +from __future__ import annotations + +import logging +import os +import uuid +from typing import TYPE_CHECKING + +import pytest + +from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( + AuthProvider, + OpenMetadataConnection, +) +from metadata.generated.schema.entity.services.databaseService import DatabaseService +from metadata.generated.schema.security.client.openMetadataJWTClientConfig import ( + OpenMetadataJWTClientConfig, +) +from metadata.ingestion.ometa.ometa_api import OpenMetadata + +from .core.config.server import ServerConfig +from .core.expected.differ import StructuralMismatch +from .core.fluent.om_client import OmClient +from .core.runner.cli_runner import CliRunner + +if TYPE_CHECKING: + from collections.abc import Iterator + from pathlib import Path + +logger = logging.getLogger(__name__) + + +# ----------------------------------------------------------------------------- +# pytest hooks +# ----------------------------------------------------------------------------- + + +def pytest_assertrepr_compare(op, left, right): + """Render `StructuralMismatch` in full when it appears in an `assert ==` / + `assert is` comparison instead of pytest's default short repr. + + `StructuralMismatch` is normally raised, in which case pytest displays + its `__str__` directly via the exception path. This hook covers the + less-common but still real case where a test compares a captured + mismatch against a sentinel (e.g. `assert run_diff() == NO_DIFFS`) — + pytest would otherwise truncate the diff body to its short repr and + swallow the path-grouped diagnostics we put in `__str__`. 
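+
+    Illustrative report shape (a hedged sketch; the body lines are
+    whatever `StructuralMismatch.__str__` rendered, so the exact
+    diff-line format comes from the differ, not this hook):
+
+        ["StructuralMismatch (==):",
+         "StructuralMismatch: 1 diff (1 column)",
+         "table[customers].column[email].dataType: ..."]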
+ """ + target = ( + left if isinstance(left, StructuralMismatch) else (right if isinstance(right, StructuralMismatch) else None) + ) + if target is None: + return None + # Each line of the rendered mismatch becomes its own report line so + # pytest's terminal writer wraps cleanly and indentation survives. + return [f"StructuralMismatch ({op}):"] + str(target).splitlines() + + +# ----------------------------------------------------------------------------- +# session identity + server +# ----------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def session_uuid() -> str: + """One 8-char hex UUID per pytest session. + + Used to suffix every OM service name so parallel matrix jobs never collide + and re-runs start from a clean namespace. Short form (8 hex chars) keeps + service names readable. + """ + return uuid.uuid4().hex[:8] + + +@pytest.fixture(scope="session") +def om_server_config() -> ServerConfig: + """Shared OM server URL + JWT, read from env once per session. + + This fixture is also the SINGLE place in the framework that installs + the resolved JWT into `os.environ["OM_JWT_TOKEN"]`. CLI subprocesses + inherit the parent env, and their rendered YAMLs carry + `${OM_JWT_TOKEN}` refs that `os.path.expandvars` resolves at load + time — so the install is necessary, but keeping it here (rather than + in `ServerConfig.from_env()`) leaves the factory pure and makes the + mutation explicit and named. + """ + cfg = ServerConfig.from_env() + # Bridge to subprocesses: the rendered cfg_*.yaml uses ${OM_JWT_TOKEN} + # so the subprocess needs it in its env. A pre-exported OM_JWT_TOKEN + # and a minted one both land at the same key. + os.environ["OM_JWT_TOKEN"] = cfg.jwt_token + return cfg + + +@pytest.fixture(scope="session") +def om_http_client(om_server_config: ServerConfig) -> OpenMetadata: + """Authenticated OpenMetadata HTTP client, session-scoped. + + Built once, reused by all OmClient wrappers and by the cleanup finalizer. + """ + conn = OpenMetadataConnection( + hostPort=om_server_config.server_url, + authProvider=AuthProvider.openmetadata, + securityConfig=OpenMetadataJWTClientConfig( + jwtToken=om_server_config.jwt_token, + ), + ) + return OpenMetadata(conn) + + +# ----------------------------------------------------------------------------- +# per-test fluent + runner +# ----------------------------------------------------------------------------- + + +@pytest.fixture +def om_client(om_http_client: OpenMetadata) -> OmClient: + """Fluent OmClient wrapping the shared HTTP client.""" + return OmClient(om_http_client) + + +@pytest.fixture +def cli_runner(tmp_path: Path) -> CliRunner: + """Per-test CliRunner bound to pytest's tmp_path. + + Each test gets its own tmp_path so cfg_*.yaml and status_*.json artifacts + don't collide across parallel or sequential tests. + """ + return CliRunner(tmp_path) + + +# ----------------------------------------------------------------------------- +# session cleanup +# ----------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def registered_services(om_http_client: OpenMetadata) -> Iterator[list[str]]: + """Session-scoped list of service names for end-of-session cleanup. + + Tests append names here when they create services; the finalizer deletes + each service via the OM API (hard delete, recursive) when the pytest + session ends. Errors during cleanup are logged but don't fail the test + run — cleanup is best-effort. 
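+
+    A hedged sketch of the append side (fixture name and service-name
+    scheme are illustrative, not the mysql conftest's actual wiring):
+
+        @pytest.fixture(scope="session")
+        def _service(registered_services, session_uuid):
+            name = f"mysql_{session_uuid}"
+            registered_services.append(name)  # hard-deleted at session end
+            return name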
+ """ + names: list[str] = [] + yield names + + for name in names: + try: + svc = om_http_client.get_by_name(entity=DatabaseService, fqn=name) + if svc is None: + continue + om_http_client.delete( + entity=DatabaseService, + entity_id=str(svc.id.root), + hard_delete=True, + recursive=True, + ) + logger.info("session teardown: deleted service %s", name) + except Exception as exc: + logger.warning("session teardown: failed to delete %s: %s", name, exc) + + +# ----------------------------------------------------------------------------- +# session posture log +# ----------------------------------------------------------------------------- + + +@pytest.fixture(scope="session", autouse=True) +def _posture_log(session_uuid: str, om_server_config: ServerConfig) -> None: + """Print session UUID + server URL + token provenance at session start. + + The three lines are the minimum needed to answer post-mortem questions + like "did that run actually hit the server I expected?" and "was the + failure a stale env token or a freshly minted one?" — cheap to log + once, invaluable when triaging a flake. + """ + print("\n==== CLI E2E v2 session start ====") + print(f"session uuid: {session_uuid}") + print(f"server url: {om_server_config.server_url}") + print(f"token source: {om_server_config.token_source}") + print("==================================\n") diff --git a/ingestion/tests/cli_e2e_v2/core/__init__.py b/ingestion/tests/cli_e2e_v2/core/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/ingestion/tests/cli_e2e_v2/core/_om_compat.py b/ingestion/tests/cli_e2e_v2/core/_om_compat.py new file mode 100644 index 000000000000..50c0f09b9a01 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/_om_compat.py @@ -0,0 +1,35 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""OM Pydantic compatibility shims for the v2 framework. + +OM's generated schema sometimes wraps list-typed fields in `RootModel[list[X]]` +(notably `owners`) and sometimes uses plain `list[X] | None` (today: `tags`, +`columns`). The shape can flip between OM minor versions without warning, +which historically forced sweeping changes through every test that walked +the field. + +`unwrap_root_list` centralizes the read so a future RootModel promotion +(or demotion) of any list field touches one helper rather than ~12 +callsites scattered across the differ and the fluent layer. It mirrors +the role `model_str` plays for scalar RootModel fields (tagFQN, name, +description) — the asymmetry of having a scalar shim but no list shim +was the smell that motivated this helper. +""" + +from __future__ import annotations + +from typing import Any + + +def unwrap_root_list(field: Any) -> list: + """Return a plain list whether `field` is None, a list, or a RootModel[list]. + + The defensive branches make the helper safe to drop in at any list + access site without checking the field's current Pydantic shape. 
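+
+    Example (both shapes normalize through the same call):
+
+        tags = unwrap_root_list(table.tags)      # plain list | None today
+        owners = unwrap_root_list(table.owners)  # RootModel[list] today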
+ """ + if field is None: + return [] + if hasattr(field, "root"): + return field.root + return field diff --git a/ingestion/tests/cli_e2e_v2/core/config/__init__.py b/ingestion/tests/cli_e2e_v2/core/config/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/ingestion/tests/cli_e2e_v2/core/config/builder.py b/ingestion/tests/cli_e2e_v2/core/config/builder.py new file mode 100644 index 000000000000..ad3dd81b1827 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/config/builder.py @@ -0,0 +1,192 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Immutable WorkflowConfig builder rendered to YAML for the metadata CLI. + +Two-step: factory returns a base (connection + service + server); +`.pipeline(options)` picks the pipeline (options is an OM Pydantic model); +`.with_filter(...)` layers filter patterns. Filters persist across later +`.pipeline()` transitions; inline filters on the options take precedence. +Render fails loudly when no pipeline is set. +""" + +from __future__ import annotations + +import copy +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any + +import yaml + +from .pipelines import ( + AutoClassificationPipeline, + PipelineOptions, + ProfilerPipeline, + cli_subcommand_for, + pipeline_identifier, + source_type_suffix_for, +) + +if TYPE_CHECKING: + from pathlib import Path + + from .server import ServerConfig + +_FILTER_KEYS: tuple[str, ...] = ( + "databaseFilterPattern", + "schemaFilterPattern", + "tableFilterPattern", +) + +# Pipelines that require a `processor` block in the rendered YAML. +# OM's Profiler + AutoClassification workflows instantiate an ORM profiler +# to compute column statistics / PII inference; without a processor entry, +# workflow init crashes with `'NoneType' object has no attribute 'model_dump'`. +_PIPELINES_NEEDING_PROCESSOR: tuple[type, ...] = ( + ProfilerPipeline, + AutoClassificationPipeline, +) + + +class PipelineNotSetError(RuntimeError): + """Raised when a WorkflowConfig is rendered or queried before a pipeline + has been selected via `.pipeline(...)`.""" + + +@dataclass(frozen=True) +class WorkflowConfig: + """Frozen carrier for one workflow's rendered config + active pipeline. + + Two fields: + _doc — the full YAML document as a dict tree + _options — the Pydantic pipeline options model (None on base configs + returned from the factory; set by .pipeline()) + + Instances are frozen — overlays return new instances via copy.deepcopy. + """ + + _doc: dict[str, Any] + _options: PipelineOptions | None = None + + # --- construction --------------------------------------------------- + @classmethod + def build( + cls, + *, + source_type: str, + service_name: str, + service_connection: dict[str, Any], + server: ServerConfig, + ) -> WorkflowConfig: + """Build a base config without any pipeline selected. + + Callers pass `service_connection` as a plain dict (either model_dump'd + from an OM connection class or built manually with env refs). 
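+
+        A hedged, MySQL-shaped sketch (the connection dict's field names
+        belong to the connector's schema, not to this builder):
+
+            base = WorkflowConfig.build(
+                source_type="mysql",
+                service_name=f"mysql_{session_uuid}",
+                service_connection={
+                    "hostPort": "localhost:3306",
+                    "username": "e2e_ingest",
+                    "authType": {"password": Env("E2E_MYSQL_PASSWORD").ref()},
+                },
+                server=om_server_config,
+            )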
+ """ + doc: dict[str, Any] = { + "source": { + "type": source_type, + "serviceName": service_name, + "serviceConnection": {"config": dict(service_connection)}, + "sourceConfig": {"config": {}}, + }, + "sink": server.to_sink_config_dict(), + "workflowConfig": server.to_workflow_config_dict(), + } + return cls(_doc=doc, _options=None) + + # --- pipeline transition -------------------------------------------- + def pipeline(self, options: PipelineOptions) -> WorkflowConfig: + """Transition to a concrete pipeline. + + `options` is one of the OM-generated Pydantic pipeline models + (re-exported with short aliases in `pipelines.py`). The instance's + `.type` field discriminator is carried through into the rendered + YAML as `sourceConfig.config.type`. + + Filter patterns already set on this config (via `.with_filter(...)`) + persist across the transition. Filters set inline on `options` take + precedence over preserved filters. + """ + dumped = options.model_dump(mode="json", exclude_none=True) + + new_doc = copy.deepcopy(self._doc) + prior_cfg = new_doc["source"]["sourceConfig"]["config"] + for key in _FILTER_KEYS: + if key in prior_cfg: + dumped.setdefault(key, prior_cfg[key]) + + new_doc["source"]["sourceConfig"]["config"] = dumped + + # OM's `import_source_class` selects the connector class by + # splitting `source.type` on "-" and dispatching to metadata_source_class, + # lineage_source_class, or usage_source_class. The suffix must match + # the pipeline: e.g. "mysql-lineage" for a DatabaseLineage run. + base_connector = new_doc["source"]["type"].split("-", 1)[0] + new_doc["source"]["type"] = base_connector + source_type_suffix_for(options) + + if isinstance(options, _PIPELINES_NEEDING_PROCESSOR): + new_doc["processor"] = {"type": "orm-profiler", "config": {}} + else: + new_doc.pop("processor", None) + + return WorkflowConfig(_doc=new_doc, _options=options) + + # --- filter overlay ------------------------------------------------- + def with_filter( + self, + *, + databases_include: list[str] | None = None, + databases_exclude: list[str] | None = None, + schemas_include: list[str] | None = None, + schemas_exclude: list[str] | None = None, + tables_include: list[str] | None = None, + tables_exclude: list[str] | None = None, + ) -> WorkflowConfig: + """Append include/exclude patterns at database, schema, or table level. + + Multiple calls MERGE (append), not replace. Include AND exclude at the + same level are allowed — OM's filter semantic applies exclude over + include on overlapping matches. 
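+
+        Example (hypothetical patterns; successive calls merge):
+
+            cfg = (
+                base.pipeline(MetadataPipeline())
+                .with_filter(schemas_include=["e2e_.*"])
+                .with_filter(tables_exclude=["tmp_.*"])
+            )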
+        """
+        new_doc = copy.deepcopy(self._doc)
+        cfg = new_doc["source"]["sourceConfig"]["config"]
+
+        def _merge(key: str, includes: list[str] | None, excludes: list[str] | None) -> None:
+            if not includes and not excludes:
+                return
+            pattern = cfg.setdefault(key, {})
+            if includes:
+                pattern.setdefault("includes", []).extend(includes)
+            if excludes:
+                pattern.setdefault("excludes", []).extend(excludes)
+
+        _merge("databaseFilterPattern", databases_include, databases_exclude)
+        _merge("schemaFilterPattern", schemas_include, schemas_exclude)
+        _merge("tableFilterPattern", tables_include, tables_exclude)
+
+        return WorkflowConfig(_doc=new_doc, _options=self._options)
+
+    # --- accessors ------------------------------------------------------
+    @property
+    def pipeline_identifier(self) -> str:
+        """Short id for artifact filenames and invocation counters."""
+        if self._options is None:
+            raise PipelineNotSetError("pipeline not set — call .pipeline(options) before querying identifier")
+        return pipeline_identifier(self._options)
+
+    @property
+    def cli_subcommand(self) -> str:
+        """The `metadata <cmd>` subcommand CliRunner will invoke."""
+        if self._options is None:
+            raise PipelineNotSetError("pipeline not set — call .pipeline(options) before querying subcommand")
+        return cli_subcommand_for(self._options)
+
+    # --- rendering ------------------------------------------------------
+    def write_tmp(self, tmp_path: Path, invocation: int = 0) -> Path:
+        """Dump to `<tmp_path>/cfg_<identifier>_<invocation>.yaml` and return the path."""
+        if self._options is None:
+            raise PipelineNotSetError("pipeline not set — call .pipeline(options) before rendering")
+        path = tmp_path / f"cfg_{self.pipeline_identifier}_{invocation}.yaml"
+        path.write_text(yaml.safe_dump(self._doc, sort_keys=False))
+        return path
diff --git a/ingestion/tests/cli_e2e_v2/core/config/env.py b/ingestion/tests/cli_e2e_v2/core/config/env.py
new file mode 100644
index 000000000000..c7df7409a8dc
--- /dev/null
+++ b/ingestion/tests/cli_e2e_v2/core/config/env.py
@@ -0,0 +1,103 @@
+# Copyright 2026 Collate
+# Licensed under the Collate Community License, Version 1.0 (the "License");
+# you may not use this file except in compliance with the License.
+"""Env var accessor — class with `Generic[_Req]` + `typing.overload` narrowing.
+
+Construction captures (key, default, required); terminals:
+    .ref() -> "${KEY}"       for YAML embedding
+    .get() -> str            when required=True (default)
+    .get() -> str | None     when required=False
+
+Runtime is a plain class; the Generic machinery is type-only. See
+`memory/project-v2-env-class-design.md` for the shape's rationale.
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Generic, Literal, TypeVar, overload
+
+
+class EnvLoadError(RuntimeError):
+    """Raised when a required env var is unset (or empty)."""
+
+
+# Constrained — callers can only parameterize Env with True or False, matching
+# the two concrete `required` states. Anything else is a type error.
+_Req = TypeVar("_Req", Literal[True], Literal[False])
+
+
+class Env(Generic[_Req]):
+    """Capture an env-var access pattern; ref() and get() are the terminals.
+
+    Generic over the `required` flag so `.get()` returns `str` when
+    required=True and `str | None` when required=False. The required value
+    is kwarg-only to keep the `__new__` overloads unambiguous.
+    """
+
+    key: str
+
+    # Two __new__ overloads — one per Literal[required] value — let the type
+    # checker pick the right `Env[Literal[...]]` specialization at the call
+    # site.
The runtime __new__ is just object.__new__; Generic is erased. + @overload + def __new__( + cls, + key: str, + default: str | None = None, + *, + required: Literal[True] = True, + ) -> Env[Literal[True]]: ... + + @overload + def __new__( + cls, + key: str, + default: str | None = None, + *, + required: Literal[False], + ) -> Env[Literal[False]]: ... + + def __new__( + cls, + key: str, + default: str | None = None, + *, + required: bool = True, + ) -> Env: + return object.__new__(cls) + + def __init__( + self, + key: str, + default: str | None = None, + *, + required: bool = True, + ) -> None: + self.key = key + if default is not None: + os.environ.setdefault(key, default) + if required and not os.environ.get(key): + raise EnvLoadError(f"required env var {key} not set. Set it in your shell or GitHub Actions secrets.") + + def ref(self) -> str: + """Return '${KEY}' for embedding in YAML. + + The metadata CLI's load_config_file applies os.path.expandvars to + the raw YAML before parsing, so the subprocess resolves the reference + at load time — the rendered YAML on disk only ever contains the + literal reference, keeping secrets out of tmp_path artifacts. + """ + return f"${{{self.key}}}" + + # Two .get() overloads narrow by the specialization of Env: + # Env[Literal[True]].get() -> str (construction validated) + # Env[Literal[False]].get() -> str | None (caller must handle None) + @overload + def get(self: Env[Literal[True]]) -> str: ... + + @overload + def get(self: Env[Literal[False]]) -> str | None: ... + + def get(self) -> str | None: + return os.environ.get(self.key) diff --git a/ingestion/tests/cli_e2e_v2/core/config/pipelines.py b/ingestion/tests/cli_e2e_v2/core/config/pipelines.py new file mode 100644 index 000000000000..e7608f593bb8 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/config/pipelines.py @@ -0,0 +1,101 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Pipeline options — re-exports of OM's generated Pydantic pipeline models. + +Each pipeline maps to one Pydantic class carrying the full OM schema +(including filter patterns, incremental flags, and pipeline-specific +knobs). Short aliases keep test call sites compact; dispatch for CLI +subcommand + artifact identifier goes through a single `_SPECS` map. 
+ +Usage: + + from ..core.config.pipelines import MetadataPipeline + + cfg = base.pipeline( + MetadataPipeline(includeStoredProcedures=True), + ).with_filter(tables_include=["customers"]) +""" + +from __future__ import annotations + +from dataclasses import dataclass + +from metadata.generated.schema.metadataIngestion.databaseServiceAutoClassificationPipeline import ( + DatabaseServiceAutoClassificationPipeline as AutoClassificationPipeline, +) +from metadata.generated.schema.metadataIngestion.databaseServiceMetadataPipeline import ( + DatabaseServiceMetadataPipeline as MetadataPipeline, +) +from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import ( + DatabaseServiceProfilerPipeline as ProfilerPipeline, +) +from metadata.generated.schema.metadataIngestion.databaseServiceQueryLineagePipeline import ( + DatabaseServiceQueryLineagePipeline as LineagePipeline, +) +from metadata.generated.schema.metadataIngestion.databaseServiceQueryUsagePipeline import ( + DatabaseServiceQueryUsagePipeline as UsagePipeline, +) +from metadata.generated.schema.metadataIngestion.testSuitePipeline import ( + TestSuitePipeline as TestPipeline, +) + +PipelineOptions = ( + MetadataPipeline | ProfilerPipeline | LineagePipeline | UsagePipeline | TestPipeline | AutoClassificationPipeline +) + + +@dataclass(frozen=True) +class _PipelineSpec: + """Per-pipeline dispatch. + + source_type_suffix: appended to `source.type` in the rendered YAML so + OM's `import_source_class` routes to the right class. For lineage and + usage, OM looks up `-lineage` / `-usage` in the + connector's ServiceSpec; everything else uses the plain connector name. + """ + + cli_subcommand: str + identifier: str + source_type_suffix: str = "" + + +# Single source of truth for per-pipeline dispatch. Adding a pipeline +# touches exactly this dict plus the re-export above. +_SPECS: dict[type, _PipelineSpec] = { + MetadataPipeline: _PipelineSpec("ingest", "metadata", ""), + ProfilerPipeline: _PipelineSpec("profile", "profiler", ""), + LineagePipeline: _PipelineSpec("ingest", "lineage", "-lineage"), + UsagePipeline: _PipelineSpec("usage", "usage", "-usage"), + TestPipeline: _PipelineSpec("test", "test", ""), + AutoClassificationPipeline: _PipelineSpec("classify", "classify", ""), +} + + +def cli_subcommand_for(options: PipelineOptions) -> str: + """Return the `metadata ` subcommand to run for these options.""" + return _SPECS[type(options)].cli_subcommand + + +def pipeline_identifier(options: PipelineOptions) -> str: + """Short identifier for artifact filenames and invocation counters.""" + return _SPECS[type(options)].identifier + + +def source_type_suffix_for(options: PipelineOptions) -> str: + """Suffix to append to `source.type` for this pipeline (e.g. `-lineage`).""" + return _SPECS[type(options)].source_type_suffix + + +__all__ = [ + "AutoClassificationPipeline", + "LineagePipeline", + "MetadataPipeline", + "PipelineOptions", + "ProfilerPipeline", + "TestPipeline", + "UsagePipeline", + "cli_subcommand_for", + "pipeline_identifier", + "source_type_suffix_for", +] diff --git a/ingestion/tests/cli_e2e_v2/core/config/server.py b/ingestion/tests/cli_e2e_v2/core/config/server.py new file mode 100644 index 000000000000..163cf7fa6f3e --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/config/server.py @@ -0,0 +1,169 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. 
+"""Shared OpenMetadata server configuration for ingestion tests. + +Instance fields hold resolved values for the session HTTP client (which +authenticates directly, no YAML indirection). Rendered YAML emits ${OM_*} +references so cfg_*.yaml artifacts never embed raw JWTs — safe to share. + +Token resolution: if OM_JWT_TOKEN is exported, use it. Otherwise mint a +long-lived ingestion-bot token from the running server (admin login → +GET /bots/name/ingestion-bot → GET /users/auth-mechanism/{userId}). The +minted token is signed by THIS server's keystore, so it works against +any OM instance regardless of how it was bootstrapped — no shared dev +keypair assumption. +""" + +from __future__ import annotations + +import base64 +import os +from dataclasses import dataclass +from typing import Any, Literal + +import requests + +from ..runner.errors import E2ESetupError +from .env import Env + +TokenSource = Literal["env", "minted"] + +_DEFAULT_OM_SERVER_URL = "http://localhost:8585/api" +_DEFAULT_ADMIN_EMAIL = "admin@open-metadata.org" +_DEFAULT_ADMIN_PASSWORD = "admin" +_INGESTION_BOT_NAME = "ingestion-bot" +_HTTP_TIMEOUT_SECONDS = 10 + + +class TokenMintError(E2ESetupError): + """Raised when the bot-token mint flow fails (login, lookup, or fetch).""" + + +def _mint_ingestion_bot_token(server_url: str, admin_email: str, admin_password: str) -> str: + """Mint a server-signed, long-lived ingestion-bot JWT. + + Three hops against the live OM server: + 1. POST /v1/users/login → short-lived admin access token. + 2. GET /v1/bots/name/ingestion-bot → bot's linked user id. + 3. GET /v1/users/auth-mechanism/{user_id} → bot's permanent JWT. + + The returned token is signed by THIS server's RSA keypair, so it + validates regardless of which keystore the OM instance was + bootstrapped with. Bot tokens have `JWTTokenExpiry: Unlimited` per + OM's default bot bootstrap, so they survive long test sessions. + + Admin password is base64-encoded in the login payload to match OM's + expectation (the server decodes before bcrypt-comparing). + """ + encoded_password = base64.b64encode(admin_password.encode()).decode() + try: + login = requests.post( + f"{server_url}/v1/users/login", + json={"email": admin_email, "password": encoded_password}, + timeout=_HTTP_TIMEOUT_SECONDS, + ) + login.raise_for_status() + admin_token = login.json()["accessToken"] + + headers = {"Authorization": f"Bearer {admin_token}"} + bot = requests.get( + f"{server_url}/v1/bots/name/{_INGESTION_BOT_NAME}", + headers=headers, + timeout=_HTTP_TIMEOUT_SECONDS, + ) + bot.raise_for_status() + bot_user_id = bot.json()["botUser"]["id"] + + auth = requests.get( + f"{server_url}/v1/users/auth-mechanism/{bot_user_id}", + headers=headers, + timeout=_HTTP_TIMEOUT_SECONDS, + ) + auth.raise_for_status() + return auth.json()["config"]["JWTToken"] + except (requests.RequestException, KeyError, ValueError) as exc: + raise TokenMintError( + f"failed to mint ingestion-bot token from {server_url}: {exc}. " + f"Set OM_JWT_TOKEN to bypass minting, or set OM_ADMIN_EMAIL / " + f"OM_ADMIN_PASSWORD if the OM instance uses non-default admin creds." + ) from exc + + +@dataclass(frozen=True) +class ServerConfig: + """Shared sinkConfig + workflowConfig.openMetadataServerConfig applied to every test. + + `token_source` records how `jwt_token` was obtained — "env" when + OM_JWT_TOKEN was already exported at session start, "minted" when + from_env() had to mint via the bot-token flow. 
Exposed for the + session posture log so a developer can see at a glance which auth + path was taken without having to instrument the fixture. + """ + + server_url: str + jwt_token: str + token_source: TokenSource + + @classmethod + def from_env(cls) -> ServerConfig: + """Resolve server URL + JWT for the session — PURE: no side effects. + + OM_JWT_TOKEN, if exported, wins (escape hatch for hermetic CI or + deliberately scoped tokens). Otherwise mint via the bot-token + flow against OM_SERVER_URL using OM_ADMIN_EMAIL / OM_ADMIN_PASSWORD + (defaults: admin@open-metadata.org / admin — the docker-compose + bootstrap creds). + + This method DOES NOT write OM_JWT_TOKEN back into os.environ. The + `om_server_config` fixture in the top-level conftest is the single + named place that does that install step — keeping the factory + pure and the ambient-env mutation explicit. + """ + server_url = Env("OM_SERVER_URL", default=_DEFAULT_OM_SERVER_URL).get() + + existing = os.environ.get("OM_JWT_TOKEN") + if existing: + return cls( + server_url=server_url, + jwt_token=existing, + token_source="env", + ) + + minted = _mint_ingestion_bot_token( + server_url=server_url, + admin_email=Env("OM_ADMIN_EMAIL", default=_DEFAULT_ADMIN_EMAIL).get(), + admin_password=Env("OM_ADMIN_PASSWORD", default=_DEFAULT_ADMIN_PASSWORD).get(), + ) + return cls( + server_url=server_url, + jwt_token=minted, + token_source="minted", + ) + + def to_workflow_config_dict(self) -> dict[str, Any]: + """Builds the workflowConfig block for a rendered config YAML. + + Emits ${OM_*} refs. metadata CLI expands them at subprocess load time; + the rendered YAML on disk never embeds the raw JWT. + """ + return { + "openMetadataServerConfig": { + "hostPort": Env("OM_SERVER_URL").ref(), + "authProvider": "openmetadata", + "securityConfig": {"jwtToken": Env("OM_JWT_TOKEN").ref()}, + } + } + + def to_sink_config_dict(self) -> dict[str, Any]: + """Builds the sink block for a rendered config YAML. + + `bulk_sink_batch_size: 1` forces the OM sink to flush each entity + synchronously instead of buffering up to 100. Required for the FK + post-process path: `yield_table_constraints` runs BEFORE the final + sink flush, so deferred FK lookups (`metadata.get_by_name(...)` on + the referred table) otherwise miss entities still sitting in the + buffer. Production runs usually cross the buffer threshold and + hide this; small E2E fixtures (<100 entities) don't. + """ + return {"type": "metadata-rest", "config": {"bulk_sink_batch_size": 1}} diff --git a/ingestion/tests/cli_e2e_v2/core/expected/__init__.py b/ingestion/tests/cli_e2e_v2/core/expected/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/ingestion/tests/cli_e2e_v2/core/expected/derive.py b/ingestion/tests/cli_e2e_v2/core/expected/derive.py new file mode 100644 index 000000000000..4fa9842de50f --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/expected/derive.py @@ -0,0 +1,112 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Derive Expected* trees from a SQLAlchemy MetaData. + +Replaces hand-authored `ExpectedColumn` lists in per-dialect expected +modules. 
For each Table in `metadata`, builds an `ExpectedTable` with: + - data_type resolved via the dialect's `TypeMap` (SQLAlchemy -> OM) + - primary_key / constraint derived from `col.primary_key` / `col.nullable` + - description pulled straight from `col.comment` / `tbl.comment` + +Stored procedures are NOT derivable (not in MetaData) — callers pass their +own hand-authored list into `derive_expected_service`. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from metadata.generated.schema.entity.data.table import Constraint + +from .type_map import TypeMap, resolve_om_type +from .types import ( + ExpectedColumn, + ExpectedDatabase, + ExpectedSchema, + ExpectedService, + ExpectedStoredProcedure, + ExpectedTable, +) + +if TYPE_CHECKING: + from sqlalchemy import MetaData + from sqlalchemy.schema import Column as SqlColumn + + from metadata.generated.schema.entity.services.databaseService import ( + DatabaseServiceType, + ) + + +def derive_expected_tables(metadata: MetaData, type_map: TypeMap) -> list[ExpectedTable]: + """Build one ExpectedTable per Table in `metadata`. + + Columns come straight off the SQLAlchemy Column — name, type (via + type_map), primary_key, constraint (from nullable), comment (as + description). Tables iterated in FK-safe order via `sorted_tables`. + """ + return [ + ExpectedTable( + name=tbl.name, + columns=[_derive_column(col, type_map) for col in tbl.columns], + description=tbl.comment, + ) + for tbl in metadata.sorted_tables + ] + + +def _derive_column(col: SqlColumn, type_map: TypeMap) -> ExpectedColumn: + return ExpectedColumn( + name=col.name, + data_type=resolve_om_type(col.type, type_map), + primary_key=bool(col.primary_key), + constraint=_constraint_for(col), + description=col.comment, + ) + + +def _constraint_for(col: SqlColumn) -> Constraint | None: + if col.primary_key: + return Constraint.PRIMARY_KEY + if not col.nullable: + return Constraint.NOT_NULL + return None + + +def derive_expected_service( + *, + service_name: str, + service_type: DatabaseServiceType, + metadata: MetaData, + type_map: TypeMap, + database: str = "default", + schema: str | None = None, + views: list[ExpectedTable] | None = None, + stored_procedures: list[ExpectedStoredProcedure] | None = None, +) -> ExpectedService: + """Build a full ExpectedService tree (service -> db -> schema -> tables + SPs). + + `schema` defaults to `metadata.schema`. `views` and `stored_procedures` + are hand-authored — neither lives in SQLAlchemy MetaData. Views join + the regular table list (OM models views as Table entities with + tableType=View, so STRICT extras checks see them together). + """ + schema_name = schema or metadata.schema + if schema_name is None: + raise ValueError("metadata has no schema — pass `schema=` explicitly") + return ExpectedService( + name=service_name, + service_type=service_type, + databases=[ + ExpectedDatabase( + name=database, + schemas=[ + ExpectedSchema( + name=schema_name, + tables=derive_expected_tables(metadata, type_map) + (views or []), + stored_procedures=stored_procedures or [], + ), + ], + ), + ], + ) diff --git a/ingestion/tests/cli_e2e_v2/core/expected/differ.py b/ingestion/tests/cli_e2e_v2/core/expected/differ.py new file mode 100644 index 000000000000..c965f27fb94c --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/expected/differ.py @@ -0,0 +1,430 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. 
+"""Structural differ — walks an Expected* tree, fetches actual OM state per-level, +collects path-qualified diffs, raises StructuralMismatch when anything doesn't match. + +Public surface: `assert_service_matches(expected, om, mode=SUPERSET)`. + +Internal shape: every node-level differ has the **uniform signature** +`_diff_(node, parent_path, om, mode, diffs)`. The single `_diff_node` +entry point dispatches on `type(node)` via `_DIFFERS`, and each differ +recurses into children by calling `_diff_node` on them. Adding a new +node type (e.g. ExpectedView) is one registry entry plus one function. +Parent-path threading is uniform: every differ receives the owning FQN +and builds `self_fqn = f"{parent_path}.{node.name}"` the same way. + +Diffs use bracket-path notation (`service[foo].database[bar].table[baz]. +column[qux].dataType`) for readability in pytest failure output. +""" + +from __future__ import annotations + +import re +from collections.abc import Callable +from typing import TYPE_CHECKING + +from metadata.generated.schema.entity.data.database import Database +from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema +from metadata.generated.schema.entity.data.storedProcedure import StoredProcedure +from metadata.generated.schema.entity.data.table import Table +from metadata.generated.schema.entity.services.databaseService import DatabaseService +from metadata.ingestion.ometa.ometa_api import OpenMetadata +from metadata.ingestion.ometa.utils import model_str + +from .._om_compat import unwrap_root_list +from ..source.types import Diff, DiffKind +from .types import ( + ExpectedColumn, + ExpectedDatabase, + ExpectedSchema, + ExpectedService, + ExpectedStoredProcedure, + ExpectedTable, + MatchMode, +) + +if TYPE_CHECKING: + from ..fluent.om_client import OmClient + + +class StructuralMismatch(AssertionError): # noqa: N818 (intentional API surface — public exception name) + """Aggregate assertion error carrying all collected diffs. + + Renders with a summary header (counts by category) and path-sorted body + grouped by owning entity — so a failure with 20 column diffs is + scannable rather than a wall of text. + """ + + def __init__(self, diffs: list[Diff]) -> None: + self.diffs = list(diffs) + super().__init__(self._format(self.diffs)) + + @staticmethod + def _format(diffs: list[Diff]) -> str: + if not diffs: + return "StructuralMismatch: (no diffs)" + + sorted_diffs = sorted(diffs, key=lambda d: d.path) + classified = [(d, *_classify_path(d.path)) for d in sorted_diffs] + + # Header: category counts, most-frequent first, alphabetical on ties. + counts: dict[str, int] = {} + for _, _, category in classified: + counts[category] = counts.get(category, 0) + 1 + summary = ", ".join( + f"{n} {cat}{'' if n == 1 else 's'}" for cat, n in sorted(counts.items(), key=lambda kv: (-kv[1], kv[0])) + ) + header = f"StructuralMismatch: {len(sorted_diffs)} diff{'' if len(sorted_diffs) == 1 else 's'} ({summary})" + + # Body: diffs grouped by owning-entity scope. + body_lines: list[str] = [] + last_scope: str | None = None + for d, scope, _ in classified: + if last_scope is not None and scope != last_scope: + body_lines.append("") # blank line between entity scopes + last_scope = scope + body_lines.append(str(d)) + + return header + "\n" + "\n".join(body_lines) + + +# One table driving both category tally and scope clustering. 
+# token: substring searched for in the path string +# category: label used in the summary header +# is_scope: whether this level counts as an owning-entity scope (the +# body groups diffs by the finest scope-level bracket segment). +# Column / seed diffs are category buckets but NOT scope +# levels — they cluster under their owning table. +# Ordered from finest-grained to coarsest; both passes walk top-to-bottom +# so the first hit wins for category and scope alike. +_PATH_LEVELS: tuple[tuple[str, str, bool], ...] = ( + (".column[", "column", False), + (".seed", "seed", False), + ("procedure[", "procedure", True), + ("view[", "view", True), + ("table[", "table", True), + ("schema[", "schema", True), + ("database[", "database", True), + ("service[", "service", True), +) + + +def _classify_path(path: str) -> tuple[str, str]: + """Return (scope, category) for a diff path in one pass. + + `category` = the finest-grained level token present in the path, + used for the summary-line tally. + `scope` = the owning-entity bracket segment (e.g. `table[customers]`), + used to cluster related diffs in the failure body. Columns and seeds + collapse into their owning table's scope rather than introducing a + scope of their own. Falls back to the whole path when no bracket + token matches. + """ + category: str | None = None + scope: str | None = None + for token, label, is_scope in _PATH_LEVELS: + if token not in path: + continue + if category is None: + category = label + if is_scope and scope is None: + m = re.search(rf"{re.escape(token)}[^\]]+\]", path) + if m: + scope = m.group(0) + return scope or path, category or "service" + + +def assert_service_matches( + expected: ExpectedService, + om: OmClient, + *, + mode: MatchMode = MatchMode.SUPERSET, +) -> None: + """Walk `expected`, fetch actual state via `om.raw`, raise StructuralMismatch on diffs. + + SUPERSET (default): extras in actual are tolerated (cloud drift, unrelated tables). + STRICT: actual must equal expected exactly. + """ + diffs: list[Diff] = [] + _diff_node(expected, parent_path="", om=om.raw, mode=mode, diffs=diffs) + if diffs: + raise StructuralMismatch(diffs) + + +# ----------------------------------------------------------------------------- +# Node dispatch +# ----------------------------------------------------------------------------- + + +_NodeDiffer = Callable[[object, str, OpenMetadata, MatchMode, list[Diff]], None] + +_STRICT_LIST_LIMIT = 1000 + + +def _diff_node( + node: object, + parent_path: str, + om: OpenMetadata, + mode: MatchMode, + diffs: list[Diff], +) -> None: + """Dispatch entry — looks up the per-type differ in `_DIFFERS`. + + Unknown node types are a plan bug, not a runtime condition — raising + TypeError surfaces the mismatch at author time. 
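+
+    Example (root call, as issued by `assert_service_matches`):
+
+        _diff_node(expected_service, "", om, MatchMode.SUPERSET, diffs)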
+ """ + differ = _DIFFERS.get(type(node)) + if differ is None: + raise TypeError(f"no differ registered for {type(node).__name__}; add an entry to _DIFFERS in differ.py") + differ(node, parent_path, om, mode, diffs) + + +# ----------------------------------------------------------------------------- +# Per-node differs — all have the same signature +# (node, parent_path, om, mode, diffs) +# ----------------------------------------------------------------------------- + + +def _diff_service( + node: object, + parent_path: str, + om: OpenMetadata, + mode: MatchMode, + diffs: list[Diff], +) -> None: + assert isinstance(node, ExpectedService) + assert parent_path == "", "ExpectedService must be the root node" + self_fqn = node.name + path = f"service[{node.name}]" + + actual = om.get_by_name(entity=DatabaseService, fqn=self_fqn) + if actual is None: + diffs.append(Diff(path=path, kind=DiffKind.MISSING)) + return + if actual.serviceType != node.service_type: + diffs.append(Diff(path=f"{path}.serviceType", expected=node.service_type, actual=actual.serviceType)) + + for child in node.databases: + _diff_node(child, self_fqn, om, mode, diffs) + + if mode == MatchMode.STRICT: + _check_strict_extras( + entity_cls=Database, + expected_names={d.name for d in node.databases}, + list_params={"service": self_fqn}, + path_fmt=f"{path}.database[{{name}}](strict)", + om=om, + diffs=diffs, + ) + + +def _diff_database( + node: object, + parent_path: str, + om: OpenMetadata, + mode: MatchMode, + diffs: list[Diff], +) -> None: + assert isinstance(node, ExpectedDatabase) + self_fqn = f"{parent_path}.{node.name}" + path = f"service[{parent_path}].database[{node.name}]" + + actual = om.get_by_name(entity=Database, fqn=self_fqn) + if actual is None: + diffs.append(Diff(path=path, kind=DiffKind.MISSING)) + return + + for child in node.schemas: + _diff_node(child, self_fqn, om, mode, diffs) + + if mode == MatchMode.STRICT: + _check_strict_extras( + entity_cls=DatabaseSchema, + expected_names={s.name for s in node.schemas}, + list_params={"database": self_fqn}, + path_fmt=f"{self_fqn}.schema[{{name}}](strict)", + om=om, + diffs=diffs, + ) + + +def _diff_schema( + node: object, + parent_path: str, + om: OpenMetadata, + mode: MatchMode, + diffs: list[Diff], +) -> None: + assert isinstance(node, ExpectedSchema) + self_fqn = f"{parent_path}.{node.name}" + path = f"{parent_path}.schema[{node.name}]" + + actual = om.get_by_name(entity=DatabaseSchema, fqn=self_fqn) + if actual is None: + diffs.append(Diff(path=path, kind=DiffKind.MISSING)) + return + + for child in node.tables: + _diff_node(child, self_fqn, om, mode, diffs) + for child in node.stored_procedures: + _diff_node(child, self_fqn, om, mode, diffs) + + if mode == MatchMode.STRICT: + _check_strict_extras( + entity_cls=Table, + expected_names={t.name for t in node.tables}, + list_params={"databaseSchema": self_fqn}, + path_fmt=f"{path}.table[{{name}}](strict)", + om=om, + diffs=diffs, + ) + _check_strict_extras( + entity_cls=StoredProcedure, + expected_names={sp.name for sp in node.stored_procedures}, + list_params={"databaseSchema": self_fqn}, + path_fmt=f"{path}.procedure[{{name}}](strict)", + om=om, + diffs=diffs, + ) + + +def _diff_table( + node: object, + parent_path: str, + om: OpenMetadata, + mode: MatchMode, + diffs: list[Diff], +) -> None: + assert isinstance(node, ExpectedTable) + self_fqn = f"{parent_path}.{node.name}" + path = f"table[{node.name}]" + + actual = om.get_by_name(entity=Table, fqn=self_fqn, fields=["tags", "owners", "columns"]) + if actual is 
None: + diffs.append(Diff(path=path, kind=DiffKind.MISSING)) + return + + # owner (single-owner check — matches when exp.owner appears in any actual owner) + if node.owner is not None: + actual_owners = {o.name for o in unwrap_root_list(actual.owners)} + if node.owner not in actual_owners: + diffs.append(Diff(path=f"{path}.owner", expected=node.owner, actual=sorted(actual_owners))) + + # tags (subset match — all expected tags must be present). + if node.tags: + actual_tags = {model_str(t.tagFQN) for t in unwrap_root_list(actual.tags)} + if node.tags - actual_tags: + diffs.append(Diff(path=f"{path}.tags", expected=sorted(node.tags), actual=sorted(actual_tags))) + + # description (substring match per Decision #16) + if node.description is not None: + actual_desc = model_str(actual.description) if actual.description else "" + if node.description not in actual_desc: + diffs.append( + Diff(path=f"{path}.description", expected=f"contains {node.description!r}", actual=actual_desc) + ) + + # columns — no separate OM fetch; walk the actual.columns set in place. + actual_columns_by_name = {model_str(c.name): c for c in unwrap_root_list(actual.columns)} + for exp_col in node.columns: + _diff_column(exp_col, path, actual_columns_by_name, diffs) + + if mode == MatchMode.STRICT: + expected_names = {c.name for c in node.columns} + extra = set(actual_columns_by_name.keys()) - expected_names + if extra: + diffs.append( + Diff( + path=f"{path}.columns(strict)", + kind=DiffKind.UNEXPECTED, + actual=sorted(extra), + ) + ) + + +def _diff_stored_procedure( + node: object, + parent_path: str, + om: OpenMetadata, + mode: MatchMode, + diffs: list[Diff], +) -> None: + assert isinstance(node, ExpectedStoredProcedure) + self_fqn = f"{parent_path}.{node.name}" + path = f"procedure[{node.name}]" + + actual = om.get_by_name(entity=StoredProcedure, fqn=self_fqn) + if actual is None: + diffs.append(Diff(path=path, kind=DiffKind.MISSING)) + return + + if node.description is not None: + actual_desc = model_str(actual.description) if actual.description else "" + if node.description not in actual_desc: + diffs.append( + Diff(path=f"{path}.description", expected=f"contains {node.description!r}", actual=actual_desc) + ) + + +# Column-level diffs don't fetch from OM and don't recurse, so they're NOT +# registered in _DIFFERS. `_diff_table` calls this helper directly for each +# expected column with the already-fetched `actual.columns` dict. 
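+# Call shape (illustrative): _diff_column(exp_col, "table[customers]",
+# actual_columns_by_name, diffs) appends any mismatches into `diffs` in place.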
+def _diff_column( + exp_col: ExpectedColumn, + table_path: str, + actual_columns_by_name: dict, + diffs: list[Diff], +) -> None: + path = f"{table_path}.column[{exp_col.name}]" + actual = actual_columns_by_name.get(exp_col.name) + if actual is None: + diffs.append(Diff(path=path, kind=DiffKind.MISSING)) + return + if actual.dataType != exp_col.data_type: + diffs.append(Diff(path=f"{path}.dataType", expected=exp_col.data_type, actual=actual.dataType)) + if exp_col.constraint is not None and actual.constraint != exp_col.constraint: + diffs.append(Diff(path=f"{path}.constraint", expected=exp_col.constraint, actual=actual.constraint)) + if exp_col.tags: + actual_tags = {model_str(t.tagFQN) for t in unwrap_root_list(actual.tags)} + if exp_col.tags - actual_tags: + diffs.append(Diff(path=f"{path}.tags", expected=sorted(exp_col.tags), actual=sorted(actual_tags))) + if exp_col.description is not None: + actual_desc = model_str(actual.description) if actual.description else "" + if exp_col.description not in actual_desc: + diffs.append( + Diff(path=f"{path}.description", expected=f"contains {exp_col.description!r}", actual=actual_desc) + ) + + +def _check_strict_extras( + *, + entity_cls: type, + expected_names: set[str], + list_params: dict[str, str], + path_fmt: str, + om: OpenMetadata, + diffs: list[Diff], +) -> None: + """Flag actual entities under a parent that weren't declared as expected. + + `path_fmt` must contain a `{name}` slot filled with each extra entity's + name at emit time. Pagination: capped at _STRICT_LIST_LIMIT — fine for + e2e-sized services. + """ + for actual in om.list_all_entities(entity=entity_cls, params=list_params, limit=_STRICT_LIST_LIMIT): + name = model_str(actual.name) + if name in expected_names: + continue + diffs.append(Diff(path=path_fmt.format(name=name), kind=DiffKind.UNEXPECTED)) + + +# Registry is declared AFTER the per-node differs so it can reference them +# by name. Adding a new node type = one function above + one entry here. +_DIFFERS: dict[type, _NodeDiffer] = { + ExpectedService: _diff_service, + ExpectedDatabase: _diff_database, + ExpectedSchema: _diff_schema, + ExpectedTable: _diff_table, + ExpectedStoredProcedure: _diff_stored_procedure, +} diff --git a/ingestion/tests/cli_e2e_v2/core/expected/type_map.py b/ingestion/tests/cli_e2e_v2/core/expected/type_map.py new file mode 100644 index 000000000000..8d84f1e075ef --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/expected/type_map.py @@ -0,0 +1,82 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""SQLAlchemy -> OM DataType map. + +Used by `derive_expected_tables` to build Expected trees directly from the +baseline's SQLAlchemy MetaData. `CORE_TYPE_MAP` covers the portable types +used in `common_baseline.py`; each dialect's expected module extends it +with dialect-specific classes (e.g. `mysql.MEDIUMINT`, `mysql.ENUM`). + +Resolution walks the SQLAlchemy type's MRO so subclasses inherit parent +entries unless explicitly overridden. 
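+
+Example (hypothetical dialect extension; the exact enum members are
+illustrative):
+
+    from sqlalchemy.dialects import mysql
+
+    MYSQL_TYPE_MAP: TypeMap = {**CORE_TYPE_MAP, mysql.MEDIUMINT: DataType.MEDIUMINT}
+    resolve_om_type(mysql.MEDIUMINT(), MYSQL_TYPE_MAP)  # -> DataType.MEDIUMINT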
+""" + +from __future__ import annotations + +from sqlalchemy import ( + CHAR, + JSON, + TIMESTAMP, + BigInteger, + Boolean, + Date, + DateTime, + Enum, + Float, + Integer, + LargeBinary, + Numeric, + SmallInteger, + String, + Text, + Time, +) + +from metadata.generated.schema.entity.data.table import DataType + +TypeMap = dict[type, DataType] + + +# CORE entries marked with (via MRO) are the ones that let dialect maps +# DROP their equivalent `dialects..FOO` entry: mysql.JSON / mysql.ENUM / +# mysql.BLOB / mysql.TIMESTAMP all inherit from these core classes, so the +# MRO walk in `resolve_om_type` hits the core entry without needing a +# dialect duplicate. Dialect-specific size variants (MEDIUMTEXT, LONGBLOB, +# TINYINT, etc.) still need per-dialect entries — they extend PRIVATE +# bases (`_StringType`, `_Binary`) that MRO skips past the public +# `String` / `LargeBinary`, or they want a more-specific OM DataType than +# the core parent yields. +CORE_TYPE_MAP: TypeMap = { + Integer: DataType.INT, + BigInteger: DataType.BIGINT, + SmallInteger: DataType.SMALLINT, + String: DataType.VARCHAR, + Text: DataType.TEXT, + CHAR: DataType.CHAR, + Date: DataType.DATE, + DateTime: DataType.DATETIME, + Time: DataType.TIME, + TIMESTAMP: DataType.TIMESTAMP, # via MRO: mysql.TIMESTAMP, pg.TIMESTAMP + Numeric: DataType.DECIMAL, + Float: DataType.FLOAT, + Boolean: DataType.BOOLEAN, # dialect overrides (e.g. MySQL: TINYINT) + Enum: DataType.ENUM, # via MRO: mysql.ENUM, pg.ENUM + JSON: DataType.JSON, # via MRO: mysql.JSON, pg.JSON + LargeBinary: DataType.BLOB, # via MRO: mysql.BLOB +} + + +def resolve_om_type(col_type: object, type_map: TypeMap) -> DataType: + """Return the OM DataType for a SQLAlchemy column-type instance. + + Walks the instance's class MRO, returning the first match in `type_map`. + Raises ValueError naming the unmapped class when no ancestor matches — + the dialect map just needs one new line added. + """ + for cls in type(col_type).__mro__: + if cls in type_map: + return type_map[cls] + raise ValueError( + f"no OM DataType mapping for SQLAlchemy type {type(col_type).__name__}. Add an entry to the dialect's type map." + ) diff --git a/ingestion/tests/cli_e2e_v2/core/expected/types.py b/ingestion/tests/cli_e2e_v2/core/expected/types.py new file mode 100644 index 000000000000..5dde7fe6cda1 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/expected/types.py @@ -0,0 +1,101 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Declarative dataclasses describing expected OM-side state post-ingestion. + +Per Decision #4 of the v2 spec: these reuse OM's Pydantic value types +(DataType, Constraint, DatabaseServiceType) for fields that map to schema +enums — automatic drift-safety whenever the generated schema updates. +They deliberately expose ONLY fields tests assert on, not every field on +the underlying OM entity (Table alone has 30+ fields, most noise for a +structural spec). + +Rules enforced: + - Fields that map to OM schema enums MUST use the OM enum type. + - Fields that don't map to enums stay as plain Python types. + - Unset / None means "don't assert this field" — differ skips it. + - For string fields like description, non-None means substring-match + (Decision #16), not exact equality. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from metadata.generated.schema.entity.data.table import Constraint, DataType + from metadata.generated.schema.entity.services.databaseService import ( + DatabaseServiceType, + ) + + +class MatchMode(Enum): + """Controls how strictly the structural differ treats "extra" entities in actual. + + - STRICT: actual must equal expected exactly — any unexpected table or column + flags as a diff. Used for filter tests where we care the filter eliminated + unwanted entities. + - SUPERSET (default): actual ⊇ expected. Extras are tolerated; only missing + or mismatched entities flag. Right for cloud accounts where shared schemas + may accumulate unrelated tables over time. + """ + + STRICT = "strict" + SUPERSET = "superset" + + +@dataclass(frozen=True) +class ExpectedColumn: + """A single column's expected shape in OM.""" + + name: str + data_type: DataType + tags: frozenset[str] = field(default_factory=frozenset) + constraint: Constraint | None = None + description: str | None = None # None = don't assert; str = substring match + primary_key: bool = False + + +@dataclass(frozen=True) +class ExpectedTable: + """A single table's expected shape in OM. + + Column matching is always by-name (dict lookup). Use STRICT match mode + on the differ to fail when actual tables carry unexpected extra columns. + """ + + name: str + columns: list[ExpectedColumn] + owner: str | None = None + tags: frozenset[str] = field(default_factory=frozenset) + description: str | None = None + + +@dataclass(frozen=True) +class ExpectedStoredProcedure: + """A single stored procedure's expected presence in OM.""" + + name: str + description: str | None = None # None = don't assert; str = substring match + + +@dataclass(frozen=True) +class ExpectedSchema: + name: str + tables: list[ExpectedTable] + stored_procedures: list[ExpectedStoredProcedure] = field(default_factory=list) + + +@dataclass(frozen=True) +class ExpectedDatabase: + name: str + schemas: list[ExpectedSchema] + + +@dataclass(frozen=True) +class ExpectedService: + name: str + service_type: DatabaseServiceType + databases: list[ExpectedDatabase] diff --git a/ingestion/tests/cli_e2e_v2/core/filter_scenarios.py b/ingestion/tests/cli_e2e_v2/core/filter_scenarios.py new file mode 100644 index 000000000000..ef96e0931bf3 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/filter_scenarios.py @@ -0,0 +1,111 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Portable filter scenarios for per-connector `test_filter` parametrization. + +Every SQL connector that ships a `customers` / `transactions` baseline +pair (all of them — see `core/source/common_baseline.py`) can run the +same matrix of filter semantics: include-exact, exclude-exact, schema- +only, and regex include+exclude with exclude priority. + +Shape: + - `FilterScenario.filter_kwargs` is PORTABLE — only mentions table / + schema names that exist in the common baseline. + - Expected-tables per variant is NOT portable (baselines add + connector-specific tables like MySQL's `all_types`, Postgres's + future `geom_table`, etc.). Each connector's test module maps + scenario.variant → its own expected-tables list. + +This keeps the filter-semantics matrix declared once: when we add a +fifth scenario (e.g. 
"include + exclude same pattern"), it's one edit +here that all connectors pick up. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass(frozen=True) +class FilterScenario: + """One row in the filter-parametrize matrix. + + variant: short token used as the service-name suffix (keeps + per-variant OM services isolated for STRICT-mode + extras detection). + id: human-readable pytest id for the test report. + filter_kwargs: kwargs to pass straight into `WorkflowConfig.with_filter`. + """ + + variant: str + id: str + filter_kwargs: dict[str, list[str]] = field(default_factory=dict) + + +def expected_tables_for( + scenario: FilterScenario, + mapping: dict[str, list[str] | None], + *, + connector: str, +) -> list[str] | None: + """Safe lookup for per-connector `_EXPECTED_TABLES_BY_VARIANT` dicts. + + When a new `FilterScenario` is added to `COMMON_FILTER_SCENARIOS`, every + connector must supply a corresponding entry in its per-connector + mapping. A missing entry manifests as `KeyError` at test collection + time, which hides the actionable message — this helper turns it into + an AssertionError naming the connector, the missing variant, and the + fix location so a junior can resolve it in one read. + """ + try: + return mapping[scenario.variant] + except KeyError as exc: + raise AssertionError( + f"[{connector}] no expected_tables entry for filter scenario " + f"{scenario.variant!r} (pytest id: {scenario.id}). Add it to " + f"the connector's _EXPECTED_TABLES_BY_VARIANT mapping." + ) from exc + + +# Ordered by increasing complexity so a failing earlier scenario typically +# points at a more-fundamental issue than a failing later one. +COMMON_FILTER_SCENARIOS: tuple[FilterScenario, ...] = ( + FilterScenario( + variant="inc_exact", + id="tables_include_exact", + filter_kwargs={ + "schemas_include": ["e2e"], + "tables_include": ["customers"], + }, + ), + FilterScenario( + variant="exc_exact", + id="tables_exclude_exact", + # `transactions` is guaranteed present in every SQL baseline via + # common_baseline; connectors that add dialect-specific tables + # (e.g. MySQL's `all_types`) should include those in the expected + # list for this variant on their side. + filter_kwargs={ + "schemas_include": ["e2e"], + "tables_exclude": ["transactions"], + }, + ), + FilterScenario( + variant="sch_inc", + id="schemas_include_only_e2e", + filter_kwargs={"schemas_include": ["e2e"]}, + ), + FilterScenario( + variant="regex_prio", + id="regex_exclude_has_priority_over_include", + # include=customer.* matches both `customers` and any view + # starting with `customer_` (e.g. MySQL's `customer_txn_summary`); + # exclude=customer_txn.* trims the latter. With exclude priority, + # only `customers` should survive. + filter_kwargs={ + "schemas_include": ["e2e"], + "tables_include": ["customer.*"], + "tables_exclude": ["customer_txn.*"], + }, + ), +) diff --git a/ingestion/tests/cli_e2e_v2/core/fixtures.py b/ingestion/tests/cli_e2e_v2/core/fixtures.py new file mode 100644 index 000000000000..2167d709b97e --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/fixtures.py @@ -0,0 +1,81 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Shared helpers for per-connector pytest fixtures. + +Per-connector conftests (`/conftest.py`) wire their own +`_source_ready` and `_metadata_ingested` fixtures on +top of these helpers instead of copy-pasting the body. 
+ +Design: + - These are PLAIN FUNCTIONS (not pytest fixtures). The per-connector + conftest is still where pytest scoping (`scope="session"` / + `scope="module"`) lives — otherwise pytest couldn't build the + dependency graph. The helpers carry just the body. + - `run_source_baseline` takes a zero-arg policy factory so each + connector's `get_policy` stays lazy (its engine shouldn't be + constructed at module import time). + - `metadata_ingest_once` applies an optional filter overlay so the + vast majority of connectors can pass `schemas_include=[...]` + without writing a one-off pipeline-chain-and-run code block. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from .runner.cli_runner import CliRunner +from .source.orchestrator import EnforcementPolicy, ensure_baseline + +if TYPE_CHECKING: + from collections.abc import Callable + + import pytest + + from .config.builder import WorkflowConfig + from .config.pipelines import PipelineOptions + from .source.types import BaselineSpec + + +def run_source_baseline( + policy_factory: Callable[[], EnforcementPolicy], + baseline: BaselineSpec, + *, + connector_name: str, +) -> None: + """Thin wrapper around `ensure_baseline` for per-connector `source_ready` fixtures. + + The factory indirection keeps engine construction lazy — `get_policy` + opens a SQLAlchemy engine, and we don't want that happening at module + import time (pytest collects conftests eagerly). + """ + ensure_baseline(policy_factory(), baseline, connector_name=connector_name) + + +def metadata_ingest_once( + tmp_path_factory: pytest.TempPathFactory, + cfg: WorkflowConfig, + registered_services: list[str], + *, + service_name: str, + pipeline_options: PipelineOptions, + filter_kwargs: dict | None = None, + label: str = "metadata", +) -> None: + """Run one metadata CLI ingest and assert success. + + Registers `service_name` for session-end cleanup so individual tests + don't need to. `label` controls the tmp-path prefix and failure- + message wording — pass the connector name for readable artifacts + (e.g. `mysql_ingest0/`). + """ + if service_name not in registered_services: + registered_services.append(service_name) + + pipeline_cfg = cfg.pipeline(pipeline_options) + if filter_kwargs: + pipeline_cfg = pipeline_cfg.with_filter(**filter_kwargs) + + runner = CliRunner(tmp_path_factory.mktemp(f"{label}_ingest")) + status = runner.run(pipeline_cfg) + assert status.success, f"module-scoped {label} metadata ingest failed: {status.all_failures}" diff --git a/ingestion/tests/cli_e2e_v2/core/fluent/__init__.py b/ingestion/tests/cli_e2e_v2/core/fluent/__init__.py new file mode 100644 index 000000000000..91e839083a15 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/fluent/__init__.py @@ -0,0 +1,74 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Fluent assertion API for CLI E2E v2 tests. + +Entry point: `om_client` fixture → `OmClient`. Every fluent chain starts +with one of `.table(fqn)`, `.service(name)`, or `.stored_procedure(fqn)`. + +Sync vs eventually +------------------ +Most terminals are **synchronous** — they assume the data is already in OM +(typical after a completed metadata ingest). 
A few entity domains are +**eventually-consistent** and must be wrapped in `.eventually(timeout=60)` +one-shot arming: + + eventually: profile, lineage, foreign-key constraint, service entity count + sync only: column.has_type, column.has_tag, stored_procedure, structural + +Arming is ONE-SHOT: it applies to the very next terminal in the chain and +resets afterward. Arm again for each eventually-polled assertion. + +Assertion catalog +----------------- + + # Table (om_client.table(fqn) -> TableAssert) + .exists() # sync + .get() -> Table # escape hatch, returns raw entity + .has_description_containing(text) # sync or eventually + .has_tag(tag_fqn) # sync or eventually + .has_owner(name) # sync or eventually + .eventually(60).has_foreign_key_constraint(column=..., referenced_table=..., referenced_column=...) + + # Column (via table.column(name) -> ColumnAssert, sync only) + .has_type(DataType.X) + .has_tag(tag_fqn) + .has_description_containing(text) + + # Profile (via table.profile, MUST arm with .eventually()) + table.profile.eventually(60).row_count().equals(N) + table.profile.eventually(60).row_count().at_least(N) + table.profile.eventually(60).row_count().between(lo, hi) + + # Lineage (via table.lineage, MUST arm with .eventually()) + table.lineage.eventually(60).has_upstream(fqn) + table.lineage.eventually(60).has_downstream(fqn) + table.lineage.eventually(60).has_column_lineage(source_col, target_col) + + # Service (om_client.service(name) -> ServiceAssert) + .exists() + .has_description_containing(text) + .eventually(60).has_entity_count("tables", at_least=N) + .eventually(60).has_entity_count("schemas", at_least=N) + + # Stored procedure (om_client.stored_procedure(fqn) -> StoredProcedureAssert) + .exists() + .has_description_containing(text) + .has_code_containing(text) + +Structural differ (a different entry point, not fluent) +------------------------------------------------------- + from ..core.expected.differ import assert_service_matches, MatchMode + assert_service_matches(expected_tree, om_client) # SUPERSET (default) + assert_service_matches(expected_tree, om_client, mode=MatchMode.STRICT) # filter tests + +Walks the entire Expected* tree at once and raises `StructuralMismatch` +collecting every diff — use this over one-off chains when you're +verifying "the whole catalog looks right." + +Extending +--------- +New entity namespace (e.g. `DqAssert`): inherit `EntityAssert[T]` if the +class IS the entity; compose an `EventuallyRunner` directly if it's a +namespace hanging off a parent (see LineageAssert / ProfileAssert). +""" diff --git a/ingestion/tests/cli_e2e_v2/core/fluent/entity_assert.py b/ingestion/tests/cli_e2e_v2/core/fluent/entity_assert.py new file mode 100644 index 000000000000..b92e8abee3f0 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/fluent/entity_assert.py @@ -0,0 +1,78 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Shared base for fluent entity-assertion classes. + +`EntityAssert[T]` hoists the 20 lines of boilerplate every entity-assert +class shared into one place: the om/fqn/runner constructor, `_fetch()` + +`exists()` + `get()`, one-shot `.eventually(timeout)`, and the ubiquitous +`has_description_containing(text)` terminal. + +Subclasses declare: + - `_entity_cls: type[T]` -- the OM Pydantic class (e.g. Table) + - `_default_fields: list[str]` -- fields to request from the OM API + +Entity-specific terminals (e.g. 
TableAssert.has_foreign_key_constraint, +ServiceAssert.has_entity_count, StoredProcedureAssert.has_code_containing) +stay on the subclass. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, ClassVar, Generic, TypeVar + +from metadata.ingestion.ometa.utils import model_str + +from .eventually import EventuallyRunner + +if TYPE_CHECKING: + from metadata.ingestion.ometa.ometa_api import OpenMetadata + +T = TypeVar("T") + + +class EntityAssert(Generic[T]): + """Base class carrying fluent terminals shared by every entity-assert.""" + + _entity_cls: type[T] + _default_fields: ClassVar[list[str]] = [] + + def __init__(self, om: OpenMetadata, fqn: str) -> None: + self._om = om + self._fqn = fqn + self._eventually = EventuallyRunner() + + def eventually(self, timeout: int = 60): + """One-shot: the next terminal polls until success/timeout.""" + self._eventually.arm(timeout) + return self + + def _fetch(self, *, fields: list[str] | None = None) -> T: + entity = self._om.get_by_name( + entity=self._entity_cls, + fqn=self._fqn, + fields=fields if fields is not None else self._default_fields, + ) + if entity is None: + raise AssertionError(f"{self._entity_cls.__name__} not found: {self._fqn}") + return entity + + def exists(self) -> None: + """Synchronous — primary API is consistent immediately post-ingest.""" + self._fetch() + + def get(self) -> T: + """Escape hatch — returns the raw Pydantic entity.""" + return self._fetch() + + def has_description_containing(self, text: str): + def _check() -> None: + entity = self._fetch() + desc = model_str(entity.description) if entity.description else "" + if text not in desc: + raise AssertionError( + f"{self._entity_cls.__name__} {self._fqn} description does not contain {text!r}. Actual: {desc!r}" + ) + + self._eventually.run(_check, name=f"has_description_containing({text!r})") + return self diff --git a/ingestion/tests/cli_e2e_v2/core/fluent/eventually.py b/ingestion/tests/cli_e2e_v2/core/fluent/eventually.py new file mode 100644 index 000000000000..f36fbb1dcf4f --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/fluent/eventually.py @@ -0,0 +1,120 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Polling primitives for fluent assertion chains. + +`retry_until` is the low-level deadline-based retry. `EventuallyRunner` is +a one-shot arming wrapper held by each fluent assert class to dispatch +terminal checks either synchronously or via `retry_until`. + +Logging levels: + - DEBUG first-attempt failure (the single "starting to retry" signal) + - INFO every attempt when E2E_POLL_VERBOSE=1 — surfaces intermittent + flakes that otherwise disappear into DEBUG. Use it in CI when + a poll is blinking without obvious cause. 
+ - ERROR final timeout +""" + +from __future__ import annotations + +import logging +import os +import time +from dataclasses import dataclass +from typing import TYPE_CHECKING, TypeVar + +if TYPE_CHECKING: + from collections.abc import Callable + +logger = logging.getLogger(__name__) + +DEFAULT_TIMEOUT_SECONDS = 60 +DEFAULT_POLL_INTERVAL_SECONDS = 2.0 + +T = TypeVar("T") + + +def _verbose_polling() -> bool: + """Reads E2E_POLL_VERBOSE at call time so the env var can be toggled + within a single pytest session via monkeypatch if needed.""" + return os.environ.get("E2E_POLL_VERBOSE", "").lower() in ("1", "true", "yes") + + +def retry_until( + check: Callable[[], T], + *, + timeout: int = DEFAULT_TIMEOUT_SECONDS, + poll_interval: float = DEFAULT_POLL_INTERVAL_SECONDS, + name: str = "check", +) -> T: + """Retry `check` until it returns without raising AssertionError. + + AssertionError signals "not ready yet"; any other exception propagates + immediately. Returns the first successful check's return value. + """ + start = time.monotonic() + deadline = start + timeout + attempts = 0 + verbose = _verbose_polling() + + while True: + attempts += 1 + try: + return check() + except AssertionError as exc: + if attempts == 1: + logger.debug( + "[eventually:%s] attempt %d failed: %s (retrying for up to %ds)", + name, + attempts, + exc, + timeout, + ) + if verbose: + elapsed = time.monotonic() - start + logger.info( + "[eventually:%s] attempt %d failed at %.1fs: %s", + name, + attempts, + elapsed, + exc, + ) + if time.monotonic() >= deadline: + elapsed = time.monotonic() - start + logger.error( + "[eventually:%s] gave up after %d attempts in %.1fs: %s", + name, + attempts, + elapsed, + exc, + ) + raise AssertionError( + f"eventually[{name}] timed out after {attempts} attempts " + f"over {elapsed:.1f}s (timeout={timeout}s, " + f"poll_interval={poll_interval}s).\n" + f"Last failure:\n{exc}" + ) from exc + time.sleep(poll_interval) + + +@dataclass +class EventuallyRunner: + """One-shot arming dispatcher shared by every fluent assert class. + + `.arm(timeout)` queues polling for the NEXT terminal; `.run` consumes + the arming and reverts to sync for subsequent calls. `.run` returns + whatever `check` returns — callers that don't need the value simply + ignore it (None-returning checks still type-check as `T=None`). + """ + + _timeout: int | None = None + + def arm(self, timeout: int) -> None: + self._timeout = timeout + + def run(self, check: Callable[[], T], *, name: str) -> T: + if self._timeout is not None: + timeout = self._timeout + self._timeout = None + return retry_until(check, timeout=timeout, name=name) + return check() diff --git a/ingestion/tests/cli_e2e_v2/core/fluent/lineage_assert.py b/ingestion/tests/cli_e2e_v2/core/fluent/lineage_assert.py new file mode 100644 index 000000000000..296fdf73b27a --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/fluent/lineage_assert.py @@ -0,0 +1,93 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. 
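+#
+# Usage sketch (FQNs illustrative):
+#
+#     om_client.table("svc.db.sch.orders").lineage.eventually(60).has_upstream("svc.db.sch.customers")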
+"""LineageAssert — polling-friendly lineage edge and column-lineage checks.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Literal + +from metadata.generated.schema.entity.data.table import Table + +from .eventually import EventuallyRunner + +if TYPE_CHECKING: + from metadata.ingestion.ometa.ometa_api import OpenMetadata + +_Direction = Literal["upstream", "downstream"] + + +class LineageAssert: + """Lineage namespace — reached via TableAssert.lineage. + + Lineage propagation is eventually-consistent; all terminals accept + `.eventually(timeout)` one-shot arming. + """ + + def __init__(self, om: OpenMetadata, table_fqn: str) -> None: + self._om = om + self._fqn = table_fqn + self._eventually = EventuallyRunner() + + def eventually(self, timeout: int = 60) -> LineageAssert: + self._eventually.arm(timeout) + return self + + def _lineage(self) -> dict: + return self._om.get_lineage_by_name(entity=Table, fqn=self._fqn) or {} + + def _check_edge(self, direction: _Direction, fqn: str) -> None: + """Match direction-typed edges; resolve UUID-only Edge endpoints via nodes/entity FQN map.""" + data = self._lineage() + nodes = data.get("nodes") or [] + central = data.get("entity") or {} + uuid_to_fqn: dict[str, str] = {} + for n in [*nodes, central]: + uid, ref_fqn = n.get("id"), n.get("fullyQualifiedName") + if uid and ref_fqn: + uuid_to_fqn[uid] = ref_fqn + counterpart_field = "fromEntity" if direction == "upstream" else "toEntity" + self_field = "toEntity" if direction == "upstream" else "fromEntity" + matched: set[str] = set() + for e in data.get(f"{direction}Edges") or []: + if uuid_to_fqn.get(e.get(self_field)) == self._fqn: + cp = uuid_to_fqn.get(e.get(counterpart_field)) + if cp: + matched.add(cp) + if fqn in matched: + return + nodes_fqns = sorted(uuid_to_fqn.values()) + raise AssertionError( + f"Table {self._fqn} has no {direction} {fqn!r}. " + f"{direction}Edges resolved to FQNs={sorted(matched)} nodes={nodes_fqns}" + ) + + def has_upstream(self, fqn: str) -> LineageAssert: + self._eventually.run( + lambda: self._check_edge("upstream", fqn), + name=f"has_upstream({fqn})", + ) + return self + + def has_downstream(self, fqn: str) -> LineageAssert: + self._eventually.run( + lambda: self._check_edge("downstream", fqn), + name=f"has_downstream({fqn})", + ) + return self + + def has_column_lineage(self, source: str, target: str) -> LineageAssert: + def _check() -> None: + data = self._lineage() + edges = (data.get("upstreamEdges") or []) + (data.get("downstreamEdges") or []) + for edge in edges: + lineage_details = edge.get("lineageDetails") or {} + for col_edge in lineage_details.get("columnsLineage") or []: + froms = col_edge.get("fromColumns") or [] + to = col_edge.get("toColumn") or "" + if any(source in f for f in froms) and target in to: + return + raise AssertionError(f"No column lineage {source!r} -> {target!r} on table {self._fqn}") + + self._eventually.run(_check, name=f"has_column_lineage({source}->{target})") + return self diff --git a/ingestion/tests/cli_e2e_v2/core/fluent/om_client.py b/ingestion/tests/cli_e2e_v2/core/fluent/om_client.py new file mode 100644 index 000000000000..d40acffcbe96 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/fluent/om_client.py @@ -0,0 +1,43 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Fluent entry point wrapping the existing OpenMetadata HTTP client. 
+
+Per Decision #21 of the v2 spec, OmClient is a thin facade — we do NOT build
+a new HTTP client. All actual REST calls delegate to
+metadata.ingestion.ometa.OpenMetadata, which already handles auth, retries,
+and Pydantic deserialization.
+
+OmClient's public surface is the fluent layer: .table(fqn), .service(name),
+.stored_procedure(fqn), plus .raw for escape-hatch tests that need the
+underlying client directly.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from .service_assert import ServiceAssert
+from .stored_procedure_assert import StoredProcedureAssert
+from .table_assert import TableAssert
+
+if TYPE_CHECKING:
+    from metadata.ingestion.ometa.ometa_api import OpenMetadata
+
+
+class OmClient:
+    def __init__(self, om: OpenMetadata) -> None:
+        self._om = om
+
+    @property
+    def raw(self) -> OpenMetadata:
+        return self._om
+
+    def table(self, fqn: str) -> TableAssert:
+        return TableAssert(self._om, fqn)
+
+    def service(self, name: str) -> ServiceAssert:
+        return ServiceAssert(self._om, name)
+
+    def stored_procedure(self, fqn: str) -> StoredProcedureAssert:
+        return StoredProcedureAssert(self._om, fqn)
diff --git a/ingestion/tests/cli_e2e_v2/core/fluent/profile_assert.py b/ingestion/tests/cli_e2e_v2/core/fluent/profile_assert.py
new file mode 100644
index 000000000000..1a0dc1a88158
--- /dev/null
+++ b/ingestion/tests/cli_e2e_v2/core/fluent/profile_assert.py
@@ -0,0 +1,181 @@
+# Copyright 2026 Collate
+# Licensed under the Collate Community License, Version 1.0 (the "License");
+# you may not use this file except in compliance with the License.
+"""ProfileAssert + ColumnProfileAssert + NumericAssert.
+
+Two assertion surfaces:
+  - table-level: row count via `.profile.eventually().row_count().equals(N)`
+  - column-level: arbitrary metric subset via
+    `.profile.eventually().column(name).has_metrics(min=600, max=750, ...)`
+
+Both share the same poll-and-fetch primitive — when armed with
+`.eventually(timeout)`, the profile is polled until present. Column-
+metric assertion uses kwargs that map 1:1 to OM's ColumnProfile field
+names; an unknown kwarg raises so a typo doesn't silently pass.
+"""
+
+from __future__ import annotations
+
+from decimal import Decimal
+from typing import TYPE_CHECKING, Any
+
+from metadata.ingestion.ometa.utils import model_str
+
+from .._om_compat import unwrap_root_list
+from .eventually import EventuallyRunner
+
+if TYPE_CHECKING:
+    from metadata.generated.schema.entity.data.table import ColumnProfile, Table
+    from metadata.ingestion.ometa.ometa_api import OpenMetadata
+
+
+class NumericAssert:
+    """Terminal numeric comparators for a single metric value."""
+
+    def __init__(self, value: int | float | None, *, label: str) -> None:
+        self._value = value
+        self._label = label
+
+    def at_least(self, n: int) -> None:
+        if self._value is None or self._value < n:
+            raise AssertionError(f"{self._label}: expected >= {n}, got {self._value}")
+
+    def equals(self, n: int) -> None:
+        if self._value != n:
+            raise AssertionError(f"{self._label}: expected {n}, got {self._value}")
+
+    def between(self, lo: int, hi: int) -> None:
+        if self._value is None or not (lo <= self._value <= hi):
+            raise AssertionError(f"{self._label}: expected in [{lo}, {hi}], got {self._value}")
+
+
+class ColumnProfileAssert:
+    """Per-column profile assertions reached via
+    `.profile.eventually().column(name)`.
+
+    `has_metrics(**expected)` accepts any subset of OM's ColumnProfile
+    field names as kwargs (e.g. `min=600, max=750, distinctCount=5,
+    nullCount=0, mean=680`). Each kwarg is compared against the
+    corresponding profile field; numeric values that come back as
+    Decimal/float are normalized for the comparison.
+    """
+
+    def __init__(
+        self,
+        om: OpenMetadata,
+        table_fqn: str,
+        column_name: str,
+        runner: EventuallyRunner,
+    ) -> None:
+        self._om = om
+        self._fqn = table_fqn
+        self._column_name = column_name
+        self._eventually = runner
+
+    def has_metrics(self, **expected: Any) -> ColumnProfileAssert:
+        """Assert each given metric matches the column's actual profile.
+
+        Unknown kwargs (typos / fields the OM Pydantic model doesn't
+        carry) raise immediately so a misspelled metric name fails loud
+        rather than silently passing.
+        """
+        if not expected:
+            raise ValueError("has_metrics requires at least one kwarg")
+        label = f"column_profile({self._fqn}.{self._column_name})"
+
+        def _check() -> None:
+            col = self._fetch_column_profile()
+            mismatches: list[str] = []
+            for field, want in expected.items():
+                if not hasattr(col, field):
+                    raise AssertionError(
+                        f"{label}: unknown ColumnProfile field {field!r}. "
+                        f"Available fields: "
+                        f"{sorted(col.model_fields.keys())}"
+                    )
+                got = getattr(col, field)
+                if not _values_match(got, want):
+                    mismatches.append(f"{field}: expected {want!r}, got {got!r}")
+            if mismatches:
+                raise AssertionError(f"{label} metric mismatches:\n  " + "\n  ".join(mismatches))
+
+        self._eventually.run(_check, name=f"has_metrics({sorted(expected)})")
+        return self
+
+    def _fetch_column_profile(self) -> ColumnProfile:
+        table = self._om.get_latest_table_profile(self._fqn)
+        if table is None:
+            raise AssertionError(f"Table not found: {self._fqn}")
+        for c in unwrap_root_list(table.columns):
+            if model_str(c.name) == self._column_name:
+                if c.profile is None:
+                    raise AssertionError(f"Column {self._fqn}.{self._column_name} has no profile yet")
+                return c.profile
+        raise AssertionError(f"Column {self._column_name!r} not found on table {self._fqn}")
+
+
+class ProfileAssert:
+    """Profile namespace — reached via TableAssert.profile.
+
+    Profiler output is eventually-consistent; `.row_count()` and
+    `.column(name)` both compose with `.eventually()` by polling until
+    the data is available.
+    """
+
+    def __init__(self, om: OpenMetadata, table_fqn: str) -> None:
+        self._om = om
+        self._fqn = table_fqn
+        self._eventually = EventuallyRunner()
+
+    def eventually(self, timeout: int = 60) -> ProfileAssert:
+        self._eventually.arm(timeout)
+        return self
+
+    def _fetch_profile(self) -> Table:
+        table = self._om.get_latest_table_profile(self._fqn)
+        if table is None:
+            raise AssertionError(f"Table not found: {self._fqn}")
+        if table.profile is None:
+            raise AssertionError(f"Table {self._fqn} has no profile data")
+        return table
+
+    def row_count(self) -> NumericAssert:
+        """Extract rowCount from the profile, returning a NumericAssert.
+
+        When armed via `.eventually()`, polls until `profile.rowCount` is
+        non-None, then constructs NumericAssert with the polled value.
+        """
+        label = f"rowCount({self._fqn})"
+
+        def _get() -> int:
+            table = self._fetch_profile()
+            if table.profile.rowCount is None:
+                raise AssertionError(f"{label}: no rowCount yet")
+            return int(table.profile.rowCount)
+
+        value = self._eventually.run(_get, name=label)
+        return NumericAssert(value, label=label)
+
+    def column(self, name: str) -> ColumnProfileAssert:
+        """Reach a ColumnProfileAssert scoped to the given column.
+ + Inherits the parent ProfileAssert's arm — calling + `.profile.eventually().column(...)` makes the next column-level + terminal poll, just like `.row_count()` does. + """ + return ColumnProfileAssert(self._om, self._fqn, name, runner=self._eventually) + + +def _values_match(actual: Any, expected: Any) -> bool: + """Compare profile-metric values tolerating Decimal/float/int crossover. + + OM serializes numeric profile metrics as Decimal in some cases and + float in others; tests want to write `min=600` without thinking + about which path the value took. Falls back to == for non-numeric + types (strings, None, bools). + """ + if actual is None: + return False + if isinstance(actual, (Decimal, float, int)) and isinstance(expected, (Decimal, float, int)): + return float(actual) == float(expected) + return actual == expected diff --git a/ingestion/tests/cli_e2e_v2/core/fluent/service_assert.py b/ingestion/tests/cli_e2e_v2/core/fluent/service_assert.py new file mode 100644 index 000000000000..2ca0b6d59713 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/fluent/service_assert.py @@ -0,0 +1,63 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""ServiceAssert — database-service-level fluent checks.""" + +from __future__ import annotations + +from typing import Literal + +from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema +from metadata.generated.schema.entity.data.table import Table +from metadata.generated.schema.entity.services.databaseService import DatabaseService + +from .entity_assert import EntityAssert + +_ENTITY_COUNT_LIMIT = 1000 + + +class ServiceAssert(EntityAssert[DatabaseService]): + """Service namespace — reached via OmClient.service(name). + + Provides smoke-level checks beyond the shared base: bulk entity counts. + Inherits exists / get / eventually / has_description_containing. + """ + + _entity_cls = DatabaseService + + def _count_entities(self, kind: Literal["tables", "schemas"]) -> int: + self._fetch() + entity_cls = Table if kind == "tables" else DatabaseSchema + items = list( + self._om.list_all_entities( + entity=entity_cls, + limit=_ENTITY_COUNT_LIMIT, + params={"service": self._fqn}, + ) + ) + return len(items) + + def has_entity_count( + self, + kind: Literal["tables", "schemas"], + *, + at_least: int, + ) -> None: + """Assert the service has at least `at_least` entities of `kind`. + + Raises ValueError when `at_least` exceeds the list_all_entities cap + (pagination is not implemented at this assertion level). + """ + if at_least > _ENTITY_COUNT_LIMIT: + raise ValueError( + f"has_entity_count(at_least={at_least}) exceeds the " + f"list_all_entities cap ({_ENTITY_COUNT_LIMIT}); pagination " + f"is not implemented for this assertion." 
+ ) + + def _check() -> None: + actual = self._count_entities(kind) + if actual < at_least: + raise AssertionError(f"Service {self._fqn}: expected >= {at_least} {kind}, got {actual}") + + self._eventually.run(_check, name=f"has_entity_count({kind},{at_least})") diff --git a/ingestion/tests/cli_e2e_v2/core/fluent/stored_procedure_assert.py b/ingestion/tests/cli_e2e_v2/core/fluent/stored_procedure_assert.py new file mode 100644 index 000000000000..50a6efeb6e07 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/fluent/stored_procedure_assert.py @@ -0,0 +1,36 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""StoredProcedureAssert — fluent assertions on stored procedure entities.""" + +from __future__ import annotations + +from metadata.generated.schema.entity.data.storedProcedure import StoredProcedure + +from .entity_assert import EntityAssert + + +class StoredProcedureAssert(EntityAssert[StoredProcedure]): + """Fluent assertions on a single stored procedure by FQN. + + Inherits exists / get / eventually / has_description_containing from + EntityAssert; adds `has_code_containing` which reads the SP body. + """ + + _entity_cls = StoredProcedure + + def has_code_containing(self, text: str) -> StoredProcedureAssert: + """Assert the stored procedure's SQL body contains the given substring.""" + + def _check() -> None: + sp = self._fetch() + code = "" + if sp.storedProcedureCode is not None and sp.storedProcedureCode.code is not None: + code = sp.storedProcedureCode.code + if text not in code: + raise AssertionError( + f"StoredProcedure {self._fqn} code does not contain {text!r}. Actual code: {code!r}" + ) + + self._eventually.run(_check, name=f"has_code_containing({text!r})") + return self diff --git a/ingestion/tests/cli_e2e_v2/core/fluent/table_assert.py b/ingestion/tests/cli_e2e_v2/core/fluent/table_assert.py new file mode 100644 index 000000000000..4ecae1af58b1 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/fluent/table_assert.py @@ -0,0 +1,266 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""TableAssert + ColumnAssert — fluent assertions on Table entities. + +TableAssert inherits shared fluent surface (exists / get / eventually / +has_description_containing) from `EntityAssert[Table]`. Entity-specific +terminals (tags, owners, FK constraint, column descent, lineage/profile +namespaces) live here. + +ColumnAssert is synchronous — column checks on fresh ingests are reliable +in practice; polling chains off TableAssert. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, ClassVar + +from metadata.generated.schema.entity.data.table import ( + Column, + ConstraintType, + DataType, + Table, + TableConstraint, +) +from metadata.ingestion.ometa.utils import model_str + +from .._om_compat import unwrap_root_list +from .entity_assert import EntityAssert +from .lineage_assert import LineageAssert +from .profile_assert import ProfileAssert + +if TYPE_CHECKING: + from metadata.ingestion.ometa.ometa_api import OpenMetadata + + +def _fk_matches( + constraint: TableConstraint, + column: str, + referenced_table: str, + referenced_column: str, +) -> bool: + """True if `constraint` is a FOREIGN_KEY on `column` pointing at the + named referred column. 
+ + The referredColumns FQNs may be rendered as either the full + `service.database.schema.table.column` form or the shorter + `table.column` form depending on how OM resolved them at ingest time; + both are accepted via tail-match. + """ + if constraint.constraintType != ConstraintType.FOREIGN_KEY: + return False + own_cols = {model_str(x) for x in unwrap_root_list(constraint.columns)} + if column not in own_cols: + return False + wanted_tail = f".{referenced_table}.{referenced_column}" + wanted_short = f"{referenced_table}.{referenced_column}" + return any( + model_str(ref).endswith(wanted_tail) or model_str(ref) == wanted_short + for ref in unwrap_root_list(constraint.referredColumns) + ) + + +class TableAssert(EntityAssert[Table]): + """Fluent assertions on a single Table identified by FQN.""" + + _entity_cls = Table + _default_fields: ClassVar[list[str]] = ["tags", "owners", "columns"] + + # --- terminals ---------------------------------------------------- + + def has_tag(self, fqn: str) -> TableAssert: + def _check() -> None: + table = self._fetch() + actual = {model_str(t.tagFQN) for t in unwrap_root_list(table.tags)} + if fqn not in actual: + raise AssertionError(f"Table {self._fqn} missing tag {fqn!r}. Actual tags: {sorted(actual)}") + + self._eventually.run(_check, name=f"has_tag({fqn})") + return self + + def has_owner(self, name: str) -> TableAssert: + def _check() -> None: + table = self._fetch() + actual = {o.name for o in unwrap_root_list(table.owners)} + if name not in actual: + raise AssertionError(f"Table {self._fqn} missing owner {name!r}. Actual owners: {sorted(actual)}") + + self._eventually.run(_check, name=f"has_owner({name})") + return self + + def has_foreign_key_constraint( + self, + column: str, + referenced_table: str, + referenced_column: str, + ) -> TableAssert: + """Assert the table carries a FOREIGN_KEY TableConstraint on `column` + pointing at `referenced_table.referenced_column`. + + MySQL lands FK data here — not as a lineage edge. Matching delegates + to `_fk_matches`. + """ + + def _check() -> None: + constraints = unwrap_root_list(self._fetch(fields=["tableConstraints"]).tableConstraints) + if any(_fk_matches(c, column, referenced_table, referenced_column) for c in constraints): + return + raise AssertionError( + f"Table {self._fqn} missing FOREIGN_KEY({column}) -> " + f"{referenced_table}({referenced_column}). " + f"Constraints present: {constraints!r}" + ) + + self._eventually.run( + _check, + name=f"has_foreign_key_constraint({column}->{referenced_table}.{referenced_column})", + ) + return self + + def has_schema_definition_containing(self, text: str) -> TableAssert: + """Assert `schemaDefinition` (raw DDL stored on the entity) contains + `text` — case-insensitive substring match. + + Populated for views when metadata ingest runs with `includeDDL=True`, + and for tables when the connector emits CREATE TABLE bodies. Used as + the prerequisite check that ingest actually plumbed DDL through — + a failed lineage parse with empty `schemaDefinition` is a different + bug than a failed parse on present DDL. + + Case insensitivity matters: MySQL normalizes view DDL to lowercase + (`left join`, not `LEFT JOIN`); other dialects preserve case. The + assertion keeps tests portable across dialects without each one + having to know the specific casing. 
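+
+        Example (FQN illustrative):
+
+            om_client.table(view_fqn).has_schema_definition_containing("left join")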
+ """ + wanted_lower = text.lower() + + def _check() -> None: + entity = self._fetch(fields=["schemaDefinition"]) + actual = model_str(entity.schemaDefinition) if entity.schemaDefinition else "" + if wanted_lower not in actual.lower(): + raise AssertionError( + f"Table {self._fqn} schemaDefinition does not contain " + f"{text!r} (case-insensitive). Actual: {actual!r}" + ) + + self._eventually.run(_check, name=f"has_schema_definition_containing({text!r})") + return self + + def is_soft_deleted(self) -> TableAssert: + """Assert the table exists in OM but is marked `deleted=True`. + + Soft-deleted entities are filtered out of `get_by_name` by default. + We use `list_entities` with `include=all` to find the entity even + when soft-deleted, then check the `deleted` field. Used by + mark-deleted tests after re-ingest with `markDeletedTables=True`. + """ + + def _check() -> None: + if not self._fetch_any_state().deleted: + raise AssertionError(f"Table {self._fqn} is not soft-deleted (deleted=False)") + + self._eventually.run(_check, name="is_soft_deleted") + return self + + def is_not_deleted(self) -> TableAssert: + """Assert the table exists in OM with `deleted=False`.""" + + def _check() -> None: + entity = self._fetch_any_state() + if entity.deleted: + raise AssertionError(f"Table {self._fqn} is unexpectedly soft-deleted (deleted=True)") + + self._eventually.run(_check, name="is_not_deleted") + return self + + def _fetch_any_state(self) -> Table: + """Fetch the table including soft-deleted state (default get_by_name + filters those out).""" + entity = self._om.get_by_name( + entity=Table, + fqn=self._fqn, + fields=["deleted"], + include="all", + ) + if entity is None: + raise AssertionError(f"Table not found (in any state): {self._fqn}") + return entity + + # --- descent into column / namespaces ----------------------------- + + def column(self, name: str) -> ColumnAssert: + return ColumnAssert(self._om, self._fqn, name) + + @property + def lineage(self) -> LineageAssert: + return LineageAssert(self._om, self._fqn) + + @property + def profile(self) -> ProfileAssert: + return ProfileAssert(self._om, self._fqn) + + +class ColumnAssert: + """Synchronous assertions on a named column of a Table.""" + + def __init__(self, om: OpenMetadata, table_fqn: str, column_name: str) -> None: + self._om = om + self._table_fqn = table_fqn + self._column_name = column_name + + def _fetch_column(self) -> Column: + table = self._om.get_by_name( + entity=Table, + fqn=self._table_fqn, + fields=["tags", "columns"], + ) + if table is None: + raise AssertionError(f"Table not found: {self._table_fqn}") + for c in unwrap_root_list(table.columns): + if model_str(c.name) == self._column_name: + return c + raise AssertionError(f"Column {self._column_name!r} not found on table {self._table_fqn}") + + def has_tag(self, fqn: str) -> ColumnAssert: + column = self._fetch_column() + actual = {model_str(t.tagFQN) for t in unwrap_root_list(column.tags)} + if fqn not in actual: + raise AssertionError( + f"Column {self._table_fqn}.{self._column_name} missing tag {fqn!r}. Actual tags: {sorted(actual)}" + ) + return self + + def has_no_tag(self, fqn: str) -> ColumnAssert: + """Assert the column does NOT carry the given tag. + + Used as the negative complement to `has_tag` — guards against + regressions where a classifier becomes overconfident and tags + non-PII columns. Without this, a positive-only suite passes + cleanly even when every column gets PII-flagged. 
+ """ + column = self._fetch_column() + actual = {model_str(t.tagFQN) for t in unwrap_root_list(column.tags)} + if fqn in actual: + raise AssertionError( + f"Column {self._table_fqn}.{self._column_name} unexpectedly " + f"carries tag {fqn!r}. Actual tags: {sorted(actual)}" + ) + return self + + def has_type(self, data_type: DataType) -> ColumnAssert: + column = self._fetch_column() + if column.dataType != data_type: + raise AssertionError( + f"Column {self._table_fqn}.{self._column_name} has type {column.dataType}, expected {data_type}" + ) + return self + + def has_description_containing(self, text: str) -> ColumnAssert: + column = self._fetch_column() + desc = model_str(column.description) if column.description else "" + if text not in desc: + raise AssertionError( + f"Column {self._table_fqn}.{self._column_name} description does not contain {text!r}. Actual: {desc!r}" + ) + return self diff --git a/ingestion/tests/cli_e2e_v2/core/runner/__init__.py b/ingestion/tests/cli_e2e_v2/core/runner/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/ingestion/tests/cli_e2e_v2/core/runner/cli_runner.py b/ingestion/tests/cli_e2e_v2/core/runner/cli_runner.py new file mode 100644 index 000000000000..b923bdecab0c --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/runner/cli_runner.py @@ -0,0 +1,174 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Runs `metadata ` via subprocess and returns a typed Status. + +One CliRunner per test, bound to tmp_path. Each `.run()` writes numbered +cfg / status / stdout artifacts. Subprocess has a bounded timeout +(default 600s, kwarg override). CliExecutionError carries exit_code, +stderr, stdout, config_path, and argv for post-mortem. +""" + +from __future__ import annotations + +import json +import logging +import subprocess +from typing import TYPE_CHECKING + +from .errors import CliExecutionError +from .status import Status + +if TYPE_CHECKING: + from pathlib import Path + + from ..config.builder import WorkflowConfig + +logger = logging.getLogger(__name__) + +DEFAULT_TIMEOUT_SECONDS = 600 + +# Cap inline step failures GLOBALLY across all steps. Failures cascade — +# step 1's first failure is overwhelmingly the root cause, step 5's third +# failure is downstream noise. The full list is in the status JSON for +# deep dives. +_INLINE_FAILURES_LIMIT = 3 +_INLINE_FAILURE_CHARS = 500 + + +def _summarize_step_failures(status_path: Path) -> str | None: + """Best-effort: read the status JSON and pull out the first few step + failures (across all steps, capped globally) as a short, scannable + block. Returns None on any read / parse failure — caller falls back + to the raw stdout/stderr dump. 
+
+    Output shape (one line per failure, truncated):
+        [StepName::FailureName] first-line-of-error…
+    """
+    if not status_path.exists():
+        return None
+    try:
+        data = json.loads(status_path.read_text())
+    except (json.JSONDecodeError, OSError):
+        return None
+
+    lines: list[str] = []
+    for step in data.get("steps") or []:
+        step_name = step.get("name", "?")
+        for failure in step.get("failures") or []:
+            if len(lines) >= _INLINE_FAILURES_LIMIT:
+                return "\n".join(lines)
+            name = failure.get("name", "?")
+            # `error` may be missing, None, or empty; "".splitlines() is [],
+            # so guard before indexing the first line.
+            error_lines = (failure.get("error") or "").splitlines()
+            err = error_lines[0][:_INLINE_FAILURE_CHARS] if error_lines else ""
+            lines.append(f"  [{step_name}::{name}] {err}")
+    return "\n".join(lines) if lines else None
+
+
+class CliRunner:
+    """Runs `metadata <subcommand>` via subprocess and returns a typed Status.
+
+    Usage:
+        runner = CliRunner(tmp_path)
+        status = runner.run(cfg)  # ingest
+        status2 = runner.run(cfg.pipeline(Profiler...))  # profile
+    """
+
+    def __init__(self, tmp_path: Path) -> None:
+        self.tmp_path = tmp_path
+        self._invocation_counter: dict[str, int] = {}
+
+    def run(
+        self,
+        config: WorkflowConfig,
+        *,
+        timeout: int = DEFAULT_TIMEOUT_SECONDS,
+    ) -> Status:
+        identifier = config.pipeline_identifier
+        n = self._invocation_counter.get(identifier, 0)
+        self._invocation_counter[identifier] = n + 1
+
+        cfg_path = config.write_tmp(self.tmp_path, invocation=n)
+        status_path = self.tmp_path / f"status_{identifier}_{n}.json"
+        stdout_path = self.tmp_path / f"stdout_{identifier}_{n}.log"
+
+        command = [
+            "metadata",
+            config.cli_subcommand,
+            "-c",
+            str(cfg_path),
+            "--status-file",
+            str(status_path),
+        ]
+
+        try:
+            result = subprocess.run(
+                command,
+                capture_output=True,
+                text=True,
+                check=False,
+                timeout=timeout,
+            )
+        except subprocess.TimeoutExpired as exc:
+            # exc.stdout / exc.stderr may be bytes or str depending on
+            # capture config — normalize to str for consistent logging.
+            out = _coerce_text(exc.stdout)
+            err = _coerce_text(exc.stderr)
+            stdout_path.write_text(out)
+            raise CliExecutionError(
+                exit_code=-1,
+                stderr=(f"CLI timed out after {timeout}s.\nstderr so far:\n{err}"),
+                stdout=out,
+                config_path=cfg_path,
+                status_path=status_path,
+                command=command,
+            ) from exc
+
+        # Persist stdout unconditionally — useful for debugging both
+        # successful runs (checking warnings) and failed ones.
+        stdout_path.write_text(result.stdout or "")
+
+        # One line with the three artifact paths. Invaluable for post-mortem
+        # because pytest's tmp_path lives under a deep auto-generated dir.
+        logger.info(
+            "[cli] %s invocation=%d exit=%d cfg=%s status=%s stdout=%s",
+            identifier,
+            n,
+            result.returncode,
+            cfg_path,
+            status_path,
+            stdout_path,
+        )
+
+        if result.returncode != 0:
+            raise CliExecutionError(
+                exit_code=result.returncode,
+                stderr=result.stderr,
+                stdout=result.stdout,
+                config_path=cfg_path,
+                status_path=status_path,
+                command=command,
+                step_failures_summary=_summarize_step_failures(status_path),
+            )
+
+        # Defensive: CLI reported success but wrote no status file — something
+        # broke between workflow completion and file emission (e.g., a future
+        # BaseWorkflow.write_status_file regression).
+ if not status_path.exists(): + raise CliExecutionError( + exit_code=0, + stderr=(f"CLI exited 0 but no status file was written at {status_path}."), + stdout=result.stdout, + config_path=cfg_path, + status_path=status_path, + command=command, + ) + + return Status.from_json(status_path) + + +def _coerce_text(value: object) -> str: + if value is None: + return "" + if isinstance(value, bytes): + return value.decode("utf-8", errors="replace") + return str(value) diff --git a/ingestion/tests/cli_e2e_v2/core/runner/errors.py b/ingestion/tests/cli_e2e_v2/core/runner/errors.py new file mode 100644 index 000000000000..cf72be9865e0 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/runner/errors.py @@ -0,0 +1,105 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Errors raised by the CLI runner and baseline layers. + +`E2ESetupError` is the shared base for any exception that signals +"the test couldn't run" (CLI failure, source baseline drift in +check_only mode, JWT mint failure). It inherits `Exception` — NOT +`AssertionError` — so pytest reports these as test errors (E) rather +than test failures (F). Assertion failures (`StructuralMismatch` in +`core/expected/differ.py`) keep the `AssertionError` lineage so pytest +renders their diffs with introspection. + +Two-category rule: + - E2ESetupError (→ Exception) : infrastructure couldn't complete → E + - AssertionError : test assertion failed → F + +Downstream code that wants to catch any setup failure (e.g. a retry +wrapper, a diagnostic collector) imports E2ESetupError rather than +enumerating the concrete subclasses. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from pathlib import Path + + +class E2ESetupError(Exception): + """Base for setup-phase failures — the test couldn't run as intended. + + Subclassed by every exception that signals infrastructure trouble: + CLI subprocess failure, source-baseline drift, JWT mint failure. + """ + + +class CliExecutionError(E2ESetupError): + """Raised when `metadata` CLI subprocess exits with a non-zero code OR + times out OR completes successfully without writing a status file. 
+ + Carries full diagnostic context so pytest's default failure rendering + surfaces everything a developer needs for post-mortem: + - exit_code: subprocess return code (-1 for a timeout) + - stderr: complete captured stderr + - stdout: complete captured stdout (often carries step-level + progress logs the CLI doesn't persist elsewhere) + - config_path: rendered YAML location — survives test teardown via + pytest's tmp_path + - status_path: path where the status JSON was expected to land — + included even when the file wasn't written so a + developer can inspect the (existing or missing) file + directly from the failure message + - command: full argv of the subprocess + """ + + def __init__( + self, + exit_code: int, + stderr: str, + config_path: Path, + command: list[str], + stdout: str = "", + status_path: Path | None = None, + step_failures_summary: str | None = None, + ) -> None: + self.exit_code = exit_code + self.stderr = stderr + self.stdout = stdout + self.config_path = config_path + self.status_path = status_path + self.command = command + self.step_failures_summary = step_failures_summary + + status_line = ( + f" status: {status_path} (exists={status_path.exists() if status_path else 'n/a'})\n" + if status_path is not None + else "" + ) + # Surface extracted step failures above the raw stdout/stderr dump so + # a developer scanning the exception sees the actionable content + # first — the wall of capture logs is still below for deep dives. + failures_block = ( + f" step failures (from status file):\n{step_failures_summary}\n" if step_failures_summary else "" + ) + super().__init__( + f"metadata CLI exited with code {exit_code}\n" + f" command: {' '.join(command)}\n" + f" config: {config_path}\n" + f"{status_line}" + f"{failures_block}" + f" stdout:\n{stdout}\n" + f" stderr:\n{stderr}" + ) + + +class SourceBaselineDrift(E2ESetupError): # noqa: N818 (intentional API surface — public exception name) + """Raised by `ensure_baseline` when source state does not match the declared + baseline in check_only mode. + + Cloud sources default to check_only so we never mutate shared resources; when + drift is detected, the test setup fails loudly with operator instructions + rather than silently diverging. + """ diff --git a/ingestion/tests/cli_e2e_v2/core/runner/status.py b/ingestion/tests/cli_e2e_v2/core/runner/status.py new file mode 100644 index 000000000000..17fa7189a2bb --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/runner/status.py @@ -0,0 +1,91 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Typed status contract between CliRunner and tests. + +Mirrors the JSON shape written by BaseWorkflow.write_status_file (see +ingestion/src/metadata/workflow/base.py). Observed live in smoke testing: + { + "pipeline_type": "mysql", + "ingestion_pipeline_fqn": null, + "success": true, + "steps": [ + {"name": "Mysql", "records": 178, "updated_records": 47, + "warnings": 0, "errors": 0, "filtered": 0, + "failures": null, "progress": null, "operationMetrics": null, + "sourceTimeMs": null, "sinkTimeMs": null}, + ... 
+ ] + } + +Parsing contract: + - required keys must be present (pipeline_type, success, steps) + - required step keys must be present (name, records, updated_records, + warnings, errors, filtered, failures) + - step `failures` may be `null` (mapped to empty list) or a list of dicts + - A schema change on the CLI side surfaces as a KeyError at parse time, + not a silent mis-count — the test halts loudly rather than passing with + zeroes it inferred from missing keys. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from pathlib import Path + + +@dataclass(frozen=True) +class StepStatus: + name: str + records: int + updated_records: int + warnings: int + errors: int + filtered: int + failures: list[dict] = field(default_factory=list) + + @classmethod + def from_dict(cls, step: dict[str, Any]) -> StepStatus: + return cls( + name=str(step["name"]), + records=int(step["records"] or 0), + updated_records=int(step["updated_records"] or 0), + warnings=int(step["warnings"] or 0), + errors=int(step["errors"] or 0), + filtered=int(step["filtered"] or 0), + failures=list(step["failures"] or []), + ) + + +@dataclass(frozen=True) +class Status: + pipeline_type: str + ingestion_pipeline_fqn: str | None + success: bool + steps: list[StepStatus] + + @classmethod + def from_json(cls, path: Path) -> Status: + data: dict[str, Any] = json.loads(path.read_text()) + return cls( + pipeline_type=str(data["pipeline_type"]), + ingestion_pipeline_fqn=data.get("ingestion_pipeline_fqn"), + success=bool(data["success"]), + steps=[StepStatus.from_dict(s) for s in (data.get("steps") or [])], + ) + + @property + def all_failures(self) -> list[dict]: + """Flat list of failure detail dicts across all steps.""" + return [f for step in self.steps for f in step.failures] + + def step(self, name: str) -> StepStatus | None: + """Look up a step by name (e.g. 'Mysql', 'OpenMetadata', 'Profiler').""" + for s in self.steps: + if s.name == name: + return s + return None diff --git a/ingestion/tests/cli_e2e_v2/core/source/__init__.py b/ingestion/tests/cli_e2e_v2/core/source/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/ingestion/tests/cli_e2e_v2/core/source/common_baseline.py b/ingestion/tests/cli_e2e_v2/core/source/common_baseline.py new file mode 100644 index 000000000000..bad6d41697f8 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/source/common_baseline.py @@ -0,0 +1,258 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Common portable baseline — tables + seed data shared across SQL dialects. + +`build_common_metadata(schema)` returns a SQLAlchemy `MetaData` with the +portable tables (`customers`, `transactions`) declared via Core types so +`metadata.create_all(conn)` emits dialect-correct DDL everywhere. + +`COMMON_CUSTOMER_ROWS` and `COMMON_TRANSACTION_ROWS` are the portable seed +data. Dialects consume them through a TableSeed with a dialect-specific +`insert_sql` template (`ON DUPLICATE KEY UPDATE` for MySQL, `ON CONFLICT +DO UPDATE` for Postgres, etc.) — the base enforcer runs the template +against these rows via SQLAlchemy's executemany binding, no dialect +branching needed. + +Dialect-specific tables (e.g., MySQL's `all_types`) live in each +connector's baseline module and are added to its extended MetaData. 
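+
+Connector-side usage sketch (the extra `all_types` table is illustrative,
+not part of the common baseline):
+
+    md = build_common_metadata(schema="e2e")
+    Table("all_types", md, Column("id", Integer, primary_key=True))
+    baseline = SqlSourceBaseline(schemas=["e2e"], metadata=md)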
+""" + +from __future__ import annotations + +from datetime import date, datetime +from decimal import Decimal +from typing import Any + +from sqlalchemy import ( + CHAR, + BigInteger, + Boolean, + Column, + Date, + DateTime, + ForeignKey, + Integer, + MetaData, + Numeric, + String, + Table, + Text, +) + + +def build_common_metadata(schema: str = "e2e") -> MetaData: + """Build a MetaData carrying portable tables (customers, transactions). + + Connector baselines call this and may add dialect-specific tables to the + returned object before handing it to `SqlSourceBaseline`. + """ + md = MetaData(schema=schema) + + Table( + "customers", + md, + Column("id", Integer, primary_key=True, nullable=False, comment="Primary key identifying the customer."), + Column("first_name", String(50), nullable=False, comment="Customer first name."), + Column("last_name", String(50), nullable=False), + Column("full_name", String(100), nullable=False), + Column("email", String(255), nullable=False, comment="Customer email address."), + Column("address", String(255), nullable=True), + Column("city", String(100), nullable=True), + Column("country", String(100), nullable=True), + Column("zipcode", String(20), nullable=True), + Column("date_of_birth", Date, nullable=True), + Column("age", Integer, nullable=True), + Column("credit_score", Integer, nullable=True), + Column("status", String(20), nullable=False), + Column("is_active", Boolean, nullable=False), + Column("bio", Text, nullable=True), + Column("joined_date", Date, nullable=False), + comment="Customer master table used by CLI E2E v2 MySQL pilot.", + ) + + Table( + "transactions", + md, + Column("id", BigInteger, primary_key=True, nullable=False), + Column( + "customer_id", + Integer, + ForeignKey(f"{schema}.customers.id"), + nullable=False, + comment="FK referencing e2e.customers.id.", + ), + Column("amount", Numeric(10, 2), nullable=False, comment="Transaction amount in the ticker currency."), + Column("currency", CHAR(3), nullable=False), + Column("exchange_rate", Numeric(10, 4), nullable=True), + Column("status", String(20), nullable=False), + Column("txn_at", DateTime, nullable=False), + Column("reference_number", CHAR(12), nullable=False), + Column("ip_address", String(45), nullable=True), + Column("notes", Text, nullable=True), + comment="Customer transaction events with FK to customers.id.", + ) + + return md + + +# ----------------------------------------------------------------------------- +# Seed data — portable Python values. Dialects bind via :key placeholders. 
+# ----------------------------------------------------------------------------- + +COMMON_CUSTOMER_ROWS: list[dict[str, Any]] = [ + { + "id": 1, + "first_name": "Alice", + "last_name": "Anderson", + "full_name": "Alice Anderson", + "email": "alice@test.com", + "address": "100 Main St", + "city": "Springfield", + "country": "USA", + "zipcode": "11111", + "date_of_birth": date(1990, 1, 15), + "age": 36, + "credit_score": 720, + "status": "active", + "is_active": True, + "bio": "Loyal customer since 2026.", + "joined_date": date(2026, 1, 1), + }, + { + "id": 2, + "first_name": "Bob", + "last_name": "Brown", + "full_name": "Bob Brown", + "email": "bob@test.com", + "address": "200 Oak Ave", + "city": "Portland", + "country": "USA", + "zipcode": "22222", + "date_of_birth": date(1985, 3, 20), + "age": 41, + "credit_score": 680, + "status": "active", + "is_active": True, + "bio": None, + "joined_date": date(2026, 1, 2), + }, + { + "id": 3, + "first_name": "Charlie", + "last_name": "Chen", + "full_name": "Charlie Chen", + "email": "charlie@test.com", + "address": "300 Pine Rd", + "city": "Seattle", + "country": "USA", + "zipcode": "33333", + "date_of_birth": date(1992, 6, 10), + "age": 34, + "credit_score": 650, + "status": "inactive", + "is_active": False, + "bio": "Churned in Q2 2026.", + "joined_date": date(2026, 1, 3), + }, + { + "id": 4, + "first_name": "Diana", + "last_name": "Davis", + "full_name": "Diana Davis", + "email": "diana@test.com", + "address": "400 Elm St", + "city": "Austin", + "country": "USA", + "zipcode": "44444", + "date_of_birth": date(1988, 11, 2), + "age": 38, + "credit_score": 750, + "status": "active", + "is_active": True, + "bio": "High-value account.", + "joined_date": date(2026, 1, 4), + }, + { + "id": 5, + "first_name": "Eve", + "last_name": "Evans", + "full_name": "Eve Evans", + "email": "eve@test.com", + "address": "500 Birch Ln", + "city": "Denver", + "country": "USA", + "zipcode": "55555", + "date_of_birth": date(2000, 5, 25), + "age": 26, + "credit_score": 600, + "status": "pending", + "is_active": True, + "bio": None, + "joined_date": date(2026, 1, 5), + }, +] + + +COMMON_TRANSACTION_ROWS: list[dict[str, Any]] = [ + { + "id": 1, + "customer_id": 1, + "amount": Decimal("125.50"), + "currency": "USD", + "exchange_rate": Decimal("1.0000"), + "status": "completed", + "txn_at": datetime(2026, 2, 1, 9, 15, 0), + "reference_number": "TXN000000001", + "ip_address": "10.0.0.1", + "notes": "Monthly subscription renewal.", + }, + { + "id": 2, + "customer_id": 1, + "amount": Decimal("49.99"), + "currency": "USD", + "exchange_rate": Decimal("1.0000"), + "status": "completed", + "txn_at": datetime(2026, 2, 5, 14, 30, 0), + "reference_number": "TXN000000002", + "ip_address": "10.0.0.1", + "notes": None, + }, + { + "id": 3, + "customer_id": 2, + "amount": Decimal("250.00"), + "currency": "USD", + "exchange_rate": Decimal("1.0000"), + "status": "completed", + "txn_at": datetime(2026, 2, 10, 11, 20, 0), + "reference_number": "TXN000000003", + "ip_address": "10.0.0.2", + "notes": "Premium upgrade.", + }, + { + "id": 4, + "customer_id": 3, + "amount": Decimal("19.99"), + "currency": "USD", + "exchange_rate": Decimal("1.0000"), + "status": "refunded", + "txn_at": datetime(2026, 2, 12, 16, 45, 0), + "reference_number": "TXN000000004", + "ip_address": "10.0.0.3", + "notes": "Customer requested refund.", + }, + { + "id": 5, + "customer_id": 4, + "amount": Decimal("125.50"), + "currency": "EUR", + "exchange_rate": Decimal("1.0850"), + "status": "completed", + "txn_at": datetime(2026, 
2, 18, 13, 10, 0), + "reference_number": "TXN000000005", + "ip_address": "10.0.0.4", + "notes": None, + }, +] diff --git a/ingestion/tests/cli_e2e_v2/core/source/orchestrator.py b/ingestion/tests/cli_e2e_v2/core/source/orchestrator.py new file mode 100644 index 000000000000..9339840531a8 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/source/orchestrator.py @@ -0,0 +1,108 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Source baseline orchestrator: ensure_baseline + EnforcementPolicy + trust mode. + +`ensure_baseline` is the uniform orchestrator for every source family. Each +per-connector baseline fixture calls it once per session; the policy decides +whether drifts apply (local Docker) or raise (shared cloud sources). + +Trust mode (policy=None and expected=None) short-circuits with a WARNING, +letting a connector migrate to v2 before its baseline is fully modeled +(Decision #18). +""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass +from enum import Enum +from typing import TYPE_CHECKING + +from ..runner.errors import SourceBaselineDrift + +if TYPE_CHECKING: + from .types import BaselineSpec, Diff, SourceBaselineEnforcer + +logger = logging.getLogger(__name__) + + +class EnforcementMode(Enum): + """How a policy reconciles detected source-baseline drift. + + Matches the style of `MatchMode` (also an enum) so the two + comparison-lifecycle modes in the framework share one idiom. + """ + + APPLY = "apply" # drifts trigger enforcer.apply (mutates the source) + CHECK_ONLY = "check_only" # drifts raise SourceBaselineDrift + + +@dataclass(frozen=True) +class EnforcementPolicy: + """Binds an enforcer to a mode. + + APPLY: drifts trigger enforcer.apply (mutates the source). + Default for local Docker-backed connectors. + CHECK_ONLY: drifts raise SourceBaselineDrift. + Default for shared cloud sources — never mutate. + """ + + enforcer: SourceBaselineEnforcer + mode: EnforcementMode = EnforcementMode.APPLY + + +def ensure_baseline( + policy: EnforcementPolicy | None, + expected: BaselineSpec | None, + *, + connector_name: str, +) -> None: + """Three-phase lifecycle with trust-mode short-circuit. + + Trust mode: policy or expected is None → log a warning, do nothing. + Lets a connector migrate to v2 before its baseline is declared. + + Otherwise: introspect → compare → apply or raise: + - no drifts → log and return + - drifts + CHECK_ONLY → raise SourceBaselineDrift listing each drift. + The exception message tells the operator to re-run locally with + APPLY against a dedicated database — the standalone apply CLI + considered in the v2 design was deferred to the first cloud + connector (see `project-cloud-baseline-recovery-deferred.md`). + - drifts + APPLY → call enforcer.apply(drifts) + """ + if policy is None or expected is None: + logger.warning( + "[%s] running in TRUST MODE — no source baseline enforced. 
Source state is assumed correct.", + connector_name, + ) + return + + drifts = policy.enforcer.compare(expected) + + if not drifts: + logger.info("[%s] source baseline in sync", connector_name) + return + + if policy.mode is EnforcementMode.CHECK_ONLY: + raise SourceBaselineDrift( + f"[{connector_name}] baseline drift detected ({len(drifts)} items):\n" + f"{_render_drift_list(drifts)}\n\n" + f"This connector runs in check_only mode — baselines must be applied " + f"out-of-band (e.g., re-run the test suite locally against this source " + f"with EnforcementMode.APPLY on a dedicated DB). Contact the connector owner if unsure." + ) + + logger.info("[%s] applying %d baseline drift fixes", connector_name, len(drifts)) + policy.enforcer.apply(drifts) + + +def _render_drift_list(drifts: list[Diff]) -> str: + """Inline renderer for a drift list inside the check-only error message. + + Uses Diff's own `__str__` so source-side and OM-side error output share + the same ` path:\\n expected: X\\n actual: Y` shape — consistent + reading across both failure surfaces. + """ + return "\n".join(str(d) for d in drifts) diff --git a/ingestion/tests/cli_e2e_v2/core/source/sql.py b/ingestion/tests/cli_e2e_v2/core/source/sql.py new file mode 100644 index 000000000000..6d5402e8109d --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/source/sql.py @@ -0,0 +1,90 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""SQL-family baseline types. + +`SqlSourceBaseline` carries a SQLAlchemy `MetaData` (tables + columns + FKs + +comments) plus companion data for things Core doesn't model: seed rows, +view definitions, stored procedures. DDL is emitted by +`metadata.create_all(conn)` in the enforcer; seed INSERTs are dialect-specific +(the `TableSeed.insert_sql` template is supplied by each connector's +baseline). +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any + +from .types import BaselineSpec + +if TYPE_CHECKING: + from sqlalchemy import MetaData + + +@dataclass(frozen=True) +class TableSeed: + """Deterministic seed rows for a baseline table. + + `rows` is portable data (list of dicts). `insert_sql` is a + dialect-specific template with `:key` placeholders that SQLAlchemy binds + against each row via executemany — this is where idempotent upsert + clauses live (MySQL `ON DUPLICATE KEY UPDATE`, Postgres `ON CONFLICT DO + UPDATE`, etc.). The base enforcer runs `insert_sql` against `rows` + without knowing the dialect. + + `expected_row_count` is derived — `len(rows)` — so the seed spec has + one source of truth. + """ + + table_name: str + rows: list[dict[str, Any]] + insert_sql: str + + @property + def expected_row_count(self) -> int: + return len(self.rows) + + +@dataclass(frozen=True) +class ViewDefinition: + """A single expected view. + + `definition_sql` is executed verbatim at apply time — baselines supply a + CREATE OR REPLACE VIEW (or dialect equivalent) statement. + """ + + schema: str + name: str + definition_sql: str + + +@dataclass(frozen=True) +class StoredProcedureDefinition: + """A single expected stored procedure. + + Dialect-specific: MySQL drops + creates (no CREATE OR REPLACE PROCEDURE); + Postgres uses CREATE OR REPLACE PROCEDURE. The enforcer subclass owns + the dialect DDL; `definition_sql` carries the body as supplied by the + baseline. 
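+
+    Example (a sketch; the procedure name and body are illustrative):
+
+        StoredProcedureDefinition(
+            schema="e2e",
+            name="sp_customer_count",
+            definition_sql=(
+                "CREATE PROCEDURE e2e.sp_customer_count() "
+                "SELECT COUNT(*) FROM e2e.customers"
+            ),
+        )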
+ """ + + schema: str + name: str + definition_sql: str + + +@dataclass(frozen=True) +class SqlSourceBaseline(BaselineSpec): + """Top-level declarative spec for a SQL-based source. + + `metadata` holds the table DDL via SQLAlchemy Core — one source of truth + for column types, nullability, primary keys, foreign keys, and comments. + Seeds / views / stored procedures live alongside as companion data. + """ + + schemas: list[str] + metadata: MetaData + seeds: list[TableSeed] = field(default_factory=list) + views: list[ViewDefinition] = field(default_factory=list) + stored_procedures: list[StoredProcedureDefinition] = field(default_factory=list) diff --git a/ingestion/tests/cli_e2e_v2/core/source/sql_enforcer.py b/ingestion/tests/cli_e2e_v2/core/source/sql_enforcer.py new file mode 100644 index 000000000000..92698a3d9744 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/source/sql_enforcer.py @@ -0,0 +1,301 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Dialect-agnostic SQL baseline enforcer via SQLAlchemy Inspector + Core. + +Introspection goes through `sqlalchemy.inspect(conn)` — dialect-agnostic. +DDL emission goes through `metadata.create_all(conn)` — also dialect-aware +via SQLAlchemy Core. Seeds apply via a dialect-specific INSERT template +carried on each `TableSeed`, so the base enforcer runs them without +knowing the dialect. Stored procedures and their listing query stay +subclass responsibility (SQLAlchemy doesn't model SPs uniformly). +""" + +from __future__ import annotations + +import logging +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Any, TypedDict + +from sqlalchemy import bindparam, inspect, text + +from .sql import ( + SqlSourceBaseline, + StoredProcedureDefinition, + TableSeed, + ViewDefinition, +) +from .types import BaselineSpec, Diff, DiffKind + +if TYPE_CHECKING: + from sqlalchemy.engine import Connection, Engine + from sqlalchemy.schema import Table + +logger = logging.getLogger(__name__) + + +class _TableSnapshot(TypedDict): + """Per-table metadata collected by the Inspector snapshot.""" + + columns: dict[str, dict[str, Any]] + + +class _SqlSnapshot(TypedDict): + """Typed shape of `_snapshot()`'s return payload. + + Lets the `_diff_*` methods take a real type (not `dict`) so typos like + `state["tabels"]` are caught by the type checker rather than silently + at runtime. + """ + + schemas: set[str] + tables: dict[tuple[str, str], _TableSnapshot] + views: set[tuple[str, str]] + stored_procedures: set[tuple[str, str]] + + +_TYPE_ALIASES: dict[str, str] = { + "INTEGER": "INT", + "NUMERIC": "DECIMAL", +} + +_INTEGER_TYPES: frozenset[str] = frozenset({"TINYINT", "SMALLINT", "MEDIUMINT", "INT", "BIGINT"}) + + +class SqlBaselineEnforcer(ABC): + """SQL-family SourceBaselineEnforcer via SQLAlchemy Inspector + Core. + + Subclasses customize only: + - `_stored_procedure_query_sql`: raw SQL returning `(schema, name)` + rows for procedures; binds a `:schemas` IN-list (expanding). + - `_apply_stored_procedure(conn, sp)`: dialect-specific procedure DDL. + Required override (abstract) — subclasses without stored procedures + in their baseline can implement as a `pass` no-op. + - `_apply_view` default runs `view.definition_sql` verbatim — override + only if the dialect needs special plumbing. 
+ + Tables, columns, FKs, comments, and PK come from the baseline's + SQLAlchemy `MetaData` — `metadata.create_all(conn)` emits the right + DDL per dialect. Seed INSERTs are dialect-specific templates on each + `TableSeed`, bound against the (portable) row data at apply time. + + Marking `_apply_stored_procedure` abstract surfaces missing overrides at + enforcer instantiation (fixture setup) rather than at first SP-apply + inside a running test, where the failure context is harder to triage. + """ + + _stored_procedure_query_sql: str | None = None + + def __init__(self, engine: Engine, baseline: SqlSourceBaseline) -> None: + self._engine = engine + self._baseline = baseline + + # --- internal snapshot ---------------------------------------------- + + def _snapshot(self, conn: Connection) -> _SqlSnapshot: + inspector = inspect(conn) + wanted = set(self._baseline.schemas) + logger.debug("[sql] snapshotting schemas=%s", sorted(wanted)) + + schemas = {s for s in inspector.get_schema_names() if s in wanted} + + tables: dict[tuple[str, str], _TableSnapshot] = {} + for schema in schemas: + for table in inspector.get_table_names(schema=schema): + pk_cols = set(inspector.get_pk_constraint(table, schema=schema).get("constrained_columns", [])) + tables[(schema, table)] = { + "columns": { + col["name"]: { + "sql_type": str(col["type"]).upper(), + "nullable": col["nullable"], + "primary_key": col["name"] in pk_cols, + } + for col in inspector.get_columns(table, schema=schema) + } + } + + views = {(s, v) for s in schemas for v in inspector.get_view_names(schema=s)} + + stored_procedures = self._query_stored_procedures(conn, schemas) + + return { + "schemas": schemas, + "tables": tables, + "views": views, + "stored_procedures": stored_procedures, + } + + def _query_stored_procedures(self, conn: Connection, schemas: set[str]) -> set[tuple[str, str]]: + if not self._stored_procedure_query_sql or not schemas: + return set() + query = text(self._stored_procedure_query_sql).bindparams(bindparam("schemas", expanding=True)) + return {(row[0], row[1]) for row in conn.execute(query, {"schemas": sorted(schemas)})} + + # --- compare -------------------------------------------------------- + + def compare(self, expected: BaselineSpec) -> list[Diff]: + assert isinstance(expected, SqlSourceBaseline), f"expected SqlSourceBaseline, got {type(expected).__name__}" + if not expected.schemas: + return [] + + drifts: list[Diff] = [] + with self._engine.connect() as conn: + state = self._snapshot(conn) + drifts.extend(self._diff_schemas(expected, state)) + drifts.extend(self._diff_tables(expected, state)) + drifts.extend(self._diff_seeds(expected, state, conn)) + drifts.extend(self._diff_views(expected, state)) + drifts.extend(self._diff_stored_procedures(expected, state)) + + logger.debug("[sql] compare produced %d drifts", len(drifts)) + return drifts + + @staticmethod + def _diff_schemas(expected: SqlSourceBaseline, state: _SqlSnapshot) -> list[Diff]: + return [Diff(path=f"schema[{s}]", kind=DiffKind.MISSING) for s in expected.schemas if s not in state["schemas"]] + + def _diff_tables(self, expected: SqlSourceBaseline, state: _SqlSnapshot) -> list[Diff]: + drifts: list[Diff] = [] + actual_tables = state["tables"] + for tbl in expected.metadata.sorted_tables: + fqn = tbl.fullname + actual_tbl = actual_tables.get((tbl.schema, tbl.name)) + if actual_tbl is None: + drifts.append(Diff(path=f"table[{fqn}]", kind=DiffKind.MISSING)) + continue + drifts.extend(self._diff_columns(tbl, actual_tbl["columns"], fqn)) + return drifts + 
+ @staticmethod + def _diff_columns(tbl: Table, actual_cols: dict[str, dict[str, Any]], fqn: str) -> list[Diff]: + drifts: list[Diff] = [] + for col in tbl.columns: + actual_col = actual_cols.get(col.name) + col_path = f"table[{fqn}].column[{col.name}]" + if actual_col is None: + drifts.append(Diff(path=col_path, kind=DiffKind.MISSING)) + continue + expected_type_str = str(col.type).upper() + if _normalize_type(actual_col["sql_type"]) != _normalize_type(expected_type_str): + drifts.append( + Diff( + path=f"{col_path}.type", + expected=expected_type_str, + actual=actual_col["sql_type"], + ) + ) + if actual_col["primary_key"] != col.primary_key: + drifts.append( + Diff( + path=f"{col_path}.primary_key", + expected=col.primary_key, + actual=actual_col["primary_key"], + ) + ) + return drifts + + def _diff_seeds(self, expected: SqlSourceBaseline, state: _SqlSnapshot, conn: Connection) -> list[Diff]: + """Compare seed row counts for tables that already exist. + + Skips seeds whose target table isn't in the snapshot — the missing + table is already flagged by `_diff_tables`, and issuing COUNT(*) + against a nonexistent table (or schema) would raise. The apply() + pass creates the tables + seeds them; next compare() can then + verify row counts. + """ + drifts: list[Diff] = [] + actual_tables = state["tables"] + schema = expected.metadata.schema + for seed in expected.seeds: + if (schema, seed.table_name) not in actual_tables: + continue + fqn = self._seed_fqn(seed) + count = conn.execute(text(f"SELECT COUNT(*) FROM {fqn}")).scalar_one() + if count != seed.expected_row_count: + drifts.append( + Diff( + path=f"table[{fqn}].seed.row_count", + expected=seed.expected_row_count, + actual=count, + ) + ) + return drifts + + @staticmethod + def _diff_views(expected: SqlSourceBaseline, state: _SqlSnapshot) -> list[Diff]: + return [ + Diff(path=f"view[{v.schema}.{v.name}]", kind=DiffKind.MISSING) + for v in expected.views + if (v.schema, v.name) not in state["views"] + ] + + @staticmethod + def _diff_stored_procedures(expected: SqlSourceBaseline, state: _SqlSnapshot) -> list[Diff]: + return [ + Diff(path=f"procedure[{sp.schema}.{sp.name}]", kind=DiffKind.MISSING) + for sp in expected.stored_procedures + if (sp.schema, sp.name) not in state["stored_procedures"] + ] + + # --- apply orchestration -------------------------------------------- + + def apply(self, drifts: list[Diff]) -> None: + logger.debug("[sql] applying %d drifts", len(drifts)) + with self._engine.begin() as conn: + for schema_name in self._baseline.schemas: + conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {schema_name}")) + # metadata.create_all emits CREATE TABLE IF NOT EXISTS + FKs + + # column comments + table comments in the engine's dialect. 
+ self._baseline.metadata.create_all(conn) + for seed in self._baseline.seeds: + self._apply_seed(conn, seed) + for view in self._baseline.views: + self._apply_view(conn, view) + for sp in self._baseline.stored_procedures: + self._apply_stored_procedure(conn, sp) + + def _apply_seed(self, conn: Connection, seed: TableSeed) -> None: + fqn = self._seed_fqn(seed) + count = conn.execute(text(f"SELECT COUNT(*) FROM {fqn}")).scalar_one() + if count == seed.expected_row_count: + return + logger.info( + "[seed] %s: inserting (current=%d, expected=%d)", + fqn, + count, + seed.expected_row_count, + ) + conn.execute(text(seed.insert_sql), seed.rows) + + def _seed_fqn(self, seed: TableSeed) -> str: + schema = self._baseline.metadata.schema + return f"{schema}.{seed.table_name}" if schema else seed.table_name + + @staticmethod + def _apply_view(conn: Connection, view: ViewDefinition) -> None: + """Default: run `view.definition_sql` verbatim.""" + conn.execute(text(view.definition_sql)) + + @abstractmethod + def _apply_stored_procedure(self, conn: Connection, sp: StoredProcedureDefinition) -> None: + """Dialect-specific procedure DDL. Implement as a `pass` no-op + if the connector's baseline declares no stored procedures.""" + + +def _normalize_type(t: str) -> str: + """Canonicalize a SQL native-type string for cross-dialect comparison. + + - upper case + - strip `UNSIGNED` + - collapse whitespace, including "DECIMAL(10, 2)" -> "DECIMAL(10,2)" + - strip single quotes (enum/set members: "ENUM('a','b')" -> "ENUM(A,B)") + - alias INTEGER -> INT, NUMERIC -> DECIMAL + - drop display width for integer family (INT(11) -> INT) + """ + raw = " ".join(t.upper().replace("UNSIGNED", "").split()) + raw = raw.replace(", ", ",").replace("'", "") + head, paren, rest = raw.partition("(") + head = _TYPE_ALIASES.get(head, head) + if head in _INTEGER_TYPES: + return head + return f"{head}({rest}" if paren else head diff --git a/ingestion/tests/cli_e2e_v2/core/source/types.py b/ingestion/tests/cli_e2e_v2/core/source/types.py new file mode 100644 index 000000000000..6fe1e717be71 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/core/source/types.py @@ -0,0 +1,87 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Protocol and base types for source baseline enforcement. + +Per Decision #18 of the v2 spec, baseline enforcement is a compare-then-apply +lifecycle that's uniform across source families (SQL, Dashboard, Pipeline). +MVP ships only the SQL family; the Protocol is defined here so future families +plug in without rework. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum +from typing import Any, Protocol + + +class BaselineSpec: + """Marker base for family-specific baseline specs (SqlSourceBaseline, etc.). + + Deliberately minimal — subclasses carry the real declarative shape. This + class exists so the orchestrator can type `expected: BaselineSpec` without + depending on any specific family module. + """ + + +class DiffKind(Enum): + """Why a `Diff` was produced. + + Replaces brittle string sentinels (``expected="present", actual="missing"``) + with a typed discriminator. Lets downstream code filter diffs by kind + (``[d for d in diffs if d.kind is DiffKind.MISSING]``) without re-parsing + the human-readable expected/actual fields, and lets the renderer pick a + one-liner vs. expected/actual block per kind. 
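+
+    Construction sketch (the paths are illustrative; `Diff` is defined just
+    below):
+
+        Diff(path="table[e2e.users]", kind=DiffKind.MISSING)
+        Diff(path="column[id].type", expected="BIGINT", actual="INT")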
+ """ + + MISSING = "missing" # entity declared expected, not found in actual + UNEXPECTED = "unexpected" # STRICT mode: actual entity not in expected set + VALUE_MISMATCH = "value" # both sides present, a field differs + + +@dataclass(frozen=True) +class Diff: + """One path-qualified discrepancy between expected and actual. + + Used for both source-side baseline drift (schema / tables / seeds) + and OM-side catalog diffing (service / database / schema / table / + column). Path uses bracket notation — `schema[e2e].table[users].column + [email].type` — so failure output from either domain is scannable by + eye and sortable for grouping. + + `expected` / `actual` are the human-readable values for VALUE_MISMATCH + kinds; for MISSING / UNEXPECTED they are usually omitted (the kind + itself carries the meaning). `__str__` renders accordingly. + """ + + path: str + kind: DiffKind = DiffKind.VALUE_MISMATCH + expected: Any = None + actual: Any = None + + def __str__(self) -> str: + if self.kind is DiffKind.MISSING: + return f" {self.path}: missing" + if self.kind is DiffKind.UNEXPECTED: + extra = f" ({self.actual!r})" if self.actual is not None else "" + return f" {self.path}: unexpected{extra}" + return f" {self.path}:\n expected: {self.expected!r}\n actual: {self.actual!r}" + + +class SourceBaselineEnforcer(Protocol): + """Compare-then-apply lifecycle implemented per connector family. + + Enforcers are constructed by the per-connector baseline module (e.g., + `/baseline.py`) and handed to the orchestrator via an + EnforcementPolicy. The orchestrator calls `compare` first; if drifts + are returned and the policy mode is APPLY, it then calls `apply`. + + Implementations are free to do their own internal snapshotting — the + framework doesn't prescribe a separate "introspect" phase. Engine- + specific state caching belongs inside the enforcer. + """ + + def compare(self, expected: BaselineSpec) -> list[Diff]: ... + + def apply(self, drifts: list[Diff]) -> None: ... diff --git a/ingestion/tests/cli_e2e_v2/meta/__init__.py b/ingestion/tests/cli_e2e_v2/meta/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/ingestion/tests/cli_e2e_v2/meta/test_differ.py b/ingestion/tests/cli_e2e_v2/meta/test_differ.py new file mode 100644 index 000000000000..925aa7378b1c --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/meta/test_differ.py @@ -0,0 +1,420 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Meta-tests: prove StructuralDiffer detects each documented failure mode. + +These run synthetically against a stub OM client — no testcontainers, no +network. They are the safety net that catches regressions in the differ +itself: if a real connector test ever silently passes when OM diverges +from Expected, one of these will already have failed in CI. 
+""" + +from __future__ import annotations + +from types import SimpleNamespace +from typing import Any + +import pytest + +from metadata.generated.schema.entity.data.database import Database +from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema +from metadata.generated.schema.entity.data.storedProcedure import StoredProcedure +from metadata.generated.schema.entity.data.table import DataType, Table +from metadata.generated.schema.entity.services.databaseService import ( + DatabaseService, + DatabaseServiceType, +) + +from ..core.expected.differ import StructuralMismatch, assert_service_matches +from ..core.expected.types import ( + ExpectedColumn, + ExpectedDatabase, + ExpectedSchema, + ExpectedService, + ExpectedStoredProcedure, + ExpectedTable, + MatchMode, +) +from ..core.fluent.om_client import OmClient +from ..core.source.types import DiffKind + +# --------------------------------------------------------------------------- # +# Stubs # +# --------------------------------------------------------------------------- # + + +class _FakeOM: + """Minimal OpenMetadata stand-in. + + Stores canned `get_by_name` responses keyed on `(entity_cls, fqn)` and + canned `list_all_entities` responses keyed on `(entity_cls, parent_key, parent_value)`. + Anything not registered returns None / []. + """ + + def __init__(self) -> None: + self.entities: dict[tuple[type, str], Any] = {} + self.listings: dict[tuple[type, str, str], list] = {} + + def register(self, entity_cls: type, fqn: str, value: Any) -> None: + self.entities[(entity_cls, fqn)] = value + + def register_list(self, entity_cls: type, parent_key: str, parent_value: str, items: list) -> None: + self.listings[(entity_cls, parent_key, parent_value)] = items + + # --- OpenMetadata API surface used by the differ ----------------------- + + def get_by_name(self, *, entity, fqn, fields=None, include=None): + return self.entities.get((entity, fqn)) + + def list_all_entities(self, *, entity, params, limit=1000): + (parent_key, parent_value) = next(iter(params.items())) + return self.listings.get((entity, parent_key, parent_value), []) + + +def _stub(**kwargs: Any) -> SimpleNamespace: + """Build a SimpleNamespace with given attributes — drop-in for Pydantic + entities the differ reads via attribute access. Defaults cover the + fields touched by the differ for any entity type.""" + defaults = { + "tags": [], + "owners": [], + "columns": [], + "description": None, + "deleted": False, + } + return SimpleNamespace(**{**defaults, **kwargs}) + + +def _column(name: str, data_type: DataType, **extra: Any) -> SimpleNamespace: + return _stub(name=name, dataType=data_type, constraint=None, **extra) + + +SERVICE_FQN = "svc" +DB_FQN = "svc.default" +SCHEMA_FQN = "svc.default.e2e" + + +def _seed_happy_path(fake: _FakeOM, expected: ExpectedService) -> None: + """Register OM responses that exactly match `expected` so the differ + sees zero drift. 
Negative tests build on top of this by overwriting + one entry to introduce a single, isolated discrepancy.""" + fake.register(DatabaseService, expected.name, _stub(serviceType=expected.service_type)) + for db in expected.databases: + db_fqn = f"{expected.name}.{db.name}" + fake.register(Database, db_fqn, _stub(name=db.name)) + for schema in db.schemas: + schema_fqn = f"{db_fqn}.{schema.name}" + fake.register(DatabaseSchema, schema_fqn, _stub(name=schema.name)) + for table in schema.tables: + fake.register( + Table, + f"{schema_fqn}.{table.name}", + _stub( + name=table.name, + columns=[_column(c.name, c.data_type) for c in table.columns], + ), + ) + for sp in schema.stored_procedures: + fake.register(StoredProcedure, f"{schema_fqn}.{sp.name}", _stub(name=sp.name)) + + +def _baseline_expected() -> ExpectedService: + """Reference Expected tree the negative tests perturb.""" + return ExpectedService( + name="svc", + service_type=DatabaseServiceType.Mysql, + databases=[ + ExpectedDatabase( + name="default", + schemas=[ + ExpectedSchema( + name="e2e", + tables=[ + ExpectedTable( + name="customers", + columns=[ + ExpectedColumn("id", DataType.BIGINT), + ExpectedColumn("email", DataType.VARCHAR), + ], + ), + ExpectedTable( + name="transactions", + columns=[ExpectedColumn("id", DataType.BIGINT)], + ), + ], + stored_procedures=[ExpectedStoredProcedure("sp_count")], + ) + ], + ) + ], + ) + + +def _client(fake: _FakeOM) -> OmClient: + return OmClient(fake) # type: ignore[arg-type] + + +# --------------------------------------------------------------------------- # +# Happy path — the differ should NOT raise when OM matches Expected. # +# --------------------------------------------------------------------------- # + + +def test_happy_path_no_diffs() -> None: + expected = _baseline_expected() + fake = _FakeOM() + _seed_happy_path(fake, expected) + assert_service_matches(expected, _client(fake)) + + +# --------------------------------------------------------------------------- # +# Each parametrize row injects ONE corruption and asserts it is caught. # +# `mutate(fake)` mutates the registered actuals; the Expected tree stays the # +# canonical baseline. `expected_path_fragment` is a substring search against # +# the rendered StructuralMismatch — looser than DiffKind matching but reads # +# closer to the failure message a developer would actually see. 
# +# --------------------------------------------------------------------------- # + + +def _drop(fake: _FakeOM, entity_cls: type, fqn: str) -> None: + fake.entities[(entity_cls, fqn)] = None + + +def _patch_table(fake: _FakeOM, fqn: str, **kwargs: Any) -> None: + table = fake.entities[(Table, fqn)] + for k, v in kwargs.items(): + setattr(table, k, v) + + +@pytest.mark.parametrize( + "label,mutate,expected_kind,path_fragment", + [ + ( + "missing_service", + lambda fake: _drop(fake, DatabaseService, SERVICE_FQN), + DiffKind.MISSING, + "service[svc]", + ), + ( + "missing_database", + lambda fake: _drop(fake, Database, DB_FQN), + DiffKind.MISSING, + "database[default]", + ), + ( + "missing_schema", + lambda fake: _drop(fake, DatabaseSchema, SCHEMA_FQN), + DiffKind.MISSING, + "schema[e2e]", + ), + ( + "missing_table", + lambda fake: _drop(fake, Table, f"{SCHEMA_FQN}.customers"), + DiffKind.MISSING, + "table[customers]", + ), + ( + "missing_stored_procedure", + lambda fake: _drop(fake, StoredProcedure, f"{SCHEMA_FQN}.sp_count"), + DiffKind.MISSING, + "procedure[sp_count]", + ), + ( + "missing_column", + lambda fake: _patch_table( + fake, + f"{SCHEMA_FQN}.customers", + columns=[_column("id", DataType.BIGINT)], + ), + DiffKind.MISSING, + "column[email]", + ), + ( + "wrong_column_type", + lambda fake: _patch_table( + fake, + f"{SCHEMA_FQN}.customers", + columns=[_column("id", DataType.INT), _column("email", DataType.VARCHAR)], + ), + DiffKind.VALUE_MISMATCH, + "column[id].dataType", + ), + ( + "wrong_service_type", + lambda fake: setattr( + fake.entities[(DatabaseService, SERVICE_FQN)], + "serviceType", + DatabaseServiceType.Postgres, + ), + DiffKind.VALUE_MISMATCH, + "service[svc].serviceType", + ), + ], + ids=lambda v: v if isinstance(v, str) else "", +) +def test_diff_detected(label, mutate, expected_kind, path_fragment) -> None: + expected = _baseline_expected() + fake = _FakeOM() + _seed_happy_path(fake, expected) + mutate(fake) + + with pytest.raises(StructuralMismatch) as exc_info: + assert_service_matches(expected, _client(fake)) + + diffs = exc_info.value.diffs + assert any(d.kind is expected_kind and path_fragment in d.path for d in diffs), ( + f"expected a {expected_kind.name} diff containing {path_fragment!r}; got: {diffs!r}" + ) + + +# --------------------------------------------------------------------------- # +# Field-level assertions that don't fit the parametrize matrix cleanly # +# (each needs additional setup: tags, descriptions, owners). # +# --------------------------------------------------------------------------- # + + +def test_missing_column_tag() -> None: + expected = ExpectedService( + name="svc", + service_type=DatabaseServiceType.Mysql, + databases=[ + ExpectedDatabase( + name="default", + schemas=[ + ExpectedSchema( + name="e2e", + tables=[ + ExpectedTable( + name="customers", + columns=[ + ExpectedColumn( + "email", + DataType.VARCHAR, + tags=frozenset({"PII.Sensitive"}), + ), + ], + ) + ], + ) + ], + ) + ], + ) + fake = _FakeOM() + _seed_happy_path(fake, expected) + # Overwrite the auto-seeded column to drop the tag. 
+ fake.entities[(Table, f"{SCHEMA_FQN}.customers")] = _stub( + name="customers", + columns=[_stub(name="email", dataType=DataType.VARCHAR, constraint=None, tags=[])], + ) + + with pytest.raises(StructuralMismatch, match=r"column\[email\].tags"): + assert_service_matches(expected, _client(fake)) + + +def test_missing_table_description() -> None: + expected = _baseline_expected() + expected = ExpectedService( + name=expected.name, + service_type=expected.service_type, + databases=[ + ExpectedDatabase( + name="default", + schemas=[ + ExpectedSchema( + name="e2e", + tables=[ + ExpectedTable( + name="customers", + columns=[ExpectedColumn("id", DataType.BIGINT)], + description="Customer records", + ) + ], + ) + ], + ) + ], + ) + fake = _FakeOM() + _seed_happy_path(fake, expected) + _patch_table(fake, f"{SCHEMA_FQN}.customers", description="other text") + + with pytest.raises(StructuralMismatch, match=r"table\[customers\].description"): + assert_service_matches(expected, _client(fake)) + + +def test_missing_owner() -> None: + expected = ExpectedService( + name="svc", + service_type=DatabaseServiceType.Mysql, + databases=[ + ExpectedDatabase( + name="default", + schemas=[ + ExpectedSchema( + name="e2e", + tables=[ + ExpectedTable( + name="customers", + columns=[ExpectedColumn("id", DataType.BIGINT)], + owner="alice", + ) + ], + ) + ], + ) + ], + ) + fake = _FakeOM() + _seed_happy_path(fake, expected) + # Default seeded owners is []; assertion requires "alice" → diff fires. + + with pytest.raises(StructuralMismatch, match=r"table\[customers\].owner"): + assert_service_matches(expected, _client(fake)) + + +# --------------------------------------------------------------------------- # +# STRICT mode catches extras that SUPERSET tolerates. # +# --------------------------------------------------------------------------- # + + +def test_strict_flags_extra_table_unexpected() -> None: + expected = _baseline_expected() + fake = _FakeOM() + _seed_happy_path(fake, expected) + fake.register_list( + Table, + "databaseSchema", + SCHEMA_FQN, + [ + _stub(name="customers"), + _stub(name="transactions"), + _stub(name="phantom"), + ], + ) + + # SUPERSET tolerates the extra. + assert_service_matches(expected, _client(fake), mode=MatchMode.SUPERSET) + + # STRICT flags it. + with pytest.raises(StructuralMismatch, match=r"phantom"): + assert_service_matches(expected, _client(fake), mode=MatchMode.STRICT) + + +def test_strict_flags_extra_column() -> None: + expected = _baseline_expected() + fake = _FakeOM() + _seed_happy_path(fake, expected) + _patch_table( + fake, + f"{SCHEMA_FQN}.customers", + columns=[ + _column("id", DataType.BIGINT), + _column("email", DataType.VARCHAR), + _column("phantom", DataType.VARCHAR), + ], + ) + + assert_service_matches(expected, _client(fake), mode=MatchMode.SUPERSET) + + with pytest.raises(StructuralMismatch, match=r"phantom"): + assert_service_matches(expected, _client(fake), mode=MatchMode.STRICT) diff --git a/ingestion/tests/cli_e2e_v2/meta/test_eventually.py b/ingestion/tests/cli_e2e_v2/meta/test_eventually.py new file mode 100644 index 000000000000..19cf28f05d82 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/meta/test_eventually.py @@ -0,0 +1,145 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Meta-tests: prove the eventually polling primitives behave correctly. 
+ +retry_until is the foundation of every fluent `.eventually()` chain in +the framework — a regression here silently turns flaky-but-eventually- +correct ingestion into spurious test passes (or false failures). +""" + +from __future__ import annotations + +import pytest + +from ..core.fluent.eventually import EventuallyRunner, retry_until + + +def _attempt_counter(): + """Return a list whose `len()` is the number of times `check` has been called. + + Mutable container so closures can append on each invocation without a + `nonlocal` declaration on every check. Tests inspect the length to + assert how many attempts retry_until made. + """ + return [] + + +# --------------------------------------------------------------------------- # +# retry_until — the low-level primitive # +# --------------------------------------------------------------------------- # + + +def test_retry_until_returns_value_on_first_success() -> None: + attempts = _attempt_counter() + + def _check() -> str: + attempts.append(None) + return "ok" + + assert retry_until(_check, timeout=2, poll_interval=0.01, name="t") == "ok" + assert len(attempts) == 1 + + +def test_retry_until_retries_until_success() -> None: + attempts = _attempt_counter() + + def _check() -> int: + attempts.append(None) + if len(attempts) < 3: + raise AssertionError("not yet") + return 42 + + assert retry_until(_check, timeout=2, poll_interval=0.01, name="converge") == 42 + assert len(attempts) == 3 + + +def test_retry_until_times_out_with_last_failure() -> None: + def _check() -> None: + raise AssertionError("specific failure text") + + with pytest.raises(AssertionError, match="specific failure text") as exc_info: + retry_until(_check, timeout=0, poll_interval=0.01, name="never") + + msg = str(exc_info.value) + assert "timed out" in msg + assert "never" in msg + + +def test_retry_until_propagates_non_assertion_errors() -> None: + def _check() -> None: + raise RuntimeError("hard error") + + with pytest.raises(RuntimeError, match="hard error"): + retry_until(_check, timeout=2, poll_interval=0.01, name="t") + + +# --------------------------------------------------------------------------- # +# EventuallyRunner — the per-assert dispatcher # +# --------------------------------------------------------------------------- # + + +def test_runner_unarmed_runs_sync() -> None: + runner = EventuallyRunner() + attempts = _attempt_counter() + + def _check() -> str: + attempts.append(None) + return "value" + + assert runner.run(_check, name="sync") == "value" + assert len(attempts) == 1 + + +def test_runner_unarmed_propagates_assertion_error_without_retry() -> None: + runner = EventuallyRunner() + attempts = _attempt_counter() + + def _check() -> None: + attempts.append(None) + raise AssertionError("immediate") + + with pytest.raises(AssertionError, match="immediate"): + runner.run(_check, name="sync") + assert len(attempts) == 1 + + +def test_runner_armed_retries_until_success() -> None: + runner = EventuallyRunner() + runner.arm(timeout=2) + attempts = _attempt_counter() + + def _check() -> str: + attempts.append(None) + if len(attempts) < 2: + raise AssertionError("not yet") + return "done" + + # Note: EventuallyRunner uses retry_until's default poll interval (2s). + # We rely on the check converging fast enough that the natural sleep + # is acceptable. Two attempts ⇒ one ~2s sleep between them. 
+ assert runner.run(_check, name="armed") == "done" + assert len(attempts) == 2 + + +def test_runner_arming_is_one_shot() -> None: + """After a successful armed run, the next call reverts to sync — the + timeout is consumed, not sticky. This is the contract that prevents + accidental cross-test polling state.""" + runner = EventuallyRunner() + runner.arm(timeout=2) + + def _ok() -> str: + return "ok" + + runner.run(_ok, name="first") # consumes the arm + + attempts = _attempt_counter() + + def _fail_once() -> None: + attempts.append(None) + raise AssertionError("immediate") + + with pytest.raises(AssertionError, match="immediate"): + runner.run(_fail_once, name="second") + assert len(attempts) == 1, "second run should have been sync, not retried" diff --git a/ingestion/tests/cli_e2e_v2/meta/test_fluent.py b/ingestion/tests/cli_e2e_v2/meta/test_fluent.py new file mode 100644 index 000000000000..5243fb82fd74 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/meta/test_fluent.py @@ -0,0 +1,296 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Meta-tests: prove the fluent assertion classes raise the right error +on mismatched OM state — and pass on matching state. + +Runs synthetically against a stub OM client. Each test pairs a positive +case (correct state → no raise) with a negative case (mismatched state → +AssertionError with a useful message). +""" + +from __future__ import annotations + +from types import SimpleNamespace +from typing import Any + +import pytest + +from metadata.generated.schema.entity.data.storedProcedure import StoredProcedure +from metadata.generated.schema.entity.data.table import ( + ConstraintType, + DataType, + Table, +) + +from ..core.fluent.stored_procedure_assert import StoredProcedureAssert +from ..core.fluent.table_assert import TableAssert + +# --------------------------------------------------------------------------- # +# Stubs # +# --------------------------------------------------------------------------- # + + +class _FakeOM: + """Stub for the OpenMetadata client. + + Stores canned entities keyed on `(entity_cls, fqn, include_filter)`. The + `include` kwarg distinguishes default `get_by_name` (deleted=False) from + `include="all"` used by `is_soft_deleted` / `is_not_deleted` so each + test can set the right view independently. 
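+
+    Hypothetical usage (mirrors the tests below):
+
+        fake = _FakeOM()
+        fake.register(Table, FQN, _table(deleted=True), include="all")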
+ """ + + def __init__(self) -> None: + self.entities: dict[tuple[type, str, str | None], Any] = {} + + def register(self, entity_cls: type, fqn: str, value: Any, *, include: str | None = None) -> None: + self.entities[(entity_cls, fqn, include)] = value + + def get_by_name(self, *, entity, fqn, fields=None, include=None): + return self.entities.get((entity, fqn, include)) + + +def _table( + *, + columns: list[Any] | None = None, + tags: list[str] | None = None, + owners: list[str] | None = None, + description: str | None = None, + constraints: list[Any] | None = None, + schema_definition: str | None = None, + deleted: bool = False, +) -> SimpleNamespace: + return SimpleNamespace( + columns=columns or [], + tags=[SimpleNamespace(tagFQN=t) for t in (tags or [])], + owners=[SimpleNamespace(name=o) for o in (owners or [])], + description=description, + tableConstraints=constraints or [], + schemaDefinition=schema_definition, + deleted=deleted, + ) + + +def _column(name: str, data_type: DataType, *, tags: list[str] | None = None, description: str | None = None): + return SimpleNamespace( + name=name, + dataType=data_type, + tags=[SimpleNamespace(tagFQN=t) for t in (tags or [])], + description=description, + ) + + +def _fk(column: str, ref_table: str, ref_column: str) -> SimpleNamespace: + return SimpleNamespace( + constraintType=ConstraintType.FOREIGN_KEY, + columns=[column], + referredColumns=[f"{ref_table}.{ref_column}"], + ) + + +FQN = "svc.default.e2e.customers" + + +# --------------------------------------------------------------------------- # +# TableAssert — entity-level terminals # +# --------------------------------------------------------------------------- # + + +def test_exists_raises_when_entity_missing() -> None: + fake = _FakeOM() + # Nothing registered → get_by_name returns None → exists() raises. 
+ with pytest.raises(AssertionError, match=r"not found"): + TableAssert(fake, FQN).exists() + + +def test_exists_passes_when_entity_present() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table()) + TableAssert(fake, FQN).exists() + + +def test_has_description_containing_passes_on_match() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(description="Customer records — primary table")) + TableAssert(fake, FQN).has_description_containing("Customer records") + + +def test_has_description_containing_raises_on_mismatch() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(description="something else")) + with pytest.raises(AssertionError, match=r"does not contain"): + TableAssert(fake, FQN).has_description_containing("Customer records") + + +def test_has_tag_passes_on_match() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(tags=["PII.Sensitive"])) + TableAssert(fake, FQN).has_tag("PII.Sensitive") + + +def test_has_tag_raises_when_tag_missing() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(tags=["Other.Tag"])) + with pytest.raises(AssertionError, match=r"missing tag 'PII.Sensitive'"): + TableAssert(fake, FQN).has_tag("PII.Sensitive") + + +def test_has_owner_passes_on_match() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(owners=["alice"])) + TableAssert(fake, FQN).has_owner("alice") + + +def test_has_owner_raises_when_owner_missing() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(owners=["bob"])) + with pytest.raises(AssertionError, match=r"missing owner 'alice'"): + TableAssert(fake, FQN).has_owner("alice") + + +def test_has_foreign_key_passes_on_match() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(constraints=[_fk("customer_id", "customers", "id")])) + TableAssert(fake, FQN).has_foreign_key_constraint("customer_id", "customers", "id") + + +def test_has_foreign_key_raises_when_constraint_absent() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(constraints=[])) + with pytest.raises(AssertionError, match=r"missing FOREIGN_KEY"): + TableAssert(fake, FQN).has_foreign_key_constraint("customer_id", "customers", "id") + + +def test_has_schema_definition_containing_is_case_insensitive() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(schema_definition="SELECT * FROM customers LEFT JOIN transactions")) + # MySQL emits lowercase keywords — assertion's lower-cased substring match handles it. 
+ TableAssert(fake, FQN).has_schema_definition_containing("left join") + + +def test_has_schema_definition_containing_raises_on_mismatch() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(schema_definition="SELECT 1")) + with pytest.raises(AssertionError, match=r"does not contain"): + TableAssert(fake, FQN).has_schema_definition_containing("LEFT JOIN") + + +def test_is_soft_deleted_passes_when_deleted() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(deleted=True), include="all") + TableAssert(fake, FQN).is_soft_deleted() + + +def test_is_soft_deleted_raises_when_alive() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(deleted=False), include="all") + with pytest.raises(AssertionError, match=r"not soft-deleted"): + TableAssert(fake, FQN).is_soft_deleted() + + +def test_is_not_deleted_passes_when_alive() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(deleted=False), include="all") + TableAssert(fake, FQN).is_not_deleted() + + +def test_is_not_deleted_raises_when_soft_deleted() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(deleted=True), include="all") + with pytest.raises(AssertionError, match=r"unexpectedly soft-deleted"): + TableAssert(fake, FQN).is_not_deleted() + + +# --------------------------------------------------------------------------- # +# ColumnAssert — descended via TableAssert.column(name) # +# --------------------------------------------------------------------------- # + + +def test_column_has_type_passes_on_match() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(columns=[_column("id", DataType.BIGINT)])) + TableAssert(fake, FQN).column("id").has_type(DataType.BIGINT) + + +def test_column_has_type_raises_on_mismatch() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(columns=[_column("id", DataType.INT)])) + with pytest.raises(AssertionError, match=r"has type DataType.INT"): + TableAssert(fake, FQN).column("id").has_type(DataType.BIGINT) + + +def test_column_lookup_raises_when_column_missing() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(columns=[_column("id", DataType.BIGINT)])) + with pytest.raises(AssertionError, match=r"not found on table"): + TableAssert(fake, FQN).column("missing").has_type(DataType.BIGINT) + + +def test_column_has_tag_passes_on_match() -> None: + fake = _FakeOM() + fake.register( + Table, + FQN, + _table(columns=[_column("email", DataType.VARCHAR, tags=["PII.Sensitive"])]), + ) + TableAssert(fake, FQN).column("email").has_tag("PII.Sensitive") + + +def test_column_has_tag_raises_when_missing() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(columns=[_column("email", DataType.VARCHAR)])) + with pytest.raises(AssertionError, match=r"missing tag 'PII.Sensitive'"): + TableAssert(fake, FQN).column("email").has_tag("PII.Sensitive") + + +def test_column_has_no_tag_raises_when_unexpectedly_present() -> None: + """has_no_tag is the negative complement — guards against + over-classification by PII recognizers.""" + fake = _FakeOM() + fake.register( + Table, + FQN, + _table(columns=[_column("id", DataType.BIGINT, tags=["PII.Sensitive"])]), + ) + with pytest.raises(AssertionError, match=r"unexpectedly carries tag 'PII.Sensitive'"): + TableAssert(fake, FQN).column("id").has_no_tag("PII.Sensitive") + + +def test_column_has_no_tag_passes_when_absent() -> None: + fake = _FakeOM() + fake.register(Table, FQN, _table(columns=[_column("id", DataType.BIGINT)])) + TableAssert(fake, FQN).column("id").has_no_tag("PII.Sensitive") + + +# 
--------------------------------------------------------------------------- # +# StoredProcedureAssert.has_code_containing # +# --------------------------------------------------------------------------- # + + +def _sp(*, code: str | None) -> SimpleNamespace: + return SimpleNamespace( + storedProcedureCode=SimpleNamespace(code=code) if code is not None else None, + ) + + +def test_sp_has_code_containing_passes_on_match() -> None: + fake = _FakeOM() + fake.register(StoredProcedure, "svc.default.e2e.sp_count", _sp(code="SELECT COUNT(*) FROM customers")) + StoredProcedureAssert(fake, "svc.default.e2e.sp_count").has_code_containing("SELECT COUNT(*)") + + +def test_sp_has_code_containing_raises_on_empty_body() -> None: + """The exact regression that motivated `SHOW_ROUTINE` in conftest.py: + body returns empty string when the OM connector lacks routine-read + privilege.""" + fake = _FakeOM() + fake.register(StoredProcedure, "svc.default.e2e.sp_count", _sp(code="")) + with pytest.raises(AssertionError, match=r"code does not contain 'SELECT COUNT"): + StoredProcedureAssert(fake, "svc.default.e2e.sp_count").has_code_containing("SELECT COUNT(*)") + + +def test_sp_has_code_containing_raises_on_missing_body() -> None: + fake = _FakeOM() + fake.register(StoredProcedure, "svc.default.e2e.sp_count", _sp(code=None)) + with pytest.raises(AssertionError, match=r"code does not contain"): + StoredProcedureAssert(fake, "svc.default.e2e.sp_count").has_code_containing("BEGIN") diff --git a/ingestion/tests/cli_e2e_v2/mysql/__init__.py b/ingestion/tests/cli_e2e_v2/mysql/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/ingestion/tests/cli_e2e_v2/mysql/baseline.py b/ingestion/tests/cli_e2e_v2/mysql/baseline.py new file mode 100644 index 000000000000..e45d2eb3d2d0 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/mysql/baseline.py @@ -0,0 +1,298 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""MySQL source baseline — common portable tables + MySQL-specific all_types. + +Structure: + - Portable tables (customers, transactions) + their seed rows come from + `core/source/common_baseline.py`. + - MySQL adds a dialect-specific `all_types` table exercising every native + type the connector maps (TINYINT / MEDIUMINT / TEXT variants / blobs / + BIT / ENUM / SET etc.). Seed is trivial — id=1..3 with everything else + NULL; tests only assert on row count + type mappings. + - INSERT templates carry MySQL's `ON DUPLICATE KEY UPDATE` idempotency; + the base enforcer binds them against common row data via executemany. + - One view + one stored procedure for lineage and SP-ingestion coverage. + +Schema evolution caveat: + metadata.create_all uses CREATE TABLE IF NOT EXISTS — no ALTER migration. 
+ When baseline shape changes (column add/drop, FK, comments), drop first: + DROP SCHEMA IF EXISTS e2e; +""" + +from __future__ import annotations + +from functools import lru_cache +from typing import Any + +from sqlalchemy import ( + BigInteger, + Column, + Date, + DateTime, + Float, + Integer, + MetaData, + Numeric, + SmallInteger, + Table, + Time, + create_engine, +) +from sqlalchemy.dialects import mysql +from sqlalchemy.engine import URL, Engine + +from ..core.config.env import Env +from ..core.source.common_baseline import ( + COMMON_CUSTOMER_ROWS, + COMMON_TRANSACTION_ROWS, + build_common_metadata, +) +from ..core.source.orchestrator import EnforcementMode, EnforcementPolicy +from ..core.source.sql import ( + SqlSourceBaseline, + StoredProcedureDefinition, + TableSeed, + ViewDefinition, +) +from .enforcer import MySqlEnforcer + +# ----------------------------------------------------------------------------- +# all_types — MySQL-specific native types (exercises connector type mapping) +# ----------------------------------------------------------------------------- + + +def _declare_all_types(md: MetaData) -> Table: + return Table( + "all_types", + md, + Column("id", Integer, primary_key=True, nullable=False), + Column("tiny_int_col", mysql.TINYINT, nullable=True), + Column("small_int_col", SmallInteger, nullable=True), + Column("medium_int_col", mysql.MEDIUMINT, nullable=True), + Column("int_col", Integer, nullable=True), + Column("big_int_col", BigInteger, nullable=True), + Column("float_col", Float, nullable=True), + Column("double_col", mysql.DOUBLE, nullable=True), + Column("decimal_col", Numeric(10, 2), nullable=True), + Column("char_col", mysql.CHAR(10), nullable=True), + Column("varchar_col", mysql.VARCHAR(255), nullable=True), + Column("tinytext_col", mysql.TINYTEXT, nullable=True), + Column("text_col", mysql.TEXT, nullable=True), + Column("mediumtext_col", mysql.MEDIUMTEXT, nullable=True), + Column("longtext_col", mysql.LONGTEXT, nullable=True), + Column("binary_col", mysql.BINARY(16), nullable=True), + Column("varbinary_col", mysql.VARBINARY(255), nullable=True), + Column("tinyblob_col", mysql.TINYBLOB, nullable=True), + Column("blob_col", mysql.BLOB, nullable=True), + Column("mediumblob_col", mysql.MEDIUMBLOB, nullable=True), + Column("longblob_col", mysql.LONGBLOB, nullable=True), + Column("date_col", Date, nullable=True), + Column("time_col", Time, nullable=True), + Column("datetime_col", DateTime, nullable=True), + Column("timestamp_col", mysql.TIMESTAMP, nullable=True), + Column("year_col", mysql.YEAR, nullable=True), + Column("bit_col", mysql.BIT(8), nullable=True), + Column("json_col", mysql.JSON, nullable=True), + Column("enum_col", mysql.ENUM("alpha", "beta", "gamma"), nullable=True), + Column("set_col", mysql.SET("x", "y", "z"), nullable=True), + ) + + +# all_types seed — one row per id, NULL elsewhere. Tests assert row count +# and column type mappings, not cell content, so this is sufficient. 
+_ALL_TYPES_ROWS: list[dict[str, Any]] = [{"id": 1}, {"id": 2}, {"id": 3}] + + +# ----------------------------------------------------------------------------- +# Dialect-specific INSERT templates (MySQL `ON DUPLICATE KEY UPDATE` idempotency) +# ----------------------------------------------------------------------------- + + +_MYSQL_CUSTOMERS_INSERT = """ +INSERT INTO e2e.customers + (id, first_name, last_name, full_name, email, + address, city, country, zipcode, date_of_birth, age, + credit_score, status, is_active, bio, joined_date) +VALUES + (:id, :first_name, :last_name, :full_name, :email, + :address, :city, :country, :zipcode, :date_of_birth, :age, + :credit_score, :status, :is_active, :bio, :joined_date) +ON DUPLICATE KEY UPDATE + first_name = VALUES(first_name), last_name = VALUES(last_name), + full_name = VALUES(full_name), email = VALUES(email), + address = VALUES(address), city = VALUES(city), + country = VALUES(country), zipcode = VALUES(zipcode), + date_of_birth = VALUES(date_of_birth), age = VALUES(age), + credit_score = VALUES(credit_score), status = VALUES(status), + is_active = VALUES(is_active), bio = VALUES(bio), + joined_date = VALUES(joined_date) +""" + +_MYSQL_TRANSACTIONS_INSERT = """ +INSERT INTO e2e.transactions + (id, customer_id, amount, currency, exchange_rate, status, + txn_at, reference_number, ip_address, notes) +VALUES + (:id, :customer_id, :amount, :currency, :exchange_rate, :status, + :txn_at, :reference_number, :ip_address, :notes) +ON DUPLICATE KEY UPDATE + customer_id = VALUES(customer_id), amount = VALUES(amount), + currency = VALUES(currency), exchange_rate = VALUES(exchange_rate), + status = VALUES(status), txn_at = VALUES(txn_at), + reference_number = VALUES(reference_number), + ip_address = VALUES(ip_address), notes = VALUES(notes) +""" + +_MYSQL_ALL_TYPES_INSERT = """ +INSERT INTO e2e.all_types (id) VALUES (:id) +ON DUPLICATE KEY UPDATE id = VALUES(id) +""" + + +# ----------------------------------------------------------------------------- +# View + stored procedure (dialect-specific DDL) +# ----------------------------------------------------------------------------- + + +_CUSTOMER_TXN_SUMMARY_VIEW = ViewDefinition( + schema="e2e", + name="customer_txn_summary", + definition_sql=""" + CREATE OR REPLACE VIEW e2e.customer_txn_summary AS + SELECT + c.id AS customer_id, + c.full_name, + c.status AS customer_status, + COUNT(t.id) AS txn_count, + COALESCE(SUM(t.amount), 0) AS total_amount + FROM e2e.customers c + LEFT JOIN e2e.transactions t ON c.id = t.customer_id + GROUP BY c.id, c.full_name, c.status + """, +) + + +_SP_ACTIVE_CUSTOMER_COUNT = StoredProcedureDefinition( + schema="e2e", + name="sp_active_customer_count", + definition_sql=""" + CREATE PROCEDURE e2e.sp_active_customer_count() + BEGIN + SELECT COUNT(*) AS active_count + FROM e2e.customers + WHERE status = 'active'; + END + """, +) + + +# A second SP exercising parameterized DML — covers a different code path +# than the read-only `sp_active_customer_count`. The body intentionally +# carries an UPDATE statement so OM's stored-procedure ingestion stores +# DML text, not just SELECT text. 
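+# Illustrative invocation (not executed by the tests):
+#   CALL e2e.sp_update_customer_status(1, 'inactive');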
+_SP_UPDATE_CUSTOMER_STATUS = StoredProcedureDefinition( + schema="e2e", + name="sp_update_customer_status", + definition_sql=""" + CREATE PROCEDURE e2e.sp_update_customer_status( + IN p_customer_id INT, + IN p_status VARCHAR(20) + ) + BEGIN + UPDATE e2e.customers + SET status = p_status + WHERE id = p_customer_id; + END + """, +) + + +# ----------------------------------------------------------------------------- +# Top-level baseline +# ----------------------------------------------------------------------------- + + +def _build_metadata() -> MetaData: + """Common portable tables + MySQL-specific all_types.""" + md = build_common_metadata("e2e") + _declare_all_types(md) + return md + + +MYSQL_BASELINE = SqlSourceBaseline( + schemas=["e2e"], + metadata=_build_metadata(), + seeds=[ + TableSeed( + table_name="customers", + rows=COMMON_CUSTOMER_ROWS, + insert_sql=_MYSQL_CUSTOMERS_INSERT, + ), + TableSeed( + table_name="transactions", + rows=COMMON_TRANSACTION_ROWS, + insert_sql=_MYSQL_TRANSACTIONS_INSERT, + ), + TableSeed( + table_name="all_types", + rows=_ALL_TYPES_ROWS, + insert_sql=_MYSQL_ALL_TYPES_INSERT, + ), + ], + views=[_CUSTOMER_TXN_SUMMARY_VIEW], + stored_procedures=[_SP_ACTIVE_CUSTOMER_COUNT, _SP_UPDATE_CUSTOMER_STATUS], +) + + +# ----------------------------------------------------------------------------- +# Policy factory +# ----------------------------------------------------------------------------- + + +@lru_cache(maxsize=1) +def get_admin_engine() -> Engine: + """Build (and cache) the SQLAlchemy engine bound to ADMIN credentials. + + Distinct from the ingest credentials `build_mysql_config` uses for the + CLI subprocess: the ingest user (`om_user`) is a scoped account whose + GRANTs match the OM MySQL connector's documented minimum (SELECT, + SHOW VIEW, EXECUTE on the target schema; PROCESS globally). ADMIN + credentials are the container's `root` user, which the enforcer needs + for CREATE SCHEMA / CREATE TABLE / INSERT / SELECT. + + Tests that need to mutate the source out-of-band (e.g. drop a table + to test mark-deleted; create a poisoned view to test error + containment) consume this helper directly — keeps engine construction + centralized so admin DSN never lives in two places. + + E2E_MYSQL_ADMIN_USER / E2E_MYSQL_ADMIN_PASSWORD / E2E_MYSQL_HOST_PORT + are populated automatically by the session-scoped `mysql_container` + fixture in conftest.py, which boots a dedicated MySQL via + testcontainers and creates the scoped `om_user` post-startup. + Teammates do not set these vars manually. + """ + user = Env("E2E_MYSQL_ADMIN_USER", default="root").get() + password = Env("E2E_MYSQL_ADMIN_PASSWORD", default="password").get() + host_port = Env("E2E_MYSQL_HOST_PORT").get() + host, _, port_str = host_port.partition(":") + port = int(port_str) if port_str else None + url = URL.create( + drivername="mysql+pymysql", + username=user, + password=password, + host=host, + port=port, + ) + return create_engine(url) + + +@lru_cache(maxsize=1) +def get_policy() -> EnforcementPolicy: + """Lazy-build and cache the MySQL EnforcementPolicy. + + Reuses `get_admin_engine` for engine construction so the admin DSN + has a single source of truth. 
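+
+    Usage (as the conftest `mysql_source_ready` fixture consumes it):
+
+        run_source_baseline(get_policy, MYSQL_BASELINE, connector_name="mysql")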
+ """ + enforcer = MySqlEnforcer(get_admin_engine(), MYSQL_BASELINE) + return EnforcementPolicy(enforcer=enforcer, mode=EnforcementMode.APPLY) diff --git a/ingestion/tests/cli_e2e_v2/mysql/conftest.py b/ingestion/tests/cli_e2e_v2/mysql/conftest.py new file mode 100644 index 000000000000..60ec8e4a7107 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/mysql/conftest.py @@ -0,0 +1,231 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""MySQL-specific pytest fixtures. + +Pytest auto-discovers this conftest for tests under `tests/cli_e2e_v2/mysql/`. +Session-scoped `mysql_container` boots a dedicated MySQL via testcontainers +(no shared infra dependency, no teammate-managed admin creds), bootstraps +the `e2e` target schema, and creates a scoped ingest user `om_user` with +the production-minimum permissions documented for the OpenMetadata MySQL +connector. Subsequent fixtures consume that container. + +Two users live inside the container: + + - `root` (testcontainers default) — used by the framework's + ``SqlBaselineEnforcer`` to seed and reconcile the ``e2e`` schema + (CREATE TABLE / DROP / INSERT / SELECT). Ephemeral and disposable. + - ``om_user`` — the scoped ingest account whose GRANTs match the minimum + OM MySQL connector permissions: + + GRANT SELECT, SHOW VIEW, EXECUTE ON e2e.* TO 'om_user'@'%'; + GRANT PROCESS, SHOW_ROUTINE ON *.* TO 'om_user'@'%'; + + Used by the CLI metadata subprocess so ingestion is exercised against + a production-realistic privilege set, not against the framework's + DDL-capable admin user. + +The ``mysql_container`` fixture also populates the ``E2E_MYSQL_*`` environment +variables so the existing ``Env(key).ref()`` config-builder pattern keeps +rendering ``${E2E_MYSQL_*}`` placeholders into the workflow YAML — secrets +never leak to tmp_path even though they are now generated per-session. + +Filter tests that need isolated services do NOT use ``mysql_cfg`` or +``mysql_metadata_ingested`` — they call ``build_mysql_config(mysql_service_name( +session_uuid, variant="..."), om_server_config)`` directly and run their own +ingest with the variant filter config. + +Depends on ``session_uuid``, ``om_server_config``, and ``registered_services`` +fixtures from the top-level conftest.py. 
+""" + +from __future__ import annotations + +import os +from typing import TYPE_CHECKING + +import pytest +from sqlalchemy import create_engine, text +from testcontainers.mysql import MySqlContainer + +from ..core.config.pipelines import MetadataPipeline +from ..core.fixtures import metadata_ingest_once, run_source_baseline +from .baseline import MYSQL_BASELINE, get_admin_engine, get_policy +from .connector import build_mysql_config, mysql_service_name +from .expected import mysql_expected + +if TYPE_CHECKING: + from collections.abc import Callable, Generator + + from sqlalchemy.engine import Engine + + from ..core.config.builder import WorkflowConfig + from ..core.config.server import ServerConfig + from ..core.expected.types import ExpectedService + + +_INGEST_USER = "om_user" +_INGEST_PASSWORD = "om_password" +_TARGET_SCHEMA = "e2e" +_MYSQL_IMAGE = "mysql:8.0" + +_ENV_VARS = ( + "E2E_MYSQL_USER", + "E2E_MYSQL_PASSWORD", + "E2E_MYSQL_HOST_PORT", + "E2E_MYSQL_ADMIN_USER", + "E2E_MYSQL_ADMIN_PASSWORD", + "E2E_MYSQL_DATABASE", +) + + +@pytest.fixture(scope="session") +def mysql_container() -> Generator[MySqlContainer, None, None]: + """Boot a dedicated MySQL via testcontainers and bootstrap the OM-doc users. + + Creates ``e2e`` and a scoped ``om_user`` whose GRANTs match the minimum + OM MySQL connector documentation (SELECT, SHOW VIEW, EXECUTE on the + target schema; PROCESS globally for connection-test; SHOW_ROUTINE + globally so stored-procedure bodies are readable). Also populates + ``E2E_MYSQL_*`` environment variables for the rest of the session so + the existing ``Env(key).ref()`` YAML pattern is preserved unchanged. + """ + container = MySqlContainer(_MYSQL_IMAGE) + with container as running: + host = running.get_container_host_ip() + port = running.get_exposed_port(3306) + # MySqlContainer wires MYSQL_ROOT_PASSWORD to the same value as + # the user password, so `running.password` IS the root password. + root_url = f"mysql+pymysql://root:{running.password}@{host}:{port}/" + engine = create_engine(root_url) + try: + with engine.begin() as conn: + conn.execute(text(f"CREATE DATABASE IF NOT EXISTS {_TARGET_SCHEMA}")) + conn.execute(text(f"CREATE USER IF NOT EXISTS '{_INGEST_USER}'@'%' IDENTIFIED BY '{_INGEST_PASSWORD}'")) + conn.execute(text(f"GRANT SELECT, SHOW VIEW, EXECUTE ON {_TARGET_SCHEMA}.* TO '{_INGEST_USER}'@'%'")) + conn.execute(text(f"GRANT PROCESS, SHOW_ROUTINE ON *.* TO '{_INGEST_USER}'@'%'")) + conn.execute(text("FLUSH PRIVILEGES")) + finally: + engine.dispose() + + # Populate Env-readable vars from the running container so neither + # connector.py (Env(...).ref()) nor baseline.py:get_admin_engine + # (Env(...).get()) needs to know about testcontainers. + previous: dict[str, str | None] = {var: os.environ.get(var) for var in _ENV_VARS} + os.environ["E2E_MYSQL_USER"] = _INGEST_USER + os.environ["E2E_MYSQL_PASSWORD"] = _INGEST_PASSWORD + os.environ["E2E_MYSQL_HOST_PORT"] = f"{host}:{port}" + os.environ["E2E_MYSQL_ADMIN_USER"] = "root" + os.environ["E2E_MYSQL_ADMIN_PASSWORD"] = running.password + os.environ["E2E_MYSQL_DATABASE"] = _TARGET_SCHEMA + try: + yield running + finally: + for var, prev in previous.items(): + if prev is None: + os.environ.pop(var, None) + else: + os.environ[var] = prev + # Clear the lru_cache'd engine so a second pytest run in the + # same Python process rebuilds against a freshly booted + # container instead of reusing a stale URL. 
+ get_admin_engine.cache_clear() + get_policy.cache_clear() + + +@pytest.fixture(scope="session") +def mysql_service(session_uuid: str) -> str: + """Session-shared MySQL service name (``e2e_mysql_``). + + Eliminates ``service = mysql_service_name(session_uuid)`` from every + test body. Filter tests still build their own variant-named services + via ``mysql_service_name(session_uuid, variant=...)`` directly — this + fixture is only the default, session-shared name. + """ + return mysql_service_name(session_uuid) + + +@pytest.fixture(scope="module") +def mysql_expected_factory( + mysql_service: str, +) -> Callable[..., ExpectedService]: + """Factory for ExpectedService trees bound to the session's service name. + + Usage: ``mysql_expected_factory()`` returns the full expected catalog; + ``mysql_expected_factory(tables=[...])`` returns a projection (used by + filter tests to pass a pre-built expected tree into the differ). + """ + + def _factory(*, tables: list[str] | None = None) -> ExpectedService: + return mysql_expected(mysql_service, tables=tables) + + return _factory + + +@pytest.fixture(scope="session") +def mysql_admin_engine(mysql_container: MySqlContainer) -> Engine: + """Admin-credentials SQLAlchemy engine for tests that need to mutate + the source out-of-band (drop a baseline table to test mark-deleted, + create a poisoned view to test error containment, etc.). + + Shares the cached engine that ``get_policy()`` builds — single DSN + source of truth — and depends on ``mysql_container`` so the env vars + feeding ``get_admin_engine`` are populated before first use. + """ + return get_admin_engine() + + +@pytest.fixture(scope="session") +def mysql_source_ready(mysql_container: MySqlContainer) -> None: + """Reconcile MySQL source with MYSQL_BASELINE once per pytest session. + + Fires before any MySQL test runs because ``mysql_cfg`` (and test-local + variant configs) declare this as a dependency. Depends on + ``mysql_container`` so admin creds + schema exist before the enforcer + runs CREATE TABLE. + """ + run_source_baseline(get_policy, MYSQL_BASELINE, connector_name="mysql") + + +@pytest.fixture(scope="module") +def mysql_cfg( + om_server_config: ServerConfig, + mysql_service: str, + mysql_source_ready: None, +) -> WorkflowConfig: + """Default module-scoped MySQL config, using the session-shared service name. + + For tests that can share service state across the module (vanilla ingest, + profiler — both operate on the same ingested entities). Filter tests + should build their own variant-named config via build_mysql_config rather + than relying on this shared fixture. + """ + return build_mysql_config(mysql_service, om_server_config) + + +@pytest.fixture(scope="module") +def mysql_metadata_ingested( + tmp_path_factory: pytest.TempPathFactory, + mysql_cfg: WorkflowConfig, + mysql_service: str, + registered_services: list[str], +) -> None: + """Run the MySQL metadata CLI once per module against the shared service. + + Cuts ~6 redundant CLI subprocess runs per module pass. Tests that just + need entities ingested (profiler, lineage, classification, structural, + stored-procedure, descriptions) depend on this fixture instead of + invoking their own metadata ingest. 
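+
+    Hypothetical consumer (the dependency pattern test_mysql.py uses):
+
+        def test_something(om_client, mysql_metadata_ingested: None) -> None:
+            ...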
+ """ + metadata_ingest_once( + tmp_path_factory, + mysql_cfg, + registered_services, + service_name=mysql_service, + pipeline_options=MetadataPipeline( + includeStoredProcedures=True, + includeDDL=True, # parses view definitions for view->table lineage + ), + filter_kwargs={"schemas_include": ["e2e"]}, + label="mysql", + ) diff --git a/ingestion/tests/cli_e2e_v2/mysql/connector.py b/ingestion/tests/cli_e2e_v2/mysql/connector.py new file mode 100644 index 000000000000..0026a8ddbf16 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/mysql/connector.py @@ -0,0 +1,67 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Helpers for building MySQL WorkflowConfigs and deriving service names. + +Split out from conftest.py because pytest discourages importing from +conftest modules; filter tests need build_mysql_config to construct +variant-named services for isolation. + +Secrets handling: every env-backed YAML field uses Env(key).ref() — the +rendered cfg_*.yaml carries ${E2E_MYSQL_*} literal references, not real +credentials. Env's construction validates presence (raises EnvLoadError +at build time if a required var is unset). The metadata CLI expands the +references at subprocess load time via os.path.expandvars. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +from ..core.config.builder import WorkflowConfig +from ..core.config.env import Env + +if TYPE_CHECKING: + from ..core.config.server import ServerConfig + + +def mysql_service_name(session_uuid: str, variant: str = "") -> str: + """Build the MySQL service name for a given pytest session and optional variant. + + Default variant "" returns the session-shared service (used by tests that + accept shared state across the test module, e.g. vanilla ingest + profiler). + + A non-empty variant (e.g., "filter_inc") produces a sibling service + (e.g., e2e_mysql_abc123_filter_inc) — filter tests use this for isolation + so prior-test residue doesn't pollute "extras" assertions. + """ + base = f"e2e_mysql_{session_uuid}" + return f"{base}_{variant}" if variant else base + + +def build_mysql_config(service_name: str, server: ServerConfig) -> WorkflowConfig: + """Build a base MySQL WorkflowConfig with the given service name. + + All env-backed fields emit ${E2E_MYSQL_*} references. Presence validation + happens in Env's constructor; missing required vars raise EnvLoadError at + build time with a clear message. Real values never enter the dict. + + E2E_MYSQL_DATABASE is optional — instance constructs without raising; + the field is added to the config only when the env var is actually set. 
+ """ + service_connection: dict = { + "type": "Mysql", + "username": Env("E2E_MYSQL_USER").ref(), + "authType": {"password": Env("E2E_MYSQL_PASSWORD").ref()}, + "hostPort": Env("E2E_MYSQL_HOST_PORT").ref(), + } + db = Env("E2E_MYSQL_DATABASE", required=False) + if db.get(): + service_connection["databaseSchema"] = db.ref() + + return WorkflowConfig.build( + source_type="mysql", + service_name=service_name, + service_connection=service_connection, + server=server, + ) diff --git a/ingestion/tests/cli_e2e_v2/mysql/enforcer.py b/ingestion/tests/cli_e2e_v2/mysql/enforcer.py new file mode 100644 index 000000000000..e77d07a5647e --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/mysql/enforcer.py @@ -0,0 +1,45 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""MySQL dialect specifics for SqlBaselineEnforcer. + +Introspection + table DDL (CREATE TABLE + FK + COMMENT) live in the base +via SQLAlchemy Inspector + `metadata.create_all`. This subclass supplies +only what Core doesn't model: + - stored-procedure listing (`INFORMATION_SCHEMA.ROUTINES`) + - DROP + CREATE for procedures (MySQL has no CREATE OR REPLACE PROCEDURE) +""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING + +from sqlalchemy import create_engine, text + +from ..core.source.sql_enforcer import SqlBaselineEnforcer + +if TYPE_CHECKING: + from sqlalchemy.engine import URL, Connection + + from ..core.source.sql import SqlSourceBaseline, StoredProcedureDefinition + +logger = logging.getLogger(__name__) + + +class MySqlEnforcer(SqlBaselineEnforcer): + _stored_procedure_query_sql = ( + "SELECT ROUTINE_SCHEMA, ROUTINE_NAME " + "FROM INFORMATION_SCHEMA.ROUTINES " + "WHERE ROUTINE_SCHEMA IN :schemas AND ROUTINE_TYPE = 'PROCEDURE'" + ) + + @classmethod + def from_url(cls, url: str | URL, baseline: SqlSourceBaseline) -> MySqlEnforcer: + """Construct with a SQLAlchemy engine built from a connection URL.""" + return cls(create_engine(url), baseline) + + def _apply_stored_procedure(self, conn: Connection, sp: StoredProcedureDefinition) -> None: + logger.debug("[mysql] DROP+CREATE PROCEDURE %s.%s", sp.schema, sp.name) + conn.execute(text(f"DROP PROCEDURE IF EXISTS {sp.schema}.{sp.name}")) + conn.execute(text(sp.definition_sql)) diff --git a/ingestion/tests/cli_e2e_v2/mysql/expected.py b/ingestion/tests/cli_e2e_v2/mysql/expected.py new file mode 100644 index 000000000000..bcb5845bed7d --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/mysql/expected.py @@ -0,0 +1,129 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""Expected OM-side catalog for the MySQL baseline. + +Derived from `MYSQL_BASELINE.metadata` via `MYSQL_TYPE_MAP`. The hand-authored +column lists disappear — column types, descriptions, constraints, and +primary keys all come off the SQLAlchemy Column declarations in +`core/source/common_baseline.py` + `mysql/baseline.py`. + +Views aren't in MetaData (they're raw SQL), so the view's ExpectedTable is +appended manually. Stored procedures are passed in as a hand-authored list +to `derive_expected_service`. + +Entries in `MYSQL_TYPE_MAP` that may need adjustment after Task 25's live +run are marked inline. 
+""" + +from __future__ import annotations + +from sqlalchemy import Boolean +from sqlalchemy.dialects import mysql + +from metadata.generated.schema.entity.data.table import DataType +from metadata.generated.schema.entity.services.databaseService import ( + DatabaseServiceType, +) + +from ..core.expected.derive import derive_expected_service +from ..core.expected.type_map import CORE_TYPE_MAP, TypeMap +from ..core.expected.types import ( + ExpectedColumn, + ExpectedService, + ExpectedStoredProcedure, + ExpectedTable, +) +from .baseline import MYSQL_BASELINE + +# ----------------------------------------------------------------------------- +# MYSQL_TYPE_MAP — extends CORE with MySQL dialect classes + Boolean override. +# Entries flagged TASK25 may need correction after the first live ingest. +# ----------------------------------------------------------------------------- + +MYSQL_TYPE_MAP: TypeMap = { + **CORE_TYPE_MAP, + # Core overrides (dialect behaves differently than the generic mapping). + Boolean: DataType.TINYINT, # MySQL stores BOOL as TINYINT(1); TASK25 + # Integer variants — MRO walks through Integer first, so we must + # override before it resolves to DataType.INT. + mysql.TINYINT: DataType.TINYINT, + mysql.MEDIUMINT: DataType.INT, # no MEDIUMINT in OM DataType + # Float variants — same reasoning; mysql.DOUBLE extends Float. + mysql.DOUBLE: DataType.DOUBLE, + # String-family size variants — mysql.MEDIUMTEXT / LONGTEXT / TINYTEXT + # extend `_StringType`, which MRO-walks to String (not Text), so CORE's + # `String → VARCHAR` would give the wrong answer without these entries. + mysql.TINYTEXT: DataType.TEXT, # no TINYTEXT in OM DataType + mysql.MEDIUMTEXT: DataType.MEDIUMTEXT, + mysql.LONGTEXT: DataType.TEXT, # LONGTEXT absent from enum; TASK25 + # Binary-family — mysql.BINARY / VARBINARY / *BLOB extend `_Binary`, + # which MRO skips past `LargeBinary`, so CORE's `LargeBinary → BLOB` + # doesn't help the binary/varbinary/tiny/medium/long variants. + mysql.BINARY: DataType.BINARY, + mysql.VARBINARY: DataType.VARBINARY, + mysql.TINYBLOB: DataType.BLOB, # no TINYBLOB in OM DataType + mysql.MEDIUMBLOB: DataType.MEDIUMBLOB, + mysql.LONGBLOB: DataType.LONGBLOB, + # Dialect-only types with no generic SQLAlchemy parent in CORE. + mysql.YEAR: DataType.YEAR, + mysql.BIT: DataType.BIT, + mysql.SET: DataType.SET, + # NOTE: mysql.JSON / mysql.ENUM / mysql.BLOB / mysql.TIMESTAMP are + # resolved via CORE_TYPE_MAP through the MRO walk (see type_map.py). + # mysql.VARCHAR / mysql.CHAR / mysql.TEXT likewise — no entries needed. +} + + +def mysql_expected( + service_name: str, + *, + tables: list[str] | None = None, +) -> ExpectedService: + """Build the expected MySQL catalog for a given service name. + + Structural portion (tables + columns + types + PKs + comments) is + derived from `MYSQL_BASELINE.metadata`. The view and stored procedure + are appended since neither lives in MetaData. + + `tables=None` -> full catalog. `tables=[...]` -> only the named tables + survive (used by filter tests with MatchMode.STRICT). 
+ """ + expected = derive_expected_service( + service_name=service_name, + service_type=DatabaseServiceType.Mysql, + metadata=MYSQL_BASELINE.metadata, + type_map=MYSQL_TYPE_MAP, + database="default", + views=[_expected_customer_txn_summary_view()], + stored_procedures=[ + ExpectedStoredProcedure(name="sp_active_customer_count"), + ExpectedStoredProcedure(name="sp_update_customer_status"), + ], + ) + + if tables is not None: + kept = set(tables) + schema = expected.databases[0].schemas[0] + schema.tables[:] = [t for t in schema.tables if t.name in kept] + + return expected + + +def _expected_customer_txn_summary_view() -> ExpectedTable: + """View treated as Table entity (OM uses tableType=View). + + View columns are declared manually since the view body is raw SQL and + not in our SQLAlchemy MetaData. MySQL's `COUNT(*)` returns BIGINT; + `COALESCE(SUM(DECIMAL), 0)` returns DECIMAL. + """ + return ExpectedTable( + name="customer_txn_summary", + columns=[ + ExpectedColumn("customer_id", DataType.INT), + ExpectedColumn("full_name", DataType.VARCHAR), + ExpectedColumn("customer_status", DataType.VARCHAR), + ExpectedColumn("txn_count", DataType.BIGINT), + ExpectedColumn("total_amount", DataType.DECIMAL), + ], + ) diff --git a/ingestion/tests/cli_e2e_v2/mysql/test_mysql.py b/ingestion/tests/cli_e2e_v2/mysql/test_mysql.py new file mode 100644 index 000000000000..c59065820e75 --- /dev/null +++ b/ingestion/tests/cli_e2e_v2/mysql/test_mysql.py @@ -0,0 +1,530 @@ +# Copyright 2026 Collate +# Licensed under the Collate Community License, Version 1.0 (the "License"); +# you may not use this file except in compliance with the License. +"""MySQL pilot — CLI E2E v2 tests. + +Exercises the v2 framework end-to-end against a MySQL source. Covers all +the pipelines the MVP ships (metadata, profiler, auto-classification, and +view lineage via SQL parsing; DQ deferred to post-MVP) plus four filter +scenarios, FK/description coverage, mark-deleted re-ingest, error +containment, and column-level lineage. + +Lineage note: MySQL FK constraints produce TableConstraint entries on the +table entity, not lineage edges (see `project-mysql-fk-no-lineage.md`). +The only real lineage MySQL surfaces is view-to-table lineage derived from +parsing the view definition SQL. The FK assertion targets +`tableConstraints`, not upstream edges. + +Module-scoped `mysql_metadata_ingested` runs the metadata CLI once for +tests that consume the shared service — profiler, lineage, +classification, structural, description, FK. That fixture also registers +the service name for session-end cleanup. + +Tests that mutate the source state (mark-deleted, error containment) use +their own isolated services and clean up after themselves so they +don't perturb the shared fixture. 
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest +from sqlalchemy import text + +from metadata.generated.schema.configuration.profilerConfiguration import MetricType + +from ..core.config.pipelines import ( + AutoClassificationPipeline, + LineagePipeline, + MetadataPipeline, + ProfilerPipeline, +) +from ..core.expected.differ import MatchMode, assert_service_matches +from ..core.filter_scenarios import ( + COMMON_FILTER_SCENARIOS, + FilterScenario, + expected_tables_for, +) +from .connector import build_mysql_config, mysql_service_name +from .expected import mysql_expected + +if TYPE_CHECKING: + from collections.abc import Callable + + from sqlalchemy.engine import Engine + + from ..core.config.builder import WorkflowConfig + from ..core.config.server import ServerConfig + from ..core.expected.types import ExpectedService + from ..core.fluent.om_client import OmClient + from ..core.runner.cli_runner import CliRunner + +# --------------------------------------------------------------------------- +# Structural (metadata pipeline) — full Expected* tree walk +# --------------------------------------------------------------------------- + + +def test_vanilla_ingest_structural( + om_client: OmClient, + mysql_expected_factory: Callable[..., ExpectedService], + # `mysql_metadata_ingested: None` is a pytest idiom: declaring the + # fixture as a parameter triggers its setup side-effect (here, running + # the metadata CLI once per module). The value itself is always None. + # Every test below that asserts on OM state does the same. + mysql_metadata_ingested: None, +) -> None: + """Metadata ingest produces the declared OM catalog (SUPERSET). + + Walks the full Expected* tree — table structure, every column's + DataType, constraints, descriptions, stored procedures. Subsumes + per-column type spot-checks and per-entity count smoke tests, so we + don't repeat those at the test-function level. + """ + assert_service_matches(mysql_expected_factory(), om_client) + + +# --------------------------------------------------------------------------- +# Profiler — exhaustive metric coverage on representative columns +# --------------------------------------------------------------------------- + + +# Explicit "compute all stat-type metrics" list. Default profiler +# metrics (`get_default_metrics` in OM) omit minLength/maxLength so +# string-length stats come back None — passing this list overrides the +# default. We exclude parameterized metrics (countInSet, *LikeCount, +# regexCount, etc.) that need user-supplied values; they're applicable +# for DQ-style checks, not the "compute everything we can off raw +# rows" coverage this test wants. 
+_ALL_PROFILER_METRICS: list[MetricType] = [ + # Table-level + MetricType.rowCount, + MetricType.columnCount, + MetricType.columnNames, + # Column counts / proportions + MetricType.valuesCount, + MetricType.nullCount, + MetricType.nullProportion, + MetricType.distinctCount, + MetricType.distinctProportion, + MetricType.uniqueCount, + MetricType.uniqueProportion, + MetricType.duplicateCount, + # Numeric stats + MetricType.min, + MetricType.max, + MetricType.mean, + MetricType.sum, + MetricType.stddev, + MetricType.median, + MetricType.firstQuartile, + MetricType.thirdQuartile, + MetricType.interQuartileRange, + MetricType.nonParametricSkew, + MetricType.histogram, + # String stats + MetricType.minLength, + MetricType.maxLength, +] + + +def test_profiler_metrics( + cli_runner: CliRunner, + om_client: OmClient, + mysql_cfg: WorkflowConfig, + mysql_service: str, + mysql_metadata_ingested: None, +) -> None: + """Profiler emits the full metric suite — table-level + per-column. + + Lean: ONE pipeline run, multiple assertions off the produced state. + Exhaustive: covers numeric + string + count metric paths in one pass. + + Scope: + - Table-level rowCount on three seeded tables. + - Numeric metrics on `customers.credit_score` (deterministic ints + 720, 680, 650, 750, 600 → min=600, max=750, mean=680, sum=3400, + median=680, distinct=5, unique=5, null=0). + - String length metrics on `customers.first_name` (5 values, min + length 3 ("Bob"/"Eve"), max length 7 ("Charlie")). + + Pipeline runs with `metrics=_ALL_PROFILER_METRICS` so OM doesn't + fall through to the default-set which omits minLength/maxLength. + """ + status = cli_runner.run( + mysql_cfg.pipeline(ProfilerPipeline(metrics=_ALL_PROFILER_METRICS)).with_filter(schemas_include=["e2e"]) + ) + assert status.success, f"profiler failures: {status.all_failures}" + + customers_fqn = f"{mysql_service}.default.e2e.customers" + transactions_fqn = f"{mysql_service}.default.e2e.transactions" + all_types_fqn = f"{mysql_service}.default.e2e.all_types" + + # Table-level row counts (seeded determinism). + om_client.table(customers_fqn).profile.eventually().row_count().equals(5) + om_client.table(transactions_fqn).profile.eventually().row_count().equals(5) + om_client.table(all_types_fqn).profile.eventually().row_count().equals(3) + + # Numeric column — credit_score sorted: [600, 650, 680, 720, 750]. + # min=600, max=750, mean=680, sum=3400, distinct=5, unique=5, null=0, + # median=680 (textbook middle of 5-element sample). + om_client.table(customers_fqn).profile.eventually().column("credit_score").has_metrics( + valuesCount=5, + nullCount=0, + distinctCount=5, + uniqueCount=5, + min=600, + max=750, + mean=680, + sum=3400, + median=680, + ) + + # String column — first_name: Alice(5), Bob(3), Charlie(7), Diana(5), Eve(3). + om_client.table(customers_fqn).profile.eventually().column("first_name").has_metrics( + valuesCount=5, + nullCount=0, + minLength=3, + maxLength=7, + ) + + +# --------------------------------------------------------------------------- +# Stored procedures — body content (presence covered by structural walk) +# --------------------------------------------------------------------------- + + +def test_stored_procedure_bodies( + om_client: OmClient, + mysql_service: str, + mysql_metadata_ingested: None, +) -> None: + """Both SP bodies survive ingestion intact. 
+
+    Existence of each SP is asserted by the structural walk via
+    `ExpectedStoredProcedure` entries; this test adds the body-content
+    coverage that the structural walk doesn't do (and exercises the
+    parameterized-SP code path on `sp_update_customer_status`).
+    """
+    base = f"{mysql_service}.default.e2e"
+
+    om_client.stored_procedure(f"{base}.sp_active_customer_count").has_code_containing("SELECT COUNT(*)")
+
+    # Parameterized SP with DML body — different code path than the
+    # parameterless SELECT-only procedure above.
+    sp_update = om_client.stored_procedure(f"{base}.sp_update_customer_status")
+    sp_update.has_code_containing("p_customer_id")
+    sp_update.has_code_containing("UPDATE")
+
+
+# ---------------------------------------------------------------------------
+# Lineage — table-level + column-level + schemaDefinition
+# ---------------------------------------------------------------------------
+
+
+def test_lineage_view_references_tables(
+    cli_runner: CliRunner,
+    om_client: OmClient,
+    mysql_cfg: WorkflowConfig,
+    mysql_service: str,
+    mysql_metadata_ingested: None,
+) -> None:
+    """View → base-table lineage (table-level + column-level) and view DDL.
+
+    The view's `schemaDefinition` is the prerequisite for OM's SQL parser
+    to produce lineage at all — assert it's present BEFORE asserting the
+    parsed edges, so a "DDL didn't plumb through" regression points at
+    the right root cause instead of looking like a parser bug.
+    """
+    view_fqn = f"{mysql_service}.default.e2e.customer_txn_summary"
+    customers_fqn = f"{mysql_service}.default.e2e.customers"
+    transactions_fqn = f"{mysql_service}.default.e2e.transactions"
+
+    # Prereq: includeDDL=True actually plumbed the CREATE VIEW body into OM.
+    om_client.table(view_fqn).has_schema_definition_containing("LEFT JOIN")
+
+    status = cli_runner.run(
+        mysql_cfg.pipeline(
+            # processQueryLineage defaults True and needs SELECT on
+            # mysql.general_log (the general query log table) — a privilege
+            # the scoped ingest user deliberately doesn't hold. View lineage
+            # is what we care about; disable the query-log path.
+            LineagePipeline(processQueryLineage=False)
+        ).with_filter(schemas_include=["e2e"])
+    )
+    assert status.success, f"lineage failures: {status.all_failures}"
+
+    # Table-level lineage edges.
+    om_client.table(view_fqn).lineage.eventually().has_upstream(customers_fqn)
+    om_client.table(view_fqn).lineage.eventually().has_upstream(transactions_fqn)
+
+    # Column-level lineage — proves the SQL parser actually parsed,
+    # not just that "some lineage edge was emitted" via a fallback.
+    # `customer_id` is `c.id AS customer_id` (identity); `total_amount`
+    # is `COALESCE(SUM(t.amount), 0)` (aggregate over transactions.amount).
+    om_client.table(view_fqn).lineage.eventually().has_column_lineage(source="customers.id", target="customer_id")
+    om_client.table(view_fqn).lineage.eventually().has_column_lineage(
+        source="transactions.amount", target="total_amount"
+    )
+
+
+# ---------------------------------------------------------------------------
+# Foreign key TableConstraint (no lineage edge for MySQL)
+# ---------------------------------------------------------------------------
+
+
+def test_transactions_foreign_key_constraint(
+    om_client: OmClient,
+    mysql_service: str,
+    mysql_metadata_ingested: None,
+) -> None:
+    """FK on transactions.customer_id -> customers.id lands as TableConstraint.
+ + Uses eventually because OM processes FK constraints as a post-ingest + PATCH (connector iterates tables, defers FK when referenced table isn't + yet in OM, then patches at end). + """ + transactions_fqn = f"{mysql_service}.default.e2e.transactions" + om_client.table(transactions_fqn).eventually(60).has_foreign_key_constraint( + column="customer_id", + referenced_table="customers", + referenced_column="id", + ) + + +# --------------------------------------------------------------------------- +# Auto-classification (PII via column-name regex) + negative assertion +# --------------------------------------------------------------------------- + + +def test_auto_classification_tags_pii_columns( + cli_runner: CliRunner, + om_client: OmClient, + mysql_cfg: WorkflowConfig, + mysql_service: str, + mysql_metadata_ingested: None, +) -> None: + """Auto-classification tags PII columns AND leaves non-PII columns alone. + + Positive: `email` and `date_of_birth` get the expected PII tags. + Negative: `id` and `status` stay untagged — guards against a + "classifier became trigger-happy" regression that would silently + pass a positive-only suite. + """ + status = cli_runner.run( + mysql_cfg.pipeline( + AutoClassificationPipeline( + storeSampleData=True, + enableAutoClassification=True, + # Lowered from default 80; with only 5 seed rows per + # column the combined score sits at the edge of 80%. + # 60 aligns with PII's server-side `minimumConfidence`. + confidence=60, + ) + ).with_filter(schemas_include=["e2e"]) + ) + assert status.success, f"auto-classification failures: {status.all_failures}" + + customers_fqn = f"{mysql_service}.default.e2e.customers" + + # Positive — deterministic regex-based recognizers. + om_client.table(customers_fqn).column("email").has_tag("PII.Sensitive") + om_client.table(customers_fqn).column("date_of_birth").has_tag("PII.NonSensitive") + + # Negative — primary key and status enum should never be PII-flagged. + # Catches regressions where the classifier becomes overconfident on + # column-name matching across non-PII columns. + om_client.table(customers_fqn).column("id").has_no_tag("PII.Sensitive") + om_client.table(customers_fqn).column("id").has_no_tag("PII.NonSensitive") + om_client.table(customers_fqn).column("status").has_no_tag("PII.Sensitive") + om_client.table(customers_fqn).column("status").has_no_tag("PII.NonSensitive") + + +# --------------------------------------------------------------------------- +# Mark-deleted on re-ingest +# --------------------------------------------------------------------------- + + +def test_mark_deleted_tables_on_reingest( + cli_runner: CliRunner, + om_client: OmClient, + om_server_config: ServerConfig, + session_uuid: str, + registered_services: list[str], + mysql_admin_engine: Engine, + mysql_source_ready: None, +) -> None: + """Dropping a source table + re-ingesting marks the OM entity deleted. + + Lifecycle, end-to-end: + 1. Ingest baseline → all_types present in OM, deleted=False. + 2. Drop e2e.all_types via admin engine (out-of-band of the framework). + 3. Re-ingest with markDeletedTables=True (fixture default). + 4. Assert all_types now has deleted=True in OM. + 5. Restore e2e.all_types via the baseline policy (apply re-runs full + baseline DDL + seeds — idempotent CREATE IF NOT EXISTS path). + + Uses an isolated service so the shared `mysql_metadata_ingested` + fixture's catalog is untouched. 
+ """ + service = mysql_service_name(session_uuid, variant="mark_deleted") + registered_services.append(service) + cfg = build_mysql_config(service, om_server_config) + pipeline_options = MetadataPipeline( + markDeletedTables=True, + includeStoredProcedures=False, # not needed for this test; cuts run time + ) + + all_types_fqn = f"{service}.default.e2e.all_types" + + # Phase 1: initial ingest — all_types present, alive. + status = cli_runner.run(cfg.pipeline(pipeline_options).with_filter(schemas_include=["e2e"])) + assert status.success, f"initial ingest: {status.all_failures}" + om_client.table(all_types_fqn).is_not_deleted() + + # Phase 2: drop the source table — out-of-band mutation via admin engine. + with mysql_admin_engine.begin() as conn: + conn.execute(text("DROP TABLE e2e.all_types")) + + try: + # Phase 3: re-ingest — markDeletedTables flips the entity to deleted=True. + status = cli_runner.run(cfg.pipeline(pipeline_options).with_filter(schemas_include=["e2e"])) + assert status.success, f"re-ingest after drop: {status.all_failures}" + + # Phase 4: verify the soft-delete landed on the OM entity. + om_client.table(all_types_fqn).eventually(30).is_soft_deleted() + finally: + # Phase 5: restore the source so subsequent test sessions start + # from a clean baseline. The policy's apply() is idempotent — + # CREATE TABLE IF NOT EXISTS + the seed insert template's + # ON DUPLICATE KEY UPDATE handle the re-create path. + from .baseline import get_policy + + get_policy().enforcer.apply([]) + + +# --------------------------------------------------------------------------- +# Error containment — one broken view doesn't tank the rest of ingest +# --------------------------------------------------------------------------- + + +def test_error_containment_one_broken_view( + cli_runner: CliRunner, + om_client: OmClient, + om_server_config: ServerConfig, + session_uuid: str, + registered_services: list[str], + mysql_admin_engine: Engine, + mysql_source_ready: None, +) -> None: + """A broken view doesn't abort the whole metadata pipeline. + + Setup: create a helper table, create a view referencing one of its + columns, then DROP that column. The view becomes "invalid" — MySQL + blocks DESCRIBE on it but SHOW CREATE VIEW still works. OM's + metadata ingestion should: + - successfully ingest customers, transactions, all_types + - log an error / fail on the broken view + - NOT crash the whole workflow + + Uses an isolated service so the broken view doesn't pollute other + tests' OM state. Cleans up the source-side artefacts in `finally`. + """ + service = mysql_service_name(session_uuid, variant="error_containment") + registered_services.append(service) + cfg = build_mysql_config(service, om_server_config) + + # Phase 1: synthesize a broken view. + with mysql_admin_engine.begin() as conn: + conn.execute( + text("CREATE TABLE IF NOT EXISTS e2e._helper_for_broken_view (id INT PRIMARY KEY, doomed_col INT)") + ) + conn.execute( + text("CREATE OR REPLACE VIEW e2e._broken_view AS SELECT id, doomed_col FROM e2e._helper_for_broken_view") + ) + conn.execute(text("ALTER TABLE e2e._helper_for_broken_view DROP COLUMN doomed_col")) + # _broken_view now references a non-existent column — DESCRIBE fails. + + try: + # Phase 2: run metadata ingest. Don't assert success — the broken + # view is expected to surface as a step error. Run the pipeline + # and inspect what landed. 
+ try: + status = cli_runner.run( + cfg.pipeline(MetadataPipeline(includeStoredProcedures=False)).with_filter(schemas_include=["e2e"]) + ) + except Exception: + status = None + + # Phase 3: regardless of overall status, the unaffected baseline + # tables must be in OM. That's the whole point of "error + # containment" — one bad apple doesn't drop the rest. + for table in ("customers", "transactions", "all_types"): + om_client.table(f"{service}.default.e2e.{table}").eventually(30).exists() + + # Phase 4: optionally check the broken view was either (a) reported + # as a failure in the status JSON or (b) ingested with no columns. + # We accept either outcome — the key invariant is that the rest + # of the catalog made it. + if status is not None and status.all_failures: + failure_text = " ".join(str(f.get("error", "")) for f in status.all_failures).lower() + assert "_broken_view" in failure_text or "doomed_col" in failure_text or "invalid" in failure_text, ( + f"broken view didn't surface in failures: {status.all_failures}" + ) + finally: + # Cleanup: drop the synthetic objects. + with mysql_admin_engine.begin() as conn: + conn.execute(text("DROP VIEW IF EXISTS e2e._broken_view")) + conn.execute(text("DROP TABLE IF EXISTS e2e._helper_for_broken_view")) + + +# --------------------------------------------------------------------------- +# Filter scenarios — isolated services, STRICT mode catches "extras" +# --------------------------------------------------------------------------- + + +# Per-variant expected-tables for this connector's baseline. The common +# baseline (customers, transactions) is present in every variant unless +# excluded; dialect-specific tables (all_types) and the view +# (customer_txn_summary) are MySQL-only and listed here. +_EXPECTED_TABLES_BY_VARIANT: dict[str, list[str] | None] = { + "inc_exact": ["customers"], + "exc_exact": ["customers", "all_types", "customer_txn_summary"], + "sch_inc": None, # None = full baseline + "regex_prio": ["customers"], +} + + +@pytest.mark.parametrize("scenario", COMMON_FILTER_SCENARIOS, ids=lambda s: s.id) +def test_filter( + scenario: FilterScenario, + cli_runner: CliRunner, + om_client: OmClient, + om_server_config: ServerConfig, + session_uuid: str, + registered_services: list[str], + mysql_source_ready: None, +) -> None: + """Filter patterns — include exact / exclude exact / schema include / + regex include+exclude with exclude priority. + + Each variant builds an isolated service so STRICT-mode extras detection + doesn't cross-contaminate. Expected-tables for this connector's + baseline live in `_EXPECTED_TABLES_BY_VARIANT` above. 
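+
+    Illustrative shape (an assumption for this docstring; the real
+    patterns live in COMMON_FILTER_SCENARIOS): "inc_exact" includes only
+    the literal table name "customers", so under STRICT matching exactly
+    ["customers"] may land in OM and any extra table fails the run.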
+    """
+    expected_tables = expected_tables_for(scenario, _EXPECTED_TABLES_BY_VARIANT, connector="mysql")
+
+    service = mysql_service_name(session_uuid, variant=f"filter_{scenario.variant}")
+    registered_services.append(service)
+
+    cfg = build_mysql_config(service, om_server_config)
+    status = cli_runner.run(
+        cfg.pipeline(MetadataPipeline(includeStoredProcedures=True)).with_filter(**scenario.filter_kwargs)
+    )
+    assert status.success, f"filter[{scenario.variant}] failures: {status.all_failures}"
+
+    assert_service_matches(
+        mysql_expected(service, tables=expected_tables),
+        om_client,
+        mode=MatchMode.STRICT,
+    )
diff --git a/ingestion/tests/unit/workflow/test_base_workflow.py b/ingestion/tests/unit/workflow/test_base_workflow.py
index 14d73fe6aec2..f58f247963fa 100644
--- a/ingestion/tests/unit/workflow/test_base_workflow.py
+++ b/ingestion/tests/unit/workflow/test_base_workflow.py
@@ -12,6 +12,7 @@
 Validate the logic and status handling of the base workflow
 """
 
+import json
 from typing import Iterable, Tuple  # noqa: UP035
 from unittest import TestCase
 from unittest.mock import MagicMock, patch
@@ -107,6 +108,20 @@ def close(self) -> None:
         """Nothing to do"""
 
 
+class OkSink(Sink):
+    """Sink that never produces failures — every element succeeds."""
+
+    def _run(self, element: int) -> Either:
+        return Either(right=element)
+
+    @classmethod
+    def create(cls, _: dict, __: OpenMetadataConnection) -> "OkSink":
+        return cls()
+
+    def close(self) -> None:
+        """Nothing to do"""
+
+
 class SimpleWorkflow(IngestionWorkflow):
     """
     Simple Workflow for testing
@@ -118,6 +133,14 @@ def set_steps(self):
         self.steps: Tuple[Step] = (SimpleSink(),)  # noqa: UP006
 
 
+class OkWorkflow(IngestionWorkflow):
+    """Workflow wired to OkSink — produces zero failures."""
+
+    def set_steps(self):
+        self.source = SimpleSource()
+        self.steps: tuple[Step] = (OkSink(),)
+
+
 class BrokenWorkflow(IngestionWorkflow):
     """
     Simple Workflow for testing
@@ -235,3 +258,63 @@
 
         mock_print_status.assert_called_once()
         mock_stop.assert_called_once()
+
+
+def test_write_status_file_writes_expected_shape(tmp_path):
+    workflow = OkWorkflow(config=config)
+    workflow.execute()
+
+    status_file = tmp_path / "status.json"
+    workflow.write_status_file(status_file)
+
+    assert status_file.exists()
+    payload = json.loads(status_file.read_text())
+
+    assert payload["pipeline_type"] == "simple"
+    assert payload["ingestion_pipeline_fqn"] is None
+    assert payload["success"] is True
+    assert isinstance(payload["steps"], list)
+    assert len(payload["steps"]) >= 2
+
+
+def test_write_status_file_reports_failure_shape_with_sink_errors(tmp_path):
+    workflow = SimpleWorkflow(config=config)
+    workflow.execute()
+
+    status_file = tmp_path / "status.json"
+    workflow.write_status_file(status_file)
+
+    payload = json.loads(status_file.read_text())
+
+    assert isinstance(payload["steps"], list)
+    assert len(payload["steps"]) >= 2
+    sink_steps_with_failures = [s for s in payload["steps"] if s.get("failures")]
+    assert len(sink_steps_with_failures) >= 1
+    # SimpleSink fails 1 of 5 records (80% success, below the default 90% threshold)
+    assert payload["success"] is False
+
+
+def test_write_status_file_reports_failure_when_source_fails(tmp_path):
+    workflow = BrokenWorkflow(config=config)
+    workflow.execute()
+
+    status_file = tmp_path / "status.json"
+    workflow.write_status_file(status_file)
+
+    payload = json.loads(status_file.read_text())
+
+    # BrokenSource yields non-Either values → source failures → result_status FAILURE
+    assert payload["success"] is False
+
+
+def test_write_status_file_includes_ingestion_pipeline_fqn(tmp_path):
+    fqn_config = config.model_copy(update={"ingestionPipelineFQN": "test_service.test_pipeline"})
+    workflow = SimpleWorkflow(config=fqn_config)
+    workflow.execute()
+
+    status_file = tmp_path / "status.json"
+    workflow.write_status_file(status_file)
+
+    payload = json.loads(status_file.read_text())
+
+    assert payload["ingestion_pipeline_fqn"] == "test_service.test_pipeline"
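+
+
+# To run just these status-file tests locally (from ingestion/):
+#   pytest tests/unit/workflow/test_base_workflow.py -k write_status_file -v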