|
| 1 | +# |
| 2 | +# Copyright (c) 2023 Airbyte, Inc., all rights reserved. |
| 3 | +# |
| 4 | + |
| 5 | +from __future__ import annotations |
| 6 | + |
| 7 | +import logging |
| 8 | +from unittest.mock import MagicMock, patch |
| 9 | + |
| 10 | +import pytest |
| 11 | +from source_s3.v4.availability_strategy import SourceS3AvailabilityStrategy |
| 12 | +from source_s3.v4.config import S3FileBasedStreamConfig |
| 13 | + |
| 14 | +from airbyte_cdk import AirbyteTracedException |
| 15 | +from airbyte_cdk.sources.file_based.exceptions import CheckAvailabilityError, FileBasedSourceError |
| 16 | +from airbyte_cdk.sources.file_based.file_types.parquet_parser import ParquetParser |
| 17 | +from airbyte_cdk.sources.file_based.remote_file import RemoteFile |
| 18 | + |
| 19 | + |
| 20 | +logger = logging.getLogger("test") |
| 21 | + |
| 22 | + |
| 23 | +# --------------------------------------------------------------------------- |
| 24 | +# Helpers |
| 25 | +# --------------------------------------------------------------------------- |
| 26 | + |
| 27 | + |
def _make_stream(skip_full_check_for_parquet: bool = False, parser=None, files=None):
    """Return a ``MagicMock`` stream exposing only what these tests configure.

    Args:
        skip_full_check_for_parquet: Value stored on the mocked stream config.
        parser: Object returned by ``stream.get_parser()``. When ``None``, a
            mock spec'd on ``ParquetParser`` is created with
            ``file_read_mode == "rb"`` and a passing ``check_config``.
        files: Optional iterable; when given, ``stream.get_files()`` returns
            a single iterator over it.

    Returns:
        The configured mock stream, named ``"test_stream"``.
    """
    if parser is None:
        # Default to a parser mock that passes isinstance checks against ParquetParser.
        parser = MagicMock(spec=ParquetParser)
        parser.file_read_mode = "rb"
        parser.check_config.return_value = (True, None)

    config_mock = MagicMock(spec=S3FileBasedStreamConfig)
    config_mock.skip_full_check_for_parquet = skip_full_check_for_parquet

    mock_stream = MagicMock()
    mock_stream.name = "test_stream"
    mock_stream.config = config_mock
    mock_stream.get_parser.return_value = parser

    if files is not None:
        # NOTE: a single iterator is shared by every get_files() call.
        mock_stream.get_files.return_value = iter(files)

    return mock_stream
| 45 | + |
| 46 | + |
def _make_remote_file(uri: str = "s3://bucket/data.parquet") -> RemoteFile:
    """Return a mock spec'd on ``RemoteFile`` whose ``uri`` attribute is *uri*."""
    remote_file = MagicMock(spec=RemoteFile)
    remote_file.uri = uri
    return remote_file
| 49 | + |
| 50 | + |
def _make_strategy():
    """Return a ``SourceS3AvailabilityStrategy`` built around a mock stream reader."""
    return SourceS3AvailabilityStrategy(MagicMock())
| 54 | + |
| 55 | + |
| 56 | +# --------------------------------------------------------------------------- |
| 57 | +# check_availability_and_parsability – delegation tests |
| 58 | +# --------------------------------------------------------------------------- |
| 59 | + |
| 60 | + |
@pytest.mark.parametrize(
    "skip_full_check_for_parquet,parser_cls",
    [
        pytest.param(False, ParquetParser, id="flag-off-parquet-parser"),
        pytest.param(True, None, id="flag-on-non-parquet-parser"),
        pytest.param(False, None, id="flag-off-non-parquet-parser"),
    ],
)
def test_delegates_to_super_when_skip_not_applicable(skip_full_check_for_parquet, parser_cls):
    """The parent implementation is used unless the flag is on AND the parser is parquet.

    The first base class of ``SourceS3AvailabilityStrategy`` has its
    ``check_availability_and_parsability`` patched; the test asserts the patch
    was called and that its return value is passed through.
    """
    # A spec'd mock passes isinstance(ParquetParser); a bare MagicMock does not.
    parser = MagicMock(spec=ParquetParser) if parser_cls is ParquetParser else MagicMock()
    parser.check_config.return_value = (True, None)

    stream = _make_stream(skip_full_check_for_parquet=skip_full_check_for_parquet, parser=parser)
    strategy = _make_strategy()
    parent_cls = SourceS3AvailabilityStrategy.__bases__[0]

    with patch.object(parent_cls, "check_availability_and_parsability", return_value=(True, None)) as super_mock:
        result = strategy.check_availability_and_parsability(stream, logger, None)

    assert super_mock.called
    assert result == (True, None)
| 90 | + |
| 91 | + |
| 92 | +# --------------------------------------------------------------------------- |
| 93 | +# Parquet skip-check path tests (flag=True) |
| 94 | +# --------------------------------------------------------------------------- |
| 95 | + |
| 96 | + |
def test_parquet_skips_full_parse_and_opens_file():
    """With the flag on and a parquet parser, the check lists a file and opens it.

    Asserts that ``_check_list_files`` is called once with the stream, that
    ``stream.stream_reader.open_file`` is called once with
    ``(file, "rb", None, logger)``, that the returned handle is closed once,
    and that the overall result is ``(True, None)``.
    """
    parquet_parser = MagicMock(spec=ParquetParser)
    parquet_parser.check_config.return_value = (True, None)
    parquet_parser.file_read_mode = "rb"

    remote_file = _make_remote_file()
    stream = _make_stream(skip_full_check_for_parquet=True, parser=parquet_parser, files=[remote_file])

    # The handle returned by open_file is tracked so its close() call can be verified.
    file_handle = MagicMock()
    open_file_mock = stream.stream_reader.open_file
    open_file_mock.return_value = file_handle

    strategy = _make_strategy()

    with patch.object(strategy, "_check_list_files", return_value=remote_file) as list_files_mock:
        outcome = strategy.check_availability_and_parsability(stream, logger, None)

    list_files_mock.assert_called_once_with(stream)
    open_file_mock.assert_called_once_with(remote_file, "rb", None, logger)
    file_handle.close.assert_called_once()
    assert outcome == (True, None)
| 117 | + |
| 118 | + |
def test_parquet_returns_false_when_config_check_fails():
    """A failing ``parser.check_config`` result is returned as-is on the skip path."""
    failing_parser = MagicMock(spec=ParquetParser)
    failing_parser.check_config.return_value = (False, "bad config")

    strategy = _make_strategy()
    stream = _make_stream(skip_full_check_for_parquet=True, parser=failing_parser)

    outcome = strategy.check_availability_and_parsability(stream, logger, None)

    assert outcome == (False, "bad config")
| 130 | + |
| 131 | + |
def test_parquet_returns_false_on_check_availability_error():
    """A ``CheckAvailabilityError`` from ``_check_list_files`` yields ``(False, <message>)``."""
    parquet_parser = MagicMock(spec=ParquetParser)
    parquet_parser.check_config.return_value = (True, None)

    strategy = _make_strategy()
    stream = _make_stream(skip_full_check_for_parquet=True, parser=parquet_parser)

    # Simulate the listing step failing with an EMPTY_STREAM availability error.
    listing_error = CheckAvailabilityError(FileBasedSourceError.EMPTY_STREAM, stream="test_stream")
    with patch.object(strategy, "_check_list_files", side_effect=listing_error):
        outcome = strategy.check_availability_and_parsability(stream, logger, None)

    available, msg = outcome
    assert available is False
    assert msg is not None
| 147 | + |
| 148 | + |
def test_parquet_reraises_airbyte_traced_exception():
    """An ``AirbyteTracedException`` raised by ``_check_list_files`` is not swallowed."""
    parquet_parser = MagicMock(spec=ParquetParser)
    parquet_parser.check_config.return_value = (True, None)

    strategy = _make_strategy()
    stream = _make_stream(skip_full_check_for_parquet=True, parser=parquet_parser)

    traced_error = AirbyteTracedException(message="traced")
    with patch.object(strategy, "_check_list_files", side_effect=traced_error), pytest.raises(AirbyteTracedException):
        strategy.check_availability_and_parsability(stream, logger, None)
| 161 | + |
| 162 | + |
def test_parquet_wraps_unexpected_exception_in_check_availability_error():
    """A ``RuntimeError`` from ``open_file`` surfaces as ``CheckAvailabilityError``.

    The original error must be preserved as the ``__cause__`` of the raised
    ``CheckAvailabilityError``.
    """
    parquet_parser = MagicMock(spec=ParquetParser)
    parquet_parser.check_config.return_value = (True, None)
    parquet_parser.file_read_mode = "rb"

    remote_file = _make_remote_file()
    stream = _make_stream(skip_full_check_for_parquet=True, parser=parquet_parser)
    stream.stream_reader.open_file.side_effect = RuntimeError("unexpected failure")

    strategy = _make_strategy()

    with patch.object(strategy, "_check_list_files", return_value=remote_file), pytest.raises(CheckAvailabilityError) as exc_info:
        strategy.check_availability_and_parsability(stream, logger, None)

    # Checked after the raises block so the assertion actually runs.
    assert isinstance(exc_info.value.__cause__, RuntimeError)
| 180 | + |
| 181 | + |
| 182 | +# --------------------------------------------------------------------------- |
| 183 | +# S3FileBasedStreamConfig – skip_full_check_for_parquet field |
| 184 | +# --------------------------------------------------------------------------- |
| 185 | + |
| 186 | + |
def test_s3_stream_config_skip_full_check_for_parquet_defaults_false():
    """Omitting ``skip_full_check_for_parquet`` leaves the field at ``False``."""
    stream_config = S3FileBasedStreamConfig(
        name="test",
        format={"filetype": "parquet"},
        globs=["**/*.parquet"],
        validation_policy="Emit Record",
    )
    assert stream_config.skip_full_check_for_parquet is False
| 191 | + |
| 192 | + |
def test_s3_stream_config_skip_full_check_for_parquet_set_true():
    """Passing ``skip_full_check_for_parquet=True`` is reflected on the config."""
    config_kwargs = {
        "name": "test",
        "format": {"filetype": "parquet"},
        "globs": ["**/*.parquet"],
        "validation_policy": "Emit Record",
        "skip_full_check_for_parquet": True,
    }
    stream_config = S3FileBasedStreamConfig(**config_kwargs)
    assert stream_config.skip_full_check_for_parquet is True
0 commit comments