diff --git a/docs/content/pypaimon/cli.md b/docs/content/pypaimon/cli.md index 6485ca58768e..d5ee2b7e2a9c 100644 --- a/docs/content/pypaimon/cli.md +++ b/docs/content/pypaimon/cli.md @@ -156,6 +156,67 @@ Output: 5 Eve 32 Hangzhou ``` +### Table Explain + +Show the scan plan of a query without reading any data: the target snapshot, the pushed-down predicate / projection / limit, the partition / bucket / file-stats pruning funnel, and split-level signals (raw-convertible ratio, deletion-vector ratio, level histogram, files-per-split and split-size distribution). Useful for previewing the pruning effect of a predicate before actually running the read. + +```shell +paimon table explain mydb.events +``` + +**Options:** + +- `--select, -s`: Project specific columns (comma-separated) +- `--where, -w`: Filter condition in SQL-like syntax (same operators as `table read`) +- `--limit, -l`: Row limit to push down +- `--verbose, -v`: List every split with its files +- `--format, -f`: Output format: `table` (default) or `json` + +**Examples:** + +```shell +# Whole-table scan plan +paimon table explain mydb.events + +# Push filter and projection through the planner +paimon table explain mydb.events --where "dt = '2026-05-16' AND id = 7" -s dt,id,val + +# List every split (and its files) instead of just the aggregates +paimon table explain mydb.events -w "dt = '2026-05-16'" --verbose + +# Machine-readable output for scripting (level_histogram keys are JSON strings) +paimon table explain mydb.events --format json +``` + +Output: +``` +== PyPaimon Scan Plan == +Table: mydb.events (PK, HASH_FIXED) +Snapshot: 5 (schema 0) +Predicate: (dt = '2026-05-16') AND (id = 7) +Projection: [dt, id, val] +Limit: + +Partition pruning: 20 -> 4 (pruned 16) +Bucket pruning: 4 -> 1 (pruned 3) +File skipping: 1 -> 1 (pruned 0) + +Splits: 1 + raw-convertible: 1 / 1 + with DV: 0 / 1 + all-above-L0: 0 / 1 + files/split: min=1 max=1 avg=1.00 + size/split: min=2.6 KiB p50=2.6 KiB p95=2.6 KiB max=2.6 KiB + 
+Files: 1 +Total size: 2.6 KiB +Estimated rows: 10 (merged: 10) +Level histogram: L0=1 +Deletion files: 0 +``` + +`explain` reads the manifest list and manifest files but never opens any data files, so it is dramatically cheaper than a real read on large tables. + ### Table Get Get and display table schema information in JSON format. The output format is the same as the schema JSON format used diff --git a/docs/content/pypaimon/python-api.md b/docs/content/pypaimon/python-api.md index e83e1fa5067b..2c18cf060aed 100644 --- a/docs/content/pypaimon/python-api.md +++ b/docs/content/pypaimon/python-api.md @@ -660,6 +660,27 @@ What the fields tell you: `ExplainResult` is a plain dataclass — alongside the human-readable `__str__` shown above, every field (`partition_pruning`, `bucket_pruning`, `file_skipping`, `split_count`, `splits_raw_convertible`, `level_histogram`, `splits`, ...) is addressable in Python for programmatic use. +#### CLI + +The same scan plan is available from the `paimon` command line — useful for previewing pruning effects of a predicate without writing any Python: + +```bash +# Whole-table scan +paimon -c paimon.yaml table explain default.events + +# Push down filter / projection / limit and list every split +paimon -c paimon.yaml table explain default.events \ + --where "dt = '2026-05-16' AND id = 7" \ + --select dt,id,val \ + --limit 100 \ + --verbose + +# Machine-readable output (level_histogram keys are JSON strings) +paimon -c paimon.yaml table explain default.events --format json +``` + +`--where` accepts the same SQL-like syntax as `paimon table read`. With `--format json`, the result is a structured dump of `ExplainResult` suitable for piping into `jq` or further processing. + ## Rollback Paimon supports rolling back a table to a previous snapshot or tag. 
def cmd_table_explain(args):
    """
    Execute the 'table explain' command.

    Prints the scan plan (snapshot, pushed-down predicate / projection /
    limit, partition / bucket / file-stats pruning funnel and split-
    level signals) without reading any data files.

    Args:
        args: Parsed command line arguments. ``config`` and ``table`` are
            read directly; ``select``, ``where``, ``limit``, ``verbose``
            and ``format`` are optional and read with ``getattr`` defaults.

    Exits with status 1 (after printing ``Error: ...`` to stderr) when the
    table identifier is malformed, the table cannot be loaded, a selected
    column does not exist, the WHERE clause is rejected by the parser, or
    the explain call itself fails.
    """
    from pypaimon.cli.cli import load_catalog_config, create_catalog

    config = load_catalog_config(args.config)
    catalog = create_catalog(config)

    table_identifier = args.table
    parts = table_identifier.split('.')
    if len(parts) != 2:
        print(f"Error: Invalid table identifier '{table_identifier}'. "
              f"Expected format: 'database.table'", file=sys.stderr)
        sys.exit(1)
    database_name, table_name = parts

    try:
        table = catalog.get_table(f"{database_name}.{table_name}")
    except Exception as e:
        print(f"Error: Failed to get table '{table_identifier}': {e}", file=sys.stderr)
        sys.exit(1)

    read_builder = table.new_read_builder()

    select_columns = getattr(args, 'select', None)
    if select_columns:
        # Only build the lookup set when a projection was actually requested.
        available_fields = {field.name for field in table.table_schema.fields}
        user_columns = [col.strip() for col in select_columns.split(',')]
        invalid_columns = [col for col in user_columns if col not in available_fields]
        if invalid_columns:
            print(f"Error: Column(s) {invalid_columns} do not exist in table '{table_identifier}'.",
                  file=sys.stderr)
            sys.exit(1)
        read_builder = read_builder.with_projection(user_columns)

    where_clause = getattr(args, 'where', None)
    if where_clause:
        from pypaimon.cli.where_parser import parse_where_clause
        try:
            predicate = parse_where_clause(where_clause, table.table_schema.fields)
            if predicate:
                read_builder = read_builder.with_filter(predicate)
        except ValueError as e:
            print(f"Error: Invalid WHERE clause: {e}", file=sys.stderr)
            sys.exit(1)

    # Unlike `table read`, explain always pushes the limit down — the
    # whole point of explain is to show what the planner will see,
    # including limit pushdown.
    limit = getattr(args, 'limit', None)
    if limit is not None:
        read_builder = read_builder.with_limit(limit)

    verbose = getattr(args, 'verbose', False)
    try:
        result = read_builder.explain(verbose=verbose)
    except Exception as e:
        print(f"Error: Failed to explain table '{table_identifier}': {e}", file=sys.stderr)
        sys.exit(1)

    output_format = getattr(args, 'format', 'table')
    if output_format == 'json':
        import json
        print(json.dumps(_explain_result_to_json_dict(result), indent=2, ensure_ascii=False))
    else:
        print(str(result))


def _stringify_histogram_keys(histogram):
    """Return a copy of *histogram* with every key converted to ``str``.

    A missing histogram (``None``) is normalised to an empty dict so that
    callers never have to special-case it.
    """
    if not histogram:
        return {}
    return {str(level): count for level, count in histogram.items()}


def _explain_result_to_json_dict(result):
    """Serialize an ``ExplainResult`` to a JSON-friendly dict.

    ``level_histogram`` has ``int`` keys, both at the top level and
    inside each split. ``json.dumps`` would coerce them to strings
    silently; we do it up front so the output is explicit and stable.

    Args:
        result: A dataclass instance; it is converted with
            ``dataclasses.asdict`` so the input object is never mutated.

    Returns:
        A dict whose ``level_histogram`` (and the ``level_histogram`` of
        every entry in ``splits``, when ``splits`` is not ``None``) has
        string keys. A histogram that is absent or ``None`` becomes ``{}``.
    """
    payload = asdict(result)
    # ``dict.get(key, {})`` only covers a *missing* key; a present-but-None
    # histogram would blow up on ``.items()``, so normalise via the helper.
    payload['level_histogram'] = _stringify_histogram_keys(payload.get('level_histogram'))
    splits = payload.get('splits')
    if splits is not None:
        for split in splits:
            split['level_histogram'] = _stringify_histogram_keys(split.get('level_histogram'))
    return payload
@@ -827,7 +921,50 @@ def add_table_subcommands(table_parser): help='Output format: table (default) or json' ) read_parser.set_defaults(func=cmd_table_read) - + + # table explain command + explain_parser = table_subparsers.add_parser( + 'explain', + help='Show the scan plan (snapshot, pushdown, pruning funnel, split shape) ' + 'without reading data' + ) + explain_parser.add_argument( + 'table', + help='Table identifier in format: database.table' + ) + explain_parser.add_argument( + '--select', '-s', + type=str, + default=None, + help='Project specific columns (comma-separated, e.g., "id,name,age")' + ) + explain_parser.add_argument( + '--where', '-w', + type=str, + default=None, + help='Filter condition in SQL-like syntax ' + '(e.g., "age > 18", "dt = \'2026-01-01\' AND id IN (1,2,3)")' + ) + explain_parser.add_argument( + '--limit', '-l', + type=int, + default=None, + help='Row limit to push down' + ) + explain_parser.add_argument( + '--verbose', '-v', + action='store_true', + help='List every split with its files' + ) + explain_parser.add_argument( + '--format', '-f', + type=str, + choices=['table', 'json'], + default='table', + help='Output format: table (default) or json' + ) + explain_parser.set_defaults(func=cmd_table_explain) + # table get command get_parser = table_subparsers.add_parser('get', help='Get table schema information') get_parser.add_argument( diff --git a/paimon-python/pypaimon/read/read_builder.py b/paimon-python/pypaimon/read/read_builder.py index 51233856f259..2ee9a3904002 100644 --- a/paimon-python/pypaimon/read/read_builder.py +++ b/paimon-python/pypaimon/read/read_builder.py @@ -101,9 +101,6 @@ def _nested_name_paths(self) -> Optional[List[List[str]]]: def new_predicate_builder(self) -> PredicateBuilder: return PredicateBuilder(self.read_type()) - # TODO: surface this through pypaimon's CLI (alongside cli_sql / - # cli_table) so users can run `pypaimon explain ...` against a table - # without writing any Python. 
def explain(self, verbose: bool = False) -> ExplainResult: """Produce a structured scan plan for this builder. diff --git a/paimon-python/pypaimon/tests/cli_table_test.py b/paimon-python/pypaimon/tests/cli_table_test.py index b0e314644b7d..b88eae8b6be8 100644 --- a/paimon-python/pypaimon/tests/cli_table_test.py +++ b/paimon-python/pypaimon/tests/cli_table_test.py @@ -1439,6 +1439,143 @@ def test_cli_table_drop_partition_multiple(self): self.assertEqual(len(result.elements), 1) self.assertEqual(result.elements[0].spec['dt'], '2024-01-02') + def test_cli_table_explain_basic(self): + """Basic `table explain` prints the render anchors and no data.""" + with patch('sys.argv', + ['paimon', '-c', self.config_file, + 'table', 'explain', 'test_db.users']): + with patch('sys.stdout', new_callable=StringIO) as mock_stdout: + try: + main() + except SystemExit: + pass + + output = mock_stdout.getvalue() + + # render_explain anchors + self.assertIn('== PyPaimon Scan Plan ==', output) + self.assertIn('Table:', output) + self.assertIn('Snapshot:', output) + self.assertIn('Splits:', output) + self.assertIn('Files:', output) + # No data rows: row data ('Alice'/'Bob') should not appear + self.assertNotIn('Alice', output) + self.assertNotIn('Bob', output) + + def test_cli_table_explain_with_select_and_limit(self): + """`--select` and `--limit` are reflected in the Projection / Limit lines.""" + with patch('sys.argv', + ['paimon', '-c', self.config_file, + 'table', 'explain', 'test_db.users', + '--select', 'id,name', + '--limit', '3']): + with patch('sys.stdout', new_callable=StringIO) as mock_stdout: + try: + main() + except SystemExit: + pass + + output = mock_stdout.getvalue() + + self.assertIn('Projection:', output) + self.assertIn('[id, name]', output) + self.assertIn('Limit:', output) + self.assertIn('3', output) + + def test_cli_table_explain_verbose_lists_splits(self): + """`--verbose` triggers a Splits[] section listing each split.""" + with patch('sys.argv', + ['paimon', 
'-c', self.config_file, + 'table', 'explain', 'test_db.users', + '--verbose']): + with patch('sys.stdout', new_callable=StringIO) as mock_stdout: + try: + main() + except SystemExit: + pass + + output = mock_stdout.getvalue() + self.assertIn('Splits[]', output) + # The per-split bullet uses "[0] partition=" as a prefix + self.assertIn('[0] partition=', output) + + def test_cli_table_explain_where_partition_pruning(self): + """A partition predicate fires the partition-pruning funnel.""" + self._create_partitioned_table() + + with patch('sys.argv', + ['paimon', '-c', self.config_file, + 'table', 'explain', 'test_db.partitioned', + '--where', "dt = '2024-01-01' AND region = 'us'"]): + with patch('sys.stdout', new_callable=StringIO) as mock_stdout: + try: + main() + except SystemExit: + pass + + output = mock_stdout.getvalue() + + # Predicate is rendered + self.assertIn('Predicate:', output) + self.assertIn('dt', output) + # Partition pruning funnel shows before -> after (pruned N>0) + self.assertIn('Partition pruning:', output) + self.assertRegex(output, r'Partition pruning:\s+\d+ -> \d+\s+\(pruned [1-9]\d*\)') + + def test_cli_table_explain_format_json(self): + """`--format json` is valid JSON with stringified level_histogram keys.""" + import json + with patch('sys.argv', + ['paimon', '-c', self.config_file, + 'table', 'explain', 'test_db.users', + '--format', 'json']): + with patch('sys.stdout', new_callable=StringIO) as mock_stdout: + try: + main() + except SystemExit: + pass + + output = mock_stdout.getvalue() + payload = json.loads(output) + + # Top-level identity / snapshot / split aggregates + self.assertEqual(payload['table_identifier'], 'test_db.users') + self.assertIn('snapshot_id', payload) + self.assertIn('split_count', payload) + self.assertIn('level_histogram', payload) + self.assertIn('partition_pruning', payload) + + # level_histogram keys must be strings (json-safe). When the + # table has data, at least one level entry exists. 
+ for key in payload['level_histogram'].keys(): + self.assertIsInstance(key, str) + + # Non-verbose => splits is null + self.assertIsNone(payload['splits']) + + def test_cli_table_explain_invalid_table(self): + """Unknown table identifier produces a clean error on stderr.""" + with patch('sys.argv', + ['paimon', '-c', self.config_file, + 'table', 'explain', 'test_db.does_not_exist']): + with patch('sys.stderr', new_callable=StringIO) as mock_stderr: + with self.assertRaises(SystemExit) as ctx: + main() + self.assertEqual(ctx.exception.code, 1) + self.assertIn("Failed to get table", mock_stderr.getvalue()) + + def test_cli_table_explain_invalid_where(self): + """Malformed WHERE produces a clean error on stderr.""" + with patch('sys.argv', + ['paimon', '-c', self.config_file, + 'table', 'explain', 'test_db.users', + '--where', 'this is not a valid clause']): + with patch('sys.stderr', new_callable=StringIO) as mock_stderr: + with self.assertRaises(SystemExit) as ctx: + main() + self.assertEqual(ctx.exception.code, 1) + self.assertIn("Invalid WHERE clause", mock_stderr.getvalue()) + if __name__ == '__main__': unittest.main()