From 4e3815f46e05f721fb11ed5b3fbbd895f2f476a9 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Wed, 11 Mar 2026 20:13:36 -0400 Subject: [PATCH 01/36] feat: add URL scanning support to PlanResolver trait Add scan_url method to PlanResolver so resolvers participate in data source URL handling. DataFusionResolver moves from a privileged terminal resolver into a regular resolver in the pipeline chain. Key changes: - ParsedUrl struct for structured URL representation passed to scanners - ResolverCapabilities proto + MergedCapabilities for URL support negotiation - DataBaseUrlSetting enum for explicit base URL API (Default/Disabled/Custom) - resolve_url() shared function for plan-time and eval-time URL resolution - GetCapabilities RPC for remote capability propagation (gRPC + WASM) - Python bridge: scan_url, scan_url_proto, capabilities on PlanResolver - data_base_url parameter threaded through pre_transform_* and ChartState APIs Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 1 + vegafusion-core/Cargo.toml | 3 + vegafusion-core/src/chart_state.rs | 21 +- vegafusion-core/src/planning/plan.rs | 16 + vegafusion-core/src/proto/pretransform.proto | 11 + vegafusion-core/src/proto/services.proto | 9 + vegafusion-core/src/proto/tasks.proto | 10 + vegafusion-core/src/runtime/mod.rs | 5 +- vegafusion-core/src/runtime/plan_resolver.rs | 453 ++++++++++++++++++ vegafusion-core/src/runtime/runtime.rs | 32 +- vegafusion-core/src/spec/chart.rs | 3 +- vegafusion-core/src/spec/data.rs | 9 +- vegafusion-core/src/spec/visitors.rs | 50 +- vegafusion-python/src/chart_state.rs | 3 +- vegafusion-python/src/lib.rs | 66 +++ vegafusion-python/src/plan_resolver.rs | 149 +++++- vegafusion-python/tests/test_plan_resolver.py | 174 +++++++ vegafusion-python/vegafusion/plan_resolver.py | 100 ++++ vegafusion-runtime/benches/spec_benchmarks.rs | 14 +- .../src/data/datafusion_resolver.rs | 70 ++- vegafusion-runtime/src/data/pipeline.rs | 74 ++- vegafusion-runtime/src/data/tasks.rs | 183 ++----- 
.../src/task_graph/grpc_runtime.rs | 21 +- vegafusion-runtime/src/task_graph/runtime.rs | 4 + vegafusion-runtime/tests/test_chart_state.rs | 1 + .../test_destringify_selection_datasets.rs | 1 + .../tests/test_image_comparison.rs | 6 +- .../tests/test_plan_resolver.rs | 161 ++++++- vegafusion-runtime/tests/test_planning.rs | 6 +- .../tests/test_pre_transform_extract.rs | 1 + .../test_pre_transform_keep_variables.rs | 3 + .../tests/test_pre_transform_values.rs | 9 + .../tests/test_stringify_datetimes.rs | 5 + .../tests/test_task_graph_runtime.rs | 10 +- vegafusion-server/src/main.rs | 27 +- .../tests/test_task_graph_runtime.rs | 9 +- vegafusion-wasm/src/lib.rs | 104 +++- 37 files changed, 1603 insertions(+), 221 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 388c39500..393e7c964 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4888,6 +4888,7 @@ dependencies = [ "tonic-build", "tonic-prost", "tonic-prost-build", + "url", "vegafusion-common", ] diff --git a/vegafusion-core/Cargo.toml b/vegafusion-core/Cargo.toml index 0bd0efc52..d33d410b5 100644 --- a/vegafusion-core/Cargo.toml +++ b/vegafusion-core/Cargo.toml @@ -73,6 +73,9 @@ workspace = true [dependencies.log] workspace = true +[dependencies.url] +version = "2" + [dependencies.serde] workspace = true diff --git a/vegafusion-core/src/chart_state.rs b/vegafusion-core/src/chart_state.rs index 20398559e..6c49817fc 100644 --- a/vegafusion-core/src/chart_state.rs +++ b/vegafusion-core/src/chart_state.rs @@ -2,7 +2,7 @@ use crate::{ data::dataset::VegaFusionDataset, planning::{ apply_pre_transform::apply_pre_transform_datasets, - plan::SpecPlan, + plan::{PlannerConfig, SpecPlan}, stitch::CommPlan, watch::{ExportUpdate, ExportUpdateJSON, ExportUpdateNamespace}, }, @@ -10,7 +10,7 @@ use crate::{ pretransform::PreTransformSpecWarning, tasks::{NodeValueIndex, TaskGraph, TzConfig, Variable}, }, - runtime::VegaFusionRuntimeTrait, + runtime::{DataBaseUrlSetting, VegaFusionRuntimeTrait}, spec::chart::ChartSpec, 
task_graph::{graph::ScopedVariable, task_value::TaskValue}, }; @@ -28,6 +28,7 @@ use vegafusion_common::{ pub struct ChartStateOpts { pub tz_config: TzConfig, pub row_limit: Option, + pub data_base_url: DataBaseUrlSetting, } impl Default for ChartStateOpts { @@ -38,6 +39,7 @@ impl Default for ChartStateOpts { default_input_tz: None, }, row_limit: None, + data_base_url: DataBaseUrlSetting::Default, } } } @@ -66,7 +68,18 @@ impl ChartState { .map(|(k, ds)| (k.clone(), ds.fingerprint())) .collect::>(); - let plan = SpecPlan::try_new(&spec, &Default::default())?; + let resolved_base = crate::runtime::resolve_data_base_url( + opts.data_base_url.clone(), + PlannerConfig::default().data_base_url, + )?; + let plan = SpecPlan::try_new( + &spec, + &PlannerConfig { + capabilities: runtime.planner_capabilities(), + data_base_url: resolved_base.clone(), + ..Default::default() + }, + )?; let task_scope = plan .server_spec @@ -74,7 +87,7 @@ impl ChartState { .with_context(|| "Failed to create task scope for server spec")?; let tasks = plan .server_spec - .to_tasks(&opts.tz_config, &dataset_fingerprints) + .to_tasks(&opts.tz_config, &dataset_fingerprints, resolved_base) .unwrap(); let task_graph = TaskGraph::new(tasks, &task_scope).unwrap(); let task_graph_mapping = task_graph.build_mapping(); diff --git a/vegafusion-core/src/planning/plan.rs b/vegafusion-core/src/planning/plan.rs index a9a2b929c..ebb72d060 100644 --- a/vegafusion-core/src/planning/plan.rs +++ b/vegafusion-core/src/planning/plan.rs @@ -12,6 +12,8 @@ use crate::planning::unsupported_data_warning::add_unsupported_data_warnings; use crate::proto::gen::pretransform::{ pre_transform_spec_warning::WarningType, PlannerWarning, PreTransformSpecWarning, }; +use crate::proto::gen::tasks::ResolverCapabilities; +use crate::runtime::MergedCapabilities; use crate::spec::chart::ChartSpec; use crate::task_graph::graph::ScopedVariable; use serde::{Deserialize, Serialize}; @@ -95,8 +97,18 @@ pub struct PlannerConfig { pub 
strip_description_encoding: bool, pub strip_aria_encoding: bool, pub strip_tooltip_encoding: bool, + /// Merged URL capabilities from all resolvers. Used by DataSpec::supported() + /// to decide if a URL-backed dataset is plannable. + pub capabilities: MergedCapabilities, + /// Base URL for resolving relative data URLs. None means relative paths are an error. + /// Some(url) means relative paths are resolved against this URL. + pub data_base_url: Option, } +/// Default CDN base URL for vega-datasets +pub const VEGA_DATASETS_CDN_BASE: &str = + "https://raw.githubusercontent.com/vega/vega-datasets/v2.3.0/"; + impl Default for PlannerConfig { fn default() -> Self { Self { @@ -114,6 +126,10 @@ impl Default for PlannerConfig { strip_description_encoding: true, strip_aria_encoding: true, strip_tooltip_encoding: false, + capabilities: MergedCapabilities::from_resolver_capabilities(&[ + ResolverCapabilities::datafusion_defaults(), + ]), + data_base_url: Some(VEGA_DATASETS_CDN_BASE.to_string()), } } } diff --git a/vegafusion-core/src/proto/pretransform.proto b/vegafusion-core/src/proto/pretransform.proto index 9bdb87dd9..0718773a8 100644 --- a/vegafusion-core/src/proto/pretransform.proto +++ b/vegafusion-core/src/proto/pretransform.proto @@ -2,6 +2,14 @@ syntax = "proto3"; package pretransform; import "tasks.proto"; +import "google/protobuf/empty.proto"; + +message DataBaseUrlSettingProto { + oneof kind { + string custom = 1; + google.protobuf.Empty disabled = 2; + } +} /// Pre transform spec messages message PreTransformSpecOpts { @@ -10,6 +18,7 @@ message PreTransformSpecOpts { repeated PreTransformVariable keep_variables = 3; string local_tz = 4; optional string default_input_tz = 5; + optional DataBaseUrlSettingProto data_base_url = 6; } message PreTransformSpecRequest { @@ -52,6 +61,7 @@ message PreTransformValuesOpts { optional uint32 row_limit = 1; string local_tz = 2; optional string default_input_tz = 3; + optional DataBaseUrlSettingProto data_base_url = 4; } 
message PreTransformValuesRequest { @@ -96,6 +106,7 @@ message PreTransformExtractOpts { bool preserve_interactivity = 3; int32 extract_threshold = 4; repeated PreTransformVariable keep_variables = 5; + optional DataBaseUrlSettingProto data_base_url = 6; } message PreTransformExtractWarning { diff --git a/vegafusion-core/src/proto/services.proto b/vegafusion-core/src/proto/services.proto index 20599ba5e..f313808c6 100644 --- a/vegafusion-core/src/proto/services.proto +++ b/vegafusion-core/src/proto/services.proto @@ -10,11 +10,19 @@ service VegaFusionRuntime { rpc PreTransformSpec(pretransform.PreTransformSpecRequest) returns (PreTransformSpecResult) {} rpc PreTransformValues(pretransform.PreTransformValuesRequest) returns (PreTransformValuesResult) {} rpc PreTransformExtract(pretransform.PreTransformExtractRequest) returns (PreTransformExtractResult) {} + rpc GetCapabilities(GetCapabilitiesRequest) returns (GetCapabilitiesResult) {} +} + +message GetCapabilitiesRequest {} + +message GetCapabilitiesResult { + tasks.ResolverCapabilities capabilities = 1; } message QueryRequest { oneof request { tasks.TaskGraphValueRequest task_graph_values = 1; + GetCapabilitiesRequest get_capabilities = 2; } } @@ -22,6 +30,7 @@ message QueryResult { oneof response { errors.Error error = 1; tasks.TaskGraphValueResponse task_graph_values = 2; + GetCapabilitiesResult get_capabilities = 3; } } diff --git a/vegafusion-core/src/proto/tasks.proto b/vegafusion-core/src/proto/tasks.proto index f44ec540e..215f1cede 100644 --- a/vegafusion-core/src/proto/tasks.proto +++ b/vegafusion-core/src/proto/tasks.proto @@ -79,6 +79,10 @@ message DataUrlTask { int32 batch_size = 3; ScanUrlFormat format_type = 4; transforms.TransformPipeline pipeline = 5; + // Base URL for resolving relative URLs in Url::Expr tasks at eval time. + // Written by MakeTasksVisitor from PlannerConfig.data_base_url. + // Absent = no base URL, present = that base URL. 
+ optional string data_base_url = 6; } // ## Inline values task @@ -183,4 +187,10 @@ message InlineDataset { InlineDatasetTable table = 1; InlineDatasetPlan plan = 2; } +} + +message ResolverCapabilities { + repeated string supported_schemes = 1; + repeated string supported_format_types = 2; + repeated string supported_extensions = 3; } \ No newline at end of file diff --git a/vegafusion-core/src/runtime/mod.rs b/vegafusion-core/src/runtime/mod.rs index 5e2831d2f..2da146a12 100644 --- a/vegafusion-core/src/runtime/mod.rs +++ b/vegafusion-core/src/runtime/mod.rs @@ -1,5 +1,8 @@ mod plan_resolver; mod runtime; -pub use plan_resolver::{PlanResolver, ResolutionResult}; +pub use plan_resolver::{ + has_url_scheme, is_absolute_path, normalize_base_url, path_to_file_url, resolve_data_base_url, + resolve_url, DataBaseUrlSetting, MergedCapabilities, ParsedUrl, PlanResolver, ResolutionResult, +}; pub use runtime::{PreTransformExtractTable, VegaFusionRuntimeTrait}; diff --git a/vegafusion-core/src/runtime/plan_resolver.rs b/vegafusion-core/src/runtime/plan_resolver.rs index 9798b7155..22c43f98c 100644 --- a/vegafusion-core/src/runtime/plan_resolver.rs +++ b/vegafusion-core/src/runtime/plan_resolver.rs @@ -1,4 +1,7 @@ +use crate::proto::gen::pretransform::DataBaseUrlSettingProto; +use crate::proto::gen::tasks::ResolverCapabilities; use async_trait::async_trait; +use std::collections::HashSet; use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::Result; @@ -11,8 +14,458 @@ pub enum ResolutionResult { Plan(LogicalPlan), } +/// Explicit representation of the data_base_url setting at public API boundaries. +/// This avoids overloading Option with an empty-string sentinel. 
+#[derive(Clone, Debug, Default)] +pub enum DataBaseUrlSetting { + /// Use the default CDN base URL (vega-datasets) + #[default] + Default, + /// Disable base URL — relative paths produce an error + Disabled, + /// Use a custom base URL (scheme URL or absolute path) + Custom(String), +} + +impl DataBaseUrlSetting { + /// Convert from the proto representation. + /// Field absent → Default (use CDN), custom → Custom, disabled → Disabled. + pub fn from_proto(proto: Option) -> Self { + use crate::proto::gen::pretransform::data_base_url_setting_proto::Kind; + match proto { + None => DataBaseUrlSetting::Default, + Some(p) => match p.kind { + Some(Kind::Custom(s)) => DataBaseUrlSetting::Custom(s), + Some(Kind::Disabled(_)) => DataBaseUrlSetting::Disabled, + None => DataBaseUrlSetting::Default, + }, + } + } +} + +/// Parsed URL representation passed to resolvers during the scan phase. +/// All fields are populated from the fully-resolved URL (after base URL +/// resolution and hash-stripping). Resolvers pattern-match on these fields +/// rather than doing their own URL string parsing. +pub struct ParsedUrl { + /// Original URL string (after base URL resolution and hash-stripping) + pub url: String, + /// URL scheme (http, https, file, s3, spark, etc.) — always present + pub scheme: String, + /// Host/authority component (e.g. "example.com", S3 bucket name) + pub host: Option, + /// Path component + pub path: String, + /// Query parameters in URL order, preserving duplicates + pub query_params: Vec<(String, String)>, + /// File extension extracted from path (e.g. "csv", "parquet") + pub extension: Option, + /// Explicit format type from Vega spec (overrides extension) + pub format_type: Option, +} + +/// Merged capabilities from all resolvers, with HashSet fields for O(1) lookup. +/// Built by unioning the ResolverCapabilities from each resolver in the pipeline. 
+#[derive(Clone, Debug, Default)] +pub struct MergedCapabilities { + pub supported_schemes: HashSet, + pub supported_format_types: HashSet, + pub supported_extensions: HashSet, +} + +impl MergedCapabilities { + pub fn from_resolver_capabilities(caps: &[ResolverCapabilities]) -> Self { + let mut merged = Self::default(); + for cap in caps { + merged + .supported_schemes + .extend(cap.supported_schemes.iter().cloned()); + merged + .supported_format_types + .extend(cap.supported_format_types.iter().cloned()); + merged + .supported_extensions + .extend(cap.supported_extensions.iter().cloned()); + } + merged + } + + /// Check if a URL with the given scheme and format info is supported by any resolver. + pub fn url_supported( + &self, + scheme: &str, + format_type: Option<&str>, + extension: Option<&str>, + ) -> bool { + let scheme_ok = self.supported_schemes.contains(scheme); + let format_ok = match (format_type, extension) { + (Some(fmt), _) => self.supported_format_types.contains(fmt), + (None, Some(ext)) => self.supported_extensions.contains(ext), + (None, None) => true, + }; + scheme_ok && format_ok + } +} + +impl ResolverCapabilities { + /// Built-in DataFusion URL capabilities: file, http, https, s3 schemes + /// with csv, tsv, json, arrow, parquet formats. + pub fn datafusion_defaults() -> Self { + Self { + supported_schemes: vec!["http", "https", "s3", "file"] + .into_iter() + .map(String::from) + .collect(), + supported_format_types: vec!["csv", "tsv", "json", "arrow", "parquet"] + .into_iter() + .map(String::from) + .collect(), + supported_extensions: vec!["csv", "tsv", "json", "arrow", "feather", "parquet"] + .into_iter() + .map(String::from) + .collect(), + } + } +} + #[async_trait] pub trait PlanResolver: Send + Sync + 'static { fn name(&self) -> &str; + + /// Declare what URL patterns this resolver supports at planning time. + /// Returns empty capabilities by default (no additional URL support). 
+ fn capabilities(&self) -> ResolverCapabilities { + ResolverCapabilities::default() + } + + /// Given a parsed URL, optionally return a LogicalPlan to handle it. + /// Return Ok(None) to pass the URL to the next resolver in the chain. + async fn scan_url(&self, _parsed_url: &ParsedUrl) -> Result> { + Ok(None) + } + async fn resolve_plan(&self, plan: LogicalPlan) -> Result; } + +/// Map a DataBaseUrlSetting (from public API) to the two-state Option +/// used by PlannerConfig. Custom base URLs are normalized (bare absolute paths +/// become file:// URLs). +pub fn resolve_data_base_url( + api_value: DataBaseUrlSetting, + default: Option, +) -> Result> { + match api_value { + DataBaseUrlSetting::Default => Ok(default), + DataBaseUrlSetting::Disabled => Ok(None), + DataBaseUrlSetting::Custom(s) => Ok(Some(normalize_base_url(s)?)), + } +} + +/// Returns true if the string is already a URL (has a scheme) or is +/// protocol-relative (starts with //). +pub fn has_url_scheme(s: &str) -> bool { + s.contains("://") || s.starts_with("//") +} + +/// Returns true if `path` is an absolute filesystem path. +/// Unix: starts with `/`. Windows: starts with a drive letter `[A-Za-z]:\` or `[A-Za-z]:/`. +pub fn is_absolute_path(path: &str) -> bool { + let bytes = path.as_bytes(); + if bytes.first() == Some(&b'/') { + return true; + } + bytes.len() >= 3 + && bytes[0].is_ascii_alphabetic() + && bytes[1] == b':' + && (bytes[2] == b'\\' || bytes[2] == b'/') +} + +/// Normalize a base URL so it always has a scheme. +/// Bare absolute paths become file:// URLs; protocol-relative and scheme +/// URLs are preserved as-is; everything else is rejected. 
+pub fn normalize_base_url(base: String) -> Result { + if has_url_scheme(&base) { + Ok(base) + } else if is_absolute_path(&base) { + path_to_file_url(&base) + } else { + Err(vegafusion_common::error::VegaFusionError::specification( + format!("data_base_url must be absolute (scheme URL or absolute path), got: {base}"), + )) + } +} + +/// Convert an absolute local path to a file:// URL. +/// Uses url::Url::from_file_path() for correct percent-encoding. +pub fn path_to_file_url(path: &str) -> Result { + let normalized = path.replace('\\', "/"); + let p = std::path::Path::new(&normalized); + url::Url::from_file_path(p) + .map(|u| u.to_string()) + .map_err(|_| { + vegafusion_common::error::VegaFusionError::specification(format!( + "Cannot convert path to file URL: {}", + p.display() + )) + }) +} + +/// Resolve a spec URL against a base URL. This is the shared function used by +/// both plan-time resolution (MakeTasksVisitor for Url::String) and eval-time +/// resolution (DataUrlTask::eval for Url::Expr). 
+pub fn resolve_url(url: &str, data_base_url: &Option) -> Result { + // Future: This is the natural place for a URL permissions layer + if has_url_scheme(url) { + Ok(url.to_string()) + } else if is_absolute_path(url) { + path_to_file_url(url) + } else { + // Relative path — resolve against base URL + match data_base_url { + Some(base) => { + let separator = if base.ends_with('/') { "" } else { "/" }; + Ok(format!("{base}{separator}{url}")) + } + None => Err(vegafusion_common::error::VegaFusionError::specification( + format!("Relative URL with no base URL configured: {url}"), + )), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // ── has_url_scheme ── + + #[test] + fn test_has_url_scheme_https() { + assert!(has_url_scheme("https://example.com/data.csv")); + } + + #[test] + fn test_has_url_scheme_custom() { + assert!(has_url_scheme("spark://org.users")); + } + + #[test] + fn test_has_url_scheme_protocol_relative() { + assert!(has_url_scheme("//example.com/data.csv")); + } + + #[test] + fn test_has_url_scheme_absolute_path() { + assert!(!has_url_scheme("/tmp/data.csv")); + } + + #[test] + fn test_has_url_scheme_relative() { + assert!(!has_url_scheme("data/cars.json")); + } + + // ── is_absolute_path ── + + #[test] + fn test_is_absolute_path_unix() { + assert!(is_absolute_path("/tmp/data.csv")); + } + + #[test] + fn test_is_absolute_path_windows_backslash() { + assert!(is_absolute_path("C:\\tmp\\foo.csv")); + } + + #[test] + fn test_is_absolute_path_windows_forward() { + assert!(is_absolute_path("C:/tmp/foo.csv")); + } + + #[test] + fn test_is_absolute_path_rejects_ambiguous_colon() { + assert!(!is_absolute_path("a:b")); + } + + #[test] + fn test_is_absolute_path_rejects_digit_colon() { + assert!(!is_absolute_path("1:/foo")); + } + + #[test] + fn test_is_absolute_path_rejects_relative() { + assert!(!is_absolute_path("relative/path")); + } + + // ── path_to_file_url ── + + #[test] + fn test_path_to_file_url_unix() { + let result = 
path_to_file_url("/tmp/data.csv").unwrap(); + assert_eq!(result, "file:///tmp/data.csv"); + } + + #[test] + fn test_path_to_file_url_spaces() { + let result = path_to_file_url("/tmp/my data/file.csv").unwrap(); + assert_eq!(result, "file:///tmp/my%20data/file.csv"); + } + + #[test] + fn test_path_to_file_url_hash() { + let result = path_to_file_url("/tmp/file#1.csv").unwrap(); + assert!( + result.contains("%23"), + "Hash should be percent-encoded: {result}" + ); + } + + // ── normalize_base_url ── + + #[test] + fn test_normalize_base_url_scheme() { + let result = normalize_base_url("https://example.com/data/".to_string()).unwrap(); + assert_eq!(result, "https://example.com/data/"); + } + + #[test] + fn test_normalize_base_url_protocol_relative() { + let result = normalize_base_url("//example.com/data/".to_string()).unwrap(); + assert_eq!(result, "//example.com/data/"); + } + + #[test] + fn test_normalize_base_url_absolute_path() { + let result = normalize_base_url("/home/user/data".to_string()).unwrap(); + assert_eq!(result, "file:///home/user/data"); + } + + #[test] + fn test_normalize_base_url_rejects_relative() { + let result = normalize_base_url("relative/path".to_string()); + assert!(result.is_err()); + } + + #[test] + fn test_normalize_base_url_rejects_ambiguous_colon() { + let result = normalize_base_url("a:b".to_string()); + assert!(result.is_err()); + } + + // ── resolve_url ── + + #[test] + fn test_resolve_url_scheme_passthrough() { + let base = Some("https://cdn.example.com/".to_string()); + let result = resolve_url("https://other.com/data.csv", &base).unwrap(); + assert_eq!(result, "https://other.com/data.csv"); + } + + #[test] + fn test_resolve_url_absolute_path_to_file() { + let base = Some("https://cdn.example.com/".to_string()); + let result = resolve_url("/tmp/data.csv", &base).unwrap(); + assert_eq!(result, "file:///tmp/data.csv"); + } + + #[test] + fn test_resolve_url_relative_with_base() { + let base = 
Some("https://raw.githubusercontent.com/vega/vega-datasets/v2.3.0/".to_string()); + let result = resolve_url("data/cars.json", &base).unwrap(); + assert_eq!( + result, + "https://raw.githubusercontent.com/vega/vega-datasets/v2.3.0/data/cars.json" + ); + } + + #[test] + fn test_resolve_url_relative_without_trailing_slash() { + let base = Some("https://example.com/data".to_string()); + let result = resolve_url("cars.json", &base).unwrap(); + assert_eq!(result, "https://example.com/data/cars.json"); + } + + #[test] + fn test_resolve_url_relative_no_base_errors() { + let result = resolve_url("data/cars.json", &None); + assert!(result.is_err()); + } + + // ── resolve_data_base_url ── + + #[test] + fn test_resolve_data_base_url_default() { + let default = Some("https://cdn.example.com/".to_string()); + let result = resolve_data_base_url(DataBaseUrlSetting::Default, default.clone()).unwrap(); + assert_eq!(result, default); + } + + #[test] + fn test_resolve_data_base_url_disabled() { + let result = resolve_data_base_url( + DataBaseUrlSetting::Disabled, + Some("https://cdn.example.com/".to_string()), + ) + .unwrap(); + assert_eq!(result, None); + } + + #[test] + fn test_resolve_data_base_url_custom() { + let result = resolve_data_base_url( + DataBaseUrlSetting::Custom("https://my-server.com/data/".to_string()), + Some("https://cdn.example.com/".to_string()), + ) + .unwrap(); + assert_eq!(result, Some("https://my-server.com/data/".to_string())); + } + + #[test] + fn test_resolve_data_base_url_custom_path() { + let result = resolve_data_base_url( + DataBaseUrlSetting::Custom("/home/user/data".to_string()), + None, + ) + .unwrap(); + assert_eq!(result, Some("file:///home/user/data".to_string())); + } + + // ── MergedCapabilities ── + + #[test] + fn test_merged_capabilities_from_defaults() { + let caps = MergedCapabilities::from_resolver_capabilities(&[ + ResolverCapabilities::datafusion_defaults(), + ]); + assert!(caps.supported_schemes.contains("http")); + 
assert!(caps.supported_schemes.contains("file")); + assert!(caps.supported_format_types.contains("csv")); + assert!(caps.supported_extensions.contains("parquet")); + } + + #[test] + fn test_merged_capabilities_union() { + let df_caps = ResolverCapabilities::datafusion_defaults(); + let custom_caps = ResolverCapabilities { + supported_schemes: vec!["spark".to_string()], + supported_format_types: vec!["delta".to_string()], + supported_extensions: vec![], + }; + let merged = MergedCapabilities::from_resolver_capabilities(&[df_caps, custom_caps]); + assert!(merged.supported_schemes.contains("http")); + assert!(merged.supported_schemes.contains("spark")); + assert!(merged.supported_format_types.contains("csv")); + assert!(merged.supported_format_types.contains("delta")); + } + + #[test] + fn test_url_supported_scheme_and_format() { + let caps = MergedCapabilities::from_resolver_capabilities(&[ + ResolverCapabilities::datafusion_defaults(), + ]); + assert!(caps.url_supported("https", Some("csv"), None)); + assert!(caps.url_supported("file", None, Some("parquet"))); + assert!(caps.url_supported("http", None, None)); // no format = ok + assert!(!caps.url_supported("spark", Some("csv"), None)); // unknown scheme + assert!(!caps.url_supported("https", Some("delta"), None)); // unknown format + } +} diff --git a/vegafusion-core/src/runtime/runtime.rs b/vegafusion-core/src/runtime/runtime.rs index 28607b809..11196513f 100644 --- a/vegafusion-core/src/runtime/runtime.rs +++ b/vegafusion-core/src/runtime/runtime.rs @@ -1,6 +1,7 @@ use std::{any::Any, collections::HashMap, sync::Arc}; use crate::proto::gen::pretransform::pre_transform_values_warning::WarningType as ValuesWarningType; +use crate::runtime::{resolve_data_base_url, DataBaseUrlSetting}; use crate::task_graph::task_value::{MaterializedTaskValue, TaskValue}; use crate::{ data::dataset::VegaFusionDataset, @@ -38,6 +39,14 @@ pub struct PreTransformExtractTable { pub trait VegaFusionRuntimeTrait: Send + Sync { fn 
as_any(&self) -> &dyn Any; + /// Return merged URL capabilities for planning. Default returns DataFusion's built-in + /// capabilities. Runtimes with custom resolvers override this to include their capabilities. + fn planner_capabilities(&self) -> crate::runtime::MergedCapabilities { + crate::runtime::MergedCapabilities::from_resolver_capabilities(&[ + crate::proto::gen::tasks::ResolverCapabilities::datafusion_defaults(), + ]) + } + async fn query_request( &self, task_graph: Arc, @@ -97,11 +106,19 @@ pub trait VegaFusionRuntimeTrait: Send + Sync { preserve_interactivity: bool, inline_datasets: &HashMap, keep_variables: Vec, + data_base_url: DataBaseUrlSetting, ) -> Result<(SpecPlan, Vec)> { + let resolved_base = + resolve_data_base_url(data_base_url, PlannerConfig::default().data_base_url)?; + // Create spec plan let plan = SpecPlan::try_new( spec, - &PlannerConfig::pre_transformed_spec_config(preserve_interactivity, keep_variables), + &PlannerConfig { + capabilities: self.planner_capabilities(), + data_base_url: resolved_base.clone(), + ..PlannerConfig::pre_transformed_spec_config(preserve_interactivity, keep_variables) + }, )?; // Extract inline dataset fingerprints @@ -118,7 +135,7 @@ pub trait VegaFusionRuntimeTrait: Send + Sync { let task_scope = plan.server_spec.to_task_scope().unwrap(); let tasks = plan .server_spec - .to_tasks(&tz_config, &dataset_fingerprints) + .to_tasks(&tz_config, &dataset_fingerprints, resolved_base) .unwrap(); let task_graph = TaskGraph::new(tasks, &task_scope).unwrap(); let task_graph_mapping = task_graph.build_mapping(); @@ -171,6 +188,7 @@ pub trait VegaFusionRuntimeTrait: Send + Sync { options.preserve_interactivity, inline_datasets, keep_variables, + DataBaseUrlSetting::from_proto(options.data_base_url.clone()), ) .await?; @@ -205,6 +223,7 @@ pub trait VegaFusionRuntimeTrait: Send + Sync { options.preserve_interactivity, inline_datasets, keep_variables, + DataBaseUrlSetting::from_proto(options.data_base_url.clone()), ) .await?; 
let init_arrow = self.materialize_export_updates(init).await?; @@ -329,6 +348,11 @@ pub trait VegaFusionRuntimeTrait: Send + Sync { // if they are not used elsewhere in the spec let keep_variables = Vec::from(variables); + let resolved_base = resolve_data_base_url( + DataBaseUrlSetting::from_proto(options.data_base_url.clone()), + PlannerConfig::default().data_base_url, + )?; + // Create spec plan let plan = SpecPlan::try_new( spec, @@ -339,6 +363,8 @@ pub trait VegaFusionRuntimeTrait: Send + Sync { projection_pushdown: false, allow_client_to_server_comms: true, keep_variables, + capabilities: self.planner_capabilities(), + data_base_url: resolved_base.clone(), ..Default::default() }, )?; @@ -357,7 +383,7 @@ pub trait VegaFusionRuntimeTrait: Send + Sync { let task_scope = plan.server_spec.to_task_scope().unwrap(); let tasks = plan .server_spec - .to_tasks(&tz_config, &dataset_fingerprints)?; + .to_tasks(&tz_config, &dataset_fingerprints, resolved_base)?; let task_graph = TaskGraph::new(tasks, &task_scope).unwrap(); let task_graph_mapping = task_graph.build_mapping(); diff --git a/vegafusion-core/src/spec/chart.rs b/vegafusion-core/src/spec/chart.rs index 0d0f55efd..5866fa393 100644 --- a/vegafusion-core/src/spec/chart.rs +++ b/vegafusion-core/src/spec/chart.rs @@ -150,8 +150,9 @@ impl ChartSpec { &self, tz_config: &TzConfig, dataset_fingerprints: &HashMap, + data_base_url: Option, ) -> Result> { - let mut visitor = MakeTasksVisitor::new(tz_config, dataset_fingerprints); + let mut visitor = MakeTasksVisitor::new(tz_config, dataset_fingerprints, data_base_url); self.walk(&mut visitor)?; Ok(visitor.tasks) } diff --git a/vegafusion-core/src/spec/data.rs b/vegafusion-core/src/spec/data.rs index c7fd3e092..06d046c8e 100644 --- a/vegafusion-core/src/spec/data.rs +++ b/vegafusion-core/src/spec/data.rs @@ -55,9 +55,14 @@ impl DataSpec { task_scope: &TaskScope, scope: &[u32], ) -> DependencyNodeSupported { + // Check if the URL format is supported by any resolver's 
capabilities if let Some(Some(format_type)) = self.format.as_ref().map(|fmt| fmt.type_.clone()) { - if !matches!(format_type.as_str(), "csv" | "tsv" | "arrow" | "json") { - // We don't know how to read the data, so full node is unsupported + if !planner_config + .capabilities + .supported_format_types + .contains(&format_type) + { + // No resolver knows how to read this format, so full node is unsupported return DependencyNodeSupported::Unsupported; } } diff --git a/vegafusion-core/src/spec/visitors.rs b/vegafusion-core/src/spec/visitors.rs index 9f4fc1eba..814166088 100644 --- a/vegafusion-core/src/spec/visitors.rs +++ b/vegafusion-core/src/spec/visitors.rs @@ -5,6 +5,7 @@ use crate::proto::gen::tasks::{ ScanUrlFormat, Task, TzConfig, Variable, VariableNamespace, }; use crate::proto::gen::transforms::TransformPipeline; +use crate::runtime::resolve_url; use crate::spec::chart::{ChartSpec, ChartVisitor}; use crate::spec::data::{DataFormatParseSpec, DataSpec}; use crate::spec::mark::{MarkFacetSpec, MarkSpec}; @@ -108,14 +109,20 @@ pub struct MakeTasksVisitor<'a> { pub tasks: Vec, pub tz_config: TzConfig, pub dataset_fingerprints: &'a HashMap, + pub data_base_url: Option, } impl<'a> MakeTasksVisitor<'a> { - pub fn new(tz_config: &TzConfig, dataset_fingerprints: &'a HashMap) -> Self { + pub fn new( + tz_config: &TzConfig, + dataset_fingerprints: &'a HashMap, + data_base_url: Option, + ) -> Self { Self { tasks: Default::default(), tz_config: tz_config.clone(), dataset_fingerprints, + data_base_url, } } } @@ -165,28 +172,34 @@ impl ChartVisitor for MakeTasksVisitor<'_> { }; let task = if let Some(url) = &data.url { - let mut proto_url = match url { - StringOrSignalSpec::String(url) => Url::String(url.clone()), + let (proto_url, task_data_base_url) = match url { + StringOrSignalSpec::String(url) => { + // Resolve URL at plan time (base URL, file:// normalization) + let resolved = resolve_url(url, &self.data_base_url)?; + let mut proto_url = Url::String(resolved); + + // 
Append fingerprint to URL that references an inline dataset + if let Url::String(url_str) = &proto_url { + if let Some(inline_name) = extract_inline_dataset(url_str) { + let inline_name = inline_name.trim().to_string(); + if let Some(fingerprint) = self.dataset_fingerprints.get(&inline_name) { + proto_url = Url::String(format!("{url_str}#{fingerprint}")); + } else { + let fingerprint = random::(); + proto_url = Url::String(format!("{url_str}#{fingerprint}")); + } + } + } + (proto_url, None) + } StringOrSignalSpec::Signal(expr) => { + // Signal-based URL: resolved at eval time. + // Store data_base_url in the task so the remote server has it. let url_expr = parse(&expr.signal)?; - Url::Expr(url_expr) + (Url::Expr(url_expr), self.data_base_url.clone()) } }; - // Append fingerprint to URL that references an inline dataset - if let Url::String(url) = &proto_url { - if let Some(inline_name) = extract_inline_dataset(url) { - let inline_name = inline_name.trim().to_string(); - if let Some(fingerprint) = self.dataset_fingerprints.get(&inline_name) { - proto_url = Url::String(format!("{url}#{fingerprint}")); - } else { - // Unknown fingerprint, use random id to break cache - let fingerprint = random::(); - proto_url = Url::String(format!("{url}#{fingerprint}")); - } - } - } - Task::new_data_url( data_var, scope, @@ -195,6 +208,7 @@ impl ChartVisitor for MakeTasksVisitor<'_> { format_type, pipeline, url: Some(proto_url), + data_base_url: task_data_base_url, }, &self.tz_config, ) diff --git a/vegafusion-python/src/chart_state.rs b/vegafusion-python/src/chart_state.rs index c54ff260f..7af25b865 100644 --- a/vegafusion-python/src/chart_state.rs +++ b/vegafusion-python/src/chart_state.rs @@ -10,7 +10,7 @@ use vegafusion_core::{ data::dataset::VegaFusionDataset, planning::{plan::PreTransformSpecWarningSpec, watch::WatchPlan}, proto::gen::tasks::TzConfig, - runtime::VegaFusionRuntimeTrait, + runtime::{DataBaseUrlSetting, VegaFusionRuntimeTrait}, spec::chart::ChartSpec, }; @@ -37,6 
+37,7 @@ impl PyChartState { ChartStateOpts { tz_config, row_limit, + data_base_url: DataBaseUrlSetting::Default, }, ))?; Ok(Self { diff --git a/vegafusion-python/src/lib.rs b/vegafusion-python/src/lib.rs index 8ba93e50c..1ede8ca66 100644 --- a/vegafusion-python/src/lib.rs +++ b/vegafusion-python/src/lib.rs @@ -237,6 +237,7 @@ impl PyVegaFusionRuntime { default_input_tz, row_limit, preserve_interactivity, + data_base_url: None, keep_variables: keep_variables .into_iter() .map(|v| PreTransformVariable { @@ -296,6 +297,7 @@ impl PyVegaFusionRuntime { local_tz, default_input_tz, row_limit, + data_base_url: None, }, )) })?; @@ -378,6 +380,7 @@ impl PyVegaFusionRuntime { default_input_tz, preserve_interactivity, extract_threshold: extract_threshold as i32, + data_base_url: None, keep_variables, }, )) @@ -585,6 +588,68 @@ pub fn inline_table_scan_node(name: String, schema: pyo3_arrow::PySchema) -> PyR Ok(bytes.to_vec()) } +/// Build a LogicalPlanNode protobuf (as bytes) for an external table scan. +/// +/// Use this in `scan_url` implementations to create ExternalTableProvider plan +/// nodes that will later be resolved by `resolve_plan`. +/// +/// Args: +/// table_name: Name for the table in the plan. +/// schema: Arrow schema (arro3.core.Schema) — required for logical planning. +/// protocol: Optional protocol identifier (e.g. "spark"). +/// metadata: Optional JSON-serializable dict of metadata. +/// source: Optional source identifier. +/// +/// Returns: +/// bytes: Serialized LogicalPlanNode protobuf. 
+#[pyfunction] +#[pyo3(signature = (table_name, schema, protocol=None, metadata=None, source=None))] +pub fn external_table_scan_node( + table_name: String, + schema: pyo3_arrow::PySchema, + protocol: Option, + metadata: Option<&Bound<'_, pyo3::types::PyAny>>, + source: Option, +) -> PyResult> { + use datafusion::datasource::provider_as_source; + use datafusion_proto::bytes::logical_plan_to_bytes_with_extension_codec; + use vegafusion_common::datafusion_expr::LogicalPlanBuilder; + use vegafusion_runtime::data::codec::VegaFusionCodec; + use vegafusion_runtime::data::external_table::ExternalTableProvider; + + let arrow_schema = schema.into_inner(); + + let metadata_value: serde_json::Value = match metadata { + Some(obj) => pythonize::depythonize(obj).map_err(|e| { + pyo3::exceptions::PyValueError::new_err(format!("Failed to convert metadata dict: {e}")) + })?, + None => serde_json::Value::Object(serde_json::Map::new()), + }; + + let provider = Arc::new( + ExternalTableProvider::new(arrow_schema, protocol, metadata_value).with_source(source), + ); + let table_source = provider_as_source(provider); + + let plan = LogicalPlanBuilder::scan(&table_name, table_source, None) + .map_err(|e| { + pyo3::exceptions::PyValueError::new_err(format!("Failed to build scan plan: {e}")) + })? + .build() + .map_err(|e| { + pyo3::exceptions::PyValueError::new_err(format!("Failed to build plan: {e}")) + })?; + + let codec = VegaFusionCodec::new(); + let bytes = logical_plan_to_bytes_with_extension_codec(&plan, &codec).map_err(|e| { + pyo3::exceptions::PyValueError::new_err(format!( + "Failed to serialize external table plan: {e}" + )) + })?; + + Ok(bytes.to_vec()) +} + /// A Python module implemented in Rust. The name of this function must match /// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to /// import the module. 
@@ -597,6 +662,7 @@ fn _vegafusion(_py: Python, m: &Bound) -> PyResult<()> { m.add_function(wrap_pyfunction!(get_virtual_memory, m)?)?; m.add_function(wrap_pyfunction!(get_cpu_count, m)?)?; m.add_function(wrap_pyfunction!(inline_table_scan_node, m)?)?; + m.add_function(wrap_pyfunction!(external_table_scan_node, m)?)?; m.add_function(wrap_pyfunction!(unparse::unparse_plan_to_sql, m)?)?; m.add("__version__", env!("CARGO_PKG_VERSION"))?; Ok(()) diff --git a/vegafusion-python/src/plan_resolver.rs b/vegafusion-python/src/plan_resolver.rs index cb8fd9a0f..9adfc0a86 100644 --- a/vegafusion-python/src/plan_resolver.rs +++ b/vegafusion-python/src/plan_resolver.rs @@ -15,7 +15,8 @@ use vegafusion_common::arrow::record_batch::RecordBatch; use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::{Result, VegaFusionError}; -use vegafusion_core::runtime::{PlanResolver, ResolutionResult}; +use vegafusion_core::proto::gen::tasks::ResolverCapabilities; +use vegafusion_core::runtime::{ParsedUrl, PlanResolver, ResolutionResult}; use vegafusion_runtime::data::codec::VegaFusionCodec; use vegafusion_runtime::data::external_table::ExternalTableProvider; @@ -28,33 +29,39 @@ pub struct PyPlanResolver { name: String, skip_when_no_external_tables: bool, thread_safe: bool, + has_scan_url_override: bool, } impl PyPlanResolver { pub fn new(py_resolver: Py) -> Self { - let (name, skip_when_no_external_tables, thread_safe) = Python::attach(|py| { - let name = py_resolver - .bind(py) - .get_type() - .qualname() - .map(|q| q.to_string()) - .unwrap_or_else(|_| "PyPlanResolver".to_string()); - let skip = py_resolver - .getattr(py, "skip_when_no_external_tables") - .and_then(|v| v.extract::(py)) - .unwrap_or(true); - let safe = py_resolver - .getattr(py, "thread_safe") - .and_then(|v| v.extract::(py)) - .unwrap_or(true); - (name, skip, safe) - }); + let (name, skip_when_no_external_tables, thread_safe, 
has_scan_url_override) = + Python::attach(|py| { + let name = py_resolver + .bind(py) + .get_type() + .qualname() + .map(|q| q.to_string()) + .unwrap_or_else(|_| "PyPlanResolver".to_string()); + let skip = py_resolver + .getattr(py, "skip_when_no_external_tables") + .and_then(|v| v.extract::(py)) + .unwrap_or(true); + let safe = py_resolver + .getattr(py, "thread_safe") + .and_then(|v| v.extract::(py)) + .unwrap_or(true); + // Check if the Python class overrides scan_url or scan_url_proto + let has_scan_url = Self::check_method_override(py, &py_resolver, "scan_url") + || Self::check_method_override(py, &py_resolver, "scan_url_proto"); + (name, skip, safe, has_scan_url) + }); Self { py_resolver, name, skip_when_no_external_tables, thread_safe, + has_scan_url_override, } } @@ -62,6 +69,22 @@ impl PyPlanResolver { pub fn thread_safe(&self) -> bool { self.thread_safe } + + /// Check if a Python method is overridden from the base class. + fn check_method_override(py: Python, obj: &Py, method_name: &str) -> bool { + // Get the method from the instance's class and compare to the base PlanResolver class + let result: PyResult = (|| { + let bound = obj.bind(py); + let cls = bound.get_type(); + let base_cls = py + .import("vegafusion.plan_resolver")? + .getattr("PlanResolver")?; + let cls_method = cls.getattr(method_name)?; + let base_method = base_cls.getattr(method_name)?; + Ok(!cls_method.is(&base_method)) + })(); + result.unwrap_or(false) + } } /// Info extracted from an ExternalTableProvider node in the plan. 
@@ -163,6 +186,96 @@ impl PlanResolver for PyPlanResolver { &self.name } + fn capabilities(&self) -> ResolverCapabilities { + Python::attach(|py| { + let result: PyResult = (|| { + let dict = self.py_resolver.call_method0(py, "capabilities")?; + let dict_ref = dict.bind(py); + + let extract_list = |key: &str| -> PyResult> { + match dict_ref.get_item(key) { + Ok(val) => val.extract(), + Err(_) => Ok(Vec::new()), + } + }; + + Ok(ResolverCapabilities { + supported_schemes: extract_list("supported_schemes")?, + supported_format_types: extract_list("supported_format_types")?, + supported_extensions: extract_list("supported_extensions")?, + }) + })(); + result.unwrap_or_default() + }) + } + + async fn scan_url(&self, parsed_url: &ParsedUrl) -> Result> { + if !self.has_scan_url_override { + return Ok(None); + } + + Python::attach(|py| { + // Serialize ParsedUrl to a Python dict + let dict = PyDict::new(py); + dict.set_item("url", &parsed_url.url) + .map_err(|e| VegaFusionError::internal(format!("Failed to set url: {e}")))?; + dict.set_item("scheme", &parsed_url.scheme) + .map_err(|e| VegaFusionError::internal(format!("Failed to set scheme: {e}")))?; + dict.set_item("host", parsed_url.host.as_deref()) + .map_err(|e| VegaFusionError::internal(format!("Failed to set host: {e}")))?; + dict.set_item("path", &parsed_url.path) + .map_err(|e| VegaFusionError::internal(format!("Failed to set path: {e}")))?; + // query_params as list of [key, value] pairs + let params: Vec<(&str, &str)> = parsed_url + .query_params + .iter() + .map(|(k, v)| (k.as_str(), v.as_str())) + .collect(); + dict.set_item("query_params", params).map_err(|e| { + VegaFusionError::internal(format!("Failed to set query_params: {e}")) + })?; + dict.set_item("extension", parsed_url.extension.as_deref()) + .map_err(|e| VegaFusionError::internal(format!("Failed to set extension: {e}")))?; + dict.set_item("format_type", parsed_url.format_type.as_deref()) + .map_err(|e| { + VegaFusionError::internal(format!("Failed to 
set format_type: {e}")) + })?; + + let result = self + .py_resolver + .call_method1(py, "scan_url_proto", (&dict,)) + .map_err(|e| { + VegaFusionError::internal(format!("Python scan_url_proto failed: {e}")) + })?; + + let result_ref = result.bind(py); + + if result_ref.is_none() { + return Ok(None); + } + + // Result is bytes — deserialize into LogicalPlan + let plan_bytes: Vec = result_ref.extract().map_err(|e| { + VegaFusionError::internal(format!( + "scan_url_proto must return bytes or None, got: {e}" + )) + })?; + + let ctx = vegafusion_runtime::datafusion::context::make_datafusion_context(); + let codec = VegaFusionCodec::new(); + let plan = datafusion_proto::bytes::logical_plan_from_bytes_with_extension_codec( + &plan_bytes, + &ctx.task_ctx(), + &codec, + ) + .map_err(|e| { + VegaFusionError::internal(format!("Failed to deserialize scan_url plan: {e}")) + })?; + + Ok(Some(plan)) + }) + } + async fn resolve_plan(&self, plan: LogicalPlan) -> Result { let tables = extract_external_tables(&plan); diff --git a/vegafusion-python/tests/test_plan_resolver.py b/vegafusion-python/tests/test_plan_resolver.py index 1f0ed761e..55e73b1ad 100644 --- a/vegafusion-python/tests/test_plan_resolver.py +++ b/vegafusion-python/tests/test_plan_resolver.py @@ -708,3 +708,177 @@ def resolve_plan_proto( assert resolver.error is not None assert "Unknown dialect" in str(resolver.error) + + +# ── scan_url tests ── + + +def test_scan_url_called_with_structured_dict() -> None: + """scan_url receives a structured dict with parsed URL fields.""" + from vegafusion.plan_resolver import external_table_scan_node + + received_urls: list[dict[str, Any]] = [] + + class UrlCapturingResolver(PlanResolver): + def scan_url(self, parsed_url: dict[str, Any]) -> Any: + received_urls.append(parsed_url) + # Create an ExternalTableProvider plan node + schema = pa.schema([("x", pa.int64()), ("y", pa.utf8())]) + return external_table_scan_node( + table_name="captured", + schema=schema, + protocol="test", + 
metadata={"source_url": parsed_url["url"]}, + ) + + def resolve_table( + self, + name: str, + schema: Any, + metadata: dict[str, Any], + projected_columns: list[str] | None = None, + ) -> pa.Table: + return pa.table({"x": [1, 2], "y": ["a", "b"]}) + + resolver = UrlCapturingResolver() + rt = vf.VegaFusionRuntime(plan_resolver=resolver) + + spec = { + "$schema": "https://vega.github.io/schema/vega/v5.json", + "data": [ + { + "name": "source", + "url": "https://example.com/data.csv?limit=10&format=raw", + "format": {"type": "csv"}, + } + ], + } + + rt.pre_transform_datasets(spec, datasets=["source"], dataset_format="pyarrow") + + assert len(received_urls) == 1 + url_dict = received_urls[0] + assert url_dict["scheme"] == "https" + assert url_dict["host"] == "example.com" + assert url_dict["url"].startswith("https://example.com/data.csv") + assert url_dict["extension"] == "csv" + assert url_dict["format_type"] == "csv" + # Query params preserved + assert isinstance(url_dict["query_params"], list) + + +def test_scan_url_none_falls_back_to_datafusion() -> None: + """scan_url returning None causes DataFusion to handle the URL.""" + + class NoOpScanner(PlanResolver): + def __init__(self) -> None: + self.scan_url_called = False + + def scan_url(self, parsed_url: dict[str, Any]) -> Any: + self.scan_url_called = True + return None # Pass to next resolver (DataFusion) + + csv_path = os.path.join(tempfile.gettempdir(), "vf_scan_fallback.csv") + table = pa.table({"x": [1, 5, 10]}) + pcsv.write_csv(table, csv_path) + + resolver = NoOpScanner() + rt = vf.VegaFusionRuntime(plan_resolver=resolver) + + spec = { + "$schema": "https://vega.github.io/schema/vega/v5.json", + "data": [ + { + "name": "source", + "url": csv_path, + "format": {"type": "csv"}, + } + ], + } + + datasets, _warnings = rt.pre_transform_datasets( + spec, datasets=["source"], dataset_format="pyarrow" + ) + + assert resolver.scan_url_called + assert len(datasets) == 1 + assert datasets[0].num_rows == 3 + + +def 
test_capabilities_extends_planner_support() -> None: + """capabilities() dict declaring custom scheme lets planner accept it.""" + from vegafusion.plan_resolver import external_table_scan_node + + class CustomSchemeResolver(PlanResolver): + def capabilities(self) -> dict[str, list[str]]: + return {"supported_schemes": ["myproto"]} + + def scan_url(self, parsed_url: dict[str, Any]) -> Any: + if parsed_url["scheme"] == "myproto": + schema = pa.schema([("val", pa.int64())]) + return external_table_scan_node( + table_name="custom_data", + schema=schema, + protocol="myproto", + ) + return None + + def resolve_table( + self, + name: str, + schema: Any, + metadata: dict[str, Any], + projected_columns: list[str] | None = None, + ) -> pa.Table: + return pa.table({"val": [42, 99]}) + + resolver = CustomSchemeResolver() + rt = vf.VegaFusionRuntime(plan_resolver=resolver) + + spec = { + "$schema": "https://vega.github.io/schema/vega/v5.json", + "data": [ + { + "name": "source", + "url": "myproto://database/table1", + } + ], + } + + datasets, _warnings = rt.pre_transform_datasets( + spec, datasets=["source"], dataset_format="pyarrow" + ) + + assert len(datasets) == 1 + assert datasets[0].column("val").to_pylist() == [42, 99] + + +def test_scan_url_not_called_without_override() -> None: + """Resolver without scan_url override does not trigger Python roundtrip.""" + + class SimpleResolver(PlanResolver): + """Only overrides resolve_table — should NOT trigger scan_url calls.""" + + def resolve_table( + self, + name: str, + schema: Any, + metadata: dict[str, Any], + projected_columns: list[str] | None = None, + ) -> pa.Table: + return pa.table({"x": [1, 2, 3]}) + + source_table = pa.table({"x": [1, 2, 3]}) + ext = ExternalDataset(protocol="test", schema=source_table.schema, data=source_table) + resolver = SimpleResolver() + rt = vf.VegaFusionRuntime(plan_resolver=resolver) + + spec = simple_spec() + datasets, _warnings = rt.pre_transform_datasets( + spec, + datasets=["filtered"], + 
inline_datasets={"source": ext}, + dataset_format="pyarrow", + ) + + assert len(datasets) == 1 diff --git a/vegafusion-python/vegafusion/plan_resolver.py b/vegafusion-python/vegafusion/plan_resolver.py index c29f893d5..042749b72 100644 --- a/vegafusion-python/vegafusion/plan_resolver.py +++ b/vegafusion-python/vegafusion/plan_resolver.py @@ -75,6 +75,61 @@ class PlanResolver: callbacks run on the main thread. Set to False for backends with thread-affine connections (e.g. DuckDB in-memory databases).""" + def capabilities(self) -> dict[str, list[str]]: + """Declare URL patterns this resolver supports at planning time. + + Override to advertise additional URL scheme/format support beyond + DataFusion's built-in capabilities (http, https, file, s3 schemes + with csv, tsv, json, arrow, parquet formats). + + Returns: + Dict with optional keys: ``'supported_schemes'``, + ``'supported_format_types'``, ``'supported_extensions'``. + Values are lists of strings. + """ + return {} + + def scan_url_proto(self, parsed_url: dict[str, Any]) -> bytes | None: + """Handle a URL during the scan phase (raw bytes variant). + + The default implementation delegates to :meth:`scan_url` which works + with deserialized ``LogicalPlanNode`` messages. + + Args: + parsed_url: Dict with keys ``url``, ``scheme``, ``host``, ``path``, + ``query_params``, ``extension``, ``format_type``. + + Returns: + Serialized ``LogicalPlanNode`` bytes, or None to pass to the next + resolver. + """ + result = self.scan_url(parsed_url) + if result is None: + return None + if isinstance(result, bytes): + return result + # It's a LogicalPlanNode proto message + return result.SerializeToString() + + def scan_url( + self, parsed_url: dict[str, Any] + ) -> LogicalPlanNode | bytes | None: + """Handle a URL during the scan phase. + + Override to claim URLs by returning a ``LogicalPlanNode`` or raw bytes. 
+ Use :func:`external_table_scan_node` to build ``ExternalTableProvider`` + plan nodes that will later be resolved by :meth:`resolve_plan`. + + Args: + parsed_url: Dict with keys ``url``, ``scheme``, ``host``, ``path``, + ``query_params``, ``extension``, ``format_type``. + + Returns: + A ``LogicalPlanNode``, raw bytes, or None to pass to the next + resolver. + """ + return None + def resolve_table( self, name: str, @@ -295,6 +350,51 @@ def inline_table_scan_node( return node +def external_table_scan_node( + table_name: str, + schema: Schema, + protocol: str | None = None, + metadata: dict[str, Any] | None = None, + source: str | None = None, +) -> LogicalPlanNode: + """Build a LogicalPlanNode for an external table scan. + + Use this in :meth:`PlanResolver.scan_url` implementations to create + ``ExternalTableProvider`` plan nodes that will later be resolved by + :meth:`PlanResolver.resolve_plan`. + + Args: + table_name: Name for the table in the plan. + schema: Arrow schema (arro3.core.Schema) — required for logical planning. + protocol: Optional protocol identifier (e.g. ``"spark"``). + metadata: Optional JSON-serializable dict of metadata. + source: Optional source identifier. + + Returns: + A deserialized LogicalPlanNode protobuf message. 
+ """ + from vegafusion._vegafusion import external_table_scan_node as _native + + try: + from vegafusion.proto.datafusion_pb2 import ( + LogicalPlanNode, # type: ignore[attr-defined] + ) + except ImportError as e: + raise ImportError(_PROTOBUF_INSTALL_HINT) from e + + node = LogicalPlanNode() + node.ParseFromString( + _native( + table_name=table_name, + schema=schema, + protocol=protocol, + metadata=metadata, + source=source, + ) + ) + return node + + def unparse_to_sql( plan: bytes | LogicalPlanNode, dialect: str = "default", diff --git a/vegafusion-runtime/benches/spec_benchmarks.rs b/vegafusion-runtime/benches/spec_benchmarks.rs index c2a0b313c..7d3484b5a 100644 --- a/vegafusion-runtime/benches/spec_benchmarks.rs +++ b/vegafusion-runtime/benches/spec_benchmarks.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use std::fs; use std::sync::Arc; -use vegafusion_core::planning::plan::SpecPlan; +use vegafusion_core::planning::plan::{PlannerConfig, SpecPlan}; use vegafusion_core::planning::watch::ExportUpdateBatch; use vegafusion_core::proto::gen::services::query_request::Request; use vegafusion_core::proto::gen::services::QueryRequest; @@ -46,7 +46,11 @@ async fn eval_spec_get_variable(full_spec: ChartSpec, var: &ScopedVariable) -> V let task_scope = spec_plan.server_spec.to_task_scope().unwrap(); let tasks = spec_plan .server_spec - .to_tasks(&tz_config, &Default::default()) + .to_tasks( + &tz_config, + &Default::default(), + PlannerConfig::default().data_base_url, + ) .unwrap(); let task_graph = TaskGraph::new(tasks, &task_scope).unwrap(); let task_graph_mapping = task_graph.build_mapping(); @@ -97,7 +101,11 @@ async fn eval_spec_sequence(full_spec: ChartSpec, full_updates: Vec, } @@ -29,6 +36,59 @@ impl PlanResolver for DataFusionResolver { "DataFusionResolver" } + fn capabilities(&self) -> ResolverCapabilities { + ResolverCapabilities::datafusion_defaults() + } + + async fn scan_url(&self, parsed_url: &ParsedUrl) -> Result> { + // Only handle known schemes + 
match parsed_url.scheme.as_str() { + "http" | "https" | "s3" | "file" => {} + _ => return Ok(None), + } + + // Determine file type: format_type takes precedence over extension. + // "json" format_type is treated as None (json is Vega-Lite's default, + // shouldn't override extension detection). + let file_type = match &parsed_url.format_type { + Some(ft) if ft != "json" => Some(ft.as_str()), + _ => None, + }; + let ext = parsed_url.extension.as_deref(); + + let url = &parsed_url.url; + let ctx = self.ctx.clone(); + + let df = if file_type == Some("csv") || (file_type.is_none() && ext == Some("csv")) { + read_csv(url, &None, ctx, false).await? + } else if file_type == Some("tsv") || (file_type.is_none() && ext == Some("tsv")) { + read_csv(url, &None, ctx, true).await? + } else if file_type == Some("json") + || (file_type.is_none() && matches!(ext, Some("json") | None)) + { + read_json(url, ctx).await? + } else if file_type == Some("arrow") + || (file_type.is_none() && matches!(ext, Some("arrow") | Some("feather"))) + { + read_arrow(url, ctx).await? + } else if file_type == Some("parquet") || (file_type.is_none() && ext == Some("parquet")) { + cfg_if! { + if #[cfg(feature = "parquet")] { + read_parquet(url, ctx).await? 
+ } else { + return Err(VegaFusionError::internal( + "Enable parquet support by enabling the `parquet` feature flag" + )) + } + } + } else { + // Unrecognized format — pass to next resolver + return Ok(None); + }; + + Ok(Some(df.logical_plan().clone())) + } + async fn resolve_plan(&self, plan: LogicalPlan) -> Result { let table = DataFrame::new(self.ctx.state(), plan) .collect_to_table() diff --git a/vegafusion-runtime/src/data/pipeline.rs b/vegafusion-runtime/src/data/pipeline.rs index 02ed64570..8eb5d6fac 100644 --- a/vegafusion-runtime/src/data/pipeline.rs +++ b/vegafusion-runtime/src/data/pipeline.rs @@ -4,56 +4,90 @@ use datafusion::prelude::SessionContext; use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::Result; -use vegafusion_core::runtime::{PlanResolver, ResolutionResult}; +use vegafusion_core::proto::gen::tasks::ResolverCapabilities; +use vegafusion_core::runtime::{MergedCapabilities, ParsedUrl, PlanResolver, ResolutionResult}; use super::datafusion_resolver::DataFusionResolver; -/// Chains user-supplied resolvers with a terminal `DataFusionResolver`. +/// Chains resolvers with a terminal `DataFusionResolver`. /// -/// Each user resolver either returns a fully materialized `Table` (short-circuiting -/// the pipeline) or a rewritten `Plan` that is passed to the next resolver. -/// The `DataFusionResolver` at the end always executes the plan and returns a table. +/// All resolvers (user-supplied + DataFusionResolver) live in a single vec. +/// DataFusionResolver is always the last resolver in the chain. +/// +/// For `scan_url`, resolvers are tried in order; the first `Some(plan)` wins. +/// For `resolve`, each resolver either returns a `Table` (short-circuiting) +/// or a rewritten `Plan` passed to the next resolver. 
#[derive(Clone)] pub struct ResolverPipeline { - user_resolvers: Arc>>, - datafusion_resolver: Arc, + resolvers: Arc>>, + ctx: Arc, } impl ResolverPipeline { pub fn new(user_resolvers: Vec>, ctx: Arc) -> Self { + let mut resolvers: Vec> = user_resolvers; + resolvers.push(Arc::new(DataFusionResolver::new(ctx.clone()))); Self { - user_resolvers: Arc::new(user_resolvers), - datafusion_resolver: Arc::new(DataFusionResolver::new(ctx)), + resolvers: Arc::new(resolvers), + ctx, } } /// Whether any user-supplied resolvers are registered. pub fn has_user_resolvers(&self) -> bool { - !self.user_resolvers.is_empty() + self.resolvers.len() > 1 } /// Access the shared `SessionContext`. pub fn ctx(&self) -> &SessionContext { - &self.datafusion_resolver.ctx + &self.ctx + } + + /// Try each resolver's `scan_url` in order. Returns the first `Some(plan)`. + pub async fn scan_url(&self, parsed_url: &ParsedUrl) -> Result> { + for resolver in self.resolvers.iter() { + if let Some(plan) = resolver.scan_url(parsed_url).await? { + return Ok(Some(plan)); + } + } + Ok(None) + } + + /// Merge capabilities from all resolvers into a single set for planner lookups. + pub fn merged_capabilities(&self) -> MergedCapabilities { + MergedCapabilities::from_resolver_capabilities( + &self + .resolvers + .iter() + .map(|r| r.capabilities()) + .collect::>(), + ) + } + + /// Return a single merged `ResolverCapabilities` proto (union of all resolvers). + /// Useful for serializing capabilities over gRPC/WASM. + pub fn merged_resolver_capabilities(&self) -> ResolverCapabilities { + let merged = self.merged_capabilities(); + ResolverCapabilities { + supported_schemes: merged.supported_schemes.into_iter().collect(), + supported_format_types: merged.supported_format_types.into_iter().collect(), + supported_extensions: merged.supported_extensions.into_iter().collect(), + } } /// Resolve a `LogicalPlan` to a `VegaFusionTable`. 
/// - /// Iterates through user resolvers first; if any returns `Table`, that result - /// is returned immediately. Otherwise the (possibly rewritten) plan is executed - /// by the terminal `DataFusionResolver`. + /// Iterates through all resolvers; if any returns `Table`, that result + /// is returned immediately. Otherwise the (possibly rewritten) plan is + /// passed to the next resolver. pub async fn resolve(&self, plan: LogicalPlan) -> Result { let mut current = plan; - for resolver in self.user_resolvers.iter() { + for resolver in self.resolvers.iter() { match resolver.resolve_plan(current).await? { ResolutionResult::Table(table) => return Ok(table), ResolutionResult::Plan(p) => current = p, } } - // Terminal: DataFusionResolver always returns Table - match self.datafusion_resolver.resolve_plan(current).await? { - ResolutionResult::Table(table) => Ok(table), - ResolutionResult::Plan(_) => unreachable!("DataFusionResolver always returns Table"), - } + unreachable!("DataFusionResolver (last in chain) always returns Table") } } diff --git a/vegafusion-runtime/src/data/tasks.rs b/vegafusion-runtime/src/data/tasks.rs index a42c3dbd2..6089baa02 100644 --- a/vegafusion-runtime/src/data/tasks.rs +++ b/vegafusion-runtime/src/data/tasks.rs @@ -8,7 +8,7 @@ use std::borrow::Cow; use async_trait::async_trait; use datafusion_expr::{lit, Expr}; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::path::Path; use std::sync::Arc; use vegafusion_core::data::dataset::VegaFusionDataset; @@ -133,11 +133,16 @@ impl TaskCall for DataUrlTask { // Build url string let url = match self.url.as_ref().unwrap() { - Url::String(url) => url.clone(), + Url::String(url) => { + // Already resolved at plan time by MakeTasksVisitor + url.clone() + } Url::Expr(expr) => { + // Signal-based URL — resolve at eval time let compiled = compile(expr, &config, None).await?; let url_scalar = compiled.eval_to_scalar()?; - url_scalar.to_scalar_string()? 
+ let raw_url = url_scalar.to_scalar_string()?; + vegafusion_core::runtime::resolve_url(&raw_url, &self.data_base_url)? } }; @@ -145,19 +150,16 @@ impl TaskCall for DataUrlTask { let url_parts: Vec<&str> = url.splitn(2, '#').collect(); let url = url_parts.first().cloned().unwrap_or(&url).to_string(); - // Handle references to vega default datasets (e.g. "data/us-10m.json") - let url = check_builtin_dataset(url); - // Load data from URL let parse = self.format_type.as_ref().and_then(|fmt| fmt.parse.clone()); let file_type = self.format_type.as_ref().and_then(|fmt| fmt.r#type.clone()); // Vega-Lite sets unspecified file types to "json", so we don't want this to take // precedence over file extension - let file_type = if file_type == Some("json".to_string()) { + let format_type = if file_type == Some("json".to_string()) { None } else { - file_type.as_deref() + file_type }; let inline_name = extract_inline_dataset(&url).map(|name| name.trim().to_string()); @@ -183,32 +185,17 @@ impl TaskCall for DataUrlTask { "No inline dataset named {inline_name}" ))); } - } else if file_type == Some("csv") || (file_type.is_none() && url.ends_with(".csv")) { - read_csv(&url, &parse, ctx.clone(), false).await? - } else if file_type == Some("tsv") || (file_type.is_none() && url.ends_with(".tsv")) { - read_csv(&url, &parse, ctx.clone(), true).await? - } else if file_type == Some("json") || (file_type.is_none() && url.ends_with(".json")) { - read_json(&url, ctx.clone()).await? - } else if file_type == Some("arrow") - || (file_type.is_none() && (url.ends_with(".arrow") || url.ends_with(".feather"))) - { - read_arrow(&url, ctx.clone()).await? - } else if file_type == Some("parquet") - || (file_type.is_none() && (url.ends_with(".parquet"))) - { - cfg_if! { - if #[cfg(any(feature = "parquet"))] { - read_parquet(&url, ctx.clone()).await? 
- } else { + } else { + // Construct ParsedUrl and dispatch to pipeline.scan_url() + let parsed_url = build_parsed_url(&url, format_type.as_deref())?; + match pipeline.scan_url(&parsed_url).await? { + Some(plan) => DataFrame::new(ctx.state(), plan), + None => { return Err(VegaFusionError::internal(format!( - "Enable parquet support by enabling the `parquet` feature flag" - ))) + "No resolver handled URL: {url}" + ))); } } - } else { - return Err(VegaFusionError::internal(format!( - "Invalid url file extension {url}" - ))); }; // Ensure there is an ordering column present @@ -237,106 +224,42 @@ impl TaskCall for DataUrlTask { let task_value = result_df.to_task_value(inline_dataset).await?; maybe_materialize_plan(task_value, &pipeline).await? } else { - TaskValue::Table(result_df.collect_to_table().await?) + // URL-sourced data: use Plan when user resolvers exist for lazy evaluation + let task_value = TaskValue::Plan(result_df.logical_plan().clone()); + maybe_materialize_plan(task_value, &pipeline).await? }; Ok((task_value, output_values)) } } -lazy_static! 
{ - static ref BUILT_IN_DATASETS: HashSet<&'static str> = vec![ - "7zip.png", - "airports.csv", - "annual-precip.json", - "anscombe.json", - "barley.json", - "birdstrikes.csv", - "budget.json", - "budgets.json", - "burtin.json", - "cars.json", - "co2-concentration.csv", - "countries.json", - "crimea.json", - "disasters.csv", - "driving.json", - "earthquakes.json", - "ffox.png", - "flare-dependencies.json", - "flare.json", - "flights-10k.json", - "flights-200k.arrow", - "flights-200k.json", - "flights-20k.json", - "flights-2k.json", - "flights-3m.csv", - "flights-5k.json", - "flights-airport.csv", - "football.json", - "gapminder-health-income.csv", - "gapminder.json", - "gimp.png", - "github.csv", - "income.json", - "iowa-electricity.csv", - "jobs.json", - "la-riots.csv", - "londonBoroughs.json", - "londonCentroids.json", - "londonTubeLines.json", - "lookup_groups.csv", - "lookup_people.csv", - "miserables.json", - "monarchs.json", - "movies.json", - "normal-2d.json", - "obesity.json", - "ohlc.json", - "penguins.json", - "platformer-terrain.json", - "points.json", - "political-contributions.json", - "population_engineers_hurricanes.csv", - "population.json", - "seattle-weather.csv", - "seattle-weather-hourly-normals.csv", - "sp500-2000.csv", - "sp500.csv", - "stocks.csv", - "udistrict.json", - "unemployment-across-industries.json", - "unemployment.tsv", - "uniform-2d.json", - "us-10m.json", - "us-employment.csv", - "us-state-capitals.json", - "volcano.json", - "weather.csv", - "weather.json", - "wheat.json", - "windvectors.csv", - "world-110m.json", - "zipcodes.csv", - ] - .into_iter() - .collect(); -} - -const DATASET_BASE: &str = "https://raw.githubusercontent.com/vega/vega-datasets"; -const DATASET_TAG: &str = "v2.3.0"; +/// Construct a `ParsedUrl` from a fully-resolved URL string and optional format type. 
+fn build_parsed_url( + url: &str, + format_type: Option<&str>, +) -> Result { + let parsed = url::Url::parse(url) + .map_err(|e| VegaFusionError::internal(format!("Failed to parse URL '{url}': {e}")))?; + + let extension = std::path::Path::new(parsed.path()) + .extension() + .and_then(|ext| ext.to_str()) + .map(|s| s.to_string()); + + let query_params: Vec<(String, String)> = parsed + .query_pairs() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); -fn check_builtin_dataset(url: String) -> String { - if let Some(dataset) = url.strip_prefix("data/") { - let path = std::path::Path::new(&url); - if !path.exists() && BUILT_IN_DATASETS.contains(dataset) { - format!("{DATASET_BASE}/{DATASET_TAG}/data/{dataset}") - } else { - url - } - } else { - url - } + Ok(vegafusion_core::runtime::ParsedUrl { + url: url.to_string(), + scheme: parsed.scheme().to_string(), + host: parsed.host_str().map(|s| s.to_string()), + path: parsed.path().to_string(), + query_params, + extension, + format_type: format_type.map(|s| s.to_string()), + }) } /// After processing, all datetime columns are converted to Timestamptz and Date32 @@ -696,7 +619,7 @@ async fn read_csv_with_reqwest( ctx.vegafusion_table(table).await } -async fn read_csv( +pub(crate) async fn read_csv( url: &str, parse: &Option, ctx: Arc, @@ -736,7 +659,7 @@ async fn read_csv( } /// Build final schema by combining the input and inferred schemas -async fn build_csv_schema( +pub(crate) async fn build_csv_schema( csv_opts: &CsvReadOptions<'_>, parse: &Option, uri: impl Into, @@ -798,7 +721,7 @@ async fn build_csv_schema( Ok(Schema::new(new_fields)) } -async fn read_json(url: &str, ctx: Arc) -> Result { +pub(crate) async fn read_json(url: &str, ctx: Arc) -> Result { let value: serde_json::Value = if let Some(base_url) = maybe_register_object_stores_for_url(&ctx, url)? 
{ // Create single use object store that points directly to file @@ -864,18 +787,18 @@ async fn read_json(url: &str, ctx: Arc) -> Result { ctx.vegafusion_table(table).await } -async fn read_arrow(url: &str, ctx: Arc) -> Result { +pub(crate) async fn read_arrow(url: &str, ctx: Arc) -> Result { maybe_register_object_stores_for_url(&ctx, url)?; Ok(ctx.read_arrow(url, ArrowReadOptions::default()).await?) } #[cfg(feature = "parquet")] -async fn read_parquet(url: &str, ctx: Arc) -> Result { +pub(crate) async fn read_parquet(url: &str, ctx: Arc) -> Result { maybe_register_object_stores_for_url(&ctx, url)?; Ok(ctx.read_parquet(url, ParquetReadOptions::default()).await?) } -fn maybe_register_object_stores_for_url( +pub(crate) fn maybe_register_object_stores_for_url( ctx: &SessionContext, url: &str, ) -> Result> { diff --git a/vegafusion-runtime/src/task_graph/grpc_runtime.rs b/vegafusion-runtime/src/task_graph/grpc_runtime.rs index 43360a7be..9cdaaa49f 100644 --- a/vegafusion-runtime/src/task_graph/grpc_runtime.rs +++ b/vegafusion-runtime/src/task_graph/grpc_runtime.rs @@ -3,11 +3,11 @@ use vegafusion_core::{ proto::gen::{ services::{ query_request, query_result, vega_fusion_runtime_client::VegaFusionRuntimeClient, - QueryRequest, + GetCapabilitiesRequest, QueryRequest, }, tasks::{NodeValueIndex, TaskGraph, TaskGraphValueRequest}, }, - runtime::VegaFusionRuntimeTrait, + runtime::{MergedCapabilities, VegaFusionRuntimeTrait}, task_graph::task_value::NamedTaskValue, }; @@ -21,6 +21,7 @@ use vegafusion_common::error::{Result, VegaFusionError}; #[derive(Clone)] pub struct GrpcVegaFusionRuntime { client: Arc>>, + capabilities: MergedCapabilities, } #[async_trait] @@ -29,6 +30,10 @@ impl VegaFusionRuntimeTrait for GrpcVegaFusionRuntime { self } + fn planner_capabilities(&self) -> MergedCapabilities { + self.capabilities.clone() + } + async fn query_request( &self, task_graph: Arc, @@ -66,9 +71,19 @@ impl VegaFusionRuntimeTrait for GrpcVegaFusionRuntime { impl GrpcVegaFusionRuntime 
{ pub async fn try_new(channel: tonic::transport::Channel) -> Result { - let client = VegaFusionRuntimeClient::new(channel); + let mut client = VegaFusionRuntimeClient::new(channel); + + // Fetch capabilities from the server at construction time + let caps_response = client + .get_capabilities(GetCapabilitiesRequest {}) + .await + .map_err(|e| VegaFusionError::internal(format!("Failed to get capabilities: {e}")))?; + let caps = caps_response.into_inner().capabilities.unwrap_or_default(); + let capabilities = MergedCapabilities::from_resolver_capabilities(&[caps]); + Ok(Self { client: Arc::new(Mutex::new(client)), + capabilities, }) } } diff --git a/vegafusion-runtime/src/task_graph/runtime.rs b/vegafusion-runtime/src/task_graph/runtime.rs index 8c0df325d..8fea7392c 100644 --- a/vegafusion-runtime/src/task_graph/runtime.rs +++ b/vegafusion-runtime/src/task_graph/runtime.rs @@ -94,6 +94,10 @@ impl VegaFusionRuntimeTrait for VegaFusionRuntime { self } + fn planner_capabilities(&self) -> vegafusion_core::runtime::MergedCapabilities { + self.pipeline.merged_capabilities() + } + async fn materialize_task_values( &self, values: Vec, diff --git a/vegafusion-runtime/tests/test_chart_state.rs b/vegafusion-runtime/tests/test_chart_state.rs index 2aaca80a3..4432baadd 100644 --- a/vegafusion-runtime/tests/test_chart_state.rs +++ b/vegafusion-runtime/tests/test_chart_state.rs @@ -37,6 +37,7 @@ mod tests { default_input_tz: None, }, row_limit: None, + data_base_url: Default::default(), }, ) .await diff --git a/vegafusion-runtime/tests/test_destringify_selection_datasets.rs b/vegafusion-runtime/tests/test_destringify_selection_datasets.rs index edc5ffba5..3aa8a2968 100644 --- a/vegafusion-runtime/tests/test_destringify_selection_datasets.rs +++ b/vegafusion-runtime/tests/test_destringify_selection_datasets.rs @@ -36,6 +36,7 @@ mod tests { local_tz: "UTC".to_string(), default_input_tz: None, preserve_interactivity: true, + data_base_url: None, }, ) .await diff --git 
a/vegafusion-runtime/tests/test_image_comparison.rs b/vegafusion-runtime/tests/test_image_comparison.rs index ed57dd24f..da5f3a71d 100644 --- a/vegafusion-runtime/tests/test_image_comparison.rs +++ b/vegafusion-runtime/tests/test_image_comparison.rs @@ -1442,7 +1442,11 @@ async fn check_spec_sequence( // Build task graph let tasks = spec_plan .server_spec - .to_tasks(&tz_config, &Default::default()) + .to_tasks( + &tz_config, + &Default::default(), + PlannerConfig::default().data_base_url, + ) .unwrap(); let mut task_graph = TaskGraph::new(tasks, &task_scope).unwrap(); let task_graph_mapping = task_graph.build_mapping(); diff --git a/vegafusion-runtime/tests/test_plan_resolver.rs b/vegafusion-runtime/tests/test_plan_resolver.rs index 02ec22f16..aa775f812 100644 --- a/vegafusion-runtime/tests/test_plan_resolver.rs +++ b/vegafusion-runtime/tests/test_plan_resolver.rs @@ -14,7 +14,8 @@ use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::{Result, VegaFusionError}; use vegafusion_core::data::dataset::VegaFusionDataset; use vegafusion_core::proto::gen::pretransform::PreTransformSpecOpts; -use vegafusion_core::runtime::{PlanResolver, ResolutionResult, VegaFusionRuntimeTrait}; +use vegafusion_core::proto::gen::tasks::ResolverCapabilities; +use vegafusion_core::runtime::{ParsedUrl, PlanResolver, ResolutionResult, VegaFusionRuntimeTrait}; use vegafusion_core::spec::chart::ChartSpec; use vegafusion_runtime::data::external_table::ExternalTableProvider; use vegafusion_runtime::data::pipeline::ResolverPipeline; @@ -212,6 +213,7 @@ async fn test_custom_executor_called_in_pre_transform_spec() { default_input_tz: None, row_limit: None, keep_variables: vec![], + data_base_url: None, }, ) .await @@ -251,6 +253,7 @@ async fn test_custom_executor_called_in_pre_transform_extract() { default_input_tz: None, extract_threshold: 100, keep_variables: vec![], + data_base_url: None, }, ) .await @@ -297,6 +300,7 @@ async fn 
test_custom_executor_called_in_pre_transform_values() { local_tz: "UTC".to_string(), default_input_tz: None, row_limit: None, + data_base_url: None, }, ) .await @@ -411,6 +415,7 @@ async fn test_bin_transform_uses_custom_executor() { default_input_tz: None, row_limit: None, keep_variables: vec![], + data_base_url: None, }, ) .await @@ -505,6 +510,7 @@ async fn test_mixed_data_only_executes_plans() { default_input_tz: None, row_limit: None, keep_variables: vec![], + data_base_url: None, }, ) .await @@ -796,6 +802,156 @@ fn get_inline_datasets() -> std::collections::HashMap datasets } +// ── scan_url tests ── + +/// A resolver that claims custom:// URLs by returning an ExternalTableProvider plan +struct CustomSchemeScanner { + schema: Arc, +} + +#[async_trait] +impl PlanResolver for CustomSchemeScanner { + fn name(&self) -> &str { + "custom_scheme_scanner" + } + + fn capabilities(&self) -> ResolverCapabilities { + ResolverCapabilities { + supported_schemes: vec!["custom".to_string()], + supported_format_types: vec![], + supported_extensions: vec![], + } + } + + async fn scan_url(&self, parsed_url: &ParsedUrl) -> Result> { + if parsed_url.scheme == "custom" { + let provider = Arc::new(ExternalTableProvider::new( + self.schema.clone(), + Some("custom".to_string()), + serde_json::json!({"url": parsed_url.url}), + )); + let plan = LogicalPlanBuilder::scan("custom_table", provider_as_source(provider), None) + .unwrap() + .build() + .unwrap(); + Ok(Some(plan)) + } else { + Ok(None) + } + } + + async fn resolve_plan(&self, plan: LogicalPlan) -> Result { + // Rewrite ExternalTableProvider to MemTable for execution + let movies = create_movies_table(); + let mem_table = Arc::new( + MemTable::try_new(movies.schema.clone(), vec![movies.batches.clone()]).unwrap(), + ) as Arc; + let mut rewriter = TableRewriter { + movies_table: mem_table, + }; + let rewritten = plan.rewrite(&mut rewriter).unwrap().data; + Ok(ResolutionResult::Plan(rewritten)) + } +} + +#[tokio::test] +async fn 
test_scan_url_custom_scheme_first_wins() { + let schema = get_movies_schema(); + let scanner = CustomSchemeScanner { + schema: schema.clone(), + }; + + let ctx = Arc::new(datafusion::prelude::SessionContext::new()); + let pipeline = ResolverPipeline::new(vec![Arc::new(scanner)], ctx); + + let parsed = ParsedUrl { + url: "custom://mydb/table1".to_string(), + scheme: "custom".to_string(), + host: Some("mydb".to_string()), + path: "/table1".to_string(), + query_params: vec![], + extension: None, + format_type: None, + }; + + let result = pipeline.scan_url(&parsed).await.unwrap(); + assert!( + result.is_some(), + "Custom scanner should handle custom:// URLs" + ); +} + +#[tokio::test] +async fn test_scan_url_unknown_scheme_falls_through() { + let ctx = Arc::new(datafusion::prelude::SessionContext::new()); + // Pipeline with only DataFusionResolver (no user resolvers) + let pipeline = ResolverPipeline::new(vec![], ctx); + + let parsed = ParsedUrl { + url: "spark://cluster/table1".to_string(), + scheme: "spark".to_string(), + host: Some("cluster".to_string()), + path: "/table1".to_string(), + query_params: vec![], + extension: None, + format_type: None, + }; + + let result = pipeline.scan_url(&parsed).await.unwrap(); + assert!( + result.is_none(), + "DataFusionResolver should return None for unknown schemes" + ); +} + +#[tokio::test] +async fn test_has_user_resolvers() { + let ctx = Arc::new(datafusion::prelude::SessionContext::new()); + + // No user resolvers + let pipeline = ResolverPipeline::new(vec![], ctx.clone()); + assert!(!pipeline.has_user_resolvers()); + + // With a user resolver + let scanner = CustomSchemeScanner { + schema: get_movies_schema(), + }; + let pipeline = ResolverPipeline::new(vec![Arc::new(scanner)], ctx); + assert!(pipeline.has_user_resolvers()); +} + +#[tokio::test] +async fn test_merged_capabilities_includes_custom_resolver() { + let schema = get_movies_schema(); + let scanner = CustomSchemeScanner { schema }; + + let ctx = 
Arc::new(datafusion::prelude::SessionContext::new()); + let pipeline = ResolverPipeline::new(vec![Arc::new(scanner)], ctx); + + let caps = pipeline.merged_capabilities(); + // DataFusion built-ins + assert!(caps.supported_schemes.contains("http")); + assert!(caps.supported_schemes.contains("file")); + assert!(caps.supported_format_types.contains("csv")); + // Custom resolver additions + assert!(caps.supported_schemes.contains("custom")); + // url_supported checks + assert!(caps.url_supported("custom", None, None)); + assert!(caps.url_supported("https", Some("csv"), None)); + assert!(!caps.url_supported("spark", None, None)); +} + +#[tokio::test] +async fn test_planner_capabilities_from_runtime() { + let schema = get_movies_schema(); + let scanner = CustomSchemeScanner { schema }; + + let runtime = VegaFusionRuntime::new(None, vec![Arc::new(scanner)]); + let caps = runtime.planner_capabilities(); + assert!(caps.supported_schemes.contains("custom")); + assert!(caps.supported_schemes.contains("http")); +} + /// Test a resolver that returns ResolutionResult::Table directly (bypassing DataFusion execution). 
#[tokio::test] async fn test_table_returning_resolver() { @@ -851,6 +1007,7 @@ async fn test_table_returning_resolver() { default_input_tz: None, row_limit: None, keep_variables: vec![], + data_base_url: None, }, ) .await @@ -918,6 +1075,7 @@ async fn test_no_resolver() { default_input_tz: None, row_limit: None, keep_variables: vec![], + data_base_url: None, }, ) .await @@ -1189,6 +1347,7 @@ async fn test_resolver_error_propagation() { default_input_tz: None, row_limit: None, keep_variables: vec![], + data_base_url: None, }, ) .await; diff --git a/vegafusion-runtime/tests/test_planning.rs b/vegafusion-runtime/tests/test_planning.rs index 8219ee8c0..f4484bae8 100644 --- a/vegafusion-runtime/tests/test_planning.rs +++ b/vegafusion-runtime/tests/test_planning.rs @@ -55,7 +55,11 @@ async fn test_extract_server_data() { println!("client_stubs: {client_stubs:?}"); let tasks = server_spec - .to_tasks(&tz_config, &Default::default()) + .to_tasks( + &tz_config, + &Default::default(), + PlannerConfig::default().data_base_url, + ) .unwrap(); let graph = Arc::new(TaskGraph::new(tasks, &task_scope).unwrap()); let mapping = graph.build_mapping(); diff --git a/vegafusion-runtime/tests/test_pre_transform_extract.rs b/vegafusion-runtime/tests/test_pre_transform_extract.rs index aa5cb3d1e..3752df166 100644 --- a/vegafusion-runtime/tests/test_pre_transform_extract.rs +++ b/vegafusion-runtime/tests/test_pre_transform_extract.rs @@ -34,6 +34,7 @@ mod tests { &Default::default(), &PreTransformExtractOpts { keep_variables: vec![], + data_base_url: None, extract_threshold: 20, preserve_interactivity: false, local_tz: "UTC".to_string(), diff --git a/vegafusion-runtime/tests/test_pre_transform_keep_variables.rs b/vegafusion-runtime/tests/test_pre_transform_keep_variables.rs index dfe2797dc..a04e555d8 100644 --- a/vegafusion-runtime/tests/test_pre_transform_keep_variables.rs +++ b/vegafusion-runtime/tests/test_pre_transform_keep_variables.rs @@ -38,6 +38,7 @@ mod tests { local_tz: 
"UTC".to_string(), default_input_tz: None, preserve_interactivity: true, + data_base_url: None, }, ) .await @@ -65,6 +66,7 @@ mod tests { local_tz: "UTC".to_string(), default_input_tz: None, preserve_interactivity: true, + data_base_url: None, }, ) .await @@ -93,6 +95,7 @@ mod tests { local_tz: "UTC".to_string(), default_input_tz: None, preserve_interactivity: true, + data_base_url: None, }, ) .await; diff --git a/vegafusion-runtime/tests/test_pre_transform_values.rs b/vegafusion-runtime/tests/test_pre_transform_values.rs index 9e1c14c73..1a9e0f665 100644 --- a/vegafusion-runtime/tests/test_pre_transform_values.rs +++ b/vegafusion-runtime/tests/test_pre_transform_values.rs @@ -50,6 +50,7 @@ mod tests { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, + data_base_url: None, }, ) .await @@ -99,6 +100,7 @@ mod tests { row_limit: Some(3), local_tz: "UTC".to_string(), default_input_tz: None, + data_base_url: None, }, ) .await @@ -149,6 +151,7 @@ mod tests { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, + data_base_url: None, }, ) .await; @@ -173,6 +176,7 @@ mod tests { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, + data_base_url: None, }, ) .await; @@ -216,6 +220,7 @@ mod tests { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, + data_base_url: None, }, ) .await @@ -262,6 +267,7 @@ mod tests { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, + data_base_url: None, }, ) .await @@ -312,6 +318,7 @@ mod tests { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, + data_base_url: None, }, ) .await @@ -393,6 +400,7 @@ mod tests { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, + data_base_url: None, }, ) .await @@ -452,6 +460,7 @@ mod tests { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, + data_base_url: None, }, ) .await diff --git a/vegafusion-runtime/tests/test_stringify_datetimes.rs 
b/vegafusion-runtime/tests/test_stringify_datetimes.rs index 5fe197ddc..3368e77d3 100644 --- a/vegafusion-runtime/tests/test_stringify_datetimes.rs +++ b/vegafusion-runtime/tests/test_stringify_datetimes.rs @@ -97,6 +97,7 @@ mod test_stringify_datetimes { local_tz: local_tz.to_string(), default_input_tz: Some(default_input_tz.to_string()), keep_variables: vec![], + data_base_url: None, row_limit: None, preserve_interactivity: true, }, @@ -152,6 +153,7 @@ mod test_stringify_datetimes { local_tz: local_tz.to_string(), default_input_tz: Some(default_input_tz.to_string()), keep_variables: vec![], + data_base_url: None, row_limit: None, preserve_interactivity: true, }, @@ -235,6 +237,7 @@ mod test_stringify_datetimes { local_tz: local_tz.to_string(), default_input_tz: Some(default_input_tz.to_string()), keep_variables: vec![], + data_base_url: None, row_limit: None, preserve_interactivity: true, }, @@ -300,6 +303,7 @@ mod test_stringify_datetimes { local_tz: local_tz.to_string(), default_input_tz: Some(default_input_tz.to_string()), keep_variables: vec![], + data_base_url: None, row_limit: None, preserve_interactivity: true, }, @@ -345,6 +349,7 @@ mod test_stringify_datetimes { local_tz: "UTC".to_string(), default_input_tz: None, keep_variables: vec![], + data_base_url: None, row_limit: None, preserve_interactivity: true, }, diff --git a/vegafusion-runtime/tests/test_task_graph_runtime.rs b/vegafusion-runtime/tests/test_task_graph_runtime.rs index b6a2d0658..b9e133101 100644 --- a/vegafusion-runtime/tests/test_task_graph_runtime.rs +++ b/vegafusion-runtime/tests/test_task_graph_runtime.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use vegafusion_common::data::scalar::ScalarValue; use vegafusion_core::expression::parser::parse; +use vegafusion_core::planning::plan::PlannerConfig; use vegafusion_core::proto::gen::tasks::data_url_task::Url; use vegafusion_core::proto::gen::tasks::{ DataSourceTask, DataUrlTask, NodeValueIndex, Task, TaskGraph, TzConfig, Variable, @@ -50,6 +51,7 @@ 
async fn try_it() { batch_size: 1024, format_type: None, pipeline: None, + data_base_url: None, }, &tz_config, ), @@ -131,7 +133,13 @@ async fn try_it_from_spec() { default_input_tz: None, }; let task_scope = chart.to_task_scope().unwrap(); - let tasks = chart.to_tasks(&tz_config, &Default::default()).unwrap(); + let tasks = chart + .to_tasks( + &tz_config, + &Default::default(), + PlannerConfig::default().data_base_url, + ) + .unwrap(); println!("task_scope: {task_scope:?}"); println!("tasks: {tasks:?}"); diff --git a/vegafusion-server/src/main.rs b/vegafusion-server/src/main.rs index ea46f714f..a009766d4 100644 --- a/vegafusion-server/src/main.rs +++ b/vegafusion-server/src/main.rs @@ -9,8 +9,9 @@ use vegafusion_core::proto::gen::services::vega_fusion_runtime_server::{ }; use vegafusion_core::proto::gen::services::{ pre_transform_extract_result, pre_transform_spec_result, pre_transform_values_result, - query_request, query_result, PreTransformExtractResult, PreTransformSpecResult, - PreTransformValuesResult, QueryRequest, QueryResult, + query_request, query_result, GetCapabilitiesRequest, GetCapabilitiesResult, + PreTransformExtractResult, PreTransformSpecResult, PreTransformValuesResult, QueryRequest, + QueryResult, }; use vegafusion_core::proto::gen::tasks::TaskGraphValueResponse; use vegafusion_core::proto::gen::tasks::{ @@ -122,6 +123,16 @@ impl VegaFusionRuntimeGrpc { } } } + Some(query_request::Request::GetCapabilities(_)) => { + let caps = self.runtime.pipeline.merged_resolver_capabilities(); + Ok(QueryResult { + response: Some(query_result::Response::GetCapabilities( + GetCapabilitiesResult { + capabilities: Some(caps), + }, + )), + }) + } _ => Err(VegaFusionError::internal( "Invalid VegaFusionRuntimeRequest request", )), @@ -139,6 +150,7 @@ impl VegaFusionRuntimeGrpc { keep_variables: vec![], local_tz: "UTC".to_string(), default_input_tz: None, + data_base_url: None, }); // Decode inline datasets to VegaFusionDatasets @@ -225,6 +237,7 @@ impl 
VegaFusionRuntimeGrpc { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, + data_base_url: None, }); let variables: Vec = request @@ -322,6 +335,16 @@ impl TonicVegaFusionRuntime for VegaFusionRuntimeGrpc { Err(err) => Err(Status::unknown(err.to_string())), } } + + async fn get_capabilities( + &self, + _request: Request, + ) -> Result, Status> { + let caps = self.runtime.pipeline.merged_resolver_capabilities(); + Ok(Response::new(GetCapabilitiesResult { + capabilities: Some(caps), + })) + } } /// VegaFusion Server diff --git a/vegafusion-server/tests/test_task_graph_runtime.rs b/vegafusion-server/tests/test_task_graph_runtime.rs index b77faffa2..300f391e1 100644 --- a/vegafusion-server/tests/test_task_graph_runtime.rs +++ b/vegafusion-server/tests/test_task_graph_runtime.rs @@ -1,5 +1,6 @@ use std::time::Duration; use vegafusion_common::data::scalar::ScalarValueHelpers; +use vegafusion_core::planning::plan::PlannerConfig; use vegafusion_core::proto::gen::services::query_result::Response; use vegafusion_core::proto::gen::services::vega_fusion_runtime_client::VegaFusionRuntimeClient; use vegafusion_core::proto::gen::services::{query_request, QueryRequest}; @@ -50,7 +51,13 @@ async fn try_it_from_spec() { default_input_tz: None, }; let task_scope = chart.to_task_scope().unwrap(); - let tasks = chart.to_tasks(&tz_config, &Default::default()).unwrap(); + let tasks = chart + .to_tasks( + &tz_config, + &Default::default(), + PlannerConfig::default().data_base_url, + ) + .unwrap(); let graph = TaskGraph::new(tasks, &task_scope).unwrap(); let request = QueryRequest { diff --git a/vegafusion-wasm/src/lib.rs b/vegafusion-wasm/src/lib.rs index 4105f48ad..5268c4ef3 100644 --- a/vegafusion-wasm/src/lib.rs +++ b/vegafusion-wasm/src/lib.rs @@ -24,9 +24,9 @@ use wasm_bindgen_futures::spawn_local; use vegafusion_core::planning::watch::{ExportUpdateJSON, ExportUpdateNamespace, WatchPlan}; use vegafusion_core::proto::gen::services::{ - query_request, query_result, 
QueryRequest, QueryResult, + query_request, query_result, GetCapabilitiesRequest, QueryRequest, QueryResult, }; -use vegafusion_core::runtime::VegaFusionRuntimeTrait; +use vegafusion_core::runtime::{MergedCapabilities, VegaFusionRuntimeTrait}; use vegafusion_core::spec::chart::ChartSpec; use vegafusion_core::chart_state::{ChartState, ChartStateOpts}; @@ -61,10 +61,11 @@ pub struct QueryFnVegaFusionRuntime { QueryRequest, oneshot::Sender>>, )>, + capabilities: MergedCapabilities, } impl QueryFnVegaFusionRuntime { - pub fn new(query_fn: js_sys::Function) -> Self { + pub fn new(query_fn: js_sys::Function, capabilities: MergedCapabilities) -> Self { let (sender, mut receiver) = async_mpsc::channel::<( QueryRequest, oneshot::Sender>>, @@ -148,11 +149,23 @@ impl QueryFnVegaFusionRuntime { .send(Ok(task_graph_value_response.response_values)) .unwrap(); } + query_result::Response::GetCapabilities(_) => { + // Capabilities responses are handled during initialization, + // not in the normal query loop + response_tx + .send(Err(vegafusion_common::error::VegaFusionError::internal( + "Unexpected GetCapabilities response in query loop".to_string(), + ))) + .unwrap(); + } } } }); - QueryFnVegaFusionRuntime { sender } + QueryFnVegaFusionRuntime { + sender, + capabilities, + } } } @@ -162,6 +175,10 @@ impl VegaFusionRuntimeTrait for QueryFnVegaFusionRuntime { self } + fn planner_capabilities(&self) -> MergedCapabilities { + self.capabilities.clone() + } + async fn query_request( &self, task_graph: Arc, @@ -399,6 +416,79 @@ impl Default for VegaFusionEmbedConfig { } } +/// Fetch capabilities from a remote server via the query_fn callback. +/// Sends a GetCapabilitiesRequest through the existing proto-encoded channel. 
+async fn fetch_capabilities_via_query_fn( + query_fn: &js_sys::Function, +) -> Result { + use vegafusion_core::proto::gen::tasks::ResolverCapabilities; + + let request = QueryRequest { + request: Some(query_request::Request::GetCapabilities( + GetCapabilitiesRequest {}, + )), + }; + + let mut buf: Vec = Vec::with_capacity(request.encoded_len()); + request + .encode(&mut buf) + .map_err(|e| JsError::new(&format!("Failed to encode capabilities request: {e}")))?; + + let context = JsValue::null(); + let js_buffer = js_sys::Uint8Array::from(buf.as_slice()); + let promise = query_fn + .call1(&context, &js_buffer) + .map_err(|e| { + JsError::new(&format!( + "Failed to call query_fn for capabilities: {}", + js_sys::JSON::stringify(&e).unwrap() + )) + })? + .dyn_into::() + .map_err(|e| { + JsError::new(&format!( + "query_fn did not return a promise: {}", + js_sys::JSON::stringify(&e).unwrap() + )) + })?; + + let response = JsFuture::from(promise).await.map_err(|e| { + JsError::new(&format!( + "Capabilities query failed: {}", + js_sys::JSON::stringify(&e).unwrap() + )) + })?; + + let response_array = response.dyn_into::().map_err(|e| { + JsError::new(&format!( + "Capabilities response is not Uint8Array: {}", + js_sys::JSON::stringify(&e).unwrap() + )) + })?; + + let response_bytes = response_array.to_vec(); + let result = QueryResult::decode(response_bytes.as_slice()) + .map_err(|e| JsError::new(&format!("Failed to decode capabilities response: {e}")))?; + + match result.response { + Some(query_result::Response::GetCapabilities(caps_result)) => { + let caps = caps_result + .capabilities + .unwrap_or_else(ResolverCapabilities::default); + Ok(MergedCapabilities::from_resolver_capabilities(&[caps])) + } + Some(query_result::Response::Error(err)) => { + Err(JsError::new(&format!("Server returned error for capabilities: {err:?}")).into()) + } + _ => { + // Server doesn't support capabilities — use DataFusion defaults + Ok(MergedCapabilities::from_resolver_capabilities(&[ + 
ResolverCapabilities::datafusion_defaults(), + ])) + } + } +} + /// Embed a Vega chart and accelerate with VegaFusion /// @param element - The DOM element to embed the visualization into /// @param spec - The Vega specification (as string or object) @@ -446,7 +536,10 @@ pub async fn vegafusion_embed( js_sys::JSON::stringify(&e).unwrap() )) })?; - Box::new(QueryFnVegaFusionRuntime::new(query_fn)) + + // Fetch capabilities from the remote server before constructing the runtime + let capabilities = fetch_capabilities_via_query_fn(&query_fn).await?; + Box::new(QueryFnVegaFusionRuntime::new(query_fn, capabilities)) }; let chart_state = ChartState::try_new( @@ -456,6 +549,7 @@ pub async fn vegafusion_embed( ChartStateOpts { tz_config, row_limit: None, + data_base_url: Default::default(), }, ) .await From 8d5cf64bdc710bbae91f2e7caeac27bfc20b4c11 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Thu, 12 Mar 2026 08:10:06 -0400 Subject: [PATCH 02/36] fix: address code review findings and CI failures - Replace unreachable!() with proper error in pipeline resolve - Use url::Url::join() for RFC 3986 URL resolution - Check URL scheme in DataSpec::supported against capabilities - Handle protocol-relative URLs by prepending https: - Deduplicate scheme lists in DataFusionResolver::scan_url - Gate url::Url::from_file_path behind cfg(not(wasm32)) - Add catch-all arm in server test match for GetCapabilities variant - Format Python files with ruff Co-Authored-By: Claude Opus 4.6 --- vegafusion-core/src/runtime/plan_resolver.rs | 49 +++++++++++++++---- vegafusion-core/src/spec/data.rs | 18 +++++++ vegafusion-python/tests/test_plan_resolver.py | 4 +- vegafusion-python/vegafusion/plan_resolver.py | 4 +- .../src/data/datafusion_resolver.rs | 11 +++-- vegafusion-runtime/src/data/pipeline.rs | 6 ++- .../tests/test_task_graph_runtime.rs | 1 + 7 files changed, 74 insertions(+), 19 deletions(-) diff --git a/vegafusion-core/src/runtime/plan_resolver.rs 
b/vegafusion-core/src/runtime/plan_resolver.rs index 22c43f98c..2260281a8 100644 --- a/vegafusion-core/src/runtime/plan_resolver.rs +++ b/vegafusion-core/src/runtime/plan_resolver.rs @@ -181,10 +181,13 @@ pub fn is_absolute_path(path: &str) -> bool { } /// Normalize a base URL so it always has a scheme. -/// Bare absolute paths become file:// URLs; protocol-relative and scheme -/// URLs are preserved as-is; everything else is rejected. +/// Bare absolute paths become file:// URLs; protocol-relative URLs get +/// https: prepended; scheme URLs are preserved as-is; everything else is rejected. pub fn normalize_base_url(base: String) -> Result { - if has_url_scheme(&base) { + if base.starts_with("//") { + // Protocol-relative URL — prepend https: so url::Url::parse works + Ok(format!("https:{base}")) + } else if has_url_scheme(&base) { Ok(base) } else if is_absolute_path(&base) { path_to_file_url(&base) @@ -197,6 +200,7 @@ pub fn normalize_base_url(base: String) -> Result { /// Convert an absolute local path to a file:// URL. /// Uses url::Url::from_file_path() for correct percent-encoding. +#[cfg(not(target_arch = "wasm32"))] pub fn path_to_file_url(path: &str) -> Result { let normalized = path.replace('\\', "/"); let p = std::path::Path::new(&normalized); @@ -210,21 +214,39 @@ pub fn path_to_file_url(path: &str) -> Result { }) } +#[cfg(target_arch = "wasm32")] +pub fn path_to_file_url(path: &str) -> Result { + let normalized = path.replace('\\', "/"); + Ok(format!("file://{normalized}")) +} + /// Resolve a spec URL against a base URL. This is the shared function used by /// both plan-time resolution (MakeTasksVisitor for Url::String) and eval-time /// resolution (DataUrlTask::eval for Url::Expr). 
pub fn resolve_url(url: &str, data_base_url: &Option) -> Result { // Future: This is the natural place for a URL permissions layer - if has_url_scheme(url) { + if url.starts_with("//") { + // Protocol-relative URL — prepend https: so downstream parsers work + Ok(format!("https:{url}")) + } else if has_url_scheme(url) { Ok(url.to_string()) } else if is_absolute_path(url) { path_to_file_url(url) } else { - // Relative path — resolve against base URL + // Relative path — resolve against base URL using RFC 3986 joining match data_base_url { Some(base) => { - let separator = if base.ends_with('/') { "" } else { "/" }; - Ok(format!("{base}{separator}{url}")) + let base_url = url::Url::parse(base).map_err(|e| { + vegafusion_common::error::VegaFusionError::specification(format!( + "Invalid base URL '{base}': {e}" + )) + })?; + let resolved = base_url.join(url).map_err(|e| { + vegafusion_common::error::VegaFusionError::specification(format!( + "Cannot resolve '{url}' against base '{base}': {e}" + )) + })?; + Ok(resolved.to_string()) } None => Err(vegafusion_common::error::VegaFusionError::specification( format!("Relative URL with no base URL configured: {url}"), @@ -330,7 +352,7 @@ mod tests { #[test] fn test_normalize_base_url_protocol_relative() { let result = normalize_base_url("//example.com/data/".to_string()).unwrap(); - assert_eq!(result, "//example.com/data/"); + assert_eq!(result, "https://example.com/data/"); } #[test] @@ -379,9 +401,18 @@ mod tests { #[test] fn test_resolve_url_relative_without_trailing_slash() { + // Per RFC 3986, joining against a base without trailing slash replaces + // the last path segment: "data" is replaced by "cars.json" let base = Some("https://example.com/data".to_string()); let result = resolve_url("cars.json", &base).unwrap(); - assert_eq!(result, "https://example.com/data/cars.json"); + assert_eq!(result, "https://example.com/cars.json"); + } + + #[test] + fn test_resolve_url_relative_parent_traversal() { + let base = 
Some("https://example.com/data/v2/".to_string()); + let result = resolve_url("../v1/cars.json", &base).unwrap(); + assert_eq!(result, "https://example.com/data/v1/cars.json"); } #[test] diff --git a/vegafusion-core/src/spec/data.rs b/vegafusion-core/src/spec/data.rs index 06d046c8e..1053bb9ae 100644 --- a/vegafusion-core/src/spec/data.rs +++ b/vegafusion-core/src/spec/data.rs @@ -67,6 +67,24 @@ impl DataSpec { } } + // For static URLs, check the scheme is supported by some resolver. + // Signal-based URLs can't be checked at plan time (scheme unknown). + // Internal dataset URLs (table://, vegafusion+dataset://) are always supported. + if let Some(StringOrSignalSpec::String(url_str)) = &self.url { + if !url_str.starts_with("table://") && !url_str.starts_with("vegafusion+dataset://") { + if let Ok(parsed) = url::Url::parse(url_str) { + let scheme = parsed.scheme(); + if !planner_config + .capabilities + .supported_schemes + .contains(scheme) + { + return DependencyNodeSupported::Unsupported; + } + } + } + } + // Check if inline values array is supported if let Some(values) = &self.values { if !planner_config.extract_inline_data { diff --git a/vegafusion-python/tests/test_plan_resolver.py b/vegafusion-python/tests/test_plan_resolver.py index 55e73b1ad..860927da3 100644 --- a/vegafusion-python/tests/test_plan_resolver.py +++ b/vegafusion-python/tests/test_plan_resolver.py @@ -869,7 +869,9 @@ def resolve_table( return pa.table({"x": [1, 2, 3]}) source_table = pa.table({"x": [1, 2, 3]}) - ext = ExternalDataset(protocol="test", schema=source_table.schema, data=source_table) + ext = ExternalDataset( + protocol="test", schema=source_table.schema, data=source_table + ) resolver = SimpleResolver() rt = vf.VegaFusionRuntime(plan_resolver=resolver) diff --git a/vegafusion-python/vegafusion/plan_resolver.py b/vegafusion-python/vegafusion/plan_resolver.py index 042749b72..fcd5e38e4 100644 --- a/vegafusion-python/vegafusion/plan_resolver.py +++ 
b/vegafusion-python/vegafusion/plan_resolver.py @@ -111,9 +111,7 @@ def scan_url_proto(self, parsed_url: dict[str, Any]) -> bytes | None: # It's a LogicalPlanNode proto message return result.SerializeToString() - def scan_url( - self, parsed_url: dict[str, Any] - ) -> LogicalPlanNode | bytes | None: + def scan_url(self, parsed_url: dict[str, Any]) -> LogicalPlanNode | bytes | None: """Handle a URL during the scan phase. Override to claim URLs by returning a ``LogicalPlanNode`` or raw bytes. diff --git a/vegafusion-runtime/src/data/datafusion_resolver.rs b/vegafusion-runtime/src/data/datafusion_resolver.rs index 7243b9952..2fbc6dd17 100644 --- a/vegafusion-runtime/src/data/datafusion_resolver.rs +++ b/vegafusion-runtime/src/data/datafusion_resolver.rs @@ -41,10 +41,13 @@ impl PlanResolver for DataFusionResolver { } async fn scan_url(&self, parsed_url: &ParsedUrl) -> Result> { - // Only handle known schemes - match parsed_url.scheme.as_str() { - "http" | "https" | "s3" | "file" => {} - _ => return Ok(None), + // Only handle schemes declared in our capabilities + if !self + .capabilities() + .supported_schemes + .contains(&parsed_url.scheme) + { + return Ok(None); } // Determine file type: format_type takes precedence over extension. 
diff --git a/vegafusion-runtime/src/data/pipeline.rs b/vegafusion-runtime/src/data/pipeline.rs index 8eb5d6fac..0630e1185 100644 --- a/vegafusion-runtime/src/data/pipeline.rs +++ b/vegafusion-runtime/src/data/pipeline.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use datafusion::prelude::SessionContext; use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; -use vegafusion_common::error::Result; +use vegafusion_common::error::{Result, VegaFusionError}; use vegafusion_core::proto::gen::tasks::ResolverCapabilities; use vegafusion_core::runtime::{MergedCapabilities, ParsedUrl, PlanResolver, ResolutionResult}; @@ -88,6 +88,8 @@ impl ResolverPipeline { ResolutionResult::Plan(p) => current = p, } } - unreachable!("DataFusionResolver (last in chain) always returns Table") + Err(VegaFusionError::internal( + "No resolver produced a final table", + )) } } diff --git a/vegafusion-server/tests/test_task_graph_runtime.rs b/vegafusion-server/tests/test_task_graph_runtime.rs index 300f391e1..82a26d112 100644 --- a/vegafusion-server/tests/test_task_graph_runtime.rs +++ b/vegafusion-server/tests/test_task_graph_runtime.rs @@ -100,6 +100,7 @@ async fn try_it_from_spec() { &[32.1, 59.6], ) } + other => panic!("Unexpected response variant: {other:?}"), } proc.kill().ok(); } From a14dec6ae5e1860c7bab29944afebb1ee6424a3b Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Thu, 12 Mar 2026 09:46:51 -0400 Subject: [PATCH 03/36] fix: thread parse spec through ParsedUrl to DataFusionResolver::scan_url The scan_url abstraction was losing the Vega format.parse spec by passing &None to read_csv. This caused incorrect date/timezone handling for CSV datasets with explicit parse directives (e.g., seattle-weather.csv). 
Co-Authored-By: Claude Opus 4.6 --- vegafusion-core/src/runtime/plan_resolver.rs | 2 ++ vegafusion-runtime/src/data/datafusion_resolver.rs | 4 ++-- vegafusion-runtime/src/data/tasks.rs | 4 +++- vegafusion-runtime/tests/test_plan_resolver.rs | 2 ++ 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/vegafusion-core/src/runtime/plan_resolver.rs b/vegafusion-core/src/runtime/plan_resolver.rs index 2260281a8..c96fdc5f2 100644 --- a/vegafusion-core/src/runtime/plan_resolver.rs +++ b/vegafusion-core/src/runtime/plan_resolver.rs @@ -62,6 +62,8 @@ pub struct ParsedUrl { pub extension: Option, /// Explicit format type from Vega spec (overrides extension) pub format_type: Option, + /// Parse spec from Vega format (e.g., {"date": "date"} for CSV column typing) + pub parse: Option, } /// Merged capabilities from all resolvers, with HashSet fields for O(1) lookup. diff --git a/vegafusion-runtime/src/data/datafusion_resolver.rs b/vegafusion-runtime/src/data/datafusion_resolver.rs index 2fbc6dd17..a4b7afd34 100644 --- a/vegafusion-runtime/src/data/datafusion_resolver.rs +++ b/vegafusion-runtime/src/data/datafusion_resolver.rs @@ -63,9 +63,9 @@ impl PlanResolver for DataFusionResolver { let ctx = self.ctx.clone(); let df = if file_type == Some("csv") || (file_type.is_none() && ext == Some("csv")) { - read_csv(url, &None, ctx, false).await? + read_csv(url, &parsed_url.parse, ctx, false).await? } else if file_type == Some("tsv") || (file_type.is_none() && ext == Some("tsv")) { - read_csv(url, &None, ctx, true).await? + read_csv(url, &parsed_url.parse, ctx, true).await? 
} else if file_type == Some("json") || (file_type.is_none() && matches!(ext, Some("json") | None)) { diff --git a/vegafusion-runtime/src/data/tasks.rs b/vegafusion-runtime/src/data/tasks.rs index 6089baa02..968f398f3 100644 --- a/vegafusion-runtime/src/data/tasks.rs +++ b/vegafusion-runtime/src/data/tasks.rs @@ -187,7 +187,7 @@ impl TaskCall for DataUrlTask { } } else { // Construct ParsedUrl and dispatch to pipeline.scan_url() - let parsed_url = build_parsed_url(&url, format_type.as_deref())?; + let parsed_url = build_parsed_url(&url, format_type.as_deref(), parse.clone())?; match pipeline.scan_url(&parsed_url).await? { Some(plan) => DataFrame::new(ctx.state(), plan), None => { @@ -237,6 +237,7 @@ impl TaskCall for DataUrlTask { fn build_parsed_url( url: &str, format_type: Option<&str>, + parse: Option, ) -> Result { let parsed = url::Url::parse(url) .map_err(|e| VegaFusionError::internal(format!("Failed to parse URL '{url}': {e}")))?; @@ -259,6 +260,7 @@ fn build_parsed_url( query_params, extension, format_type: format_type.map(|s| s.to_string()), + parse, }) } diff --git a/vegafusion-runtime/tests/test_plan_resolver.rs b/vegafusion-runtime/tests/test_plan_resolver.rs index aa775f812..767ecbecf 100644 --- a/vegafusion-runtime/tests/test_plan_resolver.rs +++ b/vegafusion-runtime/tests/test_plan_resolver.rs @@ -872,6 +872,7 @@ async fn test_scan_url_custom_scheme_first_wins() { query_params: vec![], extension: None, format_type: None, + parse: None, }; let result = pipeline.scan_url(&parsed).await.unwrap(); @@ -895,6 +896,7 @@ async fn test_scan_url_unknown_scheme_falls_through() { query_params: vec![], extension: None, format_type: None, + parse: None, }; let result = pipeline.scan_url(&parsed).await.unwrap(); From 0eb76947b026c761b6fa89abe36f92f25f6a8750 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Thu, 12 Mar 2026 19:54:12 -0400 Subject: [PATCH 04/36] fix: fix Windows CI failures in URL scheme check and path tests - Skip scheme validation for raw absolute 
paths in data.rs (e.g., C:\Users\...) which haven't been resolved to file:// URLs yet. url::Url::parse misinterprets "C:" as a scheme on Windows. - Gate Unix-path tests with #[cfg(not(target_os = "windows"))] since Url::from_file_path rejects Unix paths on Windows. Co-Authored-By: Claude Opus 4.6 --- vegafusion-core/src/runtime/plan_resolver.rs | 6 ++++++ vegafusion-core/src/spec/data.rs | 10 +++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/vegafusion-core/src/runtime/plan_resolver.rs b/vegafusion-core/src/runtime/plan_resolver.rs index c96fdc5f2..06737d712 100644 --- a/vegafusion-core/src/runtime/plan_resolver.rs +++ b/vegafusion-core/src/runtime/plan_resolver.rs @@ -323,18 +323,21 @@ mod tests { // ── path_to_file_url ── #[test] + #[cfg(not(target_os = "windows"))] fn test_path_to_file_url_unix() { let result = path_to_file_url("/tmp/data.csv").unwrap(); assert_eq!(result, "file:///tmp/data.csv"); } #[test] + #[cfg(not(target_os = "windows"))] fn test_path_to_file_url_spaces() { let result = path_to_file_url("/tmp/my data/file.csv").unwrap(); assert_eq!(result, "file:///tmp/my%20data/file.csv"); } #[test] + #[cfg(not(target_os = "windows"))] fn test_path_to_file_url_hash() { let result = path_to_file_url("/tmp/file#1.csv").unwrap(); assert!( @@ -358,6 +361,7 @@ mod tests { } #[test] + #[cfg(not(target_os = "windows"))] fn test_normalize_base_url_absolute_path() { let result = normalize_base_url("/home/user/data".to_string()).unwrap(); assert_eq!(result, "file:///home/user/data"); @@ -385,6 +389,7 @@ mod tests { } #[test] + #[cfg(not(target_os = "windows"))] fn test_resolve_url_absolute_path_to_file() { let base = Some("https://cdn.example.com/".to_string()); let result = resolve_url("/tmp/data.csv", &base).unwrap(); @@ -453,6 +458,7 @@ mod tests { } #[test] + #[cfg(not(target_os = "windows"))] fn test_resolve_data_base_url_custom_path() { let result = resolve_data_base_url( DataBaseUrlSetting::Custom("/home/user/data".to_string()), diff 
--git a/vegafusion-core/src/spec/data.rs b/vegafusion-core/src/spec/data.rs index 1053bb9ae..a5d82d40c 100644 --- a/vegafusion-core/src/spec/data.rs +++ b/vegafusion-core/src/spec/data.rs @@ -67,11 +67,15 @@ impl DataSpec { } } - // For static URLs, check the scheme is supported by some resolver. - // Signal-based URLs can't be checked at plan time (scheme unknown). + // For static URLs that already have a scheme, check the scheme is supported + // by some resolver. Skip absolute paths and relative URLs — those are resolved + // later by resolve_url (absolute paths become file://, relatives use base URL). // Internal dataset URLs (table://, vegafusion+dataset://) are always supported. if let Some(StringOrSignalSpec::String(url_str)) = &self.url { - if !url_str.starts_with("table://") && !url_str.starts_with("vegafusion+dataset://") { + if url_str.contains("://") + && !url_str.starts_with("table://") + && !url_str.starts_with("vegafusion+dataset://") + { if let Ok(parsed) = url::Url::parse(url_str) { let scheme = parsed.scheme(); if !planner_config From f705ba2eabe52e8f9ff0b3b8341e2a24aca090fd Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 13 Mar 2026 10:47:49 -0400 Subject: [PATCH 05/36] refactor: rename protocol to scheme across URL and external dataset systems Use the proper URL term "scheme" (RFC 3986) instead of "protocol" for ExternalTableProvider, ExternalDataset, codec serialization keys, and all related APIs. 
Co-Authored-By: Claude Opus 4.6 --- vegafusion-core/src/runtime/plan_resolver.rs | 8 ++--- vegafusion-python/src/lib.rs | 8 ++--- vegafusion-python/src/plan_resolver.rs | 8 ++--- vegafusion-python/src/utils.rs | 9 +++-- vegafusion-python/tests/test_plan_resolver.py | 36 +++++++++---------- vegafusion-python/vegafusion/dataset.py | 12 +++---- vegafusion-python/vegafusion/plan_resolver.py | 6 ++-- vegafusion-python/vegafusion/runtime.py | 2 +- vegafusion-runtime/src/data/codec.rs | 8 ++--- vegafusion-runtime/src/data/external_table.rs | 16 ++++----- .../tests/test_plan_resolver.rs | 2 +- 11 files changed, 57 insertions(+), 58 deletions(-) diff --git a/vegafusion-core/src/runtime/plan_resolver.rs b/vegafusion-core/src/runtime/plan_resolver.rs index 06737d712..38199d8f5 100644 --- a/vegafusion-core/src/runtime/plan_resolver.rs +++ b/vegafusion-core/src/runtime/plan_resolver.rs @@ -164,7 +164,7 @@ pub fn resolve_data_base_url( } /// Returns true if the string is already a URL (has a scheme) or is -/// protocol-relative (starts with //). +/// scheme-relative (starts with //). pub fn has_url_scheme(s: &str) -> bool { s.contains("://") || s.starts_with("//") } @@ -183,7 +183,7 @@ pub fn is_absolute_path(path: &str) -> bool { } /// Normalize a base URL so it always has a scheme. -/// Bare absolute paths become file:// URLs; protocol-relative URLs get +/// Bare absolute paths become file:// URLs; scheme-relative URLs get /// https: prepended; scheme URLs are preserved as-is; everything else is rejected. 
pub fn normalize_base_url(base: String) -> Result { if base.starts_with("//") { @@ -274,7 +274,7 @@ mod tests { } #[test] - fn test_has_url_scheme_protocol_relative() { + fn test_has_url_scheme_scheme_relative() { assert!(has_url_scheme("//example.com/data.csv")); } @@ -355,7 +355,7 @@ mod tests { } #[test] - fn test_normalize_base_url_protocol_relative() { + fn test_normalize_base_url_scheme_relative() { let result = normalize_base_url("//example.com/data/".to_string()).unwrap(); assert_eq!(result, "https://example.com/data/"); } diff --git a/vegafusion-python/src/lib.rs b/vegafusion-python/src/lib.rs index 1ede8ca66..d6a9fe852 100644 --- a/vegafusion-python/src/lib.rs +++ b/vegafusion-python/src/lib.rs @@ -596,18 +596,18 @@ pub fn inline_table_scan_node(name: String, schema: pyo3_arrow::PySchema) -> PyR /// Args: /// table_name: Name for the table in the plan. /// schema: Arrow schema (arro3.core.Schema) — required for logical planning. -/// protocol: Optional protocol identifier (e.g. "spark"). +/// scheme: Optional scheme identifier (e.g. "spark"). /// metadata: Optional JSON-serializable dict of metadata. /// source: Optional source identifier. /// /// Returns: /// bytes: Serialized LogicalPlanNode protobuf. 
#[pyfunction] -#[pyo3(signature = (table_name, schema, protocol=None, metadata=None, source=None))] +#[pyo3(signature = (table_name, schema, scheme=None, metadata=None, source=None))] pub fn external_table_scan_node( table_name: String, schema: pyo3_arrow::PySchema, - protocol: Option, + scheme: Option, metadata: Option<&Bound<'_, pyo3::types::PyAny>>, source: Option, ) -> PyResult> { @@ -627,7 +627,7 @@ pub fn external_table_scan_node( }; let provider = Arc::new( - ExternalTableProvider::new(arrow_schema, protocol, metadata_value).with_source(source), + ExternalTableProvider::new(arrow_schema, scheme, metadata_value).with_source(source), ); let table_source = provider_as_source(provider); diff --git a/vegafusion-python/src/plan_resolver.rs b/vegafusion-python/src/plan_resolver.rs index 9adfc0a86..caeb38f54 100644 --- a/vegafusion-python/src/plan_resolver.rs +++ b/vegafusion-python/src/plan_resolver.rs @@ -90,7 +90,7 @@ impl PyPlanResolver { /// Info extracted from an ExternalTableProvider node in the plan. 
struct ExternalTableInfo { schema: SchemaRef, - protocol: Option, + scheme: Option, source: Option, metadata: Value, ref_id: Option, @@ -112,7 +112,7 @@ fn extract_external_tables(plan: &LogicalPlan) -> HashMap( // Convert metadata to Python dict let py_metadata = pythonize::pythonize(py, &info.metadata)?; - // Reconstruct ExternalDataset(protocol, schema, metadata, data, source) + // Reconstruct ExternalDataset(scheme, schema, metadata, data, source) let kwargs = PyDict::new(py); - kwargs.set_item("protocol", info.protocol.as_deref())?; + kwargs.set_item("scheme", info.scheme.as_deref())?; kwargs.set_item("schema", py_schema)?; kwargs.set_item("metadata", py_metadata)?; kwargs.set_item("data", &data)?; diff --git a/vegafusion-python/src/utils.rs b/vegafusion-python/src/utils.rs index 9de34cc1e..477372d0e 100644 --- a/vegafusion-python/src/utils.rs +++ b/vegafusion-python/src/utils.rs @@ -24,13 +24,12 @@ pub fn process_inline_datasets( .iter() .map(|(name, inline_dataset)| { let inline_dataset = inline_dataset; - let dataset = if inline_dataset.hasattr("protocol")? + let dataset = if inline_dataset.hasattr("scheme")? && inline_dataset.hasattr("schema")? && inline_dataset.hasattr("metadata")? 
{ - // Handle ExternalDataset with .protocol, .schema, .metadata - let protocol: Option = - inline_dataset.getattr("protocol")?.extract()?; + // Handle ExternalDataset with .scheme, .schema, .metadata + let scheme: Option = inline_dataset.getattr("scheme")?.extract()?; let pyschema = inline_dataset.getattr("schema")?.extract::()?; let schema = pyschema.into_inner(); let metadata_obj = inline_dataset.getattr("metadata")?; @@ -42,7 +41,7 @@ pub fn process_inline_datasets( })?; let provider = - Arc::new(ExternalTableProvider::new(schema, protocol, metadata)); + Arc::new(ExternalTableProvider::new(schema, scheme, metadata)); let table_source = provider_as_source(provider); let logical_plan = LogicalPlanBuilder::scan(name.to_string(), table_source, None) diff --git a/vegafusion-python/tests/test_plan_resolver.py b/vegafusion-python/tests/test_plan_resolver.py index 860927da3..2bf826d38 100644 --- a/vegafusion-python/tests/test_plan_resolver.py +++ b/vegafusion-python/tests/test_plan_resolver.py @@ -84,7 +84,7 @@ def test_passthrough_resolver() -> None: expected_result = pa.table({"x": [5, 10], "y": ["b", "c"]}) ext = ExternalDataset( - protocol="test", schema=source_table.schema, data=source_table + scheme="test", schema=source_table.schema, data=source_table ) resolver = PassthroughResolver(result_table=expected_result) @@ -116,7 +116,7 @@ def test_deserializing_resolver() -> None: expected_result = pa.table({"x": [5, 10], "y": ["b", "c"]}) ext = ExternalDataset( - protocol="test", schema=source_table.schema, data=source_table + scheme="test", schema=source_table.schema, data=source_table ) resolver = DeserializingResolver(result_table=expected_result) @@ -146,11 +146,11 @@ def test_external_dataset_registry() -> None: """ExternalDataset with data registers data in weakref registry.""" table = pa.table({"a": [1, 2, 3]}) ext = ExternalDataset( - protocol="test", schema=table.schema, data=table, metadata={"engine": "test"} + scheme="test", schema=table.schema, 
data=table, metadata={"engine": "test"} ) - assert ext.protocol == "test" - assert "_vf_protocol" not in ext.metadata # protocol is separate from metadata + assert ext.scheme == "test" + assert "_vf_scheme" not in ext.metadata # scheme is separate from metadata assert "_vf_ref_id" in ext.metadata ref_id = ext.metadata["_vf_ref_id"] assert ExternalDataset.resolve_data(ref_id) is table @@ -161,7 +161,7 @@ def test_external_dataset_registry() -> None: def test_external_dataset_schema_only() -> None: """ExternalDataset without data does not register.""" schema = pa.schema([("x", pa.int64())]) - ext = ExternalDataset(protocol="test", schema=schema) + ext = ExternalDataset(scheme="test", schema=schema) assert "_vf_ref_id" not in ext.metadata assert ext.data is None @@ -332,7 +332,7 @@ def resolve_table( resolver = TableResolver() ext = ExternalDataset( - protocol="test", schema=source_table.schema, data=source_table + scheme="test", schema=source_table.schema, data=source_table ) rt = vf.VegaFusionRuntime(plan_resolver=resolver) @@ -407,7 +407,7 @@ def _replace_custom_scan( resolver = ManualResolver() ext = ExternalDataset( - protocol="test", schema=source_table.schema, data=source_table + scheme="test", schema=source_table.schema, data=source_table ) rt = vf.VegaFusionRuntime(plan_resolver=resolver) @@ -469,8 +469,8 @@ def resolve_table( ], } - ext_a = ExternalDataset(protocol="test", schema=table_a.schema, data=table_a) - ext_b = ExternalDataset(protocol="test", schema=table_b.schema, data=table_b) + ext_a = ExternalDataset(scheme="test", schema=table_a.schema, data=table_a) + ext_b = ExternalDataset(scheme="test", schema=table_b.schema, data=table_b) resolver = MultiTableResolver() rt = vf.VegaFusionRuntime(plan_resolver=resolver) @@ -509,7 +509,7 @@ def resolve_table( raise ValueError("Simulated resolver failure") ext = ExternalDataset( - protocol="test", schema=source_table.schema, data=source_table + scheme="test", schema=source_table.schema, data=source_table ) 
resolver = FailingResolver() rt = vf.VegaFusionRuntime(plan_resolver=resolver) @@ -582,7 +582,7 @@ def resolve_plan_proto( resolver = SqlCapturingResolver() ext = ExternalDataset( - protocol="test", schema=source_table.schema, data=source_table + scheme="test", schema=source_table.schema, data=source_table ) rt = vf.VegaFusionRuntime(plan_resolver=resolver) @@ -633,7 +633,7 @@ def resolve_plan(self, logical_plan: Any, datasets: dict[str, Any]) -> pa.Table: resolver = ProtoCapturingResolver() ext = ExternalDataset( - protocol="test", schema=source_table.schema, data=source_table + scheme="test", schema=source_table.schema, data=source_table ) rt = vf.VegaFusionRuntime(plan_resolver=resolver) @@ -659,7 +659,7 @@ def test_external_dataset_without_resolver_raises() -> None: """ExternalDataset without a plan resolver raises ValueError with helpful message.""" source_table = pa.table({"x": [1, 2, 3]}) ext = ExternalDataset( - protocol="spark", schema=source_table.schema, data=source_table + scheme="spark", schema=source_table.schema, data=source_table ) rt = vf.VegaFusionRuntime() # No resolver @@ -695,7 +695,7 @@ def resolve_plan_proto( resolver = DialectTestResolver() ext = ExternalDataset( - protocol="test", schema=source_table.schema, data=source_table + scheme="test", schema=source_table.schema, data=source_table ) rt = vf.VegaFusionRuntime(plan_resolver=resolver) @@ -727,7 +727,7 @@ def scan_url(self, parsed_url: dict[str, Any]) -> Any: return external_table_scan_node( table_name="captured", schema=schema, - protocol="test", + scheme="test", metadata={"source_url": parsed_url["url"]}, ) @@ -819,7 +819,7 @@ def scan_url(self, parsed_url: dict[str, Any]) -> Any: return external_table_scan_node( table_name="custom_data", schema=schema, - protocol="myproto", + scheme="myproto", ) return None @@ -870,7 +870,7 @@ def resolve_table( source_table = pa.table({"x": [1, 2, 3]}) ext = ExternalDataset( - protocol="test", schema=source_table.schema, data=source_table + 
scheme="test", schema=source_table.schema, data=source_table ) resolver = SimpleResolver() rt = vf.VegaFusionRuntime(plan_resolver=resolver) diff --git a/vegafusion-python/vegafusion/dataset.py b/vegafusion-python/vegafusion/dataset.py index 3ad9e10c2..671026c8b 100644 --- a/vegafusion-python/vegafusion/dataset.py +++ b/vegafusion-python/vegafusion/dataset.py @@ -17,9 +17,9 @@ def __init__(self, data: Any) -> None: # noqa: ANN401 class ExternalDataset: - """External dataset with protocol, schema, metadata, and optional data ref. + """External dataset with scheme, schema, metadata, and optional data ref. - The ``protocol`` parameter is an optional short identifier for the data + The ``scheme`` parameter is an optional short identifier for the data source type (e.g. ``"spark"``, ``"snowflake"``, ``"duckdb"``). It is propagated through protobuf separately from metadata so that error messages can name the source when no resolver is registered. @@ -42,7 +42,7 @@ class ExternalDataset: def __init__( self, - protocol: str | None = None, + scheme: str | None = None, schema: Any = None, # noqa: ANN401 metadata: dict[str, Any] | None = None, data: Any = None, # noqa: ANN401 @@ -51,7 +51,7 @@ def __init__( self._schema: Schema = ( Schema.from_arrow(schema) if not isinstance(schema, Schema) else schema ) - self._protocol = protocol + self._scheme = scheme self._source = source self._metadata: dict[str, Any] = dict(metadata) if metadata else {} self._data: Any = data @@ -74,9 +74,9 @@ def resolve_data(cls, ref_id: str) -> Any | None: # noqa: ANN401 return data_ref.data if data_ref is not None else None @property - def protocol(self) -> str | None: + def scheme(self) -> str | None: """Optional short identifier for the data source type (e.g. 
``"spark"``).""" - return self._protocol + return self._scheme @property def schema(self) -> Schema: diff --git a/vegafusion-python/vegafusion/plan_resolver.py b/vegafusion-python/vegafusion/plan_resolver.py index fcd5e38e4..b834c1fe8 100644 --- a/vegafusion-python/vegafusion/plan_resolver.py +++ b/vegafusion-python/vegafusion/plan_resolver.py @@ -351,7 +351,7 @@ def inline_table_scan_node( def external_table_scan_node( table_name: str, schema: Schema, - protocol: str | None = None, + scheme: str | None = None, metadata: dict[str, Any] | None = None, source: str | None = None, ) -> LogicalPlanNode: @@ -364,7 +364,7 @@ def external_table_scan_node( Args: table_name: Name for the table in the plan. schema: Arrow schema (arro3.core.Schema) — required for logical planning. - protocol: Optional protocol identifier (e.g. ``"spark"``). + scheme: Optional scheme identifier (e.g. ``"spark"``). metadata: Optional JSON-serializable dict of metadata. source: Optional source identifier. @@ -385,7 +385,7 @@ def external_table_scan_node( _native( table_name=table_name, schema=schema, - protocol=protocol, + scheme=scheme, metadata=metadata, source=source, ) diff --git a/vegafusion-python/vegafusion/runtime.py b/vegafusion-python/vegafusion/runtime.py index 19d8bf464..45b2eb1e9 100644 --- a/vegafusion-python/vegafusion/runtime.py +++ b/vegafusion-python/vegafusion/runtime.py @@ -396,7 +396,7 @@ def _import_inline_datasets( # Validate: ExternalDatasets require a plan resolver if external_dataset_refs and not self._plan_resolvers: details = [ - f" - {name!r} (protocol={value.protocol!r})" + f" - {name!r} (scheme={value.scheme!r})" for name, value in inline_datasets.items() if isinstance(value, ExternalDataset) ] diff --git a/vegafusion-runtime/src/data/codec.rs b/vegafusion-runtime/src/data/codec.rs index fbb187e7a..9625e5f06 100644 --- a/vegafusion-runtime/src/data/codec.rs +++ b/vegafusion-runtime/src/data/codec.rs @@ -76,8 +76,8 @@ impl LogicalExtensionCodec for VegaFusionCodec { 
match envelope.get("type").and_then(|t| t.as_str()) { Some("external") => { - let protocol = envelope - .get("protocol") + let scheme = envelope + .get("scheme") .and_then(|v| v.as_str()) .map(|s| s.to_string()); let source = envelope @@ -86,7 +86,7 @@ impl LogicalExtensionCodec for VegaFusionCodec { .map(|s| s.to_string()); let metadata = envelope.get("metadata").cloned().unwrap_or(Value::Null); Ok(Arc::new( - ExternalTableProvider::new(schema, protocol, metadata).with_source(source), + ExternalTableProvider::new(schema, scheme, metadata).with_source(source), )) } Some("inline") => { @@ -134,7 +134,7 @@ impl LogicalExtensionCodec for VegaFusionCodec { if let Some(ext) = node.as_any().downcast_ref::<ExternalTableProvider>() { let mut envelope = serde_json::json!({ "type": "external", - "protocol": ext.protocol(), + "scheme": ext.scheme(), "metadata": ext.metadata(), }); if let Some(source) = ext.source() { diff --git a/vegafusion-runtime/src/data/external_table.rs b/vegafusion-runtime/src/data/external_table.rs index 71adaa026..b4239e523 100644 --- a/vegafusion-runtime/src/data/external_table.rs +++ b/vegafusion-runtime/src/data/external_table.rs @@ -23,16 +23,16 @@ use vegafusion_common::arrow::datatypes::SchemaRef; /// which is serialized into `custom_table_data` by [`super::codec::VegaFusionCodec`].
pub struct ExternalTableProvider { schema: SchemaRef, - protocol: Option<String>, + scheme: Option<String>, source: Option<String>, metadata: Value, } impl ExternalTableProvider { - pub fn new(schema: SchemaRef, protocol: Option<String>, metadata: Value) -> Self { + pub fn new(schema: SchemaRef, scheme: Option<String>, metadata: Value) -> Self { Self { schema, - protocol, + scheme, source: None, metadata, } @@ -43,8 +43,8 @@ impl ExternalTableProvider { self } - pub fn protocol(&self) -> Option<&str> { - self.protocol.as_deref() + pub fn scheme(&self) -> Option<&str> { + self.scheme.as_deref() } pub fn source(&self) -> Option<&str> { @@ -59,7 +59,7 @@ impl ExternalTableProvider { impl Debug for ExternalTableProvider { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("ExternalTableProvider") - .field("protocol", &self.protocol) + .field("scheme", &self.scheme) .field("source", &self.source) .field("schema", &self.schema) .field("metadata", &self.metadata) @@ -88,9 +88,9 @@ impl TableProvider for ExternalTableProvider { _filters: &[Expr], _limit: Option<usize>, ) -> Result<Arc<dyn ExecutionPlan>> { - let protocol = self.protocol().unwrap_or("unknown"); + let scheme = self.scheme().unwrap_or("unknown"); plan_err!( - "ExternalTableProvider (protocol: {protocol}) cannot be executed directly. \ + "ExternalTableProvider (scheme: {scheme}) cannot be executed directly. \ This table represents an external data source that must be resolved \ before execution. Set a PlanResolver on the VegaFusionRuntime to \ handle external table references."
diff --git a/vegafusion-runtime/tests/test_plan_resolver.rs b/vegafusion-runtime/tests/test_plan_resolver.rs index 767ecbecf..950192dce 100644 --- a/vegafusion-runtime/tests/test_plan_resolver.rs +++ b/vegafusion-runtime/tests/test_plan_resolver.rs @@ -1242,7 +1242,7 @@ mod serialization_tests { .as_any() .downcast_ref::() .expect("Expected ExternalTableProvider"); - assert_eq!(ext.protocol(), Some("test")); + assert_eq!(ext.scheme(), Some("test")); assert_eq!(ext.metadata(), &metadata); } else { panic!("Expected TableScan, got {:?}", round_tripped); From 9858e58ac668f28a3472d0ddffdd5e0f024cff13 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 13 Mar 2026 10:08:46 -0400 Subject: [PATCH 06/36] refactor: move PlanResolver trait from vegafusion-core to vegafusion-runtime This enables a default resolve_plan implementation that walks the LogicalPlan tree, calls resolve_table for each ExternalTableProvider, and replaces them with MemTable scans. Implementers can now override just resolve_table instead of the full resolve_plan method. The trait needed to live in vegafusion-runtime because the default implementation depends on DataFusion types (ExternalTableProvider, MemTable, TreeNodeRewriter) that are not available in vegafusion-core. 
Co-Authored-By: Claude Opus 4.6 --- .../rust-examples/examples/custom_resolver.rs | 3 +- vegafusion-core/src/runtime/mod.rs | 2 +- vegafusion-core/src/runtime/plan_resolver.rs | 20 --- vegafusion-python/src/lib.rs | 3 +- vegafusion-python/src/plan_resolver.rs | 3 +- .../src/data/datafusion_resolver.rs | 4 +- vegafusion-runtime/src/data/mod.rs | 1 + vegafusion-runtime/src/data/pipeline.rs | 3 +- vegafusion-runtime/src/data/plan_resolver.rs | 169 ++++++++++++++++++ vegafusion-runtime/src/task_graph/runtime.rs | 2 +- .../tests/test_plan_resolver.rs | 3 +- 11 files changed, 185 insertions(+), 28 deletions(-) create mode 100644 vegafusion-runtime/src/data/plan_resolver.rs diff --git a/examples/rust-examples/examples/custom_resolver.rs b/examples/rust-examples/examples/custom_resolver.rs index 71a515e92..2c9576d30 100644 --- a/examples/rust-examples/examples/custom_resolver.rs +++ b/examples/rust-examples/examples/custom_resolver.rs @@ -1,8 +1,9 @@ use std::sync::Arc; use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::Result; -use vegafusion_core::runtime::{PlanResolver, ResolutionResult, VegaFusionRuntimeTrait}; +use vegafusion_core::runtime::{ResolutionResult, VegaFusionRuntimeTrait}; use vegafusion_core::spec::chart::ChartSpec; +use vegafusion_runtime::data::plan_resolver::PlanResolver; use vegafusion_runtime::task_graph::runtime::VegaFusionRuntime; /// A custom resolver that logs plan resolution and passes through to DataFusion diff --git a/vegafusion-core/src/runtime/mod.rs b/vegafusion-core/src/runtime/mod.rs index 2da146a12..bed6b3043 100644 --- a/vegafusion-core/src/runtime/mod.rs +++ b/vegafusion-core/src/runtime/mod.rs @@ -3,6 +3,6 @@ mod runtime; pub use plan_resolver::{ has_url_scheme, is_absolute_path, normalize_base_url, path_to_file_url, resolve_data_base_url, - resolve_url, DataBaseUrlSetting, MergedCapabilities, ParsedUrl, PlanResolver, ResolutionResult, + resolve_url, DataBaseUrlSetting, MergedCapabilities, ParsedUrl, 
ResolutionResult, }; pub use runtime::{PreTransformExtractTable, VegaFusionRuntimeTrait}; diff --git a/vegafusion-core/src/runtime/plan_resolver.rs b/vegafusion-core/src/runtime/plan_resolver.rs index 38199d8f5..46432dae5 100644 --- a/vegafusion-core/src/runtime/plan_resolver.rs +++ b/vegafusion-core/src/runtime/plan_resolver.rs @@ -1,6 +1,5 @@ use crate::proto::gen::pretransform::DataBaseUrlSettingProto; use crate::proto::gen::tasks::ResolverCapabilities; -use async_trait::async_trait; use std::collections::HashSet; use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; @@ -130,25 +129,6 @@ impl ResolverCapabilities { } } -#[async_trait] -pub trait PlanResolver: Send + Sync + 'static { - fn name(&self) -> &str; - - /// Declare what URL patterns this resolver supports at planning time. - /// Returns empty capabilities by default (no additional URL support). - fn capabilities(&self) -> ResolverCapabilities { - ResolverCapabilities::default() - } - - /// Given a parsed URL, optionally return a LogicalPlan to handle it. - /// Return Ok(None) to pass the URL to the next resolver in the chain. - async fn scan_url(&self, _parsed_url: &ParsedUrl) -> Result<Option<LogicalPlan>> { - Ok(None) - } - - async fn resolve_plan(&self, plan: LogicalPlan) -> Result<ResolutionResult>; -} - /// Map a DataBaseUrlSetting (from public API) to the two-state Option /// used by PlannerConfig. Custom base URLs are normalized (bare absolute paths /// become file:// URLs).
diff --git a/vegafusion-python/src/lib.rs b/vegafusion-python/src/lib.rs index d6a9fe852..cedf1e390 100644 --- a/vegafusion-python/src/lib.rs +++ b/vegafusion-python/src/lib.rs @@ -32,7 +32,8 @@ use vegafusion_core::task_graph::graph::ScopedVariable; use vegafusion_core::task_graph::task_value::MaterializedTaskValue; use vegafusion_runtime::tokio_runtime::TOKIO_THREAD_STACK_SIZE; -use vegafusion_core::runtime::{PlanResolver, VegaFusionRuntimeTrait}; +use vegafusion_core::runtime::VegaFusionRuntimeTrait; +use vegafusion_runtime::data::plan_resolver::PlanResolver; use vegafusion_runtime::task_graph::cache::VegaFusionCache; use crate::chart_state::PyChartState; diff --git a/vegafusion-python/src/plan_resolver.rs b/vegafusion-python/src/plan_resolver.rs index caeb38f54..967aa467e 100644 --- a/vegafusion-python/src/plan_resolver.rs +++ b/vegafusion-python/src/plan_resolver.rs @@ -16,9 +16,10 @@ use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::{Result, VegaFusionError}; use vegafusion_core::proto::gen::tasks::ResolverCapabilities; -use vegafusion_core::runtime::{ParsedUrl, PlanResolver, ResolutionResult}; +use vegafusion_core::runtime::{ParsedUrl, ResolutionResult}; use vegafusion_runtime::data::codec::VegaFusionCodec; use vegafusion_runtime::data::external_table::ExternalTableProvider; +use vegafusion_runtime::data::plan_resolver::PlanResolver; /// A `PlanResolver` that delegates to a Python object. 
/// diff --git a/vegafusion-runtime/src/data/datafusion_resolver.rs b/vegafusion-runtime/src/data/datafusion_resolver.rs index a4b7afd34..ecba58eb5 100644 --- a/vegafusion-runtime/src/data/datafusion_resolver.rs +++ b/vegafusion-runtime/src/data/datafusion_resolver.rs @@ -8,7 +8,9 @@ use vegafusion_common::error::Result; #[cfg(not(feature = "parquet"))] use vegafusion_common::error::VegaFusionError; use vegafusion_core::proto::gen::tasks::ResolverCapabilities; -use vegafusion_core::runtime::{ParsedUrl, PlanResolver, ResolutionResult}; +use vegafusion_core::runtime::{ParsedUrl, ResolutionResult}; + +use super::plan_resolver::PlanResolver; use super::tasks::{read_arrow, read_csv, read_json}; use super::util::DataFrameUtils; diff --git a/vegafusion-runtime/src/data/mod.rs b/vegafusion-runtime/src/data/mod.rs index 8ad3a1379..6bea15bf9 100644 --- a/vegafusion-runtime/src/data/mod.rs +++ b/vegafusion-runtime/src/data/mod.rs @@ -4,5 +4,6 @@ pub mod datafusion_resolver; pub mod external_table; pub mod inline_table; pub mod pipeline; +pub mod plan_resolver; pub mod tasks; pub mod util; diff --git a/vegafusion-runtime/src/data/pipeline.rs b/vegafusion-runtime/src/data/pipeline.rs index 0630e1185..e53eef472 100644 --- a/vegafusion-runtime/src/data/pipeline.rs +++ b/vegafusion-runtime/src/data/pipeline.rs @@ -5,9 +5,10 @@ use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::{Result, VegaFusionError}; use vegafusion_core::proto::gen::tasks::ResolverCapabilities; -use vegafusion_core::runtime::{MergedCapabilities, ParsedUrl, PlanResolver, ResolutionResult}; +use vegafusion_core::runtime::{MergedCapabilities, ParsedUrl, ResolutionResult}; use super::datafusion_resolver::DataFusionResolver; +use super::plan_resolver::PlanResolver; /// Chains resolvers with a terminal `DataFusionResolver`. 
/// diff --git a/vegafusion-runtime/src/data/plan_resolver.rs b/vegafusion-runtime/src/data/plan_resolver.rs new file mode 100644 index 000000000..79fb3e776 --- /dev/null +++ b/vegafusion-runtime/src/data/plan_resolver.rs @@ -0,0 +1,169 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::catalog::TableProvider; +use datafusion::datasource::{provider_as_source, source_as_provider, MemTable}; +use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter}; +use datafusion_expr::LogicalPlan as DFLogicalPlan; +use vegafusion_common::arrow::datatypes::SchemaRef; +use vegafusion_common::data::table::VegaFusionTable; +use vegafusion_common::datafusion_expr::LogicalPlan; +use vegafusion_common::error::{Result, VegaFusionError}; +use vegafusion_core::proto::gen::tasks::ResolverCapabilities; +use vegafusion_core::runtime::{ParsedUrl, ResolutionResult}; + +use super::external_table::ExternalTableProvider; + +#[async_trait] +pub trait PlanResolver: Send + Sync + 'static { + fn name(&self) -> &str; + + /// Declare what URL patterns this resolver supports at planning time. + /// Returns empty capabilities by default (no additional URL support). + fn capabilities(&self) -> ResolverCapabilities { + ResolverCapabilities::default() + } + + /// Given a parsed URL, optionally return a LogicalPlan to handle it. + /// Return Ok(None) to pass the URL to the next resolver in the chain. + async fn scan_url(&self, _parsed_url: &ParsedUrl) -> Result<Option<LogicalPlan>> { + Ok(None) + } + + /// Provide data for a single external table reference. + /// + /// Called once per `ExternalTableProvider` node in the plan. + /// Override this instead of [`resolve_plan`](Self::resolve_plan) when + /// each external table can be resolved independently. + /// + /// The default `resolve_plan` walks the plan tree and calls this method + /// for every `ExternalTableProvider` it finds, replacing each with an + /// in-memory table.
+ /// + /// # Arguments + /// * `name` - table name from the plan + /// * `schema` - full Arrow schema of the external table + /// * `metadata` - JSON metadata from ExternalTableProvider + /// * `projected_columns` - column names DataFusion actually needs, + /// or `None` if all columns are needed + async fn resolve_table( + &self, + _name: &str, + _schema: SchemaRef, + _metadata: &serde_json::Value, + _projected_columns: Option<Vec<String>>, + ) -> Result<VegaFusionTable> { + Err(VegaFusionError::internal( + "resolve_table not implemented — override resolve_table or resolve_plan", + )) + } + + /// Resolve a LogicalPlan containing external table references. + /// + /// The default implementation walks the plan tree, finds + /// `ExternalTableProvider` nodes, calls [`resolve_table`](Self::resolve_table) + /// for each, and replaces them with in-memory table scans. Plans with + /// no external tables are passed through unchanged. + /// + /// Override this for full control over plan rewriting (e.g. SQL + /// transpilation or remote execution).
+ async fn resolve_plan(&self, plan: LogicalPlan) -> Result<ResolutionResult> { + let external_tables = extract_external_tables(&plan); + + if external_tables.is_empty() { + return Ok(ResolutionResult::Plan(plan)); + } + + // Resolve each external table, then wrap as MemTable + let mut mem_tables: HashMap<String, Arc<dyn TableProvider>> = HashMap::new(); + for (table_name, info) in &external_tables { + let table = self + .resolve_table( + table_name, + info.schema.clone(), + &info.metadata, + info.projected_columns.clone(), + ) + .await?; + let mem_table = + MemTable::try_new(table.schema.clone(), vec![table.batches]).map_err(|e| { + VegaFusionError::internal(format!("Failed to create MemTable: {e}")) + })?; + mem_tables.insert(table_name.clone(), Arc::new(mem_table)); + } + + // Rewrite the plan, replacing ExternalTableProvider with MemTable + let mut rewriter = ResolvedTableRewriter { tables: mem_tables }; + let rewritten = plan + .rewrite(&mut rewriter) + .map_err(|e| VegaFusionError::internal(format!("Failed to rewrite plan: {e}")))? + .data; + + Ok(ResolutionResult::Plan(rewritten)) + } +} + +/// Info extracted from an ExternalTableProvider node in a LogicalPlan. +struct ExternalTableInfo { + schema: SchemaRef, + metadata: serde_json::Value, + projected_columns: Option<Vec<String>>, +} + +/// Walk a LogicalPlan and collect ExternalTableProvider info for each table scan.
+fn extract_external_tables(plan: &LogicalPlan) -> HashMap<String, ExternalTableInfo> { + let mut tables = HashMap::new(); + let _ = plan.apply(|node| { + if let DFLogicalPlan::TableScan(scan) = node { + if let Ok(provider) = source_as_provider(&scan.source) { + if let Some(ext) = provider.as_any().downcast_ref::<ExternalTableProvider>() { + let projected_columns = scan.projection.as_ref().map(|indices| { + let schema = ext.schema(); + indices + .iter() + .map(|&i| schema.field(i).name().clone()) + .collect() + }); + tables.insert( + scan.table_name.table().to_string(), + ExternalTableInfo { + schema: ext.schema(), + metadata: ext.metadata().clone(), + projected_columns, + }, + ); + } + } + } + Ok(datafusion_common::tree_node::TreeNodeRecursion::Continue) + }); + tables +} + +/// Rewriter that replaces ExternalTableProvider scans with MemTable scans. +struct ResolvedTableRewriter { + tables: HashMap<String, Arc<dyn TableProvider>>, +} + +impl TreeNodeRewriter for ResolvedTableRewriter { + type Node = DFLogicalPlan; + + fn f_up(&mut self, node: Self::Node) -> datafusion_common::Result<Transformed<Self::Node>> { + if let DFLogicalPlan::TableScan(scan) = &node { + let table_name = scan.table_name.table(); + if let Some(mem_table) = self.tables.get(table_name) { + let new_scan = DFLogicalPlan::TableScan(datafusion_expr::TableScan { + table_name: scan.table_name.clone(), + source: provider_as_source(mem_table.clone()), + projection: scan.projection.clone(), + projected_schema: scan.projected_schema.clone(), + filters: scan.filters.clone(), + fetch: scan.fetch, + }); + return Ok(Transformed::yes(new_scan)); + } + } + Ok(Transformed::no(node)) + } +} diff --git a/vegafusion-runtime/src/task_graph/runtime.rs b/vegafusion-runtime/src/task_graph/runtime.rs index 8fea7392c..99e264b93 100644 --- a/vegafusion-runtime/src/task_graph/runtime.rs +++ b/vegafusion-runtime/src/task_graph/runtime.rs @@ -1,4 +1,5 @@ use crate::data::pipeline::ResolverPipeline; +use crate::data::plan_resolver::PlanResolver; use crate::datafusion::context::make_datafusion_context; use
crate::task_graph::cache::VegaFusionCache; use crate::task_graph::task::TaskCall; @@ -18,7 +19,6 @@ use vegafusion_core::proto::gen::tasks::inline_dataset::Dataset; use vegafusion_core::proto::gen::tasks::{ task::TaskKind, InlineDataset, InlineDatasetTable, NodeValueIndex, TaskGraph, }; -use vegafusion_core::runtime::PlanResolver; use vegafusion_core::runtime::VegaFusionRuntimeTrait; use vegafusion_core::task_graph::task_value::{MaterializedTaskValue, NamedTaskValue, TaskValue}; diff --git a/vegafusion-runtime/tests/test_plan_resolver.rs b/vegafusion-runtime/tests/test_plan_resolver.rs index 950192dce..3364d9067 100644 --- a/vegafusion-runtime/tests/test_plan_resolver.rs +++ b/vegafusion-runtime/tests/test_plan_resolver.rs @@ -15,10 +15,11 @@ use vegafusion_common::error::{Result, VegaFusionError}; use vegafusion_core::data::dataset::VegaFusionDataset; use vegafusion_core::proto::gen::pretransform::PreTransformSpecOpts; use vegafusion_core::proto::gen::tasks::ResolverCapabilities; -use vegafusion_core::runtime::{ParsedUrl, PlanResolver, ResolutionResult, VegaFusionRuntimeTrait}; +use vegafusion_core::runtime::{ParsedUrl, ResolutionResult, VegaFusionRuntimeTrait}; use vegafusion_core::spec::chart::ChartSpec; use vegafusion_runtime::data::external_table::ExternalTableProvider; use vegafusion_runtime::data::pipeline::ResolverPipeline; +use vegafusion_runtime::data::plan_resolver::PlanResolver; use vegafusion_runtime::task_graph::runtime::VegaFusionRuntime; #[derive(Clone, Debug)] From 0ff4df6f68c83553fbae72da68543121b65b21f3 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 13 Mar 2026 11:11:47 -0400 Subject: [PATCH 07/36] feat: add scheme parameter to resolve_table API Threads the ExternalTableProvider's scheme through to resolve_table so resolvers can identify the data source type without parsing metadata. 
Co-Authored-By: Claude Opus 4.6 --- vegafusion-python/tests/test_plan_resolver.py | 18 ++++++++++++------ vegafusion-python/vegafusion/plan_resolver.py | 6 +++++- vegafusion-runtime/src/data/plan_resolver.rs | 5 +++++ 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/vegafusion-python/tests/test_plan_resolver.py b/vegafusion-python/tests/test_plan_resolver.py index 2bf826d38..04e13289a 100644 --- a/vegafusion-python/tests/test_plan_resolver.py +++ b/vegafusion-python/tests/test_plan_resolver.py @@ -318,7 +318,8 @@ def resolve_table( self, name: str, schema: Any, - metadata: dict[str, Any], + scheme: str | None = None, + metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, ) -> pa.Table: self.resolve_calls.append( @@ -441,7 +442,8 @@ def resolve_table( self, name: str, schema: Any, - metadata: dict[str, Any], + scheme: str | None = None, + metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, ) -> pa.Table: self.resolved_names.append(name) @@ -503,7 +505,8 @@ def resolve_table( self, name: str, schema: Any, - metadata: dict[str, Any], + scheme: str | None = None, + metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, ) -> pa.Table: raise ValueError("Simulated resolver failure") @@ -735,7 +738,8 @@ def resolve_table( self, name: str, schema: Any, - metadata: dict[str, Any], + scheme: str | None = None, + metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, ) -> pa.Table: return pa.table({"x": [1, 2], "y": ["a", "b"]}) @@ -827,7 +831,8 @@ def resolve_table( self, name: str, schema: Any, - metadata: dict[str, Any], + scheme: str | None = None, + metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, ) -> pa.Table: return pa.table({"val": [42, 99]}) @@ -863,7 +868,8 @@ def resolve_table( self, name: str, schema: Any, - metadata: dict[str, Any], + scheme: str | None = None, + metadata: dict[str, Any] | None = 
None, projected_columns: list[str] | None = None, ) -> pa.Table: return pa.table({"x": [1, 2, 3]}) diff --git a/vegafusion-python/vegafusion/plan_resolver.py b/vegafusion-python/vegafusion/plan_resolver.py index b834c1fe8..008460e23 100644 --- a/vegafusion-python/vegafusion/plan_resolver.py +++ b/vegafusion-python/vegafusion/plan_resolver.py @@ -132,7 +132,8 @@ def resolve_table( self, name: str, schema: Schema, - metadata: dict[str, Any], + scheme: str | None = None, + metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, ) -> Table: """Provide data for an external table reference. @@ -142,6 +143,8 @@ def resolve_table( Args: name: Table name from the plan. schema: Full schema of the external table. + scheme: Optional URL scheme identifier (e.g. ``"spark"``, + ``"snowflake"``). metadata: JSON metadata dict from ExternalTableProvider. projected_columns: Column names DataFusion actually needs. None if no projection (all columns needed). @@ -234,6 +237,7 @@ def _resolve_external_tables( table_data = self.resolve_table( name=table_name, schema=dataset.schema, + scheme=dataset.scheme, metadata=metadata, projected_columns=projected_columns, ) diff --git a/vegafusion-runtime/src/data/plan_resolver.rs b/vegafusion-runtime/src/data/plan_resolver.rs index 79fb3e776..d41fa084d 100644 --- a/vegafusion-runtime/src/data/plan_resolver.rs +++ b/vegafusion-runtime/src/data/plan_resolver.rs @@ -44,6 +44,7 @@ pub trait PlanResolver: Send + Sync + 'static { /// # Arguments /// * `name` - table name from the plan /// * `schema` - full Arrow schema of the external table + /// * `scheme` - optional URL scheme identifier (e.g. 
`"spark"`, `"snowflake"`) /// * `metadata` - JSON metadata from ExternalTableProvider /// * `projected_columns` - column names DataFusion actually needs, /// or `None` if all columns are needed @@ -51,6 +52,7 @@ pub trait PlanResolver: Send + Sync + 'static { &self, _name: &str, _schema: SchemaRef, + _scheme: Option<&str>, _metadata: &serde_json::Value, _projected_columns: Option>, ) -> Result { @@ -82,6 +84,7 @@ pub trait PlanResolver: Send + Sync + 'static { .resolve_table( table_name, info.schema.clone(), + info.scheme.as_deref(), &info.metadata, info.projected_columns.clone(), ) @@ -107,6 +110,7 @@ pub trait PlanResolver: Send + Sync + 'static { /// Info extracted from an ExternalTableProvider node in a LogicalPlan. struct ExternalTableInfo { schema: SchemaRef, + scheme: Option, metadata: serde_json::Value, projected_columns: Option>, } @@ -129,6 +133,7 @@ fn extract_external_tables(plan: &LogicalPlan) -> HashMap Date: Fri, 13 Mar 2026 11:25:23 -0400 Subject: [PATCH 08/36] refactor: make scheme required on ExternalTableProvider and ExternalDataset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No backward compatibility needed — this API hasn't been released yet. Scheme now comes before schema in all signatures to reflect its role as the primary discriminator for external data sources. 
Co-Authored-By: Claude Opus 4.6 --- vegafusion-python/src/lib.rs | 8 +++--- vegafusion-python/src/plan_resolver.rs | 6 ++--- vegafusion-python/src/utils.rs | 4 +-- vegafusion-python/tests/test_plan_resolver.py | 12 ++++----- vegafusion-python/vegafusion/dataset.py | 16 +++++------ vegafusion-python/vegafusion/plan_resolver.py | 14 +++++----- vegafusion-runtime/src/data/codec.rs | 27 ++++++++++--------- vegafusion-runtime/src/data/external_table.rs | 12 ++++----- vegafusion-runtime/src/data/plan_resolver.rs | 10 +++---- .../tests/test_plan_resolver.rs | 12 ++++----- 10 files changed, 61 insertions(+), 60 deletions(-) diff --git a/vegafusion-python/src/lib.rs b/vegafusion-python/src/lib.rs index cedf1e390..fac819e26 100644 --- a/vegafusion-python/src/lib.rs +++ b/vegafusion-python/src/lib.rs @@ -596,19 +596,19 @@ pub fn inline_table_scan_node(name: String, schema: pyo3_arrow::PySchema) -> PyR /// /// Args: /// table_name: Name for the table in the plan. +/// scheme: Scheme identifier (e.g. "spark"). /// schema: Arrow schema (arro3.core.Schema) — required for logical planning. -/// scheme: Optional scheme identifier (e.g. "spark"). /// metadata: Optional JSON-serializable dict of metadata. /// source: Optional source identifier. /// /// Returns: /// bytes: Serialized LogicalPlanNode protobuf. 
#[pyfunction] -#[pyo3(signature = (table_name, schema, scheme=None, metadata=None, source=None))] +#[pyo3(signature = (table_name, scheme, schema, metadata=None, source=None))] pub fn external_table_scan_node( table_name: String, + scheme: String, schema: pyo3_arrow::PySchema, - scheme: Option, metadata: Option<&Bound<'_, pyo3::types::PyAny>>, source: Option, ) -> PyResult> { @@ -628,7 +628,7 @@ pub fn external_table_scan_node( }; let provider = Arc::new( - ExternalTableProvider::new(arrow_schema, scheme, metadata_value).with_source(source), + ExternalTableProvider::new(scheme, arrow_schema, metadata_value).with_source(source), ); let table_source = provider_as_source(provider); diff --git a/vegafusion-python/src/plan_resolver.rs b/vegafusion-python/src/plan_resolver.rs index 967aa467e..2122d7ae7 100644 --- a/vegafusion-python/src/plan_resolver.rs +++ b/vegafusion-python/src/plan_resolver.rs @@ -90,8 +90,8 @@ impl PyPlanResolver { /// Info extracted from an ExternalTableProvider node in the plan. struct ExternalTableInfo { + scheme: String, schema: SchemaRef, - scheme: Option, source: Option, metadata: Value, ref_id: Option, @@ -112,8 +112,8 @@ fn extract_external_tables(plan: &LogicalPlan) -> HashMap( // Reconstruct ExternalDataset(scheme, schema, metadata, data, source) let kwargs = PyDict::new(py); - kwargs.set_item("scheme", info.scheme.as_deref())?; + kwargs.set_item("scheme", &info.scheme)?; kwargs.set_item("schema", py_schema)?; kwargs.set_item("metadata", py_metadata)?; kwargs.set_item("data", &data)?; diff --git a/vegafusion-python/src/utils.rs b/vegafusion-python/src/utils.rs index 477372d0e..1da72cbfd 100644 --- a/vegafusion-python/src/utils.rs +++ b/vegafusion-python/src/utils.rs @@ -29,7 +29,7 @@ pub fn process_inline_datasets( && inline_dataset.hasattr("metadata")? 
{ // Handle ExternalDataset with .scheme, .schema, .metadata - let scheme: Option = inline_dataset.getattr("scheme")?.extract()?; + let scheme: String = inline_dataset.getattr("scheme")?.extract()?; let pyschema = inline_dataset.getattr("schema")?.extract::()?; let schema = pyschema.into_inner(); let metadata_obj = inline_dataset.getattr("metadata")?; @@ -41,7 +41,7 @@ pub fn process_inline_datasets( })?; let provider = - Arc::new(ExternalTableProvider::new(schema, scheme, metadata)); + Arc::new(ExternalTableProvider::new(scheme, schema, metadata)); let table_source = provider_as_source(provider); let logical_plan = LogicalPlanBuilder::scan(name.to_string(), table_source, None) diff --git a/vegafusion-python/tests/test_plan_resolver.py b/vegafusion-python/tests/test_plan_resolver.py index 04e13289a..4c2ffd2c5 100644 --- a/vegafusion-python/tests/test_plan_resolver.py +++ b/vegafusion-python/tests/test_plan_resolver.py @@ -317,8 +317,8 @@ def __init__(self) -> None: def resolve_table( self, name: str, + scheme: str, schema: Any, - scheme: str | None = None, metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, ) -> pa.Table: @@ -441,8 +441,8 @@ def __init__(self) -> None: def resolve_table( self, name: str, + scheme: str, schema: Any, - scheme: str | None = None, metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, ) -> pa.Table: @@ -504,8 +504,8 @@ class FailingResolver(PlanResolver): def resolve_table( self, name: str, + scheme: str, schema: Any, - scheme: str | None = None, metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, ) -> pa.Table: @@ -737,8 +737,8 @@ def scan_url(self, parsed_url: dict[str, Any]) -> Any: def resolve_table( self, name: str, + scheme: str, schema: Any, - scheme: str | None = None, metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, ) -> pa.Table: @@ -830,8 +830,8 @@ def scan_url(self, parsed_url: dict[str, Any]) -> 
Any: def resolve_table( self, name: str, + scheme: str, schema: Any, - scheme: str | None = None, metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, ) -> pa.Table: @@ -867,8 +867,8 @@ class SimpleResolver(PlanResolver): def resolve_table( self, name: str, + scheme: str, schema: Any, - scheme: str | None = None, metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, ) -> pa.Table: diff --git a/vegafusion-python/vegafusion/dataset.py b/vegafusion-python/vegafusion/dataset.py index 671026c8b..6cb8f3924 100644 --- a/vegafusion-python/vegafusion/dataset.py +++ b/vegafusion-python/vegafusion/dataset.py @@ -19,10 +19,10 @@ def __init__(self, data: Any) -> None: # noqa: ANN401 class ExternalDataset: """External dataset with scheme, schema, metadata, and optional data ref. - The ``scheme`` parameter is an optional short identifier for the data - source type (e.g. ``"spark"``, ``"snowflake"``, ``"duckdb"``). It is - propagated through protobuf separately from metadata so that error - messages can name the source when no resolver is registered. + The ``scheme`` parameter identifies the data source type + (e.g. ``"spark"``, ``"snowflake"``, ``"duckdb"``). It is propagated + through protobuf separately from metadata so that error messages can + name the source when no resolver is registered. When ``data`` is provided, it is registered in a class-level :class:`weakref.WeakValueDictionary` keyed by a UUID. 
The UUID is @@ -42,8 +42,8 @@ class ExternalDataset: def __init__( self, - scheme: str | None = None, - schema: Any = None, # noqa: ANN401 + scheme: str, + schema: Any, # noqa: ANN401 metadata: dict[str, Any] | None = None, data: Any = None, # noqa: ANN401 source: str | None = None, @@ -74,8 +74,8 @@ def resolve_data(cls, ref_id: str) -> Any | None: # noqa: ANN401 return data_ref.data if data_ref is not None else None @property - def scheme(self) -> str | None: - """Optional short identifier for the data source type (e.g. ``"spark"``).""" + def scheme(self) -> str: + """Short identifier for the data source type (e.g. ``"spark"``).""" return self._scheme @property diff --git a/vegafusion-python/vegafusion/plan_resolver.py b/vegafusion-python/vegafusion/plan_resolver.py index 008460e23..d1efa8fb1 100644 --- a/vegafusion-python/vegafusion/plan_resolver.py +++ b/vegafusion-python/vegafusion/plan_resolver.py @@ -131,8 +131,8 @@ def scan_url(self, parsed_url: dict[str, Any]) -> LogicalPlanNode | bytes | None def resolve_table( self, name: str, + scheme: str, schema: Schema, - scheme: str | None = None, metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, ) -> Table: @@ -142,9 +142,9 @@ def resolve_table( Args: name: Table name from the plan. - schema: Full schema of the external table. - scheme: Optional URL scheme identifier (e.g. ``"spark"``, + scheme: URL scheme identifier (e.g. ``"spark"``, ``"snowflake"``). + schema: Full schema of the external table. metadata: JSON metadata dict from ExternalTableProvider. projected_columns: Column names DataFusion actually needs. None if no projection (all columns needed). 
@@ -236,8 +236,8 @@ def _resolve_external_tables( table_data = self.resolve_table( name=table_name, - schema=dataset.schema, scheme=dataset.scheme, + schema=dataset.schema, metadata=metadata, projected_columns=projected_columns, ) @@ -354,8 +354,8 @@ def inline_table_scan_node( def external_table_scan_node( table_name: str, + scheme: str, schema: Schema, - scheme: str | None = None, metadata: dict[str, Any] | None = None, source: str | None = None, ) -> LogicalPlanNode: @@ -367,8 +367,8 @@ def external_table_scan_node( Args: table_name: Name for the table in the plan. + scheme: Scheme identifier (e.g. ``"spark"``). schema: Arrow schema (arro3.core.Schema) — required for logical planning. - scheme: Optional scheme identifier (e.g. ``"spark"``). metadata: Optional JSON-serializable dict of metadata. source: Optional source identifier. @@ -388,8 +388,8 @@ def external_table_scan_node( node.ParseFromString( _native( table_name=table_name, - schema=schema, scheme=scheme, + schema=schema, metadata=metadata, source=source, ) diff --git a/vegafusion-runtime/src/data/codec.rs b/vegafusion-runtime/src/data/codec.rs index 9625e5f06..bad32a6c5 100644 --- a/vegafusion-runtime/src/data/codec.rs +++ b/vegafusion-runtime/src/data/codec.rs @@ -62,12 +62,9 @@ impl LogicalExtensionCodec for VegaFusionCodec { _ctx: &datafusion::execution::TaskContext, ) -> Result> { if buf.is_empty() { - // Backward compatibility: empty buf treated as ExternalTableProvider - return Ok(Arc::new(ExternalTableProvider::new( - schema, - None, - Value::Null, - ))); + return Err(DataFusionError::Plan( + "Empty custom_table_data buffer — expected JSON envelope".to_string(), + )); } let envelope: Value = serde_json::from_slice(buf).map_err(|e| { @@ -79,14 +76,20 @@ impl LogicalExtensionCodec for VegaFusionCodec { let scheme = envelope .get("scheme") .and_then(|v| v.as_str()) - .map(|s| s.to_string()); + .ok_or_else(|| { + DataFusionError::Plan( + "ExternalTableProvider envelope missing required 'scheme' 
field" + .to_string(), + ) + })? + .to_string(); let source = envelope .get("source") .and_then(|v| v.as_str()) .map(|s| s.to_string()); let metadata = envelope.get("metadata").cloned().unwrap_or(Value::Null); Ok(Arc::new( - ExternalTableProvider::new(schema, scheme, metadata).with_source(source), + ExternalTableProvider::new(scheme, schema, metadata).with_source(source), )) } Some("inline") => { @@ -117,11 +120,9 @@ impl LogicalExtensionCodec for VegaFusionCodec { Some(other) => Err(DataFusionError::Plan(format!( "Unknown table provider type in envelope: '{other}'" ))), - None => { - // No "type" field — treat as legacy ExternalTableProvider where - // the entire JSON value is the metadata - Ok(Arc::new(ExternalTableProvider::new(schema, None, envelope))) - } + None => Err(DataFusionError::Plan( + "Table provider envelope missing required 'type' field".to_string(), + )), } } diff --git a/vegafusion-runtime/src/data/external_table.rs b/vegafusion-runtime/src/data/external_table.rs index b4239e523..23a63e05d 100644 --- a/vegafusion-runtime/src/data/external_table.rs +++ b/vegafusion-runtime/src/data/external_table.rs @@ -22,17 +22,17 @@ use vegafusion_common::arrow::datatypes::SchemaRef; /// Optionally carries arbitrary JSON metadata in [`Self::metadata`], /// which is serialized into `custom_table_data` by [`super::codec::VegaFusionCodec`]. 
pub struct ExternalTableProvider { + scheme: String, schema: SchemaRef, - scheme: Option, source: Option, metadata: Value, } impl ExternalTableProvider { - pub fn new(schema: SchemaRef, scheme: Option, metadata: Value) -> Self { + pub fn new(scheme: String, schema: SchemaRef, metadata: Value) -> Self { Self { - schema, scheme, + schema, source: None, metadata, } @@ -43,8 +43,8 @@ impl ExternalTableProvider { self } - pub fn scheme(&self) -> Option<&str> { - self.scheme.as_deref() + pub fn scheme(&self) -> &str { + &self.scheme } pub fn source(&self) -> Option<&str> { @@ -88,7 +88,7 @@ impl TableProvider for ExternalTableProvider { _filters: &[Expr], _limit: Option, ) -> Result> { - let scheme = self.scheme().unwrap_or("unknown"); + let scheme = self.scheme(); plan_err!( "ExternalTableProvider (scheme: {scheme}) cannot be executed directly. \ This table represents an external data source that must be resolved \ diff --git a/vegafusion-runtime/src/data/plan_resolver.rs b/vegafusion-runtime/src/data/plan_resolver.rs index d41fa084d..c987a7633 100644 --- a/vegafusion-runtime/src/data/plan_resolver.rs +++ b/vegafusion-runtime/src/data/plan_resolver.rs @@ -43,16 +43,16 @@ pub trait PlanResolver: Send + Sync + 'static { /// /// # Arguments /// * `name` - table name from the plan + /// * `scheme` - URL scheme identifier (e.g. `"spark"`, `"snowflake"`) /// * `schema` - full Arrow schema of the external table - /// * `scheme` - optional URL scheme identifier (e.g. 
`"spark"`, `"snowflake"`) /// * `metadata` - JSON metadata from ExternalTableProvider /// * `projected_columns` - column names DataFusion actually needs, /// or `None` if all columns are needed async fn resolve_table( &self, _name: &str, + _scheme: &str, _schema: SchemaRef, - _scheme: Option<&str>, _metadata: &serde_json::Value, _projected_columns: Option>, ) -> Result { @@ -83,8 +83,8 @@ pub trait PlanResolver: Send + Sync + 'static { let table = self .resolve_table( table_name, + &info.scheme, info.schema.clone(), - info.scheme.as_deref(), &info.metadata, info.projected_columns.clone(), ) @@ -109,8 +109,8 @@ pub trait PlanResolver: Send + Sync + 'static { /// Info extracted from an ExternalTableProvider node in a LogicalPlan. struct ExternalTableInfo { + scheme: String, schema: SchemaRef, - scheme: Option, metadata: serde_json::Value, projected_columns: Option>, } @@ -132,8 +132,8 @@ fn extract_external_tables(plan: &LogicalPlan) -> HashMap usize { fn build_external_scan_plan(table_name: &str) -> LogicalPlan { let schema = get_movies_schema(); let provider = Arc::new(ExternalTableProvider::new( + "test".to_string(), schema, - Some("test".to_string()), serde_json::Value::Null, )); let table_source = provider_as_source(provider); @@ -827,8 +827,8 @@ impl PlanResolver for CustomSchemeScanner { async fn scan_url(&self, parsed_url: &ParsedUrl) -> Result> { if parsed_url.scheme == "custom" { let provider = Arc::new(ExternalTableProvider::new( + "custom".to_string(), self.schema.clone(), - Some("custom".to_string()), serde_json::json!({"url": parsed_url.url}), )); let plan = LogicalPlanBuilder::scan("custom_table", provider_as_source(provider), None) @@ -1105,8 +1105,8 @@ mod serialization_tests { async fn test_external_table_proto_round_trip() { let schema = get_movies_schema(); let provider = Arc::new(ExternalTableProvider::new( + "test".to_string(), schema, - Some("test".to_string()), serde_json::Value::Null, )); let table_source = provider_as_source(provider); @@ 
-1139,8 +1139,8 @@ mod serialization_tests { async fn test_external_table_raw_proto_inspection() { let schema = get_movies_schema(); let provider = Arc::new(ExternalTableProvider::new( + "test".to_string(), schema.clone(), - Some("test".to_string()), serde_json::Value::Null, )); let table_source = provider_as_source(provider); @@ -1219,8 +1219,8 @@ mod serialization_tests { "filters": [{"col": "year", "op": ">", "val": 2000}], }); let provider = Arc::new(ExternalTableProvider::new( + "test".to_string(), schema.clone(), - Some("test".to_string()), metadata.clone(), )); let table_source = provider_as_source(provider); @@ -1243,7 +1243,7 @@ mod serialization_tests { .as_any() .downcast_ref::() .expect("Expected ExternalTableProvider"); - assert_eq!(ext.scheme(), Some("test")); + assert_eq!(ext.scheme(), "test"); assert_eq!(ext.metadata(), &metadata); } else { panic!("Expected TableScan, got {:?}", round_tripped); From 1f0e121b535e12b7505d3f7cd139b5d59811e8f9 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 13 Mar 2026 15:06:07 -0400 Subject: [PATCH 09/36] style: fix ruff formatting in test_plan_resolver.py Co-Authored-By: Claude Opus 4.6 --- vegafusion-python/tests/test_plan_resolver.py | 40 +++++-------------- 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/vegafusion-python/tests/test_plan_resolver.py b/vegafusion-python/tests/test_plan_resolver.py index 4c2ffd2c5..04ef5d19e 100644 --- a/vegafusion-python/tests/test_plan_resolver.py +++ b/vegafusion-python/tests/test_plan_resolver.py @@ -83,9 +83,7 @@ def test_passthrough_resolver() -> None: source_table = pa.table({"x": [1, 5, 10], "y": ["a", "b", "c"]}) expected_result = pa.table({"x": [5, 10], "y": ["b", "c"]}) - ext = ExternalDataset( - scheme="test", schema=source_table.schema, data=source_table - ) + ext = ExternalDataset(scheme="test", schema=source_table.schema, data=source_table) resolver = PassthroughResolver(result_table=expected_result) rt = vf.VegaFusionRuntime(plan_resolver=resolver) 
@@ -115,9 +113,7 @@ def test_deserializing_resolver() -> None: source_table = pa.table({"x": [1, 5, 10], "y": ["a", "b", "c"]}) expected_result = pa.table({"x": [5, 10], "y": ["b", "c"]}) - ext = ExternalDataset( - scheme="test", schema=source_table.schema, data=source_table - ) + ext = ExternalDataset(scheme="test", schema=source_table.schema, data=source_table) resolver = DeserializingResolver(result_table=expected_result) rt = vf.VegaFusionRuntime(plan_resolver=resolver) @@ -332,9 +328,7 @@ def resolve_table( return source_table resolver = TableResolver() - ext = ExternalDataset( - scheme="test", schema=source_table.schema, data=source_table - ) + ext = ExternalDataset(scheme="test", schema=source_table.schema, data=source_table) rt = vf.VegaFusionRuntime(plan_resolver=resolver) spec = simple_spec() @@ -407,9 +401,7 @@ def _replace_custom_scan( self._replace_custom_scan(child, target_name, replacement) resolver = ManualResolver() - ext = ExternalDataset( - scheme="test", schema=source_table.schema, data=source_table - ) + ext = ExternalDataset(scheme="test", schema=source_table.schema, data=source_table) rt = vf.VegaFusionRuntime(plan_resolver=resolver) spec = simple_spec() @@ -511,9 +503,7 @@ def resolve_table( ) -> pa.Table: raise ValueError("Simulated resolver failure") - ext = ExternalDataset( - scheme="test", schema=source_table.schema, data=source_table - ) + ext = ExternalDataset(scheme="test", schema=source_table.schema, data=source_table) resolver = FailingResolver() rt = vf.VegaFusionRuntime(plan_resolver=resolver) spec = simple_spec() @@ -584,9 +574,7 @@ def resolve_plan_proto( return source_table resolver = SqlCapturingResolver() - ext = ExternalDataset( - scheme="test", schema=source_table.schema, data=source_table - ) + ext = ExternalDataset(scheme="test", schema=source_table.schema, data=source_table) rt = vf.VegaFusionRuntime(plan_resolver=resolver) spec = simple_spec() @@ -635,9 +623,7 @@ def resolve_plan(self, logical_plan: Any, datasets: 
dict[str, Any]) -> pa.Table: return source_table resolver = ProtoCapturingResolver() - ext = ExternalDataset( - scheme="test", schema=source_table.schema, data=source_table - ) + ext = ExternalDataset(scheme="test", schema=source_table.schema, data=source_table) rt = vf.VegaFusionRuntime(plan_resolver=resolver) spec = simple_spec() @@ -661,9 +647,7 @@ def resolve_plan(self, logical_plan: Any, datasets: dict[str, Any]) -> pa.Table: def test_external_dataset_without_resolver_raises() -> None: """ExternalDataset without a plan resolver raises ValueError with helpful message.""" source_table = pa.table({"x": [1, 2, 3]}) - ext = ExternalDataset( - scheme="spark", schema=source_table.schema, data=source_table - ) + ext = ExternalDataset(scheme="spark", schema=source_table.schema, data=source_table) rt = vf.VegaFusionRuntime() # No resolver spec = simple_spec() @@ -697,9 +681,7 @@ def resolve_plan_proto( return source_table resolver = DialectTestResolver() - ext = ExternalDataset( - scheme="test", schema=source_table.schema, data=source_table - ) + ext = ExternalDataset(scheme="test", schema=source_table.schema, data=source_table) rt = vf.VegaFusionRuntime(plan_resolver=resolver) rt.pre_transform_datasets( @@ -875,9 +857,7 @@ def resolve_table( return pa.table({"x": [1, 2, 3]}) source_table = pa.table({"x": [1, 2, 3]}) - ext = ExternalDataset( - scheme="test", schema=source_table.schema, data=source_table - ) + ext = ExternalDataset(scheme="test", schema=source_table.schema, data=source_table) resolver = SimpleResolver() rt = vf.VegaFusionRuntime(plan_resolver=resolver) From 09dbd37b35112049c0674ea48671682a052bcda6 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 13 Mar 2026 19:51:51 -0400 Subject: [PATCH 10/36] docs: clarify wasm32 path_to_file_url fallback comment Accurately document why the fallback exists (method absent on wasm32-unknown-unknown, not just a runtime failure) and note the percent-encoding limitation for reserved characters. 
Co-Authored-By: Claude Opus 4.6 --- vegafusion-core/src/runtime/plan_resolver.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vegafusion-core/src/runtime/plan_resolver.rs b/vegafusion-core/src/runtime/plan_resolver.rs index 46432dae5..1a0705213 100644 --- a/vegafusion-core/src/runtime/plan_resolver.rs +++ b/vegafusion-core/src/runtime/plan_resolver.rs @@ -196,6 +196,14 @@ pub fn path_to_file_url(path: &str) -> Result { }) } +/// Browser-wasm fallback: `url::Url::from_file_path` is unavailable on +/// `wasm32-unknown-unknown` (not compiled for that target in the `url` crate), +/// and `std::path` absolute-path semantics on that target do not recognize +/// POSIX-like virtual paths such as `/foo`. +/// +/// We therefore synthesize a `file:` URL for the restricted path forms we +/// expect here. Unlike `Url::from_file_path`, this does **not** percent-encode +/// reserved characters, so inputs must not contain `#`, `?`, etc. #[cfg(target_arch = "wasm32")] pub fn path_to_file_url(path: &str) -> Result { let normalized = path.replace('\\', "/"); From e50d7c55216e85002a4eea351b333ca1cd9bd35b Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 13 Mar 2026 19:56:51 -0400 Subject: [PATCH 11/36] fix: anchor scheme detection in has_url_scheme to start of string has_url_scheme used `contains("://")`, so relative references like `fetch?target=http://evil.com/data` were misclassified as absolute URLs. resolve_url then returned them as-is, causing downstream Url::parse to fail with RelativeUrlWithoutBase. Now validates that `://` is preceded by a valid RFC 3986 scheme prefix. Also replaced the duplicated inline `contains("://")` check in data.rs with a call to the fixed has_url_scheme. 
Co-Authored-By: Claude Opus 4.6 --- vegafusion-core/src/runtime/plan_resolver.rs | 41 +++++++++++++++++++- vegafusion-core/src/spec/data.rs | 2 +- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/vegafusion-core/src/runtime/plan_resolver.rs b/vegafusion-core/src/runtime/plan_resolver.rs index 1a0705213..134425fe6 100644 --- a/vegafusion-core/src/runtime/plan_resolver.rs +++ b/vegafusion-core/src/runtime/plan_resolver.rs @@ -145,8 +145,26 @@ pub fn resolve_data_base_url( /// Returns true if the string is already a URL (has a scheme) or is /// scheme-relative (starts with //). +/// +/// Per RFC 3986, a scheme is `[a-zA-Z][a-zA-Z0-9+.-]*:`. We check that +/// `://` appears only after a valid scheme prefix, so relative references +/// like `fetch?target=http://evil.com` are not misclassified as absolute. pub fn has_url_scheme(s: &str) -> bool { - s.contains("://") || s.starts_with("//") + if s.starts_with("//") { + return true; + } + if let Some(pos) = s.find("://") { + let prefix = &s[..pos]; + let mut chars = prefix.chars(); + match chars.next() { + Some(c) if c.is_ascii_alphabetic() => { + chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.')) + } + _ => false, + } + } else { + false + } } /// Returns true if `path` is an absolute filesystem path. @@ -276,6 +294,18 @@ mod tests { assert!(!has_url_scheme("data/cars.json")); } + #[test] + fn test_has_url_scheme_embedded_scheme_in_query() { + // Relative reference with "://" in a query parameter — must not be + // misclassified as an absolute URL. 
+ assert!(!has_url_scheme("fetch?target=http://evil.com/data")); + } + + #[test] + fn test_has_url_scheme_embedded_scheme_in_path() { + assert!(!has_url_scheme("foo/http://bar")); + } + // ── is_absolute_path ── #[test] @@ -416,6 +446,15 @@ mod tests { assert!(result.is_err()); } + #[test] + fn test_resolve_url_relative_with_embedded_scheme() { + // A relative reference that contains "://" in a query parameter should + // be joined against the base URL, not treated as absolute. + let base = Some("https://proxy.com/".to_string()); + let result = resolve_url("fetch?target=http://evil.com/data", &base).unwrap(); + assert_eq!(result, "https://proxy.com/fetch?target=http://evil.com/data"); + } + // ── resolve_data_base_url ── #[test] diff --git a/vegafusion-core/src/spec/data.rs b/vegafusion-core/src/spec/data.rs index a5d82d40c..fd7e2c05f 100644 --- a/vegafusion-core/src/spec/data.rs +++ b/vegafusion-core/src/spec/data.rs @@ -72,7 +72,7 @@ impl DataSpec { // later by resolve_url (absolute paths become file://, relatives use base URL). // Internal dataset URLs (table://, vegafusion+dataset://) are always supported. if let Some(StringOrSignalSpec::String(url_str)) = &self.url { - if url_str.contains("://") + if crate::runtime::has_url_scheme(url_str) && !url_str.starts_with("table://") && !url_str.starts_with("vegafusion+dataset://") { From c788d49bb9688c4a68ff84f2a07b18a1013c17af Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Fri, 13 Mar 2026 20:35:25 -0400 Subject: [PATCH 12/36] feat: add supports_arrow_tables capability and plan-aware materialization Add `supports_arrow_tables` bool to `ResolverCapabilities` proto so resolvers can declare whether they efficiently consume in-memory Arrow tables. DataFusion sets this true; remote resolvers (e.g. Spark) default to false. 
Replaces the `has_user_resolvers()` heuristic with `should_materialize(plan)` which inspects the actual LogicalPlan: - Materialize if all resolvers support Arrow tables (fast path) - Materialize if the plan has no ExternalTableProvider nodes - Otherwise keep lazy for resolver interception This avoids unnecessary lazy plans when a non-arrow resolver is registered but the specific plan doesn't involve any external tables. Co-Authored-By: Claude Opus 4.6 --- vegafusion-core/src/proto/tasks.proto | 5 ++ vegafusion-core/src/runtime/plan_resolver.rs | 23 +++++- vegafusion-python/src/plan_resolver.rs | 6 ++ vegafusion-runtime/src/data/pipeline.rs | 44 +++++++++++- vegafusion-runtime/src/data/tasks.rs | 10 +-- .../tests/test_plan_resolver.rs | 71 ++++++++++++++++--- 6 files changed, 140 insertions(+), 19 deletions(-) diff --git a/vegafusion-core/src/proto/tasks.proto b/vegafusion-core/src/proto/tasks.proto index 215f1cede..e68459dfb 100644 --- a/vegafusion-core/src/proto/tasks.proto +++ b/vegafusion-core/src/proto/tasks.proto @@ -193,4 +193,9 @@ message ResolverCapabilities { repeated string supported_schemes = 1; repeated string supported_format_types = 2; repeated string supported_extensions = 3; + // Whether this resolver can efficiently consume in-memory Arrow tables. + // When true, the runtime may eagerly materialize LogicalPlans into tables. + // When false (proto3 default), the runtime keeps data as lazy plans so the + // resolver can intercept and redirect execution (e.g. to Spark). 
+ bool supports_arrow_tables = 4; } \ No newline at end of file diff --git a/vegafusion-core/src/runtime/plan_resolver.rs b/vegafusion-core/src/runtime/plan_resolver.rs index 134425fe6..f6131466b 100644 --- a/vegafusion-core/src/runtime/plan_resolver.rs +++ b/vegafusion-core/src/runtime/plan_resolver.rs @@ -72,6 +72,11 @@ pub struct MergedCapabilities { pub supported_schemes: HashSet, pub supported_format_types: HashSet, pub supported_extensions: HashSet, + /// True when every resolver in the pipeline can efficiently consume + /// in-memory Arrow tables. When true, the runtime may eagerly materialize + /// LogicalPlans into tables. When false, data is kept as lazy plans so + /// resolvers that need plan-level access (e.g. Spark) can intercept them. + pub all_support_arrow_tables: bool, } impl MergedCapabilities { @@ -88,6 +93,7 @@ impl MergedCapabilities { .supported_extensions .extend(cap.supported_extensions.iter().cloned()); } + merged.all_support_arrow_tables = caps.iter().all(|c| c.supports_arrow_tables); merged } @@ -125,6 +131,7 @@ impl ResolverCapabilities { .into_iter() .map(String::from) .collect(), + supports_arrow_tables: true, } } } @@ -452,7 +459,10 @@ mod tests { // be joined against the base URL, not treated as absolute. 
let base = Some("https://proxy.com/".to_string()); let result = resolve_url("fetch?target=http://evil.com/data", &base).unwrap(); - assert_eq!(result, "https://proxy.com/fetch?target=http://evil.com/data"); + assert_eq!( + result, + "https://proxy.com/fetch?target=http://evil.com/data" + ); } // ── resolve_data_base_url ── @@ -515,12 +525,23 @@ mod tests { supported_schemes: vec!["spark".to_string()], supported_format_types: vec!["delta".to_string()], supported_extensions: vec![], + supports_arrow_tables: false, }; let merged = MergedCapabilities::from_resolver_capabilities(&[df_caps, custom_caps]); assert!(merged.supported_schemes.contains("http")); assert!(merged.supported_schemes.contains("spark")); assert!(merged.supported_format_types.contains("csv")); assert!(merged.supported_format_types.contains("delta")); + // DataFusion supports arrow but the custom resolver does not + assert!(!merged.all_support_arrow_tables); + } + + #[test] + fn test_merged_capabilities_all_support_arrow() { + let caps = MergedCapabilities::from_resolver_capabilities(&[ + ResolverCapabilities::datafusion_defaults(), + ]); + assert!(caps.all_support_arrow_tables); } #[test] diff --git a/vegafusion-python/src/plan_resolver.rs b/vegafusion-python/src/plan_resolver.rs index 2122d7ae7..225064c70 100644 --- a/vegafusion-python/src/plan_resolver.rs +++ b/vegafusion-python/src/plan_resolver.rs @@ -200,10 +200,16 @@ impl PlanResolver for PyPlanResolver { } }; + let supports_arrow_tables = match dict_ref.get_item("supports_arrow_tables") { + Ok(val) => val.extract().unwrap_or(false), + Err(_) => false, + }; + Ok(ResolverCapabilities { supported_schemes: extract_list("supported_schemes")?, supported_format_types: extract_list("supported_format_types")?, supported_extensions: extract_list("supported_extensions")?, + supports_arrow_tables, }) })(); result.unwrap_or_default() diff --git a/vegafusion-runtime/src/data/pipeline.rs b/vegafusion-runtime/src/data/pipeline.rs index e53eef472..f6f0a535c 
100644 --- a/vegafusion-runtime/src/data/pipeline.rs +++ b/vegafusion-runtime/src/data/pipeline.rs @@ -1,6 +1,9 @@ use std::sync::Arc; +use datafusion::datasource::source_as_provider; use datafusion::prelude::SessionContext; +use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; +use datafusion_expr::LogicalPlan as DFLogicalPlan; use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::{Result, VegaFusionError}; @@ -8,6 +11,7 @@ use vegafusion_core::proto::gen::tasks::ResolverCapabilities; use vegafusion_core::runtime::{MergedCapabilities, ParsedUrl, ResolutionResult}; use super::datafusion_resolver::DataFusionResolver; +use super::external_table::ExternalTableProvider; use super::plan_resolver::PlanResolver; /// Chains resolvers with a terminal `DataFusionResolver`. @@ -34,9 +38,21 @@ impl ResolverPipeline { } } - /// Whether any user-supplied resolvers are registered. - pub fn has_user_resolvers(&self) -> bool { - self.resolvers.len() > 1 + /// Whether the runtime should eagerly materialize a `LogicalPlan` into + /// an in-memory Arrow table. + /// + /// Materializes when: + /// 1. All resolvers support in-memory Arrow tables, OR + /// 2. The plan contains no `ExternalTableProvider` nodes (no resolver + /// will need to intercept it) + /// + /// Keeps the plan lazy otherwise, so resolvers that need plan-level + /// access (e.g. a Spark connector) can intercept external tables. + pub fn should_materialize(&self, plan: &LogicalPlan) -> bool { + if self.merged_capabilities().all_support_arrow_tables { + return true; + } + !has_external_table_scans(plan) } /// Access the shared `SessionContext`. 
@@ -73,6 +89,7 @@ impl ResolverPipeline { supported_schemes: merged.supported_schemes.into_iter().collect(), supported_format_types: merged.supported_format_types.into_iter().collect(), supported_extensions: merged.supported_extensions.into_iter().collect(), + supports_arrow_tables: merged.all_support_arrow_tables, } } @@ -94,3 +111,24 @@ impl ResolverPipeline { )) } } + +/// Returns true if the plan contains any `ExternalTableProvider` table scans. +fn has_external_table_scans(plan: &LogicalPlan) -> bool { + let mut found = false; + let _ = plan.apply(|node| { + if let DFLogicalPlan::TableScan(scan) = node { + if let Ok(provider) = source_as_provider(&scan.source) { + if provider + .as_any() + .downcast_ref::() + .is_some() + { + found = true; + return Ok(TreeNodeRecursion::Stop); + } + } + } + Ok(TreeNodeRecursion::Continue) + }); + found +} diff --git a/vegafusion-runtime/src/data/tasks.rs b/vegafusion-runtime/src/data/tasks.rs index 968f398f3..fd0a85c18 100644 --- a/vegafusion-runtime/src/data/tasks.rs +++ b/vegafusion-runtime/src/data/tasks.rs @@ -59,19 +59,21 @@ use {datafusion::prelude::ParquetReadOptions, vegafusion_common::error::ToExtern #[cfg(target_arch = "wasm32")] use object_store_wasm::HttpStore; -/// If no user resolvers are configured, eagerly materialize a `TaskValue::Plan` -/// into a `TaskValue::Table` via DataFusion execution. Passthrough otherwise. +/// Eagerly materialize a `TaskValue::Plan` into a `TaskValue::Table` when safe: +/// either all resolvers support Arrow tables, or the plan has no external table +/// nodes that a resolver would need to intercept. Otherwise keep it lazy. 
async fn maybe_materialize_plan( task_value: TaskValue, pipeline: &ResolverPipeline, ) -> Result { - if !pipeline.has_user_resolvers() { - if let TaskValue::Plan(plan) = task_value { + if let TaskValue::Plan(plan) = task_value { + if pipeline.should_materialize(&plan) { let table = DataFrame::new(pipeline.ctx().state(), plan) .collect_to_table() .await?; return Ok(TaskValue::Table(table)); } + return Ok(TaskValue::Plan(plan)); } Ok(task_value) } diff --git a/vegafusion-runtime/tests/test_plan_resolver.rs b/vegafusion-runtime/tests/test_plan_resolver.rs index ec9a03550..5b3b2936f 100644 --- a/vegafusion-runtime/tests/test_plan_resolver.rs +++ b/vegafusion-runtime/tests/test_plan_resolver.rs @@ -821,6 +821,7 @@ impl PlanResolver for CustomSchemeScanner { supported_schemes: vec!["custom".to_string()], supported_format_types: vec![], supported_extensions: vec![], + supports_arrow_tables: false, } } @@ -908,19 +909,40 @@ async fn test_scan_url_unknown_scheme_falls_through() { } #[tokio::test] -async fn test_has_user_resolvers() { +async fn test_should_materialize() { let ctx = Arc::new(datafusion::prelude::SessionContext::new()); + let schema = get_movies_schema(); + + // A plan with no external tables (just an empty MemTable) + let empty_batch = RecordBatch::new_empty(schema.clone()); + let mem_table = MemTable::try_new(schema.clone(), vec![vec![empty_batch]]).unwrap(); + let plain_plan = + LogicalPlanBuilder::scan("plain", provider_as_source(Arc::new(mem_table)), None) + .unwrap() + .build() + .unwrap(); + + // A plan with an ExternalTableProvider + let ext_provider = + ExternalTableProvider::new("custom".to_string(), schema.clone(), serde_json::json!({})); + let external_plan = + LogicalPlanBuilder::scan("ext", provider_as_source(Arc::new(ext_provider)), None) + .unwrap() + .build() + .unwrap(); - // No user resolvers + // DataFusion-only: all support arrow → always materialize let pipeline = ResolverPipeline::new(vec![], ctx.clone()); - 
assert!(!pipeline.has_user_resolvers()); + assert!(pipeline.should_materialize(&plain_plan)); + assert!(pipeline.should_materialize(&external_plan)); - // With a user resolver + // With a non-arrow resolver: materialize plain plans, not external ones let scanner = CustomSchemeScanner { - schema: get_movies_schema(), + schema: schema.clone(), }; let pipeline = ResolverPipeline::new(vec![Arc::new(scanner)], ctx); - assert!(pipeline.has_user_resolvers()); + assert!(pipeline.should_materialize(&plain_plan)); + assert!(!pipeline.should_materialize(&external_plan)); } #[tokio::test] @@ -1410,21 +1432,48 @@ async fn test_datafusion_resolver_executes_simple_plan() { } #[tokio::test] -async fn test_resolver_pipeline_has_user_resolvers() { +async fn test_resolver_pipeline_should_materialize() { let ctx = Arc::new(datafusion::prelude::SessionContext::new()); + let schema = get_movies_schema(); + // Plan with no external tables + let empty_batch = RecordBatch::new_empty(schema.clone()); + let mem_table = MemTable::try_new(schema.clone(), vec![vec![empty_batch]]).unwrap(); + let plain_plan = + LogicalPlanBuilder::scan("plain", provider_as_source(Arc::new(mem_table)), None) + .unwrap() + .build() + .unwrap(); + + // Plan with an external table + let ext = ExternalTableProvider::new("test".to_string(), schema, serde_json::json!({})); + let external_plan = LogicalPlanBuilder::scan("ext", provider_as_source(Arc::new(ext)), None) + .unwrap() + .build() + .unwrap(); + + // DataFusion-only: always materialize let empty_pipeline = ResolverPipeline::new(vec![], ctx.clone()); assert!( - !empty_pipeline.has_user_resolvers(), - "Empty pipeline should report no user resolvers" + empty_pipeline.should_materialize(&plain_plan), + "DataFusion-only pipeline should materialize plain plans" + ); + assert!( + empty_pipeline.should_materialize(&external_plan), + "DataFusion-only pipeline should materialize even external plans" ); + // With non-arrow resolver: materialize plain, not external let 
events = Arc::new(Mutex::new(Vec::new())); let resolver = ScriptedResolver::new("test", ResolverBehavior::PassThroughPlan, events); let resolvers: Vec> = vec![Arc::new(resolver)]; let pipeline_with_resolvers = ResolverPipeline::new(resolvers, ctx); assert!( - pipeline_with_resolvers.has_user_resolvers(), - "Pipeline with resolvers should report has user resolvers" + pipeline_with_resolvers.should_materialize(&plain_plan), + "Non-arrow pipeline should still materialize plans with no external tables" + ); + assert!( + !pipeline_with_resolvers.should_materialize(&external_plan), + "Non-arrow pipeline should not materialize plans with external tables" ); } From fd063873ebdebec1ca0bd2ff71672a89541b0c45 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sat, 14 Mar 2026 08:58:53 -0400 Subject: [PATCH 13/36] docs: add PlanResolver documentation, examples, and docstring improvements Add Sphinx feature page, three Python example scripts, and improved docstrings for the PlanResolver extensibility system. 
New files: - docs/source/features/plan_resolver.md - examples/python-examples/plan_resolver_basic.py - examples/python-examples/plan_resolver_url_scanning.py - examples/python-examples/plan_resolver_sql.py Docstring improvements: - capabilities() now documents supports_arrow_tables key - resolve_plan_proto() and resolve_plan() have Args/Returns sections - ExternalDataset.schema, .metadata, .data properties have docstrings - Rust PlanResolver trait has top-level doc comment Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/source/features/features.md | 1 + docs/source/features/inline_datasets.md | 2 + docs/source/features/plan_resolver.md | 114 ++++++++++++++++++ .../python-examples/plan_resolver_basic.py | 81 +++++++++++++ examples/python-examples/plan_resolver_sql.py | 85 +++++++++++++ .../plan_resolver_url_scanning.py | 85 +++++++++++++ vegafusion-python/vegafusion/dataset.py | 3 + vegafusion-python/vegafusion/plan_resolver.py | 60 +++++++-- vegafusion-runtime/src/data/plan_resolver.rs | 18 +++ 9 files changed, 436 insertions(+), 13 deletions(-) create mode 100644 docs/source/features/plan_resolver.md create mode 100644 examples/python-examples/plan_resolver_basic.py create mode 100644 examples/python-examples/plan_resolver_sql.py create mode 100644 examples/python-examples/plan_resolver_url_scanning.py diff --git a/docs/source/features/features.md b/docs/source/features/features.md index 7c7ed791e..1b37fffe0 100644 --- a/docs/source/features/features.md +++ b/docs/source/features/features.md @@ -13,6 +13,7 @@ transform_spec transform_extract chart_state inline_datasets +plan_resolver grpc embed jupyter_widget diff --git a/docs/source/features/inline_datasets.md b/docs/source/features/inline_datasets.md index 3a054ce24..1c7f0d2c5 100644 --- a/docs/source/features/inline_datasets.md +++ b/docs/source/features/inline_datasets.md @@ -37,3 +37,5 @@ See [inline_datasets.py](https://github.com/vega/vegafusion/tree/main/examples/p In Rust, `inline_datasets` should 
be a `HashMap` from dataset names (e.g. `movies` in the example above) to `VegaFusionDataset` instances. `VegaFusionDataset` is an enum that may be either a `VegaFusionTable` (which is a thin wrapper around Arrow RecordBatches), or a DataFusion [`LogicalPlan`](https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.LogicalPlan.html) (which represents an arbitrary DataFusion query). See [inline_datasets.rs](https://github.com/vega/vegafusion/tree/main/examples/rust-examples/examples/inline_datasets.rs) for a complete example using a `VegaFusionTable`, and see [inline_datasets_plan.rs](https://github.com/vega/vegafusion/tree/main/examples/rust-examples/examples/inline_datasets_plan.rs) for a complete example using a DataFusion ``LogicalPlan``. + +For more advanced data source integration (custom URL schemes, SQL transpilation, remote execution), see [Plan Resolver](./plan_resolver.md). diff --git a/docs/source/features/plan_resolver.md b/docs/source/features/plan_resolver.md new file mode 100644 index 000000000..868f8076e --- /dev/null +++ b/docs/source/features/plan_resolver.md @@ -0,0 +1,114 @@ +# Plan Resolver + +PlanResolver lets you connect custom data sources to VegaFusion. Use it when data lives in an external system (Spark, Snowflake, DuckDB, a custom API) and you want to push computation there instead of pulling it all into memory. For data you already have in Python as DataFrames or Arrow tables, [inline datasets](./inline_datasets.md) are simpler. + +:::{note} +`resolve_table`, `resolve_plan_proto` (bytes variant), and `unparse_to_sql` with bytes require no additional dependencies beyond `vegafusion`.
+ +`scan_url`, `resolve_plan` (deserialized `LogicalPlanNode`), `external_table_scan_node`, and `inline_table_scan_node` require the protobuf package: + +``` +pip install vegafusion[plan-resolver] +``` +::: + +## Python + +Override one of these methods on `PlanResolver` (simplest first): + +- `resolve_table`: return data for each external table independently. The default `resolve_plan` walks the plan and calls this for every external table. +- `resolve_plan` / `resolve_plan_proto`: receive the entire logical plan. Overriding this supersedes `resolve_table` since the runtime calls `resolve_plan` directly; `resolve_table` is only reached via the default implementation. + +### resolve_table + +```python +class TableResolver(PlanResolver): + def __init__(self, table): + self._table = table + + def resolve_table(self, name, scheme, schema, metadata=None, + projected_columns=None): + return self._table + +source = pa.table({"x": [1, 5, 10], "y": ["a", "b", "c"]}) +ext = ExternalDataset(scheme="custom", schema=source.schema, data=source) +resolver = TableResolver(source) + +rt = vf.VegaFusionRuntime(plan_resolver=resolver) +datasets, _ = rt.pre_transform_datasets( + spec, datasets=["filtered"], + inline_datasets={"source": ext}, dataset_format="pyarrow", +) +``` + +VegaFusion calls `resolve_table` to get the data, then applies Vega transforms (filter, aggregate, etc.) via DataFusion. No protobuf dependency is needed. + +See [plan_resolver_basic.py](https://github.com/vega/vegafusion/tree/main/examples/python-examples/plan_resolver_basic.py) for a complete example. + +### capabilities + scan_url + +For custom URL schemes in Vega specs (e.g. 
`"url": "mydata://database/sales"`), override `capabilities()` and `scan_url()`: + +```python +class SalesResolver(PlanResolver): + def capabilities(self): + return {"supported_schemes": ["mydata"]} + + def scan_url(self, parsed_url): + if parsed_url["scheme"] == "mydata": + schema = pa.schema([("product", pa.utf8()), ("revenue", pa.int64())]) + return external_table_scan_node( + table_name="sales_data", schema=schema, scheme="mydata", + ) + return None # pass to next resolver + + def resolve_table(self, name, scheme, schema, metadata=None, + projected_columns=None): + return pa.table({"product": ["Widget", "Gadget"], "revenue": [1200, 3400]}) +``` + +`capabilities()` tells the planner that `mydata://` URLs are supported. `scan_url()` creates an `ExternalTableProvider` plan node, and `resolve_table()` provides the data at execution time. + +See [plan_resolver_url_scanning.py](https://github.com/vega/vegafusion/tree/main/examples/python-examples/plan_resolver_url_scanning.py) for a complete example. + +### resolve_plan + unparse_to_sql + +Override `resolve_plan_proto` to receive the serialized logical plan. Use `unparse_to_sql()` to convert it to SQL: + +```python +class SqlResolver(PlanResolver): + def resolve_plan_proto(self, plan_bytes, datasets): + sql = unparse_to_sql(plan_bytes, dialect="postgres") + # Execute SQL against your database and return the result + return execute_query(sql) +``` + +`resolve_plan_proto` receives protobuf bytes that can be passed directly to `unparse_to_sql()` without deserialization. To inspect or modify the plan tree, use `resolve_plan()` instead (it receives a deserialized `LogicalPlanNode`). + +Supported SQL dialects: `"default"`, `"postgres"`, `"mysql"`, `"sqlite"`, `"duckdb"`, `"bigquery"`. + +See [plan_resolver_sql.py](https://github.com/vega/vegafusion/tree/main/examples/python-examples/plan_resolver_sql.py) for a complete example. + +`PlanResolver` cannot be used with `grpc_connect()` (resolvers run in-process). 
Set `thread_safe = False` for backends with thread-affine connections (e.g. DuckDB). Set `skip_when_no_external_tables = False` to receive all plans (e.g. for logging). The `capabilities()` dict also accepts `supports_arrow_tables: True` to let the runtime eagerly materialize plans into Arrow tables. + +### API Reference + +```{eval-rst} +.. autoclass:: vegafusion.PlanResolver + :members: + +.. autoclass:: vegafusion.ExternalDataset + :members: + +.. autofunction:: vegafusion.plan_resolver.external_table_scan_node + +.. autofunction:: vegafusion.plan_resolver.unparse_to_sql + +.. autofunction:: vegafusion.plan_resolver.inline_table_scan_node +``` + +## Rust + +The `PlanResolver` trait in `vegafusion-runtime` provides the same two-phase architecture (scan_url at planning time, resolve_table/resolve_plan at execution time). + +See [custom_resolver.rs](https://github.com/vega/vegafusion/tree/main/examples/rust-examples/examples/custom_resolver.rs) for a working example, and the [vegafusion-runtime docs on docs.rs](https://docs.rs/vegafusion-runtime/) for the full API. diff --git a/examples/python-examples/plan_resolver_basic.py b/examples/python-examples/plan_resolver_basic.py new file mode 100644 index 000000000..53f54df77 --- /dev/null +++ b/examples/python-examples/plan_resolver_basic.py @@ -0,0 +1,81 @@ +# Demonstrates the simplest PlanResolver pattern: override resolve_table to provide +# data for an ExternalDataset. No protobuf dependency needed. 
+ +import json +from typing import Any + +import pyarrow as pa + +import vegafusion as vf +from vegafusion import ExternalDataset, PlanResolver + + +def main() -> None: + source_table = pa.table({"x": [1, 5, 10], "y": ["a", "b", "c"]}) + ext = ExternalDataset(scheme="custom", schema=source_table.schema, data=source_table) + resolver = TableResolver(source_table) + rt = vf.VegaFusionRuntime(plan_resolver=resolver) + + spec = make_spec() + datasets, warnings = rt.pre_transform_datasets( + spec, + datasets=["filtered"], + inline_datasets={"source": ext}, + dataset_format="pyarrow", + ) + + assert len(datasets) == 1 + result = datasets[0] + assert result.num_rows == 2 + assert result.column("x").to_pylist() == [5, 10] + assert result.column("y").to_pylist() == ["b", "c"] + + print("Result after filter (x > 3):") + print(result.to_pandas().to_string(index=False)) + + +class TableResolver(PlanResolver): + """Returns a fixed table for any resolve_table call.""" + + def __init__(self, table: pa.Table) -> None: + self._table = table + + def resolve_table( + self, + name: str, + scheme: str, + schema: Any, + metadata: dict[str, Any] | None = None, + projected_columns: list[str] | None = None, + ) -> pa.Table: + return self._table + + +def make_spec() -> dict[str, Any]: + return json.loads( + """ +{ + "$schema": "https://vega.github.io/schema/vega/v5.json", + "data": [ + { + "name": "source", + "url": "table://source" + }, + { + "name": "filtered", + "source": "source", + "transform": [ + { + "type": "filter", + "expr": "datum.x > 3" + } + ] + } + ] +} + """ + ) + + +if __name__ == "__main__": + main() diff --git a/examples/python-examples/plan_resolver_sql.py b/examples/python-examples/plan_resolver_sql.py new file mode 100644 index 000000000..c69d90bc2 --- /dev/null +++ b/examples/python-examples/plan_resolver_sql.py @@ -0,0 +1,85 @@ +# Demonstrates SQL transpilation using resolve_plan_proto() + unparse_to_sql(). 
+# The resolver receives a serialized logical plan, converts it to SQL, and returns +# a result table. In a real application you would execute the SQL against a database. + +import json +from typing import Any + +import pyarrow as pa + +import vegafusion as vf +from vegafusion import ExternalDataset, PlanResolver +from vegafusion.plan_resolver import unparse_to_sql + + +def main() -> None: + source_table = pa.table({"x": [1, 5, 10], "y": ["a", "b", "c"]}) + ext = ExternalDataset(scheme="table", schema=source_table.schema, data=source_table) + + resolver = SqlTranspileResolver(source_table=source_table) + rt = vf.VegaFusionRuntime(plan_resolver=resolver) + + spec = get_spec() + datasets, warnings = rt.pre_transform_datasets( + spec, + datasets=["filtered"], + inline_datasets={"source": ext}, + dataset_format="pyarrow", + ) + + assert warnings == [] + result = datasets[0] + assert result.column("x").to_pylist() == [5, 10] + assert result.column("y").to_pylist() == ["b", "c"] + assert resolver.captured_sql is not None + assert "SELECT" in resolver.captured_sql + + print("Captured SQL (postgres dialect):") + print(resolver.captured_sql) + print() + print("Result table:") + print(result) + + +class SqlTranspileResolver(PlanResolver): + """Converts the logical plan to Postgres-dialect SQL.""" + + def __init__(self, source_table: pa.Table) -> None: + self.source_table = source_table + self.captured_sql: str | None = None + + def resolve_plan_proto( + self, plan_bytes: bytes, datasets: dict[str, Any] + ) -> pa.Table: + sql = unparse_to_sql(plan_bytes, dialect="postgres") + self.captured_sql = sql + # In a real scenario you would execute `sql` against a database. 
+ return pa.table({"x": [5, 10], "y": ["b", "c"]}) + + +def get_spec() -> dict[str, Any]: + return json.loads(""" +{ + "$schema": "https://vega.github.io/schema/vega/v5.json", + "data": [ + { + "name": "source", + "url": "table://source" + }, + { + "name": "filtered", + "source": "source", + "transform": [ + { + "type": "filter", + "expr": "datum.x > 3" + } + ] + } + ] +} + """) + + +if __name__ == "__main__": + main() diff --git a/examples/python-examples/plan_resolver_url_scanning.py b/examples/python-examples/plan_resolver_url_scanning.py new file mode 100644 index 000000000..57511eb28 --- /dev/null +++ b/examples/python-examples/plan_resolver_url_scanning.py @@ -0,0 +1,85 @@ +# Requires: pip install vegafusion[plan-resolver] +""" +Demonstrates the URL scanning pattern for custom URL schemes: +capabilities() + scan_url() + resolve_table() + +VegaFusion's PlanResolver lets you register custom URL schemes so that +data references like "mydata://database/sales" in a Vega spec are resolved +by your own Python code rather than fetched over HTTP. 
+""" +import json +from typing import Any + +import pyarrow as pa +import vegafusion as vf +from vegafusion import PlanResolver +from vegafusion.plan_resolver import external_table_scan_node + + +def main(): + resolver = SalesDataResolver() + rt = vf.VegaFusionRuntime(plan_resolver=resolver) + + spec = make_spec() + datasets, warnings = rt.pre_transform_datasets( + spec, datasets=["sales"], dataset_format="pyarrow" + ) + + assert warnings == [] + assert len(datasets) == 1 + + table = datasets[0] + assert table.column("product").to_pylist() == ["Widget", "Gadget", "Gizmo"] + assert table.column("revenue").to_pylist() == [1200, 3400, 560] + print("Result table:") + print(table.to_pandas().to_string(index=False)) + print("\nAll assertions passed.") + + +class SalesDataResolver(PlanResolver): + """Resolves URLs with the 'mydata' scheme using in-memory data.""" + + def capabilities(self) -> dict[str, list[str]]: + return {"supported_schemes": ["mydata"]} + + def scan_url(self, parsed_url: dict[str, Any]) -> Any: + if parsed_url["scheme"] == "mydata": + schema = pa.schema([("product", pa.utf8()), ("revenue", pa.int64())]) + return external_table_scan_node( + table_name="sales_data", + schema=schema, + scheme="mydata", + ) + return None + + def resolve_table( + self, + name: str, + scheme: str, + schema: Any, + metadata: dict[str, Any] | None = None, + projected_columns: list[str] | None = None, + ) -> pa.Table: + return pa.table({ + "product": ["Widget", "Gadget", "Gizmo"], + "revenue": [1200, 3400, 560], + }) + + +def make_spec() -> dict[str, Any]: + spec_str = """ +{ + "$schema": "https://vega.github.io/schema/vega/v5.json", + "data": [ + { + "name": "sales", + "url": "mydata://database/sales" + } + ] +} + """ + return json.loads(spec_str) + + +if __name__ == "__main__": + main() diff --git a/vegafusion-python/vegafusion/dataset.py b/vegafusion-python/vegafusion/dataset.py index 6cb8f3924..93ec78ec9 100644 --- a/vegafusion-python/vegafusion/dataset.py +++ 
b/vegafusion-python/vegafusion/dataset.py @@ -80,12 +80,15 @@ def scheme(self) -> str: @property def schema(self) -> Schema: + """Arrow schema of the external table (``arro3.core.Schema``).""" return self._schema @property def metadata(self) -> dict[str, Any]: + """JSON-serializable metadata dict propagated through the plan.""" return self._metadata @property def data(self) -> Any: # noqa: ANN401 + """The opaque data object, or ``None`` if not provided.""" return self._data diff --git a/vegafusion-python/vegafusion/plan_resolver.py b/vegafusion-python/vegafusion/plan_resolver.py index d1efa8fb1..b8cf74247 100644 --- a/vegafusion-python/vegafusion/plan_resolver.py +++ b/vegafusion-python/vegafusion/plan_resolver.py @@ -46,10 +46,15 @@ class ResolvedPlan: class PlanResolver: """Base class for plan resolvers. - Override one of these (checked in priority order): + Override one of these (simplest first): - 1. ``resolve_table`` — provide data for each external table independently - 2. ``resolve_plan_proto`` / ``resolve_plan`` — full control over resolution + - ``resolve_table``: return data for each external table independently. + The default ``resolve_plan`` walks the plan and calls this for every + ``ExternalTableProvider`` node. + - ``resolve_plan_proto`` / ``resolve_plan``: receive the entire logical + plan. Overriding this supersedes ``resolve_table`` since the runtime + calls ``resolve_plan`` directly; ``resolve_table`` is only reached + via the default implementation. For ``resolve_plan``, override either the ``_proto`` variant (raw bytes) or the non-``_proto`` variant (deserialized ``LogicalPlanNode``). The ``_proto`` @@ -75,7 +80,7 @@ class PlanResolver: callbacks run on the main thread. Set to False for backends with thread-affine connections (e.g. DuckDB in-memory databases).""" - def capabilities(self) -> dict[str, list[str]]: + def capabilities(self) -> dict[str, Any]: """Declare URL patterns this resolver supports at planning time. 
Override to advertise additional URL scheme/format support beyond @@ -83,9 +88,13 @@ def capabilities(self) -> dict[str, list[str]]: with csv, tsv, json, arrow, parquet formats). Returns: - Dict with optional keys: ``'supported_schemes'``, - ``'supported_format_types'``, ``'supported_extensions'``. - Values are lists of strings. + Dict with optional keys: + + - ``'supported_schemes'``: list of URL schemes (e.g. ``["spark", "snowflake"]``) + - ``'supported_format_types'``: list of format types (e.g. ``["csv", "parquet"]``) + - ``'supported_extensions'``: list of file extensions (e.g. ``[".csv", ".parquet"]``) + - ``'supports_arrow_tables'``: bool (default ``False``). When ``True``, + the runtime eagerly materializes plans into Arrow tables. """ return {} @@ -162,7 +171,20 @@ def resolve_plan_proto( """Resolve a plan given raw protobuf bytes. The default implementation deserializes into a - LogicalPlanNode and calls resolve_plan(). + ``LogicalPlanNode`` and delegates to :meth:`resolve_plan`. + + Override this (instead of ``resolve_plan``) when you only need + the serialized bytes, e.g. to pass them directly to + :func:`unparse_to_sql` without a deserialization round-trip. + + Args: + plan_bytes: Serialized ``LogicalPlanNode`` protobuf bytes. + datasets: Dict mapping table names to :class:`ExternalDataset` + instances for every ``ExternalTableProvider`` in the plan. + + Returns: + An Arrow-compatible table (full execution) or a + :class:`ResolvedPlan` (plan rewriting with sidecar data). """ try: from vegafusion.proto.datafusion_pb2 import ( @@ -189,12 +211,24 @@ def resolve_plan( logical_plan: LogicalPlanNode, datasets: dict[str, ExternalDataset], ) -> ResolutionResult: - """Resolve a plan given a deserialized LogicalPlanNode. + """Resolve a plan given a deserialized ``LogicalPlanNode``. - The default implementation walks the plan tree looking for - CustomTableScanNode nodes that correspond to ExternalTableProvider - entries. 
For each, it calls resolve_table() and replaces the node - with an inline_table_scan_node. + The default implementation walks the plan tree, finds + ``ExternalTableProvider`` nodes, calls :meth:`resolve_table` for + each, and replaces them with :func:`inline_table_scan_node` markers. + + Override this for full control over plan rewriting, e.g. + to transpile the plan to SQL and execute it remotely. + + Args: + logical_plan: Deserialized ``LogicalPlanNode`` protobuf message. + datasets: Dict mapping table names to :class:`ExternalDataset` + instances for every ``ExternalTableProvider`` in the plan. + + Returns: + An Arrow-compatible table (for full execution by the resolver) + or a :class:`ResolvedPlan` (rewritten plan with sidecar Arrow + data for DataFusion to execute). """ sidecar: dict[str, Table] = {} self._resolve_external_tables(logical_plan, datasets, sidecar) diff --git a/vegafusion-runtime/src/data/plan_resolver.rs b/vegafusion-runtime/src/data/plan_resolver.rs index c987a7633..4814b8d07 100644 --- a/vegafusion-runtime/src/data/plan_resolver.rs +++ b/vegafusion-runtime/src/data/plan_resolver.rs @@ -15,8 +15,26 @@ use vegafusion_core::runtime::{ParsedUrl, ResolutionResult}; use super::external_table::ExternalTableProvider; +/// Trait for custom data source integration with VegaFusion. +/// +/// Resolvers participate in a two-phase pipeline: +/// +/// 1. **Planning phase**: [`capabilities`](Self::capabilities) declares supported +/// URL schemes/formats, and [`scan_url`](Self::scan_url) converts URLs into +/// `LogicalPlan` nodes (typically `ExternalTableProvider` markers). +/// +/// 2. **Execution phase**: [`resolve_table`](Self::resolve_table) or +/// [`resolve_plan`](Self::resolve_plan) provides data for external table +/// references or rewrites the plan for remote execution. 
+/// +/// Override one of (simplest first): +/// - [`resolve_table`](Self::resolve_table): per-table data provider, called +/// by the default `resolve_plan` for each `ExternalTableProvider` node. +/// - [`resolve_plan`](Self::resolve_plan): receives the entire plan. Overriding +/// this supersedes `resolve_table` since the runtime calls `resolve_plan` directly. #[async_trait] pub trait PlanResolver: Send + Sync + 'static { + /// Human-readable name for logging and error messages. fn name(&self) -> &str; /// Declare what URL patterns this resolver supports at planning time. From 37fdf1105586a0d0d87623ec56feaa01a56eae21 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sat, 14 Mar 2026 15:36:22 -0400 Subject: [PATCH 14/36] style: fix ruff formatting in example scripts Co-Authored-By: Claude Opus 4.6 (1M context) --- examples/python-examples/plan_resolver_basic.py | 4 +++- .../python-examples/plan_resolver_url_scanning.py | 11 +++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/python-examples/plan_resolver_basic.py b/examples/python-examples/plan_resolver_basic.py index 53f54df77..9fe3b9822 100644 --- a/examples/python-examples/plan_resolver_basic.py +++ b/examples/python-examples/plan_resolver_basic.py @@ -12,7 +12,9 @@ def main() -> None: source_table = pa.table({"x": [1, 5, 10], "y": ["a", "b", "c"]}) - ext = ExternalDataset(scheme="custom", schema=source_table.schema, data=source_table) + ext = ExternalDataset( + scheme="custom", schema=source_table.schema, data=source_table + ) resolver = TableResolver(source_table) rt = vf.VegaFusionRuntime(plan_resolver=resolver) diff --git a/examples/python-examples/plan_resolver_url_scanning.py b/examples/python-examples/plan_resolver_url_scanning.py index 57511eb28..86068b171 100644 --- a/examples/python-examples/plan_resolver_url_scanning.py +++ b/examples/python-examples/plan_resolver_url_scanning.py @@ -7,6 +7,7 @@ data references like "mydata://database/sales" in a Vega spec are resolved by 
your own Python code rather than fetched over HTTP. """ + import json from typing import Any @@ -60,10 +61,12 @@ def resolve_table( metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, ) -> pa.Table: - return pa.table({ - "product": ["Widget", "Gadget", "Gizmo"], - "revenue": [1200, 3400, 560], - }) + return pa.table( + { + "product": ["Widget", "Gadget", "Gizmo"], + "revenue": [1200, 3400, 560], + } + ) def make_spec() -> dict[str, Any]: From 4df3bfbc906a2ecc8b2d02563d5377b829295372 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 16 Mar 2026 08:46:02 -0400 Subject: [PATCH 15/36] style: add future annotations import to example scripts Fixes ruff FA102 (PEP 604 union syntax without future annotations). Co-Authored-By: Claude Opus 4.6 (1M context) --- examples/python-examples/plan_resolver_basic.py | 2 ++ examples/python-examples/plan_resolver_url_scanning.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/examples/python-examples/plan_resolver_basic.py b/examples/python-examples/plan_resolver_basic.py index 9fe3b9822..c85b8ce56 100644 --- a/examples/python-examples/plan_resolver_basic.py +++ b/examples/python-examples/plan_resolver_basic.py @@ -1,6 +1,8 @@ # Demonstrates the simplest PlanResolver pattern: override resolve_table to provide # data for an ExternalDataset. No protobuf dependency needed. +from __future__ import annotations + import json from typing import Any diff --git a/examples/python-examples/plan_resolver_url_scanning.py b/examples/python-examples/plan_resolver_url_scanning.py index 86068b171..fdcb37810 100644 --- a/examples/python-examples/plan_resolver_url_scanning.py +++ b/examples/python-examples/plan_resolver_url_scanning.py @@ -8,6 +8,8 @@ by your own Python code rather than fetched over HTTP. 
""" +from __future__ import annotations + import json from typing import Any From bff8ac5f30ac77305fea4f09f51a1509e1304520 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 16 Mar 2026 09:01:10 -0400 Subject: [PATCH 16/36] docs: remove journey comment from DataBaseUrlSetting Co-Authored-By: Claude Opus 4.6 (1M context) --- vegafusion-core/src/runtime/plan_resolver.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vegafusion-core/src/runtime/plan_resolver.rs b/vegafusion-core/src/runtime/plan_resolver.rs index f6131466b..10c8abed2 100644 --- a/vegafusion-core/src/runtime/plan_resolver.rs +++ b/vegafusion-core/src/runtime/plan_resolver.rs @@ -13,14 +13,13 @@ pub enum ResolutionResult { Plan(LogicalPlan), } -/// Explicit representation of the data_base_url setting at public API boundaries. -/// This avoids overloading Option with an empty-string sentinel. +/// Three-state base URL setting for public API boundaries. #[derive(Clone, Debug, Default)] pub enum DataBaseUrlSetting { /// Use the default CDN base URL (vega-datasets) #[default] Default, - /// Disable base URL — relative paths produce an error + /// Disable base URL; relative paths produce an error Disabled, /// Use a custom base URL (scheme URL or absolute path) Custom(String), @@ -65,8 +64,8 @@ pub struct ParsedUrl { pub parse: Option, } -/// Merged capabilities from all resolvers, with HashSet fields for O(1) lookup. -/// Built by unioning the ResolverCapabilities from each resolver in the pipeline. +/// Merged capabilities from all resolvers, built by unioning the +/// ResolverCapabilities from each resolver in the pipeline. #[derive(Clone, Debug, Default)] pub struct MergedCapabilities { pub supported_schemes: HashSet, @@ -75,7 +74,7 @@ pub struct MergedCapabilities { /// True when every resolver in the pipeline can efficiently consume /// in-memory Arrow tables. When true, the runtime may eagerly materialize /// LogicalPlans into tables. 
When false, data is kept as lazy plans so - /// resolvers that need plan-level access (e.g. Spark) can intercept them. + /// resolvers that need plan-level access can intercept them. pub all_support_arrow_tables: bool, } From 9985a33192dbdb2b99b9d64d3428179c42dfc637 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 16 Mar 2026 09:10:17 -0400 Subject: [PATCH 17/36] refactor: simplify has_url_scheme with regex, fix line length Replace manual char-by-char RFC 3986 scheme validation with a compiled regex. Also fix E501 line-length violations in the capabilities() docstring. Co-Authored-By: Claude Opus 4.6 (1M context) --- vegafusion-core/src/runtime/plan_resolver.rs | 29 +++++-------------- vegafusion-python/vegafusion/plan_resolver.py | 9 ++++-- 2 files changed, 14 insertions(+), 24 deletions(-) diff --git a/vegafusion-core/src/runtime/plan_resolver.rs b/vegafusion-core/src/runtime/plan_resolver.rs index 10c8abed2..5c9387b7e 100644 --- a/vegafusion-core/src/runtime/plan_resolver.rs +++ b/vegafusion-core/src/runtime/plan_resolver.rs @@ -1,6 +1,8 @@ use crate::proto::gen::pretransform::DataBaseUrlSettingProto; use crate::proto::gen::tasks::ResolverCapabilities; +use regex::Regex; use std::collections::HashSet; +use std::sync::LazyLock; use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::Result; @@ -149,28 +151,13 @@ pub fn resolve_data_base_url( } } -/// Returns true if the string is already a URL (has a scheme) or is -/// scheme-relative (starts with //). -/// -/// Per RFC 3986, a scheme is `[a-zA-Z][a-zA-Z0-9+.-]*:`. We check that -/// `://` appears only after a valid scheme prefix, so relative references -/// like `fetch?target=http://evil.com` are not misclassified as absolute. 
+static URL_SCHEME_RE: LazyLock<Regex> = + LazyLock::new(|| Regex::new(r"^(//|[a-zA-Z][a-zA-Z0-9+.\-]*://)").unwrap()); + +/// Returns true if the string is already a URL (starts with an RFC 3986 scheme +/// followed by `://`) or is scheme-relative (starts with `//`). pub fn has_url_scheme(s: &str) -> bool { - if s.starts_with("//") { - return true; - } - if let Some(pos) = s.find("://") { - let prefix = &s[..pos]; - let mut chars = prefix.chars(); - match chars.next() { - Some(c) if c.is_ascii_alphabetic() => { - chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.')) - } - _ => false, - } - } else { - false - } + URL_SCHEME_RE.is_match(s) } /// Returns true if `path` is an absolute filesystem path. diff --git a/vegafusion-python/vegafusion/plan_resolver.py b/vegafusion-python/vegafusion/plan_resolver.py index b8cf74247..33e23c475 100644 --- a/vegafusion-python/vegafusion/plan_resolver.py +++ b/vegafusion-python/vegafusion/plan_resolver.py @@ -90,9 +90,12 @@ def capabilities(self) -> dict[str, Any]: Returns: Dict with optional keys: - - ``'supported_schemes'``: list of URL schemes (e.g. ``["spark", "snowflake"]``) - - ``'supported_format_types'``: list of format types (e.g. ``["csv", "parquet"]``) - - ``'supported_extensions'``: list of file extensions (e.g. ``[".csv", ".parquet"]``) + - ``'supported_schemes'``: list of URL schemes + (e.g. ``["spark", "snowflake"]``) + - ``'supported_format_types'``: list of format types + (e.g. ``["csv", "parquet"]``) + - ``'supported_extensions'``: list of file extensions + (e.g. ``[".csv", ".parquet"]``) - ``'supports_arrow_tables'``: bool (default ``False``). When ``True``, the runtime eagerly materializes plans into Arrow tables.
""" From 45c65cecd737d7dcee16568c11dabb605dde4f5c Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 16 Mar 2026 11:14:58 -0400 Subject: [PATCH 18/36] style: remove added comments from tasks.proto Co-Authored-By: Claude Opus 4.6 (1M context) --- vegafusion-core/src/proto/tasks.proto | 9 --------- 1 file changed, 9 deletions(-) diff --git a/vegafusion-core/src/proto/tasks.proto b/vegafusion-core/src/proto/tasks.proto index e68459dfb..ad806b2a9 100644 --- a/vegafusion-core/src/proto/tasks.proto +++ b/vegafusion-core/src/proto/tasks.proto @@ -4,8 +4,6 @@ package tasks; import "expression.proto"; import "transforms.proto"; -// ## Materialized Task Value -// Represents a fully materialized (computed) task value, either a scalar or table message MaterializedTaskValue { oneof data { /* @@ -79,9 +77,6 @@ message DataUrlTask { int32 batch_size = 3; ScanUrlFormat format_type = 4; transforms.TransformPipeline pipeline = 5; - // Base URL for resolving relative URLs in Url::Expr tasks at eval time. - // Written by MakeTasksVisitor from PlannerConfig.data_base_url. - // Absent = no base URL, present = that base URL. optional string data_base_url = 6; } @@ -193,9 +188,5 @@ message ResolverCapabilities { repeated string supported_schemes = 1; repeated string supported_format_types = 2; repeated string supported_extensions = 3; - // Whether this resolver can efficiently consume in-memory Arrow tables. - // When true, the runtime may eagerly materialize LogicalPlans into tables. - // When false (proto3 default), the runtime keeps data as lazy plans so the - // resolver can intercept and redirect execution (e.g. to Spark). 
bool supports_arrow_tables = 4; } \ No newline at end of file From 6d887cbd4503fe65694fb6b8b12013297710d122 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 16 Mar 2026 11:33:19 -0400 Subject: [PATCH 19/36] docs: add three-state comment to DataBaseUrlSettingProto Co-Authored-By: Claude Opus 4.6 (1M context) --- vegafusion-core/src/proto/pretransform.proto | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vegafusion-core/src/proto/pretransform.proto b/vegafusion-core/src/proto/pretransform.proto index 0718773a8..694bf3e53 100644 --- a/vegafusion-core/src/proto/pretransform.proto +++ b/vegafusion-core/src/proto/pretransform.proto @@ -4,6 +4,8 @@ package pretransform; import "tasks.proto"; import "google/protobuf/empty.proto"; +// Three states: field absent = use default CDN, custom = use this URL, +// disabled = no base URL (relative paths error). message DataBaseUrlSettingProto { oneof kind { string custom = 1; From 97d0c4fc6b7206bff915ce342ee47ec50b009a25 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 16 Mar 2026 11:46:15 -0400 Subject: [PATCH 20/36] docs: clarify QueryRequest is for WASM transport, not gRPC Co-Authored-By: Claude Opus 4.6 (1M context) --- vegafusion-core/src/proto/services.proto | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vegafusion-core/src/proto/services.proto b/vegafusion-core/src/proto/services.proto index f313808c6..00cfba954 100644 --- a/vegafusion-core/src/proto/services.proto +++ b/vegafusion-core/src/proto/services.proto @@ -19,6 +19,9 @@ message GetCapabilitiesResult { tasks.ResolverCapabilities capabilities = 1; } +// Multiplexed envelope used by the WASM runtime's query_fn callback, +// including gRPC-Web mode. Bundles task graph queries and capability +// requests into one message so a single JS function can handle both. 
message QueryRequest { oneof request { tasks.TaskGraphValueRequest task_graph_values = 1; From ade31639470fa01a82db70fe61102f31472d98ae Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 16 Mar 2026 15:15:01 -0400 Subject: [PATCH 21/36] refactor: remove ResolverCapabilities from planning, unify URL resolution at runtime Remove the capabilities system (supported_schemes, supported_format_types, supported_extensions) from the planner. The planner no longer checks URL schemes or format types against resolver capabilities -- all URL-backed datasets are considered plannable and errors surface at runtime. Move URL resolution from planning time to runtime. Static URLs are no longer resolved by MakeTasksVisitor; instead DataUrlTask::eval() resolves both static and signal-based URLs uniformly. Replace the supports_arrow_tables proto field with a direct trait method on PlanResolver. ResolverPipeline queries resolvers directly instead of going through MergedCapabilities. Removed: - ResolverCapabilities proto message and GetCapabilities RPC - DataBaseUrlSettingProto and data_base_url from pretransform opts - MergedCapabilities struct and planner_capabilities() trait method - capabilities() from Rust and Python PlanResolver - fetch_capabilities_via_query_fn() from WASM - Scheme/format checks from DataSpec::supported() 35 files changed, +70 -593 lines. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/source/features/plan_resolver.md | 11 +- .../plan_resolver_url_scanning.py | 5 +- vegafusion-core/src/chart_state.rs | 11 +- vegafusion-core/src/planning/plan.rs | 12 -- vegafusion-core/src/proto/pretransform.proto | 13 -- vegafusion-core/src/proto/services.proto | 15 +- vegafusion-core/src/proto/tasks.proto | 6 - vegafusion-core/src/runtime/mod.rs | 2 +- vegafusion-core/src/runtime/plan_resolver.rs | 141 ------------------ vegafusion-core/src/runtime/runtime.rs | 30 ++-- vegafusion-core/src/spec/data.rs | 34 ----- vegafusion-core/src/spec/visitors.rs | 16 +- vegafusion-python/src/lib.rs | 3 - vegafusion-python/src/plan_resolver.rs | 31 +--- vegafusion-python/tests/test_plan_resolver.py | 7 +- vegafusion-python/vegafusion/plan_resolver.py | 25 +--- vegafusion-runtime/benches/spec_benchmarks.rs | 6 +- .../src/data/datafusion_resolver.rs | 14 +- vegafusion-runtime/src/data/pipeline.rs | 28 +--- vegafusion-runtime/src/data/plan_resolver.rs | 11 +- vegafusion-runtime/src/data/tasks.rs | 8 +- .../src/task_graph/grpc_runtime.rs | 21 +-- vegafusion-runtime/src/task_graph/runtime.rs | 4 - .../test_destringify_selection_datasets.rs | 1 - .../tests/test_image_comparison.rs | 2 +- .../tests/test_plan_resolver.rs | 50 ------- vegafusion-runtime/tests/test_planning.rs | 2 +- .../tests/test_pre_transform_extract.rs | 1 - .../test_pre_transform_keep_variables.rs | 3 - .../tests/test_pre_transform_values.rs | 9 -- .../tests/test_stringify_datetimes.rs | 5 - .../tests/test_task_graph_runtime.rs | 3 +- vegafusion-server/src/main.rs | 27 +--- .../tests/test_task_graph_runtime.rs | 4 +- vegafusion-wasm/src/lib.rs | 102 +------------ 35 files changed, 70 insertions(+), 593 deletions(-) diff --git a/docs/source/features/plan_resolver.md b/docs/source/features/plan_resolver.md index 868f8076e..74095b44c 100644 --- a/docs/source/features/plan_resolver.md +++ b/docs/source/features/plan_resolver.md @@ -45,15 +45,12 @@ VegaFusion calls 
`resolve_table` to get the data, then applies Vega transforms ( See [plan_resolver_basic.py](https://github.com/vega/vegafusion/tree/main/examples/python-examples/plan_resolver_basic.py) for a complete example. -### capabilities + scan_url +### scan_url -For custom URL schemes in Vega specs (e.g. `"url": "mydata://database/sales"`), override `capabilities()` and `scan_url()`: +For custom URL schemes in Vega specs (e.g. `"url": "mydata://database/sales"`), override `scan_url()`: ```python class SalesResolver(PlanResolver): - def capabilities(self): - return {"supported_schemes": ["mydata"]} - def scan_url(self, parsed_url): if parsed_url["scheme"] == "mydata": schema = pa.schema([("product", pa.utf8()), ("revenue", pa.int64())]) @@ -67,7 +64,7 @@ class SalesResolver(PlanResolver): return pa.table({"product": ["Widget", "Gadget"], "revenue": [1200, 3400]}) ``` -`capabilities()` tells the planner that `mydata://` URLs are supported. `scan_url()` creates an `ExternalTableProvider` plan node, and `resolve_table()` provides the data at execution time. +`scan_url()` creates an `ExternalTableProvider` plan node for URLs your resolver handles, and `resolve_table()` provides the data at execution time. See [plan_resolver_url_scanning.py](https://github.com/vega/vegafusion/tree/main/examples/python-examples/plan_resolver_url_scanning.py) for a complete example. @@ -89,7 +86,7 @@ Supported SQL dialects: `"default"`, `"postgres"`, `"mysql"`, `"sqlite"`, `"duck See [plan_resolver_sql.py](https://github.com/vega/vegafusion/tree/main/examples/python-examples/plan_resolver_sql.py) for a complete example. -`PlanResolver` cannot be used with `grpc_connect()` (resolvers run in-process). Set `thread_safe = False` for backends with thread-affine connections (e.g. DuckDB). Set `skip_when_no_external_tables = False` to receive all plans (e.g. for logging). The `capabilities()` dict also accepts `supports_arrow_tables: True` to let the runtime eagerly materialize plans into Arrow tables. 
+`PlanResolver` cannot be used with `grpc_connect()` (resolvers run in-process). Set `thread_safe = False` for backends with thread-affine connections (e.g. DuckDB). Set `skip_when_no_external_tables = False` to receive all plans (e.g. for logging). Set `supports_arrow_tables = True` to let the runtime eagerly materialize plans into Arrow tables. ### API Reference diff --git a/examples/python-examples/plan_resolver_url_scanning.py b/examples/python-examples/plan_resolver_url_scanning.py index fdcb37810..b2b047ac0 100644 --- a/examples/python-examples/plan_resolver_url_scanning.py +++ b/examples/python-examples/plan_resolver_url_scanning.py @@ -1,7 +1,7 @@ # Requires: pip install vegafusion[plan-resolver] """ Demonstrates the URL scanning pattern for custom URL schemes: -capabilities() + scan_url() + resolve_table() +scan_url() + resolve_table() VegaFusion's PlanResolver lets you register custom URL schemes so that data references like "mydata://database/sales" in a Vega spec are resolved @@ -42,9 +42,6 @@ def main(): class SalesDataResolver(PlanResolver): """Resolves URLs with the 'mydata' scheme using in-memory data.""" - def capabilities(self) -> dict[str, list[str]]: - return {"supported_schemes": ["mydata"]} - def scan_url(self, parsed_url: dict[str, Any]) -> Any: if parsed_url["scheme"] == "mydata": schema = pa.schema([("product", pa.utf8()), ("revenue", pa.int64())]) diff --git a/vegafusion-core/src/chart_state.rs b/vegafusion-core/src/chart_state.rs index 6c49817fc..a46473b5a 100644 --- a/vegafusion-core/src/chart_state.rs +++ b/vegafusion-core/src/chart_state.rs @@ -70,16 +70,9 @@ impl ChartState { let resolved_base = crate::runtime::resolve_data_base_url( opts.data_base_url.clone(), - PlannerConfig::default().data_base_url, - )?; - let plan = SpecPlan::try_new( - &spec, - &PlannerConfig { - capabilities: runtime.planner_capabilities(), - data_base_url: resolved_base.clone(), - ..Default::default() - }, + 
Some(crate::planning::plan::VEGA_DATASETS_CDN_BASE.to_string()), )?; + let plan = SpecPlan::try_new(&spec, &PlannerConfig::default())?; let task_scope = plan .server_spec diff --git a/vegafusion-core/src/planning/plan.rs b/vegafusion-core/src/planning/plan.rs index ebb72d060..546c931c9 100644 --- a/vegafusion-core/src/planning/plan.rs +++ b/vegafusion-core/src/planning/plan.rs @@ -12,8 +12,6 @@ use crate::planning::unsupported_data_warning::add_unsupported_data_warnings; use crate::proto::gen::pretransform::{ pre_transform_spec_warning::WarningType, PlannerWarning, PreTransformSpecWarning, }; -use crate::proto::gen::tasks::ResolverCapabilities; -use crate::runtime::MergedCapabilities; use crate::spec::chart::ChartSpec; use crate::task_graph::graph::ScopedVariable; use serde::{Deserialize, Serialize}; @@ -97,12 +95,6 @@ pub struct PlannerConfig { pub strip_description_encoding: bool, pub strip_aria_encoding: bool, pub strip_tooltip_encoding: bool, - /// Merged URL capabilities from all resolvers. Used by DataSpec::supported() - /// to decide if a URL-backed dataset is plannable. - pub capabilities: MergedCapabilities, - /// Base URL for resolving relative data URLs. None means relative paths are an error. - /// Some(url) means relative paths are resolved against this URL. 
- pub data_base_url: Option, } /// Default CDN base URL for vega-datasets @@ -126,10 +118,6 @@ impl Default for PlannerConfig { strip_description_encoding: true, strip_aria_encoding: true, strip_tooltip_encoding: false, - capabilities: MergedCapabilities::from_resolver_capabilities(&[ - ResolverCapabilities::datafusion_defaults(), - ]), - data_base_url: Some(VEGA_DATASETS_CDN_BASE.to_string()), } } } diff --git a/vegafusion-core/src/proto/pretransform.proto b/vegafusion-core/src/proto/pretransform.proto index 694bf3e53..9bdb87dd9 100644 --- a/vegafusion-core/src/proto/pretransform.proto +++ b/vegafusion-core/src/proto/pretransform.proto @@ -2,16 +2,6 @@ syntax = "proto3"; package pretransform; import "tasks.proto"; -import "google/protobuf/empty.proto"; - -// Three states: field absent = use default CDN, custom = use this URL, -// disabled = no base URL (relative paths error). -message DataBaseUrlSettingProto { - oneof kind { - string custom = 1; - google.protobuf.Empty disabled = 2; - } -} /// Pre transform spec messages message PreTransformSpecOpts { @@ -20,7 +10,6 @@ message PreTransformSpecOpts { repeated PreTransformVariable keep_variables = 3; string local_tz = 4; optional string default_input_tz = 5; - optional DataBaseUrlSettingProto data_base_url = 6; } message PreTransformSpecRequest { @@ -63,7 +52,6 @@ message PreTransformValuesOpts { optional uint32 row_limit = 1; string local_tz = 2; optional string default_input_tz = 3; - optional DataBaseUrlSettingProto data_base_url = 4; } message PreTransformValuesRequest { @@ -108,7 +96,6 @@ message PreTransformExtractOpts { bool preserve_interactivity = 3; int32 extract_threshold = 4; repeated PreTransformVariable keep_variables = 5; - optional DataBaseUrlSettingProto data_base_url = 6; } message PreTransformExtractWarning { diff --git a/vegafusion-core/src/proto/services.proto b/vegafusion-core/src/proto/services.proto index 00cfba954..c4f06981e 100644 --- a/vegafusion-core/src/proto/services.proto +++ 
b/vegafusion-core/src/proto/services.proto @@ -10,22 +10,14 @@ service VegaFusionRuntime { rpc PreTransformSpec(pretransform.PreTransformSpecRequest) returns (PreTransformSpecResult) {} rpc PreTransformValues(pretransform.PreTransformValuesRequest) returns (PreTransformValuesResult) {} rpc PreTransformExtract(pretransform.PreTransformExtractRequest) returns (PreTransformExtractResult) {} - rpc GetCapabilities(GetCapabilitiesRequest) returns (GetCapabilitiesResult) {} -} - -message GetCapabilitiesRequest {} - -message GetCapabilitiesResult { - tasks.ResolverCapabilities capabilities = 1; } // Multiplexed envelope used by the WASM runtime's query_fn callback, -// including gRPC-Web mode. Bundles task graph queries and capability -// requests into one message so a single JS function can handle both. +// including gRPC-Web mode. Bundles task graph queries into one message +// so a single JS function can handle them. message QueryRequest { oneof request { tasks.TaskGraphValueRequest task_graph_values = 1; - GetCapabilitiesRequest get_capabilities = 2; } } @@ -33,7 +25,6 @@ message QueryResult { oneof response { errors.Error error = 1; tasks.TaskGraphValueResponse task_graph_values = 2; - GetCapabilitiesResult get_capabilities = 3; } } @@ -56,4 +47,4 @@ message PreTransformExtractResult { errors.Error error = 1; pretransform.PreTransformExtractResponse response = 2; } -} \ No newline at end of file +} diff --git a/vegafusion-core/src/proto/tasks.proto b/vegafusion-core/src/proto/tasks.proto index ad806b2a9..a984ab72c 100644 --- a/vegafusion-core/src/proto/tasks.proto +++ b/vegafusion-core/src/proto/tasks.proto @@ -184,9 +184,3 @@ message InlineDataset { } } -message ResolverCapabilities { - repeated string supported_schemes = 1; - repeated string supported_format_types = 2; - repeated string supported_extensions = 3; - bool supports_arrow_tables = 4; -} \ No newline at end of file diff --git a/vegafusion-core/src/runtime/mod.rs b/vegafusion-core/src/runtime/mod.rs index 
bed6b3043..d3035df5d 100644 --- a/vegafusion-core/src/runtime/mod.rs +++ b/vegafusion-core/src/runtime/mod.rs @@ -3,6 +3,6 @@ mod runtime; pub use plan_resolver::{ has_url_scheme, is_absolute_path, normalize_base_url, path_to_file_url, resolve_data_base_url, - resolve_url, DataBaseUrlSetting, MergedCapabilities, ParsedUrl, ResolutionResult, + resolve_url, DataBaseUrlSetting, ParsedUrl, ResolutionResult, }; pub use runtime::{PreTransformExtractTable, VegaFusionRuntimeTrait}; diff --git a/vegafusion-core/src/runtime/plan_resolver.rs b/vegafusion-core/src/runtime/plan_resolver.rs index 5c9387b7e..9e7ed9820 100644 --- a/vegafusion-core/src/runtime/plan_resolver.rs +++ b/vegafusion-core/src/runtime/plan_resolver.rs @@ -1,7 +1,4 @@ -use crate::proto::gen::pretransform::DataBaseUrlSettingProto; -use crate::proto::gen::tasks::ResolverCapabilities; use regex::Regex; -use std::collections::HashSet; use std::sync::LazyLock; use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; @@ -27,22 +24,6 @@ pub enum DataBaseUrlSetting { Custom(String), } -impl DataBaseUrlSetting { - /// Convert from the proto representation. - /// Field absent → Default (use CDN), custom → Custom, disabled → Disabled. - pub fn from_proto(proto: Option) -> Self { - use crate::proto::gen::pretransform::data_base_url_setting_proto::Kind; - match proto { - None => DataBaseUrlSetting::Default, - Some(p) => match p.kind { - Some(Kind::Custom(s)) => DataBaseUrlSetting::Custom(s), - Some(Kind::Disabled(_)) => DataBaseUrlSetting::Disabled, - None => DataBaseUrlSetting::Default, - }, - } - } -} - /// Parsed URL representation passed to resolvers during the scan phase. /// All fields are populated from the fully-resolved URL (after base URL /// resolution and hash-stripping). 
Resolvers pattern-match on these fields @@ -66,77 +47,6 @@ pub struct ParsedUrl { pub parse: Option, } -/// Merged capabilities from all resolvers, built by unioning the -/// ResolverCapabilities from each resolver in the pipeline. -#[derive(Clone, Debug, Default)] -pub struct MergedCapabilities { - pub supported_schemes: HashSet, - pub supported_format_types: HashSet, - pub supported_extensions: HashSet, - /// True when every resolver in the pipeline can efficiently consume - /// in-memory Arrow tables. When true, the runtime may eagerly materialize - /// LogicalPlans into tables. When false, data is kept as lazy plans so - /// resolvers that need plan-level access can intercept them. - pub all_support_arrow_tables: bool, -} - -impl MergedCapabilities { - pub fn from_resolver_capabilities(caps: &[ResolverCapabilities]) -> Self { - let mut merged = Self::default(); - for cap in caps { - merged - .supported_schemes - .extend(cap.supported_schemes.iter().cloned()); - merged - .supported_format_types - .extend(cap.supported_format_types.iter().cloned()); - merged - .supported_extensions - .extend(cap.supported_extensions.iter().cloned()); - } - merged.all_support_arrow_tables = caps.iter().all(|c| c.supports_arrow_tables); - merged - } - - /// Check if a URL with the given scheme and format info is supported by any resolver. - pub fn url_supported( - &self, - scheme: &str, - format_type: Option<&str>, - extension: Option<&str>, - ) -> bool { - let scheme_ok = self.supported_schemes.contains(scheme); - let format_ok = match (format_type, extension) { - (Some(fmt), _) => self.supported_format_types.contains(fmt), - (None, Some(ext)) => self.supported_extensions.contains(ext), - (None, None) => true, - }; - scheme_ok && format_ok - } -} - -impl ResolverCapabilities { - /// Built-in DataFusion URL capabilities: file, http, https, s3 schemes - /// with csv, tsv, json, arrow, parquet formats. 
- pub fn datafusion_defaults() -> Self { - Self { - supported_schemes: vec!["http", "https", "s3", "file"] - .into_iter() - .map(String::from) - .collect(), - supported_format_types: vec!["csv", "tsv", "json", "arrow", "parquet"] - .into_iter() - .map(String::from) - .collect(), - supported_extensions: vec!["csv", "tsv", "json", "arrow", "feather", "parquet"] - .into_iter() - .map(String::from) - .collect(), - supports_arrow_tables: true, - } - } -} - /// Map a DataBaseUrlSetting (from public API) to the two-state Option /// used by PlannerConfig. Custom base URLs are normalized (bare absolute paths /// become file:// URLs). @@ -490,55 +400,4 @@ mod tests { .unwrap(); assert_eq!(result, Some("file:///home/user/data".to_string())); } - - // ── MergedCapabilities ── - - #[test] - fn test_merged_capabilities_from_defaults() { - let caps = MergedCapabilities::from_resolver_capabilities(&[ - ResolverCapabilities::datafusion_defaults(), - ]); - assert!(caps.supported_schemes.contains("http")); - assert!(caps.supported_schemes.contains("file")); - assert!(caps.supported_format_types.contains("csv")); - assert!(caps.supported_extensions.contains("parquet")); - } - - #[test] - fn test_merged_capabilities_union() { - let df_caps = ResolverCapabilities::datafusion_defaults(); - let custom_caps = ResolverCapabilities { - supported_schemes: vec!["spark".to_string()], - supported_format_types: vec!["delta".to_string()], - supported_extensions: vec![], - supports_arrow_tables: false, - }; - let merged = MergedCapabilities::from_resolver_capabilities(&[df_caps, custom_caps]); - assert!(merged.supported_schemes.contains("http")); - assert!(merged.supported_schemes.contains("spark")); - assert!(merged.supported_format_types.contains("csv")); - assert!(merged.supported_format_types.contains("delta")); - // DataFusion supports arrow but the custom resolver does not - assert!(!merged.all_support_arrow_tables); - } - - #[test] - fn test_merged_capabilities_all_support_arrow() { - let 
caps = MergedCapabilities::from_resolver_capabilities(&[ - ResolverCapabilities::datafusion_defaults(), - ]); - assert!(caps.all_support_arrow_tables); - } - - #[test] - fn test_url_supported_scheme_and_format() { - let caps = MergedCapabilities::from_resolver_capabilities(&[ - ResolverCapabilities::datafusion_defaults(), - ]); - assert!(caps.url_supported("https", Some("csv"), None)); - assert!(caps.url_supported("file", None, Some("parquet"))); - assert!(caps.url_supported("http", None, None)); // no format = ok - assert!(!caps.url_supported("spark", Some("csv"), None)); // unknown scheme - assert!(!caps.url_supported("https", Some("delta"), None)); // unknown format - } } diff --git a/vegafusion-core/src/runtime/runtime.rs b/vegafusion-core/src/runtime/runtime.rs index 11196513f..f941b36b5 100644 --- a/vegafusion-core/src/runtime/runtime.rs +++ b/vegafusion-core/src/runtime/runtime.rs @@ -39,14 +39,6 @@ pub struct PreTransformExtractTable { pub trait VegaFusionRuntimeTrait: Send + Sync { fn as_any(&self) -> &dyn Any; - /// Return merged URL capabilities for planning. Default returns DataFusion's built-in - /// capabilities. Runtimes with custom resolvers override this to include their capabilities. 
- fn planner_capabilities(&self) -> crate::runtime::MergedCapabilities { - crate::runtime::MergedCapabilities::from_resolver_capabilities(&[ - crate::proto::gen::tasks::ResolverCapabilities::datafusion_defaults(), - ]) - } - async fn query_request( &self, task_graph: Arc, @@ -108,17 +100,15 @@ pub trait VegaFusionRuntimeTrait: Send + Sync { keep_variables: Vec, data_base_url: DataBaseUrlSetting, ) -> Result<(SpecPlan, Vec)> { - let resolved_base = - resolve_data_base_url(data_base_url, PlannerConfig::default().data_base_url)?; + let resolved_base = resolve_data_base_url( + data_base_url, + Some(crate::planning::plan::VEGA_DATASETS_CDN_BASE.to_string()), + )?; // Create spec plan let plan = SpecPlan::try_new( spec, - &PlannerConfig { - capabilities: self.planner_capabilities(), - data_base_url: resolved_base.clone(), - ..PlannerConfig::pre_transformed_spec_config(preserve_interactivity, keep_variables) - }, + &PlannerConfig::pre_transformed_spec_config(preserve_interactivity, keep_variables), )?; // Extract inline dataset fingerprints @@ -188,7 +178,7 @@ pub trait VegaFusionRuntimeTrait: Send + Sync { options.preserve_interactivity, inline_datasets, keep_variables, - DataBaseUrlSetting::from_proto(options.data_base_url.clone()), + DataBaseUrlSetting::Default, ) .await?; @@ -223,7 +213,7 @@ pub trait VegaFusionRuntimeTrait: Send + Sync { options.preserve_interactivity, inline_datasets, keep_variables, - DataBaseUrlSetting::from_proto(options.data_base_url.clone()), + DataBaseUrlSetting::Default, ) .await?; let init_arrow = self.materialize_export_updates(init).await?; @@ -349,8 +339,8 @@ pub trait VegaFusionRuntimeTrait: Send + Sync { let keep_variables = Vec::from(variables); let resolved_base = resolve_data_base_url( - DataBaseUrlSetting::from_proto(options.data_base_url.clone()), - PlannerConfig::default().data_base_url, + DataBaseUrlSetting::Default, + Some(crate::planning::plan::VEGA_DATASETS_CDN_BASE.to_string()), )?; // Create spec plan @@ -363,8 +353,6 @@ pub 
trait VegaFusionRuntimeTrait: Send + Sync { projection_pushdown: false, allow_client_to_server_comms: true, keep_variables, - capabilities: self.planner_capabilities(), - data_base_url: resolved_base.clone(), ..Default::default() }, )?; diff --git a/vegafusion-core/src/spec/data.rs b/vegafusion-core/src/spec/data.rs index fd7e2c05f..3c51b8ca3 100644 --- a/vegafusion-core/src/spec/data.rs +++ b/vegafusion-core/src/spec/data.rs @@ -55,40 +55,6 @@ impl DataSpec { task_scope: &TaskScope, scope: &[u32], ) -> DependencyNodeSupported { - // Check if the URL format is supported by any resolver's capabilities - if let Some(Some(format_type)) = self.format.as_ref().map(|fmt| fmt.type_.clone()) { - if !planner_config - .capabilities - .supported_format_types - .contains(&format_type) - { - // No resolver knows how to read this format, so full node is unsupported - return DependencyNodeSupported::Unsupported; - } - } - - // For static URLs that already have a scheme, check the scheme is supported - // by some resolver. Skip absolute paths and relative URLs — those are resolved - // later by resolve_url (absolute paths become file://, relatives use base URL). - // Internal dataset URLs (table://, vegafusion+dataset://) are always supported. 
- if let Some(StringOrSignalSpec::String(url_str)) = &self.url { - if crate::runtime::has_url_scheme(url_str) - && !url_str.starts_with("table://") - && !url_str.starts_with("vegafusion+dataset://") - { - if let Ok(parsed) = url::Url::parse(url_str) { - let scheme = parsed.scheme(); - if !planner_config - .capabilities - .supported_schemes - .contains(scheme) - { - return DependencyNodeSupported::Unsupported; - } - } - } - } - // Check if inline values array is supported if let Some(values) = &self.values { if !planner_config.extract_inline_data { diff --git a/vegafusion-core/src/spec/visitors.rs b/vegafusion-core/src/spec/visitors.rs index 814166088..a407eb2b2 100644 --- a/vegafusion-core/src/spec/visitors.rs +++ b/vegafusion-core/src/spec/visitors.rs @@ -5,7 +5,6 @@ use crate::proto::gen::tasks::{ ScanUrlFormat, Task, TzConfig, Variable, VariableNamespace, }; use crate::proto::gen::transforms::TransformPipeline; -use crate::runtime::resolve_url; use crate::spec::chart::{ChartSpec, ChartVisitor}; use crate::spec::data::{DataFormatParseSpec, DataSpec}; use crate::spec::mark::{MarkFacetSpec, MarkSpec}; @@ -172,11 +171,10 @@ impl ChartVisitor for MakeTasksVisitor<'_> { }; let task = if let Some(url) = &data.url { - let (proto_url, task_data_base_url) = match url { + let proto_url = match url { StringOrSignalSpec::String(url) => { - // Resolve URL at plan time (base URL, file:// normalization) - let resolved = resolve_url(url, &self.data_base_url)?; - let mut proto_url = Url::String(resolved); + // Store raw URL string — resolution happens at eval time + let mut proto_url = Url::String(url.clone()); // Append fingerprint to URL that references an inline dataset if let Url::String(url_str) = &proto_url { @@ -190,13 +188,11 @@ impl ChartVisitor for MakeTasksVisitor<'_> { } } } - (proto_url, None) + proto_url } StringOrSignalSpec::Signal(expr) => { - // Signal-based URL: resolved at eval time. - // Store data_base_url in the task so the remote server has it. 
let url_expr = parse(&expr.signal)?; - (Url::Expr(url_expr), self.data_base_url.clone()) + Url::Expr(url_expr) } }; @@ -208,7 +204,7 @@ impl ChartVisitor for MakeTasksVisitor<'_> { format_type, pipeline, url: Some(proto_url), - data_base_url: task_data_base_url, + data_base_url: self.data_base_url.clone(), }, &self.tz_config, ) diff --git a/vegafusion-python/src/lib.rs b/vegafusion-python/src/lib.rs index fac819e26..6fc1d62a1 100644 --- a/vegafusion-python/src/lib.rs +++ b/vegafusion-python/src/lib.rs @@ -238,7 +238,6 @@ impl PyVegaFusionRuntime { default_input_tz, row_limit, preserve_interactivity, - data_base_url: None, keep_variables: keep_variables .into_iter() .map(|v| PreTransformVariable { @@ -298,7 +297,6 @@ impl PyVegaFusionRuntime { local_tz, default_input_tz, row_limit, - data_base_url: None, }, )) })?; @@ -381,7 +379,6 @@ impl PyVegaFusionRuntime { default_input_tz, preserve_interactivity, extract_threshold: extract_threshold as i32, - data_base_url: None, keep_variables, }, )) diff --git a/vegafusion-python/src/plan_resolver.rs b/vegafusion-python/src/plan_resolver.rs index 225064c70..3d7cb9fa4 100644 --- a/vegafusion-python/src/plan_resolver.rs +++ b/vegafusion-python/src/plan_resolver.rs @@ -15,7 +15,6 @@ use vegafusion_common::arrow::record_batch::RecordBatch; use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::{Result, VegaFusionError}; -use vegafusion_core::proto::gen::tasks::ResolverCapabilities; use vegafusion_core::runtime::{ParsedUrl, ResolutionResult}; use vegafusion_runtime::data::codec::VegaFusionCodec; use vegafusion_runtime::data::external_table::ExternalTableProvider; @@ -187,32 +186,12 @@ impl PlanResolver for PyPlanResolver { &self.name } - fn capabilities(&self) -> ResolverCapabilities { + fn supports_arrow_tables(&self) -> bool { Python::attach(|py| { - let result: PyResult = (|| { - let dict = self.py_resolver.call_method0(py, "capabilities")?; - let 
dict_ref = dict.bind(py); - - let extract_list = |key: &str| -> PyResult> { - match dict_ref.get_item(key) { - Ok(val) => val.extract(), - Err(_) => Ok(Vec::new()), - } - }; - - let supports_arrow_tables = match dict_ref.get_item("supports_arrow_tables") { - Ok(val) => val.extract().unwrap_or(false), - Err(_) => false, - }; - - Ok(ResolverCapabilities { - supported_schemes: extract_list("supported_schemes")?, - supported_format_types: extract_list("supported_format_types")?, - supported_extensions: extract_list("supported_extensions")?, - supports_arrow_tables, - }) - })(); - result.unwrap_or_default() + self.py_resolver + .getattr(py, "supports_arrow_tables") + .and_then(|v| v.extract::(py)) + .unwrap_or(false) }) } diff --git a/vegafusion-python/tests/test_plan_resolver.py b/vegafusion-python/tests/test_plan_resolver.py index 04ef5d19e..5716efe2b 100644 --- a/vegafusion-python/tests/test_plan_resolver.py +++ b/vegafusion-python/tests/test_plan_resolver.py @@ -791,14 +791,11 @@ def scan_url(self, parsed_url: dict[str, Any]) -> Any: assert datasets[0].num_rows == 3 -def test_capabilities_extends_planner_support() -> None: - """capabilities() dict declaring custom scheme lets planner accept it.""" +def test_custom_scheme_via_scan_url() -> None: + """Custom scheme URLs are handled via scan_url at runtime.""" from vegafusion.plan_resolver import external_table_scan_node class CustomSchemeResolver(PlanResolver): - def capabilities(self) -> dict[str, list[str]]: - return {"supported_schemes": ["myproto"]} - def scan_url(self, parsed_url: dict[str, Any]) -> Any: if parsed_url["scheme"] == "myproto": schema = pa.schema([("val", pa.int64())]) diff --git a/vegafusion-python/vegafusion/plan_resolver.py b/vegafusion-python/vegafusion/plan_resolver.py index 33e23c475..70614fdd1 100644 --- a/vegafusion-python/vegafusion/plan_resolver.py +++ b/vegafusion-python/vegafusion/plan_resolver.py @@ -80,26 +80,11 @@ class PlanResolver: callbacks run on the main thread. 
Set to False for backends with thread-affine connections (e.g. DuckDB in-memory databases).""" - def capabilities(self) -> dict[str, Any]: - """Declare URL patterns this resolver supports at planning time. - - Override to advertise additional URL scheme/format support beyond - DataFusion's built-in capabilities (http, https, file, s3 schemes - with csv, tsv, json, arrow, parquet formats). - - Returns: - Dict with optional keys: - - - ``'supported_schemes'``: list of URL schemes - (e.g. ``["spark", "snowflake"]``) - - ``'supported_format_types'``: list of format types - (e.g. ``["csv", "parquet"]``) - - ``'supported_extensions'``: list of file extensions - (e.g. ``[".csv", ".parquet"]``) - - ``'supports_arrow_tables'``: bool (default ``False``). When ``True``, - the runtime eagerly materializes plans into Arrow tables. - """ - return {} + supports_arrow_tables: bool = False + """Whether this resolver can efficiently consume in-memory Arrow tables. + When all resolvers in the pipeline return True, the runtime may eagerly + materialize LogicalPlans into tables. When False, data is kept as lazy + plans so resolvers that need plan-level access can intercept them.""" def scan_url_proto(self, parsed_url: dict[str, Any]) -> bytes | None: """Handle a URL during the scan phase (raw bytes variant). 
diff --git a/vegafusion-runtime/benches/spec_benchmarks.rs b/vegafusion-runtime/benches/spec_benchmarks.rs index 7d3484b5a..2e8918bcd 100644 --- a/vegafusion-runtime/benches/spec_benchmarks.rs +++ b/vegafusion-runtime/benches/spec_benchmarks.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use std::fs; use std::sync::Arc; -use vegafusion_core::planning::plan::{PlannerConfig, SpecPlan}; +use vegafusion_core::planning::plan::SpecPlan; use vegafusion_core::planning::watch::ExportUpdateBatch; use vegafusion_core::proto::gen::services::query_request::Request; use vegafusion_core::proto::gen::services::QueryRequest; @@ -49,7 +49,7 @@ async fn eval_spec_get_variable(full_spec: ChartSpec, var: &ScopedVariable) -> V .to_tasks( &tz_config, &Default::default(), - PlannerConfig::default().data_base_url, + Some(vegafusion_core::planning::plan::VEGA_DATASETS_CDN_BASE.to_string()), ) .unwrap(); let task_graph = TaskGraph::new(tasks, &task_scope).unwrap(); @@ -104,7 +104,7 @@ async fn eval_spec_sequence(full_spec: ChartSpec, full_updates: Vec ResolverCapabilities { - ResolverCapabilities::datafusion_defaults() + fn supports_arrow_tables(&self) -> bool { + true } async fn scan_url(&self, parsed_url: &ParsedUrl) -> Result> { - // Only handle schemes declared in our capabilities - if !self - .capabilities() - .supported_schemes - .contains(&parsed_url.scheme) - { + // Only handle schemes that DataFusion supports natively + const SUPPORTED_SCHEMES: &[&str] = &["http", "https", "file", "s3"]; + if !SUPPORTED_SCHEMES.contains(&parsed_url.scheme.as_str()) { return Ok(None); } diff --git a/vegafusion-runtime/src/data/pipeline.rs b/vegafusion-runtime/src/data/pipeline.rs index f6f0a535c..c9630c0bd 100644 --- a/vegafusion-runtime/src/data/pipeline.rs +++ b/vegafusion-runtime/src/data/pipeline.rs @@ -7,8 +7,7 @@ use datafusion_expr::LogicalPlan as DFLogicalPlan; use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; use 
vegafusion_common::error::{Result, VegaFusionError}; -use vegafusion_core::proto::gen::tasks::ResolverCapabilities; -use vegafusion_core::runtime::{MergedCapabilities, ParsedUrl, ResolutionResult}; +use vegafusion_core::runtime::{ParsedUrl, ResolutionResult}; use super::datafusion_resolver::DataFusionResolver; use super::external_table::ExternalTableProvider; @@ -49,7 +48,7 @@ impl ResolverPipeline { /// Keeps the plan lazy otherwise, so resolvers that need plan-level /// access (e.g. a Spark connector) can intercept external tables. pub fn should_materialize(&self, plan: &LogicalPlan) -> bool { - if self.merged_capabilities().all_support_arrow_tables { + if self.resolvers.iter().all(|r| r.supports_arrow_tables()) { return true; } !has_external_table_scans(plan) @@ -70,29 +69,6 @@ impl ResolverPipeline { Ok(None) } - /// Merge capabilities from all resolvers into a single set for planner lookups. - pub fn merged_capabilities(&self) -> MergedCapabilities { - MergedCapabilities::from_resolver_capabilities( - &self - .resolvers - .iter() - .map(|r| r.capabilities()) - .collect::>(), - ) - } - - /// Return a single merged `ResolverCapabilities` proto (union of all resolvers). - /// Useful for serializing capabilities over gRPC/WASM. - pub fn merged_resolver_capabilities(&self) -> ResolverCapabilities { - let merged = self.merged_capabilities(); - ResolverCapabilities { - supported_schemes: merged.supported_schemes.into_iter().collect(), - supported_format_types: merged.supported_format_types.into_iter().collect(), - supported_extensions: merged.supported_extensions.into_iter().collect(), - supports_arrow_tables: merged.all_support_arrow_tables, - } - } - /// Resolve a `LogicalPlan` to a `VegaFusionTable`. 
/// /// Iterates through all resolvers; if any returns `Table`, that result diff --git a/vegafusion-runtime/src/data/plan_resolver.rs b/vegafusion-runtime/src/data/plan_resolver.rs index 4814b8d07..318b1a669 100644 --- a/vegafusion-runtime/src/data/plan_resolver.rs +++ b/vegafusion-runtime/src/data/plan_resolver.rs @@ -10,7 +10,6 @@ use vegafusion_common::arrow::datatypes::SchemaRef; use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::{Result, VegaFusionError}; -use vegafusion_core::proto::gen::tasks::ResolverCapabilities; use vegafusion_core::runtime::{ParsedUrl, ResolutionResult}; use super::external_table::ExternalTableProvider; @@ -37,10 +36,12 @@ pub trait PlanResolver: Send + Sync + 'static { /// Human-readable name for logging and error messages. fn name(&self) -> &str; - /// Declare what URL patterns this resolver supports at planning time. - /// Returns empty capabilities by default (no additional URL support). - fn capabilities(&self) -> ResolverCapabilities { - ResolverCapabilities::default() + /// Whether this resolver can efficiently consume in-memory Arrow tables. + /// When all resolvers in the pipeline return true, the runtime may eagerly + /// materialize LogicalPlans into tables. When false, data is kept as lazy + /// plans so resolvers that need plan-level access can intercept them. + fn supports_arrow_tables(&self) -> bool { + false } /// Given a parsed URL, optionally return a LogicalPlan to handle it. 
diff --git a/vegafusion-runtime/src/data/tasks.rs b/vegafusion-runtime/src/data/tasks.rs index fd0a85c18..6c3aadea3 100644 --- a/vegafusion-runtime/src/data/tasks.rs +++ b/vegafusion-runtime/src/data/tasks.rs @@ -133,14 +133,10 @@ impl TaskCall for DataUrlTask { let config = build_compilation_config(&self.input_vars(), values, tz_config, pipeline.clone()); - // Build url string + // Build url string — resolve at eval time for both static and signal URLs let url = match self.url.as_ref().unwrap() { - Url::String(url) => { - // Already resolved at plan time by MakeTasksVisitor - url.clone() - } + Url::String(url) => vegafusion_core::runtime::resolve_url(url, &self.data_base_url)?, Url::Expr(expr) => { - // Signal-based URL — resolve at eval time let compiled = compile(expr, &config, None).await?; let url_scalar = compiled.eval_to_scalar()?; let raw_url = url_scalar.to_scalar_string()?; diff --git a/vegafusion-runtime/src/task_graph/grpc_runtime.rs b/vegafusion-runtime/src/task_graph/grpc_runtime.rs index 9cdaaa49f..43360a7be 100644 --- a/vegafusion-runtime/src/task_graph/grpc_runtime.rs +++ b/vegafusion-runtime/src/task_graph/grpc_runtime.rs @@ -3,11 +3,11 @@ use vegafusion_core::{ proto::gen::{ services::{ query_request, query_result, vega_fusion_runtime_client::VegaFusionRuntimeClient, - GetCapabilitiesRequest, QueryRequest, + QueryRequest, }, tasks::{NodeValueIndex, TaskGraph, TaskGraphValueRequest}, }, - runtime::{MergedCapabilities, VegaFusionRuntimeTrait}, + runtime::VegaFusionRuntimeTrait, task_graph::task_value::NamedTaskValue, }; @@ -21,7 +21,6 @@ use vegafusion_common::error::{Result, VegaFusionError}; #[derive(Clone)] pub struct GrpcVegaFusionRuntime { client: Arc>>, - capabilities: MergedCapabilities, } #[async_trait] @@ -30,10 +29,6 @@ impl VegaFusionRuntimeTrait for GrpcVegaFusionRuntime { self } - fn planner_capabilities(&self) -> MergedCapabilities { - self.capabilities.clone() - } - async fn query_request( &self, task_graph: Arc, @@ -71,19 +66,9 @@ 
impl VegaFusionRuntimeTrait for GrpcVegaFusionRuntime { impl GrpcVegaFusionRuntime { pub async fn try_new(channel: tonic::transport::Channel) -> Result { - let mut client = VegaFusionRuntimeClient::new(channel); - - // Fetch capabilities from the server at construction time - let caps_response = client - .get_capabilities(GetCapabilitiesRequest {}) - .await - .map_err(|e| VegaFusionError::internal(format!("Failed to get capabilities: {e}")))?; - let caps = caps_response.into_inner().capabilities.unwrap_or_default(); - let capabilities = MergedCapabilities::from_resolver_capabilities(&[caps]); - + let client = VegaFusionRuntimeClient::new(channel); Ok(Self { client: Arc::new(Mutex::new(client)), - capabilities, }) } } diff --git a/vegafusion-runtime/src/task_graph/runtime.rs b/vegafusion-runtime/src/task_graph/runtime.rs index 99e264b93..0c1aff015 100644 --- a/vegafusion-runtime/src/task_graph/runtime.rs +++ b/vegafusion-runtime/src/task_graph/runtime.rs @@ -94,10 +94,6 @@ impl VegaFusionRuntimeTrait for VegaFusionRuntime { self } - fn planner_capabilities(&self) -> vegafusion_core::runtime::MergedCapabilities { - self.pipeline.merged_capabilities() - } - async fn materialize_task_values( &self, values: Vec, diff --git a/vegafusion-runtime/tests/test_destringify_selection_datasets.rs b/vegafusion-runtime/tests/test_destringify_selection_datasets.rs index 3aa8a2968..edc5ffba5 100644 --- a/vegafusion-runtime/tests/test_destringify_selection_datasets.rs +++ b/vegafusion-runtime/tests/test_destringify_selection_datasets.rs @@ -36,7 +36,6 @@ mod tests { local_tz: "UTC".to_string(), default_input_tz: None, preserve_interactivity: true, - data_base_url: None, }, ) .await diff --git a/vegafusion-runtime/tests/test_image_comparison.rs b/vegafusion-runtime/tests/test_image_comparison.rs index da5f3a71d..368d26996 100644 --- a/vegafusion-runtime/tests/test_image_comparison.rs +++ b/vegafusion-runtime/tests/test_image_comparison.rs @@ -1445,7 +1445,7 @@ async fn 
check_spec_sequence( .to_tasks( &tz_config, &Default::default(), - PlannerConfig::default().data_base_url, + Some(vegafusion_core::planning::plan::VEGA_DATASETS_CDN_BASE.to_string()), ) .unwrap(); let mut task_graph = TaskGraph::new(tasks, &task_scope).unwrap(); diff --git a/vegafusion-runtime/tests/test_plan_resolver.rs b/vegafusion-runtime/tests/test_plan_resolver.rs index 5b3b2936f..17df64b84 100644 --- a/vegafusion-runtime/tests/test_plan_resolver.rs +++ b/vegafusion-runtime/tests/test_plan_resolver.rs @@ -14,7 +14,6 @@ use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::{Result, VegaFusionError}; use vegafusion_core::data::dataset::VegaFusionDataset; use vegafusion_core::proto::gen::pretransform::PreTransformSpecOpts; -use vegafusion_core::proto::gen::tasks::ResolverCapabilities; use vegafusion_core::runtime::{ParsedUrl, ResolutionResult, VegaFusionRuntimeTrait}; use vegafusion_core::spec::chart::ChartSpec; use vegafusion_runtime::data::external_table::ExternalTableProvider; @@ -214,7 +213,6 @@ async fn test_custom_executor_called_in_pre_transform_spec() { default_input_tz: None, row_limit: None, keep_variables: vec![], - data_base_url: None, }, ) .await @@ -254,7 +252,6 @@ async fn test_custom_executor_called_in_pre_transform_extract() { default_input_tz: None, extract_threshold: 100, keep_variables: vec![], - data_base_url: None, }, ) .await @@ -301,7 +298,6 @@ async fn test_custom_executor_called_in_pre_transform_values() { local_tz: "UTC".to_string(), default_input_tz: None, row_limit: None, - data_base_url: None, }, ) .await @@ -416,7 +412,6 @@ async fn test_bin_transform_uses_custom_executor() { default_input_tz: None, row_limit: None, keep_variables: vec![], - data_base_url: None, }, ) .await @@ -511,7 +506,6 @@ async fn test_mixed_data_only_executes_plans() { default_input_tz: None, row_limit: None, keep_variables: vec![], - data_base_url: None, }, ) .await @@ -816,15 +810,6 @@ impl PlanResolver for CustomSchemeScanner { 
"custom_scheme_scanner" } - fn capabilities(&self) -> ResolverCapabilities { - ResolverCapabilities { - supported_schemes: vec!["custom".to_string()], - supported_format_types: vec![], - supported_extensions: vec![], - supports_arrow_tables: false, - } - } - async fn scan_url(&self, parsed_url: &ParsedUrl) -> Result> { if parsed_url.scheme == "custom" { let provider = Arc::new(ExternalTableProvider::new( @@ -945,38 +930,6 @@ async fn test_should_materialize() { assert!(!pipeline.should_materialize(&external_plan)); } -#[tokio::test] -async fn test_merged_capabilities_includes_custom_resolver() { - let schema = get_movies_schema(); - let scanner = CustomSchemeScanner { schema }; - - let ctx = Arc::new(datafusion::prelude::SessionContext::new()); - let pipeline = ResolverPipeline::new(vec![Arc::new(scanner)], ctx); - - let caps = pipeline.merged_capabilities(); - // DataFusion built-ins - assert!(caps.supported_schemes.contains("http")); - assert!(caps.supported_schemes.contains("file")); - assert!(caps.supported_format_types.contains("csv")); - // Custom resolver additions - assert!(caps.supported_schemes.contains("custom")); - // url_supported checks - assert!(caps.url_supported("custom", None, None)); - assert!(caps.url_supported("https", Some("csv"), None)); - assert!(!caps.url_supported("spark", None, None)); -} - -#[tokio::test] -async fn test_planner_capabilities_from_runtime() { - let schema = get_movies_schema(); - let scanner = CustomSchemeScanner { schema }; - - let runtime = VegaFusionRuntime::new(None, vec![Arc::new(scanner)]); - let caps = runtime.planner_capabilities(); - assert!(caps.supported_schemes.contains("custom")); - assert!(caps.supported_schemes.contains("http")); -} - /// Test a resolver that returns ResolutionResult::Table directly (bypassing DataFusion execution). 
#[tokio::test] async fn test_table_returning_resolver() { @@ -1032,7 +985,6 @@ async fn test_table_returning_resolver() { default_input_tz: None, row_limit: None, keep_variables: vec![], - data_base_url: None, }, ) .await @@ -1100,7 +1052,6 @@ async fn test_no_resolver() { default_input_tz: None, row_limit: None, keep_variables: vec![], - data_base_url: None, }, ) .await @@ -1372,7 +1323,6 @@ async fn test_resolver_error_propagation() { default_input_tz: None, row_limit: None, keep_variables: vec![], - data_base_url: None, }, ) .await; diff --git a/vegafusion-runtime/tests/test_planning.rs b/vegafusion-runtime/tests/test_planning.rs index f4484bae8..ffd0fdd3c 100644 --- a/vegafusion-runtime/tests/test_planning.rs +++ b/vegafusion-runtime/tests/test_planning.rs @@ -58,7 +58,7 @@ async fn test_extract_server_data() { .to_tasks( &tz_config, &Default::default(), - PlannerConfig::default().data_base_url, + Some(vegafusion_core::planning::plan::VEGA_DATASETS_CDN_BASE.to_string()), ) .unwrap(); let graph = Arc::new(TaskGraph::new(tasks, &task_scope).unwrap()); diff --git a/vegafusion-runtime/tests/test_pre_transform_extract.rs b/vegafusion-runtime/tests/test_pre_transform_extract.rs index 3752df166..aa5cb3d1e 100644 --- a/vegafusion-runtime/tests/test_pre_transform_extract.rs +++ b/vegafusion-runtime/tests/test_pre_transform_extract.rs @@ -34,7 +34,6 @@ mod tests { &Default::default(), &PreTransformExtractOpts { keep_variables: vec![], - data_base_url: None, extract_threshold: 20, preserve_interactivity: false, local_tz: "UTC".to_string(), diff --git a/vegafusion-runtime/tests/test_pre_transform_keep_variables.rs b/vegafusion-runtime/tests/test_pre_transform_keep_variables.rs index a04e555d8..dfe2797dc 100644 --- a/vegafusion-runtime/tests/test_pre_transform_keep_variables.rs +++ b/vegafusion-runtime/tests/test_pre_transform_keep_variables.rs @@ -38,7 +38,6 @@ mod tests { local_tz: "UTC".to_string(), default_input_tz: None, preserve_interactivity: true, - data_base_url: 
None, }, ) .await @@ -66,7 +65,6 @@ mod tests { local_tz: "UTC".to_string(), default_input_tz: None, preserve_interactivity: true, - data_base_url: None, }, ) .await @@ -95,7 +93,6 @@ mod tests { local_tz: "UTC".to_string(), default_input_tz: None, preserve_interactivity: true, - data_base_url: None, }, ) .await; diff --git a/vegafusion-runtime/tests/test_pre_transform_values.rs b/vegafusion-runtime/tests/test_pre_transform_values.rs index 1a9e0f665..9e1c14c73 100644 --- a/vegafusion-runtime/tests/test_pre_transform_values.rs +++ b/vegafusion-runtime/tests/test_pre_transform_values.rs @@ -50,7 +50,6 @@ mod tests { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, - data_base_url: None, }, ) .await @@ -100,7 +99,6 @@ mod tests { row_limit: Some(3), local_tz: "UTC".to_string(), default_input_tz: None, - data_base_url: None, }, ) .await @@ -151,7 +149,6 @@ mod tests { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, - data_base_url: None, }, ) .await; @@ -176,7 +173,6 @@ mod tests { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, - data_base_url: None, }, ) .await; @@ -220,7 +216,6 @@ mod tests { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, - data_base_url: None, }, ) .await @@ -267,7 +262,6 @@ mod tests { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, - data_base_url: None, }, ) .await @@ -318,7 +312,6 @@ mod tests { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, - data_base_url: None, }, ) .await @@ -400,7 +393,6 @@ mod tests { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, - data_base_url: None, }, ) .await @@ -460,7 +452,6 @@ mod tests { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, - data_base_url: None, }, ) .await diff --git a/vegafusion-runtime/tests/test_stringify_datetimes.rs b/vegafusion-runtime/tests/test_stringify_datetimes.rs index 3368e77d3..5fe197ddc 100644 --- 
a/vegafusion-runtime/tests/test_stringify_datetimes.rs +++ b/vegafusion-runtime/tests/test_stringify_datetimes.rs @@ -97,7 +97,6 @@ mod test_stringify_datetimes { local_tz: local_tz.to_string(), default_input_tz: Some(default_input_tz.to_string()), keep_variables: vec![], - data_base_url: None, row_limit: None, preserve_interactivity: true, }, @@ -153,7 +152,6 @@ mod test_stringify_datetimes { local_tz: local_tz.to_string(), default_input_tz: Some(default_input_tz.to_string()), keep_variables: vec![], - data_base_url: None, row_limit: None, preserve_interactivity: true, }, @@ -237,7 +235,6 @@ mod test_stringify_datetimes { local_tz: local_tz.to_string(), default_input_tz: Some(default_input_tz.to_string()), keep_variables: vec![], - data_base_url: None, row_limit: None, preserve_interactivity: true, }, @@ -303,7 +300,6 @@ mod test_stringify_datetimes { local_tz: local_tz.to_string(), default_input_tz: Some(default_input_tz.to_string()), keep_variables: vec![], - data_base_url: None, row_limit: None, preserve_interactivity: true, }, @@ -349,7 +345,6 @@ mod test_stringify_datetimes { local_tz: "UTC".to_string(), default_input_tz: None, keep_variables: vec![], - data_base_url: None, row_limit: None, preserve_interactivity: true, }, diff --git a/vegafusion-runtime/tests/test_task_graph_runtime.rs b/vegafusion-runtime/tests/test_task_graph_runtime.rs index b9e133101..a58e7d8fc 100644 --- a/vegafusion-runtime/tests/test_task_graph_runtime.rs +++ b/vegafusion-runtime/tests/test_task_graph_runtime.rs @@ -1,7 +1,6 @@ use std::sync::Arc; use vegafusion_common::data::scalar::ScalarValue; use vegafusion_core::expression::parser::parse; -use vegafusion_core::planning::plan::PlannerConfig; use vegafusion_core::proto::gen::tasks::data_url_task::Url; use vegafusion_core::proto::gen::tasks::{ DataSourceTask, DataUrlTask, NodeValueIndex, Task, TaskGraph, TzConfig, Variable, @@ -137,7 +136,7 @@ async fn try_it_from_spec() { .to_tasks( &tz_config, &Default::default(), - 
PlannerConfig::default().data_base_url, + Some(vegafusion_core::planning::plan::VEGA_DATASETS_CDN_BASE.to_string()), ) .unwrap(); diff --git a/vegafusion-server/src/main.rs b/vegafusion-server/src/main.rs index a009766d4..ea46f714f 100644 --- a/vegafusion-server/src/main.rs +++ b/vegafusion-server/src/main.rs @@ -9,9 +9,8 @@ use vegafusion_core::proto::gen::services::vega_fusion_runtime_server::{ }; use vegafusion_core::proto::gen::services::{ pre_transform_extract_result, pre_transform_spec_result, pre_transform_values_result, - query_request, query_result, GetCapabilitiesRequest, GetCapabilitiesResult, - PreTransformExtractResult, PreTransformSpecResult, PreTransformValuesResult, QueryRequest, - QueryResult, + query_request, query_result, PreTransformExtractResult, PreTransformSpecResult, + PreTransformValuesResult, QueryRequest, QueryResult, }; use vegafusion_core::proto::gen::tasks::TaskGraphValueResponse; use vegafusion_core::proto::gen::tasks::{ @@ -123,16 +122,6 @@ impl VegaFusionRuntimeGrpc { } } } - Some(query_request::Request::GetCapabilities(_)) => { - let caps = self.runtime.pipeline.merged_resolver_capabilities(); - Ok(QueryResult { - response: Some(query_result::Response::GetCapabilities( - GetCapabilitiesResult { - capabilities: Some(caps), - }, - )), - }) - } _ => Err(VegaFusionError::internal( "Invalid VegaFusionRuntimeRequest request", )), @@ -150,7 +139,6 @@ impl VegaFusionRuntimeGrpc { keep_variables: vec![], local_tz: "UTC".to_string(), default_input_tz: None, - data_base_url: None, }); // Decode inline datasets to VegaFusionDatasets @@ -237,7 +225,6 @@ impl VegaFusionRuntimeGrpc { row_limit: None, local_tz: "UTC".to_string(), default_input_tz: None, - data_base_url: None, }); let variables: Vec = request @@ -335,16 +322,6 @@ impl TonicVegaFusionRuntime for VegaFusionRuntimeGrpc { Err(err) => Err(Status::unknown(err.to_string())), } } - - async fn get_capabilities( - &self, - _request: Request, - ) -> Result, Status> { - let caps = 
self.runtime.pipeline.merged_resolver_capabilities(); - Ok(Response::new(GetCapabilitiesResult { - capabilities: Some(caps), - })) - } } /// VegaFusion Server diff --git a/vegafusion-server/tests/test_task_graph_runtime.rs b/vegafusion-server/tests/test_task_graph_runtime.rs index 82a26d112..032d227eb 100644 --- a/vegafusion-server/tests/test_task_graph_runtime.rs +++ b/vegafusion-server/tests/test_task_graph_runtime.rs @@ -1,6 +1,5 @@ use std::time::Duration; use vegafusion_common::data::scalar::ScalarValueHelpers; -use vegafusion_core::planning::plan::PlannerConfig; use vegafusion_core::proto::gen::services::query_result::Response; use vegafusion_core::proto::gen::services::vega_fusion_runtime_client::VegaFusionRuntimeClient; use vegafusion_core::proto::gen::services::{query_request, QueryRequest}; @@ -55,7 +54,7 @@ async fn try_it_from_spec() { .to_tasks( &tz_config, &Default::default(), - PlannerConfig::default().data_base_url, + Some(vegafusion_core::planning::plan::VEGA_DATASETS_CDN_BASE.to_string()), ) .unwrap(); @@ -100,7 +99,6 @@ async fn try_it_from_spec() { &[32.1, 59.6], ) } - other => panic!("Unexpected response variant: {other:?}"), } proc.kill().ok(); } diff --git a/vegafusion-wasm/src/lib.rs b/vegafusion-wasm/src/lib.rs index 5268c4ef3..63eb53e7f 100644 --- a/vegafusion-wasm/src/lib.rs +++ b/vegafusion-wasm/src/lib.rs @@ -24,9 +24,9 @@ use wasm_bindgen_futures::spawn_local; use vegafusion_core::planning::watch::{ExportUpdateJSON, ExportUpdateNamespace, WatchPlan}; use vegafusion_core::proto::gen::services::{ - query_request, query_result, GetCapabilitiesRequest, QueryRequest, QueryResult, + query_request, query_result, QueryRequest, QueryResult, }; -use vegafusion_core::runtime::{MergedCapabilities, VegaFusionRuntimeTrait}; +use vegafusion_core::runtime::VegaFusionRuntimeTrait; use vegafusion_core::spec::chart::ChartSpec; use vegafusion_core::chart_state::{ChartState, ChartStateOpts}; @@ -61,11 +61,10 @@ pub struct QueryFnVegaFusionRuntime { 
QueryRequest, oneshot::Sender>>, )>, - capabilities: MergedCapabilities, } impl QueryFnVegaFusionRuntime { - pub fn new(query_fn: js_sys::Function, capabilities: MergedCapabilities) -> Self { + pub fn new(query_fn: js_sys::Function) -> Self { let (sender, mut receiver) = async_mpsc::channel::<( QueryRequest, oneshot::Sender>>, @@ -149,23 +148,11 @@ impl QueryFnVegaFusionRuntime { .send(Ok(task_graph_value_response.response_values)) .unwrap(); } - query_result::Response::GetCapabilities(_) => { - // Capabilities responses are handled during initialization, - // not in the normal query loop - response_tx - .send(Err(vegafusion_common::error::VegaFusionError::internal( - "Unexpected GetCapabilities response in query loop".to_string(), - ))) - .unwrap(); - } } } }); - QueryFnVegaFusionRuntime { - sender, - capabilities, - } + QueryFnVegaFusionRuntime { sender } } } @@ -175,10 +162,6 @@ impl VegaFusionRuntimeTrait for QueryFnVegaFusionRuntime { self } - fn planner_capabilities(&self) -> MergedCapabilities { - self.capabilities.clone() - } - async fn query_request( &self, task_graph: Arc, @@ -416,79 +399,6 @@ impl Default for VegaFusionEmbedConfig { } } -/// Fetch capabilities from a remote server via the query_fn callback. -/// Sends a GetCapabilitiesRequest through the existing proto-encoded channel. 
-async fn fetch_capabilities_via_query_fn( - query_fn: &js_sys::Function, -) -> Result { - use vegafusion_core::proto::gen::tasks::ResolverCapabilities; - - let request = QueryRequest { - request: Some(query_request::Request::GetCapabilities( - GetCapabilitiesRequest {}, - )), - }; - - let mut buf: Vec = Vec::with_capacity(request.encoded_len()); - request - .encode(&mut buf) - .map_err(|e| JsError::new(&format!("Failed to encode capabilities request: {e}")))?; - - let context = JsValue::null(); - let js_buffer = js_sys::Uint8Array::from(buf.as_slice()); - let promise = query_fn - .call1(&context, &js_buffer) - .map_err(|e| { - JsError::new(&format!( - "Failed to call query_fn for capabilities: {}", - js_sys::JSON::stringify(&e).unwrap() - )) - })? - .dyn_into::() - .map_err(|e| { - JsError::new(&format!( - "query_fn did not return a promise: {}", - js_sys::JSON::stringify(&e).unwrap() - )) - })?; - - let response = JsFuture::from(promise).await.map_err(|e| { - JsError::new(&format!( - "Capabilities query failed: {}", - js_sys::JSON::stringify(&e).unwrap() - )) - })?; - - let response_array = response.dyn_into::().map_err(|e| { - JsError::new(&format!( - "Capabilities response is not Uint8Array: {}", - js_sys::JSON::stringify(&e).unwrap() - )) - })?; - - let response_bytes = response_array.to_vec(); - let result = QueryResult::decode(response_bytes.as_slice()) - .map_err(|e| JsError::new(&format!("Failed to decode capabilities response: {e}")))?; - - match result.response { - Some(query_result::Response::GetCapabilities(caps_result)) => { - let caps = caps_result - .capabilities - .unwrap_or_else(ResolverCapabilities::default); - Ok(MergedCapabilities::from_resolver_capabilities(&[caps])) - } - Some(query_result::Response::Error(err)) => { - Err(JsError::new(&format!("Server returned error for capabilities: {err:?}")).into()) - } - _ => { - // Server doesn't support capabilities — use DataFusion defaults - Ok(MergedCapabilities::from_resolver_capabilities(&[ - 
ResolverCapabilities::datafusion_defaults(), - ])) - } - } -} - /// Embed a Vega chart and accelerate with VegaFusion /// @param element - The DOM element to embed the visualization into /// @param spec - The Vega specification (as string or object) @@ -537,9 +447,7 @@ pub async fn vegafusion_embed( )) })?; - // Fetch capabilities from the remote server before constructing the runtime - let capabilities = fetch_capabilities_via_query_fn(&query_fn).await?; - Box::new(QueryFnVegaFusionRuntime::new(query_fn, capabilities)) + Box::new(QueryFnVegaFusionRuntime::new(query_fn)) }; let chart_state = ChartState::try_new( From 4e600cdb308c9fac8073189e2c7229a510d26486 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 16 Mar 2026 19:10:50 -0400 Subject: [PATCH 22/36] fix: reject unsupported formats (e.g. topojson) at planning time After removing capabilities-based format checks, the planner pushed all URL-backed datasets server-side including topojson, which DataFusion can't read. Add a hardcoded SUPPORTED_FORMATS list so formats like topojson stay client-side for Vega JS. Co-Authored-By: Claude Opus 4.6 (1M context) --- vegafusion-core/src/spec/data.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vegafusion-core/src/spec/data.rs b/vegafusion-core/src/spec/data.rs index 3c51b8ca3..e6efd4824 100644 --- a/vegafusion-core/src/spec/data.rs +++ b/vegafusion-core/src/spec/data.rs @@ -49,12 +49,23 @@ impl DataSpec { signals.into_iter().sorted().collect() } + /// Formats that VegaFusion can read server-side. Anything else (e.g. topojson) + /// stays client-side for Vega JS to handle. 
+ const SUPPORTED_FORMATS: &'static [&'static str] = &["csv", "tsv", "json", "arrow", "parquet"]; + pub fn supported( &self, planner_config: &PlannerConfig, task_scope: &TaskScope, scope: &[u32], ) -> DependencyNodeSupported { + // Check if the URL format is one VegaFusion can read + if let Some(Some(format_type)) = self.format.as_ref().map(|fmt| fmt.type_.clone()) { + if !Self::SUPPORTED_FORMATS.contains(&format_type.as_str()) { + return DependencyNodeSupported::Unsupported; + } + } + // Check if inline values array is supported if let Some(values) = &self.values { if !planner_config.extract_inline_data { From d2497496e8c4cf65cdc2d237e1c11fb6db764845 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 16 Mar 2026 19:42:37 -0400 Subject: [PATCH 23/36] style: remove section header comments Co-Authored-By: Claude Opus 4.6 (1M context) --- vegafusion-core/src/runtime/plan_resolver.rs | 12 ------------ vegafusion-runtime/tests/test_plan_resolver.rs | 2 -- 2 files changed, 14 deletions(-) diff --git a/vegafusion-core/src/runtime/plan_resolver.rs b/vegafusion-core/src/runtime/plan_resolver.rs index 9e7ed9820..b28d996b5 100644 --- a/vegafusion-core/src/runtime/plan_resolver.rs +++ b/vegafusion-core/src/runtime/plan_resolver.rs @@ -170,8 +170,6 @@ pub fn resolve_url(url: &str, data_base_url: &Option) -> Result mod tests { use super::*; - // ── has_url_scheme ── - #[test] fn test_has_url_scheme_https() { assert!(has_url_scheme("https://example.com/data.csv")); @@ -209,8 +207,6 @@ mod tests { assert!(!has_url_scheme("foo/http://bar")); } - // ── is_absolute_path ── - #[test] fn test_is_absolute_path_unix() { assert!(is_absolute_path("/tmp/data.csv")); @@ -241,8 +237,6 @@ mod tests { assert!(!is_absolute_path("relative/path")); } - // ── path_to_file_url ── - #[test] #[cfg(not(target_os = "windows"))] fn test_path_to_file_url_unix() { @@ -267,8 +261,6 @@ mod tests { ); } - // ── normalize_base_url ── - #[test] fn test_normalize_base_url_scheme() { let result = 
normalize_base_url("https://example.com/data/".to_string()).unwrap(); @@ -300,8 +292,6 @@ mod tests { assert!(result.is_err()); } - // ── resolve_url ── - #[test] fn test_resolve_url_scheme_passthrough() { let base = Some("https://cdn.example.com/".to_string()); @@ -361,8 +351,6 @@ mod tests { ); } - // ── resolve_data_base_url ── - #[test] fn test_resolve_data_base_url_default() { let default = Some("https://cdn.example.com/".to_string()); diff --git a/vegafusion-runtime/tests/test_plan_resolver.rs b/vegafusion-runtime/tests/test_plan_resolver.rs index 17df64b84..bf8e454e7 100644 --- a/vegafusion-runtime/tests/test_plan_resolver.rs +++ b/vegafusion-runtime/tests/test_plan_resolver.rs @@ -797,8 +797,6 @@ fn get_inline_datasets() -> std::collections::HashMap datasets } -// ── scan_url tests ── - /// A resolver that claims custom:// URLs by returning an ExternalTableProvider plan struct CustomSchemeScanner { schema: Arc, From 1396662c66eba092a17b4ba47cfaebadc2272fcf Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 16 Mar 2026 19:49:39 -0400 Subject: [PATCH 24/36] refactor: move URL utilities from runtime/plan_resolver.rs to data/url.rs The file no longer contains the PlanResolver trait (moved to vegafusion-runtime in an earlier commit). Rename to reflect its actual contents: URL types, resolution, and parsing utilities. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- vegafusion-core/src/data/mod.rs | 1 + vegafusion-core/src/{runtime/plan_resolver.rs => data/url.rs} | 0 vegafusion-core/src/runtime/mod.rs | 3 +-- 3 files changed, 2 insertions(+), 2 deletions(-) rename vegafusion-core/src/{runtime/plan_resolver.rs => data/url.rs} (100%) diff --git a/vegafusion-core/src/data/mod.rs b/vegafusion-core/src/data/mod.rs index f311152c4..d90e52fd2 100644 --- a/vegafusion-core/src/data/mod.rs +++ b/vegafusion-core/src/data/mod.rs @@ -1,2 +1,3 @@ pub mod dataset; pub mod tasks; +pub mod url; diff --git a/vegafusion-core/src/runtime/plan_resolver.rs b/vegafusion-core/src/data/url.rs similarity index 100% rename from vegafusion-core/src/runtime/plan_resolver.rs rename to vegafusion-core/src/data/url.rs diff --git a/vegafusion-core/src/runtime/mod.rs b/vegafusion-core/src/runtime/mod.rs index d3035df5d..eabb459d7 100644 --- a/vegafusion-core/src/runtime/mod.rs +++ b/vegafusion-core/src/runtime/mod.rs @@ -1,7 +1,6 @@ -mod plan_resolver; mod runtime; -pub use plan_resolver::{ +pub use crate::data::url::{ has_url_scheme, is_absolute_path, normalize_base_url, path_to_file_url, resolve_data_base_url, resolve_url, DataBaseUrlSetting, ParsedUrl, ResolutionResult, }; From 5720a7a6b79fa2a429674a4ab386edd8789ec66c Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Tue, 17 Mar 2026 10:08:44 -0400 Subject: [PATCH 25/36] refactor: move data_base_url from task graph to runtime data_base_url is a runtime configuration, not a task graph property. Remove it from the DataUrlTask proto, MakeTasksVisitor, ChartStateOpts, and the VegaFusionRuntimeTrait method signatures. Instead, store the resolved base URL on ResolverPipeline and read it at eval time. Add data_base_url parameter to Python VegaFusionRuntime: None/True = CDN default, str = custom URL, False = disabled. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- vegafusion-core/src/chart_state.rs | 10 +-- vegafusion-core/src/data/url.rs | 64 ------------------- vegafusion-core/src/planning/plan.rs | 4 -- vegafusion-core/src/proto/tasks.proto | 2 +- vegafusion-core/src/runtime/mod.rs | 4 +- vegafusion-core/src/runtime/runtime.rs | 18 +----- vegafusion-core/src/spec/chart.rs | 3 +- vegafusion-core/src/spec/visitors.rs | 9 +-- vegafusion-python/src/chart_state.rs | 3 +- vegafusion-python/src/lib.rs | 45 +++++++++++-- vegafusion-python/vegafusion/runtime.py | 34 ++++++++++ vegafusion-runtime/benches/spec_benchmarks.rs | 12 +--- vegafusion-runtime/src/data/pipeline.rs | 40 +++++++++++- vegafusion-runtime/src/data/tasks.rs | 6 +- .../src/expression/compiler/config.rs | 2 +- vegafusion-runtime/src/task_graph/runtime.rs | 18 +++++- vegafusion-runtime/tests/test_chart_state.rs | 1 - .../tests/test_image_comparison.rs | 6 +- .../tests/test_plan_resolver.rs | 20 +++--- vegafusion-runtime/tests/test_planning.rs | 6 +- .../tests/test_task_graph_runtime.rs | 9 +-- .../tests/test_task_graph_runtime.rs | 8 +-- vegafusion-wasm/src/lib.rs | 1 - 23 files changed, 162 insertions(+), 163 deletions(-) diff --git a/vegafusion-core/src/chart_state.rs b/vegafusion-core/src/chart_state.rs index a46473b5a..d7fad7db8 100644 --- a/vegafusion-core/src/chart_state.rs +++ b/vegafusion-core/src/chart_state.rs @@ -10,7 +10,7 @@ use crate::{ pretransform::PreTransformSpecWarning, tasks::{NodeValueIndex, TaskGraph, TzConfig, Variable}, }, - runtime::{DataBaseUrlSetting, VegaFusionRuntimeTrait}, + runtime::VegaFusionRuntimeTrait, spec::chart::ChartSpec, task_graph::{graph::ScopedVariable, task_value::TaskValue}, }; @@ -28,7 +28,6 @@ use vegafusion_common::{ pub struct ChartStateOpts { pub tz_config: TzConfig, pub row_limit: Option, - pub data_base_url: DataBaseUrlSetting, } impl Default for ChartStateOpts { @@ -39,7 +38,6 @@ impl Default for ChartStateOpts { default_input_tz: None, }, row_limit: None, - 
data_base_url: DataBaseUrlSetting::Default, } } } @@ -68,10 +66,6 @@ impl ChartState { .map(|(k, ds)| (k.clone(), ds.fingerprint())) .collect::>(); - let resolved_base = crate::runtime::resolve_data_base_url( - opts.data_base_url.clone(), - Some(crate::planning::plan::VEGA_DATASETS_CDN_BASE.to_string()), - )?; let plan = SpecPlan::try_new(&spec, &PlannerConfig::default())?; let task_scope = plan @@ -80,7 +74,7 @@ impl ChartState { .with_context(|| "Failed to create task scope for server spec")?; let tasks = plan .server_spec - .to_tasks(&opts.tz_config, &dataset_fingerprints, resolved_base) + .to_tasks(&opts.tz_config, &dataset_fingerprints) .unwrap(); let task_graph = TaskGraph::new(tasks, &task_scope).unwrap(); let task_graph_mapping = task_graph.build_mapping(); diff --git a/vegafusion-core/src/data/url.rs b/vegafusion-core/src/data/url.rs index b28d996b5..b58837b88 100644 --- a/vegafusion-core/src/data/url.rs +++ b/vegafusion-core/src/data/url.rs @@ -12,18 +12,6 @@ pub enum ResolutionResult { Plan(LogicalPlan), } -/// Three-state base URL setting for public API boundaries. -#[derive(Clone, Debug, Default)] -pub enum DataBaseUrlSetting { - /// Use the default CDN base URL (vega-datasets) - #[default] - Default, - /// Disable base URL; relative paths produce an error - Disabled, - /// Use a custom base URL (scheme URL or absolute path) - Custom(String), -} - /// Parsed URL representation passed to resolvers during the scan phase. /// All fields are populated from the fully-resolved URL (after base URL /// resolution and hash-stripping). Resolvers pattern-match on these fields @@ -47,20 +35,6 @@ pub struct ParsedUrl { pub parse: Option, } -/// Map a DataBaseUrlSetting (from public API) to the two-state Option -/// used by PlannerConfig. Custom base URLs are normalized (bare absolute paths -/// become file:// URLs). 
-pub fn resolve_data_base_url( - api_value: DataBaseUrlSetting, - default: Option, -) -> Result> { - match api_value { - DataBaseUrlSetting::Default => Ok(default), - DataBaseUrlSetting::Disabled => Ok(None), - DataBaseUrlSetting::Custom(s) => Ok(Some(normalize_base_url(s)?)), - } -} - static URL_SCHEME_RE: LazyLock = LazyLock::new(|| Regex::new(r"^(//|[a-zA-Z][a-zA-Z0-9+.\-]*://)").unwrap()); @@ -350,42 +324,4 @@ mod tests { "https://proxy.com/fetch?target=http://evil.com/data" ); } - - #[test] - fn test_resolve_data_base_url_default() { - let default = Some("https://cdn.example.com/".to_string()); - let result = resolve_data_base_url(DataBaseUrlSetting::Default, default.clone()).unwrap(); - assert_eq!(result, default); - } - - #[test] - fn test_resolve_data_base_url_disabled() { - let result = resolve_data_base_url( - DataBaseUrlSetting::Disabled, - Some("https://cdn.example.com/".to_string()), - ) - .unwrap(); - assert_eq!(result, None); - } - - #[test] - fn test_resolve_data_base_url_custom() { - let result = resolve_data_base_url( - DataBaseUrlSetting::Custom("https://my-server.com/data/".to_string()), - Some("https://cdn.example.com/".to_string()), - ) - .unwrap(); - assert_eq!(result, Some("https://my-server.com/data/".to_string())); - } - - #[test] - #[cfg(not(target_os = "windows"))] - fn test_resolve_data_base_url_custom_path() { - let result = resolve_data_base_url( - DataBaseUrlSetting::Custom("/home/user/data".to_string()), - None, - ) - .unwrap(); - assert_eq!(result, Some("file:///home/user/data".to_string())); - } } diff --git a/vegafusion-core/src/planning/plan.rs b/vegafusion-core/src/planning/plan.rs index 546c931c9..a9a2b929c 100644 --- a/vegafusion-core/src/planning/plan.rs +++ b/vegafusion-core/src/planning/plan.rs @@ -97,10 +97,6 @@ pub struct PlannerConfig { pub strip_tooltip_encoding: bool, } -/// Default CDN base URL for vega-datasets -pub const VEGA_DATASETS_CDN_BASE: &str = - 
"https://raw.githubusercontent.com/vega/vega-datasets/v2.3.0/"; - impl Default for PlannerConfig { fn default() -> Self { Self { diff --git a/vegafusion-core/src/proto/tasks.proto b/vegafusion-core/src/proto/tasks.proto index a984ab72c..0761f24db 100644 --- a/vegafusion-core/src/proto/tasks.proto +++ b/vegafusion-core/src/proto/tasks.proto @@ -77,7 +77,7 @@ message DataUrlTask { int32 batch_size = 3; ScanUrlFormat format_type = 4; transforms.TransformPipeline pipeline = 5; - optional string data_base_url = 6; + reserved 6; } // ## Inline values task diff --git a/vegafusion-core/src/runtime/mod.rs b/vegafusion-core/src/runtime/mod.rs index eabb459d7..00f406ab2 100644 --- a/vegafusion-core/src/runtime/mod.rs +++ b/vegafusion-core/src/runtime/mod.rs @@ -1,7 +1,7 @@ mod runtime; pub use crate::data::url::{ - has_url_scheme, is_absolute_path, normalize_base_url, path_to_file_url, resolve_data_base_url, - resolve_url, DataBaseUrlSetting, ParsedUrl, ResolutionResult, + has_url_scheme, is_absolute_path, normalize_base_url, path_to_file_url, resolve_url, ParsedUrl, + ResolutionResult, }; pub use runtime::{PreTransformExtractTable, VegaFusionRuntimeTrait}; diff --git a/vegafusion-core/src/runtime/runtime.rs b/vegafusion-core/src/runtime/runtime.rs index f941b36b5..28607b809 100644 --- a/vegafusion-core/src/runtime/runtime.rs +++ b/vegafusion-core/src/runtime/runtime.rs @@ -1,7 +1,6 @@ use std::{any::Any, collections::HashMap, sync::Arc}; use crate::proto::gen::pretransform::pre_transform_values_warning::WarningType as ValuesWarningType; -use crate::runtime::{resolve_data_base_url, DataBaseUrlSetting}; use crate::task_graph::task_value::{MaterializedTaskValue, TaskValue}; use crate::{ data::dataset::VegaFusionDataset, @@ -98,13 +97,7 @@ pub trait VegaFusionRuntimeTrait: Send + Sync { preserve_interactivity: bool, inline_datasets: &HashMap, keep_variables: Vec, - data_base_url: DataBaseUrlSetting, ) -> Result<(SpecPlan, Vec)> { - let resolved_base = resolve_data_base_url( - 
data_base_url, - Some(crate::planning::plan::VEGA_DATASETS_CDN_BASE.to_string()), - )?; - // Create spec plan let plan = SpecPlan::try_new( spec, @@ -125,7 +118,7 @@ pub trait VegaFusionRuntimeTrait: Send + Sync { let task_scope = plan.server_spec.to_task_scope().unwrap(); let tasks = plan .server_spec - .to_tasks(&tz_config, &dataset_fingerprints, resolved_base) + .to_tasks(&tz_config, &dataset_fingerprints) .unwrap(); let task_graph = TaskGraph::new(tasks, &task_scope).unwrap(); let task_graph_mapping = task_graph.build_mapping(); @@ -178,7 +171,6 @@ pub trait VegaFusionRuntimeTrait: Send + Sync { options.preserve_interactivity, inline_datasets, keep_variables, - DataBaseUrlSetting::Default, ) .await?; @@ -213,7 +205,6 @@ pub trait VegaFusionRuntimeTrait: Send + Sync { options.preserve_interactivity, inline_datasets, keep_variables, - DataBaseUrlSetting::Default, ) .await?; let init_arrow = self.materialize_export_updates(init).await?; @@ -338,11 +329,6 @@ pub trait VegaFusionRuntimeTrait: Send + Sync { // if they are not used elsewhere in the spec let keep_variables = Vec::from(variables); - let resolved_base = resolve_data_base_url( - DataBaseUrlSetting::Default, - Some(crate::planning::plan::VEGA_DATASETS_CDN_BASE.to_string()), - )?; - // Create spec plan let plan = SpecPlan::try_new( spec, @@ -371,7 +357,7 @@ pub trait VegaFusionRuntimeTrait: Send + Sync { let task_scope = plan.server_spec.to_task_scope().unwrap(); let tasks = plan .server_spec - .to_tasks(&tz_config, &dataset_fingerprints, resolved_base)?; + .to_tasks(&tz_config, &dataset_fingerprints)?; let task_graph = TaskGraph::new(tasks, &task_scope).unwrap(); let task_graph_mapping = task_graph.build_mapping(); diff --git a/vegafusion-core/src/spec/chart.rs b/vegafusion-core/src/spec/chart.rs index 5866fa393..0d0f55efd 100644 --- a/vegafusion-core/src/spec/chart.rs +++ b/vegafusion-core/src/spec/chart.rs @@ -150,9 +150,8 @@ impl ChartSpec { &self, tz_config: &TzConfig, dataset_fingerprints: &HashMap, - 
data_base_url: Option, ) -> Result> { - let mut visitor = MakeTasksVisitor::new(tz_config, dataset_fingerprints, data_base_url); + let mut visitor = MakeTasksVisitor::new(tz_config, dataset_fingerprints); self.walk(&mut visitor)?; Ok(visitor.tasks) } diff --git a/vegafusion-core/src/spec/visitors.rs b/vegafusion-core/src/spec/visitors.rs index a407eb2b2..d2cc9372b 100644 --- a/vegafusion-core/src/spec/visitors.rs +++ b/vegafusion-core/src/spec/visitors.rs @@ -108,20 +108,14 @@ pub struct MakeTasksVisitor<'a> { pub tasks: Vec, pub tz_config: TzConfig, pub dataset_fingerprints: &'a HashMap, - pub data_base_url: Option, } impl<'a> MakeTasksVisitor<'a> { - pub fn new( - tz_config: &TzConfig, - dataset_fingerprints: &'a HashMap, - data_base_url: Option, - ) -> Self { + pub fn new(tz_config: &TzConfig, dataset_fingerprints: &'a HashMap) -> Self { Self { tasks: Default::default(), tz_config: tz_config.clone(), dataset_fingerprints, - data_base_url, } } } @@ -204,7 +198,6 @@ impl ChartVisitor for MakeTasksVisitor<'_> { format_type, pipeline, url: Some(proto_url), - data_base_url: self.data_base_url.clone(), }, &self.tz_config, ) diff --git a/vegafusion-python/src/chart_state.rs b/vegafusion-python/src/chart_state.rs index 7af25b865..c54ff260f 100644 --- a/vegafusion-python/src/chart_state.rs +++ b/vegafusion-python/src/chart_state.rs @@ -10,7 +10,7 @@ use vegafusion_core::{ data::dataset::VegaFusionDataset, planning::{plan::PreTransformSpecWarningSpec, watch::WatchPlan}, proto::gen::tasks::TzConfig, - runtime::{DataBaseUrlSetting, VegaFusionRuntimeTrait}, + runtime::VegaFusionRuntimeTrait, spec::chart::ChartSpec, }; @@ -37,7 +37,6 @@ impl PyChartState { ChartStateOpts { tz_config, row_limit, - data_base_url: DataBaseUrlSetting::Default, }, ))?; Ok(Self { diff --git a/vegafusion-python/src/lib.rs b/vegafusion-python/src/lib.rs index 6fc1d62a1..15b5764a5 100644 --- a/vegafusion-python/src/lib.rs +++ b/vegafusion-python/src/lib.rs @@ -67,9 +67,12 @@ impl PyVegaFusionRuntime { 
worker_threads: Option, resolvers: Vec>, use_current_thread: bool, + data_base_url: Option<&Bound>, ) -> PyResult { initialize_logging(); + let data_base_url_setting = parse_data_base_url(data_base_url)?; + let tokio_runtime_connection = if use_current_thread { tokio::runtime::Builder::new_current_thread() .enable_all() @@ -89,23 +92,54 @@ impl PyVegaFusionRuntime { }; Ok(Self { - runtime: Arc::new(VegaFusionRuntime::new( + runtime: Arc::new(VegaFusionRuntime::new_with_data_base_url( Some(VegaFusionCache::new(max_capacity, memory_limit)), resolvers, - )), + data_base_url_setting, + )?), tokio_runtime: Arc::new(tokio_runtime_connection), }) } } +/// Parse Python `data_base_url` argument into `DataBaseUrlSetting`. +/// +/// - `None` or `True` -> `DataBaseUrlSetting::Default` (CDN) +/// - `str` -> `DataBaseUrlSetting::Custom(s)` +/// - `False` -> `DataBaseUrlSetting::Disabled` +fn parse_data_base_url( + value: Option<&Bound>, +) -> PyResult { + use vegafusion_runtime::data::pipeline::DataBaseUrlSetting; + match value { + None => Ok(DataBaseUrlSetting::Default), + Some(obj) => { + if let Ok(b) = obj.extract::() { + if b { + Ok(DataBaseUrlSetting::Default) + } else { + Ok(DataBaseUrlSetting::Disabled) + } + } else if let Ok(s) = obj.extract::() { + Ok(DataBaseUrlSetting::Custom(s)) + } else { + Err(PyValueError::new_err( + "data_base_url must be a str, bool, or None", + )) + } + } + } +} + #[pymethods] impl PyVegaFusionRuntime { #[staticmethod] - #[pyo3(signature = (max_capacity=None, memory_limit=None, worker_threads=None))] + #[pyo3(signature = (max_capacity=None, memory_limit=None, worker_threads=None, data_base_url=None))] pub fn new_embedded( max_capacity: Option, memory_limit: Option, worker_threads: Option, + data_base_url: Option<&Bound>, ) -> PyResult { Self::build_with_resolvers( max_capacity, @@ -113,16 +147,18 @@ impl PyVegaFusionRuntime { worker_threads, Vec::new(), false, + data_base_url, ) } #[staticmethod] - #[pyo3(signature = (py_resolvers, 
max_capacity=None, memory_limit=None, worker_threads=None))] + #[pyo3(signature = (py_resolvers, max_capacity=None, memory_limit=None, worker_threads=None, data_base_url=None))] pub fn new_with_resolvers( py_resolvers: Vec>, max_capacity: Option, memory_limit: Option, worker_threads: Option, + data_base_url: Option<&Bound>, ) -> PyResult { let py_resolvers: Vec = py_resolvers .into_iter() @@ -142,6 +178,7 @@ impl PyVegaFusionRuntime { worker_threads, resolvers, use_current_thread, + data_base_url, ) } diff --git a/vegafusion-python/vegafusion/runtime.py b/vegafusion-python/vegafusion/runtime.py index 45b2eb1e9..e0e86cfdf 100644 --- a/vegafusion-python/vegafusion/runtime.py +++ b/vegafusion-python/vegafusion/runtime.py @@ -207,6 +207,7 @@ def __init__( | list[PlanResolver] | tuple[PlanResolver, ...] | None = None, + data_base_url: str | bool | None = None, ) -> None: """ Initialize a VegaFusionRuntime. @@ -220,6 +221,11 @@ def __init__( Can be a single resolver or a list of resolvers that form a pipeline (executed in order; short-circuits on first Table result). + data_base_url: Base URL for resolving relative data URLs. 
+ - None or True: use the default CDN + (https://raw.githubusercontent.com/vega/vega-datasets/v2.3.0/) + - str: custom base URL (scheme URL or absolute path) + - False: disabled; relative paths produce an error """ self._runtime = None self._grpc_url: str | None = None @@ -227,6 +233,7 @@ def __init__( self._memory_limit = memory_limit self._worker_threads = worker_threads self._plan_resolvers = _normalize_resolvers(plan_resolver) + self._data_base_url = data_base_url @property def runtime(self) -> PyVegaFusionRuntime: @@ -251,12 +258,14 @@ def runtime(self) -> PyVegaFusionRuntime: self.cache_capacity, self.memory_limit, self.worker_threads, + data_base_url=self._data_base_url, ) else: self._runtime = PyVegaFusionRuntime.new_embedded( self.cache_capacity, self.memory_limit, self.worker_threads, + data_base_url=self._data_base_url, ) return self._runtime @@ -883,6 +892,31 @@ def cache_capacity(self, value: int) -> None: self._cache_capacity = value self.reset() + @property + def data_base_url(self) -> str | bool | None: + """ + Get the data base URL setting. + + Returns: + The current data_base_url setting. + """ + return self._data_base_url + + @data_base_url.setter + def data_base_url(self, value: str | bool | None) -> None: + """ + Set the data base URL and restart the runtime. + + Args: + value: Base URL for resolving relative data URLs. + - None or True: use the default CDN + - str: custom base URL + - False: disabled + """ + if value != self._data_base_url: + self._data_base_url = value + self.reset() + @property def plan_resolver(self) -> PlanResolver | tuple[PlanResolver, ...] 
| None: if not self._plan_resolvers: diff --git a/vegafusion-runtime/benches/spec_benchmarks.rs b/vegafusion-runtime/benches/spec_benchmarks.rs index 2e8918bcd..c2a0b313c 100644 --- a/vegafusion-runtime/benches/spec_benchmarks.rs +++ b/vegafusion-runtime/benches/spec_benchmarks.rs @@ -46,11 +46,7 @@ async fn eval_spec_get_variable(full_spec: ChartSpec, var: &ScopedVariable) -> V let task_scope = spec_plan.server_spec.to_task_scope().unwrap(); let tasks = spec_plan .server_spec - .to_tasks( - &tz_config, - &Default::default(), - Some(vegafusion_core::planning::plan::VEGA_DATASETS_CDN_BASE.to_string()), - ) + .to_tasks(&tz_config, &Default::default()) .unwrap(); let task_graph = TaskGraph::new(tasks, &task_scope).unwrap(); let task_graph_mapping = task_graph.build_mapping(); @@ -101,11 +97,7 @@ async fn eval_spec_sequence(full_spec: ChartSpec, full_updates: Vec` used internally. +/// Custom base URLs are normalized (bare absolute paths become file:// URLs). +pub fn resolve_data_base_url(setting: &DataBaseUrlSetting) -> Result> { + match setting { + DataBaseUrlSetting::Default => Ok(Some(VEGA_DATASETS_CDN_BASE.to_string())), + DataBaseUrlSetting::Disabled => Ok(None), + DataBaseUrlSetting::Custom(s) => Ok(Some(normalize_base_url(s.clone())?)), + } +} + /// Chains resolvers with a terminal `DataFusionResolver`. /// /// All resolvers (user-supplied + DataFusionResolver) live in a single vec. @@ -25,18 +52,29 @@ use super::plan_resolver::PlanResolver; pub struct ResolverPipeline { resolvers: Arc>>, ctx: Arc, + data_base_url: Option, } impl ResolverPipeline { - pub fn new(user_resolvers: Vec>, ctx: Arc) -> Self { + pub fn new( + user_resolvers: Vec>, + ctx: Arc, + data_base_url: Option, + ) -> Self { let mut resolvers: Vec> = user_resolvers; resolvers.push(Arc::new(DataFusionResolver::new(ctx.clone()))); Self { resolvers: Arc::new(resolvers), ctx, + data_base_url, } } + /// The resolved data base URL, used for resolving relative URLs at eval time. 
+ pub fn data_base_url(&self) -> &Option { + &self.data_base_url + } + /// Whether the runtime should eagerly materialize a `LogicalPlan` into /// an in-memory Arrow table. /// diff --git a/vegafusion-runtime/src/data/tasks.rs b/vegafusion-runtime/src/data/tasks.rs index 6c3aadea3..162455b35 100644 --- a/vegafusion-runtime/src/data/tasks.rs +++ b/vegafusion-runtime/src/data/tasks.rs @@ -135,12 +135,14 @@ impl TaskCall for DataUrlTask { // Build url string — resolve at eval time for both static and signal URLs let url = match self.url.as_ref().unwrap() { - Url::String(url) => vegafusion_core::runtime::resolve_url(url, &self.data_base_url)?, + Url::String(url) => { + vegafusion_core::runtime::resolve_url(url, pipeline.data_base_url())? + } Url::Expr(expr) => { let compiled = compile(expr, &config, None).await?; let url_scalar = compiled.eval_to_scalar()?; let raw_url = url_scalar.to_scalar_string()?; - vegafusion_core::runtime::resolve_url(&raw_url, &self.data_base_url)? + vegafusion_core::runtime::resolve_url(&raw_url, pipeline.data_base_url())? 
} }; diff --git a/vegafusion-runtime/src/expression/compiler/config.rs b/vegafusion-runtime/src/expression/compiler/config.rs index 9d0641ea9..bd758bfcc 100644 --- a/vegafusion-runtime/src/expression/compiler/config.rs +++ b/vegafusion-runtime/src/expression/compiler/config.rs @@ -28,7 +28,7 @@ impl Default for CompilationConfig { callable_scope: default_callables(), constants: default_constants(), tz_config: None, - pipeline: ResolverPipeline::new(Vec::new(), ctx), + pipeline: ResolverPipeline::new(Vec::new(), ctx, None), } } } diff --git a/vegafusion-runtime/src/task_graph/runtime.rs b/vegafusion-runtime/src/task_graph/runtime.rs index 0c1aff015..cf6992bb1 100644 --- a/vegafusion-runtime/src/task_graph/runtime.rs +++ b/vegafusion-runtime/src/task_graph/runtime.rs @@ -33,6 +33,8 @@ use { type CacheValue = (TaskValue, Vec); +use crate::data::pipeline::{resolve_data_base_url, DataBaseUrlSetting}; + #[derive(Clone)] pub struct VegaFusionRuntime { pub cache: VegaFusionCache, @@ -42,12 +44,26 @@ pub struct VegaFusionRuntime { impl VegaFusionRuntime { pub fn new(cache: Option, plan_resolvers: Vec>) -> Self { let ctx = Arc::new(make_datafusion_context()); + let data_base_url = resolve_data_base_url(&DataBaseUrlSetting::Default).unwrap_or_default(); Self { cache: cache.unwrap_or_else(|| VegaFusionCache::new(Some(32), None)), - pipeline: ResolverPipeline::new(plan_resolvers, ctx), + pipeline: ResolverPipeline::new(plan_resolvers, ctx, data_base_url), } } + pub fn new_with_data_base_url( + cache: Option, + plan_resolvers: Vec>, + data_base_url_setting: DataBaseUrlSetting, + ) -> vegafusion_core::error::Result { + let ctx = Arc::new(make_datafusion_context()); + let data_base_url = resolve_data_base_url(&data_base_url_setting)?; + Ok(Self { + cache: cache.unwrap_or_else(|| VegaFusionCache::new(Some(32), None)), + pipeline: ResolverPipeline::new(plan_resolvers, ctx, data_base_url), + }) + } + pub async fn get_node_value( &self, task_graph: Arc, diff --git 
a/vegafusion-runtime/tests/test_chart_state.rs b/vegafusion-runtime/tests/test_chart_state.rs index 4432baadd..2aaca80a3 100644 --- a/vegafusion-runtime/tests/test_chart_state.rs +++ b/vegafusion-runtime/tests/test_chart_state.rs @@ -37,7 +37,6 @@ mod tests { default_input_tz: None, }, row_limit: None, - data_base_url: Default::default(), }, ) .await diff --git a/vegafusion-runtime/tests/test_image_comparison.rs b/vegafusion-runtime/tests/test_image_comparison.rs index 368d26996..ed57dd24f 100644 --- a/vegafusion-runtime/tests/test_image_comparison.rs +++ b/vegafusion-runtime/tests/test_image_comparison.rs @@ -1442,11 +1442,7 @@ async fn check_spec_sequence( // Build task graph let tasks = spec_plan .server_spec - .to_tasks( - &tz_config, - &Default::default(), - Some(vegafusion_core::planning::plan::VEGA_DATASETS_CDN_BASE.to_string()), - ) + .to_tasks(&tz_config, &Default::default()) .unwrap(); let mut task_graph = TaskGraph::new(tasks, &task_scope).unwrap(); let task_graph_mapping = task_graph.build_mapping(); diff --git a/vegafusion-runtime/tests/test_plan_resolver.rs b/vegafusion-runtime/tests/test_plan_resolver.rs index bf8e454e7..0f2d68a16 100644 --- a/vegafusion-runtime/tests/test_plan_resolver.rs +++ b/vegafusion-runtime/tests/test_plan_resolver.rs @@ -559,7 +559,7 @@ async fn test_execute_plan_pipeline_chains_resolvers_in_order() { let plan = build_external_scan_plan("movies_chain"); let ctx = datafusion::prelude::SessionContext::new(); - let pipeline = ResolverPipeline::new(resolvers, Arc::new(ctx)); + let pipeline = ResolverPipeline::new(resolvers, Arc::new(ctx), None); let table = pipeline.resolve(plan).await.unwrap(); assert_eq!(table_row_count(&table), 10); assert_eq!(rewrite_resolver.get_call_count(), 1); @@ -597,7 +597,7 @@ async fn test_execute_plan_pipeline_short_circuits_after_table_result() { let plan = build_external_scan_plan("movies_short_circuit"); let ctx = datafusion::prelude::SessionContext::new(); - let pipeline = 
ResolverPipeline::new(resolvers, Arc::new(ctx)); + let pipeline = ResolverPipeline::new(resolvers, Arc::new(ctx), None); let table = pipeline.resolve(plan).await.unwrap(); assert_eq!(table_row_count(&table), 10); assert_eq!(table_resolver.get_call_count(), 1); @@ -621,7 +621,7 @@ async fn test_execute_plan_pipeline_fails_if_external_not_resolved() { let plan = build_external_scan_plan("movies_unresolved"); let ctx = datafusion::prelude::SessionContext::new(); - let pipeline = ResolverPipeline::new(resolvers, Arc::new(ctx)); + let pipeline = ResolverPipeline::new(resolvers, Arc::new(ctx), None); let err = pipeline.resolve(plan).await.unwrap_err(); let msg = err.to_string(); assert!( @@ -847,7 +847,7 @@ async fn test_scan_url_custom_scheme_first_wins() { }; let ctx = Arc::new(datafusion::prelude::SessionContext::new()); - let pipeline = ResolverPipeline::new(vec![Arc::new(scanner)], ctx); + let pipeline = ResolverPipeline::new(vec![Arc::new(scanner)], ctx, None); let parsed = ParsedUrl { url: "custom://mydb/table1".to_string(), @@ -871,7 +871,7 @@ async fn test_scan_url_custom_scheme_first_wins() { async fn test_scan_url_unknown_scheme_falls_through() { let ctx = Arc::new(datafusion::prelude::SessionContext::new()); // Pipeline with only DataFusionResolver (no user resolvers) - let pipeline = ResolverPipeline::new(vec![], ctx); + let pipeline = ResolverPipeline::new(vec![], ctx, None); let parsed = ParsedUrl { url: "spark://cluster/table1".to_string(), @@ -915,7 +915,7 @@ async fn test_should_materialize() { .unwrap(); // DataFusion-only: all support arrow → always materialize - let pipeline = ResolverPipeline::new(vec![], ctx.clone()); + let pipeline = ResolverPipeline::new(vec![], ctx.clone(), None); assert!(pipeline.should_materialize(&plain_plan)); assert!(pipeline.should_materialize(&external_plan)); @@ -923,7 +923,7 @@ async fn test_should_materialize() { let scanner = CustomSchemeScanner { schema: schema.clone(), }; - let pipeline = 
ResolverPipeline::new(vec![Arc::new(scanner)], ctx); + let pipeline = ResolverPipeline::new(vec![Arc::new(scanner)], ctx, None); assert!(pipeline.should_materialize(&plain_plan)); assert!(!pipeline.should_materialize(&external_plan)); } @@ -1346,7 +1346,7 @@ async fn test_execute_plan_pipeline_propagates_resolver_errors() { let ctx = datafusion::prelude::SessionContext::new(); let plan = build_external_scan_plan("movies"); - let pipeline = ResolverPipeline::new(resolvers, Arc::new(ctx)); + let pipeline = ResolverPipeline::new(resolvers, Arc::new(ctx), None); let err = pipeline.resolve(plan).await.unwrap_err(); let msg = err.to_string(); assert!( @@ -1401,7 +1401,7 @@ async fn test_resolver_pipeline_should_materialize() { .unwrap(); // DataFusion-only: always materialize - let empty_pipeline = ResolverPipeline::new(vec![], ctx.clone()); + let empty_pipeline = ResolverPipeline::new(vec![], ctx.clone(), None); assert!( empty_pipeline.should_materialize(&plain_plan), "DataFusion-only pipeline should materialize plain plans" @@ -1415,7 +1415,7 @@ async fn test_resolver_pipeline_should_materialize() { let events = Arc::new(Mutex::new(Vec::new())); let resolver = ScriptedResolver::new("test", ResolverBehavior::PassThroughPlan, events); let resolvers: Vec> = vec![Arc::new(resolver)]; - let pipeline_with_resolvers = ResolverPipeline::new(resolvers, ctx); + let pipeline_with_resolvers = ResolverPipeline::new(resolvers, ctx, None); assert!( pipeline_with_resolvers.should_materialize(&plain_plan), "Non-arrow pipeline should still materialize plans with no external tables" diff --git a/vegafusion-runtime/tests/test_planning.rs b/vegafusion-runtime/tests/test_planning.rs index ffd0fdd3c..8219ee8c0 100644 --- a/vegafusion-runtime/tests/test_planning.rs +++ b/vegafusion-runtime/tests/test_planning.rs @@ -55,11 +55,7 @@ async fn test_extract_server_data() { println!("client_stubs: {client_stubs:?}"); let tasks = server_spec - .to_tasks( - &tz_config, - &Default::default(), - 
Some(vegafusion_core::planning::plan::VEGA_DATASETS_CDN_BASE.to_string()), - ) + .to_tasks(&tz_config, &Default::default()) .unwrap(); let graph = Arc::new(TaskGraph::new(tasks, &task_scope).unwrap()); let mapping = graph.build_mapping(); diff --git a/vegafusion-runtime/tests/test_task_graph_runtime.rs b/vegafusion-runtime/tests/test_task_graph_runtime.rs index a58e7d8fc..b6a2d0658 100644 --- a/vegafusion-runtime/tests/test_task_graph_runtime.rs +++ b/vegafusion-runtime/tests/test_task_graph_runtime.rs @@ -50,7 +50,6 @@ async fn try_it() { batch_size: 1024, format_type: None, pipeline: None, - data_base_url: None, }, &tz_config, ), @@ -132,13 +131,7 @@ async fn try_it_from_spec() { default_input_tz: None, }; let task_scope = chart.to_task_scope().unwrap(); - let tasks = chart - .to_tasks( - &tz_config, - &Default::default(), - Some(vegafusion_core::planning::plan::VEGA_DATASETS_CDN_BASE.to_string()), - ) - .unwrap(); + let tasks = chart.to_tasks(&tz_config, &Default::default()).unwrap(); println!("task_scope: {task_scope:?}"); println!("tasks: {tasks:?}"); diff --git a/vegafusion-server/tests/test_task_graph_runtime.rs b/vegafusion-server/tests/test_task_graph_runtime.rs index 032d227eb..b77faffa2 100644 --- a/vegafusion-server/tests/test_task_graph_runtime.rs +++ b/vegafusion-server/tests/test_task_graph_runtime.rs @@ -50,13 +50,7 @@ async fn try_it_from_spec() { default_input_tz: None, }; let task_scope = chart.to_task_scope().unwrap(); - let tasks = chart - .to_tasks( - &tz_config, - &Default::default(), - Some(vegafusion_core::planning::plan::VEGA_DATASETS_CDN_BASE.to_string()), - ) - .unwrap(); + let tasks = chart.to_tasks(&tz_config, &Default::default()).unwrap(); let graph = TaskGraph::new(tasks, &task_scope).unwrap(); let request = QueryRequest { diff --git a/vegafusion-wasm/src/lib.rs b/vegafusion-wasm/src/lib.rs index 63eb53e7f..758122799 100644 --- a/vegafusion-wasm/src/lib.rs +++ b/vegafusion-wasm/src/lib.rs @@ -457,7 +457,6 @@ pub async fn 
vegafusion_embed( ChartStateOpts { tz_config, row_limit: None, - data_base_url: Default::default(), }, ) .await From fa3a62c7a21224148f3aaf28308fc564cfd7ab1d Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Tue, 17 Mar 2026 19:01:45 -0400 Subject: [PATCH 26/36] refactor: introduce TaskContext and move data_base_url to VegaFusionRuntime Bundle the scattered eval parameters (tz_config, inline_datasets, pipeline, data_base_url) into a TaskContext struct. Move data_base_url from ResolverPipeline to VegaFusionRuntime since it's orthogonal to the resolver chain. Co-Authored-By: Claude Opus 4.6 (1M context) --- vegafusion-runtime/src/data/pipeline.rs | 13 +--- vegafusion-runtime/src/data/tasks.rs | 77 ++++++++++--------- .../src/expression/compiler/config.rs | 2 +- vegafusion-runtime/src/signal/mod.rs | 17 ++-- vegafusion-runtime/src/task_graph/runtime.rs | 33 +++++--- vegafusion-runtime/src/task_graph/task.rs | 37 ++++----- .../tests/test_plan_resolver.rs | 20 ++--- 7 files changed, 96 insertions(+), 103 deletions(-) diff --git a/vegafusion-runtime/src/data/pipeline.rs b/vegafusion-runtime/src/data/pipeline.rs index ed7f10fe5..a327f0342 100644 --- a/vegafusion-runtime/src/data/pipeline.rs +++ b/vegafusion-runtime/src/data/pipeline.rs @@ -52,29 +52,18 @@ pub fn resolve_data_base_url(setting: &DataBaseUrlSetting) -> Result>>, ctx: Arc, - data_base_url: Option, } impl ResolverPipeline { - pub fn new( - user_resolvers: Vec>, - ctx: Arc, - data_base_url: Option, - ) -> Self { + pub fn new(user_resolvers: Vec>, ctx: Arc) -> Self { let mut resolvers: Vec> = user_resolvers; resolvers.push(Arc::new(DataFusionResolver::new(ctx.clone()))); Self { resolvers: Arc::new(resolvers), ctx, - data_base_url, } } - /// The resolved data base URL, used for resolving relative URLs at eval time. - pub fn data_base_url(&self) -> &Option { - &self.data_base_url - } - /// Whether the runtime should eagerly materialize a `LogicalPlan` into /// an in-memory Arrow table. 
/// diff --git a/vegafusion-runtime/src/data/tasks.rs b/vegafusion-runtime/src/data/tasks.rs index 162455b35..b21d42e3c 100644 --- a/vegafusion-runtime/src/data/tasks.rs +++ b/vegafusion-runtime/src/data/tasks.rs @@ -2,7 +2,7 @@ use crate::data::pipeline::ResolverPipeline; use crate::expression::compiler::compile; use crate::expression::compiler::config::CompilationConfig; use crate::expression::compiler::utils::ExprHelpers; -use crate::task_graph::task::TaskCall; +use crate::task_graph::task::{TaskCall, TaskContext}; use std::borrow::Cow; use async_trait::async_trait; @@ -124,25 +124,25 @@ impl TaskCall for DataUrlTask { async fn eval( &self, values: &[TaskValue], - tz_config: &Option, - inline_datasets: HashMap, - pipeline: ResolverPipeline, + ctx: &TaskContext, ) -> Result<(TaskValue, Vec)> { - let ctx = Arc::new(pipeline.ctx().clone()); + let session_ctx = Arc::new(ctx.pipeline.ctx().clone()); // Build compilation config for url signal (if any) and transforms (if any) - let config = - build_compilation_config(&self.input_vars(), values, tz_config, pipeline.clone()); + let config = build_compilation_config( + &self.input_vars(), + values, + &ctx.tz_config, + ctx.pipeline.clone(), + ); // Build url string — resolve at eval time for both static and signal URLs let url = match self.url.as_ref().unwrap() { - Url::String(url) => { - vegafusion_core::runtime::resolve_url(url, pipeline.data_base_url())? - } + Url::String(url) => vegafusion_core::runtime::resolve_url(url, &ctx.data_base_url)?, Url::Expr(expr) => { let compiled = compile(expr, &config, None).await?; let url_scalar = compiled.eval_to_scalar()?; let raw_url = url_scalar.to_scalar_string()?; - vegafusion_core::runtime::resolve_url(&raw_url, pipeline.data_base_url())? + vegafusion_core::runtime::resolve_url(&raw_url, &ctx.data_base_url)? 
} }; @@ -165,20 +165,20 @@ impl TaskCall for DataUrlTask { let inline_name = extract_inline_dataset(&url).map(|name| name.trim().to_string()); let inline_dataset_info: Option<&VegaFusionDataset> = inline_name .as_ref() - .and_then(|name| inline_datasets.get(name)); + .and_then(|name| ctx.inline_datasets.get(name)); let df = if let Some(inline_name) = &inline_name { if let Some(inline_dataset) = inline_dataset_info { match inline_dataset { VegaFusionDataset::Table { table, .. } => { let table = table.clone().with_ordering()?; - ctx.vegafusion_table(table).await? + session_ctx.vegafusion_table(table).await? } VegaFusionDataset::Plan { plan } => { - DataFrame::new(ctx.state(), plan.clone()).with_index()? + DataFrame::new(session_ctx.state(), plan.clone()).with_index()? } } - } else if let Ok(df) = ctx.table(inline_name).await { + } else if let Ok(df) = session_ctx.table(inline_name).await { df } else { return Err(VegaFusionError::internal(format!( @@ -188,8 +188,8 @@ impl TaskCall for DataUrlTask { } else { // Construct ParsedUrl and dispatch to pipeline.scan_url() let parsed_url = build_parsed_url(&url, format_type.as_deref(), parse.clone())?; - match pipeline.scan_url(&parsed_url).await? { - Some(plan) => DataFrame::new(ctx.state(), plan), + match ctx.pipeline.scan_url(&parsed_url).await? { + Some(plan) => DataFrame::new(session_ctx.state(), plan), None => { return Err(VegaFusionError::internal(format!( "No resolver handled URL: {url}" @@ -222,11 +222,11 @@ impl TaskCall for DataUrlTask { // Return value based on whether inline dataset was used let task_value = if let Some(inline_dataset) = inline_dataset_info { let task_value = result_df.to_task_value(inline_dataset).await?; - maybe_materialize_plan(task_value, &pipeline).await? + maybe_materialize_plan(task_value, &ctx.pipeline).await? 
} else { // URL-sourced data: use Plan when user resolvers exist for lazy evaluation let task_value = TaskValue::Plan(result_df.logical_plan().clone()); - maybe_materialize_plan(task_value, &pipeline).await? + maybe_materialize_plan(task_value, &ctx.pipeline).await? }; Ok((task_value, output_values)) @@ -410,11 +410,9 @@ impl TaskCall for DataValuesTask { async fn eval( &self, values: &[TaskValue], - tz_config: &Option, - _inline_datasets: HashMap, - pipeline: ResolverPipeline, + ctx: &TaskContext, ) -> Result<(TaskValue, Vec)> { - let ctx = Arc::new(pipeline.ctx().clone()); + let session_ctx = Arc::new(ctx.pipeline.ctx().clone()); // Deserialize data into table let values_table = VegaFusionTable::from_ipc_bytes(&self.values)?; if values_table.schema.fields.is_empty() { @@ -448,11 +446,15 @@ impl TaskCall for DataValuesTask { { let transform_pipeline = self.pipeline.as_ref().unwrap(); - let config = - build_compilation_config(&self.input_vars(), values, tz_config, pipeline.clone()); + let config = build_compilation_config( + &self.input_vars(), + values, + &ctx.tz_config, + ctx.pipeline.clone(), + ); // Process datetime columns - let df = ctx.vegafusion_table(values_table).await?; + let df = session_ctx.vegafusion_table(values_table).await?; let sql_df = process_datetimes(&parse, df, &config.tz_config).await?; let (df, output_values) = transform_pipeline.eval_sql(sql_df, &config).await?; @@ -460,8 +462,8 @@ impl TaskCall for DataValuesTask { (table, output_values) } else { // No transforms - let values_df = ctx.vegafusion_table(values_table).await?; - let values_df: DataFrame = process_datetimes(&parse, values_df, tz_config).await?; + let values_df = session_ctx.vegafusion_table(values_table).await?; + let values_df: DataFrame = process_datetimes(&parse, values_df, &ctx.tz_config).await?; ( values_df.drop_index()?.collect_to_table().await?, Vec::new(), @@ -479,13 +481,12 @@ impl TaskCall for DataSourceTask { async fn eval( &self, values: &[TaskValue], - tz_config: 
&Option, - _inline_datasets: HashMap, - pipeline: ResolverPipeline, + ctx: &TaskContext, ) -> Result<(TaskValue, Vec)> { - let ctx = Arc::new(pipeline.ctx().clone()); + let session_ctx = Arc::new(ctx.pipeline.ctx().clone()); let input_vars = self.input_vars(); - let mut config = build_compilation_config(&input_vars, values, tz_config, pipeline.clone()); + let mut config = + build_compilation_config(&input_vars, values, &ctx.tz_config, ctx.pipeline.clone()); // Remove source dataset from config let source_dataset = config.data_scope.remove(&self.source).with_context(|| { @@ -505,7 +506,7 @@ impl TaskCall for DataSourceTask { match source_dataset { VegaFusionDataset::Plan { plan } => { let task_value = - maybe_materialize_plan(TaskValue::Plan(plan), &pipeline).await?; + maybe_materialize_plan(TaskValue::Plan(plan), &ctx.pipeline).await?; return Ok((task_value, Vec::new())); } VegaFusionDataset::Table { table, .. } => { @@ -516,8 +517,10 @@ impl TaskCall for DataSourceTask { } let source_df = match &source_dataset { - VegaFusionDataset::Table { table, .. } => ctx.vegafusion_table(table.clone()).await?, - VegaFusionDataset::Plan { plan } => DataFrame::new(ctx.state(), plan.clone()), + VegaFusionDataset::Table { table, .. } => { + session_ctx.vegafusion_table(table.clone()).await? 
+ } + VegaFusionDataset::Plan { plan } => DataFrame::new(session_ctx.state(), plan.clone()), }; let source_df = source_df.with_index()?; @@ -526,7 +529,7 @@ impl TaskCall for DataSourceTask { let (df, output_values) = transform_pipeline.eval_sql(source_df, &config).await?; let df = df.drop_index()?; let task_value = df.to_task_value(&source_dataset).await?; - let task_value = maybe_materialize_plan(task_value, &pipeline).await?; + let task_value = maybe_materialize_plan(task_value, &ctx.pipeline).await?; Ok((task_value, output_values)) } } diff --git a/vegafusion-runtime/src/expression/compiler/config.rs b/vegafusion-runtime/src/expression/compiler/config.rs index bd758bfcc..9d0641ea9 100644 --- a/vegafusion-runtime/src/expression/compiler/config.rs +++ b/vegafusion-runtime/src/expression/compiler/config.rs @@ -28,7 +28,7 @@ impl Default for CompilationConfig { callable_scope: default_callables(), constants: default_constants(), tz_config: None, - pipeline: ResolverPipeline::new(Vec::new(), ctx, None), + pipeline: ResolverPipeline::new(Vec::new(), ctx), } } } diff --git a/vegafusion-runtime/src/signal/mod.rs b/vegafusion-runtime/src/signal/mod.rs index f22feb510..8a3a37f27 100644 --- a/vegafusion-runtime/src/signal/mod.rs +++ b/vegafusion-runtime/src/signal/mod.rs @@ -1,13 +1,9 @@ -use crate::data::pipeline::ResolverPipeline; use crate::data::tasks::build_compilation_config; use crate::expression::compiler::compile; use crate::expression::compiler::utils::ExprHelpers; -use crate::task_graph::task::TaskCall; +use crate::task_graph::task::{TaskCall, TaskContext}; use async_trait::async_trait; -use std::collections::HashMap; -use vegafusion_core::data::dataset::VegaFusionDataset; -use crate::task_graph::timezone::RuntimeTzConfig; use vegafusion_core::error::Result; use vegafusion_core::proto::gen::tasks::SignalTask; use vegafusion_core::task_graph::task::TaskDependencies; @@ -18,11 +14,14 @@ impl TaskCall for SignalTask { async fn eval( &self, values: &[TaskValue], - 
tz_config: &Option, - _inline_datasets: HashMap, - pipeline: ResolverPipeline, + ctx: &TaskContext, ) -> Result<(TaskValue, Vec)> { - let config = build_compilation_config(&self.input_vars(), values, tz_config, pipeline); + let config = build_compilation_config( + &self.input_vars(), + values, + &ctx.tz_config, + ctx.pipeline.clone(), + ); let expression = self.expr.as_ref().unwrap(); let expr = compile(expression, &config, None).await?; let value = expr.eval_to_scalar()?; diff --git a/vegafusion-runtime/src/task_graph/runtime.rs b/vegafusion-runtime/src/task_graph/runtime.rs index cf6992bb1..ee8ac5425 100644 --- a/vegafusion-runtime/src/task_graph/runtime.rs +++ b/vegafusion-runtime/src/task_graph/runtime.rs @@ -2,7 +2,7 @@ use crate::data::pipeline::ResolverPipeline; use crate::data::plan_resolver::PlanResolver; use crate::datafusion::context::make_datafusion_context; use crate::task_graph::cache::VegaFusionCache; -use crate::task_graph::task::TaskCall; +use crate::task_graph::task::{TaskCall, TaskContext}; use crate::task_graph::timezone::RuntimeTzConfig; use async_recursion::async_recursion; use cfg_if::cfg_if; @@ -39,6 +39,7 @@ use crate::data::pipeline::{resolve_data_base_url, DataBaseUrlSetting}; pub struct VegaFusionRuntime { pub cache: VegaFusionCache, pub pipeline: ResolverPipeline, + pub data_base_url: Option, } impl VegaFusionRuntime { @@ -47,7 +48,8 @@ impl VegaFusionRuntime { let data_base_url = resolve_data_base_url(&DataBaseUrlSetting::Default).unwrap_or_default(); Self { cache: cache.unwrap_or_else(|| VegaFusionCache::new(Some(32), None)), - pipeline: ResolverPipeline::new(plan_resolvers, ctx, data_base_url), + pipeline: ResolverPipeline::new(plan_resolvers, ctx), + data_base_url, } } @@ -60,7 +62,8 @@ impl VegaFusionRuntime { let data_base_url = resolve_data_base_url(&data_base_url_setting)?; Ok(Self { cache: cache.unwrap_or_else(|| VegaFusionCache::new(Some(32), None)), - pipeline: ResolverPipeline::new(plan_resolvers, ctx, data_base_url), + 
pipeline: ResolverPipeline::new(plan_resolvers, ctx), + data_base_url, }) } @@ -72,13 +75,17 @@ impl VegaFusionRuntime { ) -> Result { // We shouldn't panic inside get_or_compute_node_value, but since this may be used // in a server context, wrap in catch_unwind just in case. - let pipeline = self.pipeline.clone(); + let task_ctx = TaskContext { + tz_config: None, // overridden per-task from task.tz_config + inline_datasets, + pipeline: self.pipeline.clone(), + data_base_url: self.data_base_url.clone(), + }; let node_value = AssertUnwindSafe(get_or_compute_node_value( task_graph, node_value_index.node_index as usize, self.cache.clone(), - inline_datasets, - pipeline, + task_ctx, )) .catch_unwind() .await; @@ -179,8 +186,7 @@ async fn get_or_compute_node_value( task_graph: Arc, node_index: usize, cache: VegaFusionCache, - inline_datasets: HashMap, - pipeline: ResolverPipeline, + task_ctx: TaskContext, ) -> Result { // Get the cache key for requested node let node = task_graph.node(node_index).unwrap(); @@ -211,8 +217,7 @@ async fn get_or_compute_node_value( task_graph.clone(), input_node_index, cloned_cache.clone(), - inline_datasets.clone(), - pipeline.clone(), + task_ctx.clone(), ); cfg_if! 
{ @@ -260,8 +265,12 @@ async fn get_or_compute_node_value( }) .collect::>>()?; - task.eval(&input_values, &tz_config, inline_datasets, pipeline) - .await + // Override tz_config from task + let task_ctx = TaskContext { + tz_config, + ..task_ctx + }; + task.eval(&input_values, &task_ctx).await }; // get or construct from cache diff --git a/vegafusion-runtime/src/task_graph/task.rs b/vegafusion-runtime/src/task_graph/task.rs index 3d7bc9ed3..aaa367166 100644 --- a/vegafusion-runtime/src/task_graph/task.rs +++ b/vegafusion-runtime/src/task_graph/task.rs @@ -9,14 +9,21 @@ use vegafusion_core::proto::gen::tasks::task::TaskKind; use vegafusion_core::proto::gen::tasks::Task; use vegafusion_core::task_graph::task_value::TaskValue; +/// Ambient context available to all tasks during evaluation. +#[derive(Clone)] +pub struct TaskContext { + pub tz_config: Option, + pub inline_datasets: HashMap, + pub pipeline: ResolverPipeline, + pub data_base_url: Option, +} + #[async_trait] pub trait TaskCall { async fn eval( &self, values: &[TaskValue], - tz_config: &Option, - inline_datasets: HashMap, - pipeline: ResolverPipeline, + ctx: &TaskContext, ) -> Result<(TaskValue, Vec)>; } @@ -25,28 +32,14 @@ impl TaskCall for Task { async fn eval( &self, values: &[TaskValue], - tz_config: &Option, - inline_datasets: HashMap, - pipeline: ResolverPipeline, + ctx: &TaskContext, ) -> Result<(TaskValue, Vec)> { match self.task_kind() { TaskKind::Value(value) => Ok((value.try_into()?, Default::default())), - TaskKind::DataUrl(task) => { - task.eval(values, tz_config, inline_datasets, pipeline) - .await - } - TaskKind::DataValues(task) => { - task.eval(values, tz_config, inline_datasets, pipeline) - .await - } - TaskKind::DataSource(task) => { - task.eval(values, tz_config, inline_datasets, pipeline) - .await - } - TaskKind::Signal(task) => { - task.eval(values, tz_config, inline_datasets, pipeline) - .await - } + TaskKind::DataUrl(task) => task.eval(values, ctx).await, + TaskKind::DataValues(task) 
=> task.eval(values, ctx).await, + TaskKind::DataSource(task) => task.eval(values, ctx).await, + TaskKind::Signal(task) => task.eval(values, ctx).await, } } } diff --git a/vegafusion-runtime/tests/test_plan_resolver.rs b/vegafusion-runtime/tests/test_plan_resolver.rs index 0f2d68a16..bf8e454e7 100644 --- a/vegafusion-runtime/tests/test_plan_resolver.rs +++ b/vegafusion-runtime/tests/test_plan_resolver.rs @@ -559,7 +559,7 @@ async fn test_execute_plan_pipeline_chains_resolvers_in_order() { let plan = build_external_scan_plan("movies_chain"); let ctx = datafusion::prelude::SessionContext::new(); - let pipeline = ResolverPipeline::new(resolvers, Arc::new(ctx), None); + let pipeline = ResolverPipeline::new(resolvers, Arc::new(ctx)); let table = pipeline.resolve(plan).await.unwrap(); assert_eq!(table_row_count(&table), 10); assert_eq!(rewrite_resolver.get_call_count(), 1); @@ -597,7 +597,7 @@ async fn test_execute_plan_pipeline_short_circuits_after_table_result() { let plan = build_external_scan_plan("movies_short_circuit"); let ctx = datafusion::prelude::SessionContext::new(); - let pipeline = ResolverPipeline::new(resolvers, Arc::new(ctx), None); + let pipeline = ResolverPipeline::new(resolvers, Arc::new(ctx)); let table = pipeline.resolve(plan).await.unwrap(); assert_eq!(table_row_count(&table), 10); assert_eq!(table_resolver.get_call_count(), 1); @@ -621,7 +621,7 @@ async fn test_execute_plan_pipeline_fails_if_external_not_resolved() { let plan = build_external_scan_plan("movies_unresolved"); let ctx = datafusion::prelude::SessionContext::new(); - let pipeline = ResolverPipeline::new(resolvers, Arc::new(ctx), None); + let pipeline = ResolverPipeline::new(resolvers, Arc::new(ctx)); let err = pipeline.resolve(plan).await.unwrap_err(); let msg = err.to_string(); assert!( @@ -847,7 +847,7 @@ async fn test_scan_url_custom_scheme_first_wins() { }; let ctx = Arc::new(datafusion::prelude::SessionContext::new()); - let pipeline = 
ResolverPipeline::new(vec![Arc::new(scanner)], ctx, None); + let pipeline = ResolverPipeline::new(vec![Arc::new(scanner)], ctx); let parsed = ParsedUrl { url: "custom://mydb/table1".to_string(), @@ -871,7 +871,7 @@ async fn test_scan_url_custom_scheme_first_wins() { async fn test_scan_url_unknown_scheme_falls_through() { let ctx = Arc::new(datafusion::prelude::SessionContext::new()); // Pipeline with only DataFusionResolver (no user resolvers) - let pipeline = ResolverPipeline::new(vec![], ctx, None); + let pipeline = ResolverPipeline::new(vec![], ctx); let parsed = ParsedUrl { url: "spark://cluster/table1".to_string(), @@ -915,7 +915,7 @@ async fn test_should_materialize() { .unwrap(); // DataFusion-only: all support arrow → always materialize - let pipeline = ResolverPipeline::new(vec![], ctx.clone(), None); + let pipeline = ResolverPipeline::new(vec![], ctx.clone()); assert!(pipeline.should_materialize(&plain_plan)); assert!(pipeline.should_materialize(&external_plan)); @@ -923,7 +923,7 @@ async fn test_should_materialize() { let scanner = CustomSchemeScanner { schema: schema.clone(), }; - let pipeline = ResolverPipeline::new(vec![Arc::new(scanner)], ctx, None); + let pipeline = ResolverPipeline::new(vec![Arc::new(scanner)], ctx); assert!(pipeline.should_materialize(&plain_plan)); assert!(!pipeline.should_materialize(&external_plan)); } @@ -1346,7 +1346,7 @@ async fn test_execute_plan_pipeline_propagates_resolver_errors() { let ctx = datafusion::prelude::SessionContext::new(); let plan = build_external_scan_plan("movies"); - let pipeline = ResolverPipeline::new(resolvers, Arc::new(ctx), None); + let pipeline = ResolverPipeline::new(resolvers, Arc::new(ctx)); let err = pipeline.resolve(plan).await.unwrap_err(); let msg = err.to_string(); assert!( @@ -1401,7 +1401,7 @@ async fn test_resolver_pipeline_should_materialize() { .unwrap(); // DataFusion-only: always materialize - let empty_pipeline = ResolverPipeline::new(vec![], ctx.clone(), None); + let empty_pipeline 
= ResolverPipeline::new(vec![], ctx.clone()); assert!( empty_pipeline.should_materialize(&plain_plan), "DataFusion-only pipeline should materialize plain plans" @@ -1415,7 +1415,7 @@ async fn test_resolver_pipeline_should_materialize() { let events = Arc::new(Mutex::new(Vec::new())); let resolver = ScriptedResolver::new("test", ResolverBehavior::PassThroughPlan, events); let resolvers: Vec> = vec![Arc::new(resolver)]; - let pipeline_with_resolvers = ResolverPipeline::new(resolvers, ctx, None); + let pipeline_with_resolvers = ResolverPipeline::new(resolvers, ctx); assert!( pipeline_with_resolvers.should_materialize(&plain_plan), "Non-arrow pipeline should still materialize plans with no external tables" From 6ff1cdc1780589a116d7f38778d8efe46824e30b Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Wed, 18 Mar 2026 10:38:01 -0400 Subject: [PATCH 27/36] feat: add filter predicates to resolve_table and unparse_expr_to_sql MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add filters parameter to PlanResolver::resolve_table (Rust and Python) so resolvers can optimize data loading with pushed-down predicates. Filters are hints — DataFusion re-applies them regardless. ExternalTableProvider now reports Inexact for supports_filters_pushdown so DataFusion pushes filter expressions into TableScan nodes. Add unparse_expr_to_sql function (Rust pyfunction + Python wrapper) that converts LogicalExprNode proto messages to SQL strings. Accepts a single expression or a list (joined with AND). Supports all existing SQL dialects (default, postgres, mysql, sqlite, duckdb, bigquery). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../python-examples/plan_resolver_basic.py | 1 + .../plan_resolver_url_scanning.py | 1 + vegafusion-python/src/lib.rs | 1 + vegafusion-python/src/unparse.rs | 106 ++++++++++++------ vegafusion-python/tests/test_plan_resolver.py | 99 ++++++++++++++++ vegafusion-python/vegafusion/plan_resolver.py | 49 +++++++- vegafusion-runtime/src/data/external_table.rs | 12 +- vegafusion-runtime/src/data/plan_resolver.rs | 13 ++- 8 files changed, 245 insertions(+), 37 deletions(-) diff --git a/examples/python-examples/plan_resolver_basic.py b/examples/python-examples/plan_resolver_basic.py index c85b8ce56..8a419b204 100644 --- a/examples/python-examples/plan_resolver_basic.py +++ b/examples/python-examples/plan_resolver_basic.py @@ -51,6 +51,7 @@ def resolve_table( schema: Any, metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, + filters: list[Any] | None = None, ) -> pa.Table: return self._table diff --git a/examples/python-examples/plan_resolver_url_scanning.py b/examples/python-examples/plan_resolver_url_scanning.py index b2b047ac0..0c9f27fa0 100644 --- a/examples/python-examples/plan_resolver_url_scanning.py +++ b/examples/python-examples/plan_resolver_url_scanning.py @@ -59,6 +59,7 @@ def resolve_table( schema: Any, metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, + filters: list[Any] | None = None, ) -> pa.Table: return pa.table( { diff --git a/vegafusion-python/src/lib.rs b/vegafusion-python/src/lib.rs index 15b5764a5..63cc96f17 100644 --- a/vegafusion-python/src/lib.rs +++ b/vegafusion-python/src/lib.rs @@ -699,6 +699,7 @@ fn _vegafusion(_py: Python, m: &Bound) -> PyResult<()> { m.add_function(wrap_pyfunction!(inline_table_scan_node, m)?)?; m.add_function(wrap_pyfunction!(external_table_scan_node, m)?)?; m.add_function(wrap_pyfunction!(unparse::unparse_plan_to_sql, m)?)?; + m.add_function(wrap_pyfunction!(unparse::unparse_expr_to_sql, m)?)?; 
m.add("__version__", env!("CARGO_PKG_VERSION"))?; Ok(()) } diff --git a/vegafusion-python/src/unparse.rs b/vegafusion-python/src/unparse.rs index 149b291d7..bb7bf255e 100644 --- a/vegafusion-python/src/unparse.rs +++ b/vegafusion-python/src/unparse.rs @@ -2,12 +2,31 @@ use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use datafusion::prelude::SessionContext; +use datafusion_proto::generated::datafusion::LogicalExprNode; +use datafusion_proto::logical_plan::from_proto::parse_expr; use datafusion_sql::unparser::dialect::{ - BigQueryDialect, DefaultDialect, DuckDBDialect, MySqlDialect, PostgreSqlDialect, SqliteDialect, + BigQueryDialect, DefaultDialect, Dialect, DuckDBDialect, MySqlDialect, PostgreSqlDialect, + SqliteDialect, }; use datafusion_sql::unparser::Unparser; +use prost::Message; +use vegafusion_common::datafusion_expr::Expr; use vegafusion_runtime::data::codec::VegaFusionCodec; +fn make_dialect(dialect: &str) -> PyResult> { + match dialect { + "default" => Ok(Box::new(DefaultDialect {})), + "postgres" | "postgresql" => Ok(Box::new(PostgreSqlDialect {})), + "mysql" => Ok(Box::new(MySqlDialect {})), + "sqlite" => Ok(Box::new(SqliteDialect {})), + "duckdb" => Ok(Box::new(DuckDBDialect::new())), + "bigquery" => Ok(Box::new(BigQueryDialect {})), + _ => Err(PyValueError::new_err(format!( + "Unknown dialect '{dialect}'. Supported: default, postgres, mysql, sqlite, duckdb, bigquery" + ))), + } +} + /// Convert a protobuf-serialized LogicalPlan to a SQL string. 
/// /// Args: @@ -29,39 +48,58 @@ pub fn unparse_plan_to_sql(plan_bytes: Vec, dialect: &str) -> PyResult { - let d = DefaultDialect {}; - Unparser::new(&d).plan_to_sql(&plan) - } - "postgres" | "postgresql" => { - let d = PostgreSqlDialect {}; - Unparser::new(&d).plan_to_sql(&plan) - } - "mysql" => { - let d = MySqlDialect {}; - Unparser::new(&d).plan_to_sql(&plan) - } - "sqlite" => { - let d = SqliteDialect {}; - Unparser::new(&d).plan_to_sql(&plan) - } - "duckdb" => { - let d = DuckDBDialect::new(); - Unparser::new(&d).plan_to_sql(&plan) - } - "bigquery" => { - let d = BigQueryDialect {}; - Unparser::new(&d).plan_to_sql(&plan) - } - _ => { - return Err(PyValueError::new_err(format!( - "Unknown dialect '{}'. Supported: default, postgres, mysql, sqlite, duckdb, bigquery", - dialect - ))); - } - } - .map_err(|e| PyValueError::new_err(format!("Failed to unparse plan to SQL: {e}")))?; + let d = make_dialect(dialect)?; + let sql = Unparser::new(d.as_ref()) + .plan_to_sql(&plan) + .map_err(|e| PyValueError::new_err(format!("Failed to unparse plan to SQL: {e}")))?; Ok(sql.to_string()) } + +/// Convert protobuf-serialized filter expressions to a SQL WHERE clause string. +/// +/// Accepts a single expression or a list of expressions (joined with AND). +/// +/// Args: +/// expr_bytes: A single serialized LogicalExprNode (bytes) or a list of them. +/// dialect: SQL dialect name. One of "default", "postgres", "mysql", +/// "sqlite", "duckdb", "bigquery". +/// +/// Returns: +/// The SQL string representation of the expression(s). 
+#[pyfunction] +#[pyo3(signature = (expr_bytes, dialect="default"))] +pub fn unparse_expr_to_sql(expr_bytes: Vec>, dialect: &str) -> PyResult { + if expr_bytes.is_empty() { + return Err(PyValueError::new_err( + "expr_bytes must contain at least one expression", + )); + } + + let ctx = SessionContext::new(); + let codec = VegaFusionCodec::new(); + + let exprs: Vec = expr_bytes + .iter() + .map(|bytes| { + let proto = LogicalExprNode::decode(bytes.as_slice()).map_err(|e| { + PyValueError::new_err(format!("Failed to decode LogicalExprNode: {e}")) + })?; + parse_expr(&proto, &ctx, &codec) + .map_err(|e| PyValueError::new_err(format!("Failed to parse expression: {e}"))) + }) + .collect::>>()?; + + // Join multiple expressions with AND + let combined = exprs + .into_iter() + .reduce(|a, b| a.and(b)) + .expect("non-empty after validation"); + + let d = make_dialect(dialect)?; + let sql_expr = Unparser::new(d.as_ref()) + .expr_to_sql(&combined) + .map_err(|e| PyValueError::new_err(format!("Failed to unparse expression to SQL: {e}")))?; + + Ok(sql_expr.to_string()) +} diff --git a/vegafusion-python/tests/test_plan_resolver.py b/vegafusion-python/tests/test_plan_resolver.py index 5716efe2b..e42ec9251 100644 --- a/vegafusion-python/tests/test_plan_resolver.py +++ b/vegafusion-python/tests/test_plan_resolver.py @@ -317,6 +317,7 @@ def resolve_table( schema: Any, metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, + filters: list[Any] | None = None, ) -> pa.Table: self.resolve_calls.append( { @@ -437,6 +438,7 @@ def resolve_table( schema: Any, metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, + filters: list[Any] | None = None, ) -> pa.Table: self.resolved_names.append(name) if name == "source_a": @@ -500,6 +502,7 @@ def resolve_table( schema: Any, metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, + filters: list[Any] | None = None, ) -> pa.Table: raise ValueError("Simulated 
resolver failure") @@ -723,6 +726,7 @@ def resolve_table( schema: Any, metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, + filters: list[Any] | None = None, ) -> pa.Table: return pa.table({"x": [1, 2], "y": ["a", "b"]}) @@ -813,6 +817,7 @@ def resolve_table( schema: Any, metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, + filters: list[Any] | None = None, ) -> pa.Table: return pa.table({"val": [42, 99]}) @@ -850,6 +855,7 @@ def resolve_table( schema: Any, metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, + filters: list[Any] | None = None, ) -> pa.Table: return pa.table({"x": [1, 2, 3]}) @@ -867,3 +873,96 @@ def resolve_table( ) assert len(datasets) == 1 + + +def test_resolve_table_accepts_filters_param() -> None: + """resolve_table with filters kwarg doesn't crash (filters may be empty).""" + from vegafusion.plan_resolver import external_table_scan_node + + class FilterAcceptingResolver(PlanResolver): + def scan_url(self, parsed_url: dict[str, Any]) -> Any: + if parsed_url["scheme"] == "myproto": + schema = pa.schema([("val", pa.int64())]) + return external_table_scan_node( + table_name="data", + schema=schema, + scheme="myproto", + ) + return None + + def resolve_table( + self, + name: str, + scheme: str, + schema: Any, + metadata: dict[str, Any] | None = None, + projected_columns: list[str] | None = None, + filters: list[Any] | None = None, + ) -> pa.Table: + # filters may be None or empty — just verify it's accepted + return pa.table({"val": [42, 99]}) + + resolver = FilterAcceptingResolver() + rt = vf.VegaFusionRuntime(plan_resolver=resolver) + + spec = { + "$schema": "https://vega.github.io/schema/vega/v5.json", + "data": [{"name": "source", "url": "myproto://db/table"}], + } + + datasets, _warnings = rt.pre_transform_datasets( + spec, + datasets=["source"], + dataset_format="pyarrow", + ) + assert len(datasets) == 1 + assert 
datasets[0].column("val").to_pylist() == [42, 99] + + +def test_unparse_expr_to_sql() -> None: + """unparse_expr_to_sql converts proto expressions to SQL strings.""" + from vegafusion.plan_resolver import unparse_expr_to_sql + from vegafusion.proto.datafusion_pb2 import ( + BinaryExprNode, + LogicalExprNode, + ) + from vegafusion.proto.datafusion.proto_common.proto.datafusion_common_pb2 import ( + Column as ColumnProto, + ScalarValue, + ) + + # Build proto for: x > 3 + col_x = LogicalExprNode(column=ColumnProto(name="x")) + lit_3 = LogicalExprNode( + literal=ScalarValue(int64_value=3), + ) + gt_expr = LogicalExprNode( + binary_expr=BinaryExprNode( + operands=[col_x, lit_3], + op="Gt", + ) + ) + + # Build proto for: y = 'hello' + col_y = LogicalExprNode(column=ColumnProto(name="y")) + lit_hello = LogicalExprNode( + literal=ScalarValue(utf8_value="hello"), + ) + eq_expr = LogicalExprNode( + binary_expr=BinaryExprNode( + operands=[col_y, lit_hello], + op="Eq", + ) + ) + + # Single expression + sql_single = unparse_expr_to_sql(gt_expr) + assert sql_single == snapshot("(x > 3)") + + # Multiple expressions joined with AND + sql_multi = unparse_expr_to_sql([gt_expr, eq_expr]) + assert sql_multi == snapshot("((x > 3) AND (y = 'hello'))") + + # With postgres dialect + sql_pg = unparse_expr_to_sql([gt_expr, eq_expr], dialect="postgres") + assert sql_pg == snapshot('(("x" > 3) AND ("y" = \'hello\'))') diff --git a/vegafusion-python/vegafusion/plan_resolver.py b/vegafusion-python/vegafusion/plan_resolver.py index 70614fdd1..91a925cce 100644 --- a/vegafusion-python/vegafusion/plan_resolver.py +++ b/vegafusion-python/vegafusion/plan_resolver.py @@ -12,13 +12,14 @@ _PROTOBUF_INSTALL_HINT = ( "The 'protobuf' package is required for plan-level resolvers " "(resolve_plan / resolve_plan_proto) and related utilities " - "(inline_table_scan_node, unparse_to_sql). " + "(inline_table_scan_node, unparse_to_sql, unparse_expr_to_sql). 
" "Install it with: pip install vegafusion[plan-resolver]" ) if TYPE_CHECKING: from vegafusion.dataset import ExternalDataset from vegafusion.proto.datafusion_pb2 import ( + LogicalExprNode, # type: ignore[attr-defined] LogicalPlanNode, # type: ignore[attr-defined] ) @@ -132,6 +133,7 @@ def resolve_table( schema: Schema, metadata: dict[str, Any] | None = None, projected_columns: list[str] | None = None, + filters: list[Any] | None = None, ) -> Table: """Provide data for an external table reference. @@ -145,6 +147,12 @@ def resolve_table( metadata: JSON metadata dict from ExternalTableProvider. projected_columns: Column names DataFusion actually needs. None if no projection (all columns needed). + filters: Pushed-down filter predicates from DataFusion as + ``LogicalExprNode`` protobuf messages, already split into + a conjunction (individual expressions from AND). These are + hints — resolvers may apply some, all, or none. DataFusion + re-applies all filters on the output regardless. Use + :func:`unparse_expr_to_sql` to convert to SQL strings. Returns: An Arrow-compatible table (arro3, PyArrow, etc.). @@ -256,12 +264,15 @@ def _resolve_external_tables( table_name, ) + filters = list(inner.filters) if inner.filters else None + table_data = self.resolve_table( name=table_name, scheme=dataset.scheme, schema=dataset.schema, metadata=metadata, projected_columns=projected_columns, + filters=filters, ) replacement = inline_table_scan_node( @@ -441,3 +452,39 @@ def unparse_to_sql( if not isinstance(plan, bytes): plan = plan.SerializeToString() return str(_native(plan, dialect)) + + +def unparse_expr_to_sql( + exprs: LogicalExprNode | bytes | list[LogicalExprNode | bytes], + dialect: str = "default", +) -> str: + """Convert filter expression(s) to a SQL string. + + Accepts a single ``LogicalExprNode`` protobuf message or a list of them. + Multiple expressions are joined with ``AND``. 
+ + This is useful for converting the ``filters`` parameter of + :meth:`PlanResolver.resolve_table` into a SQL WHERE clause that can + be passed to external data sources. + + Args: + exprs: A single ``LogicalExprNode`` or a list of them. + dialect: SQL dialect. One of ``"default"``, ``"postgres"``, + ``"mysql"``, ``"sqlite"``, ``"duckdb"``, ``"bigquery"``. + + Returns: + The SQL string representation of the expression(s). + """ + from vegafusion._vegafusion import unparse_expr_to_sql as _native + + if not isinstance(exprs, list): + exprs = [exprs] + + expr_bytes = [] + for expr in exprs: + if isinstance(expr, bytes): + expr_bytes.append(expr) + else: + expr_bytes.append(expr.SerializeToString()) + + return str(_native(expr_bytes, dialect)) diff --git a/vegafusion-runtime/src/data/external_table.rs b/vegafusion-runtime/src/data/external_table.rs index 23a63e05d..843d4f74a 100644 --- a/vegafusion-runtime/src/data/external_table.rs +++ b/vegafusion-runtime/src/data/external_table.rs @@ -8,7 +8,7 @@ use datafusion::catalog::TableProvider; use datafusion::datasource::TableType; use datafusion::physical_plan::ExecutionPlan; use datafusion_common::{plan_err, Result}; -use datafusion_expr::Expr; +use datafusion_expr::{Expr, TableProviderFilterPushDown}; use serde_json::Value; use vegafusion_common::arrow::datatypes::SchemaRef; @@ -81,6 +81,16 @@ impl TableProvider for ExternalTableProvider { TableType::Base } + fn supports_filters_pushdown( + &self, + filters: &[&Expr], + ) -> Result> { + // Report Inexact so DataFusion pushes filters into the TableScan + // (where resolve_table can access them) while still re-applying + // them on the output for correctness. 
+ Ok(vec![TableProviderFilterPushDown::Inexact; filters.len()]) + } + async fn scan( &self, _state: &dyn Session, diff --git a/vegafusion-runtime/src/data/plan_resolver.rs b/vegafusion-runtime/src/data/plan_resolver.rs index 318b1a669..13f6f2015 100644 --- a/vegafusion-runtime/src/data/plan_resolver.rs +++ b/vegafusion-runtime/src/data/plan_resolver.rs @@ -5,7 +5,7 @@ use async_trait::async_trait; use datafusion::catalog::TableProvider; use datafusion::datasource::{provider_as_source, source_as_provider, MemTable}; use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter}; -use datafusion_expr::LogicalPlan as DFLogicalPlan; +use datafusion_expr::{Expr, LogicalPlan as DFLogicalPlan}; use vegafusion_common::arrow::datatypes::SchemaRef; use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; @@ -67,6 +67,9 @@ pub trait PlanResolver: Send + Sync + 'static { /// * `metadata` - JSON metadata from ExternalTableProvider /// * `projected_columns` - column names DataFusion actually needs, /// or `None` if all columns are needed + /// * `filters` - pushed-down filter predicates from DataFusion, already + /// split into a conjunction. These are hints — resolvers may apply + /// some, all, or none. DataFusion re-applies all filters regardless. 
async fn resolve_table( &self, _name: &str, @@ -74,6 +77,7 @@ pub trait PlanResolver: Send + Sync + 'static { _schema: SchemaRef, _metadata: &serde_json::Value, _projected_columns: Option>, + _filters: &[Expr], ) -> Result { Err(VegaFusionError::internal( "resolve_table not implemented — override resolve_table or resolve_plan", @@ -106,6 +110,7 @@ pub trait PlanResolver: Send + Sync + 'static { info.schema.clone(), &info.metadata, info.projected_columns.clone(), + &info.filters, ) .await?; let mem_table = @@ -132,9 +137,14 @@ struct ExternalTableInfo { schema: SchemaRef, metadata: serde_json::Value, projected_columns: Option>, + filters: Vec, } /// Walk a LogicalPlan and collect ExternalTableProvider info for each table scan. +/// +/// Filters come from `scan.filters` on the `TableScan`, which are populated +/// when DataFusion's optimizer pushes filter predicates down to the scan. +/// `ExternalTableProvider` reports `Inexact` for all filters to enable this. fn extract_external_tables(plan: &LogicalPlan) -> HashMap { let mut tables = HashMap::new(); let _ = plan.apply(|node| { @@ -155,6 +165,7 @@ fn extract_external_tables(plan: &LogicalPlan) -> HashMap Date: Wed, 18 Mar 2026 11:13:46 -0400 Subject: [PATCH 28/36] refactor: address PR #587 review comments - Move ResolutionResult from vegafusion-core to vegafusion-runtime (lives with PlanResolver where it's used) - Remove reserved 6 from DataUrlTask proto (never merged with that field) - Update stale docs referencing removed capabilities concept - Remove source field from ExternalTableProvider (unused, metadata covers this use case) - Introduce VegaFusionRuntimeOpts with Default, replacing two constructors with a single new(opts) method - Remove section header comments from Python test file Co-Authored-By: Claude Opus 4.6 (1M context) --- .../rust-examples/examples/custom_resolver.rs | 11 +++-- vegafusion-core/src/data/url.rs | 10 ---- vegafusion-core/src/proto/tasks.proto | 1 - vegafusion-core/src/runtime/mod.rs | 
1 - vegafusion-python/src/lib.rs | 24 +++++----- vegafusion-python/src/plan_resolver.rs | 8 ++-- vegafusion-python/tests/test_plan_resolver.py | 3 -- vegafusion-python/vegafusion/dataset.py | 2 - vegafusion-python/vegafusion/plan_resolver.py | 3 -- vegafusion-runtime/src/data/codec.rs | 15 ++---- .../src/data/datafusion_resolver.rs | 4 +- vegafusion-runtime/src/data/external_table.rs | 12 ----- vegafusion-runtime/src/data/pipeline.rs | 4 +- vegafusion-runtime/src/data/plan_resolver.rs | 15 ++++-- vegafusion-runtime/src/task_graph/runtime.rs | 42 ++++++++-------- .../tests/test_plan_resolver.rs | 48 +++++++++++++++---- vegafusion-server/src/main.rs | 13 +++-- 17 files changed, 114 insertions(+), 102 deletions(-) diff --git a/examples/rust-examples/examples/custom_resolver.rs b/examples/rust-examples/examples/custom_resolver.rs index 2c9576d30..96ae7df70 100644 --- a/examples/rust-examples/examples/custom_resolver.rs +++ b/examples/rust-examples/examples/custom_resolver.rs @@ -1,10 +1,11 @@ use std::sync::Arc; use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::Result; -use vegafusion_core::runtime::{ResolutionResult, VegaFusionRuntimeTrait}; +use vegafusion_core::runtime::VegaFusionRuntimeTrait; use vegafusion_core::spec::chart::ChartSpec; use vegafusion_runtime::data::plan_resolver::PlanResolver; -use vegafusion_runtime::task_graph::runtime::VegaFusionRuntime; +use vegafusion_runtime::data::plan_resolver::ResolutionResult; +use vegafusion_runtime::task_graph::runtime::{VegaFusionRuntime, VegaFusionRuntimeOpts}; /// A custom resolver that logs plan resolution and passes through to DataFusion #[derive(Clone)] @@ -35,7 +36,11 @@ async fn main() { let custom_resolver = Arc::new(LoggingResolver) as Arc; // Create runtime with custom resolver - let runtime = VegaFusionRuntime::new(None, vec![custom_resolver]); + let runtime = VegaFusionRuntime::new(VegaFusionRuntimeOpts { + plan_resolvers: vec![custom_resolver], + ..Default::default() + }) 
+ .unwrap(); println!("Starting pre-transform with custom resolver\n"); diff --git a/vegafusion-core/src/data/url.rs b/vegafusion-core/src/data/url.rs index b58837b88..121ca33d3 100644 --- a/vegafusion-core/src/data/url.rs +++ b/vegafusion-core/src/data/url.rs @@ -1,17 +1,7 @@ use regex::Regex; use std::sync::LazyLock; -use vegafusion_common::data::table::VegaFusionTable; -use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::Result; -pub enum ResolutionResult { - /// Resolver fully materialized the plan - Table(VegaFusionTable), - /// Resolver produced a rewritten plan for the next resolver to handle, - /// or for DataFusion to execute if this is the last resolver - Plan(LogicalPlan), -} - /// Parsed URL representation passed to resolvers during the scan phase. /// All fields are populated from the fully-resolved URL (after base URL /// resolution and hash-stripping). Resolvers pattern-match on these fields diff --git a/vegafusion-core/src/proto/tasks.proto b/vegafusion-core/src/proto/tasks.proto index 0761f24db..045cf564a 100644 --- a/vegafusion-core/src/proto/tasks.proto +++ b/vegafusion-core/src/proto/tasks.proto @@ -77,7 +77,6 @@ message DataUrlTask { int32 batch_size = 3; ScanUrlFormat format_type = 4; transforms.TransformPipeline pipeline = 5; - reserved 6; } // ## Inline values task diff --git a/vegafusion-core/src/runtime/mod.rs b/vegafusion-core/src/runtime/mod.rs index 00f406ab2..d827cfdc8 100644 --- a/vegafusion-core/src/runtime/mod.rs +++ b/vegafusion-core/src/runtime/mod.rs @@ -2,6 +2,5 @@ mod runtime; pub use crate::data::url::{ has_url_scheme, is_absolute_path, normalize_base_url, path_to_file_url, resolve_url, ParsedUrl, - ResolutionResult, }; pub use runtime::{PreTransformExtractTable, VegaFusionRuntimeTrait}; diff --git a/vegafusion-python/src/lib.rs b/vegafusion-python/src/lib.rs index 63cc96f17..882cac4be 100644 --- a/vegafusion-python/src/lib.rs +++ b/vegafusion-python/src/lib.rs @@ -20,7 +20,7 @@ use 
vegafusion_core::proto::gen::pretransform::{ use vegafusion_core::proto::gen::tasks::{TzConfig, Variable}; use vegafusion_runtime::task_graph::GrpcVegaFusionRuntime; -use vegafusion_runtime::task_graph::runtime::VegaFusionRuntime; +use vegafusion_runtime::task_graph::runtime::{VegaFusionRuntime, VegaFusionRuntimeOpts}; use env_logger::{Builder, Target}; use serde_json::json; @@ -92,11 +92,11 @@ impl PyVegaFusionRuntime { }; Ok(Self { - runtime: Arc::new(VegaFusionRuntime::new_with_data_base_url( - Some(VegaFusionCache::new(max_capacity, memory_limit)), - resolvers, - data_base_url_setting, - )?), + runtime: Arc::new(VegaFusionRuntime::new(VegaFusionRuntimeOpts { + cache: Some(VegaFusionCache::new(max_capacity, memory_limit)), + plan_resolvers: resolvers, + data_base_url: data_base_url_setting, + })?), tokio_runtime: Arc::new(tokio_runtime_connection), }) } @@ -633,18 +633,16 @@ pub fn inline_table_scan_node(name: String, schema: pyo3_arrow::PySchema) -> PyR /// scheme: Scheme identifier (e.g. "spark"). /// schema: Arrow schema (arro3.core.Schema) — required for logical planning. /// metadata: Optional JSON-serializable dict of metadata. -/// source: Optional source identifier. /// /// Returns: /// bytes: Serialized LogicalPlanNode protobuf. 
#[pyfunction] -#[pyo3(signature = (table_name, scheme, schema, metadata=None, source=None))] +#[pyo3(signature = (table_name, scheme, schema, metadata=None))] pub fn external_table_scan_node( table_name: String, scheme: String, schema: pyo3_arrow::PySchema, metadata: Option<&Bound<'_, pyo3::types::PyAny>>, - source: Option, ) -> PyResult> { use datafusion::datasource::provider_as_source; use datafusion_proto::bytes::logical_plan_to_bytes_with_extension_codec; @@ -661,9 +659,11 @@ pub fn external_table_scan_node( None => serde_json::Value::Object(serde_json::Map::new()), }; - let provider = Arc::new( - ExternalTableProvider::new(scheme, arrow_schema, metadata_value).with_source(source), - ); + let provider = Arc::new(ExternalTableProvider::new( + scheme, + arrow_schema, + metadata_value, + )); let table_source = provider_as_source(provider); let plan = LogicalPlanBuilder::scan(&table_name, table_source, None) diff --git a/vegafusion-python/src/plan_resolver.rs b/vegafusion-python/src/plan_resolver.rs index 3d7cb9fa4..fab976ef5 100644 --- a/vegafusion-python/src/plan_resolver.rs +++ b/vegafusion-python/src/plan_resolver.rs @@ -15,10 +15,11 @@ use vegafusion_common::arrow::record_batch::RecordBatch; use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::{Result, VegaFusionError}; -use vegafusion_core::runtime::{ParsedUrl, ResolutionResult}; +use vegafusion_core::runtime::ParsedUrl; use vegafusion_runtime::data::codec::VegaFusionCodec; use vegafusion_runtime::data::external_table::ExternalTableProvider; use vegafusion_runtime::data::plan_resolver::PlanResolver; +use vegafusion_runtime::data::plan_resolver::ResolutionResult; /// A `PlanResolver` that delegates to a Python object. 
/// @@ -91,7 +92,6 @@ impl PyPlanResolver { struct ExternalTableInfo { scheme: String, schema: SchemaRef, - source: Option, metadata: Value, ref_id: Option, } @@ -113,7 +113,6 @@ fn extract_external_tables(plan: &LogicalPlan) -> HashMap( // Convert metadata to Python dict let py_metadata = pythonize::pythonize(py, &info.metadata)?; - // Reconstruct ExternalDataset(scheme, schema, metadata, data, source) + // Reconstruct ExternalDataset(scheme, schema, metadata, data) let kwargs = PyDict::new(py); kwargs.set_item("scheme", &info.scheme)?; kwargs.set_item("schema", py_schema)?; kwargs.set_item("metadata", py_metadata)?; kwargs.set_item("data", &data)?; - kwargs.set_item("source", info.source.as_deref())?; let dataset = dataset_cls.call((), Some(&kwargs))?; dict.set_item(table_name, dataset)?; } diff --git a/vegafusion-python/tests/test_plan_resolver.py b/vegafusion-python/tests/test_plan_resolver.py index e42ec9251..92750fcf5 100644 --- a/vegafusion-python/tests/test_plan_resolver.py +++ b/vegafusion-python/tests/test_plan_resolver.py @@ -698,9 +698,6 @@ def resolve_plan_proto( assert "Unknown dialect" in str(resolver.error) -# ── scan_url tests ── - - def test_scan_url_called_with_structured_dict() -> None: """scan_url receives a structured dict with parsed URL fields.""" from vegafusion.plan_resolver import external_table_scan_node diff --git a/vegafusion-python/vegafusion/dataset.py b/vegafusion-python/vegafusion/dataset.py index 93ec78ec9..13fed4a89 100644 --- a/vegafusion-python/vegafusion/dataset.py +++ b/vegafusion-python/vegafusion/dataset.py @@ -46,13 +46,11 @@ def __init__( schema: Any, # noqa: ANN401 metadata: dict[str, Any] | None = None, data: Any = None, # noqa: ANN401 - source: str | None = None, ) -> None: self._schema: Schema = ( Schema.from_arrow(schema) if not isinstance(schema, Schema) else schema ) self._scheme = scheme - self._source = source self._metadata: dict[str, Any] = dict(metadata) if metadata else {} self._data: Any = data 
self._data_ref: _DataRef | None = None diff --git a/vegafusion-python/vegafusion/plan_resolver.py b/vegafusion-python/vegafusion/plan_resolver.py index 91a925cce..34cab1009 100644 --- a/vegafusion-python/vegafusion/plan_resolver.py +++ b/vegafusion-python/vegafusion/plan_resolver.py @@ -390,7 +390,6 @@ def external_table_scan_node( scheme: str, schema: Schema, metadata: dict[str, Any] | None = None, - source: str | None = None, ) -> LogicalPlanNode: """Build a LogicalPlanNode for an external table scan. @@ -403,7 +402,6 @@ def external_table_scan_node( scheme: Scheme identifier (e.g. ``"spark"``). schema: Arrow schema (arro3.core.Schema) — required for logical planning. metadata: Optional JSON-serializable dict of metadata. - source: Optional source identifier. Returns: A deserialized LogicalPlanNode protobuf message. @@ -424,7 +422,6 @@ def external_table_scan_node( scheme=scheme, schema=schema, metadata=metadata, - source=source, ) ) return node diff --git a/vegafusion-runtime/src/data/codec.rs b/vegafusion-runtime/src/data/codec.rs index bad32a6c5..121d77976 100644 --- a/vegafusion-runtime/src/data/codec.rs +++ b/vegafusion-runtime/src/data/codec.rs @@ -83,14 +83,10 @@ impl LogicalExtensionCodec for VegaFusionCodec { ) })? 
.to_string(); - let source = envelope - .get("source") - .and_then(|v| v.as_str()) - .map(|s| s.to_string()); let metadata = envelope.get("metadata").cloned().unwrap_or(Value::Null); - Ok(Arc::new( - ExternalTableProvider::new(scheme, schema, metadata).with_source(source), - )) + Ok(Arc::new(ExternalTableProvider::new( + scheme, schema, metadata, + ))) } Some("inline") => { let name = envelope @@ -133,14 +129,11 @@ impl LogicalExtensionCodec for VegaFusionCodec { buf: &mut Vec, ) -> Result<()> { if let Some(ext) = node.as_any().downcast_ref::() { - let mut envelope = serde_json::json!({ + let envelope = serde_json::json!({ "type": "external", "scheme": ext.scheme(), "metadata": ext.metadata(), }); - if let Some(source) = ext.source() { - envelope["source"] = serde_json::Value::String(source.to_string()); - } let json_bytes = serde_json::to_vec(&envelope).map_err(|e| { DataFusionError::Plan(format!( "Failed to encode ExternalTableProvider envelope: {e}" diff --git a/vegafusion-runtime/src/data/datafusion_resolver.rs b/vegafusion-runtime/src/data/datafusion_resolver.rs index 10801b5f3..c3b1c1428 100644 --- a/vegafusion-runtime/src/data/datafusion_resolver.rs +++ b/vegafusion-runtime/src/data/datafusion_resolver.rs @@ -7,7 +7,9 @@ use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::Result; #[cfg(not(feature = "parquet"))] use vegafusion_common::error::VegaFusionError; -use vegafusion_core::runtime::{ParsedUrl, ResolutionResult}; +use vegafusion_core::runtime::ParsedUrl; + +use super::plan_resolver::ResolutionResult; use super::plan_resolver::PlanResolver; diff --git a/vegafusion-runtime/src/data/external_table.rs b/vegafusion-runtime/src/data/external_table.rs index 843d4f74a..da2474509 100644 --- a/vegafusion-runtime/src/data/external_table.rs +++ b/vegafusion-runtime/src/data/external_table.rs @@ -24,7 +24,6 @@ use vegafusion_common::arrow::datatypes::SchemaRef; pub struct ExternalTableProvider { scheme: String, schema: SchemaRef, - 
source: Option, metadata: Value, } @@ -33,24 +32,14 @@ impl ExternalTableProvider { Self { scheme, schema, - source: None, metadata, } } - pub fn with_source(mut self, source: Option) -> Self { - self.source = source; - self - } - pub fn scheme(&self) -> &str { &self.scheme } - pub fn source(&self) -> Option<&str> { - self.source.as_deref() - } - pub fn metadata(&self) -> &Value { &self.metadata } @@ -60,7 +49,6 @@ impl Debug for ExternalTableProvider { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("ExternalTableProvider") .field("scheme", &self.scheme) - .field("source", &self.source) .field("schema", &self.schema) .field("metadata", &self.metadata) .finish() diff --git a/vegafusion-runtime/src/data/pipeline.rs b/vegafusion-runtime/src/data/pipeline.rs index a327f0342..a9be7db9a 100644 --- a/vegafusion-runtime/src/data/pipeline.rs +++ b/vegafusion-runtime/src/data/pipeline.rs @@ -8,7 +8,9 @@ use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::{Result, VegaFusionError}; use vegafusion_core::data::url::normalize_base_url; -use vegafusion_core::runtime::{ParsedUrl, ResolutionResult}; +use vegafusion_core::runtime::ParsedUrl; + +use super::plan_resolver::ResolutionResult; use super::datafusion_resolver::DataFusionResolver; use super::external_table::ExternalTableProvider; diff --git a/vegafusion-runtime/src/data/plan_resolver.rs b/vegafusion-runtime/src/data/plan_resolver.rs index 13f6f2015..24772a259 100644 --- a/vegafusion-runtime/src/data/plan_resolver.rs +++ b/vegafusion-runtime/src/data/plan_resolver.rs @@ -10,19 +10,26 @@ use vegafusion_common::arrow::datatypes::SchemaRef; use vegafusion_common::data::table::VegaFusionTable; use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::{Result, VegaFusionError}; -use vegafusion_core::runtime::{ParsedUrl, ResolutionResult}; +use vegafusion_core::runtime::ParsedUrl; use 
super::external_table::ExternalTableProvider; +pub enum ResolutionResult { + /// Resolver fully materialized the plan + Table(VegaFusionTable), + /// Resolver produced a rewritten plan for the next resolver to handle, + /// or for DataFusion to execute if this is the last resolver + Plan(LogicalPlan), +} + /// Trait for custom data source integration with VegaFusion. /// /// Resolvers participate in a two-phase pipeline: /// -/// 1. **Planning phase**: [`capabilities`](Self::capabilities) declares supported -/// URL schemes/formats, and [`scan_url`](Self::scan_url) converts URLs into +/// 1. **URL scanning**: [`scan_url`](Self::scan_url) converts URLs into /// `LogicalPlan` nodes (typically `ExternalTableProvider` markers). /// -/// 2. **Execution phase**: [`resolve_table`](Self::resolve_table) or +/// 2. **Execution**: [`resolve_table`](Self::resolve_table) or /// [`resolve_plan`](Self::resolve_plan) provides data for external table /// references or rewrites the plan for remote execution. 
/// diff --git a/vegafusion-runtime/src/task_graph/runtime.rs b/vegafusion-runtime/src/task_graph/runtime.rs index ee8ac5425..1d27c165d 100644 --- a/vegafusion-runtime/src/task_graph/runtime.rs +++ b/vegafusion-runtime/src/task_graph/runtime.rs @@ -35,6 +35,22 @@ type CacheValue = (TaskValue, Vec); use crate::data::pipeline::{resolve_data_base_url, DataBaseUrlSetting}; +pub struct VegaFusionRuntimeOpts { + pub plan_resolvers: Vec>, + pub data_base_url: DataBaseUrlSetting, + pub cache: Option, +} + +impl Default for VegaFusionRuntimeOpts { + fn default() -> Self { + Self { + plan_resolvers: Vec::new(), + data_base_url: DataBaseUrlSetting::Default, + cache: None, + } + } +} + #[derive(Clone)] pub struct VegaFusionRuntime { pub cache: VegaFusionCache, @@ -43,26 +59,14 @@ pub struct VegaFusionRuntime { } impl VegaFusionRuntime { - pub fn new(cache: Option, plan_resolvers: Vec>) -> Self { - let ctx = Arc::new(make_datafusion_context()); - let data_base_url = resolve_data_base_url(&DataBaseUrlSetting::Default).unwrap_or_default(); - Self { - cache: cache.unwrap_or_else(|| VegaFusionCache::new(Some(32), None)), - pipeline: ResolverPipeline::new(plan_resolvers, ctx), - data_base_url, - } - } - - pub fn new_with_data_base_url( - cache: Option, - plan_resolvers: Vec>, - data_base_url_setting: DataBaseUrlSetting, - ) -> vegafusion_core::error::Result { + pub fn new(opts: VegaFusionRuntimeOpts) -> vegafusion_core::error::Result { let ctx = Arc::new(make_datafusion_context()); - let data_base_url = resolve_data_base_url(&data_base_url_setting)?; + let data_base_url = resolve_data_base_url(&opts.data_base_url)?; Ok(Self { - cache: cache.unwrap_or_else(|| VegaFusionCache::new(Some(32), None)), - pipeline: ResolverPipeline::new(plan_resolvers, ctx), + cache: opts + .cache + .unwrap_or_else(|| VegaFusionCache::new(Some(32), None)), + pipeline: ResolverPipeline::new(opts.plan_resolvers, ctx), data_base_url, }) } @@ -107,7 +111,7 @@ impl VegaFusionRuntime { impl Default for 
VegaFusionRuntime { fn default() -> Self { - Self::new(None, Vec::new()) + Self::new(VegaFusionRuntimeOpts::default()).expect("default opts should not fail") } } diff --git a/vegafusion-runtime/tests/test_plan_resolver.rs b/vegafusion-runtime/tests/test_plan_resolver.rs index bf8e454e7..f0eddb274 100644 --- a/vegafusion-runtime/tests/test_plan_resolver.rs +++ b/vegafusion-runtime/tests/test_plan_resolver.rs @@ -14,12 +14,14 @@ use vegafusion_common::datafusion_expr::LogicalPlan; use vegafusion_common::error::{Result, VegaFusionError}; use vegafusion_core::data::dataset::VegaFusionDataset; use vegafusion_core::proto::gen::pretransform::PreTransformSpecOpts; -use vegafusion_core::runtime::{ParsedUrl, ResolutionResult, VegaFusionRuntimeTrait}; +use vegafusion_core::runtime::{ParsedUrl, VegaFusionRuntimeTrait}; use vegafusion_core::spec::chart::ChartSpec; use vegafusion_runtime::data::external_table::ExternalTableProvider; use vegafusion_runtime::data::pipeline::ResolverPipeline; use vegafusion_runtime::data::plan_resolver::PlanResolver; +use vegafusion_runtime::data::plan_resolver::ResolutionResult; use vegafusion_runtime::task_graph::runtime::VegaFusionRuntime; +use vegafusion_runtime::task_graph::runtime::VegaFusionRuntimeOpts; #[derive(Clone, Debug)] struct ResolverEvent { @@ -198,7 +200,11 @@ async fn test_custom_executor_called_in_pre_transform_spec() { ); let resolver_clone = resolver.clone(); - let runtime = VegaFusionRuntime::new(None, vec![Arc::new(resolver)]); + let runtime = VegaFusionRuntime::new(VegaFusionRuntimeOpts { + plan_resolvers: vec![Arc::new(resolver)], + ..Default::default() + }) + .unwrap(); let spec = get_simple_spec(); let inline_datasets = get_inline_datasets(); @@ -237,7 +243,11 @@ async fn test_custom_executor_called_in_pre_transform_extract() { ); let resolver_clone = resolver.clone(); - let runtime = VegaFusionRuntime::new(None, vec![Arc::new(resolver)]); + let runtime = VegaFusionRuntime::new(VegaFusionRuntimeOpts { + plan_resolvers: 
vec![Arc::new(resolver)], + ..Default::default() + }) + .unwrap(); let spec = get_simple_spec(); let inline_datasets = get_inline_datasets(); @@ -276,7 +286,11 @@ async fn test_custom_executor_called_in_pre_transform_values() { ); let resolver_clone = resolver.clone(); - let runtime = VegaFusionRuntime::new(None, vec![Arc::new(resolver)]); + let runtime = VegaFusionRuntime::new(VegaFusionRuntimeOpts { + plan_resolvers: vec![Arc::new(resolver)], + ..Default::default() + }) + .unwrap(); let spec = get_simple_spec(); let inline_datasets = get_inline_datasets(); @@ -324,7 +338,11 @@ async fn test_bin_transform_uses_custom_executor() { ); let resolver_clone = resolver.clone(); - let runtime = VegaFusionRuntime::new(None, vec![Arc::new(resolver)]); + let runtime = VegaFusionRuntime::new(VegaFusionRuntimeOpts { + plan_resolvers: vec![Arc::new(resolver)], + ..Default::default() + }) + .unwrap(); let spec_str = r#"{ "$schema": "https://vega.github.io/schema/vega/v5.json", @@ -434,7 +452,11 @@ async fn test_mixed_data_only_executes_plans() { ); let resolver_clone = resolver.clone(); - let runtime = VegaFusionRuntime::new(None, vec![Arc::new(resolver)]); + let runtime = VegaFusionRuntime::new(VegaFusionRuntimeOpts { + plan_resolvers: vec![Arc::new(resolver)], + ..Default::default() + }) + .unwrap(); let spec_str = r#"{ "$schema": "https://vega.github.io/schema/vega/v5.json", @@ -968,7 +990,11 @@ async fn test_table_returning_resolver() { let resolver = TableResolver { movies_table: mem_table, }; - let runtime = VegaFusionRuntime::new(None, vec![Arc::new(resolver)]); + let runtime = VegaFusionRuntime::new(VegaFusionRuntimeOpts { + plan_resolvers: vec![Arc::new(resolver)], + ..Default::default() + }) + .unwrap(); let spec = get_simple_spec(); let inline_datasets = get_inline_datasets(); @@ -994,7 +1020,7 @@ async fn test_table_returning_resolver() { /// Test that VegaFusionRuntime works with no resolver (None) when inline datasets are tables. 
#[tokio::test] async fn test_no_resolver() { - let runtime = VegaFusionRuntime::new(None, Vec::new()); + let runtime = VegaFusionRuntime::new(VegaFusionRuntimeOpts::default()).unwrap(); let spec_str = r#"{ "$schema": "https://vega.github.io/schema/vega/v5.json", @@ -1306,7 +1332,11 @@ async fn test_resolver_error_propagation() { } } - let runtime = VegaFusionRuntime::new(None, vec![Arc::new(FailingResolver)]); + let runtime = VegaFusionRuntime::new(VegaFusionRuntimeOpts { + plan_resolvers: vec![Arc::new(FailingResolver)], + ..Default::default() + }) + .unwrap(); let spec = get_simple_spec(); let inline_datasets = get_inline_datasets(); diff --git a/vegafusion-server/src/main.rs b/vegafusion-server/src/main.rs index ea46f714f..fb9c80412 100644 --- a/vegafusion-server/src/main.rs +++ b/vegafusion-server/src/main.rs @@ -19,7 +19,9 @@ use vegafusion_core::proto::gen::tasks::{ use vegafusion_core::runtime::VegaFusionRuntimeTrait; use vegafusion_core::spec::chart::ChartSpec; use vegafusion_core::task_graph::graph::ScopedVariable; -use vegafusion_runtime::task_graph::runtime::{decode_inline_datasets, VegaFusionRuntime}; +use vegafusion_runtime::task_graph::runtime::{ + decode_inline_datasets, VegaFusionRuntime, VegaFusionRuntimeOpts, +}; use clap::Parser; use regex::Regex; @@ -374,10 +376,11 @@ fn main() -> Result<(), VegaFusionError> { .build() .expect("Failed to create tokio runtime"); - let tg_runtime = VegaFusionRuntime::new( - Some(VegaFusionCache::new(Some(args.capacity), memory_limit)), - Vec::new(), - ); + let tg_runtime = VegaFusionRuntime::new(VegaFusionRuntimeOpts { + cache: Some(VegaFusionCache::new(Some(args.capacity), memory_limit)), + ..Default::default() + }) + .expect("Failed to create VegaFusionRuntime"); tokio_runtime.block_on(async move { grpc_server(grpc_address, tg_runtime.clone(), args.web) From 0e303ed90a68b54d0e012018a687dac2ceb6f2fb Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Wed, 18 Mar 2026 11:31:54 -0400 Subject: [PATCH 29/36] test: add 
filter pushdown test with TODO for _vf_order restructuring Add test proving filter transforms work end-to-end with resolve_table (DataFusion applies filters after resolution). Assert that filters are currently not pushed down to resolve_table due to _vf_order window blocking PushDownFilter, with TODO to address via with_index changes. Remove unused optimize_filters infrastructure from ResolverPipeline. Co-Authored-By: Claude Opus 4.6 (1M context) --- vegafusion-python/tests/test_plan_resolver.py | 36 ++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/vegafusion-python/tests/test_plan_resolver.py b/vegafusion-python/tests/test_plan_resolver.py index 92750fcf5..35fffaaeb 100644 --- a/vegafusion-python/tests/test_plan_resolver.py +++ b/vegafusion-python/tests/test_plan_resolver.py @@ -872,14 +872,17 @@ def resolve_table( assert len(datasets) == 1 -def test_resolve_table_accepts_filters_param() -> None: - """resolve_table with filters kwarg doesn't crash (filters may be empty).""" +def test_resolve_table_with_filter_transform() -> None: + """resolve_table works with a Vega filter transform; filter is applied after resolution.""" from vegafusion.plan_resolver import external_table_scan_node - class FilterAcceptingResolver(PlanResolver): + class FilterCapturingResolver(PlanResolver): + def __init__(self) -> None: + self.captured_filters: list[Any] = [] + def scan_url(self, parsed_url: dict[str, Any]) -> Any: if parsed_url["scheme"] == "myproto": - schema = pa.schema([("val", pa.int64())]) + schema = pa.schema([("x", pa.int64()), ("y", pa.utf8())]) return external_table_scan_node( table_name="data", schema=schema, @@ -896,15 +899,21 @@ def resolve_table( projected_columns: list[str] | None = None, filters: list[Any] | None = None, ) -> pa.Table: - # filters may be None or empty — just verify it's accepted - return pa.table({"val": [42, 99]}) + self.captured_filters.extend(filters or []) + return pa.table({"x": [1, 5, 10], "y": ["a", "b", 
"c"]}) - resolver = FilterAcceptingResolver() + resolver = FilterCapturingResolver() rt = vf.VegaFusionRuntime(plan_resolver=resolver) spec = { "$schema": "https://vega.github.io/schema/vega/v5.json", - "data": [{"name": "source", "url": "myproto://db/table"}], + "data": [ + { + "name": "source", + "url": "myproto://db/table", + "transform": [{"type": "filter", "expr": "datum.x > 3"}], + } + ], } datasets, _warnings = rt.pre_transform_datasets( @@ -912,8 +921,17 @@ def resolve_table( datasets=["source"], dataset_format="pyarrow", ) + assert len(datasets) == 1 - assert datasets[0].column("val").to_pylist() == [42, 99] + # Filter is applied by DataFusion after resolve_table returns + result = datasets[0] + assert result.column("x").to_pylist() == [5, 10] + + # TODO: filters should be pushed down to resolve_table so resolvers can + # optimize data loading. Currently blocked because VegaFusion's _vf_order + # window sits between the scan and user filters, preventing DataFusion's + # PushDownFilter from reaching the ExternalTableProvider. + assert resolver.captured_filters == [] def test_unparse_expr_to_sql() -> None: From b081089bc635ce22b716d3cde0d7c69f226534f4 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Wed, 18 Mar 2026 11:45:51 -0400 Subject: [PATCH 30/36] style: remove redundant examples Drop plan_resolver_basic.py (covered by url_scanning example) and custom_resolver.rs (logging pass-through doesn't demonstrate real use). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../python-examples/plan_resolver_basic.py | 86 ------------ .../rust-examples/examples/custom_resolver.rs | 128 ------------------ 2 files changed, 214 deletions(-) delete mode 100644 examples/python-examples/plan_resolver_basic.py delete mode 100644 examples/rust-examples/examples/custom_resolver.rs diff --git a/examples/python-examples/plan_resolver_basic.py b/examples/python-examples/plan_resolver_basic.py deleted file mode 100644 index 8a419b204..000000000 --- a/examples/python-examples/plan_resolver_basic.py +++ /dev/null @@ -1,86 +0,0 @@ -# Demonstrates the simplest PlanResolver pattern: override resolve_table to provide -# data for an ExternalDataset. No protobuf dependency needed. - -from __future__ import annotations - -import json -from typing import Any - -import pyarrow as pa - -import vegafusion as vf -from vegafusion import ExternalDataset, PlanResolver - - -def main() -> None: - source_table = pa.table({"x": [1, 5, 10], "y": ["a", "b", "c"]}) - ext = ExternalDataset( - scheme="custom", schema=source_table.schema, data=source_table - ) - resolver = TableResolver(source_table) - rt = vf.VegaFusionRuntime(plan_resolver=resolver) - - spec = make_spec() - datasets, warnings = rt.pre_transform_datasets( - spec, - datasets=["filtered"], - inline_datasets={"source": ext}, - dataset_format="pyarrow", - ) - - assert len(datasets) == 1 - result = datasets[0] - assert result.num_rows == 2 - assert result.column("x").to_pylist() == [5, 10] - assert result.column("y").to_pylist() == ["b", "c"] - - print("Result after filter (x > 3):") - print(result.to_pandas().to_string(index=False)) - - -class TableResolver(PlanResolver): - """Returns a fixed table for any resolve_table call.""" - - def __init__(self, table: pa.Table) -> None: - self._table = table - - def resolve_table( - self, - name: str, - scheme: str, - schema: Any, - metadata: dict[str, Any] | None = None, - projected_columns: list[str] | None = None, - 
filters: list[Any] | None = None, - ) -> pa.Table: - return self._table - - -def make_spec() -> dict[str, Any]: - return json.loads( - """ -{ - "$schema": "https://vega.github.io/schema/vega/v5.json", - "data": [ - { - "name": "source", - "url": "table://source" - }, - { - "name": "filtered", - "source": "source", - "transform": [ - { - "type": "filter", - "expr": "datum.x > 3" - } - ] - } - ] -} - """ - ) - - -if __name__ == "__main__": - main() diff --git a/examples/rust-examples/examples/custom_resolver.rs b/examples/rust-examples/examples/custom_resolver.rs deleted file mode 100644 index 96ae7df70..000000000 --- a/examples/rust-examples/examples/custom_resolver.rs +++ /dev/null @@ -1,128 +0,0 @@ -use std::sync::Arc; -use vegafusion_common::datafusion_expr::LogicalPlan; -use vegafusion_common::error::Result; -use vegafusion_core::runtime::VegaFusionRuntimeTrait; -use vegafusion_core::spec::chart::ChartSpec; -use vegafusion_runtime::data::plan_resolver::PlanResolver; -use vegafusion_runtime::data::plan_resolver::ResolutionResult; -use vegafusion_runtime::task_graph::runtime::{VegaFusionRuntime, VegaFusionRuntimeOpts}; - -/// A custom resolver that logs plan resolution and passes through to DataFusion -#[derive(Clone)] -struct LoggingResolver; - -#[async_trait::async_trait] -impl PlanResolver for LoggingResolver { - fn name(&self) -> &str { - "LoggingResolver" - } - - async fn resolve_plan(&self, plan: LogicalPlan) -> Result { - println!("Custom resolver received logical plan"); - println!("Plan details:\n{}\n", plan.display_indent()); - - // Return the plan unchanged — DataFusion will execute it - Ok(ResolutionResult::Plan(plan)) - } -} - -/// This example demonstrates how to use a custom plan resolver with VegaFusion. -/// The custom resolver logs each plan before letting DataFusion execute it. 
-#[tokio::main] -async fn main() { - let spec = get_spec(); - - // Create a custom resolver - let custom_resolver = Arc::new(LoggingResolver) as Arc; - - // Create runtime with custom resolver - let runtime = VegaFusionRuntime::new(VegaFusionRuntimeOpts { - plan_resolvers: vec![custom_resolver], - ..Default::default() - }) - .unwrap(); - - println!("Starting pre-transform with custom resolver\n"); - - let (_transformed_spec, warnings) = runtime - .pre_transform_spec( - &spec, - &Default::default(), // Inline datasets - &Default::default(), // Options - ) - .await - .unwrap(); - println!("Spec transformed"); - assert_eq!(warnings.len(), 0); -} - -fn get_spec() -> ChartSpec { - let spec_str = r##" - { - "$schema": "https://vega.github.io/schema/vega/v5.json", - "description": "A histogram demonstrating custom resolver usage", - "width": 400, - "height": 200, - "padding": 5, - "data": [ - { - "name": "table", - "url": "data/movies.json", - "transform": [ - { - "type": "extent", - "field": "IMDB Rating", - "signal": "extent" - }, - { - "type": "bin", - "signal": "bins", - "field": "IMDB Rating", - "extent": {"signal": "extent"}, - "maxbins": 10 - }, - { - "type": "aggregate", - "groupby": ["bin0", "bin1"], - "ops": ["count"], - "fields": [null], - "as": ["count"] - } - ] - } - ], - "scales": [ - { - "name": "xscale", - "type": "linear", - "range": "width", - "domain": {"signal": "extent"} - }, - { - "name": "yscale", - "type": "linear", - "range": "height", - "round": true, - "domain": {"data": "table", "field": "count"}, - "zero": true, - "nice": true - } - ], - "marks": [ - { - "type": "rect", - "from": {"data": "table"}, - "encode": { - "update": { - "x": {"scale": "xscale", "field": "bin0"}, - "x2": {"scale": "xscale", "field": "bin1"}, - "y": {"scale": "yscale", "field": "count"}, - "y2": {"scale": "yscale", "value": 0} - } - } - } - ] - } - "##; - serde_json::from_str(spec_str).unwrap() -} From 2b7cd8fc4c32c42ed999e4ff942ff689edbdeaae Mon Sep 17 00:00:00 2001 
From: Jon Mease Date: Wed, 18 Mar 2026 12:06:29 -0400 Subject: [PATCH 31/36] style: clean up Python test assertions - Remove dead _vf_scheme check (removed implementation detail) - Drop redundant snapshot from proto_message unparse test (already covered by from_resolver test, keep bytes==proto equality check) - Add clarifying comment to scan_url_not_called_without_override test Co-Authored-By: Claude Opus 4.6 (1M context) --- vegafusion-python/tests/test_plan_resolver.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vegafusion-python/tests/test_plan_resolver.py b/vegafusion-python/tests/test_plan_resolver.py index 35fffaaeb..0ac1208aa 100644 --- a/vegafusion-python/tests/test_plan_resolver.py +++ b/vegafusion-python/tests/test_plan_resolver.py @@ -146,7 +146,6 @@ def test_external_dataset_registry() -> None: ) assert ext.scheme == "test" - assert "_vf_scheme" not in ext.metadata # scheme is separate from metadata assert "_vf_ref_id" in ext.metadata ref_id = ext.metadata["_vf_ref_id"] assert ExternalDataset.resolve_data(ref_id) is table @@ -640,11 +639,8 @@ def resolve_plan(self, logical_plan: Any, datasets: dict[str, Any]) -> pa.Table: assert resolver.sql_from_proto is not None assert resolver.sql_from_bytes is not None + # Both paths (bytes and proto message) produce identical SQL assert resolver.sql_from_proto == resolver.sql_from_bytes - # Verify the SQL references the external table name - assert resolver.sql_from_proto == snapshot( - 'SELECT "x", "y" FROM (SELECT "_vf_order" AS "_vf_order", "source"."x" AS "x", "source"."y" AS "y" FROM (SELECT row_number() OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS "_vf_order", "source"."x", "source"."y" FROM "source") AS "derived_projection") AS "derived_projection" WHERE CASE WHEN ("x" > 3.0) IS NULL THEN false ELSE ("x" > 3.0) END ORDER BY "_vf_order" ASC NULLS LAST' - ) def test_external_dataset_without_resolver_raises() -> None: @@ -843,7 +839,7 @@ def 
test_scan_url_not_called_without_override() -> None: """Resolver without scan_url override does not trigger Python roundtrip.""" class SimpleResolver(PlanResolver): - """Only overrides resolve_table — should NOT trigger scan_url calls.""" + """Only overrides resolve_table — scan_url is not overridden.""" def resolve_table( self, @@ -862,6 +858,10 @@ def resolve_table( rt = vf.VegaFusionRuntime(plan_resolver=resolver) spec = simple_spec() + # This exercises the code path where check_method_override detects no + # scan_url override, so the Rust side skips the Python call entirely. + # If the detection were wrong, the base class scan_url (returning None) + # would still work, but we'd pay an unnecessary Python roundtrip. datasets, _warnings = rt.pre_transform_datasets( spec, datasets=["filtered"], From fd0140a0ef9f683a6e0ec966cb64e847cc213193 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Wed, 18 Mar 2026 12:19:39 -0400 Subject: [PATCH 32/36] docs: fix plan_resolver.md links and update signatures - Remove links to deleted examples (plan_resolver_basic.py, custom_resolver.rs) - Add filters parameter to resolve_table code snippets - Add unparse_expr_to_sql to API reference Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/source/features/plan_resolver.md | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/docs/source/features/plan_resolver.md b/docs/source/features/plan_resolver.md index 74095b44c..6b27d8ce1 100644 --- a/docs/source/features/plan_resolver.md +++ b/docs/source/features/plan_resolver.md @@ -27,7 +27,7 @@ class TableResolver(PlanResolver): self._table = table def resolve_table(self, name, scheme, schema, metadata=None, - projected_columns=None): + projected_columns=None, filters=None): return self._table source = pa.table({"x": [1, 5, 10], "y": ["a", "b", "c"]}) @@ -43,8 +43,6 @@ datasets, _ = rt.pre_transform_datasets( VegaFusion calls `resolve_table` to get the data, then applies Vega transforms (filter, aggregate, etc.) 
via DataFusion. No protobuf dependency is needed. -See [plan_resolver_basic.py](https://github.com/vega/vegafusion/tree/main/examples/python-examples/plan_resolver_basic.py) for a complete example. - ### scan_url For custom URL schemes in Vega specs (e.g. `"url": "mydata://database/sales"`), override `scan_url()`: @@ -60,7 +58,7 @@ class SalesResolver(PlanResolver): return None # pass to next resolver def resolve_table(self, name, scheme, schema, metadata=None, - projected_columns=None): + projected_columns=None, filters=None): return pa.table({"product": ["Widget", "Gadget"], "revenue": [1200, 3400]}) ``` @@ -101,11 +99,11 @@ See [plan_resolver_sql.py](https://github.com/vega/vegafusion/tree/main/examples .. autofunction:: vegafusion.plan_resolver.unparse_to_sql +.. autofunction:: vegafusion.plan_resolver.unparse_expr_to_sql + .. autofunction:: vegafusion.plan_resolver.inline_table_scan_node ``` ## Rust -The `PlanResolver` trait in `vegafusion-runtime` provides the same two-phase architecture (scan_url at planning time, resolve_table/resolve_plan at execution time). - -See [custom_resolver.rs](https://github.com/vega/vegafusion/tree/main/examples/rust-examples/examples/custom_resolver.rs) for a working example, and the [vegafusion-runtime docs on docs.rs](https://docs.rs/vegafusion-runtime/) for the full API. +The `PlanResolver` trait in `vegafusion-runtime` provides the same two-phase architecture (scan_url at planning time, resolve_table/resolve_plan at execution time). See the [vegafusion-runtime docs on docs.rs](https://docs.rs/vegafusion-runtime/) for the full API. 
From 746edd5566e6a1d7ec453a4c5368175e0b6a58e7 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Wed, 18 Mar 2026 19:42:55 -0400 Subject: [PATCH 33/36] docs: improve plan_resolver.md examples and fix review findings - Add imports to all code snippets - Use generic examples with comments explaining real-world usage - Show data_base_url for relative URL resolution - Fix configuration bullets to show defaults (thread_safe=True, skip_when_no_external_tables=True, supports_arrow_tables=False) - Clarify protobuf dependency note (external_table_scan_node needs it, not scan_url itself) Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/source/features/plan_resolver.md | 113 ++++++++++++++++++-------- 1 file changed, 78 insertions(+), 35 deletions(-) diff --git a/docs/source/features/plan_resolver.md b/docs/source/features/plan_resolver.md index 6b27d8ce1..46f4bdc1d 100644 --- a/docs/source/features/plan_resolver.md +++ b/docs/source/features/plan_resolver.md @@ -5,7 +5,7 @@ PlanResolver lets you connect custom data sources to VegaFusion. Use it when dat :::{note} `resolve_table`, `resolve_plan_proto` (bytes variant), and `unparse_to_sql` with bytes require no additional dependencies beyond `vegafusion`. -`scan_url`, `resolve_plan` (deserialized `LogicalPlanNode`), `external_table_scan_node`, and `inline_table_scan_node` require the protobuf package: +`external_table_scan_node`, `inline_table_scan_node`, and `resolve_plan` (deserialized `LogicalPlanNode` variant) require the protobuf package: ``` pip install vegafusion[plan-resolver] @@ -16,66 +16,103 @@ pip install vegafusion[plan-resolver] Override one of these methods on `PlanResolver` (simplest first): -- `resolve_table`: return data for each external table independently. The default `resolve_plan` walks the plan and calls this for every external table. -- `resolve_plan` / `resolve_plan_proto`: receive the entire logical plan. 
Overriding this supersedes `resolve_table` since the runtime calls `resolve_plan` directly; `resolve_table` is only reached via the default implementation. +- `resolve_table`: return an Arrow table for a single external data source. VegaFusion handles the rest — it applies Vega transforms (filter, aggregate, etc.) via DataFusion after your resolver provides the data. +- `resolve_plan` / `resolve_plan_proto`: evaluate an entire logical plan, or the parts your backend supports. Use this to transpile the plan to SQL and execute it remotely, or to push supported operations to your query engine while letting DataFusion handle the rest. -### resolve_table +### scan_url + resolve_table + +For custom URL schemes in Vega specs (e.g. `"url": "mydb://warehouse/sales"`), override `scan_url()` and `resolve_table()`: ```python -class TableResolver(PlanResolver): - def __init__(self, table): - self._table = table +import vegafusion as vf +from vegafusion import PlanResolver +from vegafusion.plan_resolver import external_table_scan_node + +class MyResolver(PlanResolver): + def scan_url(self, parsed_url): + if parsed_url["scheme"] != "mydb": + return None # pass to next resolver + + # Look up the table schema from your data source. + # This is called at planning time, so avoid loading data here. + schema = get_table_schema(parsed_url["path"]) + + return external_table_scan_node( + table_name=parsed_url["url"], + schema=schema, + scheme="mydb", + metadata={"path": parsed_url["path"]}, + ) def resolve_table(self, name, scheme, schema, metadata=None, projected_columns=None, filters=None): - return self._table + # Called at execution time — load the actual data. + # projected_columns lists only the columns DataFusion needs, + # so you can avoid reading unnecessary columns. 
+ return load_table(metadata["path"], columns=projected_columns) +``` -source = pa.table({"x": [1, 5, 10], "y": ["a", "b", "c"]}) -ext = ExternalDataset(scheme="custom", schema=source.schema, data=source) -resolver = TableResolver(source) +`scan_url()` is called at planning time — it inspects the URL and returns an `ExternalTableProvider` plan node with the table's schema. `resolve_table()` is called at execution time to provide the actual data. -rt = vf.VegaFusionRuntime(plan_resolver=resolver) -datasets, _ = rt.pre_transform_datasets( - spec, datasets=["filtered"], - inline_datasets={"source": ext}, dataset_format="pyarrow", +Use `data_base_url` on the runtime to set a base path for relative URLs in Vega specs: + +```python +resolver = MyResolver() +rt = vf.VegaFusionRuntime( + plan_resolver=resolver, + data_base_url="mydb://warehouse/", ) + +# Vega spec with "url": "sales" resolves to "mydb://warehouse/sales" ``` -VegaFusion calls `resolve_table` to get the data, then applies Vega transforms (filter, aggregate, etc.) via DataFusion. No protobuf dependency is needed. +See [plan_resolver_url_scanning.py](https://github.com/vega/vegafusion/tree/main/examples/python-examples/plan_resolver_url_scanning.py) for a complete example. -### scan_url +### resolve_table only -For custom URL schemes in Vega specs (e.g. 
`"url": "mydata://database/sales"`), override `scan_url()`: +If data comes from `ExternalDataset` inline datasets (not URLs), you only need `resolve_table`: ```python -class SalesResolver(PlanResolver): - def scan_url(self, parsed_url): - if parsed_url["scheme"] == "mydata": - schema = pa.schema([("product", pa.utf8()), ("revenue", pa.int64())]) - return external_table_scan_node( - table_name="sales_data", schema=schema, scheme="mydata", - ) - return None # pass to next resolver +import vegafusion as vf +from vegafusion import ExternalDataset, PlanResolver +class MyResolver(PlanResolver): def resolve_table(self, name, scheme, schema, metadata=None, projected_columns=None, filters=None): - return pa.table({"product": ["Widget", "Gadget"], "revenue": [1200, 3400]}) -``` + # Look up data by name from your data source + df = my_database.query(name, columns=projected_columns) + return df.to_arrow() -`scan_url()` creates an `ExternalTableProvider` plan node for URLs your resolver handles, and `resolve_table()` provides the data at execution time. +ext = ExternalDataset(scheme="mydb", schema=table.schema, data=table) +rt = vf.VegaFusionRuntime(plan_resolver=MyResolver()) +datasets, _ = rt.pre_transform_datasets( + spec, datasets=["result"], + inline_datasets={"source": ext}, dataset_format="pyarrow", +) +``` -See [plan_resolver_url_scanning.py](https://github.com/vega/vegafusion/tree/main/examples/python-examples/plan_resolver_url_scanning.py) for a complete example. +No protobuf dependency is needed for this pattern. ### resolve_plan + unparse_to_sql -Override `resolve_plan_proto` to receive the serialized logical plan. 
Use `unparse_to_sql()` to convert it to SQL: +Override `resolve_plan_proto` to receive the full logical plan and transpile it to SQL for remote execution: ```python +from vegafusion import PlanResolver +from vegafusion.plan_resolver import unparse_to_sql + class SqlResolver(PlanResolver): + def __init__(self, connection): + self._conn = connection + def resolve_plan_proto(self, plan_bytes, datasets): - sql = unparse_to_sql(plan_bytes, dialect="postgres") - # Execute SQL against your database and return the result - return execute_query(sql) + # Convert the DataFusion logical plan to a SQL string + sql = unparse_to_sql(plan_bytes, dialect="default") + + # Execute the SQL against your database + cursor = self._conn.cursor() + cursor.execute(sql) + return cursor.fetch_arrow_all() ``` `resolve_plan_proto` receives protobuf bytes that can be passed directly to `unparse_to_sql()` without deserialization. To inspect or modify the plan tree, use `resolve_plan()` instead (it receives a deserialized `LogicalPlanNode`). @@ -84,7 +121,13 @@ Supported SQL dialects: `"default"`, `"postgres"`, `"mysql"`, `"sqlite"`, `"duck See [plan_resolver_sql.py](https://github.com/vega/vegafusion/tree/main/examples/python-examples/plan_resolver_sql.py) for a complete example. -`PlanResolver` cannot be used with `grpc_connect()` (resolvers run in-process). Set `thread_safe = False` for backends with thread-affine connections (e.g. DuckDB). Set `skip_when_no_external_tables = False` to receive all plans (e.g. for logging). Set `supports_arrow_tables = True` to let the runtime eagerly materialize plans into Arrow tables. +### Configuration + +`PlanResolver` cannot be used with `grpc_connect()` (resolvers run in-process). Class-level attributes control resolver behavior: + +- `thread_safe` (default `True`) — set to `False` for backends with thread-affine connections (e.g. 
DuckDB) +- `skip_when_no_external_tables` (default `True`) — set to `False` to receive all plans, not just those with external tables (e.g. for logging) +- `supports_arrow_tables` (default `False`) — set to `True` to let the runtime eagerly materialize plans into Arrow tables ### API Reference From 88bef3259582410141c3cb507c0b5d64d157ebc1 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Wed, 18 Mar 2026 19:48:24 -0400 Subject: [PATCH 34/36] style: fix example review findings - Remove pandas dependency from url_scanning example (use print(table)) - Use parsed_url["url"] as table_name to avoid collisions - Remove unused source_table from SQL example constructor - Clarify hardcoded return in SQL example with explicit comment Co-Authored-By: Claude Opus 4.6 (1M context) --- examples/python-examples/plan_resolver_sql.py | 14 ++++++++------ .../python-examples/plan_resolver_url_scanning.py | 8 ++++++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/examples/python-examples/plan_resolver_sql.py b/examples/python-examples/plan_resolver_sql.py index c69d90bc2..375119f1b 100644 --- a/examples/python-examples/plan_resolver_sql.py +++ b/examples/python-examples/plan_resolver_sql.py @@ -1,6 +1,6 @@ # Demonstrates SQL transpilation using resolve_plan_proto() + unparse_to_sql(). -# The resolver receives a serialized logical plan, converts it to SQL, and returns -# a result table. In a real application you would execute the SQL against a database. +# The resolver receives a serialized logical plan, converts it to SQL, and prints it. +# In a real application you would execute the SQL against a database. 
import json from typing import Any @@ -16,7 +16,7 @@ def main() -> None: source_table = pa.table({"x": [1, 5, 10], "y": ["a", "b", "c"]}) ext = ExternalDataset(scheme="table", schema=source_table.schema, data=source_table) - resolver = SqlTranspileResolver(source_table=source_table) + resolver = SqlTranspileResolver() rt = vf.VegaFusionRuntime(plan_resolver=resolver) spec = get_spec() @@ -44,8 +44,7 @@ def main() -> None: class SqlTranspileResolver(PlanResolver): """Converts the logical plan to Postgres-dialect SQL.""" - def __init__(self, source_table: pa.Table) -> None: - self.source_table = source_table + def __init__(self) -> None: self.captured_sql: str | None = None def resolve_plan_proto( @@ -53,7 +52,10 @@ def resolve_plan_proto( ) -> pa.Table: sql = unparse_to_sql(plan_bytes, dialect="postgres") self.captured_sql = sql - # In a real scenario you would execute `sql` against a database. + + # In a real resolver, you would execute `sql` against your database + # and return the result as an Arrow table. Here we return hardcoded + # data matching the expected query result for demonstration. 
return pa.table({"x": [5, 10], "y": ["b", "c"]}) diff --git a/examples/python-examples/plan_resolver_url_scanning.py b/examples/python-examples/plan_resolver_url_scanning.py index 0c9f27fa0..89915f127 100644 --- a/examples/python-examples/plan_resolver_url_scanning.py +++ b/examples/python-examples/plan_resolver_url_scanning.py @@ -35,7 +35,7 @@ def main(): assert table.column("product").to_pylist() == ["Widget", "Gadget", "Gizmo"] assert table.column("revenue").to_pylist() == [1200, 3400, 560] print("Result table:") - print(table.to_pandas().to_string(index=False)) + print(table) print("\nAll assertions passed.") @@ -46,7 +46,9 @@ def scan_url(self, parsed_url: dict[str, Any]) -> Any: if parsed_url["scheme"] == "mydata": schema = pa.schema([("product", pa.utf8()), ("revenue", pa.int64())]) return external_table_scan_node( - table_name="sales_data", + # Use the full URL as the table name so multiple URLs + # produce distinct plan nodes + table_name=parsed_url["url"], schema=schema, scheme="mydata", ) @@ -61,6 +63,8 @@ def resolve_table( projected_columns: list[str] | None = None, filters: list[Any] | None = None, ) -> pa.Table: + # In a real resolver, use `name` or `metadata` to look up data + # from your data source. Here we return a fixed table. 
return pa.table( { "product": ["Widget", "Gadget", "Gizmo"], From fab9c571aa1b279d8c744922b2611c83f881da55 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Sat, 21 Mar 2026 16:09:29 -0400 Subject: [PATCH 35/36] Align VegaFusion URL config with server policy --- Cargo.lock | 2 + docs/source/features/grpc.md | 13 +- docs/source/features/plan_resolver.md | 4 +- examples/editor-demo/README.md | 5 + vegafusion-core/Cargo.toml | 3 + vegafusion-core/src/data/url.rs | 425 +++++++++++++++++- vegafusion-core/src/runtime/mod.rs | 4 +- vegafusion-python/src/lib.rs | 50 ++- .../tests/test_runtime_config.py | 109 +++++ vegafusion-python/vegafusion/runtime.py | 82 +++- vegafusion-runtime/src/data/pipeline.rs | 14 +- vegafusion-runtime/src/data/tasks.rs | 170 +++---- vegafusion-runtime/src/task_graph/runtime.rs | 21 +- vegafusion-runtime/src/task_graph/task.rs | 4 +- vegafusion-runtime/tests/test_url_policy.rs | 263 +++++++++++ vegafusion-server/Cargo.toml | 3 + vegafusion-server/src/main.rs | 37 +- .../tests/test_task_graph_runtime.rs | 246 +++++++++- 18 files changed, 1292 insertions(+), 163 deletions(-) create mode 100644 vegafusion-python/tests/test_runtime_config.py create mode 100644 vegafusion-runtime/tests/test_url_policy.rs diff --git a/Cargo.lock b/Cargo.lock index 393e7c964..558431fd7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4883,6 +4883,7 @@ dependencies = [ "serde", "serde_json", "sqlparser", + "tempfile", "thiserror 1.0.69", "tonic", "tonic-build", @@ -4964,6 +4965,7 @@ dependencies = [ "protobuf-src", "regex", "serde_json", + "tempfile", "tokio", "tonic", "tonic-build", diff --git a/docs/source/features/grpc.md b/docs/source/features/grpc.md index cd7b67900..d59f9b23a 100644 --- a/docs/source/features/grpc.md +++ b/docs/source/features/grpc.md @@ -2,7 +2,9 @@ The VegaFusion Runtime can run as a [gRPC](https://grpc.io/) service, which makes it possible for multiple clients to connect to the same runtime, and share a cache (See [How it Works](../about/how_it_works) 
for more details). This also makes it possible for the Runtime to reside on a different host than the client. :::{warning} -VegaFusion's gRPC server does not currently support authentication, and chart specifications may reference the local file system of the machine running the server. It is not currently recommended to use VegaFusion server with untrusted Vega specifications unless other measures are taken to isolate the service. +VegaFusion's gRPC server does not currently support authentication. If you use it with untrusted Vega specifications, lock down the server process with `--no-allowed-urls`, `--allowed-base-url`, `--base-url`, or `--no-base-url`, and apply any additional isolation your deployment requires. + +URL policy is enforced against the initial resolved URL only. VegaFusion does not re-check redirect destinations after a fetch begins. ::: ## VegaFusion Server @@ -18,6 +20,15 @@ The server may then be launched using a particular port as follows: vegafusion-server --port 50051 ``` +The server process owns URL resolution and access policy for all gRPC clients. For example: + +``` +vegafusion-server \ + --port 50051 \ + --base-url https://cdn.jsdelivr.net/npm/vega-datasets@v2.9.0/ \ + --allowed-base-url https://cdn.jsdelivr.net/ +``` + ## Python The `vf.runtime.grpc_connect` method is used to connect the Python client to a VegaFusion Server instance. diff --git a/docs/source/features/plan_resolver.md b/docs/source/features/plan_resolver.md index 46f4bdc1d..03da4135d 100644 --- a/docs/source/features/plan_resolver.md +++ b/docs/source/features/plan_resolver.md @@ -54,13 +54,13 @@ class MyResolver(PlanResolver): `scan_url()` is called at planning time — it inspects the URL and returns an `ExternalTableProvider` plan node with the table's schema. `resolve_table()` is called at execution time to provide the actual data. 
-Use `data_base_url` on the runtime to set a base path for relative URLs in Vega specs: +Use `base_url` on the runtime to set a base path for relative URLs in Vega specs: ```python resolver = MyResolver() rt = vf.VegaFusionRuntime( plan_resolver=resolver, - data_base_url="mydb://warehouse/", + base_url="mydb://warehouse/", ) # Vega spec with "url": "sales" resolves to "mydb://warehouse/sales" diff --git a/examples/editor-demo/README.md b/examples/editor-demo/README.md index 489b96b9c..02c0396e7 100644 --- a/examples/editor-demo/README.md +++ b/examples/editor-demo/README.md @@ -8,6 +8,11 @@ Launch gRPC-Web server with: ./vegafusion-server --port 50051 --web ``` +Add `--base-url`, `--no-base-url`, `--allowed-base-url`, or `--no-allowed-urls` +to control how the server resolves and accesses external data URLs. +Policy checks apply to the initial resolved URL only; redirect destinations are +not re-checked after a fetch begins. + Build and launch editor with ``` npm install diff --git a/vegafusion-core/Cargo.toml b/vegafusion-core/Cargo.toml index d33d410b5..5b319b8f6 100644 --- a/vegafusion-core/Cargo.toml +++ b/vegafusion-core/Cargo.toml @@ -87,6 +87,9 @@ optional = true workspace = true optional = true +[dev-dependencies.tempfile] +workspace = true + [lints.clippy] module_inception = "allow" diff --git a/vegafusion-core/src/data/url.rs b/vegafusion-core/src/data/url.rs index 121ca33d3..fad5352b9 100644 --- a/vegafusion-core/src/data/url.rs +++ b/vegafusion-core/src/data/url.rs @@ -1,11 +1,15 @@ use regex::Regex; +#[cfg(not(target_arch = "wasm32"))] +use std::fs; +use std::path::{Path, PathBuf}; use std::sync::LazyLock; -use vegafusion_common::error::Result; +use vegafusion_common::error::{Result, VegaFusionError}; /// Parsed URL representation passed to resolvers during the scan phase. /// All fields are populated from the fully-resolved URL (after base URL /// resolution and hash-stripping). 
Resolvers pattern-match on these fields /// rather than doing their own URL string parsing. +#[derive(Clone, Debug, PartialEq)] pub struct ParsedUrl { /// Original URL string (after base URL resolution and hash-stripping) pub url: String, @@ -25,8 +29,53 @@ pub struct ParsedUrl { pub parse: Option, } +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum AllowedBaseUrlPattern { + Any, + Scheme(String), + Prefix(String), + WildcardHost { + scheme: String, + host_suffix: String, + path_prefix: String, + }, + FilePathPrefix(PathBuf), +} + static URL_SCHEME_RE: LazyLock = LazyLock::new(|| Regex::new(r"^(//|[a-zA-Z][a-zA-Z0-9+.\-]*://)").unwrap()); +static SCHEME_PATTERN_RE: LazyLock = + LazyLock::new(|| Regex::new(r"^[a-zA-Z][a-zA-Z0-9+.\-]*:$").unwrap()); +static WILDCARD_HOST_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"^([a-zA-Z][a-zA-Z0-9+.\-]*)://\*\.([^/?#]+)(/[^?#]*)?$").unwrap() +}); + +#[cfg(not(target_arch = "wasm32"))] +fn normalize_file_base_url(base_url: String) -> Result { + let parsed = match url::Url::parse(&base_url) { + Ok(parsed) => parsed, + Err(_) => return Ok(base_url), + }; + + if parsed.scheme() != "file" { + return Ok(base_url); + } + + let Ok(path) = parsed.to_file_path() else { + return Ok(base_url); + }; + + if path.is_dir() && !base_url.ends_with('/') { + Ok(format!("{base_url}/")) + } else { + Ok(base_url) + } +} + +#[cfg(target_arch = "wasm32")] +fn normalize_file_base_url(base_url: String) -> Result { + Ok(base_url) +} /// Returns true if the string is already a URL (has a scheme per RFC 3986) /// or is scheme-relative (starts with //). @@ -55,13 +104,13 @@ pub fn normalize_base_url(base: String) -> Result { // Protocol-relative URL — prepend https: so url::Url::parse works Ok(format!("https:{base}")) } else if has_url_scheme(&base) { - Ok(base) + normalize_file_base_url(base) } else if is_absolute_path(&base) { - path_to_file_url(&base) + normalize_file_base_url(path_to_file_url(&base)?) 
} else { - Err(vegafusion_common::error::VegaFusionError::specification( - format!("data_base_url must be absolute (scheme URL or absolute path), got: {base}"), - )) + Err(VegaFusionError::specification(format!( + "base_url must be absolute (scheme URL or absolute path), got: {base}" + ))) } } @@ -73,12 +122,7 @@ pub fn path_to_file_url(path: &str) -> Result { let p = std::path::Path::new(&normalized); url::Url::from_file_path(p) .map(|u| u.to_string()) - .map_err(|_| { - vegafusion_common::error::VegaFusionError::specification(format!( - "Cannot convert path to file URL: {}", - p.display() - )) - }) + .map_err(|_| VegaFusionError::specification(format!("Cannot convert path to file URL: {}", p.display()))) } /// Browser-wasm fallback: `url::Url::from_file_path` is unavailable on @@ -95,11 +139,227 @@ pub fn path_to_file_url(path: &str) -> Result { Ok(format!("file://{normalized}")) } +#[cfg(not(target_arch = "wasm32"))] +pub fn file_url_to_path(url: &str) -> Result { + let parsed = url::Url::parse(url) + .map_err(|e| VegaFusionError::specification(format!("Invalid file URL '{url}': {e}")))?; + parsed + .to_file_path() + .map_err(|_| VegaFusionError::specification(format!("Cannot convert file URL to path: {url}"))) +} + +#[cfg(target_arch = "wasm32")] +pub fn file_url_to_path(url: &str) -> Result { + Err(VegaFusionError::specification(format!( + "Cannot convert file URL to path on wasm target: {url}" + ))) +} + +#[cfg(not(target_arch = "wasm32"))] +fn portable_canonicalize(path: &Path) -> Result { + fs::canonicalize(path).map_err(|e| { + VegaFusionError::specification(format!("Failed to resolve path {}: {e}", path.display())) + }) +} + +#[cfg(target_arch = "wasm32")] +fn portable_canonicalize(path: &Path) -> Result { + Err(VegaFusionError::specification(format!( + "Cannot canonicalize path on wasm target: {}", + path.display() + ))) +} + +pub fn canonicalize_path_for_policy_check(path: &Path) -> Result { + if path.exists() { + return portable_canonicalize(path); + } 
+ + let parent = path.parent().unwrap_or_else(|| Path::new(".")); + let canonical_parent = portable_canonicalize(parent)?; + let Some(file_name) = path.file_name() else { + return Err(VegaFusionError::specification(format!( + "Failed to resolve local path {}: missing file name", + path.display() + ))); + }; + Ok(canonical_parent.join(file_name)) +} + +fn normalize_url_prefix(mut normalized: String) -> String { + if !normalized.ends_with('/') { + normalized.push('/'); + } + normalized +} + +pub fn normalize_allowed_base_urls( + allowed_base_urls: Option>, +) -> Result>> { + allowed_base_urls + .map(|urls| { + urls.into_iter() + .map(|url| normalize_allowed_base_url(&url)) + .collect::>>() + }) + .transpose() +} + +pub fn normalize_allowed_base_url(allowed_base_url: &str) -> Result { + if allowed_base_url == "*" { + return Ok(AllowedBaseUrlPattern::Any); + } + + if SCHEME_PATTERN_RE.is_match(allowed_base_url) { + return Ok(AllowedBaseUrlPattern::Scheme( + allowed_base_url[..allowed_base_url.len() - 1].to_ascii_lowercase(), + )); + } + + if is_absolute_path(allowed_base_url) || allowed_base_url.starts_with("file:///") { + let path = if allowed_base_url.starts_with("file:///") { + file_url_to_path(allowed_base_url)? 
+ } else { + PathBuf::from(allowed_base_url) + }; + let canonical = portable_canonicalize(&path)?; + if !canonical.is_dir() { + return Err(VegaFusionError::specification(format!( + "Filesystem path in allowed_base_urls must be a directory: {}", + canonical.display() + ))); + } + return Ok(AllowedBaseUrlPattern::FilePathPrefix(canonical)); + } + + if let Some(captures) = WILDCARD_HOST_RE.captures(allowed_base_url) { + let scheme = captures + .get(1) + .unwrap() + .as_str() + .to_ascii_lowercase(); + let host_suffix = captures + .get(2) + .unwrap() + .as_str() + .to_ascii_lowercase(); + if host_suffix.is_empty() || host_suffix.contains('@') || host_suffix.contains(':') { + return Err(VegaFusionError::specification(format!( + "Invalid wildcard host pattern in allowed_base_urls: {allowed_base_url}" + ))); + } + let path_prefix = normalize_url_prefix( + captures + .get(3) + .map(|m| m.as_str().to_string()) + .unwrap_or_else(|| "/".to_string()), + ); + return Ok(AllowedBaseUrlPattern::WildcardHost { + scheme, + host_suffix, + path_prefix, + }); + } + + let parsed_url = url::Url::parse(allowed_base_url).map_err(|e| { + VegaFusionError::specification(format!( + "Invalid allowed_base_url '{allowed_base_url}': {e}" + )) + })?; + + if !parsed_url.username().is_empty() || parsed_url.password().is_some() { + return Err(VegaFusionError::specification(format!( + "allowed_base_url cannot include userinfo credentials: {allowed_base_url}" + ))); + } + + if parsed_url.query().is_some() { + return Err(VegaFusionError::specification(format!( + "allowed_base_url cannot include a query component: {allowed_base_url}" + ))); + } + + if parsed_url.fragment().is_some() { + return Err(VegaFusionError::specification(format!( + "allowed_base_url cannot include a fragment component: {allowed_base_url}" + ))); + } + + Ok(AllowedBaseUrlPattern::Prefix(normalize_url_prefix( + parsed_url.to_string(), + ))) +} + +fn url_to_local_path(url: &str) -> Result { + if url.starts_with("file://") { + 
file_url_to_path(url) + } else if is_absolute_path(url) { + Ok(PathBuf::from(url)) + } else { + Err(VegaFusionError::specification(format!( + "Expected local file path or file URL, got: {url}" + ))) + } +} + +pub fn is_url_allowed(url: &str, allowed_base_urls: &[AllowedBaseUrlPattern]) -> bool { + let parsed_url = url::Url::parse(url).ok(); + + allowed_base_urls.iter().any(|pattern| match pattern { + AllowedBaseUrlPattern::Any => true, + AllowedBaseUrlPattern::Scheme(scheme) => parsed_url + .as_ref() + .map(|parsed| parsed.scheme().eq_ignore_ascii_case(scheme)) + .unwrap_or(false), + AllowedBaseUrlPattern::Prefix(prefix) => parsed_url + .as_ref() + .map(|parsed| parsed.as_str().starts_with(prefix)) + .unwrap_or(false), + AllowedBaseUrlPattern::WildcardHost { + scheme, + host_suffix, + path_prefix, + } => parsed_url + .as_ref() + .and_then(|parsed| { + parsed.host_str().map(|host| { + parsed.scheme().eq_ignore_ascii_case(scheme) + && (host.eq_ignore_ascii_case(host_suffix) + || host + .to_ascii_lowercase() + .ends_with(&format!(".{host_suffix}"))) + && parsed.path().starts_with(path_prefix) + }) + }) + .unwrap_or(false), + AllowedBaseUrlPattern::FilePathPrefix(prefix) => url_to_local_path(url) + .and_then(|path| canonicalize_path_for_policy_check(&path)) + .map(|path| path.starts_with(prefix)) + .unwrap_or(false), + }) +} + +pub fn check_url_allowed( + url: &str, + allowed_base_urls: &Option>, +) -> Result<()> { + if allowed_base_urls + .as_ref() + .map(|patterns| is_url_allowed(url, patterns)) + .unwrap_or(true) + { + Ok(()) + } else { + Err(VegaFusionError::specification(format!( + "URL or path '{url}' blocked by allowed_base_urls. Add the URL prefix to allowed_base_urls or change base_url." + ))) + } +} + /// Resolve a spec URL against a base URL. This is the shared function used by /// both plan-time resolution (MakeTasksVisitor for Url::String) and eval-time /// resolution (DataUrlTask::eval for Url::Expr). 
-pub fn resolve_url(url: &str, data_base_url: &Option) -> Result { - // Future: This is the natural place for a URL permissions layer +pub fn resolve_url(url: &str, base_url: &Option) -> Result { if url.starts_with("//") { // Protocol-relative URL — prepend https: so downstream parsers work Ok(format!("https:{url}")) @@ -109,23 +369,23 @@ pub fn resolve_url(url: &str, data_base_url: &Option) -> Result path_to_file_url(url) } else { // Relative path — resolve against base URL using RFC 3986 joining - match data_base_url { + match base_url { Some(base) => { let base_url = url::Url::parse(base).map_err(|e| { - vegafusion_common::error::VegaFusionError::specification(format!( + VegaFusionError::specification(format!( "Invalid base URL '{base}': {e}" )) })?; let resolved = base_url.join(url).map_err(|e| { - vegafusion_common::error::VegaFusionError::specification(format!( + VegaFusionError::specification(format!( "Cannot resolve '{url}' against base '{base}': {e}" )) })?; Ok(resolved.to_string()) } - None => Err(vegafusion_common::error::VegaFusionError::specification( - format!("Relative URL with no base URL configured: {url}"), - )), + None => Err(VegaFusionError::specification(format!( + "Relative URL with no base_url configured: {url}" + ))), } } } @@ -256,6 +516,14 @@ mod tests { assert!(result.is_err()); } + #[test] + #[cfg(not(target_os = "windows"))] + fn test_normalize_base_url_existing_directory_adds_trailing_slash() { + let tempdir = tempfile::tempdir().unwrap(); + let result = normalize_base_url(tempdir.path().to_str().unwrap().to_string()).unwrap(); + assert!(result.ends_with('/'), "expected trailing slash, got {result}"); + } + #[test] fn test_resolve_url_scheme_passthrough() { let base = Some("https://cdn.example.com/".to_string()); @@ -314,4 +582,121 @@ mod tests { "https://proxy.com/fetch?target=http://evil.com/data" ); } + + #[test] + fn test_normalize_allowed_base_url_star() { + assert_eq!( + normalize_allowed_base_url("*").unwrap(), + 
AllowedBaseUrlPattern::Any + ); + } + + #[test] + fn test_normalize_allowed_base_url_generic_scheme() { + assert_eq!( + normalize_allowed_base_url("s3:").unwrap(), + AllowedBaseUrlPattern::Scheme("s3".to_string()) + ); + } + + #[test] + fn test_normalize_allowed_base_url_prefix() { + assert_eq!( + normalize_allowed_base_url("https://example.com/data").unwrap(), + AllowedBaseUrlPattern::Prefix("https://example.com/data/".to_string()) + ); + } + + #[test] + fn test_normalize_allowed_base_url_wildcard_host() { + assert_eq!( + normalize_allowed_base_url("https://*.example.com/data").unwrap(), + AllowedBaseUrlPattern::WildcardHost { + scheme: "https".to_string(), + host_suffix: "example.com".to_string(), + path_prefix: "/data/".to_string(), + } + ); + } + + #[test] + #[cfg(not(target_os = "windows"))] + fn test_normalize_allowed_base_url_filesystem_root() { + let tempdir = tempfile::tempdir().unwrap(); + let normalized = normalize_allowed_base_url(tempdir.path().to_str().unwrap()).unwrap(); + assert_eq!( + normalized, + AllowedBaseUrlPattern::FilePathPrefix(fs::canonicalize(tempdir.path()).unwrap()) + ); + } + + #[test] + fn test_normalize_allowed_base_url_rejects_query() { + assert!(normalize_allowed_base_url("https://example.com/data?q=1").is_err()); + } + + #[test] + fn test_is_url_allowed_generic_scheme() { + let patterns = vec![normalize_allowed_base_url("myproto:").unwrap()]; + assert!(is_url_allowed("myproto://warehouse/sales", &patterns)); + assert!(!is_url_allowed("otherproto://warehouse/sales", &patterns)); + } + + #[test] + fn test_is_url_allowed_prefix() { + let patterns = vec![normalize_allowed_base_url("https://example.com/data/").unwrap()]; + assert!(is_url_allowed("https://example.com/data/cars.json", &patterns)); + assert!(!is_url_allowed("https://example.com/other/cars.json", &patterns)); + } + + #[test] + fn test_is_url_allowed_wildcard_host() { + let patterns = vec![normalize_allowed_base_url("https://*.example.com/data/").unwrap()]; + 
assert!(is_url_allowed("https://example.com/data/cars.json", &patterns)); + assert!(is_url_allowed( + "https://cdn.example.com/data/cars.json", + &patterns + )); + assert!(!is_url_allowed( + "https://example.com.evil.com/data/cars.json", + &patterns + )); + assert!(!is_url_allowed("https://cdn.example.com/other/cars.json", &patterns)); + } + + #[test] + #[cfg(not(target_os = "windows"))] + fn test_is_url_allowed_filesystem_canonicalization() { + let root = tempfile::tempdir().unwrap(); + let nested = root.path().join("nested"); + std::fs::create_dir_all(&nested).unwrap(); + let file_path = nested.join("data.json"); + std::fs::write(&file_path, "{}").unwrap(); + + let patterns = vec![normalize_allowed_base_url(root.path().to_str().unwrap()).unwrap()]; + assert!(is_url_allowed(&format!("file://{}", file_path.display()), &patterns)); + } + + #[test] + #[cfg(not(target_os = "windows"))] + fn test_is_url_allowed_rejects_parent_traversal() { + let root = tempfile::tempdir().unwrap(); + let allowed = root.path().join("allowed"); + std::fs::create_dir_all(&allowed).unwrap(); + let outside = root.path().join("outside"); + std::fs::create_dir_all(&outside).unwrap(); + let file_path = allowed.join("../outside/data.json"); + + let patterns = vec![normalize_allowed_base_url(allowed.to_str().unwrap()).unwrap()]; + assert!(!is_url_allowed(&format!("file://{}", file_path.display()), &patterns)); + } + + #[test] + #[cfg(not(target_os = "windows"))] + fn test_file_url_to_path_roundtrip() { + let path = "/tmp/my data/file.csv"; + let url = path_to_file_url(path).unwrap(); + let roundtrip = file_url_to_path(&url).unwrap(); + assert_eq!(roundtrip, PathBuf::from(path)); + } } diff --git a/vegafusion-core/src/runtime/mod.rs b/vegafusion-core/src/runtime/mod.rs index d827cfdc8..15578fb7f 100644 --- a/vegafusion-core/src/runtime/mod.rs +++ b/vegafusion-core/src/runtime/mod.rs @@ -1,6 +1,8 @@ mod runtime; pub use crate::data::url::{ - has_url_scheme, is_absolute_path, normalize_base_url, 
path_to_file_url, resolve_url, ParsedUrl, + canonicalize_path_for_policy_check, check_url_allowed, file_url_to_path, has_url_scheme, + is_absolute_path, is_url_allowed, normalize_allowed_base_url, normalize_allowed_base_urls, + normalize_base_url, path_to_file_url, resolve_url, AllowedBaseUrlPattern, ParsedUrl, }; pub use runtime::{PreTransformExtractTable, VegaFusionRuntimeTrait}; diff --git a/vegafusion-python/src/lib.rs b/vegafusion-python/src/lib.rs index 882cac4be..940b6014e 100644 --- a/vegafusion-python/src/lib.rs +++ b/vegafusion-python/src/lib.rs @@ -67,11 +67,12 @@ impl PyVegaFusionRuntime { worker_threads: Option, resolvers: Vec>, use_current_thread: bool, - data_base_url: Option<&Bound>, + base_url: Option<&Bound>, + allowed_base_urls: Option>, ) -> PyResult { initialize_logging(); - let data_base_url_setting = parse_data_base_url(data_base_url)?; + let base_url_setting = parse_base_url(base_url)?; let tokio_runtime_connection = if use_current_thread { tokio::runtime::Builder::new_current_thread() @@ -95,37 +96,36 @@ impl PyVegaFusionRuntime { runtime: Arc::new(VegaFusionRuntime::new(VegaFusionRuntimeOpts { cache: Some(VegaFusionCache::new(max_capacity, memory_limit)), plan_resolvers: resolvers, - data_base_url: data_base_url_setting, + base_url: base_url_setting, + allowed_base_urls, })?), tokio_runtime: Arc::new(tokio_runtime_connection), }) } } -/// Parse Python `data_base_url` argument into `DataBaseUrlSetting`. +/// Parse Python `base_url` argument into `BaseUrlSetting`. 
/// -/// - `None` or `True` -> `DataBaseUrlSetting::Default` (CDN) -/// - `str` -> `DataBaseUrlSetting::Custom(s)` -/// - `False` -> `DataBaseUrlSetting::Disabled` -fn parse_data_base_url( +/// - `None` or `True` -> `BaseUrlSetting::Default` (CDN) +/// - `str` -> `BaseUrlSetting::Custom(s)` +/// - `False` -> `BaseUrlSetting::Disabled` +fn parse_base_url( value: Option<&Bound>, -) -> PyResult { - use vegafusion_runtime::data::pipeline::DataBaseUrlSetting; +) -> PyResult { + use vegafusion_runtime::data::pipeline::BaseUrlSetting; match value { - None => Ok(DataBaseUrlSetting::Default), + None => Ok(BaseUrlSetting::Default), Some(obj) => { if let Ok(b) = obj.extract::() { if b { - Ok(DataBaseUrlSetting::Default) + Ok(BaseUrlSetting::Default) } else { - Ok(DataBaseUrlSetting::Disabled) + Ok(BaseUrlSetting::Disabled) } } else if let Ok(s) = obj.extract::() { - Ok(DataBaseUrlSetting::Custom(s)) + Ok(BaseUrlSetting::Custom(s)) } else { - Err(PyValueError::new_err( - "data_base_url must be a str, bool, or None", - )) + Err(PyValueError::new_err("base_url must be a str, bool, or None")) } } } @@ -134,12 +134,13 @@ fn parse_data_base_url( #[pymethods] impl PyVegaFusionRuntime { #[staticmethod] - #[pyo3(signature = (max_capacity=None, memory_limit=None, worker_threads=None, data_base_url=None))] + #[pyo3(signature = (max_capacity=None, memory_limit=None, worker_threads=None, base_url=None, allowed_base_urls=None))] pub fn new_embedded( max_capacity: Option, memory_limit: Option, worker_threads: Option, - data_base_url: Option<&Bound>, + base_url: Option<&Bound>, + allowed_base_urls: Option>, ) -> PyResult { Self::build_with_resolvers( max_capacity, @@ -147,18 +148,20 @@ impl PyVegaFusionRuntime { worker_threads, Vec::new(), false, - data_base_url, + base_url, + allowed_base_urls, ) } #[staticmethod] - #[pyo3(signature = (py_resolvers, max_capacity=None, memory_limit=None, worker_threads=None, data_base_url=None))] + #[pyo3(signature = (py_resolvers, max_capacity=None, 
memory_limit=None, worker_threads=None, base_url=None, allowed_base_urls=None))] pub fn new_with_resolvers( py_resolvers: Vec>, max_capacity: Option, memory_limit: Option, worker_threads: Option, - data_base_url: Option<&Bound>, + base_url: Option<&Bound>, + allowed_base_urls: Option>, ) -> PyResult { let py_resolvers: Vec = py_resolvers .into_iter() @@ -178,7 +181,8 @@ impl PyVegaFusionRuntime { worker_threads, resolvers, use_current_thread, - data_base_url, + base_url, + allowed_base_urls, ) } diff --git a/vegafusion-python/tests/test_runtime_config.py b/vegafusion-python/tests/test_runtime_config.py new file mode 100644 index 000000000..424ed3a5c --- /dev/null +++ b/vegafusion-python/tests/test_runtime_config.py @@ -0,0 +1,109 @@ +from __future__ import annotations + +import pytest + +import vegafusion as vf +import vegafusion._vegafusion as _core + + +def test_runtime_exposes_url_policy_properties() -> None: + rt = vf.VegaFusionRuntime( + memory_limit=1, + worker_threads=1, + base_url="https://example.com/data/", + allowed_base_urls=["https://example.com/data/"], + ) + + assert rt.base_url == "https://example.com/data/" + assert rt.allowed_base_urls == ["https://example.com/data/"] + + +def test_runtime_passes_url_policy_to_embedded_runtime(monkeypatch: pytest.MonkeyPatch) -> None: + calls: list[dict[str, object]] = [] + + class FakeRuntime: + def clear_cache(self) -> None: + return None + + class FakePyVegaFusionRuntime: + @staticmethod + def new_embedded( + cache_capacity: int, + memory_limit: int, + worker_threads: int, + base_url: str | bool | None = None, + allowed_base_urls: list[str] | None = None, + ) -> FakeRuntime: + calls.append( + { + "cache_capacity": cache_capacity, + "memory_limit": memory_limit, + "worker_threads": worker_threads, + "base_url": base_url, + "allowed_base_urls": allowed_base_urls, + } + ) + return FakeRuntime() + + monkeypatch.setattr(_core, "PyVegaFusionRuntime", FakePyVegaFusionRuntime) + + rt = vf.VegaFusionRuntime( + 
cache_capacity=8, + memory_limit=256, + worker_threads=2, + base_url=False, + allowed_base_urls=["file:///tmp/allowed/"], + ) + + _ = rt.runtime + + assert calls == [ + { + "cache_capacity": 8, + "memory_limit": 256, + "worker_threads": 2, + "base_url": False, + "allowed_base_urls": ["file:///tmp/allowed/"], + } + ] + + +def test_grpc_connect_rejects_local_url_policy() -> None: + rt = vf.VegaFusionRuntime(base_url=False) + + with pytest.raises(ValueError, match="base_url or allowed_base_urls"): + rt.grpc_connect("http://127.0.0.1:50051") + + rt = vf.VegaFusionRuntime(allowed_base_urls=[]) + + with pytest.raises(ValueError, match="base_url or allowed_base_urls"): + rt.grpc_connect("http://127.0.0.1:50051") + + +def test_url_policy_setters_reject_changes_while_using_grpc( + monkeypatch: pytest.MonkeyPatch, +) -> None: + calls: list[str] = [] + + class FakeRuntime: + def clear_cache(self) -> None: + calls.append("clear_cache") + + class FakePyVegaFusionRuntime: + @staticmethod + def new_grpc(url: str) -> FakeRuntime: + calls.append(url) + return FakeRuntime() + + monkeypatch.setattr(_core, "PyVegaFusionRuntime", FakePyVegaFusionRuntime) + + rt = vf.VegaFusionRuntime() + rt.grpc_connect("http://127.0.0.1:50051") + + with pytest.raises(ValueError, match="vegafusion-server"): + rt.base_url = False + + with pytest.raises(ValueError, match="vegafusion-server"): + rt.allowed_base_urls = [] + + assert calls == ["http://127.0.0.1:50051"] diff --git a/vegafusion-python/vegafusion/runtime.py b/vegafusion-python/vegafusion/runtime.py index e0e86cfdf..2d2f4a201 100644 --- a/vegafusion-python/vegafusion/runtime.py +++ b/vegafusion-python/vegafusion/runtime.py @@ -207,7 +207,8 @@ def __init__( | list[PlanResolver] | tuple[PlanResolver, ...] | None = None, - data_base_url: str | bool | None = None, + base_url: str | bool | None = None, + allowed_base_urls: list[str] | None = None, ) -> None: """ Initialize a VegaFusionRuntime. 
@@ -221,11 +222,17 @@ def __init__( Can be a single resolver or a list of resolvers that form a pipeline (executed in order; short-circuits on first Table result). - data_base_url: Base URL for resolving relative data URLs. + base_url: Base URL for resolving relative data URLs. - None or True: use the default CDN (https://raw.githubusercontent.com/vega/vega-datasets/v2.3.0/) - str: custom base URL (scheme URL or absolute path) - False: disabled; relative paths produce an error + allowed_base_urls: Optional allowlist for external data access. + - None: unrestricted for embedded VegaFusion runtimes + - []: deny all external data access + - list[str]: allow matching URL/path patterns only + Policy checks apply to the initial resolved URL only; redirect + destinations are not re-checked after a fetch begins. """ self._runtime = None self._grpc_url: str | None = None @@ -233,7 +240,18 @@ def __init__( self._memory_limit = memory_limit self._worker_threads = worker_threads self._plan_resolvers = _normalize_resolvers(plan_resolver) - self._data_base_url = data_base_url + self._base_url = base_url + self._allowed_base_urls = allowed_base_urls + + def _has_non_default_url_policy(self) -> bool: + return self._base_url not in (None, True) or self._allowed_base_urls is not None + + def _ensure_not_using_grpc_for_url_policy_change(self) -> None: + if self._grpc_url is not None: + raise ValueError( + "Cannot change base_url or allowed_base_urls while using a gRPC runtime. " + "Configure these on the vegafusion-server process instead." 
+ ) @property def runtime(self) -> PyVegaFusionRuntime: @@ -247,6 +265,10 @@ def runtime(self) -> PyVegaFusionRuntime: # Try to initialize a VegaFusion runtime from vegafusion._vegafusion import PyVegaFusionRuntime + if self._grpc_url is not None: + self._runtime = PyVegaFusionRuntime.new_grpc(self._grpc_url) + return self._runtime + if self.memory_limit is None: self.memory_limit = get_virtual_memory() // 2 if self.worker_threads is None: @@ -258,14 +280,16 @@ def runtime(self) -> PyVegaFusionRuntime: self.cache_capacity, self.memory_limit, self.worker_threads, - data_base_url=self._data_base_url, + base_url=self._base_url, + allowed_base_urls=self._allowed_base_urls, ) else: self._runtime = PyVegaFusionRuntime.new_embedded( self.cache_capacity, self.memory_limit, self.worker_threads, - data_base_url=self._data_base_url, + base_url=self._base_url, + allowed_base_urls=self._allowed_base_urls, ) return self._runtime @@ -282,6 +306,11 @@ def grpc_connect(self, url: str) -> None: "Plan resolvers run locally and are not supported " "with remote gRPC runtimes." ) + if self._has_non_default_url_policy(): + raise ValueError( + "Cannot use grpc_connect with local base_url or allowed_base_urls settings. " + "Configure URL policy on the vegafusion-server process instead." + ) from vegafusion._vegafusion import PyVegaFusionRuntime @@ -893,19 +922,19 @@ def cache_capacity(self, value: int) -> None: self.reset() @property - def data_base_url(self) -> str | bool | None: + def base_url(self) -> str | bool | None: """ - Get the data base URL setting. + Get the base URL setting. Returns: - The current data_base_url setting. + The current base_url setting. """ - return self._data_base_url + return self._base_url - @data_base_url.setter - def data_base_url(self, value: str | bool | None) -> None: + @base_url.setter + def base_url(self, value: str | bool | None) -> None: """ - Set the data base URL and restart the runtime. + Set the base URL and restart the runtime. 
Args: value: Base URL for resolving relative data URLs. @@ -913,8 +942,33 @@ def data_base_url(self, value: str | bool | None) -> None: - str: custom base URL - False: disabled """ - if value != self._data_base_url: - self._data_base_url = value + if value != self._base_url: + self._ensure_not_using_grpc_for_url_policy_change() + self._base_url = value + self.reset() + + @property + def allowed_base_urls(self) -> list[str] | None: + """ + Get the allowed_base_urls setting. + + Returns: + The current allowed_base_urls setting. + """ + return self._allowed_base_urls + + @allowed_base_urls.setter + def allowed_base_urls(self, value: list[str] | None) -> None: + """ + Set the external data allowlist and restart the runtime. + + Args: + value: None for unrestricted embedded access, [] to deny all external + access, or a list of URL/path patterns to allow. + """ + if value != self._allowed_base_urls: + self._ensure_not_using_grpc_for_url_policy_change() + self._allowed_base_urls = value self.reset() @property diff --git a/vegafusion-runtime/src/data/pipeline.rs b/vegafusion-runtime/src/data/pipeline.rs index a9be7db9a..389c20a03 100644 --- a/vegafusion-runtime/src/data/pipeline.rs +++ b/vegafusion-runtime/src/data/pipeline.rs @@ -16,13 +16,13 @@ use super::datafusion_resolver::DataFusionResolver; use super::external_table::ExternalTableProvider; use super::plan_resolver::PlanResolver; -/// CDN base URL for vega-datasets, used as the default data_base_url. +/// CDN base URL for vega-datasets, used as the default base_url. pub const VEGA_DATASETS_CDN_BASE: &str = "https://raw.githubusercontent.com/vega/vega-datasets/v2.3.0/"; /// Three-state base URL setting for public API boundaries. 
#[derive(Clone, Debug, Default)] -pub enum DataBaseUrlSetting { +pub enum BaseUrlSetting { /// Use the default CDN base URL (vega-datasets) #[default] Default, @@ -32,13 +32,13 @@ pub enum DataBaseUrlSetting { Custom(String), } -/// Map a `DataBaseUrlSetting` to the two-state `Option` used internally. +/// Map a `BaseUrlSetting` to the two-state `Option` used internally. /// Custom base URLs are normalized (bare absolute paths become file:// URLs). -pub fn resolve_data_base_url(setting: &DataBaseUrlSetting) -> Result> { +pub fn resolve_base_url(setting: &BaseUrlSetting) -> Result> { match setting { - DataBaseUrlSetting::Default => Ok(Some(VEGA_DATASETS_CDN_BASE.to_string())), - DataBaseUrlSetting::Disabled => Ok(None), - DataBaseUrlSetting::Custom(s) => Ok(Some(normalize_base_url(s.clone())?)), + BaseUrlSetting::Default => Ok(Some(VEGA_DATASETS_CDN_BASE.to_string())), + BaseUrlSetting::Disabled => Ok(None), + BaseUrlSetting::Custom(s) => Ok(Some(normalize_base_url(s.clone())?)), } } diff --git a/vegafusion-runtime/src/data/tasks.rs b/vegafusion-runtime/src/data/tasks.rs index b21d42e3c..4c0e20106 100644 --- a/vegafusion-runtime/src/data/tasks.rs +++ b/vegafusion-runtime/src/data/tasks.rs @@ -24,12 +24,13 @@ use datafusion_common::config::TableOptions; use datafusion_functions::expr_fn::make_date; use vegafusion_common::data::scalar::{ScalarValue, ScalarValueHelpers}; -use vegafusion_common::error::{Result, ResultWithContext, VegaFusionError}; +use vegafusion_common::error::{Result, ResultWithContext, ToExternalError, VegaFusionError}; use vegafusion_core::proto::gen::tasks::data_url_task::Url; use vegafusion_core::proto::gen::tasks::scan_url_format; use vegafusion_core::proto::gen::tasks::scan_url_format::Parse; use vegafusion_core::proto::gen::tasks::{DataSourceTask, DataUrlTask, DataValuesTask}; +use vegafusion_core::runtime::{check_url_allowed, file_url_to_path, path_to_file_url}; use vegafusion_core::task_graph::task::{InputVariable, TaskDependencies}; use 
vegafusion_core::task_graph::task_value::TaskValue; @@ -54,7 +55,7 @@ use object_store::{http::HttpBuilder, ClientOptions}; use tokio::io::AsyncReadExt; #[cfg(feature = "parquet")] -use {datafusion::prelude::ParquetReadOptions, vegafusion_common::error::ToExternalError}; +use datafusion::prelude::ParquetReadOptions; #[cfg(target_arch = "wasm32")] use object_store_wasm::HttpStore; @@ -137,12 +138,12 @@ impl TaskCall for DataUrlTask { // Build url string — resolve at eval time for both static and signal URLs let url = match self.url.as_ref().unwrap() { - Url::String(url) => vegafusion_core::runtime::resolve_url(url, &ctx.data_base_url)?, + Url::String(url) => vegafusion_core::runtime::resolve_url(url, &ctx.base_url)?, Url::Expr(expr) => { let compiled = compile(expr, &config, None).await?; let url_scalar = compiled.eval_to_scalar()?; let raw_url = url_scalar.to_scalar_string()?; - vegafusion_core::runtime::resolve_url(&raw_url, &ctx.data_base_url)? + vegafusion_core::runtime::resolve_url(&raw_url, &ctx.base_url)? } }; @@ -167,6 +168,10 @@ impl TaskCall for DataUrlTask { .as_ref() .and_then(|name| ctx.inline_datasets.get(name)); + if inline_name.is_none() { + check_url_allowed(&url, &ctx.allowed_base_urls)?; + } + let df = if let Some(inline_name) = &inline_name { if let Some(inline_dataset) = inline_dataset_info { match inline_dataset { @@ -264,6 +269,26 @@ fn build_parsed_url( }) } +#[cfg(feature = "http")] +async fn fetch_http_bytes(url: &str) -> Result> { + let client = reqwest::Client::new(); + let response = client + .get(url) + .send() + .await + .external(format!("Failed to fetch URL: {url}"))?; + + let response = response + .error_for_status() + .external(format!("Failed to fetch URL: {url}"))?; + + Ok(response + .bytes() + .await + .external("Failed to read response bytes")? 
+ .to_vec()) +} + /// After processing, all datetime columns are converted to Timestamptz and Date32 async fn process_datetimes( parse: &Option, @@ -570,18 +595,8 @@ async fn read_csv_with_reqwest( is_tsv: bool, ext: &str, ) -> Result { - // Fetch CSV content using reqwest - let client = reqwest::Client::new(); - let response = client - .get(url) - .send() - .await - .external(format!("Failed to fetch URL: {url}"))?; - - let text = response - .text() - .await - .external("Failed to read response as text")?; + let bytes = fetch_http_bytes(url).await?; + let text: Cow = String::from_utf8_lossy(&bytes); // Create a temporary file to store the CSV content use std::io::Write; @@ -592,7 +607,7 @@ async fn read_csv_with_reqwest( temp_file.sync_all()?; // Read the CSV from the temporary file - let temp_url = format!("file://{}", temp_path.display()); + let temp_url = path_to_file_url(temp_path.to_str().unwrap())?; // Build CSV options let mut csv_opts = if is_tsv { @@ -726,79 +741,83 @@ pub(crate) async fn build_csv_schema( Ok(Schema::new(new_fields)) } -pub(crate) async fn read_json(url: &str, ctx: Arc) -> Result { - let value: serde_json::Value = - if let Some(base_url) = maybe_register_object_stores_for_url(&ctx, url)? { - // Create single use object store that points directly to file - let store = ctx.runtime_env().object_store(&base_url)?; - let child_url = url.strip_prefix(&base_url.to_string()).unwrap(); - match store.get(&child_url.into()).await { - Ok(get_res) => { - let bytes = get_res.bytes().await?.to_vec(); - let text: Cow = String::from_utf8_lossy(&bytes); - serde_json::from_str(text.as_ref())? - } - Err(e) => { - cfg_if::cfg_if! { - if #[cfg(feature="http")] { - if url.starts_with("http://") || url.starts_with("https://") { - // Fallback to direct reqwest implementation. This is needed in some cases because - // the object-store http implementation has stricter requirements on what the - // server provides. For example the content-length header is required. 
- let client = reqwest::Client::new(); - let response = client - .get(url) - .send() - .await - .external(format!("Failed to fetch URL: {url}"))?; - - let text = response - .text() - .await - .external("Failed to read response as text")?; - serde_json::from_str(&text)? - } else { - return Err(VegaFusionError::from(e)); - } +async fn read_json_via_store_or_file(url: &str, ctx: Arc) -> Result { + if let Some(base_url) = maybe_register_object_stores_for_url(&ctx, url)? { + // Create single use object store that points directly to file + let store = ctx.runtime_env().object_store(&base_url)?; + let child_url = url.strip_prefix(&base_url.to_string()).unwrap(); + match store.get(&child_url.into()).await { + Ok(get_res) => { + let bytes = get_res.bytes().await?.to_vec(); + let text: Cow = String::from_utf8_lossy(&bytes); + Ok(serde_json::from_str(text.as_ref())?) + } + Err(e) => { + cfg_if::cfg_if! { + if #[cfg(feature="http")] { + if url.starts_with("http://") || url.starts_with("https://") { + let bytes = fetch_http_bytes(url).await?; + Ok(serde_json::from_slice(&bytes)?) } else { - return Err(VegaFusionError::from(e)); + Err(VegaFusionError::from(e)) } + } else { + Err(VegaFusionError::from(e)) } } } - } else { - cfg_if::cfg_if! { - if #[cfg(feature="fs")] { - // Assume local file - let mut file = tokio::fs::File::open(url) - .await - .external(format!("Failed to open as local file: {url}"))?; - - let mut json_str = String::new(); - file.read_to_string(&mut json_str) - .await - .external("Failed to read file contents to string")?; - - serde_json::from_str(&json_str)? + } + } else { + cfg_if::cfg_if! { + if #[cfg(feature="fs")] { + let local_path = if url.starts_with("file://") { + file_url_to_path(url)? 
} else { - return Err(VegaFusionError::internal( - "The `fs` feature flag must be enabled for file system support" - )); - } + std::path::PathBuf::from(url) + }; + + let mut file = tokio::fs::File::open(&local_path) + .await + .external(format!("Failed to open as local file: {}", local_path.display()))?; + + let mut json_str = String::new(); + file.read_to_string(&mut json_str) + .await + .external("Failed to read file contents to string")?; + + Ok(serde_json::from_str(&json_str)?) + } else { + Err(VegaFusionError::internal( + "The `fs` feature flag must be enabled for file system support" + )) } - }; + } + } +} + +pub(crate) async fn read_json( + url: &str, + ctx: Arc, +) -> Result { + let value: serde_json::Value = read_json_via_store_or_file(url, ctx.clone()).await?; let table = VegaFusionTable::from_json(&value)?.with_ordering()?; ctx.vegafusion_table(table).await } -pub(crate) async fn read_arrow(url: &str, ctx: Arc) -> Result { +pub(crate) async fn read_arrow( + url: &str, + ctx: Arc, +) -> Result { maybe_register_object_stores_for_url(&ctx, url)?; Ok(ctx.read_arrow(url, ArrowReadOptions::default()).await?) } #[cfg(feature = "parquet")] -pub(crate) async fn read_parquet(url: &str, ctx: Arc) -> Result { +pub(crate) async fn read_parquet( + url: &str, + ctx: Arc, +) -> Result { maybe_register_object_stores_for_url(&ctx, url)?; Ok(ctx.read_parquet(url, ParquetReadOptions::default()).await?) 
} @@ -814,10 +833,11 @@ pub(crate) fn maybe_register_object_stores_for_url( if let Some(path) = url.strip_prefix(prefix) { let Some((root, _)) = path.split_once('/') else { return Err(VegaFusionError::specification(format!( - "Invalid https URL: {url}" + "Invalid {prefix} URL: {url}" ))); }; - let base_url_str = format!("https://{root}"); + let scheme = prefix.trim_end_matches("://"); + let base_url_str = format!("{scheme}://{root}"); let base_url = url::Url::parse(&base_url_str)?; // Register store for url if not already registered diff --git a/vegafusion-runtime/src/task_graph/runtime.rs b/vegafusion-runtime/src/task_graph/runtime.rs index 1d27c165d..7c4e3d3d8 100644 --- a/vegafusion-runtime/src/task_graph/runtime.rs +++ b/vegafusion-runtime/src/task_graph/runtime.rs @@ -19,6 +19,7 @@ use vegafusion_core::proto::gen::tasks::inline_dataset::Dataset; use vegafusion_core::proto::gen::tasks::{ task::TaskKind, InlineDataset, InlineDatasetTable, NodeValueIndex, TaskGraph, }; +use vegafusion_core::runtime::{normalize_allowed_base_urls, AllowedBaseUrlPattern}; use vegafusion_core::runtime::VegaFusionRuntimeTrait; use vegafusion_core::task_graph::task_value::{MaterializedTaskValue, NamedTaskValue, TaskValue}; @@ -33,11 +34,12 @@ use { type CacheValue = (TaskValue, Vec); -use crate::data::pipeline::{resolve_data_base_url, DataBaseUrlSetting}; +use crate::data::pipeline::{resolve_base_url, BaseUrlSetting}; pub struct VegaFusionRuntimeOpts { pub plan_resolvers: Vec>, - pub data_base_url: DataBaseUrlSetting, + pub base_url: BaseUrlSetting, + pub allowed_base_urls: Option>, pub cache: Option, } @@ -45,7 +47,8 @@ impl Default for VegaFusionRuntimeOpts { fn default() -> Self { Self { plan_resolvers: Vec::new(), - data_base_url: DataBaseUrlSetting::Default, + base_url: BaseUrlSetting::Default, + allowed_base_urls: None, cache: None, } } @@ -55,19 +58,22 @@ impl Default for VegaFusionRuntimeOpts { pub struct VegaFusionRuntime { pub cache: VegaFusionCache, pub pipeline: 
ResolverPipeline, - pub data_base_url: Option, + pub base_url: Option, + pub allowed_base_urls: Option>, } impl VegaFusionRuntime { pub fn new(opts: VegaFusionRuntimeOpts) -> vegafusion_core::error::Result { let ctx = Arc::new(make_datafusion_context()); - let data_base_url = resolve_data_base_url(&opts.data_base_url)?; + let base_url = resolve_base_url(&opts.base_url)?; + let allowed_base_urls = normalize_allowed_base_urls(opts.allowed_base_urls)?; Ok(Self { cache: opts .cache .unwrap_or_else(|| VegaFusionCache::new(Some(32), None)), pipeline: ResolverPipeline::new(opts.plan_resolvers, ctx), - data_base_url, + base_url, + allowed_base_urls, }) } @@ -83,7 +89,8 @@ impl VegaFusionRuntime { tz_config: None, // overridden per-task from task.tz_config inline_datasets, pipeline: self.pipeline.clone(), - data_base_url: self.data_base_url.clone(), + base_url: self.base_url.clone(), + allowed_base_urls: self.allowed_base_urls.clone(), }; let node_value = AssertUnwindSafe(get_or_compute_node_value( task_graph, diff --git a/vegafusion-runtime/src/task_graph/task.rs b/vegafusion-runtime/src/task_graph/task.rs index aaa367166..2946c4c45 100644 --- a/vegafusion-runtime/src/task_graph/task.rs +++ b/vegafusion-runtime/src/task_graph/task.rs @@ -7,6 +7,7 @@ use vegafusion_core::data::dataset::VegaFusionDataset; use vegafusion_core::error::Result; use vegafusion_core::proto::gen::tasks::task::TaskKind; use vegafusion_core::proto::gen::tasks::Task; +use vegafusion_core::runtime::AllowedBaseUrlPattern; use vegafusion_core::task_graph::task_value::TaskValue; /// Ambient context available to all tasks during evaluation. 
@@ -15,7 +16,8 @@ pub struct TaskContext { pub tz_config: Option, pub inline_datasets: HashMap, pub pipeline: ResolverPipeline, - pub data_base_url: Option, + pub base_url: Option, + pub allowed_base_urls: Option>, } #[async_trait] diff --git a/vegafusion-runtime/tests/test_url_policy.rs b/vegafusion-runtime/tests/test_url_policy.rs new file mode 100644 index 000000000..25b83e73c --- /dev/null +++ b/vegafusion-runtime/tests/test_url_policy.rs @@ -0,0 +1,263 @@ +use async_trait::async_trait; +use datafusion::datasource::{provider_as_source, MemTable}; +use datafusion::logical_expr::{LogicalPlan, LogicalPlanBuilder}; +use serde_json::json; +use std::fs; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use tempfile::TempDir; +use vegafusion_common::arrow::array::{ArrayRef, Float64Array}; +use vegafusion_common::arrow::record_batch::RecordBatch; +use vegafusion_common::data::scalar::ScalarValueHelpers; +use vegafusion_common::error::Result; +use vegafusion_core::proto::gen::tasks::{TaskGraph, TzConfig, Variable}; +use vegafusion_core::spec::chart::ChartSpec; +use vegafusion_runtime::data::pipeline::BaseUrlSetting; +use vegafusion_runtime::data::plan_resolver::{PlanResolver, ResolutionResult}; +use vegafusion_runtime::task_graph::runtime::{VegaFusionRuntime, VegaFusionRuntimeOpts}; + +fn write_json_rows(dir: &Path, name: &str, values: &[f64]) -> PathBuf { + let path = dir.join(name); + let rows: Vec<_> = values.iter().map(|value| json!({ "x": value })).collect(); + fs::write(&path, serde_json::to_string(&rows).unwrap()).unwrap(); + path +} + +fn extent_spec(url: serde_json::Value) -> ChartSpec { + serde_json::from_value(json!({ + "$schema": "https://vega.github.io/schema/vega/v5.json", + "data": [ + { + "name": "source", + "url": url, + "format": {"type": "json"}, + }, + { + "name": "derived", + "source": "source", + "transform": [ + { + "type": "extent", + "signal": "my_extent", + "field": "x", + } + ], + } + ] + })) + .unwrap() +} + +fn 
extent_spec_with_url_signal(signal_url: &str) -> ChartSpec { + serde_json::from_value(json!({ + "$schema": "https://vega.github.io/schema/vega/v5.json", + "signals": [ + { + "name": "url", + "value": signal_url, + } + ], + "data": [ + { + "name": "source", + "url": {"signal": "url"}, + "format": {"type": "json"}, + }, + { + "name": "derived", + "source": "source", + "transform": [ + { + "type": "extent", + "signal": "my_extent", + "field": "x", + } + ], + } + ] + })) + .unwrap() +} + +async fn query_extent(runtime: &VegaFusionRuntime, spec: &ChartSpec) -> Result<[f64; 2]> { + let tz_config = TzConfig { + local_tz: "UTC".to_string(), + default_input_tz: None, + }; + let task_scope = spec.to_task_scope().unwrap(); + let tasks = spec.to_tasks(&tz_config, &Default::default()).unwrap(); + let graph = Arc::new(TaskGraph::new(tasks, &task_scope).unwrap()); + let mapping = graph.build_mapping(); + let node = mapping + .get(&(Variable::new_signal("my_extent"), Vec::new())) + .cloned() + .unwrap(); + let value = runtime + .get_node_value(graph, &node, Default::default()) + .await?; + value.as_scalar()?.to_f64x2() +} + +struct CustomSchemeResolver; + +#[async_trait] +impl PlanResolver for CustomSchemeResolver { + fn name(&self) -> &str { + "custom_scheme_resolver" + } + + async fn scan_url( + &self, + parsed_url: &vegafusion_core::runtime::ParsedUrl, + ) -> Result> { + if parsed_url.scheme != "custom" { + return Ok(None); + } + + let batch = RecordBatch::try_from_iter(vec![( + "x", + Arc::new(Float64Array::from(vec![10.0, 20.0, 30.0])) as ArrayRef, + )]) + .unwrap(); + let mem_table = MemTable::try_new(batch.schema(), vec![vec![batch]]).unwrap(); + let plan = LogicalPlanBuilder::scan( + "custom_table", + provider_as_source(Arc::new(mem_table)), + None, + ) + .unwrap() + .build() + .unwrap(); + Ok(Some(plan)) + } + + async fn resolve_plan(&self, plan: LogicalPlan) -> Result { + Ok(ResolutionResult::Plan(plan)) + } +} + +fn tempdir_str(tempdir: &TempDir) -> String { + 
tempdir.path().to_str().unwrap().to_string() +} + +#[tokio::test] +async fn test_relative_url_resolves_against_base_url_and_allowlist() { + let tempdir = tempfile::tempdir().unwrap(); + write_json_rows(tempdir.path(), "data.json", &[1.0, 2.0, 3.0]); + + let runtime = VegaFusionRuntime::new(VegaFusionRuntimeOpts { + base_url: BaseUrlSetting::Custom(tempdir_str(&tempdir)), + allowed_base_urls: Some(vec![tempdir_str(&tempdir)]), + ..Default::default() + }) + .unwrap(); + + let extent = query_extent(&runtime, &extent_spec(json!("data.json"))) + .await + .unwrap(); + assert_eq!(extent, [1.0, 3.0]); +} + +#[tokio::test] +async fn test_relative_url_fails_when_base_url_disabled() { + let tempdir = tempfile::tempdir().unwrap(); + write_json_rows(tempdir.path(), "data.json", &[1.0, 2.0, 3.0]); + + let runtime = VegaFusionRuntime::new(VegaFusionRuntimeOpts { + base_url: BaseUrlSetting::Disabled, + ..Default::default() + }) + .unwrap(); + + let err = query_extent(&runtime, &extent_spec(json!("data.json"))) + .await + .unwrap_err(); + let message = err.to_string(); + assert!( + message.contains("Relative URL with no base_url configured"), + "unexpected error: {message}" + ); +} + +#[tokio::test] +async fn test_allowed_base_urls_block_local_file_access() { + let allowed_dir = tempfile::tempdir().unwrap(); + let blocked_dir = tempfile::tempdir().unwrap(); + write_json_rows(blocked_dir.path(), "data.json", &[1.0, 2.0, 3.0]); + + let runtime = VegaFusionRuntime::new(VegaFusionRuntimeOpts { + base_url: BaseUrlSetting::Custom(tempdir_str(&blocked_dir)), + allowed_base_urls: Some(vec![tempdir_str(&allowed_dir)]), + ..Default::default() + }) + .unwrap(); + + let err = query_extent(&runtime, &extent_spec(json!("data.json"))) + .await + .unwrap_err(); + let message = err.to_string(); + assert!( + message.contains("blocked by allowed_base_urls"), + "unexpected error: {message}" + ); +} + +#[tokio::test] +async fn test_allowed_base_urls_gate_custom_scheme_resolvers() { + let runtime = 
VegaFusionRuntime::new(VegaFusionRuntimeOpts { + plan_resolvers: vec![Arc::new(CustomSchemeResolver)], + allowed_base_urls: Some(vec!["custom://allowed-host/".to_string()]), + ..Default::default() + }) + .unwrap(); + + let allowed_extent = query_extent( + &runtime, + &extent_spec(json!("custom://allowed-host/warehouse/table")), + ) + .await + .unwrap(); + assert_eq!(allowed_extent, [10.0, 30.0]); + + let err = query_extent( + &runtime, + &extent_spec(json!("custom://blocked-host/warehouse/table")), + ) + .await + .unwrap_err(); + let message = err.to_string(); + assert!( + message.contains("blocked by allowed_base_urls"), + "unexpected error: {message}" + ); +} + +#[tokio::test] +async fn test_signal_updated_urls_are_revalidated_against_policy() { + let runtime = VegaFusionRuntime::new(VegaFusionRuntimeOpts { + plan_resolvers: vec![Arc::new(CustomSchemeResolver)], + allowed_base_urls: Some(vec!["custom://allowed-host/".to_string()]), + ..Default::default() + }) + .unwrap(); + + let allowed_extent = query_extent( + &runtime, + &extent_spec_with_url_signal("custom://allowed-host/warehouse/table"), + ) + .await + .unwrap(); + assert_eq!(allowed_extent, [10.0, 30.0]); + + let err = query_extent( + &runtime, + &extent_spec_with_url_signal("custom://blocked-host/warehouse/table"), + ) + .await + .unwrap_err(); + let message = err.to_string(); + assert!( + message.contains("blocked by allowed_base_urls"), + "unexpected error: {message}" + ); +} diff --git a/vegafusion-server/Cargo.toml b/vegafusion-server/Cargo.toml index b364264d2..8f0d4124a 100644 --- a/vegafusion-server/Cargo.toml +++ b/vegafusion-server/Cargo.toml @@ -21,6 +21,9 @@ h2 = "0.4" assert_cmd = "2.0.17" predicates = "3.1.3" +[dev-dependencies.tempfile] +workspace = true + [dependencies.regex] workspace = true diff --git a/vegafusion-server/src/main.rs b/vegafusion-server/src/main.rs index fb9c80412..d0dd5f8e9 100644 --- a/vegafusion-server/src/main.rs +++ b/vegafusion-server/src/main.rs @@ -19,11 +19,12 @@ 
use vegafusion_core::proto::gen::tasks::{ use vegafusion_core::runtime::VegaFusionRuntimeTrait; use vegafusion_core::spec::chart::ChartSpec; use vegafusion_core::task_graph::graph::ScopedVariable; +use vegafusion_runtime::data::pipeline::BaseUrlSetting; use vegafusion_runtime::task_graph::runtime::{ decode_inline_datasets, VegaFusionRuntime, VegaFusionRuntimeOpts, }; -use clap::Parser; +use clap::{ArgAction, Parser}; use regex::Regex; use vegafusion_core::proto::gen::pretransform::{ PreTransformExtractDataset, PreTransformExtractRequest, PreTransformExtractResponse, @@ -349,6 +350,22 @@ struct Args { /// Include compatibility with gRPC-Web #[clap(long, num_args = 0)] pub web: bool, + + /// Base URL for resolving relative data URLs + #[clap(long, conflicts_with = "no_base_url")] + pub base_url: Option, + + /// Disable base URL resolution for relative data URLs + #[clap(long, action = ArgAction::SetTrue, conflicts_with = "base_url")] + pub no_base_url: bool, + + /// Allowlist entry for external data access. Repeat for multiple entries. 
+ #[clap(long = "allowed-base-url", action = ArgAction::Append, conflicts_with = "no_allowed_urls")] + pub allowed_base_url: Vec, + + /// Disable all external data access + #[clap(long, action = ArgAction::SetTrue, conflicts_with = "allowed_base_url")] + pub no_allowed_urls: bool, } fn main() -> Result<(), VegaFusionError> { @@ -370,6 +387,22 @@ fn main() -> Result<(), VegaFusionError> { None }; + let base_url = if args.no_base_url { + BaseUrlSetting::Disabled + } else if let Some(base_url) = args.base_url.clone() { + BaseUrlSetting::Custom(base_url) + } else { + BaseUrlSetting::Default + }; + + let allowed_base_urls = if args.no_allowed_urls { + Some(vec![]) + } else if args.allowed_base_url.is_empty() { + None + } else { + Some(args.allowed_base_url.clone()) + }; + let tokio_runtime = tokio::runtime::Builder::new_multi_thread() .enable_all() .thread_stack_size(TOKIO_THREAD_STACK_SIZE) @@ -378,6 +411,8 @@ fn main() -> Result<(), VegaFusionError> { let tg_runtime = VegaFusionRuntime::new(VegaFusionRuntimeOpts { cache: Some(VegaFusionCache::new(Some(args.capacity), memory_limit)), + base_url, + allowed_base_urls, ..Default::default() }) .expect("Failed to create VegaFusionRuntime"); diff --git a/vegafusion-server/tests/test_task_graph_runtime.rs b/vegafusion-server/tests/test_task_graph_runtime.rs index b77faffa2..dbae563e4 100644 --- a/vegafusion-server/tests/test_task_graph_runtime.rs +++ b/vegafusion-server/tests/test_task_graph_runtime.rs @@ -1,12 +1,143 @@ +use serde_json::json; +use std::fs; +use std::net::TcpListener; +use std::path::{Path, PathBuf}; +use std::process::{Child, Command, Stdio}; use std::time::Duration; +use tokio::time::sleep; use vegafusion_common::data::scalar::ScalarValueHelpers; use vegafusion_core::proto::gen::services::query_result::Response; use vegafusion_core::proto::gen::services::vega_fusion_runtime_client::VegaFusionRuntimeClient; use vegafusion_core::proto::gen::services::{query_request, QueryRequest}; use 
vegafusion_core::proto::gen::tasks::{ - NodeValueIndex, TaskGraph, TaskGraphValueRequest, TzConfig, VariableNamespace, + TaskGraph, TaskGraphValueRequest, TzConfig, Variable, VariableNamespace, }; -use vegafusion_core::spec::chart::ChartSpec; // Add methods on commands +use vegafusion_core::spec::chart::ChartSpec; + +struct ServerProcess { + child: Child, +} + +impl Drop for ServerProcess { + fn drop(&mut self) { + let _ = self.child.kill(); + let _ = self.child.wait(); + } +} + +fn pick_unused_port() -> u16 { + TcpListener::bind("127.0.0.1:0") + .unwrap() + .local_addr() + .unwrap() + .port() +} + +fn write_json_rows(dir: &Path, name: &str, values: &[f64]) -> PathBuf { + let path = dir.join(name); + let rows: Vec<_> = values.iter().map(|value| json!({ "x": value })).collect(); + fs::write(&path, serde_json::to_string(&rows).unwrap()).unwrap(); + path +} + +fn extent_spec(url: serde_json::Value) -> ChartSpec { + serde_json::from_value(json!({ + "$schema": "https://vega.github.io/schema/vega/v5.json", + "data": [ + { + "name": "source", + "url": url, + "format": {"type": "json"}, + }, + { + "name": "derived", + "source": "source", + "transform": [ + { + "type": "extent", + "signal": "my_extent", + "field": "x", + } + ], + } + ] + })) + .unwrap() +} + +fn build_request(chart: &ChartSpec) -> QueryRequest { + let tz_config = TzConfig { + local_tz: "UTC".to_string(), + default_input_tz: None, + }; + let task_scope = chart.to_task_scope().unwrap(); + let tasks = chart.to_tasks(&tz_config, &Default::default()).unwrap(); + let graph = TaskGraph::new(tasks, &task_scope).unwrap(); + let mapping = graph.build_mapping(); + let extent_node = mapping + .get(&(Variable::new_signal("my_extent"), Vec::new())) + .cloned() + .unwrap(); + + QueryRequest { + request: Some(query_request::Request::TaskGraphValues( + TaskGraphValueRequest { + task_graph: Some(graph), + indices: vec![extent_node], + inline_datasets: vec![], + }, + )), + } +} + +async fn spawn_server(extra_args: &[String]) 
-> (ServerProcess, String) { + let port = pick_unused_port(); + let mut cmd = Command::new(assert_cmd::cargo::cargo_bin!("vegafusion-server")); + cmd.arg("--host") + .arg("127.0.0.1") + .arg("--port") + .arg(port.to_string()) + .args(extra_args) + .stdout(Stdio::null()) + .stderr(Stdio::null()); + + let child = cmd.spawn().expect("Failed to spawn vegafusion-server"); + let address = format!("http://127.0.0.1:{port}"); + + for _ in 0..60 { + if VegaFusionRuntimeClient::connect(address.clone()).await.is_ok() { + return (ServerProcess { child }, address); + } + sleep(Duration::from_millis(100)).await; + } + + panic!("Timed out waiting for vegafusion-server to start on port {port}"); +} + +async fn query_extent(address: String, chart: &ChartSpec) -> std::result::Result<[f64; 2], String> { + let mut client = VegaFusionRuntimeClient::connect(address) + .await + .map_err(|err| err.to_string())?; + let response = client + .task_graph_query(build_request(chart)) + .await + .map_err(|err| err.to_string())?; + + let query_result = response.into_inner(); + match query_result.response.unwrap() { + Response::Error(error) => Err(format!("{error:?}")), + Response::TaskGraphValues(values_response) => { + let response_values = values_response.deserialize().unwrap(); + let (_var, scope, value) = &response_values[0]; + assert_eq!(scope, &Vec::::new()); + value + .as_scalar() + .map_err(|err| err.to_string())? 
+ .to_f64x2() + .map_err(|err| err.to_string()) + } + } +} #[tokio::test(flavor = "multi_thread")] async fn try_it_from_spec() { @@ -53,23 +184,24 @@ async fn try_it_from_spec() { let tasks = chart.to_tasks(&tz_config, &Default::default()).unwrap(); let graph = TaskGraph::new(tasks, &task_scope).unwrap(); + let mapping = graph.build_mapping(); let request = QueryRequest { request: Some(query_request::Request::TaskGraphValues( TaskGraphValueRequest { task_graph: Some(graph), - indices: vec![NodeValueIndex::new(2, Some(0))], + indices: vec![ + mapping + .get(&(Variable::new_signal("my_extent"), Vec::new())) + .cloned() + .unwrap(), + ], inline_datasets: vec![], }, )), }; - let mut bin = std::process::Command::new(assert_cmd::cargo::cargo_bin!("vegafusion-server")); - let cmd = bin.args(["--port", "50059"]); - - let mut proc = cmd.spawn().expect("Failed to spawn vegafusion-server"); - std::thread::sleep(Duration::from_millis(2000)); - - let mut client = VegaFusionRuntimeClient::connect("http://127.0.0.1:50059") + let (_server, address) = spawn_server(&[]).await; + let mut client = VegaFusionRuntimeClient::connect(address) .await .expect("Failed to connect to gRPC server"); let response = client.task_graph_query(request).await.unwrap(); @@ -94,5 +226,97 @@ async fn try_it_from_spec() { ) } } - proc.kill().ok(); +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_server_base_url_flag_resolves_relative_urls() { + let tempdir = tempfile::tempdir().unwrap(); + write_json_rows(tempdir.path(), "data.json", &[1.0, 2.0, 3.0]); + + let args = vec![ + "--base-url".to_string(), + tempdir.path().to_str().unwrap().to_string(), + "--allowed-base-url".to_string(), + tempdir.path().to_str().unwrap().to_string(), + ]; + let (_server, address) = spawn_server(&args).await; + + let extent = query_extent(address, &extent_spec(json!("data.json"))) + .await + .unwrap(); + assert_eq!(extent, [1.0, 3.0]); +} + +#[tokio::test(flavor = "multi_thread")] +async fn 
test_server_no_base_url_rejects_relative_urls() { + let tempdir = tempfile::tempdir().unwrap(); + write_json_rows(tempdir.path(), "data.json", &[1.0, 2.0, 3.0]); + + let args = vec!["--no-base-url".to_string()]; + let (_server, address) = spawn_server(&args).await; + + let err = query_extent(address, &extent_spec(json!("data.json"))) + .await + .unwrap_err(); + assert!( + err.contains("Relative URL with no base_url configured"), + "unexpected error: {err}" + ); +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_server_no_allowed_urls_blocks_external_access() { + let tempdir = tempfile::tempdir().unwrap(); + let data_path = write_json_rows(tempdir.path(), "data.json", &[1.0, 2.0, 3.0]); + + let args = vec!["--no-allowed-urls".to_string()]; + let (_server, address) = spawn_server(&args).await; + + let err = query_extent( + address, + &extent_spec(json!(data_path.to_str().unwrap().to_string())), + ) + .await + .unwrap_err(); + assert!( + err.contains("blocked by allowed_base_urls"), + "unexpected error: {err}" + ); +} + +#[tokio::test(flavor = "multi_thread")] +async fn test_server_repeatable_allowed_base_url_flags_allow_multiple_roots() { + let first_dir = tempfile::tempdir().unwrap(); + let second_dir = tempfile::tempdir().unwrap(); + let blocked_dir = tempfile::tempdir().unwrap(); + write_json_rows(first_dir.path(), "first.json", &[1.0, 2.0, 3.0]); + let second_path = write_json_rows(second_dir.path(), "second.json", &[4.0, 5.0, 6.0]); + let blocked_path = write_json_rows(blocked_dir.path(), "blocked.json", &[7.0, 8.0, 9.0]); + + let args = vec![ + "--allowed-base-url".to_string(), + first_dir.path().to_str().unwrap().to_string(), + "--allowed-base-url".to_string(), + second_dir.path().to_str().unwrap().to_string(), + ]; + let (_server, address) = spawn_server(&args).await; + + let extent = query_extent( + address.clone(), + &extent_spec(json!(second_path.to_str().unwrap().to_string())), + ) + .await + .unwrap(); + assert_eq!(extent, [4.0, 6.0]); + + let 
err = query_extent( + address, + &extent_spec(json!(blocked_path.to_str().unwrap().to_string())), + ) + .await + .unwrap_err(); + assert!( + err.contains("blocked by allowed_base_urls"), + "unexpected error: {err}" + ); } From cae68672a62f1acde7d3f641165a22ee9ea769e2 Mon Sep 17 00:00:00 2001 From: Jon Mease Date: Mon, 23 Mar 2026 10:01:04 -0400 Subject: [PATCH 36/36] fix: strip Windows \\?\ prefix in canonicalize and fix formatting Strip extended-length path prefix (\\?\) from fs::canonicalize on Windows to fix path prefix matching in allowed_base_urls checks. Also fix formatting issues across Rust and Python files. Co-Authored-By: Claude Opus 4.6 (1M context) --- vegafusion-core/src/data/url.rs | 78 ++++++++++++------- vegafusion-python/src/lib.rs | 4 +- .../tests/test_runtime_config.py | 4 +- vegafusion-python/vegafusion/runtime.py | 12 ++- vegafusion-runtime/src/data/tasks.rs | 20 ++--- vegafusion-runtime/src/task_graph/runtime.rs | 2 +- .../tests/test_task_graph_runtime.rs | 15 ++-- 7 files changed, 82 insertions(+), 53 deletions(-) diff --git a/vegafusion-core/src/data/url.rs b/vegafusion-core/src/data/url.rs index fad5352b9..e08e52976 100644 --- a/vegafusion-core/src/data/url.rs +++ b/vegafusion-core/src/data/url.rs @@ -122,7 +122,12 @@ pub fn path_to_file_url(path: &str) -> Result { let p = std::path::Path::new(&normalized); url::Url::from_file_path(p) .map(|u| u.to_string()) - .map_err(|_| VegaFusionError::specification(format!("Cannot convert path to file URL: {}", p.display()))) + .map_err(|_| { + VegaFusionError::specification(format!( + "Cannot convert path to file URL: {}", + p.display() + )) + }) } /// Browser-wasm fallback: `url::Url::from_file_path` is unavailable on @@ -143,9 +148,9 @@ pub fn path_to_file_url(path: &str) -> Result { pub fn file_url_to_path(url: &str) -> Result { let parsed = url::Url::parse(url) .map_err(|e| VegaFusionError::specification(format!("Invalid file URL '{url}': {e}")))?; - parsed - .to_file_path() - .map_err(|_| 
VegaFusionError::specification(format!("Cannot convert file URL to path: {url}"))) + parsed.to_file_path().map_err(|_| { + VegaFusionError::specification(format!("Cannot convert file URL to path: {url}")) + }) } #[cfg(target_arch = "wasm32")] @@ -157,9 +162,19 @@ pub fn file_url_to_path(url: &str) -> Result { #[cfg(not(target_arch = "wasm32"))] fn portable_canonicalize(path: &Path) -> Result { - fs::canonicalize(path).map_err(|e| { + let canonical = fs::canonicalize(path).map_err(|e| { VegaFusionError::specification(format!("Failed to resolve path {}: {e}", path.display())) - }) + })?; + // On Windows, fs::canonicalize returns extended-length paths (\\?\C:\...) + // which break prefix matching. Strip the prefix for consistent comparisons. + #[cfg(target_os = "windows")] + { + let s = canonical.to_string_lossy(); + if let Some(stripped) = s.strip_prefix(r"\\?\") { + return Ok(PathBuf::from(stripped)); + } + } + Ok(canonical) } #[cfg(target_arch = "wasm32")] @@ -233,16 +248,8 @@ pub fn normalize_allowed_base_url(allowed_base_url: &str) -> Result) -> Result { match base_url { Some(base) => { let base_url = url::Url::parse(base).map_err(|e| { - VegaFusionError::specification(format!( - "Invalid base URL '{base}': {e}" - )) + VegaFusionError::specification(format!("Invalid base URL '{base}': {e}")) })?; let resolved = base_url.join(url).map_err(|e| { VegaFusionError::specification(format!( @@ -521,7 +526,10 @@ mod tests { fn test_normalize_base_url_existing_directory_adds_trailing_slash() { let tempdir = tempfile::tempdir().unwrap(); let result = normalize_base_url(tempdir.path().to_str().unwrap().to_string()).unwrap(); - assert!(result.ends_with('/'), "expected trailing slash, got {result}"); + assert!( + result.ends_with('/'), + "expected trailing slash, got {result}" + ); } #[test] @@ -645,14 +653,23 @@ mod tests { #[test] fn test_is_url_allowed_prefix() { let patterns = vec![normalize_allowed_base_url("https://example.com/data/").unwrap()]; - 
assert!(is_url_allowed("https://example.com/data/cars.json", &patterns)); - assert!(!is_url_allowed("https://example.com/other/cars.json", &patterns)); + assert!(is_url_allowed( + "https://example.com/data/cars.json", + &patterns + )); + assert!(!is_url_allowed( + "https://example.com/other/cars.json", + &patterns + )); } #[test] fn test_is_url_allowed_wildcard_host() { let patterns = vec![normalize_allowed_base_url("https://*.example.com/data/").unwrap()]; - assert!(is_url_allowed("https://example.com/data/cars.json", &patterns)); + assert!(is_url_allowed( + "https://example.com/data/cars.json", + &patterns + )); assert!(is_url_allowed( "https://cdn.example.com/data/cars.json", &patterns @@ -661,7 +678,10 @@ mod tests { "https://example.com.evil.com/data/cars.json", &patterns )); - assert!(!is_url_allowed("https://cdn.example.com/other/cars.json", &patterns)); + assert!(!is_url_allowed( + "https://cdn.example.com/other/cars.json", + &patterns + )); } #[test] @@ -674,7 +694,10 @@ mod tests { std::fs::write(&file_path, "{}").unwrap(); let patterns = vec![normalize_allowed_base_url(root.path().to_str().unwrap()).unwrap()]; - assert!(is_url_allowed(&format!("file://{}", file_path.display()), &patterns)); + assert!(is_url_allowed( + &format!("file://{}", file_path.display()), + &patterns + )); } #[test] @@ -688,7 +711,10 @@ mod tests { let file_path = allowed.join("../outside/data.json"); let patterns = vec![normalize_allowed_base_url(allowed.to_str().unwrap()).unwrap()]; - assert!(!is_url_allowed(&format!("file://{}", file_path.display()), &patterns)); + assert!(!is_url_allowed( + &format!("file://{}", file_path.display()), + &patterns + )); } #[test] diff --git a/vegafusion-python/src/lib.rs b/vegafusion-python/src/lib.rs index 940b6014e..843702056 100644 --- a/vegafusion-python/src/lib.rs +++ b/vegafusion-python/src/lib.rs @@ -125,7 +125,9 @@ fn parse_base_url( } else if let Ok(s) = obj.extract::() { Ok(BaseUrlSetting::Custom(s)) } else { - 
Err(PyValueError::new_err("base_url must be a str, bool, or None")) + Err(PyValueError::new_err( + "base_url must be a str, bool, or None", + )) } } } diff --git a/vegafusion-python/tests/test_runtime_config.py b/vegafusion-python/tests/test_runtime_config.py index 424ed3a5c..2721679e7 100644 --- a/vegafusion-python/tests/test_runtime_config.py +++ b/vegafusion-python/tests/test_runtime_config.py @@ -18,7 +18,9 @@ def test_runtime_exposes_url_policy_properties() -> None: assert rt.allowed_base_urls == ["https://example.com/data/"] -def test_runtime_passes_url_policy_to_embedded_runtime(monkeypatch: pytest.MonkeyPatch) -> None: +def test_runtime_passes_url_policy_to_embedded_runtime( + monkeypatch: pytest.MonkeyPatch, +) -> None: calls: list[dict[str, object]] = [] class FakeRuntime: diff --git a/vegafusion-python/vegafusion/runtime.py b/vegafusion-python/vegafusion/runtime.py index 2d2f4a201..9209a46e2 100644 --- a/vegafusion-python/vegafusion/runtime.py +++ b/vegafusion-python/vegafusion/runtime.py @@ -249,8 +249,10 @@ def _has_non_default_url_policy(self) -> bool: def _ensure_not_using_grpc_for_url_policy_change(self) -> None: if self._grpc_url is not None: raise ValueError( - "Cannot change base_url or allowed_base_urls while using a gRPC runtime. " - "Configure these on the vegafusion-server process instead." + "Cannot change base_url or allowed_base_urls " + "while using a gRPC runtime. " + "Configure these on the vegafusion-server " + "process instead." ) @property @@ -308,8 +310,10 @@ def grpc_connect(self, url: str) -> None: ) if self._has_non_default_url_policy(): raise ValueError( - "Cannot use grpc_connect with local base_url or allowed_base_urls settings. " - "Configure URL policy on the vegafusion-server process instead." + "Cannot use grpc_connect with local " + "base_url or allowed_base_urls settings. " + "Configure URL policy on the " + "vegafusion-server process instead." 
) from vegafusion._vegafusion import PyVegaFusionRuntime diff --git a/vegafusion-runtime/src/data/tasks.rs b/vegafusion-runtime/src/data/tasks.rs index 4c0e20106..4bb204e38 100644 --- a/vegafusion-runtime/src/data/tasks.rs +++ b/vegafusion-runtime/src/data/tasks.rs @@ -741,7 +741,10 @@ pub(crate) async fn build_csv_schema( Ok(Schema::new(new_fields)) } -async fn read_json_via_store_or_file(url: &str, ctx: Arc) -> Result { +async fn read_json_via_store_or_file( + url: &str, + ctx: Arc, +) -> Result { if let Some(base_url) = maybe_register_object_stores_for_url(&ctx, url)? { // Create single use object store that points directly to file let store = ctx.runtime_env().object_store(&base_url)?; @@ -795,29 +798,20 @@ async fn read_json_via_store_or_file(url: &str, ctx: Arc) -> Res } } -pub(crate) async fn read_json( - url: &str, - ctx: Arc, -) -> Result { +pub(crate) async fn read_json(url: &str, ctx: Arc) -> Result { let value: serde_json::Value = read_json_via_store_or_file(url, ctx.clone()).await?; let table = VegaFusionTable::from_json(&value)?.with_ordering()?; ctx.vegafusion_table(table).await } -pub(crate) async fn read_arrow( - url: &str, - ctx: Arc, -) -> Result { +pub(crate) async fn read_arrow(url: &str, ctx: Arc) -> Result { maybe_register_object_stores_for_url(&ctx, url)?; Ok(ctx.read_arrow(url, ArrowReadOptions::default()).await?) } #[cfg(feature = "parquet")] -pub(crate) async fn read_parquet( - url: &str, - ctx: Arc, -) -> Result { +pub(crate) async fn read_parquet(url: &str, ctx: Arc) -> Result { maybe_register_object_stores_for_url(&ctx, url)?; Ok(ctx.read_parquet(url, ParquetReadOptions::default()).await?) 
} diff --git a/vegafusion-runtime/src/task_graph/runtime.rs b/vegafusion-runtime/src/task_graph/runtime.rs index 7c4e3d3d8..2fa174c06 100644 --- a/vegafusion-runtime/src/task_graph/runtime.rs +++ b/vegafusion-runtime/src/task_graph/runtime.rs @@ -19,8 +19,8 @@ use vegafusion_core::proto::gen::tasks::inline_dataset::Dataset; use vegafusion_core::proto::gen::tasks::{ task::TaskKind, InlineDataset, InlineDatasetTable, NodeValueIndex, TaskGraph, }; -use vegafusion_core::runtime::{normalize_allowed_base_urls, AllowedBaseUrlPattern}; use vegafusion_core::runtime::VegaFusionRuntimeTrait; +use vegafusion_core::runtime::{normalize_allowed_base_urls, AllowedBaseUrlPattern}; use vegafusion_core::task_graph::task_value::{MaterializedTaskValue, NamedTaskValue, TaskValue}; #[cfg(feature = "proto")] diff --git a/vegafusion-server/tests/test_task_graph_runtime.rs b/vegafusion-server/tests/test_task_graph_runtime.rs index dbae563e4..9332a0005 100644 --- a/vegafusion-server/tests/test_task_graph_runtime.rs +++ b/vegafusion-server/tests/test_task_graph_runtime.rs @@ -105,7 +105,10 @@ async fn spawn_server(extra_args: &[String]) -> (ServerProcess, String) { let address = format!("http://127.0.0.1:{port}"); for _ in 0..60 { - if VegaFusionRuntimeClient::connect(address.clone()).await.is_ok() { + if VegaFusionRuntimeClient::connect(address.clone()) + .await + .is_ok() + { return (ServerProcess { child }, address); } sleep(Duration::from_millis(100)).await; @@ -189,12 +192,10 @@ async fn try_it_from_spec() { request: Some(query_request::Request::TaskGraphValues( TaskGraphValueRequest { task_graph: Some(graph), - indices: vec![ - mapping - .get(&(Variable::new_signal("my_extent"), Vec::new())) - .cloned() - .unwrap(), - ], + indices: vec![mapping + .get(&(Variable::new_signal("my_extent"), Vec::new())) + .cloned() + .unwrap()], inline_datasets: vec![], }, )),