From db11d98f1fb61e42f3df4f5d6e15c3d4c7e8b2d8 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sat, 25 Apr 2026 12:55:27 -0500 Subject: [PATCH 1/4] Mark JSON-bearing string fields with is_json metadata `json_get_array` returned `List` whose items are raw JSON strings, but the inner field had no `is_json: true` metadata, so downstream consumers could not detect that the items were JSON. Also mark `json_get_json`'s top-level `Utf8` return field (it returns a raw JSON sub-document). Centralizes the metadata construction in a shared `is_json_metadata()` helper and reuses it from `common_union`. `json_as_text` is intentionally not marked, since for `Peek::String` it returns the unescaped string value (no surrounding quotes), so the output is not consistently valid JSON. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/common.rs | 8 ++++++++ src/common_union.rs | 9 ++++----- src/json_get_array.rs | 24 +++++++++++------------- src/json_get_json.rs | 17 ++++++++++++++--- tests/main.rs | 27 +++++++++++++++++++++++++++ 5 files changed, 64 insertions(+), 21 deletions(-) diff --git a/src/common.rs b/src/common.rs index 58a3642..223caff 100644 --- a/src/common.rs +++ b/src/common.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::str::Utf8Error; use std::sync::Arc; @@ -16,6 +17,13 @@ use crate::common_union::{ is_json_union, json_from_union_scalar, nested_json_array, nested_json_array_ref, TYPE_ID_NULL, }; +/// Field metadata that marks a `Utf8` column/field as containing raw JSON. +/// Downstream consumers (e.g. the rewrite layer, other UDFs) use this to +/// recognize JSON-bearing string columns. +pub fn is_json_metadata() -> HashMap { + HashMap::from_iter(vec![("is_json".to_string(), "true".to_string())]) +} + /// General implementation of `ScalarUDFImpl::return_type`. /// /// # Arguments diff --git a/src/common_union.rs b/src/common_union.rs index 217042b..7d14d18 100644 --- a/src/common_union.rs +++ b/src/common_union.rs @@ -1,4 +1,3 @@ -use std::collections::HashMap; use std::sync::{Arc, LazyLock, OnceLock}; use datafusion::arrow::array::{ @@ -9,6 +8,8 @@ use datafusion::arrow::datatypes::{DataType, Field, UnionFields, UnionMode}; use datafusion::arrow::error::ArrowError; use datafusion::common::ScalarValue; +use crate::common::is_json_metadata; + pub fn is_json_union(data_type: &DataType) -> bool { match data_type { DataType::Union(fields, UnionMode::Sparse) => fields == &union_fields(), @@ -161,8 +162,6 @@ fn union_fields() -> UnionFields { static FIELDS: OnceLock = OnceLock::new(); FIELDS .get_or_init(|| { - let json_metadata: HashMap = - HashMap::from_iter(vec![("is_json".to_string(), "true".to_string())]); UnionFields::from_iter([ (TYPE_ID_NULL, Arc::new(Field::new("null", DataType::Null, true))), (TYPE_ID_BOOL, Arc::new(Field::new("bool", DataType::Boolean, false))), @@ -171,11 +170,11 @@ fn union_fields() -> UnionFields { (TYPE_ID_STR, Arc::new(Field::new("str", DataType::Utf8, false))), ( TYPE_ID_ARRAY, - Arc::new(Field::new("array", DataType::Utf8, false).with_metadata(json_metadata.clone())), + Arc::new(Field::new("array", DataType::Utf8, false).with_metadata(is_json_metadata())), ), ( TYPE_ID_OBJECT, - Arc::new(Field::new("object", DataType::Utf8, false).with_metadata(json_metadata.clone())), + Arc::new(Field::new("object", DataType::Utf8, false).with_metadata(is_json_metadata())), ), ]) }) diff --git a/src/json_get_array.rs b/src/json_get_array.rs index 5b74a2f..a5c28e9 100644 --- a/src/json_get_array.rs +++ b/src/json_get_array.rs @@ -2,14 +2,20 @@ use std::any::Any; use std::sync::Arc; use datafusion::arrow::array::{ArrayRef, ListBuilder, StringBuilder}; -use datafusion::arrow::datatypes::DataType; +use datafusion::arrow::datatypes::{DataType, Field}; use datafusion::common::{Result as DataFusionResult, ScalarValue}; use datafusion::logical_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility}; use jiter::Peek; -use crate::common::{get_err, invoke, jiter_json_find, return_type_check, GetError, InvokeResult, JsonPath}; +use crate::common::{ + get_err, invoke, is_json_metadata, jiter_json_find, return_type_check, GetError, InvokeResult, JsonPath, +}; use crate::common_macros::make_udf_function; +fn list_item_field() -> Field { + Field::new("item", DataType::Utf8, true).with_metadata(is_json_metadata()) +} + make_udf_function!( JsonGetArray, json_get_array, @@ -46,15 +52,7 @@ impl ScalarUDFImpl for JsonGetArray { } fn return_type(&self, arg_types: &[DataType]) -> DataFusionResult { - return_type_check( - arg_types, - self.name(), - DataType::List(Arc::new(datafusion::arrow::datatypes::Field::new( - "item", - DataType::Utf8, - true, - ))), - ) + return_type_check(arg_types, self.name(), DataType::List(Arc::new(list_item_field()))) } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> DataFusionResult { @@ -96,7 +94,7 @@ impl InvokeResult for BuildArrayList { fn builder(capacity: usize) -> Self::Builder { let values_builder = StringBuilder::new(); - ListBuilder::with_capacity(values_builder, capacity) + ListBuilder::with_capacity(values_builder, capacity).with_field(list_item_field()) } fn append_value(builder: &mut Self::Builder, value: Option) { @@ -108,7 +106,7 @@ impl InvokeResult for BuildArrayList { } fn scalar(value: Option) -> ScalarValue { - let mut builder = ListBuilder::new(StringBuilder::new()); + let mut builder = ListBuilder::new(StringBuilder::new()).with_field(list_item_field()); if let Some(array_items) = value { for item in array_items { diff --git a/src/json_get_json.rs b/src/json_get_json.rs index 4b3b678..d90a571 100644 --- a/src/json_get_json.rs +++ b/src/json_get_json.rs @@ -1,11 +1,14 @@ use std::any::Any; +use std::sync::Arc; use datafusion::arrow::array::StringArray; -use datafusion::arrow::datatypes::DataType; +use datafusion::arrow::datatypes::{DataType, Field, FieldRef}; use datafusion::common::Result as DataFusionResult; -use datafusion::logical_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility}; +use datafusion::logical_expr::{ + ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, +}; -use crate::common::{get_err, invoke, jiter_json_find, return_type_check, GetError, JsonPath}; +use crate::common::{get_err, invoke, is_json_metadata, jiter_json_find, return_type_check, GetError, JsonPath}; use crate::common_macros::make_udf_function; make_udf_function!( @@ -47,6 +50,14 @@ impl ScalarUDFImpl for JsonGetJson { return_type_check(arg_types, self.name(), DataType::Utf8) } + fn return_field_from_args(&self, args: ReturnFieldArgs) -> DataFusionResult { + let arg_types: Vec = args.arg_fields.iter().map(|f| f.data_type().clone()).collect(); + let return_type = self.return_type(&arg_types)?; + Ok(Arc::new( + Field::new(self.name(), return_type, true).with_metadata(is_json_metadata()), + )) + } + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> DataFusionResult { invoke::(&args.args, jiter_json_get_json) } diff --git a/tests/main.rs b/tests/main.rs index cbc371c..763c6a7 100644 --- a/tests/main.rs +++ b/tests/main.rs @@ -161,6 +161,24 @@ async fn test_json_get_array_with_path() { assert_eq!(value_repr, "[1, 2, 3]"); } +#[tokio::test] +async fn test_json_get_array_inner_field_is_json_metadata() { + let sql = r#"select json_get_array('[{"a": 1}, {"b": 2}]') as v"#; + let batches = run_query(sql).await.unwrap(); + let schema = batches[0].schema(); + let field = schema.field(0); + let DataType::List(inner_field) = field.data_type() else { + panic!("expected List, got {:?}", field.data_type()); + }; + assert_eq!(inner_field.metadata().get("is_json").map(String::as_str), Some("true")); + + let array_field = batches[0].column(0).as_any().downcast_ref::().unwrap(); + let DataType::List(produced_inner) = array_field.data_type() else { + panic!("expected List in produced array"); + }; + assert_eq!(produced_inner.metadata().get("is_json").map(String::as_str), Some("true")); +} + #[tokio::test] async fn test_json_get_equals() { let e = run_query(r"select name, json_get(json_data, 'foo')='abc' from test") @@ -411,6 +429,15 @@ async fn test_json_get_json_float() { assert_eq!(display_val(batches).await, (DataType::Utf8, "4.2e-1".to_string())); } +#[tokio::test] +async fn test_json_get_json_is_json_metadata() { + let sql = r#"select json_get_json('{"x": [1, 2]}', 'x') as v"#; + let batches = run_query(sql).await.unwrap(); + let schema = batches[0].schema(); + let field = schema.field(0); + assert_eq!(field.metadata().get("is_json").map(String::as_str), Some("true")); +} + #[tokio::test] async fn test_json_length_array() { let sql = "select json_length('[1, 2, 3]')"; From d09966bc83c1d36fcf3b17eda954df68e030452b Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sat, 25 Apr 2026 13:10:35 -0500 Subject: [PATCH 2/4] Format test to satisfy cargo fmt Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/main.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/main.rs b/tests/main.rs index 763c6a7..d6d6b72 100644 --- a/tests/main.rs +++ b/tests/main.rs @@ -172,11 +172,18 @@ async fn test_json_get_array_inner_field_is_json_metadata() { }; assert_eq!(inner_field.metadata().get("is_json").map(String::as_str), Some("true")); - let array_field = batches[0].column(0).as_any().downcast_ref::().unwrap(); + let array_field = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); let DataType::List(produced_inner) = array_field.data_type() else { panic!("expected List in produced array"); }; - assert_eq!(produced_inner.metadata().get("is_json").map(String::as_str), Some("true")); + assert_eq!( + produced_inner.metadata().get("is_json").map(String::as_str), + Some("true") + ); } #[tokio::test] From 5b3ff39d2c809e87a19f75eaabec0e1d2d5e8e87 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sat, 25 Apr 2026 13:15:24 -0500 Subject: [PATCH 3/4] Update src/common.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/common.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common.rs b/src/common.rs index 223caff..5940ade 100644 --- a/src/common.rs +++ b/src/common.rs @@ -21,7 +21,7 @@ use crate::common_union::{ /// Downstream consumers (e.g. the rewrite layer, other UDFs) use this to /// recognize JSON-bearing string columns. pub fn is_json_metadata() -> HashMap { - HashMap::from_iter(vec![("is_json".to_string(), "true".to_string())]) + HashMap::from([("is_json".to_string(), "true".to_string())]) } /// General implementation of `ScalarUDFImpl::return_type`. From 13109942148f83e8752dd3ce11b204146e2b4ff3 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sat, 25 Apr 2026 13:31:49 -0500 Subject: [PATCH 4/4] Rename helper to json_field_metadata and move to common_union `is_json_metadata()` was ambiguous about direction (check vs. mark). The helper is a constructor of metadata that marks a field as containing JSON-encoded data. Renaming to `json_field_metadata()` makes that unambiguous and stays correct if more keys (e.g. canonical Arrow extension keys) are added to the returned map in the future. Move it to `src/common_union.rs`, which already houses JSON-typing concerns. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/common.rs | 8 -------- src/common_union.rs | 13 ++++++++++--- src/json_get_array.rs | 7 +++---- src/json_get_json.rs | 5 +++-- 4 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/common.rs b/src/common.rs index 5940ade..58a3642 100644 --- a/src/common.rs +++ b/src/common.rs @@ -1,4 +1,3 @@ -use std::collections::HashMap; use std::str::Utf8Error; use std::sync::Arc; @@ -17,13 +16,6 @@ use crate::common_union::{ is_json_union, json_from_union_scalar, nested_json_array, nested_json_array_ref, TYPE_ID_NULL, }; -/// Field metadata that marks a `Utf8` column/field as containing raw JSON. -/// Downstream consumers (e.g. the rewrite layer, other UDFs) use this to -/// recognize JSON-bearing string columns. -pub fn is_json_metadata() -> HashMap { - HashMap::from([("is_json".to_string(), "true".to_string())]) -} - /// General implementation of `ScalarUDFImpl::return_type`. /// /// # Arguments diff --git a/src/common_union.rs b/src/common_union.rs index 7d14d18..495d88f 100644 --- a/src/common_union.rs +++ b/src/common_union.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::sync::{Arc, LazyLock, OnceLock}; use datafusion::arrow::array::{ @@ -8,7 +9,13 @@ use datafusion::arrow::datatypes::{DataType, Field, UnionFields, UnionMode}; use datafusion::arrow::error::ArrowError; use datafusion::common::ScalarValue; -use crate::common::is_json_metadata; +/// Field metadata used to mark a `Utf8` field as containing raw JSON. +/// +/// Attach this to any Arrow `Field` whose values are JSON-encoded strings so +/// downstream consumers can recognize them as JSON rather than opaque text. +pub fn json_field_metadata() -> HashMap { + HashMap::from([("is_json".to_string(), "true".to_string())]) +} pub fn is_json_union(data_type: &DataType) -> bool { match data_type { @@ -170,11 +177,11 @@ fn union_fields() -> UnionFields { (TYPE_ID_STR, Arc::new(Field::new("str", DataType::Utf8, false))), ( TYPE_ID_ARRAY, - Arc::new(Field::new("array", DataType::Utf8, false).with_metadata(is_json_metadata())), + Arc::new(Field::new("array", DataType::Utf8, false).with_metadata(json_field_metadata())), ), ( TYPE_ID_OBJECT, - Arc::new(Field::new("object", DataType::Utf8, false).with_metadata(is_json_metadata())), + Arc::new(Field::new("object", DataType::Utf8, false).with_metadata(json_field_metadata())), ), ]) }) diff --git a/src/json_get_array.rs b/src/json_get_array.rs index a5c28e9..fb73ac5 100644 --- a/src/json_get_array.rs +++ b/src/json_get_array.rs @@ -7,13 +7,12 @@ use datafusion::common::{Result as DataFusionResult, ScalarValue}; use datafusion::logical_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility}; use jiter::Peek; -use crate::common::{ - get_err, invoke, is_json_metadata, jiter_json_find, return_type_check, GetError, InvokeResult, JsonPath, -}; +use crate::common::{get_err, invoke, jiter_json_find, return_type_check, GetError, InvokeResult, JsonPath}; use crate::common_macros::make_udf_function; +use crate::common_union::json_field_metadata; fn list_item_field() -> Field { - Field::new("item", DataType::Utf8, true).with_metadata(is_json_metadata()) + Field::new("item", DataType::Utf8, true).with_metadata(json_field_metadata()) } make_udf_function!( diff --git a/src/json_get_json.rs b/src/json_get_json.rs index d90a571..d744d10 100644 --- a/src/json_get_json.rs +++ b/src/json_get_json.rs @@ -8,8 +8,9 @@ use datafusion::logical_expr::{ ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, }; -use crate::common::{get_err, invoke, is_json_metadata, jiter_json_find, return_type_check, GetError, JsonPath}; +use crate::common::{get_err, invoke, jiter_json_find, return_type_check, GetError, JsonPath}; use crate::common_macros::make_udf_function; +use crate::common_union::json_field_metadata; make_udf_function!( JsonGetJson, @@ -54,7 +55,7 @@ impl ScalarUDFImpl for JsonGetJson { let arg_types: Vec = args.arg_fields.iter().map(|f| f.data_type().clone()).collect(); let return_type = self.return_type(&arg_types)?; Ok(Arc::new( - Field::new(self.name(), return_type, true).with_metadata(is_json_metadata()), + Field::new(self.name(), return_type, true).with_metadata(json_field_metadata()), )) }