-- encode_strict.sql
-- Licensed to the Apache Software Foundation (ASF) under one
-- or more contributor license agreements. See the NOTICE file
-- distributed with this work for additional information
-- regarding copyright ownership. The ASF licenses this file
-- to you under the Apache License, Version 2.0 (the
-- "License"); you may not use this file except in compliance
-- with the License. You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing,
-- software distributed under the License is distributed on an
-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-- KIND, either express or implied. See the License for the
-- specific language governing permissions and limitations
-- under the License.
-- Tests for the strict charset whitelist and raise-on-unmappable behavior that
-- Spark 4.0 enabled by default. Earlier Spark versions have
-- spark.sql.legacy.javaCharsets=true and spark.sql.legacy.codingErrorAction=true
-- by default, which permit extra aliases and replace unmappable characters with
-- '?', so these assertions only hold on Spark 4.0 and later.
-- MinSparkVersion: 4.0
-- ============================================================================
-- Charset whitelist: Spark accepts exactly us-ascii, iso-8859-1, utf-8,
-- utf-16, utf-16be, utf-16le, utf-32. Anything else raises
-- INVALID_PARAMETER_VALUE.CHARSET.
-- Every query in this section passes a charset name that is not in that list
-- and expects that error. The inputs ('A', 'abc') are plain ASCII, so the
-- expected failure comes from the charset name rather than from the data.
-- ============================================================================
-- UTF-32BE and UTF-32LE are not accepted (only UTF-32 is), even though the
-- UTF-16 family does list its explicit BE/LE variants.
query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
SELECT encode('A', 'UTF-32BE')
query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
SELECT encode('A', 'UTF-32LE')
-- Aliases without the hyphen are not accepted: UTF8, UTF16 and UTF16BE are the
-- unhyphenated spellings of the accepted utf-8, utf-16 and utf-16be.
-- NOTE(review): UTF16LE and UTF32 are not exercised here; consider adding them.
query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
SELECT encode('abc', 'UTF8')
query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
SELECT encode('abc', 'UTF16')
query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
SELECT encode('abc', 'UTF16BE')
-- ASCII without the US- prefix is not accepted (only us-ascii is listed)
query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
SELECT encode('abc', 'ASCII')
-- ISO-8859-1 aliases LATIN1 and ISO88591 are not accepted
query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
SELECT encode('abc', 'LATIN1')
query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
SELECT encode('abc', 'ISO88591')
-- A name that is not a spelling of any accepted charset is rejected with the
-- same error
query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
SELECT encode('abc', 'EBCDIC')
-- ============================================================================
-- Raise on unmappable characters
-- (spark.sql.legacy.codingErrorAction defaults to false)
-- Each query below pairs an accepted charset with a character that the charset
-- cannot represent and expects MALFORMED_CHARACTER_CODING.
-- ============================================================================
-- U+00E9 (é) is not representable in US-ASCII
query expect_error(MALFORMED_CHARACTER_CODING)
SELECT encode('é', 'US-ASCII')
-- U+0100 (Ā) is not representable in ISO-8859-1. x'C480' is the UTF-8 byte
-- sequence for U+0100; casting it to STRING builds the one-character input
-- explicitly instead of relying on an implicit binary-to-string coercion.
query expect_error(MALFORMED_CHARACTER_CODING)
SELECT encode(CAST(x'C480' AS STRING), 'ISO-8859-1')
-- emoji (U+1F600) is not representable in US-ASCII
query expect_error(MALFORMED_CHARACTER_CODING)
SELECT encode('😀', 'US-ASCII')
-- column argument with an unmappable value also raises: this is the same
-- 'é' / US-ASCII pair as the literal case, but the value is read from a table
-- column rather than supplied as a literal.
-- NOTE(review): the table is not dropped in the visible part of this file;
-- presumably the test harness cleans it up. Confirm.
statement
CREATE TABLE test_encode_unmappable(s string) USING parquet
statement
INSERT INTO test_encode_unmappable VALUES ('é')
-- The single row inserted above holds 'é', so encoding column s must raise.
query expect_error(MALFORMED_CHARACTER_CODING)
SELECT encode(s, 'US-ASCII') FROM test_encode_unmappable