[SPARK-36224][SQL] Use Void as the type name of NullType
### What changes were proposed in this pull request? Change `NullType.simpleString` to "void" so that "void" becomes the formal type name of `NullType`. ### Why are the changes needed? This PR is intended to address the type name discussion in PR #28833. Here are the reasons: 1. The type name of NullType is displayed everywhere, e.g. in schema strings, error messages, and documentation. Hence it is not possible to hide it from users, and we have to choose a proper name. 2. "void" is widely used as the type name of "NULL", e.g. in Hive and pgSQL. 3. Changing to "void" enables the round trip of `toDDL`/`fromDDL` for NullType (i.e. makes `from_json(col, schema.toDDL)` work). ### Does this PR introduce _any_ user-facing change? Yes, the type name of "NULL" is changed from "null" to "void". For example: ``` scala> sql("select null as a, 1 as b").schema.catalogString res5: String = struct<a:void,b:int> ``` ### How was this patch tested? Existing test cases. Closes #33437 from linhongliu-db/SPARK-36224-void-type-name. Authored-by: Linhong Liu <linhong.liu@databricks.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
parent
a98d919da4
commit
2f700773c2
|
@ -511,8 +511,7 @@ class TypesTests(ReusedSQLTestCase):
|
|||
def test_parse_datatype_string(self):
|
||||
from pyspark.sql.types import _all_atomic_types, _parse_datatype_string
|
||||
for k, t in _all_atomic_types.items():
|
||||
if t != NullType:
|
||||
self.assertEqual(t(), _parse_datatype_string(k))
|
||||
self.assertEqual(t(), _parse_datatype_string(k))
|
||||
self.assertEqual(IntegerType(), _parse_datatype_string("int"))
|
||||
self.assertEqual(DecimalType(1, 1), _parse_datatype_string("decimal(1 ,1)"))
|
||||
self.assertEqual(DecimalType(10, 1), _parse_datatype_string("decimal( 10,1 )"))
|
||||
|
|
|
@ -107,7 +107,9 @@ class NullType(DataType, metaclass=DataTypeSingleton):
|
|||
|
||||
The data type representing None, used for the types that cannot be inferred.
|
||||
"""
|
||||
pass
|
||||
@classmethod
|
||||
def typeName(cls):
|
||||
return 'void'
|
||||
|
||||
|
||||
class AtomicType(DataType):
|
||||
|
|
|
@ -195,6 +195,8 @@ object DataType {
|
|||
case FIXED_DECIMAL(precision, scale) => DecimalType(precision.toInt, scale.toInt)
|
||||
case CHAR_TYPE(length) => CharType(length.toInt)
|
||||
case VARCHAR_TYPE(length) => VarcharType(length.toInt)
|
||||
// For backwards compatibility: the type name of NullType was previously "null"
|
||||
case "null" => NullType
|
||||
case other => otherTypes.getOrElse(
|
||||
other,
|
||||
throw new IllegalArgumentException(
|
||||
|
|
|
@ -32,6 +32,8 @@ class NullType private() extends DataType {
|
|||
override def defaultSize: Int = 1
|
||||
|
||||
private[spark] override def asNullable: NullType = this
|
||||
|
||||
override def typeName: String = "void"
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -183,6 +183,10 @@ class DataTypeSuite extends SparkFunSuite {
|
|||
assert(!arrayType.existsRecursively(_.isInstanceOf[IntegerType]))
|
||||
}
|
||||
|
||||
test("SPARK-36224: Backwards compatibility test for NullType.json") {
|
||||
assert(DataType.fromJson("\"null\"") == NullType)
|
||||
}
|
||||
|
||||
def checkDataTypeFromJson(dataType: DataType): Unit = {
|
||||
test(s"from Json - $dataType") {
|
||||
assert(DataType.fromJson(dataType.json) === dataType)
|
||||
|
@ -198,6 +202,7 @@ class DataTypeSuite extends SparkFunSuite {
|
|||
}
|
||||
|
||||
checkDataTypeFromJson(NullType)
|
||||
checkDataTypeFromDDL(NullType)
|
||||
|
||||
checkDataTypeFromJson(BooleanType)
|
||||
checkDataTypeFromDDL(BooleanType)
|
||||
|
@ -424,6 +429,7 @@ class DataTypeSuite extends SparkFunSuite {
|
|||
i => StructField(s"col$i", IntegerType, nullable = true)
|
||||
})
|
||||
|
||||
checkCatalogString(NullType)
|
||||
checkCatalogString(BooleanType)
|
||||
checkCatalogString(ByteType)
|
||||
checkCatalogString(ShortType)
|
||||
|
|
|
@ -34,7 +34,7 @@
|
|||
| org.apache.spark.sql.catalyst.expressions.Ascii | ascii | SELECT ascii('222') | struct<ascii(222):int> |
|
||||
| org.apache.spark.sql.catalyst.expressions.Asin | asin | SELECT asin(0) | struct<ASIN(0):double> |
|
||||
| org.apache.spark.sql.catalyst.expressions.Asinh | asinh | SELECT asinh(0) | struct<ASINH(0):double> |
|
||||
| org.apache.spark.sql.catalyst.expressions.AssertTrue | assert_true | SELECT assert_true(0 < 1) | struct<assert_true((0 < 1), '(0 < 1)' is not true!):null> |
|
||||
| org.apache.spark.sql.catalyst.expressions.AssertTrue | assert_true | SELECT assert_true(0 < 1) | struct<assert_true((0 < 1), '(0 < 1)' is not true!):void> |
|
||||
| org.apache.spark.sql.catalyst.expressions.Atan | atan | SELECT atan(0) | struct<ATAN(0):double> |
|
||||
| org.apache.spark.sql.catalyst.expressions.Atan2 | atan2 | SELECT atan2(0, 0) | struct<ATAN2(0, 0):double> |
|
||||
| org.apache.spark.sql.catalyst.expressions.Atanh | atanh | SELECT atanh(0) | struct<ATANH(0):double> |
|
||||
|
@ -223,7 +223,7 @@
|
|||
| org.apache.spark.sql.catalyst.expressions.RLike | regexp | SELECT regexp('%SystemDrive%\Users\John', '%SystemDrive%\\Users.*') | struct<REGEXP(%SystemDrive%UsersJohn, %SystemDrive%\Users.*):boolean> |
|
||||
| org.apache.spark.sql.catalyst.expressions.RLike | regexp_like | SELECT regexp_like('%SystemDrive%\Users\John', '%SystemDrive%\\Users.*') | struct<REGEXP_LIKE(%SystemDrive%UsersJohn, %SystemDrive%\Users.*):boolean> |
|
||||
| org.apache.spark.sql.catalyst.expressions.RLike | rlike | SELECT rlike('%SystemDrive%\Users\John', '%SystemDrive%\\Users.*') | struct<RLIKE(%SystemDrive%UsersJohn, %SystemDrive%\Users.*):boolean> |
|
||||
| org.apache.spark.sql.catalyst.expressions.RaiseError | raise_error | SELECT raise_error('custom error message') | struct<raise_error(custom error message):null> |
|
||||
| org.apache.spark.sql.catalyst.expressions.RaiseError | raise_error | SELECT raise_error('custom error message') | struct<raise_error(custom error message):void> |
|
||||
| org.apache.spark.sql.catalyst.expressions.Rand | rand | SELECT rand() | struct<rand():double> |
|
||||
| org.apache.spark.sql.catalyst.expressions.Rand | random | SELECT random() | struct<rand():double> |
|
||||
| org.apache.spark.sql.catalyst.expressions.Randn | randn | SELECT randn() | struct<randn():double> |
|
||||
|
@ -366,4 +366,4 @@
|
|||
| org.apache.spark.sql.catalyst.expressions.xml.XPathList | xpath | SELECT xpath('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>','a/b/text()') | struct<xpath(<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>, a/b/text()):array<string>> |
|
||||
| org.apache.spark.sql.catalyst.expressions.xml.XPathLong | xpath_long | SELECT xpath_long('<a><b>1</b><b>2</b></a>', 'sum(a/b)') | struct<xpath_long(<a><b>1</b><b>2</b></a>, sum(a/b)):bigint> |
|
||||
| org.apache.spark.sql.catalyst.expressions.xml.XPathShort | xpath_short | SELECT xpath_short('<a><b>1</b><b>2</b></a>', 'sum(a/b)') | struct<xpath_short(<a><b>1</b><b>2</b></a>, sum(a/b)):smallint> |
|
||||
| org.apache.spark.sql.catalyst.expressions.xml.XPathString | xpath_string | SELECT xpath_string('<a><b>b</b><c>cc</c></a>','a/c') | struct<xpath_string(<a><b>b</b><c>cc</c></a>, a/c):string> |
|
||||
| org.apache.spark.sql.catalyst.expressions.xml.XPathString | xpath_string | SELECT xpath_string('<a><b>b</b><c>cc</c></a>','a/c') | struct<xpath_string(<a><b>b</b><c>cc</c></a>, a/c):string> |
|
|
@ -5,7 +5,7 @@
|
|||
-- !query
|
||||
select null, Null, nUll
|
||||
-- !query schema
|
||||
struct<NULL:null,NULL:null,NULL:null>
|
||||
struct<NULL:void,NULL:void,NULL:void>
|
||||
-- !query output
|
||||
NULL NULL NULL
|
||||
|
||||
|
|
|
@ -74,7 +74,7 @@ select left(null, -2)
|
|||
struct<>
|
||||
-- !query output
|
||||
org.apache.spark.sql.AnalysisException
|
||||
cannot resolve 'substring(NULL, 1, -2)' due to data type mismatch: argument 1 requires (string or binary) type, however, 'NULL' is of null type.; line 1 pos 7
|
||||
cannot resolve 'substring(NULL, 1, -2)' due to data type mismatch: argument 1 requires (string or binary) type, however, 'NULL' is of void type.; line 1 pos 7
|
||||
|
||||
|
||||
-- !query
|
||||
|
@ -101,7 +101,7 @@ select right(null, -2)
|
|||
struct<>
|
||||
-- !query output
|
||||
org.apache.spark.sql.AnalysisException
|
||||
cannot resolve 'substring(NULL, (- -2), 2147483647)' due to data type mismatch: argument 1 requires (string or binary) type, however, 'NULL' is of null type.; line 1 pos 7
|
||||
cannot resolve 'substring(NULL, (- -2), 2147483647)' due to data type mismatch: argument 1 requires (string or binary) type, however, 'NULL' is of void type.; line 1 pos 7
|
||||
|
||||
|
||||
-- !query
|
||||
|
|
|
@ -49,7 +49,7 @@ two 2
|
|||
-- !query
|
||||
select * from values ("one", null), ("two", null) as data(a, b)
|
||||
-- !query schema
|
||||
struct<a:string,b:null>
|
||||
struct<a:string,b:void>
|
||||
-- !query output
|
||||
one NULL
|
||||
two NULL
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
-- !query
|
||||
select null, Null, nUll
|
||||
-- !query schema
|
||||
struct<NULL:null,NULL:null,NULL:null>
|
||||
struct<NULL:void,NULL:void,NULL:void>
|
||||
-- !query output
|
||||
NULL NULL NULL
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ select typeof(null)
|
|||
-- !query schema
|
||||
struct<typeof(NULL):string>
|
||||
-- !query output
|
||||
null
|
||||
void
|
||||
|
||||
|
||||
-- !query
|
||||
|
@ -61,7 +61,7 @@ array<int> map<int,int> struct<a:int,b:string>
|
|||
-- !query
|
||||
SELECT assert_true(true), assert_true(boolean(1))
|
||||
-- !query schema
|
||||
struct<assert_true(true, 'true' is not true!):null,assert_true(1, 'cast(1 as boolean)' is not true!):null>
|
||||
struct<assert_true(true, 'true' is not true!):void,assert_true(1, 'cast(1 as boolean)' is not true!):void>
|
||||
-- !query output
|
||||
NULL NULL
|
||||
|
||||
|
|
|
@ -308,7 +308,7 @@ struct<1:int>
|
|||
-- !query
|
||||
select foo.* from (select null) as foo
|
||||
-- !query schema
|
||||
struct<NULL:null>
|
||||
struct<NULL:void>
|
||||
-- !query output
|
||||
NULL
|
||||
|
||||
|
@ -316,7 +316,7 @@ NULL
|
|||
-- !query
|
||||
select foo.* from (select 'xyzzy',1,null) as foo
|
||||
-- !query schema
|
||||
struct<xyzzy:string,1:int,NULL:null>
|
||||
struct<xyzzy:string,1:int,NULL:void>
|
||||
-- !query output
|
||||
xyzzy 1 NULL
|
||||
|
||||
|
|
|
@ -130,7 +130,7 @@ select concat_ws(',',10,20,null,30)
|
|||
struct<>
|
||||
-- !query output
|
||||
org.apache.spark.sql.AnalysisException
|
||||
cannot resolve 'concat_ws(',', 10, 20, NULL, 30)' due to data type mismatch: argument 2 requires (array<string> or string) type, however, '10' is of int type. argument 3 requires (array<string> or string) type, however, '20' is of int type. argument 4 requires (array<string> or string) type, however, 'NULL' is of null type. argument 5 requires (array<string> or string) type, however, '30' is of int type.; line 1 pos 7
|
||||
cannot resolve 'concat_ws(',', 10, 20, NULL, 30)' due to data type mismatch: argument 2 requires (array<string> or string) type, however, '10' is of int type. argument 3 requires (array<string> or string) type, however, '20' is of int type. argument 4 requires (array<string> or string) type, however, 'NULL' is of void type. argument 5 requires (array<string> or string) type, however, '30' is of int type.; line 1 pos 7
|
||||
|
||||
|
||||
-- !query
|
||||
|
@ -139,7 +139,7 @@ select concat_ws('',10,20,null,30)
|
|||
struct<>
|
||||
-- !query output
|
||||
org.apache.spark.sql.AnalysisException
|
||||
cannot resolve 'concat_ws('', 10, 20, NULL, 30)' due to data type mismatch: argument 2 requires (array<string> or string) type, however, '10' is of int type. argument 3 requires (array<string> or string) type, however, '20' is of int type. argument 4 requires (array<string> or string) type, however, 'NULL' is of null type. argument 5 requires (array<string> or string) type, however, '30' is of int type.; line 1 pos 7
|
||||
cannot resolve 'concat_ws('', 10, 20, NULL, 30)' due to data type mismatch: argument 2 requires (array<string> or string) type, however, '10' is of int type. argument 3 requires (array<string> or string) type, however, '20' is of int type. argument 4 requires (array<string> or string) type, however, 'NULL' is of void type. argument 5 requires (array<string> or string) type, however, '30' is of int type.; line 1 pos 7
|
||||
|
||||
|
||||
-- !query
|
||||
|
@ -148,7 +148,7 @@ select concat_ws(NULL,10,20,null,30) is null
|
|||
struct<>
|
||||
-- !query output
|
||||
org.apache.spark.sql.AnalysisException
|
||||
cannot resolve 'concat_ws(CAST(NULL AS STRING), 10, 20, NULL, 30)' due to data type mismatch: argument 2 requires (array<string> or string) type, however, '10' is of int type. argument 3 requires (array<string> or string) type, however, '20' is of int type. argument 4 requires (array<string> or string) type, however, 'NULL' is of null type. argument 5 requires (array<string> or string) type, however, '30' is of int type.; line 1 pos 7
|
||||
cannot resolve 'concat_ws(CAST(NULL AS STRING), 10, 20, NULL, 30)' due to data type mismatch: argument 2 requires (array<string> or string) type, however, '10' is of int type. argument 3 requires (array<string> or string) type, however, '20' is of int type. argument 4 requires (array<string> or string) type, however, 'NULL' is of void type. argument 5 requires (array<string> or string) type, however, '30' is of int type.; line 1 pos 7
|
||||
|
||||
|
||||
-- !query
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
-- !query
|
||||
SELECT ifnull(null, 'x'), ifnull('y', 'x'), ifnull(null, null)
|
||||
-- !query schema
|
||||
struct<ifnull(NULL, x):string,ifnull(y, x):string,ifnull(NULL, NULL):null>
|
||||
struct<ifnull(NULL, x):string,ifnull(y, x):string,ifnull(NULL, NULL):void>
|
||||
-- !query output
|
||||
x y NULL
|
||||
|
||||
|
@ -21,7 +21,7 @@ NULL x
|
|||
-- !query
|
||||
SELECT nvl(null, 'x'), nvl('y', 'x'), nvl(null, null)
|
||||
-- !query schema
|
||||
struct<nvl(NULL, x):string,nvl(y, x):string,nvl(NULL, NULL):null>
|
||||
struct<nvl(NULL, x):string,nvl(y, x):string,nvl(NULL, NULL):void>
|
||||
-- !query output
|
||||
x y NULL
|
||||
|
||||
|
@ -29,7 +29,7 @@ x y NULL
|
|||
-- !query
|
||||
SELECT nvl2(null, 'x', 'y'), nvl2('n', 'x', 'y'), nvl2(null, null, null)
|
||||
-- !query schema
|
||||
struct<nvl2(NULL, x, y):string,nvl2(n, x, y):string,nvl2(NULL, NULL, NULL):null>
|
||||
struct<nvl2(NULL, x, y):string,nvl2(n, x, y):string,nvl2(NULL, NULL, NULL):void>
|
||||
-- !query output
|
||||
y x NULL
|
||||
|
||||
|
|
|
@ -89,7 +89,7 @@ Table-valued function range with alternatives:
|
|||
range(start: long, end: long, step: long)
|
||||
range(start: long, end: long)
|
||||
range(end: long)
|
||||
cannot be applied to (integer, null): Incompatible input data type. Expected: long; Found: null; line 1 pos 14
|
||||
cannot be applied to (integer, void): Incompatible input data type. Expected: long; Found: void; line 1 pos 14
|
||||
|
||||
|
||||
-- !query
|
||||
|
|
|
@ -49,7 +49,7 @@ two 2
|
|||
-- !query
|
||||
select udf(a), b from values ("one", null), ("two", null) as data(a, b)
|
||||
-- !query schema
|
||||
struct<udf(a):string,b:null>
|
||||
struct<udf(a):string,b:void>
|
||||
-- !query output
|
||||
one NULL
|
||||
two NULL
|
||||
|
|
|
@ -421,7 +421,7 @@ class FileBasedDataSourceSuite extends QueryTest
|
|||
""
|
||||
}
|
||||
def errorMessage(format: String): String = {
|
||||
s"$format data source does not support null data type."
|
||||
s"$format data source does not support void data type."
|
||||
}
|
||||
withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> useV1List) {
|
||||
withTempDir { dir =>
|
||||
|
|
|
@ -375,7 +375,6 @@ object SparkExecuteStatementOperation {
|
|||
def getTableSchema(structType: StructType): TableSchema = {
|
||||
val schema = structType.map { field =>
|
||||
val attrTypeString = field.dataType match {
|
||||
case NullType => "void"
|
||||
case CalendarIntervalType => StringType.catalogString
|
||||
case _: YearMonthIntervalType => "interval_year_month"
|
||||
case _: DayTimeIntervalType => "interval_day_time"
|
||||
|
|
|
@ -1000,7 +1000,7 @@ private[hive] object HiveClientImpl extends Logging {
|
|||
// When reading data in parquet, orc, or avro file format with string type for char,
|
||||
// the trailing spaces may be lost if we are not going to pad it.
|
||||
val typeString = CharVarcharUtils.getRawTypeString(c.metadata)
|
||||
.getOrElse(HiveVoidType.replaceVoidType(c.dataType).catalogString)
|
||||
.getOrElse(c.dataType.catalogString)
|
||||
new FieldSchema(c.name, typeString, c.getComment().orNull)
|
||||
}
|
||||
|
||||
|
@ -1278,22 +1278,3 @@ private[hive] object HiveClientImpl extends Logging {
|
|||
hiveConf
|
||||
}
|
||||
}
|
||||
|
||||
private[hive] case object HiveVoidType extends DataType {
|
||||
override def defaultSize: Int = 1
|
||||
override def asNullable: DataType = HiveVoidType
|
||||
override def simpleString: String = "void"
|
||||
|
||||
def replaceVoidType(dt: DataType): DataType = dt match {
|
||||
case ArrayType(et, nullable) =>
|
||||
ArrayType(replaceVoidType(et), nullable)
|
||||
case MapType(kt, vt, nullable) =>
|
||||
MapType(replaceVoidType(kt), replaceVoidType(vt), nullable)
|
||||
case StructType(fields) =>
|
||||
StructType(fields.map { field =>
|
||||
field.copy(dataType = replaceVoidType(field.dataType))
|
||||
})
|
||||
case _: NullType => HiveVoidType
|
||||
case _ => dt
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2393,12 +2393,12 @@ class HiveDDLSuite
|
|||
}
|
||||
}
|
||||
|
||||
test("SPARK-36241: support creating tables with null datatype") {
|
||||
// CTAS with null type
|
||||
test("SPARK-36241: support creating tables with void datatype") {
|
||||
// CTAS with void type
|
||||
withTable("t1", "t2", "t3") {
|
||||
assertAnalysisError(
|
||||
"CREATE TABLE t1 USING PARQUET AS SELECT NULL AS null_col",
|
||||
"Parquet data source does not support null data type")
|
||||
"Parquet data source does not support void data type")
|
||||
|
||||
assertAnalysisError(
|
||||
"CREATE TABLE t2 STORED AS PARQUET AS SELECT null as null_col",
|
||||
|
@ -2408,11 +2408,11 @@ class HiveDDLSuite
|
|||
checkAnswer(sql("SELECT * FROM t3"), Row(null))
|
||||
}
|
||||
|
||||
// Create table with null type
|
||||
// Create table with void type
|
||||
withTable("t1", "t2", "t3", "t4") {
|
||||
assertAnalysisError(
|
||||
"CREATE TABLE t1 (v VOID) USING PARQUET",
|
||||
"Parquet data source does not support null data type")
|
||||
"Parquet data source does not support void data type")
|
||||
|
||||
assertAnalysisError(
|
||||
"CREATE TABLE t2 (v VOID) STORED AS PARQUET",
|
||||
|
@ -2425,7 +2425,7 @@ class HiveDDLSuite
|
|||
checkAnswer(sql("SELECT * FROM t4"), Seq.empty)
|
||||
}
|
||||
|
||||
// Create table with null type using spark.catalog.createTable
|
||||
// Create table with void type using spark.catalog.createTable
|
||||
withTable("t") {
|
||||
val schema = new StructType().add("c", NullType)
|
||||
spark.catalog.createTable(
|
||||
|
|
|
@ -121,7 +121,7 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton {
|
|||
msg = intercept[AnalysisException] {
|
||||
sql("select null").write.mode("overwrite").orc(orcDir)
|
||||
}.getMessage
|
||||
assert(msg.contains("ORC data source does not support null data type."))
|
||||
assert(msg.contains("ORC data source does not support void data type."))
|
||||
|
||||
msg = intercept[AnalysisException] {
|
||||
spark.udf.register("testType", () => new IntervalData())
|
||||
|
|
Loading…
Reference in a new issue