[SPARK-36224][SQL] Use Void as the type name of NullType

### What changes were proposed in this pull request?
Change `NullType.simpleString` to "void", making "void" the formal type name of `NullType`.

### Why are the changes needed?
This PR is intended to address the type name discussion in PR #28833. Here are the reasons:
1. The type name of `NullType` is displayed everywhere, e.g. in schema strings, error messages, and documentation. Since it's not possible to hide it from users, we have to choose a proper name.
2. "void" is widely used as the type name for NULL in other systems, e.g. Hive and PostgreSQL.
3. Changing to "void" enables the `toDDL`/`fromDDL` round trip for `NullType` (i.e. it makes `from_json(col, schema.toDDL)` work); see the sketch below.
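
To make the round trip concrete, here is a minimal sketch (names and values are illustrative, assuming a Spark shell on a build with this change):
```scala
import org.apache.spark.sql.types._

// A schema containing NullType can now survive toDDL/fromDDL,
// because "void" parses as a type keyword while "null" did not.
val schema = new StructType().add("a", NullType).add("b", IntegerType)
val ddl = schema.toDDL // roughly "a VOID,b INT"
assert(StructType.fromDDL(ddl) == schema)
```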

### Does this PR introduce _any_ user-facing change?
Yes, the type name of NULL is changed from "null" to "void". For example:
```
scala> sql("select null as a, 1 as b").schema.catalogString
res5: String = struct<a:void,b:int>
```

### How was this patch tested?
Existing test cases.

Closes #33437 from linhongliu-db/SPARK-36224-void-type-name.

Authored-by: Linhong Liu <linhong.liu@databricks.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
commit 2f700773c2 (parent a98d919da4)
21 changed files with 43 additions and 52 deletions


@@ -511,8 +511,7 @@ class TypesTests(ReusedSQLTestCase):
     def test_parse_datatype_string(self):
         from pyspark.sql.types import _all_atomic_types, _parse_datatype_string
         for k, t in _all_atomic_types.items():
-            if t != NullType:
-                self.assertEqual(t(), _parse_datatype_string(k))
+            self.assertEqual(t(), _parse_datatype_string(k))
         self.assertEqual(IntegerType(), _parse_datatype_string("int"))
         self.assertEqual(DecimalType(1, 1), _parse_datatype_string("decimal(1 ,1)"))
         self.assertEqual(DecimalType(10, 1), _parse_datatype_string("decimal( 10,1 )"))


@@ -107,7 +107,9 @@ class NullType(DataType, metaclass=DataTypeSingleton):
     The data type representing None, used for the types that cannot be inferred.
     """
-    pass
+
+    @classmethod
+    def typeName(cls):
+        return 'void'


 class AtomicType(DataType):


@@ -195,6 +195,8 @@ object DataType {
     case FIXED_DECIMAL(precision, scale) => DecimalType(precision.toInt, scale.toInt)
     case CHAR_TYPE(length) => CharType(length.toInt)
     case VARCHAR_TYPE(length) => VarcharType(length.toInt)
+    // For backwards compatibility, previously the type name of NullType is "null"
+    case "null" => NullType
     case other => otherTypes.getOrElse(
       other,
       throw new IllegalArgumentException(
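
A quick sketch of the compatibility behavior this case preserves (the assertions reflect the intent of the change, using the public `DataType.fromJson` API):
```scala
import org.apache.spark.sql.types.{DataType, NullType}

// Schemas persisted with the old name still deserialize,
// and the new canonical name parses as well.
assert(DataType.fromJson("\"null\"") == NullType)
assert(DataType.fromJson("\"void\"") == NullType)
```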


@@ -32,6 +32,8 @@ class NullType private() extends DataType {
   override def defaultSize: Int = 1

   private[spark] override def asNullable: NullType = this
+
+  override def typeName: String = "void"
 }

 /**
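
Because `simpleString`, `catalogString`, and `sql` all default to forms of `typeName`, this single override renames the type everywhere it is displayed. A sketch of the expected values:
```scala
import org.apache.spark.sql.types.NullType

NullType.typeName      // "void"
NullType.simpleString  // "void" (defaults to typeName)
NullType.catalogString // "void" (defaults to simpleString)
NullType.sql           // "VOID" (upper-cased simpleString)
```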


@@ -183,6 +183,10 @@ class DataTypeSuite extends SparkFunSuite {
     assert(!arrayType.existsRecursively(_.isInstanceOf[IntegerType]))
   }

+  test("SPARK-36224: Backwards compatibility test for NullType.json") {
+    assert(DataType.fromJson("\"null\"") == NullType)
+  }
+
   def checkDataTypeFromJson(dataType: DataType): Unit = {
     test(s"from Json - $dataType") {
       assert(DataType.fromJson(dataType.json) === dataType)

@@ -198,6 +202,7 @@ class DataTypeSuite extends SparkFunSuite {
   }

   checkDataTypeFromJson(NullType)
+  checkDataTypeFromDDL(NullType)

   checkDataTypeFromJson(BooleanType)
   checkDataTypeFromDDL(BooleanType)

@@ -424,6 +429,7 @@ class DataTypeSuite extends SparkFunSuite {
       i => StructField(s"col$i", IntegerType, nullable = true)
     })

+  checkCatalogString(NullType)
   checkCatalogString(BooleanType)
   checkCatalogString(ByteType)
   checkCatalogString(ShortType)


@@ -34,7 +34,7 @@
 | org.apache.spark.sql.catalyst.expressions.Ascii | ascii | SELECT ascii('222') | struct<ascii(222):int> |
 | org.apache.spark.sql.catalyst.expressions.Asin | asin | SELECT asin(0) | struct<ASIN(0):double> |
 | org.apache.spark.sql.catalyst.expressions.Asinh | asinh | SELECT asinh(0) | struct<ASINH(0):double> |
-| org.apache.spark.sql.catalyst.expressions.AssertTrue | assert_true | SELECT assert_true(0 < 1) | struct<assert_true((0 < 1), '(0 < 1)' is not true!):null> |
+| org.apache.spark.sql.catalyst.expressions.AssertTrue | assert_true | SELECT assert_true(0 < 1) | struct<assert_true((0 < 1), '(0 < 1)' is not true!):void> |
 | org.apache.spark.sql.catalyst.expressions.Atan | atan | SELECT atan(0) | struct<ATAN(0):double> |
 | org.apache.spark.sql.catalyst.expressions.Atan2 | atan2 | SELECT atan2(0, 0) | struct<ATAN2(0, 0):double> |
 | org.apache.spark.sql.catalyst.expressions.Atanh | atanh | SELECT atanh(0) | struct<ATANH(0):double> |

@@ -223,7 +223,7 @@
 | org.apache.spark.sql.catalyst.expressions.RLike | regexp | SELECT regexp('%SystemDrive%\Users\John', '%SystemDrive%\\Users.*') | struct<REGEXP(%SystemDrive%UsersJohn, %SystemDrive%\Users.*):boolean> |
 | org.apache.spark.sql.catalyst.expressions.RLike | regexp_like | SELECT regexp_like('%SystemDrive%\Users\John', '%SystemDrive%\\Users.*') | struct<REGEXP_LIKE(%SystemDrive%UsersJohn, %SystemDrive%\Users.*):boolean> |
 | org.apache.spark.sql.catalyst.expressions.RLike | rlike | SELECT rlike('%SystemDrive%\Users\John', '%SystemDrive%\\Users.*') | struct<RLIKE(%SystemDrive%UsersJohn, %SystemDrive%\Users.*):boolean> |
-| org.apache.spark.sql.catalyst.expressions.RaiseError | raise_error | SELECT raise_error('custom error message') | struct<raise_error(custom error message):null> |
+| org.apache.spark.sql.catalyst.expressions.RaiseError | raise_error | SELECT raise_error('custom error message') | struct<raise_error(custom error message):void> |
 | org.apache.spark.sql.catalyst.expressions.Rand | rand | SELECT rand() | struct<rand():double> |
 | org.apache.spark.sql.catalyst.expressions.Rand | random | SELECT random() | struct<rand():double> |
 | org.apache.spark.sql.catalyst.expressions.Randn | randn | SELECT randn() | struct<randn():double> |

@@ -366,4 +366,4 @@
 | org.apache.spark.sql.catalyst.expressions.xml.XPathList | xpath | SELECT xpath('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>','a/b/text()') | struct<xpath(<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>, a/b/text()):array<string>> |
 | org.apache.spark.sql.catalyst.expressions.xml.XPathLong | xpath_long | SELECT xpath_long('<a><b>1</b><b>2</b></a>', 'sum(a/b)') | struct<xpath_long(<a><b>1</b><b>2</b></a>, sum(a/b)):bigint> |
 | org.apache.spark.sql.catalyst.expressions.xml.XPathShort | xpath_short | SELECT xpath_short('<a><b>1</b><b>2</b></a>', 'sum(a/b)') | struct<xpath_short(<a><b>1</b><b>2</b></a>, sum(a/b)):smallint> |
-| org.apache.spark.sql.catalyst.expressions.xml.XPathString | xpath_string | SELECT xpath_string('<a><b>b</b><c>cc</c></a>','a/c') | struct<xpath_string(<a><b>b</b><c>cc</c></a>, a/c):string> |
+| org.apache.spark.sql.catalyst.expressions.xml.XPathString | xpath_string | SELECT xpath_string('<a><b>b</b><c>cc</c></a>','a/c') | struct<xpath_string(<a><b>b</b><c>cc</c></a>, a/c):string> |


@@ -5,7 +5,7 @@
 -- !query
 select null, Null, nUll
 -- !query schema
-struct<NULL:null,NULL:null,NULL:null>
+struct<NULL:void,NULL:void,NULL:void>
 -- !query output
 NULL NULL NULL


@@ -74,7 +74,7 @@ select left(null, -2)
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-cannot resolve 'substring(NULL, 1, -2)' due to data type mismatch: argument 1 requires (string or binary) type, however, 'NULL' is of null type.; line 1 pos 7
+cannot resolve 'substring(NULL, 1, -2)' due to data type mismatch: argument 1 requires (string or binary) type, however, 'NULL' is of void type.; line 1 pos 7


 -- !query

@@ -101,7 +101,7 @@ select right(null, -2)
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-cannot resolve 'substring(NULL, (- -2), 2147483647)' due to data type mismatch: argument 1 requires (string or binary) type, however, 'NULL' is of null type.; line 1 pos 7
+cannot resolve 'substring(NULL, (- -2), 2147483647)' due to data type mismatch: argument 1 requires (string or binary) type, however, 'NULL' is of void type.; line 1 pos 7


 -- !query


@@ -49,7 +49,7 @@ two 2
 -- !query
 select * from values ("one", null), ("two", null) as data(a, b)
 -- !query schema
-struct<a:string,b:null>
+struct<a:string,b:void>
 -- !query output
 one NULL
 two NULL


@@ -5,7 +5,7 @@
 -- !query
 select null, Null, nUll
 -- !query schema
-struct<NULL:null,NULL:null,NULL:null>
+struct<NULL:void,NULL:void,NULL:void>
 -- !query output
 NULL NULL NULL


@@ -7,7 +7,7 @@ select typeof(null)
 -- !query schema
 struct<typeof(NULL):string>
 -- !query output
-null
+void


 -- !query

@@ -61,7 +61,7 @@ array<int> map<int,int> struct<a:int,b:string>
 -- !query
 SELECT assert_true(true), assert_true(boolean(1))
 -- !query schema
-struct<assert_true(true, 'true' is not true!):null,assert_true(1, 'cast(1 as boolean)' is not true!):null>
+struct<assert_true(true, 'true' is not true!):void,assert_true(1, 'cast(1 as boolean)' is not true!):void>
 -- !query output
 NULL NULL
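
The same rename as seen from a shell, sketched from the golden output above (assuming a `spark` session):
```scala
spark.sql("SELECT typeof(NULL)").show()
// +------------+
// |typeof(NULL)|
// +------------+
// |        void|
// +------------+
```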


@@ -308,7 +308,7 @@ struct<1:int>
 -- !query
 select foo.* from (select null) as foo
 -- !query schema
-struct<NULL:null>
+struct<NULL:void>
 -- !query output
 NULL

@@ -316,7 +316,7 @@ NULL
 -- !query
 select foo.* from (select 'xyzzy',1,null) as foo
 -- !query schema
-struct<xyzzy:string,1:int,NULL:null>
+struct<xyzzy:string,1:int,NULL:void>
 -- !query output
 xyzzy 1 NULL


@@ -130,7 +130,7 @@ select concat_ws(',',10,20,null,30)
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-cannot resolve 'concat_ws(',', 10, 20, NULL, 30)' due to data type mismatch: argument 2 requires (array<string> or string) type, however, '10' is of int type. argument 3 requires (array<string> or string) type, however, '20' is of int type. argument 4 requires (array<string> or string) type, however, 'NULL' is of null type. argument 5 requires (array<string> or string) type, however, '30' is of int type.; line 1 pos 7
+cannot resolve 'concat_ws(',', 10, 20, NULL, 30)' due to data type mismatch: argument 2 requires (array<string> or string) type, however, '10' is of int type. argument 3 requires (array<string> or string) type, however, '20' is of int type. argument 4 requires (array<string> or string) type, however, 'NULL' is of void type. argument 5 requires (array<string> or string) type, however, '30' is of int type.; line 1 pos 7


 -- !query

@@ -139,7 +139,7 @@ select concat_ws('',10,20,null,30)
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-cannot resolve 'concat_ws('', 10, 20, NULL, 30)' due to data type mismatch: argument 2 requires (array<string> or string) type, however, '10' is of int type. argument 3 requires (array<string> or string) type, however, '20' is of int type. argument 4 requires (array<string> or string) type, however, 'NULL' is of null type. argument 5 requires (array<string> or string) type, however, '30' is of int type.; line 1 pos 7
+cannot resolve 'concat_ws('', 10, 20, NULL, 30)' due to data type mismatch: argument 2 requires (array<string> or string) type, however, '10' is of int type. argument 3 requires (array<string> or string) type, however, '20' is of int type. argument 4 requires (array<string> or string) type, however, 'NULL' is of void type. argument 5 requires (array<string> or string) type, however, '30' is of int type.; line 1 pos 7


 -- !query

@@ -148,7 +148,7 @@ select concat_ws(NULL,10,20,null,30) is null
 struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
-cannot resolve 'concat_ws(CAST(NULL AS STRING), 10, 20, NULL, 30)' due to data type mismatch: argument 2 requires (array<string> or string) type, however, '10' is of int type. argument 3 requires (array<string> or string) type, however, '20' is of int type. argument 4 requires (array<string> or string) type, however, 'NULL' is of null type. argument 5 requires (array<string> or string) type, however, '30' is of int type.; line 1 pos 7
+cannot resolve 'concat_ws(CAST(NULL AS STRING), 10, 20, NULL, 30)' due to data type mismatch: argument 2 requires (array<string> or string) type, however, '10' is of int type. argument 3 requires (array<string> or string) type, however, '20' is of int type. argument 4 requires (array<string> or string) type, however, 'NULL' is of void type. argument 5 requires (array<string> or string) type, however, '30' is of int type.; line 1 pos 7


 -- !query


@@ -5,7 +5,7 @@
 -- !query
 SELECT ifnull(null, 'x'), ifnull('y', 'x'), ifnull(null, null)
 -- !query schema
-struct<ifnull(NULL, x):string,ifnull(y, x):string,ifnull(NULL, NULL):null>
+struct<ifnull(NULL, x):string,ifnull(y, x):string,ifnull(NULL, NULL):void>
 -- !query output
 x y NULL

@@ -21,7 +21,7 @@ NULL x
 -- !query
 SELECT nvl(null, 'x'), nvl('y', 'x'), nvl(null, null)
 -- !query schema
-struct<nvl(NULL, x):string,nvl(y, x):string,nvl(NULL, NULL):null>
+struct<nvl(NULL, x):string,nvl(y, x):string,nvl(NULL, NULL):void>
 -- !query output
 x y NULL

@@ -29,7 +29,7 @@ x y NULL
 -- !query
 SELECT nvl2(null, 'x', 'y'), nvl2('n', 'x', 'y'), nvl2(null, null, null)
 -- !query schema
-struct<nvl2(NULL, x, y):string,nvl2(n, x, y):string,nvl2(NULL, NULL, NULL):null>
+struct<nvl2(NULL, x, y):string,nvl2(n, x, y):string,nvl2(NULL, NULL, NULL):void>
 -- !query output
 y x NULL


@@ -89,7 +89,7 @@ Table-valued function range with alternatives:
 range(start: long, end: long, step: long)
 range(start: long, end: long)
 range(end: long)
-cannot be applied to (integer, null): Incompatible input data type. Expected: long; Found: null; line 1 pos 14
+cannot be applied to (integer, void): Incompatible input data type. Expected: long; Found: void; line 1 pos 14


 -- !query


@@ -49,7 +49,7 @@ two 2
 -- !query
 select udf(a), b from values ("one", null), ("two", null) as data(a, b)
 -- !query schema
-struct<udf(a):string,b:null>
+struct<udf(a):string,b:void>
 -- !query output
 one NULL
 two NULL


@@ -421,7 +421,7 @@ class FileBasedDataSourceSuite extends QueryTest
         ""
       }
       def errorMessage(format: String): String = {
-        s"$format data source does not support null data type."
+        s"$format data source does not support void data type."
       }
       withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> useV1List) {
         withTempDir { dir =>
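
Only the error wording changes here; writing a void column to these formats still fails. A test-style sketch (assumes ScalaTest's `intercept` and a `spark` session; the path is illustrative):
```scala
import org.apache.spark.sql.AnalysisException

val err = intercept[AnalysisException] {
  spark.sql("SELECT null AS c").write.mode("overwrite").parquet("/tmp/void_demo")
}
assert(err.getMessage.contains("Parquet data source does not support void data type"))
```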


@@ -375,7 +375,6 @@ object SparkExecuteStatementOperation {
   def getTableSchema(structType: StructType): TableSchema = {
     val schema = structType.map { field =>
       val attrTypeString = field.dataType match {
-        case NullType => "void"
         case CalendarIntervalType => StringType.catalogString
         case _: YearMonthIntervalType => "interval_year_month"
         case _: DayTimeIntervalType => "interval_day_time"


@@ -1000,7 +1000,7 @@ private[hive] object HiveClientImpl extends Logging {
       // When reading data in parquet, orc, or avro file format with string type for char,
       // the tailing spaces may lost if we are not going to pad it.
       val typeString = CharVarcharUtils.getRawTypeString(c.metadata)
-        .getOrElse(HiveVoidType.replaceVoidType(c.dataType).catalogString)
+        .getOrElse(c.dataType.catalogString)
       new FieldSchema(c.name, typeString, c.getComment().orNull)
     }

@@ -1278,22 +1278,3 @@ private[hive] object HiveClientImpl extends Logging {
     hiveConf
   }
 }
-
-private[hive] case object HiveVoidType extends DataType {
-  override def defaultSize: Int = 1
-  override def asNullable: DataType = HiveVoidType
-  override def simpleString: String = "void"
-
-  def replaceVoidType(dt: DataType): DataType = dt match {
-    case ArrayType(et, nullable) =>
-      ArrayType(replaceVoidType(et), nullable)
-    case MapType(kt, vt, nullable) =>
-      MapType(replaceVoidType(kt), replaceVoidType(vt), nullable)
-    case StructType(fields) =>
-      StructType(fields.map { field =>
-        field.copy(dataType = replaceVoidType(field.dataType))
-      })
-    case _: NullType => HiveVoidType
-    case _ => dt
-  }
-}
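
The removed `HiveVoidType` shim existed only to rewrite `NullType` (possibly nested) into a type whose `catalogString` Hive accepts. With `typeName` now "void", `catalogString` already yields a valid Hive type string, so the rewrite is redundant. A sketch:
```scala
import org.apache.spark.sql.types._

// Nested NullType now renders as "void" with no rewriting needed.
val dt = MapType(StringType, ArrayType(NullType))
assert(dt.catalogString == "map<string,array<void>>")
```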


@@ -2393,12 +2393,12 @@ class HiveDDLSuite
     }
   }

-  test("SPARK-36241: support creating tables with null datatype") {
-    // CTAS with null type
+  test("SPARK-36241: support creating tables with void datatype") {
+    // CTAS with void type
     withTable("t1", "t2", "t3") {
       assertAnalysisError(
         "CREATE TABLE t1 USING PARQUET AS SELECT NULL AS null_col",
-        "Parquet data source does not support null data type")
+        "Parquet data source does not support void data type")

       assertAnalysisError(
         "CREATE TABLE t2 STORED AS PARQUET AS SELECT null as null_col",

@@ -2408,11 +2408,11 @@ class HiveDDLSuite
       checkAnswer(sql("SELECT * FROM t3"), Row(null))
     }

-    // Create table with null type
+    // Create table with void type
     withTable("t1", "t2", "t3", "t4") {
       assertAnalysisError(
         "CREATE TABLE t1 (v VOID) USING PARQUET",
-        "Parquet data source does not support null data type")
+        "Parquet data source does not support void data type")

       assertAnalysisError(
         "CREATE TABLE t2 (v VOID) STORED AS PARQUET",

@@ -2425,7 +2425,7 @@ class HiveDDLSuite
       checkAnswer(sql("SELECT * FROM t4"), Seq.empty)
     }

-    // Create table with null type using spark.catalog.createTable
+    // Create table with void type using spark.catalog.createTable
     withTable("t") {
       val schema = new StructType().add("c", NullType)
       spark.catalog.createTable(


@@ -121,7 +121,7 @@ class HiveOrcSourceSuite extends OrcSuite with TestHiveSingleton {
     msg = intercept[AnalysisException] {
       sql("select null").write.mode("overwrite").orc(orcDir)
     }.getMessage
-    assert(msg.contains("ORC data source does not support null data type."))
+    assert(msg.contains("ORC data source does not support void data type."))

     msg = intercept[AnalysisException] {
       spark.udf.register("testType", () => new IntervalData())