[SPARK-4856] [SQL] NullType instead of StringType when sampling against empty string or null value
``` TestSQLContext.sparkContext.parallelize( """{"ip":"27.31.100.29","headers":{"Host":"1.abc.com","Charset":"UTF-8"}}""" :: """{"ip":"27.31.100.29","headers":{}}""" :: """{"ip":"27.31.100.29","headers":""}""" :: Nil) ``` As the empty string (the "headers") will be considered a String at the beginning (in lines 2 and 3), it ignores the real nested data type (the struct type "headers" in line 1), and also takes the "headers" in line 1 as String Type, which is not what we expected. Author: Cheng Hao <hao.cheng@intel.com> Closes #3708 from chenghao-intel/json and squashes the following commits: e7a72e9 [Cheng Hao] add more concise unit test 853de51 [Cheng Hao] NullType instead of StringType when sampling against empty string or null value
This commit is contained in:
parent
19c0faad6d
commit
8d0d2a65eb
|
@ -263,6 +263,8 @@ private[sql] object JsonRDD extends Logging {
|
||||||
val elementType = typeOfArray(array)
|
val elementType = typeOfArray(array)
|
||||||
buildKeyPathForInnerStructs(array, elementType) :+ (key, elementType)
|
buildKeyPathForInnerStructs(array, elementType) :+ (key, elementType)
|
||||||
}
|
}
|
||||||
|
// we couldn't tell what the type is if the value is null or empty string
|
||||||
|
case (key: String, value) if value == "" || value == null => (key, NullType) :: Nil
|
||||||
case (key: String, value) => (key, typeOfPrimitiveValue(value)) :: Nil
|
case (key: String, value) => (key, typeOfPrimitiveValue(value)) :: Nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -400,13 +402,13 @@ private[sql] object JsonRDD extends Logging {
|
||||||
} else {
|
} else {
|
||||||
desiredType match {
|
desiredType match {
|
||||||
case StringType => toString(value)
|
case StringType => toString(value)
|
||||||
|
case _ if value == null || value == "" => null // guard the non string type
|
||||||
case IntegerType => value.asInstanceOf[IntegerType.JvmType]
|
case IntegerType => value.asInstanceOf[IntegerType.JvmType]
|
||||||
case LongType => toLong(value)
|
case LongType => toLong(value)
|
||||||
case DoubleType => toDouble(value)
|
case DoubleType => toDouble(value)
|
||||||
case DecimalType() => toDecimal(value)
|
case DecimalType() => toDecimal(value)
|
||||||
case BooleanType => value.asInstanceOf[BooleanType.JvmType]
|
case BooleanType => value.asInstanceOf[BooleanType.JvmType]
|
||||||
case NullType => null
|
case NullType => null
|
||||||
|
|
||||||
case ArrayType(elementType, _) =>
|
case ArrayType(elementType, _) =>
|
||||||
value.asInstanceOf[Seq[Any]].map(enforceCorrectType(_, elementType))
|
value.asInstanceOf[Seq[Any]].map(enforceCorrectType(_, elementType))
|
||||||
case struct: StructType => asRow(value.asInstanceOf[Map[String, Any]], struct)
|
case struct: StructType => asRow(value.asInstanceOf[Map[String, Any]], struct)
|
||||||
|
|
|
@ -193,6 +193,25 @@ class JsonSuite extends QueryTest {
|
||||||
StringType)
|
StringType)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
test("Complex field and type inferring with null in sampling") {
|
||||||
|
val jsonSchemaRDD = jsonRDD(jsonNullStruct)
|
||||||
|
val expectedSchema = StructType(
|
||||||
|
StructField("headers", StructType(
|
||||||
|
StructField("Charset", StringType, true) ::
|
||||||
|
StructField("Host", StringType, true) :: Nil)
|
||||||
|
, true) ::
|
||||||
|
StructField("ip", StringType, true) ::
|
||||||
|
StructField("nullstr", StringType, true):: Nil)
|
||||||
|
|
||||||
|
assert(expectedSchema === jsonSchemaRDD.schema)
|
||||||
|
jsonSchemaRDD.registerTempTable("jsonTable")
|
||||||
|
|
||||||
|
checkAnswer(
|
||||||
|
sql("select nullstr, headers.Host from jsonTable"),
|
||||||
|
Seq(Row("", "1.abc.com"), Row("", null), Row("", null), Row(null, null))
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
test("Primitive field and type inferring") {
|
test("Primitive field and type inferring") {
|
||||||
val jsonSchemaRDD = jsonRDD(primitiveFieldAndType)
|
val jsonSchemaRDD = jsonRDD(primitiveFieldAndType)
|
||||||
|
|
||||||
|
|
|
@ -43,6 +43,13 @@ object TestJsonData {
|
||||||
"""{"num_num_1":21474836570, "num_num_2":1.1, "num_num_3": 21474836470,
|
"""{"num_num_1":21474836570, "num_num_2":1.1, "num_num_3": 21474836470,
|
||||||
"num_bool":null, "num_str":92233720368547758070, "str_bool":null}""" :: Nil)
|
"num_bool":null, "num_str":92233720368547758070, "str_bool":null}""" :: Nil)
|
||||||
|
|
||||||
|
val jsonNullStruct =
|
||||||
|
TestSQLContext.sparkContext.parallelize(
|
||||||
|
"""{"nullstr":"","ip":"27.31.100.29","headers":{"Host":"1.abc.com","Charset":"UTF-8"}}""" ::
|
||||||
|
"""{"nullstr":"","ip":"27.31.100.29","headers":{}}""" ::
|
||||||
|
"""{"nullstr":"","ip":"27.31.100.29","headers":""}""" ::
|
||||||
|
"""{"nullstr":null,"ip":"27.31.100.29","headers":null}""" :: Nil)
|
||||||
|
|
||||||
val complexFieldValueTypeConflict =
|
val complexFieldValueTypeConflict =
|
||||||
TestSQLContext.sparkContext.parallelize(
|
TestSQLContext.sparkContext.parallelize(
|
||||||
"""{"num_struct":11, "str_array":[1, 2, 3],
|
"""{"num_struct":11, "str_array":[1, 2, 3],
|
||||||
|
|
Loading…
Reference in a new issue