[SPARK-32130][SQL] Disable the JSON option inferTimestamp by default

### What changes were proposed in this pull request?
Set the JSON option `inferTimestamp` to `false` if a user doesn't pass it as a datasource option.

### Why are the changes needed?
To prevent a performance regression while inferring schemas from JSON with potential timestamp fields.

### Does this PR introduce _any_ user-facing change?
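Yes. By default, string values that match `timestampFormat` are now inferred as `StringType` instead of `TimestampType`; set the JSON option `inferTimestamp` to `true` to restore the previous behavior.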

### How was this patch tested?
- Modified existing tests in `JsonSuite` and `JsonInferSchemaSuite`.
- Regenerated results of `JsonBenchmark` in the environment:

| Item | Description |
| ---- | ----|
| Region | us-west-2 (Oregon) |
| Instance | r3.xlarge |
| AMI | ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server-20190722.1 (ami-06f2f779464715dc5) |
| Java | OpenJDK 64-Bit Server VM 1.8.0_252 and OpenJDK 64-Bit Server VM 11.0.7+10 |

Closes #28966 from MaxGekk/json-inferTimestamps-disable-by-default.

Authored-by: Max Gekk <max.gekk@gmail.com>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
This commit is contained in:
Max Gekk 2020-07-01 15:45:39 -07:00 committed by Dongjoon Hyun
parent 6edb20df83
commit bcf23307f4
6 changed files with 130 additions and 112 deletions

View file

@ -31,7 +31,11 @@ license: |
- In Spark 3.1, `from_unixtime`, `unix_timestamp`,`to_unix_timestamp`, `to_timestamp` and `to_date` will fail if the specified datetime pattern is invalid. In Spark 3.0 or earlier, they result `NULL`. - In Spark 3.1, `from_unixtime`, `unix_timestamp`,`to_unix_timestamp`, `to_timestamp` and `to_date` will fail if the specified datetime pattern is invalid. In Spark 3.0 or earlier, they result `NULL`.
- In Spark 3.1, casting numeric to timestamp will be forbidden by default. It's strongly recommended to use dedicated functions: TIMESTAMP_SECONDS, TIMESTAMP_MILLIS and TIMESTAMP_MICROS. Or you can set `spark.sql.legacy.allowCastNumericToTimestamp` to true to work around it. See more details in SPARK-31710. - In Spark 3.1, casting numeric to timestamp will be forbidden by default. It's strongly recommended to use dedicated functions: TIMESTAMP_SECONDS, TIMESTAMP_MILLIS and TIMESTAMP_MICROS. Or you can set `spark.sql.legacy.allowCastNumericToTimestamp` to true to work around it. See more details in SPARK-31710.
## Upgrading from Spark SQL 3.0 to 3.0.1
- In Spark 3.0, the JSON datasource and the JSON function `schema_of_json` infer TimestampType from string values if they match the pattern defined by the JSON option `timestampFormat`. Since version 3.0.1, timestamp type inference is disabled by default. Set the JSON option `inferTimestamp` to `true` to enable such type inference.
## Upgrading from Spark SQL 2.4 to 3.0 ## Upgrading from Spark SQL 2.4 to 3.0
### Dataset/DataFrame APIs ### Dataset/DataFrame APIs

View file

@ -133,7 +133,7 @@ private[sql] class JSONOptions(
* Enables inferring of TimestampType from strings matched to the timestamp pattern * Enables inferring of TimestampType from strings matched to the timestamp pattern
* defined by the timestampFormat option. * defined by the timestampFormat option.
*/ */
val inferTimestamp: Boolean = parameters.get("inferTimestamp").map(_.toBoolean).getOrElse(true) val inferTimestamp: Boolean = parameters.get("inferTimestamp").map(_.toBoolean).getOrElse(false)
/** Build a Jackson [[JsonFactory]] using JSON options. */ /** Build a Jackson [[JsonFactory]] using JSON options. */
def buildJsonFactory(): JsonFactory = { def buildJsonFactory(): JsonFactory = {

View file

@ -35,22 +35,29 @@ class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper {
assert(inferSchema.inferField(parser) === expectedType) assert(inferSchema.inferField(parser) === expectedType)
} }
def checkTimestampType(pattern: String, json: String): Unit = { def checkTimestampType(pattern: String, json: String, inferTimestamp: Boolean): Unit = {
checkType(Map("timestampFormat" -> pattern), json, TimestampType) checkType(
Map("timestampFormat" -> pattern, "inferTimestamp" -> inferTimestamp.toString),
json,
if (inferTimestamp) TimestampType else StringType)
} }
test("inferring timestamp type") { test("inferring timestamp type") {
Seq("legacy", "corrected").foreach { legacyParserPolicy => Seq(true, false).foreach { inferTimestamp =>
withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) { Seq("legacy", "corrected").foreach { legacyParserPolicy =>
checkTimestampType("yyyy", """{"a": "2018"}""") withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) {
checkTimestampType("yyyy=MM", """{"a": "2018=12"}""") checkTimestampType("yyyy", """{"a": "2018"}""", inferTimestamp)
checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""") checkTimestampType("yyyy=MM", """{"a": "2018=12"}""", inferTimestamp)
checkTimestampType( checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""", inferTimestamp)
"yyyy-MM-dd'T'HH:mm:ss.SSS", checkTimestampType(
"""{"a": "2018-12-02T21:04:00.123"}""") "yyyy-MM-dd'T'HH:mm:ss.SSS",
checkTimestampType( """{"a": "2018-12-02T21:04:00.123"}""",
"yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX", inferTimestamp)
"""{"a": "2018-12-02T21:04:00.123567+01:00"}""") checkTimestampType(
"yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX",
"""{"a": "2018-12-02T21:04:00.123567+01:00"}""",
inferTimestamp)
}
} }
} }
} }
@ -71,16 +78,19 @@ class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper {
} }
test("skip decimal type inferring") { test("skip decimal type inferring") {
Seq("legacy", "corrected").foreach { legacyParserPolicy => Seq(true, false).foreach { inferTimestamp =>
withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) { Seq("legacy", "corrected").foreach { legacyParserPolicy =>
checkType( withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) {
options = Map( checkType(
"prefersDecimal" -> "false", options = Map(
"timestampFormat" -> "yyyyMMdd.HHmmssSSS" "prefersDecimal" -> "false",
), "timestampFormat" -> "yyyyMMdd.HHmmssSSS",
json = """{"a": "20181202.210400123"}""", "inferTimestamp" -> inferTimestamp.toString
dt = TimestampType ),
) json = """{"a": "20181202.210400123"}""",
dt = if (inferTimestamp) TimestampType else StringType
)
}
} }
} }
} }

View file

@ -7,106 +7,106 @@ OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-106
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
No encoding 68879 68993 116 1.5 688.8 1.0X No encoding 69219 69342 116 1.4 692.2 1.0X
UTF-8 is set 115270 115602 455 0.9 1152.7 0.6X UTF-8 is set 143950 143986 55 0.7 1439.5 0.5X
Preparing data for benchmarking ... Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
No encoding 47452 47538 113 2.1 474.5 1.0X No encoding 57828 57913 136 1.7 578.3 1.0X
UTF-8 is set 77330 77354 30 1.3 773.3 0.6X UTF-8 is set 83649 83711 60 1.2 836.5 0.7X
Preparing data for benchmarking ... Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
No encoding 60470 60900 534 0.2 6047.0 1.0X No encoding 64560 65193 1023 0.2 6456.0 1.0X
UTF-8 is set 104733 104931 189 0.1 10473.3 0.6X UTF-8 is set 102925 103174 216 0.1 10292.5 0.6X
Preparing data for benchmarking ... Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
No encoding 130302 131072 976 0.0 260604.6 1.0X No encoding 131002 132316 1160 0.0 262003.1 1.0X
UTF-8 is set 150860 151284 377 0.0 301720.1 0.9X UTF-8 is set 152128 152371 332 0.0 304256.5 0.9X
Preparing data for benchmarking ... Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
Select 10 columns 18619 18684 99 0.5 1861.9 1.0X Select 10 columns 19376 19514 160 0.5 1937.6 1.0X
Select 1 column 24227 24270 38 0.4 2422.7 0.8X Select 1 column 24089 24156 58 0.4 2408.9 0.8X
Preparing data for benchmarking ... Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
Short column without encoding 7947 7971 21 1.3 794.7 1.0X Short column without encoding 8131 8219 103 1.2 813.1 1.0X
Short column with UTF-8 12700 12753 58 0.8 1270.0 0.6X Short column with UTF-8 13464 13508 44 0.7 1346.4 0.6X
Wide column without encoding 92632 92955 463 0.1 9263.2 0.1X Wide column without encoding 108012 108598 914 0.1 10801.2 0.1X
Wide column with UTF-8 147013 147170 188 0.1 14701.3 0.1X Wide column with UTF-8 150988 151369 412 0.1 15098.8 0.1X
Preparing data for benchmarking ... Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
Text read 713 734 19 14.0 71.3 1.0X Text read 753 765 18 13.3 75.3 1.0X
from_json 22019 22429 456 0.5 2201.9 0.0X from_json 23182 23446 230 0.4 2318.2 0.0X
json_tuple 27987 28047 74 0.4 2798.7 0.0X json_tuple 31129 31304 181 0.3 3112.9 0.0X
get_json_object 21468 21870 350 0.5 2146.8 0.0X get_json_object 22821 23073 225 0.4 2282.1 0.0X
Preparing data for benchmarking ... Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
Text read 2887 2910 24 17.3 57.7 1.0X Text read 3078 3101 26 16.2 61.6 1.0X
schema inferring 31793 31843 43 1.6 635.9 0.1X schema inferring 30225 30434 333 1.7 604.5 0.1X
parsing 36791 37104 294 1.4 735.8 0.1X parsing 32237 32308 63 1.6 644.7 0.1X
Preparing data for benchmarking ... Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
Text read 10570 10611 45 4.7 211.4 1.0X Text read 10835 10900 86 4.6 216.7 1.0X
Schema inferring 48729 48763 41 1.0 974.6 0.2X Schema inferring 37720 37805 110 1.3 754.4 0.3X
Parsing without charset 35490 35648 141 1.4 709.8 0.3X Parsing without charset 35464 35538 100 1.4 709.3 0.3X
Parsing with UTF-8 63853 63994 163 0.8 1277.1 0.2X Parsing with UTF-8 67311 67738 381 0.7 1346.2 0.2X
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
Create a dataset of timestamps 2187 2190 5 4.6 218.7 1.0X Create a dataset of timestamps 2208 2222 14 4.5 220.8 1.0X
to_json(timestamp) 16262 16503 323 0.6 1626.2 0.1X to_json(timestamp) 14299 14570 285 0.7 1429.9 0.2X
write timestamps to files 11679 11692 12 0.9 1167.9 0.2X write timestamps to files 12955 12969 13 0.8 1295.5 0.2X
Create a dataset of dates 2297 2310 12 4.4 229.7 1.0X Create a dataset of dates 2297 2323 30 4.4 229.7 1.0X
to_json(date) 10904 10956 46 0.9 1090.4 0.2X to_json(date) 8509 8561 74 1.2 850.9 0.3X
write dates to files 6610 6645 35 1.5 661.0 0.3X write dates to files 6786 6827 45 1.5 678.6 0.3X
OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
read timestamp text from files 2524 2530 9 4.0 252.4 1.0X read timestamp text from files 2598 2613 18 3.8 259.8 1.0X
read timestamps from files 41002 41052 59 0.2 4100.2 0.1X read timestamps from files 42007 42028 19 0.2 4200.7 0.1X
infer timestamps from files 84621 84939 526 0.1 8462.1 0.0X infer timestamps from files 18102 18120 28 0.6 1810.2 0.1X
read date text from files 2292 2302 9 4.4 229.2 1.1X read date text from files 2355 2360 5 4.2 235.5 1.1X
read date from files 16954 16976 21 0.6 1695.4 0.1X read date from files 17420 17458 33 0.6 1742.0 0.1X
timestamp strings 3067 3077 13 3.3 306.7 0.8X timestamp strings 3099 3101 3 3.2 309.9 0.8X
parse timestamps from Dataset[String] 48690 48971 243 0.2 4869.0 0.1X parse timestamps from Dataset[String] 48188 48215 25 0.2 4818.8 0.1X
infer timestamps from Dataset[String] 97463 97786 338 0.1 9746.3 0.0X infer timestamps from Dataset[String] 22929 22988 102 0.4 2292.9 0.1X
date strings 3952 3956 3 2.5 395.2 0.6X date strings 4090 4103 11 2.4 409.0 0.6X
parse dates from Dataset[String] 24210 24241 30 0.4 2421.0 0.1X parse dates from Dataset[String] 24952 25068 139 0.4 2495.2 0.1X
from_json(timestamp) 71710 72242 629 0.1 7171.0 0.0X from_json(timestamp) 66038 66352 413 0.2 6603.8 0.0X
from_json(date) 42465 42481 13 0.2 4246.5 0.1X from_json(date) 43755 43782 27 0.2 4375.5 0.1X

View file

@ -7,106 +7,106 @@ OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aw
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
No encoding 63981 64044 56 1.6 639.8 1.0X No encoding 64950 65182 306 1.5 649.5 1.0X
UTF-8 is set 112672 113350 962 0.9 1126.7 0.6X UTF-8 is set 129566 129796 229 0.8 1295.7 0.5X
Preparing data for benchmarking ... Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
No encoding 51256 51449 180 2.0 512.6 1.0X No encoding 50896 51277 372 2.0 509.0 1.0X
UTF-8 is set 83694 83859 148 1.2 836.9 0.6X UTF-8 is set 89712 89763 49 1.1 897.1 0.6X
Preparing data for benchmarking ... Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
No encoding 58440 59097 569 0.2 5844.0 1.0X No encoding 59415 59785 372 0.2 5941.5 1.0X
UTF-8 is set 102746 102883 198 0.1 10274.6 0.6X UTF-8 is set 103059 103165 156 0.1 10305.9 0.6X
Preparing data for benchmarking ... Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
No encoding 128982 129304 356 0.0 257965.0 1.0X No encoding 132951 133122 288 0.0 265901.9 1.0X
UTF-8 is set 147247 147415 231 0.0 294494.1 0.9X UTF-8 is set 149318 149441 107 0.0 298635.3 0.9X
Preparing data for benchmarking ... Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
Select 10 columns 18837 19048 331 0.5 1883.7 1.0X Select 10 columns 18491 18552 85 0.5 1849.1 1.0X
Select 1 column 24707 24723 14 0.4 2470.7 0.8X Select 1 column 25908 25946 65 0.4 2590.8 0.7X
Preparing data for benchmarking ... Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
Short column without encoding 8218 8234 17 1.2 821.8 1.0X Short column without encoding 9264 9307 49 1.1 926.4 1.0X
Short column with UTF-8 12374 12438 107 0.8 1237.4 0.7X Short column with UTF-8 14707 14727 17 0.7 1470.7 0.6X
Wide column without encoding 136918 137298 345 0.1 13691.8 0.1X Wide column without encoding 141138 141347 276 0.1 14113.8 0.1X
Wide column with UTF-8 176961 177142 257 0.1 17696.1 0.0X Wide column with UTF-8 179601 180035 664 0.1 17960.1 0.1X
Preparing data for benchmarking ... Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
Text read 1268 1278 12 7.9 126.8 1.0X Text read 1173 1184 9 8.5 117.3 1.0X
from_json 23348 23479 176 0.4 2334.8 0.1X from_json 23432 23738 338 0.4 2343.2 0.1X
json_tuple 29606 30221 1024 0.3 2960.6 0.0X json_tuple 32573 32851 358 0.3 3257.3 0.0X
get_json_object 21898 22148 226 0.5 2189.8 0.1X get_json_object 22442 22489 47 0.4 2244.2 0.1X
Preparing data for benchmarking ... Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
Text read 5887 5944 49 8.5 117.7 1.0X Text read 5656 5680 31 8.8 113.1 1.0X
schema inferring 46696 47054 312 1.1 933.9 0.1X schema inferring 33283 33337 64 1.5 665.7 0.2X
parsing 32336 32450 129 1.5 646.7 0.2X parsing 41771 41929 178 1.2 835.4 0.1X
Preparing data for benchmarking ... Preparing data for benchmarking ...
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
Text read 9756 9769 11 5.1 195.1 1.0X Text read 9626 9668 39 5.2 192.5 1.0X
Schema inferring 51318 51433 108 1.0 1026.4 0.2X Schema inferring 39489 39579 91 1.3 789.8 0.2X
Parsing without charset 43609 43743 118 1.1 872.2 0.2X Parsing without charset 38096 38232 125 1.3 761.9 0.3X
Parsing with UTF-8 60775 60844 106 0.8 1215.5 0.2X Parsing with UTF-8 64565 64725 165 0.8 1291.3 0.1X
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
Create a dataset of timestamps 1998 2015 17 5.0 199.8 1.0X Create a dataset of timestamps 1898 1912 13 5.3 189.8 1.0X
to_json(timestamp) 18156 18317 263 0.6 1815.6 0.1X to_json(timestamp) 20011 20092 119 0.5 2001.1 0.1X
write timestamps to files 12912 12917 5 0.8 1291.2 0.2X write timestamps to files 13388 13427 35 0.7 1338.8 0.1X
Create a dataset of dates 2209 2270 53 4.5 220.9 0.9X Create a dataset of dates 2351 2368 18 4.3 235.1 0.8X
to_json(date) 9433 9489 90 1.1 943.3 0.2X to_json(date) 11884 11913 40 0.8 1188.4 0.2X
write dates to files 6915 6923 8 1.4 691.5 0.3X write dates to files 7317 7326 9 1.4 731.7 0.3X
OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------ ------------------------------------------------------------------------------------------------------------------------
read timestamp text from files 2395 2412 17 4.2 239.5 1.0X read timestamp text from files 2316 2324 13 4.3 231.6 1.0X
read timestamps from files 47269 47334 89 0.2 4726.9 0.1X read timestamps from files 43712 43900 165 0.2 4371.2 0.1X
infer timestamps from files 91806 91851 67 0.1 9180.6 0.0X infer timestamps from files 19302 19328 38 0.5 1930.2 0.1X
read date text from files 2118 2133 13 4.7 211.8 1.1X read date text from files 2090 2099 11 4.8 209.0 1.1X
read date from files 17267 17340 115 0.6 1726.7 0.1X read date from files 18914 18940 44 0.5 1891.4 0.1X
timestamp strings 3906 3935 26 2.6 390.6 0.6X timestamp strings 3785 3793 11 2.6 378.5 0.6X
parse timestamps from Dataset[String] 52244 52534 279 0.2 5224.4 0.0X parse timestamps from Dataset[String] 51177 51353 160 0.2 5117.7 0.0X
infer timestamps from Dataset[String] 100488 100714 198 0.1 10048.8 0.0X infer timestamps from Dataset[String] 27907 28119 186 0.4 2790.7 0.1X
date strings 4572 4584 12 2.2 457.2 0.5X date strings 4446 4452 6 2.2 444.6 0.5X
parse dates from Dataset[String] 26749 26768 17 0.4 2674.9 0.1X parse dates from Dataset[String] 28124 28172 55 0.4 2812.4 0.1X
from_json(timestamp) 71414 71867 556 0.1 7141.4 0.0X from_json(timestamp) 71432 71827 354 0.1 7143.2 0.0X
from_json(date) 45322 45549 250 0.2 4532.2 0.1X from_json(date) 46497 46651 163 0.2 4649.7 0.0X

View file

@ -2610,7 +2610,9 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson
} }
test("inferring timestamp type") { test("inferring timestamp type") {
def schemaOf(jsons: String*): StructType = spark.read.json(jsons.toDS).schema def schemaOf(jsons: String*): StructType = {
spark.read.option("inferTimestamp", true).json(jsons.toDS).schema
}
assert(schemaOf( assert(schemaOf(
"""{"a":"2018-12-17T10:11:12.123-01:00"}""", """{"a":"2018-12-17T10:11:12.123-01:00"}""",
@ -2633,6 +2635,7 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson
val timestampsWithFormatPath = s"${dir.getCanonicalPath}/timestampsWithFormat.json" val timestampsWithFormatPath = s"${dir.getCanonicalPath}/timestampsWithFormat.json"
val timestampsWithFormat = spark.read val timestampsWithFormat = spark.read
.option("timestampFormat", "dd/MM/yyyy HH:mm") .option("timestampFormat", "dd/MM/yyyy HH:mm")
.option("inferTimestamp", true)
.json(datesRecords) .json(datesRecords)
assert(timestampsWithFormat.schema === customSchema) assert(timestampsWithFormat.schema === customSchema)
@ -2645,6 +2648,7 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson
val readBack = spark.read val readBack = spark.read
.option("timestampFormat", "yyyy-MM-dd HH:mm:ss") .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")
.option(DateTimeUtils.TIMEZONE_OPTION, "UTC") .option(DateTimeUtils.TIMEZONE_OPTION, "UTC")
.option("inferTimestamp", true)
.json(timestampsWithFormatPath) .json(timestampsWithFormatPath)
assert(readBack.schema === customSchema) assert(readBack.schema === customSchema)