[SPARK-31414][SQL] Fix performance regression with new TimestampFormatter for json and csv time parsing
### What changes were proposed in this pull request? With the original benchmark, where the timestamp values are valid for the new parser, the result is ```scala [info] Running benchmark: Read dates and timestamps [info] Running case: timestamp strings [info] Stopped after 3 iterations, 5781 ms [info] Running case: parse timestamps from Dataset[String] [info] Stopped after 3 iterations, 44764 ms [info] Running case: infer timestamps from Dataset[String] [info] Stopped after 3 iterations, 93764 ms [info] Running case: from_json(timestamp) [info] Stopped after 3 iterations, 59021 ms ``` When we modify the benchmark to ```scala def timestampStr: Dataset[String] = { spark.range(0, rowsNum, 1, 1).mapPartitions { iter => iter.map(i => s"""{"timestamp":"1970-01-01T01:02:03.${i % 100}"}""") }.select($"value".as("timestamp")).as[String] } readBench.addCase("timestamp strings", numIters) { _ => timestampStr.noop() } readBench.addCase("parse timestamps from Dataset[String]", numIters) { _ => spark.read.schema(tsSchema).json(timestampStr).noop() } readBench.addCase("infer timestamps from Dataset[String]", numIters) { _ => spark.read.json(timestampStr).noop() } ``` the timestamp values become invalid for the new parser, which causes a fallback to the legacy parser (2.4). 
The result is ```scala [info] Running benchmark: Read dates and timestamps [info] Running case: timestamp strings [info] Stopped after 3 iterations, 5623 ms [info] Running case: parse timestamps from Dataset[String] [info] Stopped after 3 iterations, 506637 ms [info] Running case: infer timestamps from Dataset[String] [info] Stopped after 3 iterations, 509076 ms ``` This is about a 10x performance regression. BUT if we modify the timestamp pattern to `....HH:mm:ss[.SSS][XXX]`, which makes all timestamp values valid for the new parser and thus prevents the fallback, the result is ```scala [info] Running benchmark: Read dates and timestamps [info] Running case: timestamp strings [info] Stopped after 3 iterations, 5623 ms [info] Running case: parse timestamps from Dataset[String] [info] Stopped after 3 iterations, 506637 ms [info] Running case: infer timestamps from Dataset[String] [info] Stopped after 3 iterations, 509076 ms ``` ### Why are the changes needed? To fix the performance regression. ### Does this PR introduce any user-facing change? No. ### How was this patch tested? New tests were added. Closes #28181 from yaooqinn/SPARK-31414. Authored-by: Kent Yao <yaooqinn@hotmail.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
parent
1b87015044
commit
d65f534c5a
|
@ -26,6 +26,7 @@ import com.univocity.parsers.csv.{CsvParserSettings, CsvWriterSettings, Unescape
|
|||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.sql.catalyst.util._
|
||||
import org.apache.spark.sql.internal.SQLConf
|
||||
import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy
|
||||
|
||||
class CSVOptions(
|
||||
@transient val parameters: CaseInsensitiveMap[String],
|
||||
|
@ -148,8 +149,12 @@ class CSVOptions(
|
|||
|
||||
val dateFormat: String = parameters.getOrElse("dateFormat", DateFormatter.defaultPattern)
|
||||
|
||||
val timestampFormat: String =
|
||||
parameters.getOrElse("timestampFormat", s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSXXX")
|
||||
val timestampFormat: String = parameters.getOrElse("timestampFormat",
|
||||
if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) {
|
||||
s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSXXX"
|
||||
} else {
|
||||
s"${DateFormatter.defaultPattern}'T'HH:mm:ss[.SSS][XXX]"
|
||||
})
|
||||
|
||||
val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false)
|
||||
|
||||
|
|
|
@ -27,6 +27,7 @@ import com.fasterxml.jackson.core.json.JsonReadFeature
|
|||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.sql.catalyst.util._
|
||||
import org.apache.spark.sql.internal.SQLConf
|
||||
import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy
|
||||
|
||||
/**
|
||||
* Options for parsing JSON data into Spark SQL rows.
|
||||
|
@ -90,8 +91,12 @@ private[sql] class JSONOptions(
|
|||
|
||||
val dateFormat: String = parameters.getOrElse("dateFormat", DateFormatter.defaultPattern)
|
||||
|
||||
val timestampFormat: String =
|
||||
parameters.getOrElse("timestampFormat", s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSXXX")
|
||||
val timestampFormat: String = parameters.getOrElse("timestampFormat",
|
||||
if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) {
|
||||
s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSXXX"
|
||||
} else {
|
||||
s"${DateFormatter.defaultPattern}'T'HH:mm:ss[.SSS][XXX]"
|
||||
})
|
||||
|
||||
val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false)
|
||||
|
||||
|
|
|
@ -2,66 +2,66 @@
|
|||
Benchmark to measure CSV read/write performance
|
||||
================================================================================================
|
||||
|
||||
OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Parsing quoted values: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
One quoted string 44297 44515 373 0.0 885948.7 1.0X
|
||||
One quoted string 24907 29374 NaN 0.0 498130.5 1.0X
|
||||
|
||||
OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Wide rows with 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Select 1000 columns 196720 197783 1560 0.0 196719.8 1.0X
|
||||
Select 100 columns 46691 46861 219 0.0 46691.4 4.2X
|
||||
Select one column 36811 36922 111 0.0 36811.3 5.3X
|
||||
count() 8520 8610 106 0.1 8520.5 23.1X
|
||||
Select 100 columns, one bad input field 67914 67994 136 0.0 67914.0 2.9X
|
||||
Select 100 columns, corrupt record field 77272 77445 214 0.0 77272.0 2.5X
|
||||
Select 1000 columns 62811 63690 1416 0.0 62811.4 1.0X
|
||||
Select 100 columns 23839 24064 230 0.0 23839.5 2.6X
|
||||
Select one column 19936 20641 827 0.1 19936.4 3.2X
|
||||
count() 4174 4380 206 0.2 4174.4 15.0X
|
||||
Select 100 columns, one bad input field 41015 42380 1688 0.0 41015.4 1.5X
|
||||
Select 100 columns, corrupt record field 46281 46338 93 0.0 46280.5 1.4X
|
||||
|
||||
OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Count a dataset with 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Select 10 columns + count() 25965 26054 103 0.4 2596.5 1.0X
|
||||
Select 1 column + count() 18591 18666 91 0.5 1859.1 1.4X
|
||||
count() 6102 6119 18 1.6 610.2 4.3X
|
||||
Select 10 columns + count() 10810 10997 163 0.9 1081.0 1.0X
|
||||
Select 1 column + count() 7608 7641 47 1.3 760.8 1.4X
|
||||
count() 2415 2462 77 4.1 241.5 4.5X
|
||||
|
||||
OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Create a dataset of timestamps 2142 2161 17 4.7 214.2 1.0X
|
||||
to_csv(timestamp) 14744 14950 182 0.7 1474.4 0.1X
|
||||
write timestamps to files 12078 12202 175 0.8 1207.8 0.2X
|
||||
Create a dataset of dates 2275 2291 18 4.4 227.5 0.9X
|
||||
to_csv(date) 11407 11464 51 0.9 1140.7 0.2X
|
||||
write dates to files 7638 7702 90 1.3 763.8 0.3X
|
||||
Create a dataset of timestamps 874 914 37 11.4 87.4 1.0X
|
||||
to_csv(timestamp) 7051 7223 250 1.4 705.1 0.1X
|
||||
write timestamps to files 6712 6741 31 1.5 671.2 0.1X
|
||||
Create a dataset of dates 909 945 35 11.0 90.9 1.0X
|
||||
to_csv(date) 4222 4231 8 2.4 422.2 0.2X
|
||||
write dates to files 3799 3813 14 2.6 379.9 0.2X
|
||||
|
||||
OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
read timestamp text from files 2578 2590 10 3.9 257.8 1.0X
|
||||
read timestamps from files 60103 60694 512 0.2 6010.3 0.0X
|
||||
infer timestamps from files 107871 108268 351 0.1 10787.1 0.0X
|
||||
read date text from files 2306 2310 4 4.3 230.6 1.1X
|
||||
read date from files 47415 47657 367 0.2 4741.5 0.1X
|
||||
infer date from files 35261 35447 164 0.3 3526.1 0.1X
|
||||
timestamp strings 3045 3056 11 3.3 304.5 0.8X
|
||||
parse timestamps from Dataset[String] 62221 63173 849 0.2 6222.1 0.0X
|
||||
infer timestamps from Dataset[String] 118838 119629 697 0.1 11883.8 0.0X
|
||||
date strings 3459 3481 19 2.9 345.9 0.7X
|
||||
parse dates from Dataset[String] 51026 51447 503 0.2 5102.6 0.1X
|
||||
from_csv(timestamp) 60738 61818 936 0.2 6073.8 0.0X
|
||||
from_csv(date) 46012 46278 370 0.2 4601.2 0.1X
|
||||
read timestamp text from files 1342 1364 35 7.5 134.2 1.0X
|
||||
read timestamps from files 20300 20473 247 0.5 2030.0 0.1X
|
||||
infer timestamps from files 40705 40744 54 0.2 4070.5 0.0X
|
||||
read date text from files 1146 1151 6 8.7 114.6 1.2X
|
||||
read date from files 12278 12408 117 0.8 1227.8 0.1X
|
||||
infer date from files 12734 12872 220 0.8 1273.4 0.1X
|
||||
timestamp strings 1467 1482 15 6.8 146.7 0.9X
|
||||
parse timestamps from Dataset[String] 21708 22234 477 0.5 2170.8 0.1X
|
||||
infer timestamps from Dataset[String] 42357 43253 922 0.2 4235.7 0.0X
|
||||
date strings 1512 1532 18 6.6 151.2 0.9X
|
||||
parse dates from Dataset[String] 13436 13470 33 0.7 1343.6 0.1X
|
||||
from_csv(timestamp) 20390 20486 95 0.5 2039.0 0.1X
|
||||
from_csv(date) 12592 12693 139 0.8 1259.2 0.1X
|
||||
|
||||
OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.15.2
|
||||
Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
w/o filters 11889 11945 52 0.0 118893.1 1.0X
|
||||
pushdown disabled 11790 11860 115 0.0 117902.3 1.0X
|
||||
w/ filters 1240 1278 33 0.1 12400.8 9.6X
|
||||
w/o filters 12535 12606 67 0.0 125348.8 1.0X
|
||||
pushdown disabled 12611 12672 91 0.0 126112.9 1.0X
|
||||
w/ filters 1093 1099 11 0.1 10928.3 11.5X
|
||||
|
||||
|
||||
|
|
|
@ -2,66 +2,66 @@
|
|||
Benchmark to measure CSV read/write performance
|
||||
================================================================================================
|
||||
|
||||
OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Parsing quoted values: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
One quoted string 51602 51659 59 0.0 1032039.4 1.0X
|
||||
One quoted string 24073 24109 33 0.0 481463.5 1.0X
|
||||
|
||||
OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Wide rows with 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Select 1000 columns 191926 192879 1615 0.0 191925.6 1.0X
|
||||
Select 100 columns 46766 46846 69 0.0 46766.1 4.1X
|
||||
Select one column 35877 35930 83 0.0 35876.8 5.3X
|
||||
count() 11186 11262 65 0.1 11186.0 17.2X
|
||||
Select 100 columns, one bad input field 59943 60107 232 0.0 59943.0 3.2X
|
||||
Select 100 columns, corrupt record field 73062 73406 479 0.0 73062.2 2.6X
|
||||
Select 1000 columns 58415 59611 2071 0.0 58414.8 1.0X
|
||||
Select 100 columns 22568 23020 594 0.0 22568.0 2.6X
|
||||
Select one column 18995 19058 99 0.1 18995.0 3.1X
|
||||
count() 5301 5332 30 0.2 5300.9 11.0X
|
||||
Select 100 columns, one bad input field 39736 40153 361 0.0 39736.1 1.5X
|
||||
Select 100 columns, corrupt record field 47195 47826 590 0.0 47195.2 1.2X
|
||||
|
||||
OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Count a dataset with 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Select 10 columns + count() 22389 22447 87 0.4 2238.9 1.0X
|
||||
Select 1 column + count() 14844 14890 43 0.7 1484.4 1.5X
|
||||
count() 5519 5538 18 1.8 551.9 4.1X
|
||||
Select 10 columns + count() 9884 9904 25 1.0 988.4 1.0X
|
||||
Select 1 column + count() 6794 6835 46 1.5 679.4 1.5X
|
||||
count() 2060 2065 5 4.9 206.0 4.8X
|
||||
|
||||
OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Create a dataset of timestamps 1949 1977 25 5.1 194.9 1.0X
|
||||
to_csv(timestamp) 14944 15702 714 0.7 1494.4 0.1X
|
||||
write timestamps to files 12983 12998 14 0.8 1298.3 0.2X
|
||||
Create a dataset of dates 2156 2164 7 4.6 215.6 0.9X
|
||||
to_csv(date) 9675 9709 41 1.0 967.5 0.2X
|
||||
write dates to files 7880 7897 15 1.3 788.0 0.2X
|
||||
Create a dataset of timestamps 717 732 18 14.0 71.7 1.0X
|
||||
to_csv(timestamp) 6994 7100 121 1.4 699.4 0.1X
|
||||
write timestamps to files 6417 6435 27 1.6 641.7 0.1X
|
||||
Create a dataset of dates 827 855 24 12.1 82.7 0.9X
|
||||
to_csv(date) 4408 4438 32 2.3 440.8 0.2X
|
||||
write dates to files 3738 3758 28 2.7 373.8 0.2X
|
||||
|
||||
OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
read timestamp text from files 2235 2245 10 4.5 223.5 1.0X
|
||||
read timestamps from files 54490 54690 283 0.2 5449.0 0.0X
|
||||
infer timestamps from files 104501 104737 236 0.1 10450.1 0.0X
|
||||
read date text from files 2035 2040 6 4.9 203.5 1.1X
|
||||
read date from files 39650 39707 52 0.3 3965.0 0.1X
|
||||
infer date from files 29235 29363 164 0.3 2923.5 0.1X
|
||||
timestamp strings 3412 3426 18 2.9 341.2 0.7X
|
||||
parse timestamps from Dataset[String] 66864 67804 981 0.1 6686.4 0.0X
|
||||
infer timestamps from Dataset[String] 118780 119284 837 0.1 11878.0 0.0X
|
||||
date strings 3730 3734 4 2.7 373.0 0.6X
|
||||
parse dates from Dataset[String] 48728 49071 309 0.2 4872.8 0.0X
|
||||
from_csv(timestamp) 62294 62493 260 0.2 6229.4 0.0X
|
||||
from_csv(date) 44581 44665 117 0.2 4458.1 0.1X
|
||||
read timestamp text from files 1121 1176 52 8.9 112.1 1.0X
|
||||
read timestamps from files 21298 21366 105 0.5 2129.8 0.1X
|
||||
infer timestamps from files 41008 41051 39 0.2 4100.8 0.0X
|
||||
read date text from files 962 967 5 10.4 96.2 1.2X
|
||||
read date from files 11749 11772 22 0.9 1174.9 0.1X
|
||||
infer date from files 12426 12459 29 0.8 1242.6 0.1X
|
||||
timestamp strings 1508 1519 9 6.6 150.8 0.7X
|
||||
parse timestamps from Dataset[String] 21674 21997 455 0.5 2167.4 0.1X
|
||||
infer timestamps from Dataset[String] 42141 42230 105 0.2 4214.1 0.0X
|
||||
date strings 1694 1701 8 5.9 169.4 0.7X
|
||||
parse dates from Dataset[String] 12929 12951 25 0.8 1292.9 0.1X
|
||||
from_csv(timestamp) 20603 20786 166 0.5 2060.3 0.1X
|
||||
from_csv(date) 12325 12338 12 0.8 1232.5 0.1X
|
||||
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.2
|
||||
Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
w/o filters 12557 12634 78 0.0 125572.9 1.0X
|
||||
pushdown disabled 12449 12509 65 0.0 124486.4 1.0X
|
||||
w/ filters 1372 1393 18 0.1 13724.8 9.1X
|
||||
w/o filters 12455 12474 22 0.0 124553.8 1.0X
|
||||
pushdown disabled 12462 12486 29 0.0 124624.9 1.0X
|
||||
w/ filters 1073 1092 18 0.1 10727.6 11.6X
|
||||
|
||||
|
||||
|
|
|
@ -3,110 +3,110 @@ Benchmark for performance of JSON parsing
|
|||
================================================================================================
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
No encoding 84774 84927 264 1.2 847.7 1.0X
|
||||
UTF-8 is set 119081 120155 1773 0.8 1190.8 0.7X
|
||||
No encoding 46010 46118 113 2.2 460.1 1.0X
|
||||
UTF-8 is set 54407 55427 1718 1.8 544.1 0.8X
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
No encoding 49293 49356 70 2.0 492.9 1.0X
|
||||
UTF-8 is set 80183 80211 25 1.2 801.8 0.6X
|
||||
No encoding 26614 28220 1461 3.8 266.1 1.0X
|
||||
UTF-8 is set 42765 43400 550 2.3 427.6 0.6X
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
No encoding 61070 61476 536 0.2 6107.0 1.0X
|
||||
UTF-8 is set 109765 109881 102 0.1 10976.5 0.6X
|
||||
No encoding 35696 35821 113 0.3 3569.6 1.0X
|
||||
UTF-8 is set 55441 56176 1037 0.2 5544.1 0.6X
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
No encoding 176999 178163 1008 0.0 353997.9 1.0X
|
||||
UTF-8 is set 201209 201641 614 0.0 402419.0 0.9X
|
||||
No encoding 61514 62968 NaN 0.0 123027.2 1.0X
|
||||
UTF-8 is set 72096 72933 1162 0.0 144192.7 0.9X
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Select 10 columns 18768 20587 496 0.5 1876.8 1.0X
|
||||
Select 1 column 22642 22644 3 0.4 2264.2 0.8X
|
||||
Select 10 columns 9859 9913 79 1.0 985.9 1.0X
|
||||
Select 1 column 10981 11003 36 0.9 1098.1 0.9X
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Short column without encoding 7697 7738 55 1.3 769.7 1.0X
|
||||
Short column with UTF-8 14051 14189 176 0.7 1405.1 0.5X
|
||||
Wide column without encoding 108999 110075 1085 0.1 10899.9 0.1X
|
||||
Wide column with UTF-8 157433 157779 308 0.1 15743.3 0.0X
|
||||
Short column without encoding 3555 3579 27 2.8 355.5 1.0X
|
||||
Short column with UTF-8 5204 5227 35 1.9 520.4 0.7X
|
||||
Wide column without encoding 60458 60637 164 0.2 6045.8 0.1X
|
||||
Wide column with UTF-8 77544 78111 551 0.1 7754.4 0.0X
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Text read 644 647 4 15.5 64.4 1.0X
|
||||
from_json 25859 25872 12 0.4 2585.9 0.0X
|
||||
json_tuple 31679 31761 71 0.3 3167.9 0.0X
|
||||
get_json_object 24772 25220 389 0.4 2477.2 0.0X
|
||||
Text read 342 346 3 29.2 34.2 1.0X
|
||||
from_json 7123 7318 179 1.4 712.3 0.0X
|
||||
json_tuple 9843 9957 132 1.0 984.3 0.0X
|
||||
get_json_object 7827 8046 194 1.3 782.7 0.0X
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Text read 3135 3165 52 15.9 62.7 1.0X
|
||||
schema inferring 29383 29389 10 1.7 587.7 0.1X
|
||||
parsing 32623 35183 NaN 1.5 652.5 0.1X
|
||||
Text read 1856 1884 32 26.9 37.1 1.0X
|
||||
schema inferring 16734 16900 153 3.0 334.7 0.1X
|
||||
parsing 14884 15203 470 3.4 297.7 0.1X
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Text read 11874 11948 82 4.2 237.5 1.0X
|
||||
Schema inferring 42382 42398 23 1.2 847.6 0.3X
|
||||
Parsing without charset 36410 36442 54 1.4 728.2 0.3X
|
||||
Parsing with UTF-8 62412 62463 48 0.8 1248.2 0.2X
|
||||
Text read 5932 6148 228 8.4 118.6 1.0X
|
||||
Schema inferring 20836 21938 1086 2.4 416.7 0.3X
|
||||
Parsing without charset 18134 18661 457 2.8 362.7 0.3X
|
||||
Parsing with UTF-8 27734 28069 378 1.8 554.7 0.2X
|
||||
|
||||
OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Create a dataset of timestamps 2191 2209 20 4.6 219.1 1.0X
|
||||
to_json(timestamp) 18670 19042 565 0.5 1867.0 0.1X
|
||||
write timestamps to files 11836 13156 NaN 0.8 1183.6 0.2X
|
||||
Create a dataset of dates 2321 2351 33 4.3 232.1 0.9X
|
||||
to_json(date) 12703 12726 24 0.8 1270.3 0.2X
|
||||
write dates to files 8230 8303 76 1.2 823.0 0.3X
|
||||
Create a dataset of timestamps 889 914 28 11.2 88.9 1.0X
|
||||
to_json(timestamp) 7920 8172 353 1.3 792.0 0.1X
|
||||
write timestamps to files 6726 6822 129 1.5 672.6 0.1X
|
||||
Create a dataset of dates 953 963 12 10.5 95.3 0.9X
|
||||
to_json(date) 5370 5705 320 1.9 537.0 0.2X
|
||||
write dates to files 4109 4166 52 2.4 410.9 0.2X
|
||||
|
||||
OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
read timestamp text from files 2780 2795 13 3.6 278.0 1.0X
|
||||
read timestamps from files 37158 37305 137 0.3 3715.8 0.1X
|
||||
infer timestamps from files 73666 73838 149 0.1 7366.6 0.0X
|
||||
read date text from files 2597 2609 10 3.9 259.7 1.1X
|
||||
read date from files 24439 24501 56 0.4 2443.9 0.1X
|
||||
timestamp strings 3052 3064 12 3.3 305.2 0.9X
|
||||
parse timestamps from Dataset[String] 43611 43665 52 0.2 4361.1 0.1X
|
||||
infer timestamps from Dataset[String] 83745 84153 376 0.1 8374.5 0.0X
|
||||
date strings 4068 4076 10 2.5 406.8 0.7X
|
||||
parse dates from Dataset[String] 34700 34807 118 0.3 3470.0 0.1X
|
||||
from_json(timestamp) 64074 64124 53 0.2 6407.4 0.0X
|
||||
from_json(date) 52520 52617 101 0.2 5252.0 0.1X
|
||||
read timestamp text from files 1614 1675 55 6.2 161.4 1.0X
|
||||
read timestamps from files 16640 16858 209 0.6 1664.0 0.1X
|
||||
infer timestamps from files 33239 33388 227 0.3 3323.9 0.0X
|
||||
read date text from files 1310 1340 44 7.6 131.0 1.2X
|
||||
read date from files 9470 9513 41 1.1 947.0 0.2X
|
||||
timestamp strings 1303 1342 47 7.7 130.3 1.2X
|
||||
parse timestamps from Dataset[String] 17650 18073 380 0.6 1765.0 0.1X
|
||||
infer timestamps from Dataset[String] 32623 34065 1330 0.3 3262.3 0.0X
|
||||
date strings 1864 1871 7 5.4 186.4 0.9X
|
||||
parse dates from Dataset[String] 10914 11316 482 0.9 1091.4 0.1X
|
||||
from_json(timestamp) 21102 21990 929 0.5 2110.2 0.1X
|
||||
from_json(date) 15275 15961 598 0.7 1527.5 0.1X
|
||||
|
||||
|
||||
|
|
|
@ -3,110 +3,110 @@ Benchmark for performance of JSON parsing
|
|||
================================================================================================
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
No encoding 61888 61918 27 1.6 618.9 1.0X
|
||||
UTF-8 is set 109057 113663 NaN 0.9 1090.6 0.6X
|
||||
No encoding 38998 41002 NaN 2.6 390.0 1.0X
|
||||
UTF-8 is set 61231 63282 1854 1.6 612.3 0.6X
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
No encoding 44517 44535 29 2.2 445.2 1.0X
|
||||
UTF-8 is set 75722 75840 111 1.3 757.2 0.6X
|
||||
No encoding 28272 28338 70 3.5 282.7 1.0X
|
||||
UTF-8 is set 58681 62243 1517 1.7 586.8 0.5X
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
No encoding 63677 64090 633 0.2 6367.7 1.0X
|
||||
UTF-8 is set 99424 99615 185 0.1 9942.4 0.6X
|
||||
No encoding 44026 51829 1329 0.2 4402.6 1.0X
|
||||
UTF-8 is set 65839 68596 500 0.2 6583.9 0.7X
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
No encoding 174052 174251 174 0.0 348104.1 1.0X
|
||||
UTF-8 is set 189000 189098 113 0.0 378000.9 0.9X
|
||||
No encoding 72144 74820 NaN 0.0 144287.6 1.0X
|
||||
UTF-8 is set 69571 77888 NaN 0.0 139142.3 1.0X
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Select 10 columns 18387 18473 142 0.5 1838.7 1.0X
|
||||
Select 1 column 25560 25571 13 0.4 2556.0 0.7X
|
||||
Select 10 columns 9502 9604 106 1.1 950.2 1.0X
|
||||
Select 1 column 11861 11948 109 0.8 1186.1 0.8X
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Short column without encoding 9323 9384 58 1.1 932.3 1.0X
|
||||
Short column with UTF-8 14016 14058 55 0.7 1401.6 0.7X
|
||||
Wide column without encoding 133258 133532 382 0.1 13325.8 0.1X
|
||||
Wide column with UTF-8 181212 181283 61 0.1 18121.2 0.1X
|
||||
Short column without encoding 3830 3846 15 2.6 383.0 1.0X
|
||||
Short column with UTF-8 5538 5543 7 1.8 553.8 0.7X
|
||||
Wide column without encoding 66899 69158 NaN 0.1 6689.9 0.1X
|
||||
Wide column with UTF-8 90052 93235 NaN 0.1 9005.2 0.0X
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Text read 1168 1174 5 8.6 116.8 1.0X
|
||||
from_json 22604 23571 883 0.4 2260.4 0.1X
|
||||
json_tuple 29979 30053 91 0.3 2997.9 0.0X
|
||||
get_json_object 21987 22263 241 0.5 2198.7 0.1X
|
||||
Text read 659 674 13 15.2 65.9 1.0X
|
||||
from_json 7676 7943 405 1.3 767.6 0.1X
|
||||
json_tuple 9881 10172 273 1.0 988.1 0.1X
|
||||
get_json_object 7949 8055 119 1.3 794.9 0.1X
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Text read 5831 5842 14 8.6 116.6 1.0X
|
||||
schema inferring 31372 31456 73 1.6 627.4 0.2X
|
||||
parsing 35911 36191 254 1.4 718.2 0.2X
|
||||
Text read 3314 3326 17 15.1 66.3 1.0X
|
||||
schema inferring 16549 17037 484 3.0 331.0 0.2X
|
||||
parsing 15138 15283 172 3.3 302.8 0.2X
|
||||
|
||||
Preparing data for benchmarking ...
|
||||
OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Text read 10249 10314 77 4.9 205.0 1.0X
|
||||
Schema inferring 35403 35436 40 1.4 708.1 0.3X
|
||||
Parsing without charset 32875 32879 4 1.5 657.5 0.3X
|
||||
Parsing with UTF-8 53444 53519 100 0.9 1068.9 0.2X
|
||||
Text read 5136 5446 268 9.7 102.7 1.0X
|
||||
Schema inferring 19864 20568 1191 2.5 397.3 0.3X
|
||||
Parsing without charset 17535 17888 329 2.9 350.7 0.3X
|
||||
Parsing with UTF-8 25609 25758 218 2.0 512.2 0.2X
|
||||
|
||||
OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
Create a dataset of timestamps 1909 1924 17 5.2 190.9 1.0X
|
||||
to_json(timestamp) 18956 19122 208 0.5 1895.6 0.1X
|
||||
write timestamps to files 13446 13472 43 0.7 1344.6 0.1X
|
||||
Create a dataset of dates 2180 2200 28 4.6 218.0 0.9X
|
||||
to_json(date) 12780 12899 109 0.8 1278.0 0.1X
|
||||
write dates to files 7835 7865 29 1.3 783.5 0.2X
|
||||
Create a dataset of timestamps 784 790 7 12.8 78.4 1.0X
|
||||
to_json(timestamp) 8005 8055 50 1.2 800.5 0.1X
|
||||
write timestamps to files 6515 6559 45 1.5 651.5 0.1X
|
||||
Create a dataset of dates 854 881 24 11.7 85.4 0.9X
|
||||
to_json(date) 5187 5194 7 1.9 518.7 0.2X
|
||||
write dates to files 3663 3684 22 2.7 366.3 0.2X
|
||||
|
||||
OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws
|
||||
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
|
||||
Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4
|
||||
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
|
||||
Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
|
||||
------------------------------------------------------------------------------------------------------------------------
|
||||
read timestamp text from files 2467 2477 9 4.1 246.7 1.0X
|
||||
read timestamps from files 40186 40342 135 0.2 4018.6 0.1X
|
||||
infer timestamps from files 82005 82079 71 0.1 8200.5 0.0X
|
||||
read date text from files 2243 2264 22 4.5 224.3 1.1X
|
||||
read date from files 24852 24863 19 0.4 2485.2 0.1X
|
||||
timestamp strings 3836 3854 16 2.6 383.6 0.6X
|
||||
parse timestamps from Dataset[String] 51521 51697 242 0.2 5152.1 0.0X
|
||||
infer timestamps from Dataset[String] 97300 97398 133 0.1 9730.0 0.0X
|
||||
date strings 4488 4491 5 2.2 448.8 0.5X
|
||||
parse dates from Dataset[String] 37918 37976 68 0.3 3791.8 0.1X
|
||||
from_json(timestamp) 69611 69632 36 0.1 6961.1 0.0X
|
||||
from_json(date) 56598 56974 347 0.2 5659.8 0.0X
|
||||
read timestamp text from files 1297 1316 26 7.7 129.7 1.0X
|
||||
read timestamps from files 16915 17723 963 0.6 1691.5 0.1X
|
||||
infer timestamps from files 33967 34304 360 0.3 3396.7 0.0X
|
||||
read date text from files 1095 1100 7 9.1 109.5 1.2X
|
||||
read date from files 8376 8513 209 1.2 837.6 0.2X
|
||||
timestamp strings 1807 1816 8 5.5 180.7 0.7X
|
||||
parse timestamps from Dataset[String] 18189 18242 74 0.5 1818.9 0.1X
|
||||
infer timestamps from Dataset[String] 37906 38547 571 0.3 3790.6 0.0X
|
||||
date strings 2191 2194 4 4.6 219.1 0.6X
|
||||
parse dates from Dataset[String] 11593 11625 33 0.9 1159.3 0.1X
|
||||
from_json(timestamp) 22589 22650 101 0.4 2258.9 0.1X
|
||||
from_json(date) 16479 16619 159 0.6 1647.9 0.1X
|
||||
|
||||
|
||||
|
|
|
@ -238,4 +238,16 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession {
|
|||
spark.range(1).select(schema_of_csv(input)),
|
||||
Seq(Row("struct<_c0:double,_c1:int>")))
|
||||
}
|
||||
|
||||
// Formatting check for to_csv: the fixed timestamp literal `s` is rendered by the SQL query
// built in `toDF`, once with `timestampFormat` = yyyy-MM-dd'T'HH:mm:ss.SSSXXX and once with
// the same pattern whose fraction and zone parts are wrapped in [...] (the "optional" form
// named in the test title). Both queries are expected to return the same rows.
test("optional datetime parser does not affect csv time formatting") {
|
||||
val s = "2015-08-26 12:34:46"
|
||||
def toDF(p: String): DataFrame = sql(
|
||||
s"""
|
||||
|SELECT
|
||||
| to_csv(
|
||||
| named_struct('time', timestamp'$s'), map('timestampFormat', "$p")
|
||||
| )
|
||||
| """.stripMargin)
|
||||
// Plain pattern on the left, bracketed pattern on the right; checkAnswer compares the two.
checkAnswer(toDF("yyyy-MM-dd'T'HH:mm:ss.SSSXXX"), toDF("yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]"))
|
||||
}
|
||||
}
|
||||
|
|
|
@ -710,4 +710,15 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession {
|
|||
Seq(Row("string")))
|
||||
}
|
||||
|
||||
// Formatting check for to_json: the fixed timestamp literal `s` is rendered by the SQL query
// built in `toDF`, once with `timestampFormat` = yyyy-MM-dd'T'HH:mm:ss.SSSXXX and once with
// the same pattern whose fraction and zone parts are wrapped in [...] (the "optional" form
// named in the test title). Both queries are expected to return the same rows.
test("optional datetime parser does not affect json time formatting") {
|
||||
val s = "2015-08-26 12:34:46"
|
||||
def toDF(p: String): DataFrame = sql(
|
||||
s"""
|
||||
|SELECT
|
||||
| to_json(
|
||||
| named_struct('time', timestamp'$s'), map('timestampFormat', "$p")
|
||||
| )
|
||||
| """.stripMargin)
|
||||
// Plain pattern on the left, bracketed pattern on the right; checkAnswer compares the two.
checkAnswer(toDF("yyyy-MM-dd'T'HH:mm:ss.SSSXXX"), toDF("yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]"))
|
||||
}
|
||||
}
|
||||
|
|
|
@ -238,7 +238,9 @@ object CSVBenchmark extends SqlBasedBenchmark {
|
|||
|
||||
/**
 * Benchmark input: one timestamp string per id produced by
 * `spark.range(0, rowsNum, 1, 1)`, exposed as a `Dataset[String]` whose single column is
 * named `timestamp`.
 *
 * The fraction is `i % 200`, so it cycles through 0..199 and is written with one, two or
 * three digits. When the fraction is 0 the text ends in ".0Z" and that suffix is stripped,
 * yielding a value with neither a fraction nor a zone ("1970-01-01T01:02:03").
 */
def timestampStr: Dataset[String] = {
  spark.range(0, rowsNum, 1, 1).mapPartitions { iter =>
    // Only the current generator is kept: the earlier `${100 + i % 100}` mapping was a
    // discarded (lazy, never consumed) expression and had no effect on the output.
    iter.map { i =>
      s"1970-01-01T01:02:03.${i % 200}Z".stripSuffix(".0Z")
    }
  }.select($"value".as("timestamp")).as[String]
}
|
||||
|
||||
|
|
|
@ -445,7 +445,9 @@ object JsonBenchmark extends SqlBasedBenchmark {
|
|||
|
||||
/**
 * Benchmark input: one single-field JSON document per id produced by
 * `spark.range(0, rowsNum, 1, 1)`, exposed as a `Dataset[String]` whose single column is
 * named `timestamp`. Each document has the shape `{"timestamp":"<text>"}`.
 *
 * The fraction is `i % 200`, so it cycles through 0..199 and is written with one, two or
 * three digits. When the fraction is 0 the timestamp text ends in ".0Z" and that suffix is
 * stripped, yielding a value with neither a fraction nor a zone ("1970-01-01T01:02:03").
 */
def timestampStr: Dataset[String] = {
  spark.range(0, rowsNum, 1, 1).mapPartitions { iter =>
    iter.map { i =>
      // Strip the suffix from the timestamp text itself. Calling stripSuffix(".0Z") on the
      // finished JSON document never matches, because the document ends with `"}`.
      val ts = s"1970-01-01T01:02:03.${i % 200}Z".stripSuffix(".0Z")
      s"""{"timestamp":"$ts"}"""
    }
  }.select($"value".as("timestamp")).as[String]
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue