[SPARK-31878][SQL] Create date formatter only once in HiveResult
### What changes were proposed in this pull request? 1. Replace `def dateFormatter` with `val dateFormatter`. 2. Modify the `date formatting in hive result` test in `HiveResultSuite` to check the modified code in various time zones. ### Why are the changes needed? To avoid creating a `DateFormatter` for every incoming date in `HiveResult.toHiveString`. This should eliminate unnecessary creation of `SimpleDateFormat` instances and compilation of the default pattern `yyyy-MM-dd`. The changes can speed up processing of legacy date values of the `java.sql.Date` type, which are collected by default. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Modified a test in `HiveResultSuite`. Closes #28687 from MaxGekk/HiveResult-val-dateFormatter. Authored-by: Max Gekk <max.gekk@gmail.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
parent
e5b9b862e6
commit
125a89ce08
|
@ -19,10 +19,10 @@ package org.apache.spark.sql.execution
|
|||
|
||||
import java.nio.charset.StandardCharsets
|
||||
import java.sql.{Date, Timestamp}
|
||||
import java.time.{Instant, LocalDate}
|
||||
import java.time.{Instant, LocalDate, ZoneOffset}
|
||||
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, TimestampFormatter}
|
||||
import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeUtils, LegacyDateFormats, TimestampFormatter}
|
||||
import org.apache.spark.sql.execution.command.{DescribeCommandBase, ExecutedCommandExec, ShowTablesCommand, ShowViewsCommand}
|
||||
import org.apache.spark.sql.execution.datasources.v2.{DescribeTableExec, ShowTablesExec}
|
||||
import org.apache.spark.sql.internal.SQLConf
|
||||
|
@ -72,9 +72,23 @@ object HiveResult {
|
|||
}
|
||||
}
|
||||
|
||||
private def zoneId = DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone)
|
||||
private def dateFormatter = DateFormatter(zoneId)
|
||||
private def timestampFormatter = TimestampFormatter.getFractionFormatter(zoneId)
|
||||
// We can create the date formatter only once because it does not depend on Spark's
|
||||
// session time zone controlled by the SQL config `spark.sql.session.timeZone`.
|
||||
// The `zoneId` parameter is used only in parsing of special date values like `now`,
|
||||
// `yesterday`, etc., but not in date formatting. While formatting of:
|
||||
// - `java.time.LocalDate`, zone id is not used by `DateTimeFormatter` at all.
|
||||
// - `java.sql.Date`, the date formatter delegates formatting to the legacy formatter
|
||||
// which uses the default system time zone `TimeZone.getDefault`. This works correctly
|
||||
// due to `DateTimeUtils.toJavaDate` which is based on the system time zone too.
|
||||
private val dateFormatter = DateFormatter(
|
||||
format = DateFormatter.defaultPattern,
|
||||
// Any time zone id works here because, as explained above, date formatting does not
// use it. UTC was taken for simplicity.
|
||||
zoneId = ZoneOffset.UTC,
|
||||
locale = DateFormatter.defaultLocale,
|
||||
// Use `FastDateFormat` as the legacy formatter because it is thread-safe; this matters
// because the formatter is a `val`, i.e. one instance is reused by every caller.
|
||||
legacyFormat = LegacyDateFormats.FAST_DATE_FORMAT)
|
||||
// A `def`, not a `val`: every call re-reads the session time zone from `SQLConf` and
// requests a fraction formatter for that zone.
private def timestampFormatter = TimestampFormatter.getFractionFormatter(
|
||||
DateTimeUtils.getZoneId(SQLConf.get.sessionLocalTimeZone))
|
||||
|
||||
/** Formats a datum (based on the given data type) and returns the string representation. */
|
||||
def toHiveString(a: (Any, DataType), nested: Boolean = false): String = a match {
|
||||
|
|
|
@ -17,21 +17,27 @@
|
|||
|
||||
package org.apache.spark.sql.execution
|
||||
|
||||
import org.apache.spark.sql.catalyst.util.DateTimeTestUtils
|
||||
import org.apache.spark.sql.connector.InMemoryTableCatalog
|
||||
import org.apache.spark.sql.internal.SQLConf
|
||||
import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SharedSparkSession}
|
||||
|
||||
class HiveResultSuite extends SharedSparkSession {
|
||||
import testImplicits._
|
||||
|
||||
// Casts date strings to DATE and checks that `HiveResult.hiveResultString` renders them
// back as the same strings, both as a plain column and wrapped in an array.
test("date formatting in hive result") {
|
||||
val dates = Seq("2018-12-28", "1582-10-03", "1582-10-04", "1582-10-15")
|
||||
val df = dates.toDF("a").selectExpr("cast(a as date) as b")
|
||||
val executedPlan1 = df.queryExecution.executedPlan
|
||||
val result = HiveResult.hiveResultString(executedPlan1)
|
||||
assert(result == dates)
|
||||
val executedPlan2 = df.selectExpr("array(b)").queryExecution.executedPlan
|
||||
val result2 = HiveResult.hiveResultString(executedPlan2)
|
||||
assert(result2 == dates.map(x => s"[$x]"))
|
||||
// Run the checks once per zone id in `outstandingTimezonesIds`, with the session time
// zone set to that zone; the expected strings are the same for every zone.
DateTimeTestUtils.outstandingTimezonesIds.foreach { zoneId =>
|
||||
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> zoneId) {
|
||||
// NOTE(review): 1582-10-04 and 1582-10-15 look like the Julian-to-Gregorian cutover
// boundaries - confirm that this is why these dates were chosen.
val dates = Seq("2018-12-28", "1582-10-03", "1582-10-04", "1582-10-15")
|
||||
val df = dates.toDF("a").selectExpr("cast(a as date) as b")
|
||||
val executedPlan1 = df.queryExecution.executedPlan
|
||||
val result = HiveResult.hiveResultString(executedPlan1)
|
||||
assert(result == dates)
|
||||
val executedPlan2 = df.selectExpr("array(b)").queryExecution.executedPlan
|
||||
val result2 = HiveResult.hiveResultString(executedPlan2)
|
||||
assert(result2 == dates.map(x => s"[$x]"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("timestamp formatting in hive result") {
|
||||
|
|
Loading…
Reference in a new issue