[SPARK-26243][SQL] Use java.time API for parsing timestamps and dates from JSON
## What changes were proposed in this pull request? In the PR, I propose to switch on **java.time API** for parsing timestamps and dates from JSON inputs with microseconds precision. The SQL config `spark.sql.legacy.timeParser.enabled` allow to switch back to previous behavior with using `java.text.SimpleDateFormat`/`FastDateFormat` for parsing/generating timestamps/dates. ## How was this patch tested? It was tested by `JsonExpressionsSuite`, `JsonFunctionsSuite` and `JsonSuite`. Closes #23196 from MaxGekk/json-time-parser. Lead-authored-by: Maxim Gekk <maxim.gekk@databricks.com> Co-authored-by: Maxim Gekk <max.gekk@gmail.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
This commit is contained in:
parent
860f4497f2
commit
8a27952cdb
|
@ -35,7 +35,7 @@ displayTitle: Spark SQL Upgrading Guide
|
|||
|
||||
- Spark applications which are built with Spark version 2.4 and prior, and call methods of `UserDefinedFunction`, need to be re-compiled with Spark 3.0, as they are not binary compatible with Spark 3.0.
|
||||
|
||||
- Since Spark 3.0, CSV datasource uses java.time API for parsing and generating CSV content. New formatting implementation supports date/timestamp patterns conformed to ISO 8601. To switch back to the implementation used in Spark 2.4 and earlier, set `spark.sql.legacy.timeParser.enabled` to `true`.
|
||||
- Since Spark 3.0, CSV/JSON datasources use java.time API for parsing and generating CSV/JSON content. In Spark version 2.4 and earlier, java.text.SimpleDateFormat is used for the same purpuse with fallbacks to the parsing mechanisms of Spark 2.0 and 1.x. For example, `2018-12-08 10:39:21.123` with the pattern `yyyy-MM-dd'T'HH:mm:ss.SSS` cannot be parsed since Spark 3.0 because the timestamp does not match to the pattern but it can be parsed by earlier Spark versions due to a fallback to `Timestamp.valueOf`. To parse the same timestamp since Spark 3.0, the pattern should be `yyyy-MM-dd HH:mm:ss.SSS`. To switch back to the implementation used in Spark 2.4 and earlier, set `spark.sql.legacy.timeParser.enabled` to `true`.
|
||||
|
||||
- In Spark version 2.4 and earlier, CSV datasource converts a malformed CSV string to a row with all `null`s in the PERMISSIVE mode. Since Spark 3.0, the returned row can contain non-`null` fields if some of CSV column values were parsed and converted to desired types successfully.
|
||||
|
||||
|
|
|
@ -22,13 +22,13 @@ import scala.util.control.Exception.allCatch
|
|||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.catalyst.analysis.TypeCoercion
|
||||
import org.apache.spark.sql.catalyst.expressions.ExprUtils
|
||||
import org.apache.spark.sql.catalyst.util.DateTimeFormatter
|
||||
import org.apache.spark.sql.catalyst.util.TimestampFormatter
|
||||
import org.apache.spark.sql.types._
|
||||
|
||||
class CSVInferSchema(val options: CSVOptions) extends Serializable {
|
||||
|
||||
@transient
|
||||
private lazy val timeParser = DateTimeFormatter(
|
||||
private lazy val timestampParser = TimestampFormatter(
|
||||
options.timestampFormat,
|
||||
options.timeZone,
|
||||
options.locale)
|
||||
|
@ -160,7 +160,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
|
|||
|
||||
private def tryParseTimestamp(field: String): DataType = {
|
||||
// This case infers a custom `dataFormat` is set.
|
||||
if ((allCatch opt timeParser.parse(field)).isDefined) {
|
||||
if ((allCatch opt timestampParser.parse(field)).isDefined) {
|
||||
TimestampType
|
||||
} else {
|
||||
tryParseBoolean(field)
|
||||
|
|
|
@ -22,7 +22,7 @@ import java.io.Writer
|
|||
import com.univocity.parsers.csv.CsvWriter
|
||||
|
||||
import org.apache.spark.sql.catalyst.InternalRow
|
||||
import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeFormatter}
|
||||
import org.apache.spark.sql.catalyst.util.{DateFormatter, TimestampFormatter}
|
||||
import org.apache.spark.sql.types._
|
||||
|
||||
class UnivocityGenerator(
|
||||
|
@ -41,18 +41,18 @@ class UnivocityGenerator(
|
|||
private val valueConverters: Array[ValueConverter] =
|
||||
schema.map(_.dataType).map(makeConverter).toArray
|
||||
|
||||
private val timeFormatter = DateTimeFormatter(
|
||||
private val timestampFormatter = TimestampFormatter(
|
||||
options.timestampFormat,
|
||||
options.timeZone,
|
||||
options.locale)
|
||||
private val dateFormatter = DateFormatter(options.dateFormat, options.timeZone, options.locale)
|
||||
private val dateFormatter = DateFormatter(options.dateFormat, options.locale)
|
||||
|
||||
private def makeConverter(dataType: DataType): ValueConverter = dataType match {
|
||||
case DateType =>
|
||||
(row: InternalRow, ordinal: Int) => dateFormatter.format(row.getInt(ordinal))
|
||||
|
||||
case TimestampType =>
|
||||
(row: InternalRow, ordinal: Int) => timeFormatter.format(row.getLong(ordinal))
|
||||
(row: InternalRow, ordinal: Int) => timestampFormatter.format(row.getLong(ordinal))
|
||||
|
||||
case udt: UserDefinedType[_] => makeConverter(udt.sqlType)
|
||||
|
||||
|
|
|
@ -74,11 +74,11 @@ class UnivocityParser(
|
|||
|
||||
private val row = new GenericInternalRow(requiredSchema.length)
|
||||
|
||||
private val timeFormatter = DateTimeFormatter(
|
||||
private val timestampFormatter = TimestampFormatter(
|
||||
options.timestampFormat,
|
||||
options.timeZone,
|
||||
options.locale)
|
||||
private val dateFormatter = DateFormatter(options.dateFormat, options.timeZone, options.locale)
|
||||
private val dateFormatter = DateFormatter(options.dateFormat, options.locale)
|
||||
|
||||
// Retrieve the raw record string.
|
||||
private def getCurrentInput: UTF8String = {
|
||||
|
@ -158,7 +158,7 @@ class UnivocityParser(
|
|||
}
|
||||
|
||||
case _: TimestampType => (d: String) =>
|
||||
nullSafeDatum(d, name, nullable, options)(timeFormatter.parse)
|
||||
nullSafeDatum(d, name, nullable, options)(timestampFormatter.parse)
|
||||
|
||||
case _: DateType => (d: String) =>
|
||||
nullSafeDatum(d, name, nullable, options)(dateFormatter.parse)
|
||||
|
|
|
@ -21,7 +21,6 @@ import java.nio.charset.{Charset, StandardCharsets}
|
|||
import java.util.{Locale, TimeZone}
|
||||
|
||||
import com.fasterxml.jackson.core.{JsonFactory, JsonParser}
|
||||
import org.apache.commons.lang3.time.FastDateFormat
|
||||
|
||||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.sql.catalyst.util._
|
||||
|
@ -82,13 +81,10 @@ private[sql] class JSONOptions(
|
|||
val timeZone: TimeZone = DateTimeUtils.getTimeZone(
|
||||
parameters.getOrElse(DateTimeUtils.TIMEZONE_OPTION, defaultTimeZoneId))
|
||||
|
||||
// Uses `FastDateFormat` which can be direct replacement for `SimpleDateFormat` and thread-safe.
|
||||
val dateFormat: FastDateFormat =
|
||||
FastDateFormat.getInstance(parameters.getOrElse("dateFormat", "yyyy-MM-dd"), locale)
|
||||
val dateFormat: String = parameters.getOrElse("dateFormat", "yyyy-MM-dd")
|
||||
|
||||
val timestampFormat: FastDateFormat =
|
||||
FastDateFormat.getInstance(
|
||||
parameters.getOrElse("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSXXX"), timeZone, locale)
|
||||
val timestampFormat: String =
|
||||
parameters.getOrElse("timestampFormat", "yyyy-MM-dd'T'HH:mm:ss.SSSXXX")
|
||||
|
||||
val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false)
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@ import com.fasterxml.jackson.core._
|
|||
|
||||
import org.apache.spark.sql.catalyst.InternalRow
|
||||
import org.apache.spark.sql.catalyst.expressions.SpecializedGetters
|
||||
import org.apache.spark.sql.catalyst.util.{ArrayData, DateTimeUtils, MapData}
|
||||
import org.apache.spark.sql.catalyst.util._
|
||||
import org.apache.spark.sql.types._
|
||||
|
||||
/**
|
||||
|
@ -77,6 +77,12 @@ private[sql] class JacksonGenerator(
|
|||
|
||||
private val lineSeparator: String = options.lineSeparatorInWrite
|
||||
|
||||
private val timestampFormatter = TimestampFormatter(
|
||||
options.timestampFormat,
|
||||
options.timeZone,
|
||||
options.locale)
|
||||
private val dateFormatter = DateFormatter(options.dateFormat, options.locale)
|
||||
|
||||
private def makeWriter(dataType: DataType): ValueWriter = dataType match {
|
||||
case NullType =>
|
||||
(row: SpecializedGetters, ordinal: Int) =>
|
||||
|
@ -116,14 +122,12 @@ private[sql] class JacksonGenerator(
|
|||
|
||||
case TimestampType =>
|
||||
(row: SpecializedGetters, ordinal: Int) =>
|
||||
val timestampString =
|
||||
options.timestampFormat.format(DateTimeUtils.toJavaTimestamp(row.getLong(ordinal)))
|
||||
val timestampString = timestampFormatter.format(row.getLong(ordinal))
|
||||
gen.writeString(timestampString)
|
||||
|
||||
case DateType =>
|
||||
(row: SpecializedGetters, ordinal: Int) =>
|
||||
val dateString =
|
||||
options.dateFormat.format(DateTimeUtils.toJavaDate(row.getInt(ordinal)))
|
||||
val dateString = dateFormatter.format(row.getInt(ordinal))
|
||||
gen.writeString(dateString)
|
||||
|
||||
case BinaryType =>
|
||||
|
|
|
@ -55,6 +55,12 @@ class JacksonParser(
|
|||
private val factory = new JsonFactory()
|
||||
options.setJacksonOptions(factory)
|
||||
|
||||
private val timestampFormatter = TimestampFormatter(
|
||||
options.timestampFormat,
|
||||
options.timeZone,
|
||||
options.locale)
|
||||
private val dateFormatter = DateFormatter(options.dateFormat, options.locale)
|
||||
|
||||
/**
|
||||
* Create a converter which converts the JSON documents held by the `JsonParser`
|
||||
* to a value according to a desired schema. This is a wrapper for the method
|
||||
|
@ -218,17 +224,7 @@ class JacksonParser(
|
|||
case TimestampType =>
|
||||
(parser: JsonParser) => parseJsonToken[java.lang.Long](parser, dataType) {
|
||||
case VALUE_STRING if parser.getTextLength >= 1 =>
|
||||
val stringValue = parser.getText
|
||||
// This one will lose microseconds parts.
|
||||
// See https://issues.apache.org/jira/browse/SPARK-10681.
|
||||
Long.box {
|
||||
Try(options.timestampFormat.parse(stringValue).getTime * 1000L)
|
||||
.getOrElse {
|
||||
// If it fails to parse, then tries the way used in 2.0 and 1.x for backwards
|
||||
// compatibility.
|
||||
DateTimeUtils.stringToTime(stringValue).getTime * 1000L
|
||||
}
|
||||
}
|
||||
timestampFormatter.parse(parser.getText)
|
||||
|
||||
case VALUE_NUMBER_INT =>
|
||||
parser.getLongValue * 1000000L
|
||||
|
@ -237,22 +233,7 @@ class JacksonParser(
|
|||
case DateType =>
|
||||
(parser: JsonParser) => parseJsonToken[java.lang.Integer](parser, dataType) {
|
||||
case VALUE_STRING if parser.getTextLength >= 1 =>
|
||||
val stringValue = parser.getText
|
||||
// This one will lose microseconds parts.
|
||||
// See https://issues.apache.org/jira/browse/SPARK-10681.x
|
||||
Int.box {
|
||||
Try(DateTimeUtils.millisToDays(options.dateFormat.parse(stringValue).getTime))
|
||||
.orElse {
|
||||
// If it fails to parse, then tries the way used in 2.0 and 1.x for backwards
|
||||
// compatibility.
|
||||
Try(DateTimeUtils.millisToDays(DateTimeUtils.stringToTime(stringValue).getTime))
|
||||
}
|
||||
.getOrElse {
|
||||
// In Spark 1.5.0, we store the data as number of days since epoch in string.
|
||||
// So, we just convert it to Int.
|
||||
stringValue.toInt
|
||||
}
|
||||
}
|
||||
dateFormatter.parse(parser.getText)
|
||||
}
|
||||
|
||||
case BinaryType =>
|
||||
|
|
|
@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.util
|
|||
|
||||
import java.time._
|
||||
import java.time.format.DateTimeFormatterBuilder
|
||||
import java.time.temporal.{ChronoField, TemporalQueries}
|
||||
import java.time.temporal.{ChronoField, TemporalAccessor, TemporalQueries}
|
||||
import java.util.{Locale, TimeZone}
|
||||
|
||||
import scala.util.Try
|
||||
|
@ -28,16 +28,17 @@ import org.apache.commons.lang3.time.FastDateFormat
|
|||
|
||||
import org.apache.spark.sql.internal.SQLConf
|
||||
|
||||
sealed trait DateTimeFormatter {
|
||||
sealed trait TimestampFormatter {
|
||||
def parse(s: String): Long // returns microseconds since epoch
|
||||
def format(us: Long): String
|
||||
}
|
||||
|
||||
class Iso8601DateTimeFormatter(
|
||||
trait FormatterUtils {
|
||||
protected def zoneId: ZoneId
|
||||
protected def buildFormatter(
|
||||
pattern: String,
|
||||
timeZone: TimeZone,
|
||||
locale: Locale) extends DateTimeFormatter {
|
||||
val formatter = new DateTimeFormatterBuilder()
|
||||
locale: Locale): java.time.format.DateTimeFormatter = {
|
||||
new DateTimeFormatterBuilder()
|
||||
.appendPattern(pattern)
|
||||
.parseDefaulting(ChronoField.YEAR_OF_ERA, 1970)
|
||||
.parseDefaulting(ChronoField.MONTH_OF_YEAR, 1)
|
||||
|
@ -46,13 +47,25 @@ class Iso8601DateTimeFormatter(
|
|||
.parseDefaulting(ChronoField.MINUTE_OF_HOUR, 0)
|
||||
.parseDefaulting(ChronoField.SECOND_OF_MINUTE, 0)
|
||||
.toFormatter(locale)
|
||||
}
|
||||
protected def toInstantWithZoneId(temporalAccessor: TemporalAccessor): java.time.Instant = {
|
||||
val localDateTime = LocalDateTime.from(temporalAccessor)
|
||||
val zonedDateTime = ZonedDateTime.of(localDateTime, zoneId)
|
||||
Instant.from(zonedDateTime)
|
||||
}
|
||||
}
|
||||
|
||||
class Iso8601TimestampFormatter(
|
||||
pattern: String,
|
||||
timeZone: TimeZone,
|
||||
locale: Locale) extends TimestampFormatter with FormatterUtils {
|
||||
val zoneId = timeZone.toZoneId
|
||||
val formatter = buildFormatter(pattern, locale)
|
||||
|
||||
def toInstant(s: String): Instant = {
|
||||
val temporalAccessor = formatter.parse(s)
|
||||
if (temporalAccessor.query(TemporalQueries.offset()) == null) {
|
||||
val localDateTime = LocalDateTime.from(temporalAccessor)
|
||||
val zonedDateTime = ZonedDateTime.of(localDateTime, timeZone.toZoneId)
|
||||
Instant.from(zonedDateTime)
|
||||
toInstantWithZoneId(temporalAccessor)
|
||||
} else {
|
||||
Instant.from(temporalAccessor)
|
||||
}
|
||||
|
@ -75,10 +88,10 @@ class Iso8601DateTimeFormatter(
|
|||
}
|
||||
}
|
||||
|
||||
class LegacyDateTimeFormatter(
|
||||
class LegacyTimestampFormatter(
|
||||
pattern: String,
|
||||
timeZone: TimeZone,
|
||||
locale: Locale) extends DateTimeFormatter {
|
||||
locale: Locale) extends TimestampFormatter {
|
||||
val format = FastDateFormat.getInstance(pattern, timeZone, locale)
|
||||
|
||||
protected def toMillis(s: String): Long = format.parse(s).getTime
|
||||
|
@ -90,21 +103,21 @@ class LegacyDateTimeFormatter(
|
|||
}
|
||||
}
|
||||
|
||||
class LegacyFallbackDateTimeFormatter(
|
||||
class LegacyFallbackTimestampFormatter(
|
||||
pattern: String,
|
||||
timeZone: TimeZone,
|
||||
locale: Locale) extends LegacyDateTimeFormatter(pattern, timeZone, locale) {
|
||||
locale: Locale) extends LegacyTimestampFormatter(pattern, timeZone, locale) {
|
||||
override def toMillis(s: String): Long = {
|
||||
Try {super.toMillis(s)}.getOrElse(DateTimeUtils.stringToTime(s).getTime)
|
||||
}
|
||||
}
|
||||
|
||||
object DateTimeFormatter {
|
||||
def apply(format: String, timeZone: TimeZone, locale: Locale): DateTimeFormatter = {
|
||||
object TimestampFormatter {
|
||||
def apply(format: String, timeZone: TimeZone, locale: Locale): TimestampFormatter = {
|
||||
if (SQLConf.get.legacyTimeParserEnabled) {
|
||||
new LegacyFallbackDateTimeFormatter(format, timeZone, locale)
|
||||
new LegacyFallbackTimestampFormatter(format, timeZone, locale)
|
||||
} else {
|
||||
new Iso8601DateTimeFormatter(format, timeZone, locale)
|
||||
new Iso8601TimestampFormatter(format, timeZone, locale)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -116,13 +129,19 @@ sealed trait DateFormatter {
|
|||
|
||||
class Iso8601DateFormatter(
|
||||
pattern: String,
|
||||
timeZone: TimeZone,
|
||||
locale: Locale) extends DateFormatter {
|
||||
locale: Locale) extends DateFormatter with FormatterUtils {
|
||||
|
||||
val dateTimeFormatter = new Iso8601DateTimeFormatter(pattern, timeZone, locale)
|
||||
val zoneId = ZoneId.of("UTC")
|
||||
|
||||
val formatter = buildFormatter(pattern, locale)
|
||||
|
||||
def toInstant(s: String): Instant = {
|
||||
val temporalAccessor = formatter.parse(s)
|
||||
toInstantWithZoneId(temporalAccessor)
|
||||
}
|
||||
|
||||
override def parse(s: String): Int = {
|
||||
val seconds = dateTimeFormatter.toInstant(s).getEpochSecond
|
||||
val seconds = toInstant(s).getEpochSecond
|
||||
val days = Math.floorDiv(seconds, DateTimeUtils.SECONDS_PER_DAY)
|
||||
|
||||
days.toInt
|
||||
|
@ -130,15 +149,12 @@ class Iso8601DateFormatter(
|
|||
|
||||
override def format(days: Int): String = {
|
||||
val instant = Instant.ofEpochSecond(days * DateTimeUtils.SECONDS_PER_DAY)
|
||||
dateTimeFormatter.formatter.withZone(timeZone.toZoneId).format(instant)
|
||||
formatter.withZone(zoneId).format(instant)
|
||||
}
|
||||
}
|
||||
|
||||
class LegacyDateFormatter(
|
||||
pattern: String,
|
||||
timeZone: TimeZone,
|
||||
locale: Locale) extends DateFormatter {
|
||||
val format = FastDateFormat.getInstance(pattern, timeZone, locale)
|
||||
class LegacyDateFormatter(pattern: String, locale: Locale) extends DateFormatter {
|
||||
val format = FastDateFormat.getInstance(pattern, locale)
|
||||
|
||||
def parse(s: String): Int = {
|
||||
val milliseconds = format.parse(s).getTime
|
||||
|
@ -153,8 +169,7 @@ class LegacyDateFormatter(
|
|||
|
||||
class LegacyFallbackDateFormatter(
|
||||
pattern: String,
|
||||
timeZone: TimeZone,
|
||||
locale: Locale) extends LegacyDateFormatter(pattern, timeZone, locale) {
|
||||
locale: Locale) extends LegacyDateFormatter(pattern, locale) {
|
||||
override def parse(s: String): Int = {
|
||||
Try(super.parse(s)).orElse {
|
||||
// If it fails to parse, then tries the way used in 2.0 and 1.x for backwards
|
||||
|
@ -169,11 +184,11 @@ class LegacyFallbackDateFormatter(
|
|||
}
|
||||
|
||||
object DateFormatter {
|
||||
def apply(format: String, timeZone: TimeZone, locale: Locale): DateFormatter = {
|
||||
def apply(format: String, locale: Locale): DateFormatter = {
|
||||
if (SQLConf.get.legacyTimeParserEnabled) {
|
||||
new LegacyFallbackDateFormatter(format, timeZone, locale)
|
||||
new LegacyFallbackDateFormatter(format, locale)
|
||||
} else {
|
||||
new Iso8601DateFormatter(format, timeZone, locale)
|
||||
new Iso8601DateFormatter(format, locale)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,103 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.sql.util
|
||||
|
||||
import java.util.{Locale, TimeZone}
|
||||
|
||||
import org.apache.spark.SparkFunSuite
|
||||
import org.apache.spark.sql.catalyst.util.{DateFormatter, DateTimeFormatter, DateTimeTestUtils}
|
||||
|
||||
class DateTimeFormatterSuite extends SparkFunSuite {
|
||||
test("parsing dates using time zones") {
|
||||
val localDate = "2018-12-02"
|
||||
val expectedDays = Map(
|
||||
"UTC" -> 17867,
|
||||
"PST" -> 17867,
|
||||
"CET" -> 17866,
|
||||
"Africa/Dakar" -> 17867,
|
||||
"America/Los_Angeles" -> 17867,
|
||||
"Antarctica/Vostok" -> 17866,
|
||||
"Asia/Hong_Kong" -> 17866,
|
||||
"Europe/Amsterdam" -> 17866)
|
||||
DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
|
||||
val formatter = DateFormatter("yyyy-MM-dd", TimeZone.getTimeZone(timeZone), Locale.US)
|
||||
val daysSinceEpoch = formatter.parse(localDate)
|
||||
assert(daysSinceEpoch === expectedDays(timeZone))
|
||||
}
|
||||
}
|
||||
|
||||
test("parsing timestamps using time zones") {
|
||||
val localDate = "2018-12-02T10:11:12.001234"
|
||||
val expectedMicros = Map(
|
||||
"UTC" -> 1543745472001234L,
|
||||
"PST" -> 1543774272001234L,
|
||||
"CET" -> 1543741872001234L,
|
||||
"Africa/Dakar" -> 1543745472001234L,
|
||||
"America/Los_Angeles" -> 1543774272001234L,
|
||||
"Antarctica/Vostok" -> 1543723872001234L,
|
||||
"Asia/Hong_Kong" -> 1543716672001234L,
|
||||
"Europe/Amsterdam" -> 1543741872001234L)
|
||||
DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
|
||||
val formatter = DateTimeFormatter(
|
||||
"yyyy-MM-dd'T'HH:mm:ss.SSSSSS",
|
||||
TimeZone.getTimeZone(timeZone),
|
||||
Locale.US)
|
||||
val microsSinceEpoch = formatter.parse(localDate)
|
||||
assert(microsSinceEpoch === expectedMicros(timeZone))
|
||||
}
|
||||
}
|
||||
|
||||
test("format dates using time zones") {
|
||||
val daysSinceEpoch = 17867
|
||||
val expectedDate = Map(
|
||||
"UTC" -> "2018-12-02",
|
||||
"PST" -> "2018-12-01",
|
||||
"CET" -> "2018-12-02",
|
||||
"Africa/Dakar" -> "2018-12-02",
|
||||
"America/Los_Angeles" -> "2018-12-01",
|
||||
"Antarctica/Vostok" -> "2018-12-02",
|
||||
"Asia/Hong_Kong" -> "2018-12-02",
|
||||
"Europe/Amsterdam" -> "2018-12-02")
|
||||
DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
|
||||
val formatter = DateFormatter("yyyy-MM-dd", TimeZone.getTimeZone(timeZone), Locale.US)
|
||||
val date = formatter.format(daysSinceEpoch)
|
||||
assert(date === expectedDate(timeZone))
|
||||
}
|
||||
}
|
||||
|
||||
test("format timestamps using time zones") {
|
||||
val microsSinceEpoch = 1543745472001234L
|
||||
val expectedTimestamp = Map(
|
||||
"UTC" -> "2018-12-02T10:11:12.001234",
|
||||
"PST" -> "2018-12-02T02:11:12.001234",
|
||||
"CET" -> "2018-12-02T11:11:12.001234",
|
||||
"Africa/Dakar" -> "2018-12-02T10:11:12.001234",
|
||||
"America/Los_Angeles" -> "2018-12-02T02:11:12.001234",
|
||||
"Antarctica/Vostok" -> "2018-12-02T16:11:12.001234",
|
||||
"Asia/Hong_Kong" -> "2018-12-02T18:11:12.001234",
|
||||
"Europe/Amsterdam" -> "2018-12-02T11:11:12.001234")
|
||||
DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
|
||||
val formatter = DateTimeFormatter(
|
||||
"yyyy-MM-dd'T'HH:mm:ss.SSSSSS",
|
||||
TimeZone.getTimeZone(timeZone),
|
||||
Locale.US)
|
||||
val timestamp = formatter.format(microsSinceEpoch)
|
||||
assert(timestamp === expectedTimestamp(timeZone))
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,174 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.sql.util
|
||||
|
||||
import java.util.{Locale, TimeZone}
|
||||
|
||||
import org.apache.spark.SparkFunSuite
|
||||
import org.apache.spark.sql.catalyst.plans.SQLHelper
|
||||
import org.apache.spark.sql.catalyst.util._
|
||||
import org.apache.spark.sql.internal.SQLConf
|
||||
|
||||
class DateTimestampFormatterSuite extends SparkFunSuite with SQLHelper {
|
||||
test("parsing dates") {
|
||||
DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
|
||||
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {
|
||||
val formatter = DateFormatter("yyyy-MM-dd", Locale.US)
|
||||
val daysSinceEpoch = formatter.parse("2018-12-02")
|
||||
assert(daysSinceEpoch === 17867)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("parsing timestamps using time zones") {
|
||||
val localDate = "2018-12-02T10:11:12.001234"
|
||||
val expectedMicros = Map(
|
||||
"UTC" -> 1543745472001234L,
|
||||
"PST" -> 1543774272001234L,
|
||||
"CET" -> 1543741872001234L,
|
||||
"Africa/Dakar" -> 1543745472001234L,
|
||||
"America/Los_Angeles" -> 1543774272001234L,
|
||||
"Antarctica/Vostok" -> 1543723872001234L,
|
||||
"Asia/Hong_Kong" -> 1543716672001234L,
|
||||
"Europe/Amsterdam" -> 1543741872001234L)
|
||||
DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
|
||||
val formatter = TimestampFormatter(
|
||||
"yyyy-MM-dd'T'HH:mm:ss.SSSSSS",
|
||||
TimeZone.getTimeZone(timeZone),
|
||||
Locale.US)
|
||||
val microsSinceEpoch = formatter.parse(localDate)
|
||||
assert(microsSinceEpoch === expectedMicros(timeZone))
|
||||
}
|
||||
}
|
||||
|
||||
test("format dates") {
|
||||
DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
|
||||
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {
|
||||
val formatter = DateFormatter("yyyy-MM-dd", Locale.US)
|
||||
val date = formatter.format(17867)
|
||||
assert(date === "2018-12-02")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("format timestamps using time zones") {
|
||||
val microsSinceEpoch = 1543745472001234L
|
||||
val expectedTimestamp = Map(
|
||||
"UTC" -> "2018-12-02T10:11:12.001234",
|
||||
"PST" -> "2018-12-02T02:11:12.001234",
|
||||
"CET" -> "2018-12-02T11:11:12.001234",
|
||||
"Africa/Dakar" -> "2018-12-02T10:11:12.001234",
|
||||
"America/Los_Angeles" -> "2018-12-02T02:11:12.001234",
|
||||
"Antarctica/Vostok" -> "2018-12-02T16:11:12.001234",
|
||||
"Asia/Hong_Kong" -> "2018-12-02T18:11:12.001234",
|
||||
"Europe/Amsterdam" -> "2018-12-02T11:11:12.001234")
|
||||
DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
|
||||
val formatter = TimestampFormatter(
|
||||
"yyyy-MM-dd'T'HH:mm:ss.SSSSSS",
|
||||
TimeZone.getTimeZone(timeZone),
|
||||
Locale.US)
|
||||
val timestamp = formatter.format(microsSinceEpoch)
|
||||
assert(timestamp === expectedTimestamp(timeZone))
|
||||
}
|
||||
}
|
||||
|
||||
test("roundtrip timestamp -> micros -> timestamp using timezones") {
|
||||
Seq(
|
||||
-58710115316212000L,
|
||||
-18926315945345679L,
|
||||
-9463427405253013L,
|
||||
-244000001L,
|
||||
0L,
|
||||
99628200102030L,
|
||||
1543749753123456L,
|
||||
2177456523456789L,
|
||||
11858049903010203L).foreach { micros =>
|
||||
DateTimeTestUtils.outstandingTimezones.foreach { timeZone =>
|
||||
val formatter = TimestampFormatter("yyyy-MM-dd'T'HH:mm:ss.SSSSSS", timeZone, Locale.US)
|
||||
val timestamp = formatter.format(micros)
|
||||
val parsed = formatter.parse(timestamp)
|
||||
assert(micros === parsed)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("roundtrip micros -> timestamp -> micros using timezones") {
|
||||
Seq(
|
||||
"0109-07-20T18:38:03.788000",
|
||||
"1370-04-01T10:00:54.654321",
|
||||
"1670-02-11T14:09:54.746987",
|
||||
"1969-12-31T23:55:55.999999",
|
||||
"1970-01-01T00:00:00.000000",
|
||||
"1973-02-27T02:30:00.102030",
|
||||
"2018-12-02T11:22:33.123456",
|
||||
"2039-01-01T01:02:03.456789",
|
||||
"2345-10-07T22:45:03.010203").foreach { timestamp =>
|
||||
DateTimeTestUtils.outstandingTimezones.foreach { timeZone =>
|
||||
val formatter = TimestampFormatter("yyyy-MM-dd'T'HH:mm:ss.SSSSSS", timeZone, Locale.US)
|
||||
val micros = formatter.parse(timestamp)
|
||||
val formatted = formatter.format(micros)
|
||||
assert(timestamp === formatted)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("roundtrip date -> days -> date") {
|
||||
Seq(
|
||||
"0050-01-01",
|
||||
"0953-02-02",
|
||||
"1423-03-08",
|
||||
"1969-12-31",
|
||||
"1972-08-25",
|
||||
"1975-09-26",
|
||||
"2018-12-12",
|
||||
"2038-01-01",
|
||||
"5010-11-17").foreach { date =>
|
||||
DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
|
||||
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {
|
||||
val formatter = DateFormatter("yyyy-MM-dd", Locale.US)
|
||||
val days = formatter.parse(date)
|
||||
val formatted = formatter.format(days)
|
||||
assert(date === formatted)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("roundtrip days -> date -> days") {
|
||||
Seq(
|
||||
-701265,
|
||||
-371419,
|
||||
-199722,
|
||||
-1,
|
||||
0,
|
||||
967,
|
||||
2094,
|
||||
17877,
|
||||
24837,
|
||||
1110657).foreach { days =>
|
||||
DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
|
||||
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {
|
||||
val formatter = DateFormatter("yyyy-MM-dd", Locale.US)
|
||||
val date = formatter.format(days)
|
||||
val parsed = formatter.parse(date)
|
||||
assert(days === parsed)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -57,14 +57,17 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
|
|||
}
|
||||
|
||||
val factory = new JsonFactory()
|
||||
def enforceCorrectType(value: Any, dataType: DataType): Any = {
|
||||
def enforceCorrectType(
|
||||
value: Any,
|
||||
dataType: DataType,
|
||||
options: Map[String, String] = Map.empty): Any = {
|
||||
val writer = new StringWriter()
|
||||
Utils.tryWithResource(factory.createGenerator(writer)) { generator =>
|
||||
generator.writeObject(value)
|
||||
generator.flush()
|
||||
}
|
||||
|
||||
val dummyOption = new JSONOptions(Map.empty[String, String], "GMT")
|
||||
val dummyOption = new JSONOptions(options, SQLConf.get.sessionLocalTimeZone)
|
||||
val dummySchema = StructType(Seq.empty)
|
||||
val parser = new JacksonParser(dummySchema, dummyOption, allowArrayAsStructs = true)
|
||||
|
||||
|
@ -96,19 +99,27 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
|
|||
checkTypePromotion(DateTimeUtils.fromJavaTimestamp(new Timestamp(intNumber.toLong * 1000L)),
|
||||
enforceCorrectType(intNumber.toLong, TimestampType))
|
||||
val strTime = "2014-09-30 12:34:56"
|
||||
checkTypePromotion(DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf(strTime)),
|
||||
enforceCorrectType(strTime, TimestampType))
|
||||
checkTypePromotion(
|
||||
expected = DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf(strTime)),
|
||||
enforceCorrectType(strTime, TimestampType,
|
||||
Map("timestampFormat" -> "yyyy-MM-dd HH:mm:ss")))
|
||||
|
||||
val strDate = "2014-10-15"
|
||||
checkTypePromotion(
|
||||
DateTimeUtils.fromJavaDate(Date.valueOf(strDate)), enforceCorrectType(strDate, DateType))
|
||||
|
||||
val ISO8601Time1 = "1970-01-01T01:00:01.0Z"
|
||||
val ISO8601Time2 = "1970-01-01T02:00:01-01:00"
|
||||
checkTypePromotion(DateTimeUtils.fromJavaTimestamp(new Timestamp(3601000)),
|
||||
enforceCorrectType(ISO8601Time1, TimestampType))
|
||||
enforceCorrectType(
|
||||
ISO8601Time1,
|
||||
TimestampType,
|
||||
Map("timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ss.SX")))
|
||||
val ISO8601Time2 = "1970-01-01T02:00:01-01:00"
|
||||
checkTypePromotion(DateTimeUtils.fromJavaTimestamp(new Timestamp(10801000)),
|
||||
enforceCorrectType(ISO8601Time2, TimestampType))
|
||||
enforceCorrectType(
|
||||
ISO8601Time2,
|
||||
TimestampType,
|
||||
Map("timestampFormat" -> "yyyy-MM-dd'T'HH:mm:ssXXX")))
|
||||
|
||||
val ISO8601Date = "1970-01-01"
|
||||
checkTypePromotion(DateTimeUtils.millisToDays(32400000),
|
||||
|
@ -1440,6 +1451,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
|
|||
}
|
||||
|
||||
test("backward compatibility") {
|
||||
withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> "true") {
|
||||
// This test we make sure our JSON support can read JSON data generated by previous version
|
||||
// of Spark generated through toJSON method and JSON data source.
|
||||
// The data is generated by the following program.
|
||||
|
@ -1539,6 +1551,7 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
|
|||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("SPARK-11544 test pathfilter") {
|
||||
withTempPath { dir =>
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
package org.apache.spark.sql.sources
|
||||
|
||||
import java.io.File
|
||||
import java.util.TimeZone
|
||||
|
||||
import scala.util.Random
|
||||
|
||||
|
@ -125,6 +126,8 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes
|
|||
} else {
|
||||
Seq(false)
|
||||
}
|
||||
// TODO: Support new parser too, see SPARK-26374.
|
||||
withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> "true") {
|
||||
for (dataType <- supportedDataTypes) {
|
||||
for (parquetDictionaryEncodingEnabled <- parquetDictionaryEncodingEnabledConfs) {
|
||||
val extraMessage = if (isParquetDataSource) {
|
||||
|
@ -141,10 +144,12 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes
|
|||
withTempPath { file =>
|
||||
val path = file.getCanonicalPath
|
||||
|
||||
val seed = System.nanoTime()
|
||||
withClue(s"Random data generated with the seed: ${seed}") {
|
||||
val dataGenerator = RandomDataGenerator.forType(
|
||||
dataType = dataType,
|
||||
nullable = true,
|
||||
new Random(System.nanoTime())
|
||||
new Random(seed)
|
||||
).getOrElse {
|
||||
fail(s"Failed to create data generator for schema $dataType")
|
||||
}
|
||||
|
@ -179,6 +184,8 @@ abstract class HadoopFsRelationTest extends QueryTest with SQLTestUtils with Tes
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test("save()/load() - non-partitioned table - Overwrite") {
|
||||
withTempPath { file =>
|
||||
|
|
Loading…
Reference in a new issue